diff options
author | Oded Gabbay <oded.gabbay@gmail.com> | 2015-09-06 10:58:30 +0300 |
---|---|---|
committer | Oded Gabbay <oded.gabbay@gmail.com> | 2015-09-18 10:06:50 +0300 |
commit | 857880f0e4d1d42a8508ac77be33556cc6f7f546 (patch) | |
tree | b0b64884dc83ef1a87fee9e1c770b9d434bd49c9 | |
parent | 73e586efb3ee149f76f15d9e549bffa15d8e30ec (diff) |
vmx: optimize scaled_nearest_scanline_vmx_8888_8888_OVER
This patch optimizes scaled_nearest_scanline_vmx_8888_8888_OVER and all
the functions it calls (combine1, combine4 and
core_combine_over_u_pixel_vmx).
The optimization is done by removing the use of expand_alpha_1x128 and
expand_alpha_2x128 in favor of splat_alpha and the MUL/ADD macros from
pixman-combine32.h.
Running "lowlevel-blt-bench -n over_8888_8888" on POWER8, 8 cores,
3.4GHz, RHEL 7.2 ppc64le gave the following results:
reference memcpy speed = 24847.3MB/s (6211.8MP/s for 32bpp fills)
Test      Before     After      Change
--------------------------------------------
L1        182.05     210.22     +15.47%
L2        180.6      208.92     +15.68%
M         180.52     208.22     +15.34%
HT        130.17     178.97     +37.49%
VT        145.82     184.22     +26.33%
R         104.51     129.38     +23.80%
RT        48.3       61.54      +27.41%
Kops/s    430        504        +17.21%
v2: Check that pm is not NULL before dereferencing it in combine1()
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Acked-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
-rw-r--r-- | pixman/pixman-vmx.c | 79 |
1 files changed, 17 insertions, 62 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index a9bd0246..4105dee1 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -623,10 +623,9 @@ in_over_2x128 (vector unsigned int* src_lo, static force_inline uint32_t core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst) { - uint8_t a; - vector unsigned int vmxs; + uint32_t a; - a = src >> 24; + a = ALPHA_8(src); if (a == 0xff) { @@ -634,9 +633,7 @@ core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst) } else if (src) { - vmxs = unpack_32_1x128 (src); - return pack_1x128_32( - over(vmxs, expand_alpha_1x128 (vmxs), unpack_32_1x128 (dst))); + UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src); } return dst; @@ -648,17 +645,7 @@ combine1 (const uint32_t *ps, const uint32_t *pm) uint32_t s = *ps; if (pm) - { - vector unsigned int ms, mm; - - mm = unpack_32_1x128 (*pm); - mm = expand_alpha_1x128 (mm); - - ms = unpack_32_1x128 (s); - ms = pix_multiply (ms, mm); - - s = pack_1x128_32 (ms); - } + UN8x4_MUL_UN8(s, ALPHA_8(*pm)); return s; } @@ -666,38 +653,22 @@ combine1 (const uint32_t *ps, const uint32_t *pm) static force_inline vector unsigned int combine4 (const uint32_t* ps, const uint32_t* pm) { - vector unsigned int vmx_src_lo, vmx_src_hi; - vector unsigned int vmx_msk_lo, vmx_msk_hi; - vector unsigned int s; + vector unsigned int src, msk; if (pm) { - vmx_msk_lo = load_128_unaligned(pm); + msk = load_128_unaligned(pm); - if (is_transparent(vmx_msk_lo)) + if (is_transparent(msk)) return (vector unsigned int) AVV(0); } - s = load_128_unaligned(ps); + src = load_128_unaligned(ps); if (pm) - { - unpack_128_2x128(s, (vector unsigned int) AVV(0), - &vmx_src_lo, &vmx_src_hi); - - unpack_128_2x128(vmx_msk_lo, (vector unsigned int) AVV(0), - &vmx_msk_lo, &vmx_msk_hi); - - expand_alpha_2x128(vmx_msk_lo, vmx_msk_hi, &vmx_msk_lo, &vmx_msk_hi); + src = pix_multiply(src, msk); - pix_multiply_2x128(&vmx_src_lo, &vmx_src_hi, - &vmx_msk_lo, &vmx_msk_hi, - &vmx_src_lo, &vmx_src_hi); - - s = 
pack_2x128_128(vmx_src_lo, vmx_src_hi); - } - - return s; + return src; } static void @@ -2966,9 +2937,7 @@ scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd, uint32_t s, d; const uint32_t* pm = NULL; - vector unsigned int vmx_dst_lo, vmx_dst_hi; - vector unsigned int vmx_src_lo, vmx_src_hi; - vector unsigned int vmx_alpha_lo, vmx_alpha_hi; + vector unsigned int vsrc, vdst; if (fully_transparent_src) return; @@ -3015,31 +2984,17 @@ scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd, tmp[2] = tmp3; tmp[3] = tmp4; - vmx_src_hi = combine4 ((const uint32_t *) &tmp, pm); + vsrc = combine4 ((const uint32_t *) &tmp, pm); - if (is_opaque (vmx_src_hi)) + if (is_opaque (vsrc)) { - save_128_aligned (pd, vmx_src_hi); + save_128_aligned (pd, vsrc); } - else if (!is_zero (vmx_src_hi)) + else if (!is_zero (vsrc)) { - vmx_dst_hi = load_128_aligned (pd); - - unpack_128_2x128 (vmx_src_hi, (vector unsigned int) AVV(0), - &vmx_src_lo, &vmx_src_hi); - - unpack_128_2x128 (vmx_dst_hi, (vector unsigned int) AVV(0), - &vmx_dst_lo, &vmx_dst_hi); - - expand_alpha_2x128 ( - vmx_src_lo, vmx_src_hi, &vmx_alpha_lo, &vmx_alpha_hi); - - over_2x128 (&vmx_src_lo, &vmx_src_hi, - &vmx_alpha_lo, &vmx_alpha_hi, - &vmx_dst_lo, &vmx_dst_hi); + vdst = over(vsrc, splat_alpha(vsrc), load_128_aligned (pd)); - /* rebuid the 4 pixel data and save*/ - save_128_aligned (pd, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi)); + save_128_aligned (pd, vdst); } w -= 4; |