From 857880f0e4d1d42a8508ac77be33556cc6f7f546 Mon Sep 17 00:00:00 2001
From: Oded Gabbay
Date: Sun, 6 Sep 2015 10:58:30 +0300
Subject: vmx: optimize scaled_nearest_scanline_vmx_8888_8888_OVER

This patch optimizes scaled_nearest_scanline_vmx_8888_8888_OVER and all
the functions it calls (combine1, combine4 and
core_combine_over_u_pixel_vmx).

The optimization is done by removing the use of expand_alpha_1x128 and
expand_alpha_2x128 in favor of splat_alpha and the MUL/ADD macros from
pixman-combine32.h.

Running "lowlevel-blt-bench -n over_8888_8888" on POWER8, 8 cores,
3.4GHz, RHEL 7.2 ppc64le gave the following results:

reference memcpy speed = 24847.3MB/s (6211.8MP/s for 32bpp fills)

                Before          After           Change
              --------------------------------------------
L1              182.05          210.22          +15.47%
L2              180.6           208.92          +15.68%
M               180.52          208.22          +15.34%
HT              130.17          178.97          +37.49%
VT              145.82          184.22          +26.33%
R               104.51          129.38          +23.80%
RT              48.3            61.54           +27.41%
Kops/s          430             504             +17.21%

v2: Check *pm is not NULL before dereferencing it in combine1()

Signed-off-by: Oded Gabbay
Acked-by: Pekka Paalanen
Acked-by: Siarhei Siamashka
---
 pixman/pixman-vmx.c | 79 ++++++++++++------------------------------------------
 1 file changed, 17 insertions(+), 62 deletions(-)

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index a9bd0246..4105dee1 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -623,10 +623,9 @@ in_over_2x128 (vector unsigned int* src_lo,
 static force_inline uint32_t
 core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
 {
-    uint8_t a;
-    vector unsigned int vmxs;
+    uint32_t a;
 
-    a = src >> 24;
+    a = ALPHA_8(src);
 
     if (a == 0xff)
     {
@@ -634,9 +633,7 @@ core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
     }
     else if (src)
     {
-        vmxs = unpack_32_1x128 (src);
-        return pack_1x128_32(
-                over(vmxs, expand_alpha_1x128 (vmxs), unpack_32_1x128 (dst)));
+        UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src);
     }
 
     return dst;
@@ -648,17 +645,7 @@ combine1 (const uint32_t *ps, const uint32_t *pm)
     uint32_t s = *ps;
 
     if (pm)
-    {
-        vector unsigned int ms, mm;
-
-        mm = unpack_32_1x128 (*pm);
-        mm = expand_alpha_1x128 (mm);
-
-        ms = unpack_32_1x128 (s);
-        ms = pix_multiply (ms, mm);
-
-        s = pack_1x128_32 (ms);
-    }
+        UN8x4_MUL_UN8(s, ALPHA_8(*pm));
 
     return s;
 }
@@ -666,38 +653,22 @@ combine1 (const uint32_t *ps, const uint32_t *pm)
 static force_inline vector unsigned int
 combine4 (const uint32_t* ps, const uint32_t* pm)
 {
-    vector unsigned int vmx_src_lo, vmx_src_hi;
-    vector unsigned int vmx_msk_lo, vmx_msk_hi;
-    vector unsigned int s;
+    vector unsigned int src, msk;
 
     if (pm)
     {
-        vmx_msk_lo = load_128_unaligned(pm);
+        msk = load_128_unaligned(pm);
 
-        if (is_transparent(vmx_msk_lo))
+        if (is_transparent(msk))
             return (vector unsigned int) AVV(0);
     }
 
-    s = load_128_unaligned(ps);
+    src = load_128_unaligned(ps);
 
     if (pm)
-    {
-        unpack_128_2x128(s, (vector unsigned int) AVV(0),
-                            &vmx_src_lo, &vmx_src_hi);
-
-        unpack_128_2x128(vmx_msk_lo, (vector unsigned int) AVV(0),
-                            &vmx_msk_lo, &vmx_msk_hi);
-
-        expand_alpha_2x128(vmx_msk_lo, vmx_msk_hi, &vmx_msk_lo, &vmx_msk_hi);
+        src = pix_multiply(src, msk);
 
-        pix_multiply_2x128(&vmx_src_lo, &vmx_src_hi,
-                           &vmx_msk_lo, &vmx_msk_hi,
-                           &vmx_src_lo, &vmx_src_hi);
-
-        s = pack_2x128_128(vmx_src_lo, vmx_src_hi);
-    }
-
-    return s;
+    return src;
 }
 
 static void
@@ -2966,9 +2937,7 @@ scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
     uint32_t s, d;
     const uint32_t* pm = NULL;
 
-    vector unsigned int vmx_dst_lo, vmx_dst_hi;
-    vector unsigned int vmx_src_lo, vmx_src_hi;
-    vector unsigned int vmx_alpha_lo, vmx_alpha_hi;
+    vector unsigned int vsrc, vdst;
 
     if (fully_transparent_src)
         return;
@@ -3015,31 +2984,17 @@ scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
         tmp[2] = tmp3;
         tmp[3] = tmp4;
 
-        vmx_src_hi = combine4 ((const uint32_t *) &tmp, pm);
+        vsrc = combine4 ((const uint32_t *) &tmp, pm);
 
-        if (is_opaque (vmx_src_hi))
+        if (is_opaque (vsrc))
         {
-            save_128_aligned (pd, vmx_src_hi);
+            save_128_aligned (pd, vsrc);
         }
-        else if (!is_zero (vmx_src_hi))
+        else if (!is_zero (vsrc))
         {
-            vmx_dst_hi = load_128_aligned (pd);
-
-            unpack_128_2x128 (vmx_src_hi, (vector unsigned int) AVV(0),
-                              &vmx_src_lo, &vmx_src_hi);
-
-            unpack_128_2x128 (vmx_dst_hi, (vector unsigned int) AVV(0),
-                              &vmx_dst_lo, &vmx_dst_hi);
-
-            expand_alpha_2x128 (
-                vmx_src_lo, vmx_src_hi, &vmx_alpha_lo, &vmx_alpha_hi);
-
-            over_2x128 (&vmx_src_lo, &vmx_src_hi,
-                        &vmx_alpha_lo, &vmx_alpha_hi,
-                        &vmx_dst_lo, &vmx_dst_hi);
+            vdst = over(vsrc, splat_alpha(vsrc), load_128_aligned (pd));
 
-            /* rebuid the 4 pixel data and save*/
-            save_128_aligned (pd, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi));
+            save_128_aligned (pd, vdst);
         }
 
         w -= 4;
-- 
cgit v1.2.3
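
Editor's note: for readers unfamiliar with the byte-wise MUL/ADD style that
replaces expand_alpha_1x128 in the scalar path above, the following is a
minimal standalone C sketch of the same arithmetic. mul_div_255() and
over_un8x4() are hypothetical helpers written only for this note; they follow
the usual pixman rounding convention (add 0x80, fold the high byte back in)
but are not pixman's actual macros or code.

#include <stdint.h>
#include <stdio.h>

/* Approximate (x * a) / 255 with rounding, in the usual pixman style:
 * add 0x80, then fold the high byte back in before shifting. */
static inline uint8_t
mul_div_255 (uint8_t x, uint8_t a)
{
    uint16_t t = (uint16_t) x * a + 0x80;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}

/* OVER for one premultiplied a8r8g8b8 pixel:
 * dst = src + dst * (255 - alpha (src)) / 255, applied to each byte. */
static uint32_t
over_un8x4 (uint32_t src, uint32_t dst)
{
    uint8_t  ia = (uint8_t) ~(src >> 24);   /* inverse source alpha */
    uint32_t result = 0;
    int      shift;

    if ((src >> 24) == 0xff)   /* opaque source replaces dst outright */
        return src;
    if (src == 0)              /* transparent source leaves dst untouched */
        return dst;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint8_t  s   = (uint8_t) (src >> shift);
        uint8_t  d   = (uint8_t) (dst >> shift);
        uint16_t sum = (uint16_t) s + mul_div_255 (d, ia);

        if (sum > 0xff)        /* saturate, like the ADD half of the macro */
            sum = 0xff;

        result |= (uint32_t) sum << shift;
    }

    return result;
}

int
main (void)
{
    /* 50% translucent red (premultiplied) over opaque blue. */
    uint32_t src = 0x80800000;
    uint32_t dst = 0xff0000ff;

    printf ("over = 0x%08x\n", over_un8x4 (src, dst));
    return 0;
}

This loop is the scalar counterpart of what the patched
core_combine_over_u_pixel_vmx does with UN8x4_MUL_UN8_ADD_UN8x4(dst, ~a & MASK,
src); the vector path performs the same per-byte math on four pixels at once
by building the alpha vector with splat_alpha() and feeding it to over().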