author     Oded Gabbay <oded.gabbay@gmail.com>    2015-09-06 11:46:15 +0300
committer  Oded Gabbay <oded.gabbay@gmail.com>    2015-09-18 10:07:03 +0300
commit     8d8caa55a38c00351047d24322e23b201b6b29ff
tree       4125f69c85a6f07c4e3d8cb83cde018f3e40ef01
parent     857880f0e4d1d42a8508ac77be33556cc6f7f546
vmx: optimize vmx_composite_over_n_8888_8888_ca
This patch optimizes vmx_composite_over_n_8888_8888_ca by removing the
use of expand_alpha_1x128, the unpack/pack helpers and in_over_2x128 in
favor of splat_alpha, in_over and the MUL/ADD macros from
pixman-combine32.h.
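For pixels handled outside the 16-byte-aligned fast path, the patch below blends with these scalar macros instead of packing single pixels through vector registers; per channel the result is dest = src*mask + dest*(255 - alpha(src)*mask)/255. The following is a minimal standalone sketch of that arithmetic, where mul_un8 and over_ca_pixel are hypothetical helpers mirroring what the pixman-combine32.h macros compute, not pixman's actual code:

#include <stdint.h>

/* Rounded byte multiply t = a*b/255, the same arithmetic the
 * UN8x4_* macros in pixman-combine32.h apply to each channel
 * (hypothetical standalone helper). */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)a * b + 0x80;
    return (uint8_t)((t + (t >> 8)) >> 8);
}

/* One pixel of OVER with a component-alpha mask: per channel,
 * d = s*m + d*(255 - alpha(s)*m), i.e. the macro sequence
 * UN8x4_MUL_UN8x4 (s, m); UN8x4_MUL_UN8 (m, ia); m = ~m;
 * UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s) used in the diff below.
 * Note m = ~m works because complementing a UN8 equals 255 - m. */
static uint32_t
over_ca_pixel (uint32_t src, uint32_t mask, uint32_t dest)
{
    uint8_t  ia = src >> 24;    /* ALPHA_8 (src) */
    uint32_t result = 0;
    int      shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint8_t  s = src  >> shift;
        uint8_t  m = mask >> shift;
        uint8_t  d = dest >> shift;
        uint8_t  c = mul_un8 (s, m);         /* UN8x4_MUL_UN8x4 (s, m)    */
        uint8_t  i = 255 - mul_un8 (m, ia);  /* UN8x4_MUL_UN8 (m, ia); ~m */
        uint32_t t = c + mul_un8 (d, i);     /* multiply-add, saturated   */

        result |= (t > 255 ? 255 : t) << shift;
    }
    return result;
}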
Running "lowlevel-blt-bench -n over_8888_8888" on POWER8, 8 cores,
3.4GHz, RHEL 7.2 ppc64le gave the following results:
reference memcpy speed = 23475.4MB/s (5868.8MP/s for 32bpp fills)
              Before   After    Change
  -------------------------------------
  L1          244.97   474.05   +93.51%
  L2          243.74   473.05   +94.08%
  M           243.29   467.16   +92.02%
  HT          144.03   252.79   +75.51%
  VT          174.24   279.03   +60.14%
  R           109.86   149.98   +36.52%
  RT           47.96    53.18   +10.88%
  Kops/s         524      576    +9.92%
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Acked-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
-rw-r--r--   pixman/pixman-vmx.c   52
1 file changed, 21 insertions(+), 31 deletions(-)
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 4105dee1..a5c5db98 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2746,7 +2746,7 @@ vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
 {
     PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t src;
+    uint32_t src, ia;
     uint32_t *dst_line, d;
     uint32_t *mask_line, m;
     uint32_t pack_cmp;
@@ -2754,9 +2754,6 @@ vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 
     vector unsigned int vsrc, valpha, vmask, vdest;
 
-    vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
-    vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
-
     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
@@ -2767,31 +2764,33 @@ vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (
         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
-    vsrc = unpacklo_128_16x8(create_mask_1x32_128 (&src),
-                             (vector unsigned int) AVV(0));
-
-    valpha = expand_alpha_1x128(vsrc);
+    vsrc = (vector unsigned int) {src, src, src, src};
+    valpha = splat_alpha(vsrc);
+    ia = ALPHA_8 (src);
 
     while (height--)
     {
         int w = width;
         const uint32_t *pm = (uint32_t *)mask_line;
         uint32_t *pd = (uint32_t *)dst_line;
+        uint32_t s;
 
         dst_line += dst_stride;
         mask_line += mask_stride;
 
         while (w && (uintptr_t)pd & 15)
         {
+            s = src;
             m = *pm++;
 
             if (m)
             {
                 d = *pd;
-                vmask = unpack_32_1x128(m);
-                vdest = unpack_32_1x128(d);
-
-                *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+                UN8x4_MUL_UN8x4 (s, m);
+                UN8x4_MUL_UN8 (m, ia);
+                m = ~m;
+                UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
+                *pd = d;
             }
 
             pd++;
@@ -2801,28 +2800,17 @@ vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
         while (w >= 4)
         {
             /* pm is NOT necessarily 16-byte aligned */
-            vmx_mask = load_128_unaligned (pm);
+            vmask = load_128_unaligned (pm);
 
-            pack_cmp = vec_all_eq(vmx_mask, (vector unsigned int) AVV(0));
+            pack_cmp = vec_all_eq(vmask, (vector unsigned int) AVV(0));
 
             /* if all bits in mask are zero, pack_cmp is not 0 */
             if (pack_cmp == 0)
             {
                 /* pd is 16-byte aligned */
-                vmx_dst = load_128_aligned (pd);
-
-                unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
-                                  &vmx_mask_lo, &vmx_mask_hi);
+                vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd));
 
-                unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
-                                  &vmx_dst_lo, &vmx_dst_hi);
-
-                in_over_2x128 (&vsrc, &vsrc,
-                               &valpha, &valpha,
-                               &vmx_mask_lo, &vmx_mask_hi,
-                               &vmx_dst_lo, &vmx_dst_hi);
-
-                save_128_aligned(pd, pack_2x128_128(vmx_dst_lo, vmx_dst_hi));
+                save_128_aligned(pd, vdest);
             }
 
             pd += 4;
@@ -2832,15 +2820,17 @@ vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
         while (w)
         {
+            s = src;
             m = *pm++;
 
             if (m)
             {
                 d = *pd;
-                vmask = unpack_32_1x128(m);
-                vdest = unpack_32_1x128(d);
-
-                *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+                UN8x4_MUL_UN8x4 (s, m);
+                UN8x4_MUL_UN8 (m, ia);
+                m = ~m;
+                UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
+                *pd = d;
             }
 
             pd++;
             w--;
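As a side note on the four-pixel path: splat_alpha only has to replicate the alpha byte of each 32-bit lane across that lane, which a single vec_perm can do. Below is a minimal sketch assuming a8r8g8b8 pixels on little-endian ppc64le (alpha at byte 4*N + 3 of lane N); splat_alpha_le is a hypothetical stand-in, not the exact helper from pixman-vmx.c, which also covers the big-endian layout:

#include <altivec.h>

/* Broadcast the alpha byte of each 32-bit pixel into all four bytes
 * of its lane, e.g. 0xAARRGGBB -> 0xAAAAAAAA per lane.  On little-
 * endian ppc64le, the alpha of lane N sits at byte 4*N + 3. */
static inline vector unsigned int
splat_alpha_le (vector unsigned int pix)
{
    const vector unsigned char sel = {
        0x03, 0x03, 0x03, 0x03,   /* lane 0: replicate byte 3  */
        0x07, 0x07, 0x07, 0x07,   /* lane 1: replicate byte 7  */
        0x0b, 0x0b, 0x0b, 0x0b,   /* lane 2: replicate byte 11 */
        0x0f, 0x0f, 0x0f, 0x0f    /* lane 3: replicate byte 15 */
    };
    return vec_perm (pix, pix, sel);
}

The vec_all_eq early-out in the hunk above works because the intrinsic returns nonzero only when every element of vmask equals zero, so fully transparent four-pixel groups skip the load/blend/store entirely.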