diff options
author | Oded Gabbay <oded.gabbay@gmail.com> | 2015-06-28 13:17:41 +0300 |
---|---|---|
committer | Oded Gabbay <oded.gabbay@gmail.com> | 2015-08-25 18:35:57 +0300 |
commit | d35093eff4280350b230fb5c9ead08e45f06e9a4 (patch) | |
tree | 7104433d4e2ae07735009b64e6eeb76e23d583b2 | |
parent | 6577035966cb0c5a8a25f61d6bc7cb11c6ce60e7 (diff) |
vmx: implement fast path vmx_composite_over_n_8_8888
POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
Before After Change
---------------------------------------------
L1 90.21 133.21 +47.67%
L2 94.91 132.95 +40.08%
M 95.49 132.53 +38.79%
HT 88.07 100.43 +14.03%
VT 86.65 112.45 +29.77%
R 82.77 96.25 +16.29%
RT 65.64 55.14 -16.00%
Kops/s 673 580 -13.82%
Cairo trimmed benchmarks:
Speedups
========
t-firefox-asteroids 533.92 -> 495.51 : 1.08x
Slowdowns
=========
t-poppler 364.99 -> 393.72 : 1.08x
t-firefox-canvas-alpha 984.55 -> 1197.85 : 1.22x
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
-rw-r--r-- | pixman/pixman-vmx.c | 139 |
1 file changed, 139 insertions, 0 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 2da204f..2a990b4 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -659,6 +659,19 @@ expand_pixel_8_1x128 (uint8_t data) } static force_inline vector unsigned int +expand_pixel_32_1x128 (uint32_t data) +{ + vector unsigned int vdata; + + vdata = unpack_32_1x128 (data); + + return vec_perm (vdata, vdata, + (vector unsigned char)AVV ( + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F)); +} + +static force_inline vector unsigned int expand_alpha_1x128 (vector unsigned int data) { #ifdef WORDS_BIGENDIAN @@ -2685,6 +2698,128 @@ vmx_combine_add_ca (pixman_implementation_t *imp, } } +static void +vmx_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m, d; + + vector unsigned int vsrc, valpha, vmask; + + vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi; + vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + vmask = create_mask_1x32_128 (&src); + vsrc = expand_pixel_32_1x128 (src); + valpha = expand_alpha_1x128 (vsrc); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + vmx_mask = expand_pixel_8_1x128 (m); + vmx_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 (in_over (vsrc, + valpha, + vmx_mask, + vmx_dst)); + } + + w--; + dst++; + } + + while (w >= 4) + { + m = 
*((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned(dst, vmask); + } + else if (m) + { + vmx_dst = load_128_aligned (dst); + + vmx_mask = unpack_32_1x128 (m); + vmx_mask = unpacklo_128_16x8 (vmx_mask, + (vector unsigned int) AVV(0)); + + /* Unpacking */ + unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0), + &vmx_dst_lo, &vmx_dst_hi); + + unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0), + &vmx_mask_lo, &vmx_mask_hi); + + expand_alpha_rev_2x128 (vmx_mask_lo, vmx_mask_hi, + &vmx_mask_lo, &vmx_mask_hi); + + in_over_2x128 (&vsrc, &vsrc, + &valpha, &valpha, + &vmx_mask_lo, &vmx_mask_hi, + &vmx_dst_lo, &vmx_dst_hi); + + save_128_aligned(dst, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi)); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + vmx_mask = expand_pixel_8_1x128 (m); + vmx_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 (in_over (vsrc, + valpha, + vmx_mask, + vmx_dst)); + } + + w--; + dst++; + } + } + +} + static pixman_bool_t vmx_fill (pixman_implementation_t *imp, uint32_t * bits, @@ -3368,6 +3503,10 @@ static const pixman_fast_path_t vmx_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, 
a8b8g8r8, vmx_composite_over_n_8888_8888_ca), |