diff options
author | Oded Gabbay <oded.gabbay@gmail.com> | 2015-06-28 13:17:41 +0300 |
---|---|---|
committer | Oded Gabbay <oded.gabbay@gmail.com> | 2015-09-18 10:07:08 +0300 |
commit | 6b1b8b2b90da11bf6101a151786b2a8c9f087338 (patch) | |
tree | 85e59c257fca729c8151a67d6050eb7968584a51 | |
parent | 8d8caa55a38c00351047d24322e23b201b6b29ff (diff) |
vmx: implement fast path vmx_composite_over_n_8_8888
POWER8, 8 cores, 3.4GHz, RHEL 7.2 ppc64le.
reference memcpy speed = 25008.9MB/s (6252.2MP/s for 32bpp fills)
         Before     After      Change
---------------------------------------------
L1        91.32    182.84    +100.22%
L2        94.94    182.83     +92.57%
M         95.55    181.51     +89.96%
HT        88.96    162.09     +82.21%
VT         87.4    168.35     +92.62%
R         83.37    146.23     +75.40%
RT         66.4      91.5     +37.80%
Kops/s      683       859     +25.77%
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Acked-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
-rw-r--r-- | pixman/pixman-vmx.c | 111 |
1 file changed, 111 insertions, 0 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index a5c5db98..7ef8bedc 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -60,6 +60,15 @@ splat_alpha (vector unsigned int pix)
 }
 
 static force_inline vector unsigned int
+splat_pixel (vector unsigned int pix)
+{
+    return vec_perm (pix, pix,
+                     (vector unsigned char)AVV (
+                         0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
+                         0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03));
+}
+
+static force_inline vector unsigned int
 pix_multiply (vector unsigned int p, vector unsigned int a)
 {
     vector unsigned short hi, lo, mod;
@@ -2508,6 +2517,104 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     }
 }
 
+static void
+vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m, d, s, ia;
+
+    vector unsigned int vsrc, valpha, vmask, vdst;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = ALPHA_8(src);
+    if (src == 0)
+        return;
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = (vector unsigned int) {src, src, src, src};
+    valpha = splat_alpha(vsrc);
+
+    while (height--)
+    {
+        const uint8_t *pm = mask_line;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask_line += mask_stride;
+        w = width;
+
+        while (w && (uintptr_t)dst & 15)
+        {
+            s = src;
+            m = *pm++;
+
+            if (m)
+            {
+                d = *dst;
+                UN8x4_MUL_UN8 (s, m);
+                ia = ALPHA_8 (~s);
+                UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+                *dst = d;
+            }
+
+            w--;
+            dst++;
+        }
+
+        while (w >= 4)
+        {
+            m = *((uint32_t*)pm);
+
+            if (srca == 0xff && m == 0xffffffff)
+            {
+                save_128_aligned(dst, vsrc);
+            }
+            else if (m)
+            {
+                vmask = splat_pixel((vector unsigned int) {m, m, m, m});
+
+                /* dst is 16-byte aligned */
+                vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst));
+
+                save_128_aligned(dst, vdst);
+            }
+
+            w -= 4;
+            dst += 4;
+            pm += 4;
+        }
+
+        while (w)
+        {
+            s = src;
+            m = *pm++;
+
+            if (m)
+            {
+                d = *dst;
+                UN8x4_MUL_UN8 (s, m);
+                ia = ALPHA_8 (~s);
+                UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+                *dst = d;
+            }
+
+            w--;
+            dst++;
+        }
+    }
+
+}
+
 static pixman_bool_t
 vmx_fill (pixman_implementation_t *imp,
           uint32_t * bits,
@@ -3028,6 +3135,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),