summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOded Gabbay <oded.gabbay@gmail.com>2015-06-28 13:17:41 +0300
committerOded Gabbay <oded.gabbay@gmail.com>2015-08-25 18:35:57 +0300
commitd35093eff4280350b230fb5c9ead08e45f06e9a4 (patch)
tree7104433d4e2ae07735009b64e6eeb76e23d583b2
parent6577035966cb0c5a8a25f61d6bc7cb11c6ce60e7 (diff)
vmx: implement fast path vmx_composite_over_n_8_8888
POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le. reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills) Before After Change --------------------------------------------- L1 90.21 133.21 +47.67% L2 94.91 132.95 +40.08% M 95.49 132.53 +38.79% HT 88.07 100.43 +14.03% VT 86.65 112.45 +29.77% R 82.77 96.25 +16.29% RT 65.64 55.14 -16.00% Kops/s 673 580 -13.82% cairo trimmed benchmarks : Speedups ======== t-firefox-asteroids 533.92 -> 495.51 : 1.08x Slowdowns ========= t-poppler 364.99 -> 393.72 : 1.08x t-firefox-canvas-alpha 984.55 -> 1197.85 : 1.22x Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
-rw-r--r--pixman/pixman-vmx.c139
1 files changed, 139 insertions, 0 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 2da204f..2a990b4 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -659,6 +659,19 @@ expand_pixel_8_1x128 (uint8_t data)
}
static force_inline vector unsigned int
+expand_pixel_32_1x128 (uint32_t data)
+{
+ vector unsigned int vdata;
+
+ vdata = unpack_32_1x128 (data);
+
+ return vec_perm (vdata, vdata,
+ (vector unsigned char)AVV (
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F));
+}
+
+static force_inline vector unsigned int
expand_alpha_1x128 (vector unsigned int data)
{
#ifdef WORDS_BIGENDIAN
@@ -2685,6 +2698,128 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
}
}
+static void
+vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m, d;
+
+ vector unsigned int vsrc, valpha, vmask;
+
+ vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
+ vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ vmask = create_mask_1x32_128 (&src);
+ vsrc = expand_pixel_32_1x128 (src);
+ valpha = expand_alpha_1x128 (vsrc);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (uintptr_t)dst & 15)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ vmx_mask = expand_pixel_8_1x128 (m);
+ vmx_dst = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (in_over (vsrc,
+ valpha,
+ vmx_mask,
+ vmx_dst));
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ m = *((uint32_t*)mask);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned(dst, vmask);
+ }
+ else if (m)
+ {
+ vmx_dst = load_128_aligned (dst);
+
+ vmx_mask = unpack_32_1x128 (m);
+ vmx_mask = unpacklo_128_16x8 (vmx_mask,
+ (vector unsigned int) AVV(0));
+
+ /* Unpacking */
+ unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
+ &vmx_mask_lo, &vmx_mask_hi);
+
+ expand_alpha_rev_2x128 (vmx_mask_lo, vmx_mask_hi,
+ &vmx_mask_lo, &vmx_mask_hi);
+
+ in_over_2x128 (&vsrc, &vsrc,
+ &valpha, &valpha,
+ &vmx_mask_lo, &vmx_mask_hi,
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ save_128_aligned(dst, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ vmx_mask = expand_pixel_8_1x128 (m);
+ vmx_dst = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (in_over (vsrc,
+ valpha,
+ vmx_mask,
+ vmx_dst));
+ }
+
+ w--;
+ dst++;
+ }
+ }
+
+}
+
static pixman_bool_t
vmx_fill (pixman_implementation_t *imp,
uint32_t * bits,
@@ -3368,6 +3503,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),