author     Oded Gabbay <oded.gabbay@gmail.com>    2015-06-28 10:14:20 +0300
committer  Oded Gabbay <oded.gabbay@gmail.com>    2015-07-16 16:13:35 +0300
commit     fafc1d403b8405727d3918bcb605cb98044af90a (patch)
tree       e0002471a86f97d32f59d21cbf30f1255abdb93b
parent     a3e914407e354df70b9200e263608f1fc2e686cf (diff)
vmx: implement fast path vmx_composite_over_n_8888_8888_ca
It was benchmarked against commit id 2be523b from pixman/master, on
POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.

reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)

            Before      After       Change
          ---------------------------------------------
L1          61.92       244.91      +295.53%
L2          62.74       243.3       +287.79%
M           63.03       241.94      +283.85%
HT          59.91       144.22      +140.73%
VT          59.4        174.39      +193.59%
R           53.6        111.37      +107.78%
RT          37.99        46.38       +22.08%
Kops/s     436          506          +16.06%

cairo trimmed benchmarks:

Speedups
========
t-xfce4-terminal-a1    1540.37 -> 1226.14 :  1.26x
t-firefox-talos-gfx    1488.59 -> 1209.19 :  1.23x

Slowdowns
=========
t-evolution             553.88 ->  581.63 :  1.05x
t-poppler               364.99 ->  383.79 :  1.05x
t-firefox-scrolling    1223.65 -> 1304.34 :  1.07x

The slowdowns occur in cases where the images are small and not aligned
to a 16-byte boundary.  In that case, the function first works on the
un-aligned area one pixel at a time before reaching the vectorized
inner loop.  For small images, the overhead of these scalar operations
can outweigh the savings gained from using the vmx instructions on the
aligned part of the image.  The C fast-path implementation has no
special treatment for the un-aligned part: it simply works in 4-byte
quantities over the entire image.

Because llbb is a synthetic test, I would assume it has far fewer
alignment issues than "real-world" scenarios such as the cairo
benchmarks, which are essentially recorded traces of real application
activity.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
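To make the alignment argument above concrete, here is a minimal sketch
contrasting the two loop shapes.  It is not pixman code: scalar_over(),
over_row_c() and over_row_simd_shape() are hypothetical stand-ins, and
the "vector" body is written as a plain inner loop only to mark where
the real VMX loads, blends and stores would go.

/*
 * Minimal sketch, NOT pixman's actual code.  scalar_over() is a
 * hypothetical stand-in for the per-pixel OVER operator; it exists
 * only so the loop structure compiles.
 */
#include <stdint.h>

static uint32_t
scalar_over (uint32_t src, uint32_t mask, uint32_t dst)
{
    /* placeholder blend: a real implementation combines per channel */
    return mask ? src : dst;
}

/* Generic C fast path: one flat loop, one 4-byte pixel per step,
 * no alignment handling at all. */
static void
over_row_c (uint32_t src, const uint32_t *pm, uint32_t *pd, int w)
{
    while (w--)
    {
        *pd = scalar_over (src, *pm++, *pd);
        pd++;
    }
}

/* SIMD-style path: scalar head until pd reaches a 16-byte boundary,
 * a body handling 4 pixels per iteration, then a scalar tail.  When
 * the row is short or badly aligned, the head and tail dominate and
 * never amortize, which is where the slowdowns above come from. */
static void
over_row_simd_shape (uint32_t src, const uint32_t *pm, uint32_t *pd, int w)
{
    int i;

    while (w && (uintptr_t)pd & 15)   /* un-aligned head, 1 pixel at a time */
    {
        *pd = scalar_over (src, *pm++, *pd);
        pd++;
        w--;
    }

    while (w >= 4)                    /* aligned body, 4 pixels per iteration */
    {
        /* a real path would use vector loads/blends/stores here */
        for (i = 0; i < 4; i++)
            pd[i] = scalar_over (src, pm[i], pd[i]);
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w--)                       /* tail, 1 pixel at a time */
    {
        *pd = scalar_over (src, *pm++, *pd);
        pd++;
    }
}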
-rw-r--r--  pixman/pixman-vmx.c  | 112
1 file changed, 112 insertions, 0 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 641c487..47393dc 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2715,6 +2715,114 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
}
static void
+vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ vector unsigned int vsrc, valpha, vmask, vdest;
+
+ vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
+ vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ vsrc = unpacklo_128_16x8(create_mask_1x32_128 (&src),
+ (vector unsigned int) AVV(0));
+
+ valpha = expand_alpha_1x128(vsrc);
+
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (uintptr_t)pd & 15)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ vmask = unpack_32_1x128(m);
+ vdest = unpack_32_1x128(d);
+
+ *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ /* pm is NOT necessarily 16-byte aligned */
+ vmx_mask = load_128_unaligned (pm);
+
+ pack_cmp = vec_all_eq(vmx_mask, (vector unsigned int) AVV(0));
+
+ /* if all bits in mask are zero, pack_cmp is not 0 */
+ if (pack_cmp == 0)
+ {
+ /* pd is 16-byte aligned */
+ vmx_dst = load_128_aligned (pd);
+
+ unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
+ &vmx_mask_lo, &vmx_mask_hi);
+
+ unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ in_over_2x128 (&vsrc, &vsrc,
+ &valpha, &valpha,
+ &vmx_mask_lo, &vmx_mask_hi,
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ save_128_aligned(pd, pack_2x128_128(vmx_dst_lo, vmx_dst_hi));
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ vmask = unpack_32_1x128(m);
+ vdest = unpack_32_1x128(d);
+
+ *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+ }
+
+ pd++;
+ w--;
+ }
+ }
+}
+
+static void
vmx_composite_add_8_8 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
{
@@ -2796,6 +2904,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
/* PIXMAN_OP_ADD */
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),