summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Siarhei Siamashka <siarhei.siamashka@gmail.com> 2015-09-04 15:39:00 +0300
committer: Oded Gabbay <oded.gabbay@gmail.com> 2015-09-29 14:21:46 +0300
commit: 90e62c086766afffd289a321c7de8ea4b5cac87d (patch)
tree: 2fb983f7e160e5066b031813338ea30fdb011c55
parent: 2876d8d3dd6a71cb9eb3ac93e5b9c18b71a452da (diff)
vmx: implement fast path vmx_composite_over_n_8888
Running "lowlevel-blt-bench over_n_8888" on Playstation3 3.2GHz,
Gentoo ppc (32-bit userland) gave the following results:

before: over_n_8888 = L1: 147.47 L2: 205.86 M: 121.07
after:  over_n_8888 = L1: 287.27 L2: 261.09 M: 133.48

Cairo non-trimmed benchmarks on POWER8, 3.4GHz 8 Cores:

ocitysmap          659.69 ->  611.71 : 1.08x speedup
xfce4-terminal-a1 2725.22 -> 2547.47 : 1.07x speedup

Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
-rw-r--r-- pixman/pixman-vmx.c 54
1 files changed, 54 insertions, 0 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 3eaa866..41efdcf 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2628,6 +2628,58 @@ vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
}
static void
+vmx_composite_over_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t src, ia;
+ int i, w, dst_stride;
+ vector unsigned int vdst, vsrc, via;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ vsrc = (vector unsigned int){src, src, src, src};
+ via = negate (splat_alpha (vsrc));
+ ia = ALPHA_8 (~src);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((uintptr_t)dst & 15))
+ {
+ uint32_t d = *dst;
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
+ *dst++ = d;
+ w--;
+ }
+
+ for (i = w / 4; i > 0; i--)
+ {
+ vdst = pix_multiply (load_128_aligned (dst), via);
+ save_128_aligned (dst, pix_add (vsrc, vdst));
+ dst += 4;
+ }
+
+ for (i = w % 4; --i >= 0;)
+ {
+ uint32_t d = dst[i];
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
+ dst[i] = d;
+ }
+ }
+}
+
+static void
vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
{
@@ -2936,6 +2988,8 @@ FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
static const pixman_fast_path_t vmx_fast_paths[] =
{
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, vmx_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, vmx_composite_over_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),