summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOded Gabbay <oded.gabbay@gmail.com>2015-06-28 09:42:19 +0300
committerOded Gabbay <oded.gabbay@gmail.com>2015-07-16 16:13:35 +0300
commit0cc8a2e9714efcb7cdd7e2a94c9cba49c3e29e00 (patch)
tree82197e7adb8fdecfa9bdded888993cad105649f8
parentc12ee95089e7d281a29a24bf56b81f5c16dec6ee (diff)
vmx: implement fast path vmx_fill
Based on sse2 impl. It was benchmarked against commid id e2d211a from pixman/master Tested cairo trimmed benchmarks on POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le : speedups ======== t-swfdec-giant-steps 1383.09 -> 718.63 : 1.92x speedup t-gnome-system-monitor 1403.53 -> 918.77 : 1.53x speedup t-evolution 552.34 -> 415.24 : 1.33x speedup t-xfce4-terminal-a1 1573.97 -> 1351.46 : 1.16x speedup t-firefox-paintball 847.87 -> 734.50 : 1.15x speedup t-firefox-asteroids 565.99 -> 492.77 : 1.15x speedup t-firefox-canvas-swscroll 1656.87 -> 1447.48 : 1.14x speedup t-midori-zoomed 724.73 -> 642.16 : 1.13x speedup t-firefox-planet-gnome 975.78 -> 911.92 : 1.07x speedup t-chromium-tabs 292.12 -> 274.74 : 1.06x speedup t-firefox-chalkboard 690.78 -> 653.93 : 1.06x speedup t-firefox-talos-gfx 1375.30 -> 1303.74 : 1.05x speedup t-firefox-canvas-alpha 1016.79 -> 967.24 : 1.05x speedup Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
-rw-r--r--pixman/pixman-vmx.c153
1 files changed, 153 insertions, 0 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 39d1a06..61fdb80 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2537,6 +2537,157 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
}
}
+static pixman_bool_t
+vmx_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t filler)
+{
+ uint32_t byte_width;
+ uint8_t *byte_line;
+
+ vector unsigned int vfiller;
+
+ if (bpp == 8)
+ {
+ uint8_t b;
+ uint16_t w;
+
+ stride = stride * (int) sizeof (uint32_t) / 1;
+ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+ byte_width = width;
+ stride *= 1;
+
+ b = filler & 0xff;
+ w = (b << 8) | b;
+ filler = (w << 16) | w;
+ }
+ else if (bpp == 16)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 2;
+ byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+ byte_width = 2 * width;
+ stride *= 2;
+
+ filler = (filler & 0xffff) * 0x00010001;
+ }
+ else if (bpp == 32)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 4;
+ byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+ byte_width = 4 * width;
+ stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
+
+ vfiller = create_mask_1x32_128(&filler);
+
+ while (height--)
+ {
+ int w;
+ uint8_t *d = byte_line;
+ byte_line += stride;
+ w = byte_width;
+
+ if (w >= 1 && ((uintptr_t)d & 1))
+ {
+ *(uint8_t *)d = filler;
+ w -= 1;
+ d += 1;
+ }
+
+ while (w >= 2 && ((uintptr_t)d & 3))
+ {
+ *(uint16_t *)d = filler;
+ w -= 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((uintptr_t)d & 15))
+ {
+ *(uint32_t *)d = filler;
+
+ w -= 4;
+ d += 4;
+ }
+
+ while (w >= 128)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+ vec_st(vfiller, 0, (uint32_t *) d + 8);
+ vec_st(vfiller, 0, (uint32_t *) d + 12);
+ vec_st(vfiller, 0, (uint32_t *) d + 16);
+ vec_st(vfiller, 0, (uint32_t *) d + 20);
+ vec_st(vfiller, 0, (uint32_t *) d + 24);
+ vec_st(vfiller, 0, (uint32_t *) d + 28);
+
+ d += 128;
+ w -= 128;
+ }
+
+ if (w >= 64)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+ vec_st(vfiller, 0, (uint32_t *) d + 8);
+ vec_st(vfiller, 0, (uint32_t *) d + 12);
+
+ d += 64;
+ w -= 64;
+ }
+
+ if (w >= 32)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+
+ d += 32;
+ w -= 32;
+ }
+
+ if (w >= 16)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+
+ d += 16;
+ w -= 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = filler;
+
+ w -= 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = filler;
+ w -= 2;
+ d += 2;
+ }
+
+ if (w >= 1)
+ {
+ *(uint8_t *)d = filler;
+ w -= 1;
+ d += 1;
+ }
+ }
+
+ return TRUE;
+}
+
static const pixman_fast_path_t vmx_fast_paths[] =
{
{ PIXMAN_OP_NONE },
@@ -2582,5 +2733,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
+ imp->fill = vmx_fill;
+
return imp;
}