summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOded Gabbay <oded.gabbay@gmail.com>2015-07-01 14:34:07 +0300
committerOded Gabbay <oded.gabbay@gmail.com>2015-07-16 16:13:35 +0300
commit8d9be3619a906855a3e3a1e052317833cb24cabe (patch)
treeda49f02bb1506d6ecf2b2651189b70ebce555f1e
parent47f74ca94637d79ee66c37a81eea0200e453fcc1 (diff)
vmx: implement fast path iterator vmx_fetch_a8
No changes were observed when running cairo trimmed benchmarks. Running "lowlevel-blt-bench src_8_8888" on POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le gave the following results: reference memcpy speed = 25197.2MB/s (6299.3MP/s for 32bpp fills) Before After Change -------------------------------------------- L1 965.34 3936 +307.73% L2 942.99 3436.29 +264.40% M 902.24 2757.77 +205.66% HT 448.46 784.99 +75.04% VT 430.05 819.78 +90.62% R 412.9 717.04 +73.66% RT 168.93 220.63 +30.60% Kops/s 1025 1303 +27.12% It was benchmarked against commit id e2d211a from pixman/master. Siarhei Siamashka reported that on PlayStation 3, it shows the following results: == before == src_8_8888 = L1: 194.37 L2: 198.46 M:155.90 (148.35%) HT: 59.18 VT: 36.71 R: 38.93 RT: 12.79 ( 106Kops/s) == after == src_8_8888 = L1: 373.96 L2: 391.10 M:245.81 (233.88%) HT: 80.81 VT: 44.33 R: 48.10 RT: 14.79 ( 122Kops/s) Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
-rw-r--r--pixman/pixman-vmx.c46
1 files changed, 46 insertions, 0 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 773ad76..a9bd024 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -3139,6 +3139,49 @@ vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
return iter->buffer;
}
+/*
+ * Expand iter->width 8-bit source values, read from iter->bits, into 32-bit
+ * values written to iter->buffer. Each source byte ends up in bits 31..24 of
+ * its output word and the low 24 bits are zero.
+ *
+ * iter: iterator supplying width, buffer, bits and stride. iter->bits is
+ *       advanced by iter->stride before returning.
+ * mask: unused by this fetcher.
+ *
+ * Returns iter->buffer.
+ */
+static uint32_t *
+vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
+
+    /* Step the source pointer by one stride for the next call
+     * (presumably one scanline; stride units are set up outside this view). */
+    iter->bits += iter->stride;
+
+    /* Scalar head: convert single values until dst is 16-byte aligned, so
+     * the vector loop below can use aligned stores. */
+    while (w && (((uintptr_t)dst) & 15))
+    {
+	/* Widen to uint32_t before shifting: a promoted int shifted left by
+	 * 24 would overflow into the sign bit for values >= 0x80. */
+	*dst++ = (uint32_t) *(src++) << 24;
+	w--;
+    }
+
+    /* Vector body: 16 source bytes in, 16 output words (4 vectors) out. */
+    while (w >= 16)
+    {
+	/* src carries no alignment guarantee here, hence the unaligned load. */
+	vmx0 = load_128_unaligned((uint32_t *) src);
+
+	/* Two rounds of unpacking against an all-zero vector widen the data
+	 * 8 -> 16 -> 32 bits. NOTE(review): the helpers are defined outside
+	 * this view; presumably they place each source byte in the top byte
+	 * of its 32-bit lane, matching the scalar "<< 24" path. */
+	unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
+	unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
+	unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);
+
+	/* The results are stored in reverse order (vmx6 first). NOTE(review):
+	 * presumably this follows the high/low output order of the unpack
+	 * helpers; confirm against their definitions. */
+	save_128_aligned(dst, vmx6);
+	save_128_aligned((dst + 4), vmx5);
+	save_128_aligned((dst + 8), vmx4);
+	save_128_aligned((dst + 12), vmx3);
+
+	dst += 16;
+	src += 16;
+	w -= 16;
+    }
+
+    /* Scalar tail: fewer than 16 values remain. */
+    while (w)
+    {
+	*dst++ = (uint32_t) *(src++) << 24;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
#define IMAGE_FLAGS \
(FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
@@ -3148,6 +3191,9 @@ static const pixman_iter_info_t vmx_iters[] =
{ PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
_pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
},
+ { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+ _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
+ },
{ PIXMAN_null },
};