diff options
author | Oded Gabbay <oded.gabbay@gmail.com> | 2015-07-01 14:34:07 +0300 |
---|---|---|
committer | Oded Gabbay <oded.gabbay@gmail.com> | 2015-07-16 16:13:35 +0300 |
commit | 8d9be3619a906855a3e3a1e052317833cb24cabe (patch) | |
tree | da49f02bb1506d6ecf2b2651189b70ebce555f1e | |
parent | 47f74ca94637d79ee66c37a81eea0200e453fcc1 (diff) |
vmx: implement fast path iterator vmx_fetch_a8
No changes were observed when running the cairo trimmed benchmarks.
Running "lowlevel-blt-bench src_8_8888" on POWER8, 8 cores,
3.4GHz, RHEL 7.1 ppc64le gave the following results:
reference memcpy speed = 25197.2MB/s (6299.3MP/s for 32bpp fills)
Before After Change
--------------------------------------------
L1 965.34 3936 +307.73%
L2 942.99 3436.29 +264.40%
M 902.24 2757.77 +205.66%
HT 448.46 784.99 +75.04%
VT 430.05 819.78 +90.62%
R 412.9 717.04 +73.66%
RT 168.93 220.63 +30.60%
Kops/s 1025 1303 +27.12%
It was benchmarked against commit id e2d211a from pixman/master.
Siarhei Siamashka reported that on a PlayStation 3, it shows the following
results:
== before ==
src_8_8888 = L1: 194.37 L2: 198.46 M:155.90 (148.35%)
HT: 59.18 VT: 36.71 R: 38.93 RT: 12.79 ( 106Kops/s)
== after ==
src_8_8888 = L1: 373.96 L2: 391.10 M:245.81 (233.88%)
HT: 80.81 VT: 44.33 R: 48.10 RT: 14.79 ( 122Kops/s)
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
-rw-r--r-- | pixman/pixman-vmx.c | 46 |
1 files changed, 46 insertions, 0 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 773ad76..a9bd024 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -3139,6 +3139,49 @@ vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) return iter->buffer; } +static uint32_t * +vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint8_t *src = iter->bits; + vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6; + + iter->bits += iter->stride; + + while (w && (((uintptr_t)dst) & 15)) + { + *dst++ = *(src++) << 24; + w--; + } + + while (w >= 16) + { + vmx0 = load_128_unaligned((uint32_t *) src); + + unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2); + unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4); + unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6); + + save_128_aligned(dst, vmx6); + save_128_aligned((dst + 4), vmx5); + save_128_aligned((dst + 8), vmx4); + save_128_aligned((dst + 12), vmx3); + + dst += 16; + src += 16; + w -= 16; + } + + while (w) + { + *dst++ = *(src++) << 24; + w--; + } + + return iter->buffer; +} + #define IMAGE_FLAGS \ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) @@ -3148,6 +3191,9 @@ static const pixman_iter_info_t vmx_iters[] = { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL }, + { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, + _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL + }, { PIXMAN_null }, }; |