diff options
author | Ben Avison <bavison@riscosopen.org> | 2015-04-17 01:01:27 +0100 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2016-04-05 00:48:36 +0300 |
commit | efbed4163eff3859abd50148a75ea037b79aa698 (patch) | |
tree | bf6242f9ffd48bac8b2e35fd6ae6bd4aeb27573d | |
parent | 767e8b4b31f35b4aa9a9b72bf913dbc13e04fde9 (diff) |
armv7: Add src_1555_8888 fast path
This is tuned for Cortex-A7 (Raspberry Pi 2).
lowlevel-blt-bench results, compared to the ARMv6 fast path:
        Before             After
        Mean     StdDev    Mean     StdDev    Confidence    Change
L1      88.6     0.2       221.3    0.5       100.0%        +149.7%
L2      88.1     0.4       219.2    0.8       100.0%        +148.9%
M       87.9     0.1       178.2    0.1       100.0%        +102.6%
HT      59.7     0.4       72.0     0.2       100.0%        +20.7%
VT      53.2     0.4       69.8     0.2       100.0%        +31.3%
R       48.5     0.3       53.6     0.1       100.0%        +10.6%
RT      21.2     0.1       23.0     0.1       100.0%        +8.5%
-rw-r--r-- | pixman/pixman-arm-neon-asm.S | 51 | ||||
-rw-r--r-- | pixman/pixman-arm-neon.c | 8 |
2 files changed, 59 insertions, 0 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3190518..5c1c30a 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -3178,6 +3178,57 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro pixman_composite_src_1555_8888_process_pixblock_head
+    /* src is in d0-d1 */
+    vshrn.i16 d31, q0, #8
+    vshrn.u16 d29, q0, #2
+    vshrn.i16 d30, q0, #7
+    vsli.u16 q0, q0, #5
+    vshr.s8 d31, d31, #7
+    vsri.8 d29, d29, #5
+    vsri.8 d30, d30, #5
+    vshrn.u16 d28, q0, #2
+.endm
+
+.macro pixman_composite_src_1555_8888_process_pixblock_tail
+    vzip.8 d29, d31
+    vzip.8 d28, d30
+    vzip.8 d28, d29
+    vzip.8 d30, d31
+    /* result is in d28-d31 */
+.endm
+
+.macro pixman_composite_src_1555_8888_process_pixblock_tail_head
+    vzip.8 d29, d31
+    vzip.8 d28, d30
+    vld1.16 {d0-d1}, [SRC]!
+    cache_preload 8, 8
+    vzip.8 d28, d29
+    vzip.8 d30, d31
+    vst1.8 {d28-d31}, [DST_W :128]!
+    vshrn.i16 d31, q0, #8
+    vshrn.u16 d29, q0, #2
+    vshrn.i16 d30, q0, #7
+    vsli.u16 q0, q0, #5
+    vshr.s8 d31, d31, #7
+    vsri.8 d29, d29, #5
+    vsri.8 d30, d30, #5
+    vshrn.u16 d28, q0, #2
+.endm
+
+generate_composite_function \
+    pixman_composite_src_1555_8888_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    6, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_1555_8888_process_pixblock_head, \
+    pixman_composite_src_1555_8888_process_pixblock_tail, \
+    pixman_composite_src_1555_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
 generate_composite_function_nearest_scanline \
     pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 9b99c75..b597b82 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -46,6 +46,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565,
                                    uint32_t, 1, uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888,
                                    uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_1555_8888,
+                                   uint16_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev,
                                    uint8_t, 3, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
@@ -282,6 +284,12 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, neon_composite_src_0565_8888),
     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, neon_composite_src_0565_8888),
     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x8r8g8b8, neon_composite_src_1555_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x8r8g8b8, neon_composite_src_1555_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x1b5g5r5, null, x8b8g8r8, neon_composite_src_1555_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, x8b8g8r8, neon_composite_src_1555_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a8r8g8b8, neon_composite_src_1555_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, a8b8g8r8, neon_composite_src_1555_8888),
     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, neon_composite_src_8888_8888),
     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, neon_composite_src_8888_8888),
     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, neon_composite_src_8888_8888),