diff options
author | Ben Avison <bavison@riscosopen.org> | 2015-09-09 21:40:05 +0100 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2016-04-05 00:48:36 +0300 |
commit | 52494369fba8f28e3fd6645b017cfd25ea7f2c00 (patch) | |
tree | b471b5286b9a87c4ea14d8d703c1560c58e9faed | |
parent | efbed4163eff3859abd50148a75ea037b79aa698 (diff) |
armv7: Add optimised untransformed scanline fetchers for r5g6b5 & a1r5g5b5
lowlevel-blt-bench results on Cortex-A7 for a couple of sample operations
that utilise these fetchers are below.
add_0565_8888:
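| | Before Mean | Before StdDev | After Mean | After StdDev | Confidence | Change |
|---|---|---|---|---|---|---|
| L1 | 75.4 | 0.38 | 147.5 | 0.90 | 100.00% | +95.7% |
| L2 | 72.3 | 0.36 | 129.3 | 0.57 | 100.00% | +79.0% |
| M | 64.4 | 0.05 | 94.6 | 0.90 | 100.00% | +46.8% |
| HT | 35.8 | 0.03 | 42.3 | 0.26 | 100.00% | +18.1% |
| VT | 29.9 | 0.04 | 34.3 | 0.31 | 100.00% | +14.5% |
| R | 26.1 | 0.02 | 28.6 | 0.11 | 100.00% | +9.4% |
| RT | 12.2 | 0.06 | 13.1 | 0.15 | 100.00% | +7.9% |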
add_1555_8888:
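| | Before Mean | Before StdDev | After Mean | After StdDev | Confidence | Change |
|---|---|---|---|---|---|---|
| L1 | 73.3 | 0.38 | 160.7 | 0.89 | 100.00% | +119.2% |
| L2 | 69.8 | 0.08 | 139.1 | 0.74 | 100.00% | +99.4% |
| M | 62.2 | 0.03 | 100.4 | 0.76 | 100.00% | +61.4% |
| HT | 35.1 | 0.03 | 42.9 | 0.42 | 100.00% | +22.1% |
| VT | 29.5 | 0.03 | 34.7 | 0.33 | 100.00% | +17.8% |
| R | 25.8 | 0.02 | 28.7 | 0.27 | 100.00% | +11.4% |
| RT | 12.1 | 0.02 | 13.2 | 0.15 | 100.00% | +8.5% |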
---
For the record, I tried writing an a8 fetcher, but benchmarking indicated that
it couldn't improve upon the ARMv6 a8 fetcher results.
I also tried adding prefetch to the above fetchers - since they are the
first iterator in a chain and won't benefit from write-allocate caches, you
might think that this would help. Benchmarking indicated otherwise.
-rw-r--r-- | pixman/pixman-arm-neon-asm.S | 20 | ||||
-rw-r--r-- | pixman/pixman-arm-neon.c | 13 |
2 files changed, 33 insertions, 0 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index 5c1c30a..f48f773 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -496,6 +496,16 @@ generate_composite_function \ pixman_composite_src_0565_8888_process_pixblock_tail, \ pixman_composite_src_0565_8888_process_pixblock_tail_head +generate_composite_function_single_scanline \ + pixman_get_scanline_r5g6b5_asm_neon, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_8888_process_pixblock_head, \ + pixman_composite_src_0565_8888_process_pixblock_tail, \ + pixman_composite_src_0565_8888_process_pixblock_tail_head + /******************************************************************************/ .macro pixman_composite_add_8_8_process_pixblock_head @@ -3227,6 +3237,16 @@ generate_composite_function \ pixman_composite_src_1555_8888_process_pixblock_tail, \ pixman_composite_src_1555_8888_process_pixblock_tail_head +generate_composite_function_single_scanline \ + pixman_get_scanline_a1r5g5b5_asm_neon, 16, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_1555_8888_process_pixblock_head, \ + pixman_composite_src_1555_8888_process_pixblock_tail, \ + pixman_composite_src_1555_8888_process_pixblock_tail_head + /******************************************************************************/ generate_composite_function_nearest_scanline \ diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index b597b82..d4675f1 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -165,6 +165,9 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OV PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD, uint32_t, uint32_t) +PIXMAN_ARM_BIND_GET_SCANLINE (neon, r5g6b5) 
+PIXMAN_ARM_BIND_GET_SCANLINE (neon, a1r5g5b5) + void pixman_composite_src_n_8_asm_neon (int32_t w, int32_t h, @@ -438,6 +441,15 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = { PIXMAN_OP_NONE }, }; +static const pixman_iter_info_t arm_neon_iters[] = +{ + PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (neon, r5g6b5), + + PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (neon, a1r5g5b5), + + { PIXMAN_null }, +}; + void pixman_composite_scanline_src_mask_asm_neon (int32_t w, uint32_t *dst, @@ -481,6 +493,7 @@ _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback) imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u; imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u; + imp->iter_info = arm_neon_iters; imp->blt = arm_neon_blt; imp->fill = arm_neon_fill; |