summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ben Avison <bavison@riscosopen.org> 2015-09-09 21:40:05 +0100
committer: Siarhei Siamashka <siarhei.siamashka@gmail.com> 2016-04-05 00:48:36 +0300
commit: 52494369fba8f28e3fd6645b017cfd25ea7f2c00 (patch)
tree: b471b5286b9a87c4ea14d8d703c1560c58e9faed
parent: efbed4163eff3859abd50148a75ea037b79aa698 (diff)
armv7: Add optimised untransformed scanline fetchers r5g6b5 & a1r5g5b5
lowlevel-blt-bench results on Cortex-A7 for a couple of sample operations
that utilise these fetchers are below.

add_0565_8888:

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  75.4   0.38     147.5  0.90    100.00%     +95.7%
L2  72.3   0.36     129.3  0.57    100.00%     +79.0%
M   64.4   0.05     94.6   0.90    100.00%     +46.8%
HT  35.8   0.03     42.3   0.26    100.00%     +18.1%
VT  29.9   0.04     34.3   0.31    100.00%     +14.5%
R   26.1   0.02     28.6   0.11    100.00%     +9.4%
RT  12.2   0.06     13.1   0.15    100.00%     +7.9%

add_1555_8888:

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  73.3   0.38     160.7  0.89    100.00%     +119.2%
L2  69.8   0.08     139.1  0.74    100.00%     +99.4%
M   62.2   0.03     100.4  0.76    100.00%     +61.4%
HT  35.1   0.03     42.9   0.42    100.00%     +22.1%
VT  29.5   0.03     34.7   0.33    100.00%     +17.8%
R   25.8   0.02     28.7   0.27    100.00%     +11.4%
RT  12.1   0.02     13.2   0.15    100.00%     +8.5%

---

For the record, I tried writing an a8 fetcher, but benchmarking indicated
that it couldn't improve upon the ARMv6 a8 fetcher results.

I also tried adding prefetch to the above fetchers - since they are the
first iterator in a chain and won't benefit from write-allocate caches, you
might think that this would help. Benchmarking indicated otherwise.
-rw-r--r-- pixman/pixman-arm-neon-asm.S 20
-rw-r--r-- pixman/pixman-arm-neon.c 13
2 files changed, 33 insertions, 0 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 5c1c30a..f48f773 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -496,6 +496,16 @@ generate_composite_function \
pixman_composite_src_0565_8888_process_pixblock_tail, \
pixman_composite_src_0565_8888_process_pixblock_tail_head
+generate_composite_function_single_scanline \
+ pixman_get_scanline_r5g6b5_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
/******************************************************************************/
.macro pixman_composite_add_8_8_process_pixblock_head
@@ -3227,6 +3237,16 @@ generate_composite_function \
pixman_composite_src_1555_8888_process_pixblock_tail, \
pixman_composite_src_1555_8888_process_pixblock_tail_head
+generate_composite_function_single_scanline \
+ pixman_get_scanline_a1r5g5b5_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_1555_8888_process_pixblock_head, \
+ pixman_composite_src_1555_8888_process_pixblock_tail, \
+ pixman_composite_src_1555_8888_process_pixblock_tail_head
+
/******************************************************************************/
generate_composite_function_nearest_scanline \
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index b597b82..d4675f1 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -165,6 +165,9 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OV
PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD,
uint32_t, uint32_t)
+PIXMAN_ARM_BIND_GET_SCANLINE (neon, r5g6b5)
+PIXMAN_ARM_BIND_GET_SCANLINE (neon, a1r5g5b5)
+
void
pixman_composite_src_n_8_asm_neon (int32_t w,
int32_t h,
@@ -438,6 +441,15 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
{ PIXMAN_OP_NONE },
};
+static const pixman_iter_info_t arm_neon_iters[] =
+{
+ PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (neon, r5g6b5),
+
+ PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (neon, a1r5g5b5),
+
+ { PIXMAN_null },
+};
+
void
pixman_composite_scanline_src_mask_asm_neon (int32_t w,
uint32_t *dst,
@@ -481,6 +493,7 @@ _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
+ imp->iter_info = arm_neon_iters;
imp->blt = arm_neon_blt;
imp->fill = arm_neon_fill;