author     Ben Avison <bavison@riscosopen.org>    2015-09-04 22:07:29 +0100
committer  Ben Avison <bavison@riscosopen.org>    2015-10-15 13:52:04 +0100
commit     0e4451de5a40504837c852d2d369c80975974f88 (patch)
tree       4c3e16e390fa51f8c8053f2dd5205366b76f9386
parent     5bc8f99d2c33c72679e5dc7e71349720e9915674 (diff)
armv7: Add OVER_REVERSE combiner
In common with the ARMv6 version of this combiner, this code features a shortcut for the case where the destination is opaque. Without that, the NEON version performs significantly worse than the ARMv6 version (though it must be noted that the effect of repeated application of the OVER_REVERSE operator is to set the destination opaque, so lowlevel-blt-bench is perhaps not best representing real-world usage in this case).

lowlevel-blt-bench results for over_reverse_0565_8888 (compared to ARMv6 version):

        Before              After
        Mean    StdDev      Mean    StdDev      Confidence    Change
L1      73.4    0.21        77.9    0.40        100.00%        +6.2%
L2      72.8    0.18        76.0    0.40        100.00%        +4.4%
M       66.3    0.02        70.1    0.67        100.00%        +5.8%
HT      34.0    0.19        31.0    0.38        100.00%        -9.0%
VT      30.2    0.16        27.4    0.35        100.00%        -9.1%
R       28.5    0.16        23.4    0.32        100.00%       -17.9%
RT      12.4    0.10        10.5    0.17        100.00%       -15.2%

lowlevel-blt-bench results for over_reverse_0565_8_8888 (compared to ARMv6 version):

        Before              After
        Mean    StdDev      Mean    StdDev      Confidence    Change
L1      60.0    0.20        65.4    0.29        100.00%        +9.0%
L2      59.1    0.18        63.4    0.38        100.00%        +7.2%
M       50.3    0.24        55.8    0.09        100.00%       +10.9%
HT      24.1    0.15        22.4    0.12        100.00%        -7.1%
VT      20.8    0.12        19.6    0.13        100.00%        -5.6%
R       19.6    0.13        17.2    0.01        100.00%       -12.4%
RT       8.2    0.06         7.5    0.05        100.00%        -8.2%

It's notable that the comparative performance depends heavily upon the rectangle size - not surprising, since one of the main features of NEON is the ability to work on larger blocks of data at once, which mainly benefits large data sets, while the larger granularity works against it for smaller data sets. Comments are welcome on whether it would be desirable to select between the ARMv6 and ARMv7 implementations at runtime based upon the rectangle size.
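For reference, here is a scalar C sketch of the per-pixel arithmetic the combiner performs, including the opaque-destination shortcut described above. It assumes premultiplied a8r8g8b8 pixels; the helper names are illustrative rather than taken from pixman:

    #include <stdint.h>

    /* Rounded (a * b) / 255, giving the same result as the
     * vmull/vrshr/vraddhn sequence used in the NEON code. */
    static uint8_t
    mul_un8 (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) a * b + 128;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    /* OVER_REVERSE: result = dst + src * (1 - dst.alpha) per channel,
     * with a saturating add (vqadd.u8 in the NEON code). */
    static uint32_t
    over_reverse_pixel (uint32_t src, uint32_t dst)
    {
        uint8_t  inv_da = 255 - (uint8_t) (dst >> 24);
        uint32_t result = 0;
        int      shift;

        /* Opaque destination: the source contributes nothing, so both
         * the arithmetic and the store can be skipped. */
        if (inv_da == 0)
            return dst;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint16_t s   = mul_un8 ((uint8_t) (src >> shift), inv_da);
            uint16_t sum = s + (uint8_t) (dst >> shift);
            result |= (uint32_t) (sum > 255 ? 255 : sum) << shift;
        }
        return result;
    }

The NEON code applies the same opaque test to a whole block of eight destination pixels at once (by comparing the eight alpha bytes held in d7 against 0xFF), so the shortcut only pays off when entire blocks are opaque.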
-rw-r--r--  pixman/pixman-arm-neon-asm.S  145
-rw-r--r--  pixman/pixman-arm-neon.c        2
2 files changed, 147 insertions, 0 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index ba2d1be..3190518 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -812,6 +812,75 @@ generate_composite_function_single_scanline \
/******************************************************************************/
+.macro pixman_composite_over_reverse_8888_8888_init
+ push {v1-v2}
+.endm
+
+.macro pixman_composite_over_reverse_8888_8888_cleanup
+ pop {v1-v2}
+.endm
+
+.macro pixman_composite_over_reverse_8888_8888_common
+ vmull.u8 q8, d0, d24
+ vmull.u8 q9, d1, d24
+ vmull.u8 q10, d2, d24
+ vmull.u8 q11, d3, d24
+ vrshr.u16 q12, q8, #8
+ vrshr.u16 q13, q9, #8
+ vrshr.u16 q14, q10, #8
+ vrshr.u16 q15, q11, #8
+ vraddhn.u16 d0, q8, q12
+ vraddhn.u16 d1, q9, q13
+ vraddhn.u16 d2, q10, q14
+ vraddhn.u16 d3, q11, q15
+ vqadd.u8 q2, q0, q2
+ vqadd.u8 q3, q1, q3
+.endm
+
+.macro pixman_composite_over_reverse_8888_8888_process_pixblock_head
+ vmov v1, v2, d7
+ vmvn.8 d24, d7
+ cmp v1, #-1
+ cmpeq v2, #-1
+ beq 20f
+ pixman_composite_over_reverse_8888_8888_common
+20:
+.endm
+
+.macro pixman_composite_over_reverse_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_over_reverse_8888_8888_process_pixblock_tail_head
+ cmp v1, #-1
+ cmpeq v2, #-1
+ beq 10f
+ vst4.8 {d4-d7}, [DST_W :128]
+10: add DST_W, DST_W, #4*8
+ pixld_a 8, 32, dst_r_basereg - 4, DST_R // vld4.8 {d4-d7}, [DST_R :128]!
+ vmov v1, v2, d7
+ vmvn.8 d24, d7
+ cmp v1, #-1
+ cmpeq v2, #-1
+ addeq SRC, SRC, #4*8
+ beq 20f
+ fetch_src_pixblock // vld4.8 {d0-d3}, [SRC]!
+ pixman_composite_over_reverse_8888_8888_common
+20:
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_reverse_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ pixman_composite_over_reverse_8888_8888_init, \
+ pixman_composite_over_reverse_8888_8888_cleanup, \
+ pixman_composite_over_reverse_8888_8888_process_pixblock_head, \
+ pixman_composite_over_reverse_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_reverse_8888_8888_process_pixblock_tail_head, \
+ 4 /* dst_w_basereg */ \
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8888_process_pixblock_head
/* deinterleaved source pixels in {d0, d1, d2, d3} */
/* inverted alpha in {d24} */
@@ -2397,6 +2466,82 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_over_reverse_8888_8888_8888_common
+ vmull.u8 q8, d0, d27
+ vmull.u8 q9, d1, d27
+ vmull.u8 q10, d2, d27
+ vmull.u8 q11, d3, d27
+ vrshr.u16 q12, q8, #8
+ vrshr.u16 q13, q9, #8
+ vrshr.u16 q14, q10, #8
+ vrshr.u16 q15, q11, #8
+ vraddhn.u16 d0, q8, q12
+ vmvn.8 d24, d7
+ vraddhn.u16 d1, q9, q13
+ vraddhn.u16 d2, q10, q14
+ vraddhn.u16 d3, q11, q15
+ vmull.u8 q8, d0, d24
+ vmull.u8 q9, d1, d24
+ vmull.u8 q10, d2, d24
+ vmull.u8 q11, d3, d24
+ vrshr.u16 q12, q8, #8
+ vrshr.u16 q13, q9, #8
+ vrshr.u16 q14, q10, #8
+ vrshr.u16 q15, q11, #8
+ vraddhn.u16 d0, q8, q12
+ vraddhn.u16 d1, q9, q13
+ vraddhn.u16 d2, q10, q14
+ vraddhn.u16 d3, q11, q15
+ vqadd.u8 q2, q0, q2
+ vqadd.u8 q3, q1, q3
+.endm
+
+.macro pixman_composite_over_reverse_8888_8888_8888_process_pixblock_head
+ vmov v1, v2, d7
+ vmvn.8 d24, d7
+ cmp v1, #-1
+ cmpeq v2, #-1
+ beq 20f
+ pixman_composite_over_reverse_8888_8888_8888_common
+20:
+.endm
+
+.macro pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail_head
+ cmp v1, #-1
+ cmpeq v2, #-1
+ beq 10f
+ vst4.8 {d4-d7}, [DST_W :128]
+10: add DST_W, DST_W, #4*8
+ pixld_a 8, 32, dst_r_basereg - 4, DST_R // vld4.8 {d4-d7}, [DST_R :128]!
+ vmov v1, v2, d7
+ vmvn.8 d24, d7
+ cmp v1, #-1
+ cmpeq v2, #-1
+ addeq SRC, SRC, #4*8
+ addeq MASK, MASK, #4*8
+ beq 20f
+ fetch_src_pixblock // vld4.8 {d0-d3}, [SRC]!
+ fetch_mask_pixblock // vld4.8 {d24-d27}, [MASK]!
+ pixman_composite_over_reverse_8888_8888_8888_common
+20:
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_reverse_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ pixman_composite_over_reverse_8888_8888_init, \
+ pixman_composite_over_reverse_8888_8888_cleanup, \
+ pixman_composite_over_reverse_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail_head, \
+ 4 /* dst_w_basereg */ \
+
+/******************************************************************************/
+
.macro pixman_composite_in_out_mask_process_pixblock_head out, reverse
.if reverse
vmull.u8 q11, d3, d27
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index b899ec8..9b99c75 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -451,6 +451,7 @@ neon_combine_src_u (pixman_implementation_t *imp,
}
PIXMAN_ARM_BIND_COMBINE_U (neon, over)
+PIXMAN_ARM_BIND_COMBINE_U (neon, over_reverse)
PIXMAN_ARM_BIND_COMBINE_U (neon, in)
PIXMAN_ARM_BIND_COMBINE_U (neon, in_reverse)
PIXMAN_ARM_BIND_COMBINE_U (neon, out)
@@ -465,6 +466,7 @@ _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
imp->combine_32[PIXMAN_OP_SRC] = neon_combine_src_u;
imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
+ imp->combine_32[PIXMAN_OP_OVER_REVERSE] = neon_combine_over_reverse_u;
imp->combine_32[PIXMAN_OP_IN] = neon_combine_in_u;
imp->combine_32[PIXMAN_OP_IN_REVERSE] = neon_combine_in_reverse_u;
imp->combine_32[PIXMAN_OP_OUT] = neon_combine_out_u;
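For completeness, a minimal usage sketch that issues an OVER_REVERSE composite through the public API. The buffer sizes and formats are illustrative (they mirror the over_reverse_0565_8888 case above); whether a dedicated fast path or the general path - and hence this combiner - is taken depends on the formats and on the fast paths registered for the platform:

    #include <pixman.h>
    #include <stdlib.h>

    int
    main (void)
    {
        int       w = 256, h = 256;
        uint16_t *src_bits = calloc (w * h, sizeof (uint16_t));
        uint32_t *dst_bits = calloc (w * h, sizeof (uint32_t));

        /* r5g6b5 source image, a8r8g8b8 destination image. */
        pixman_image_t *src = pixman_image_create_bits (PIXMAN_r5g6b5, w, h,
                                                        (uint32_t *) src_bits,
                                                        w * 2);
        pixman_image_t *dst = pixman_image_create_bits (PIXMAN_a8r8g8b8, w, h,
                                                        dst_bits, w * 4);

        /* OVER_REVERSE composites the destination over the source and
         * writes the result back to the destination. */
        pixman_image_composite32 (PIXMAN_OP_OVER_REVERSE, src, NULL, dst,
                                  0, 0,   /* src_x,  src_y  */
                                  0, 0,   /* mask_x, mask_y */
                                  0, 0,   /* dest_x, dest_y */
                                  w, h);

        pixman_image_unref (src);
        pixman_image_unref (dst);
        free (src_bits);
        free (dst_bits);
        return 0;
    }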