diff options
author | Ben Avison <bavison@riscosopen.org> | 2015-09-04 22:07:29 +0100 |
---|---|---|
committer | Ben Avison <bavison@riscosopen.org> | 2015-10-15 13:52:04 +0100 |
commit | 0e4451de5a40504837c852d2d369c80975974f88 (patch) | |
tree | 4c3e16e390fa51f8c8053f2dd5205366b76f9386 | |
parent | 5bc8f99d2c33c72679e5dc7e71349720e9915674 (diff) |
armv7: Add OVER_REVERSE combiner
In common with the ARMv6 version of this combiner, this code features a
shortcut for the case where the destination is opaque. Without that, the
NEON version performs significantly worse than the ARMv6 version (though it
must be noted that the effect of repeated application of the OVER_REVERSE
operator is to set the destination opaque, so lowlevel-blt-bench is perhaps
not best representing real-world usage in this case).
lowlevel-blt-bench results for over_reverse_0565_8888 (compared to ARMv6
version):
Before After
Mean StdDev Mean StdDev Confidence Change
L1 73.4 0.21 77.9 0.40 100.00% +6.2%
L2 72.8 0.18 76.0 0.40 100.00% +4.4%
M 66.3 0.02 70.1 0.67 100.00% +5.8%
HT 34.0 0.19 31.0 0.38 100.00% -9.0%
VT 30.2 0.16 27.4 0.35 100.00% -9.1%
R 28.5 0.16 23.4 0.32 100.00% -17.9%
RT 12.4 0.10 10.5 0.17 100.00% -15.2%
lowlevel-blt-bench results for over_reverse_0565_8_8888 (compared to ARMv6
version):
Before After
Mean StdDev Mean StdDev Confidence Change
L1 60.0 0.20 65.4 0.29 100.00% +9.0%
L2 59.1 0.18 63.4 0.38 100.00% +7.2%
M 50.3 0.24 55.8 0.09 100.00% +10.9%
HT 24.1 0.15 22.4 0.12 100.00% -7.1%
VT 20.8 0.12 19.6 0.13 100.00% -5.6%
R 19.6 0.13 17.2 0.01 100.00% -12.4%
RT 8.2 0.06 7.5 0.05 100.00% -8.2%
It's notable that the comparative performance depends heavily upon the
rectangle size - not surprising since one of the main features of NEON is
the ability to work on larger blocks of data at once, which is mainly a
benefit to large data sets, and the larger granularity works against it for
smaller data sets. Comments welcome on whether it would be desirable to select
between ARMv6 and ARMv7 implementations at runtime based upon the rectangle
size.
-rw-r--r-- | pixman/pixman-arm-neon-asm.S | 145 | ||||
-rw-r--r-- | pixman/pixman-arm-neon.c | 2 |
2 files changed, 147 insertions, 0 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index ba2d1be..3190518 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -812,6 +812,75 @@ generate_composite_function_single_scanline \ /******************************************************************************/ +.macro pixman_composite_over_reverse_8888_8888_init + push {v1-v2} +.endm + +.macro pixman_composite_over_reverse_8888_8888_cleanup + pop {v1-v2} +.endm + +.macro pixman_composite_over_reverse_8888_8888_common + vmull.u8 q8, d0, d24 + vmull.u8 q9, d1, d24 + vmull.u8 q10, d2, d24 + vmull.u8 q11, d3, d24 + vrshr.u16 q12, q8, #8 + vrshr.u16 q13, q9, #8 + vrshr.u16 q14, q10, #8 + vrshr.u16 q15, q11, #8 + vraddhn.u16 d0, q8, q12 + vraddhn.u16 d1, q9, q13 + vraddhn.u16 d2, q10, q14 + vraddhn.u16 d3, q11, q15 + vqadd.u8 q2, q0, q2 + vqadd.u8 q3, q1, q3 +.endm + +.macro pixman_composite_over_reverse_8888_8888_process_pixblock_head + vmov v1, v2, d7 + vmvn.8 d24, d7 + cmp v1, #-1 + cmpeq v2, #-1 + beq 20f + pixman_composite_over_reverse_8888_8888_common +20: +.endm + +.macro pixman_composite_over_reverse_8888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_over_reverse_8888_8888_process_pixblock_tail_head + cmp v1, #-1 + cmpeq v2, #-1 + beq 10f + vst4.8 {d4-d7}, [DST_W :128] +10: add DST_W, DST_W, #4*8 + pixld_a 8, 32, dst_r_basereg - 4, DST_R // vld4.8 {d4-d7}, [DST_R :128]! + vmov v1, v2, d7 + vmvn.8 d24, d7 + cmp v1, #-1 + cmpeq v2, #-1 + addeq SRC, SRC, #4*8 + beq 20f + fetch_src_pixblock // vld4.8 {d0-d3}, [SRC]! 
+ pixman_composite_over_reverse_8888_8888_common +20: +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_reverse_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + pixman_composite_over_reverse_8888_8888_init, \ + pixman_composite_over_reverse_8888_8888_cleanup, \ + pixman_composite_over_reverse_8888_8888_process_pixblock_head, \ + pixman_composite_over_reverse_8888_8888_process_pixblock_tail, \ + pixman_composite_over_reverse_8888_8888_process_pixblock_tail_head, \ + 4 /* dst_w_basereg */ \ + +/******************************************************************************/ + .macro pixman_composite_over_n_8888_process_pixblock_head /* deinterleaved source pixels in {d0, d1, d2, d3} */ /* inverted alpha in {d24} */ @@ -2397,6 +2466,82 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_over_reverse_8888_8888_8888_common + vmull.u8 q8, d0, d27 + vmull.u8 q9, d1, d27 + vmull.u8 q10, d2, d27 + vmull.u8 q11, d3, d27 + vrshr.u16 q12, q8, #8 + vrshr.u16 q13, q9, #8 + vrshr.u16 q14, q10, #8 + vrshr.u16 q15, q11, #8 + vraddhn.u16 d0, q8, q12 + vmvn.8 d24, d7 + vraddhn.u16 d1, q9, q13 + vraddhn.u16 d2, q10, q14 + vraddhn.u16 d3, q11, q15 + vmull.u8 q8, d0, d24 + vmull.u8 q9, d1, d24 + vmull.u8 q10, d2, d24 + vmull.u8 q11, d3, d24 + vrshr.u16 q12, q8, #8 + vrshr.u16 q13, q9, #8 + vrshr.u16 q14, q10, #8 + vrshr.u16 q15, q11, #8 + vraddhn.u16 d0, q8, q12 + vraddhn.u16 d1, q9, q13 + vraddhn.u16 d2, q10, q14 + vraddhn.u16 d3, q11, q15 + vqadd.u8 q2, q0, q2 + vqadd.u8 q3, q1, q3 +.endm + +.macro pixman_composite_over_reverse_8888_8888_8888_process_pixblock_head + vmov v1, v2, d7 + vmvn.8 d24, d7 + cmp v1, #-1 + cmpeq v2, #-1 + beq 20f + pixman_composite_over_reverse_8888_8888_8888_common +20: +.endm + +.macro pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail 
+.endm + +.macro pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail_head + cmp v1, #-1 + cmpeq v2, #-1 + beq 10f + vst4.8 {d4-d7}, [DST_W :128] +10: add DST_W, DST_W, #4*8 + pixld_a 8, 32, dst_r_basereg - 4, DST_R // vld4.8 {d4-d7}, [DST_R :128]! + vmov v1, v2, d7 + vmvn.8 d24, d7 + cmp v1, #-1 + cmpeq v2, #-1 + addeq SRC, SRC, #4*8 + addeq MASK, MASK, #4*8 + beq 20f + fetch_src_pixblock // vld4.8 {d0-d3}, [SRC]! + fetch_mask_pixblock // vld4.8 {d24-27}, [MASK]! + pixman_composite_over_reverse_8888_8888_8888_common +20: +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_reverse_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + pixman_composite_over_reverse_8888_8888_init, \ + pixman_composite_over_reverse_8888_8888_cleanup, \ + pixman_composite_over_reverse_8888_8888_8888_process_pixblock_head, \ + pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail_head, \ + 4 /* dst_w_basereg */ \ + +/******************************************************************************/ + .macro pixman_composite_in_out_mask_process_pixblock_head out, reverse .if reverse vmull.u8 q11, d3, d27 diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index b899ec8..9b99c75 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -451,6 +451,7 @@ neon_combine_src_u (pixman_implementation_t *imp, } PIXMAN_ARM_BIND_COMBINE_U (neon, over) +PIXMAN_ARM_BIND_COMBINE_U (neon, over_reverse) PIXMAN_ARM_BIND_COMBINE_U (neon, in) PIXMAN_ARM_BIND_COMBINE_U (neon, in_reverse) PIXMAN_ARM_BIND_COMBINE_U (neon, out) @@ -465,6 +466,7 @@ _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback) imp->combine_32[PIXMAN_OP_SRC] = neon_combine_src_u; imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] 
= neon_combine_over_reverse_u; imp->combine_32[PIXMAN_OP_IN] = neon_combine_in_u; imp->combine_32[PIXMAN_OP_IN_REVERSE] = neon_combine_in_reverse_u; imp->combine_32[PIXMAN_OP_OUT] = neon_combine_out_u; |