diff options
author | Ben Avison <bavison@riscosopen.org> | 2015-09-04 22:07:29 +0100 |
---|---|---|
committer | Ben Avison <bavison@riscosopen.org> | 2015-10-15 13:52:04 +0100 |
commit | 0e4451de5a40504837c852d2d369c80975974f88 (patch) | |
tree | 4c3e16e390fa51f8c8053f2dd5205366b76f9386 | |
parent | 5bc8f99d2c33c72679e5dc7e71349720e9915674 (diff) |
armv7: Add OVER_REVERSE combiner
In common with the ARMv6 version of this combiner, this code features a
shortcut for the case where the destination is opaque. Without that, the
NEON version performs significantly worse than the ARMv6 version (though it
must be noted that the effect of repeated application of the OVER_REVERSE
operator is to set the destination opaque, so lowlevel-blt-bench is perhaps
not best representing real-world usage in this case).
lowlevel-blt-bench results for over_reverse_0565_8888 (compared to ARMv6
version):
Before After
Mean StdDev Mean StdDev Confidence Change
L1 73.4 0.21 77.9 0.40 100.00% +6.2%
L2 72.8 0.18 76.0 0.40 100.00% +4.4%
M 66.3 0.02 70.1 0.67 100.00% +5.8%
HT 34.0 0.19 31.0 0.38 100.00% -9.0%
VT 30.2 0.16 27.4 0.35 100.00% -9.1%
R 28.5 0.16 23.4 0.32 100.00% -17.9%
RT 12.4 0.10 10.5 0.17 100.00% -15.2%
lowlevel-blt-bench results for over_reverse_0565_8_8888 (compared to ARMv6
version):
Before After
Mean StdDev Mean StdDev Confidence Change
L1 60.0 0.20 65.4 0.29 100.00% +9.0%
L2 59.1 0.18 63.4 0.38 100.00% +7.2%
M 50.3 0.24 55.8 0.09 100.00% +10.9%
HT 24.1 0.15 22.4 0.12 100.00% -7.1%
VT 20.8 0.12 19.6 0.13 100.00% -5.6%
R 19.6 0.13 17.2 0.01 100.00% -12.4%
RT 8.2 0.06 7.5 0.05 100.00% -8.2%
It's notable that the comparative performance depends heavily upon the
rectangle size - not surprising since one of the main features of NEON is
the ability to work on larger blocks of data at once, which is mainly a
benefit to large data sets, and the larger granularity works against it for
smaller data sets. Comments welcome on whether it would be desirable to select
between ARMv6 and ARMv7 implementations at runtime based upon the rectangle
size.
-rw-r--r-- | pixman/pixman-arm-neon-asm.S | 145 | ||||
-rw-r--r-- | pixman/pixman-arm-neon.c | 2 |
2 files changed, 147 insertions, 0 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index ba2d1be..3190518 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -812,6 +812,75 @@ generate_composite_function_single_scanline \ /******************************************************************************/ +.macro pixman_composite_over_reverse_8888_8888_init + push {v1-v2} +.endm + +.macro pixman_composite_over_reverse_8888_8888_cleanup + pop {v1-v2} +.endm + +.macro pixman_composite_over_reverse_8888_8888_common + vmull.u8 q8, d0, d24 + vmull.u8 q9, d1, d24 + vmull.u8 q10, d2, d24 + vmull.u8 q11, d3, d24 + vrshr.u16 q12, q8, #8 + vrshr.u16 q13, q9, #8 + vrshr.u16 q14, q10, #8 + vrshr.u16 q15, q11, #8 + vraddhn.u16 d0, q8, q12 + vraddhn.u16 d1, q9, q13 + vraddhn.u16 d2, q10, q14 + vraddhn.u16 d3, q11, q15 + vqadd.u8 q2, q0, q2 + vqadd.u8 q3, q1, q3 +.endm + +.macro pixman_composite_over_reverse_8888_8888_process_pixblock_head + vmov v1, v2, d7 + vmvn.8 d24, d7 + cmp v1, #-1 + cmpeq v2, #-1 + beq 20f + pixman_composite_over_reverse_8888_8888_common +20: +.endm + +.macro pixman_composite_over_reverse_8888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_over_reverse_8888_8888_process_pixblock_tail_head + cmp v1, #-1 + cmpeq v2, #-1 + beq 10f + vst4.8 {d4-d7}, [DST_W :128] +10: add DST_W, DST_W, #4*8 + pixld_a 8, 32, dst_r_basereg - 4, DST_R // vld4.8 {d4-d7}, [DST_R :128]! + vmov v1, v2, d7 + vmvn.8 d24, d7 + cmp v1, #-1 + cmpeq v2, #-1 + addeq SRC, SRC, #4*8 + beq 20f + fetch_src_pixblock // vld4.8 {d0-d3}, [SRC]! 
+ pixman_composite_over_reverse_8888_8888_common +20: +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_reverse_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + pixman_composite_over_reverse_8888_8888_init, \ + pixman_composite_over_reverse_8888_8888_cleanup, \ + pixman_composite_over_reverse_8888_8888_process_pixblock_head, \ + pixman_composite_over_reverse_8888_8888_process_pixblock_tail, \ + pixman_composite_over_reverse_8888_8888_process_pixblock_tail_head, \ + 4 /* dst_w_basereg */ \ + +/******************************************************************************/ + .macro pixman_composite_over_n_8888_process_pixblock_head /* deinterleaved source pixels in {d0, d1, d2, d3} */ /* inverted alpha in {d24} */ @@ -2397,6 +2466,82 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_over_reverse_8888_8888_8888_common + vmull.u8 q8, d0, d27 + vmull.u8 q9, d1, d27 + vmull.u8 q10, d2, d27 + vmull.u8 q11, d3, d27 + vrshr.u16 q12, q8, #8 + vrshr.u16 q13, q9, #8 + vrshr.u16 q14, q10, #8 + vrshr.u16 q15, q11, #8 + vraddhn.u16 d0, q8, q12 + vmvn.8 d24, d7 + vraddhn.u16 d1, q9, q13 + vraddhn.u16 d2, q10, q14 + vraddhn.u16 d3, q11, q15 + vmull.u8 q8, d0, d24 + vmull.u8 q9, d1, d24 + vmull.u8 q10, d2, d24 + vmull.u8 q11, d3, d24 + vrshr.u16 q12, q8, #8 + vrshr.u16 q13, q9, #8 + vrshr.u16 q14, q10, #8 + vrshr.u16 q15, q11, #8 + vraddhn.u16 d0, q8, q12 + vraddhn.u16 d1, q9, q13 + vraddhn.u16 d2, q10, q14 + vraddhn.u16 d3, q11, q15 + vqadd.u8 q2, q0, q2 + vqadd.u8 q3, q1, q3 +.endm + +.macro pixman_composite_over_reverse_8888_8888_8888_process_pixblock_head + vmov v1, v2, d7 + vmvn.8 d24, d7 + cmp v1, #-1 + cmpeq v2, #-1 + beq 20f + pixman_composite_over_reverse_8888_8888_8888_common +20: +.endm + +.macro pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail 
+.endm + +.macro pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail_head + cmp v1, #-1 + cmpeq v2, #-1 + beq 10f + vst4.8 {d4-d7}, [DST_W :128] +10: add DST_W, DST_W, #4*8 + pixld_a 8, 32, dst_r_basereg - 4, DST_R // vld4.8 {d4-d7}, [DST_R :128]! + vmov v1, v2, d7 + vmvn.8 d24, d7 + cmp v1, #-1 + cmpeq v2, #-1 + addeq SRC, SRC, #4*8 + addeq MASK, MASK, #4*8 + beq 20f + fetch_src_pixblock // vld4.8 {d0-d3}, [SRC]! + fetch_mask_pixblock // vld4.8 {d24-27}, [MASK]! + pixman_composite_over_reverse_8888_8888_8888_common +20: +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_reverse_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + pixman_composite_over_reverse_8888_8888_init, \ + pixman_composite_over_reverse_8888_8888_cleanup, \ + pixman_composite_over_reverse_8888_8888_8888_process_pixblock_head, \ + pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_over_reverse_8888_8888_8888_process_pixblock_tail_head, \ + 4 /* dst_w_basereg */ \ + +/******************************************************************************/ + .macro pixman_composite_in_out_mask_process_pixblock_head out, reverse .if reverse vmull.u8 q11, d3, d27 diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index b899ec8..9b99c75 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -451,6 +451,7 @@ neon_combine_src_u (pixman_implementation_t *imp, } PIXMAN_ARM_BIND_COMBINE_U (neon, over) +PIXMAN_ARM_BIND_COMBINE_U (neon, over_reverse) PIXMAN_ARM_BIND_COMBINE_U (neon, in) PIXMAN_ARM_BIND_COMBINE_U (neon, in_reverse) PIXMAN_ARM_BIND_COMBINE_U (neon, out) @@ -465,6 +466,7 @@ _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback) imp->combine_32[PIXMAN_OP_SRC] = neon_combine_src_u; imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] 
= neon_combine_over_reverse_u; imp->combine_32[PIXMAN_OP_IN] = neon_combine_in_u; imp->combine_32[PIXMAN_OP_IN_REVERSE] = neon_combine_in_reverse_u; imp->combine_32[PIXMAN_OP_OUT] = neon_combine_out_u; |