summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSøren Sandmann Pedersen <ssp@redhat.com>2011-04-02 23:24:48 -0400
committerSøren Sandmann Pedersen <ssp@redhat.com>2011-04-15 17:25:03 -0400
commitd865e32ceeeb604a42e0628c996bab708c047617 (patch)
tree351f4bb3ea92a4f1237ad2f6aca25edc01bfca77
parent89c7b29b6ab5b85f524d05e26ff6b12a486519c6 (diff)
ARM: Add 'neon_composite_over_n_8888_0565' fast pathneon_565_ca
This improves the performance of the firefox-talos-gfx benchmark with the image16 backend. Benchmark on an 800 MHz ARM Cortex A8: Before: [ # ] backend test min(s) median(s) stddev. count [ 0] image16 firefox-talos-gfx 121.773 122.218 0.15% 6/6 After: [ # ] backend test min(s) median(s) stddev. count [ 0] image16 firefox-talos-gfx 85.247 85.563 0.22% 6/6 V2: Slightly better instruction scheduling based on comments from Taekyun Kim. V3: Inline the head of the loop and do even better pipelining. This gets rid of all stalls in the inner loop. Also based on comments from Taekyun Kim.
-rw-r--r--pixman/pixman-arm-neon-asm.S169
-rw-r--r--pixman/pixman-arm-neon.c4
2 files changed, 173 insertions, 0 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 5e9fda34..833f18c2 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1426,6 +1426,175 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
+ * mask in {d24, d25, d26} [B, G, R]
+ * output: updated src in {d0, d1, d2 } [B, G, R]
+ * updated mask in {d24, d25, d26} [B, G, R]
+ */
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d9
+ vmull.u8 q6, d26, d10
+ vmull.u8 q9, d11, d25
+ vmull.u8 q12, d11, d24
+ vmull.u8 q13, d11, d26
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q10, q1, #8
+ vrshr.u16 q11, q6, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q10
+ vraddhn.u16 d2, q6, q11
+ vrshr.u16 q11, q12, #8
+ vrshr.u16 q8, q9, #8
+ vrshr.u16 q6, q13, #8
+ vraddhn.u16 d24, q12, q11
+ vraddhn.u16 d25, q9, q8
+ /*
+ * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+ * and put data into d16 - blue, d17 - green, d18 - red
+ */
+ vshrn.u16 d17, q2, #3
+ vshrn.u16 d18, q2, #8
+ vraddhn.u16 d26, q13, q6
+ vsli.u16 q2, q2, #5
+ vsri.u8 d18, d18, #5
+ vsri.u8 d17, d17, #6
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in d16 - blue, d17 - green, d18 - red
+ */
+ vmvn.8 q12, q12
+ vshrn.u16 d16, q2, #2
+ vmvn.8 d26, d26
+ vmull.u8 q6, d16, d24
+ vmull.u8 q7, d17, d25
+ vmull.u8 q11, d18, d26
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
+ /* ... continue 'combine_over_ca' replacement */
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q14, q7, #8
+ vrshr.u16 q15, q11, #8
+ vraddhn.u16 d16, q10, q6
+ vraddhn.u16 d17, q14, q7
+ vraddhn.u16 d18, q15, q11
+ vqadd.u8 q8, q0, q8
+ vqadd.u8 d18, d2, d18
+ /*
+ * convert the results in d16, d17, d18 to r5g6b5 and store
+ * them into {d28, d29}
+ */
+ vshll.u8 q14, d18, #8
+ vshll.u8 q10, d17, #8
+ vshll.u8 q15, d16, #8
+ vsri.u16 q14, q10, #5
+ vsri.u16 q14, q15, #11
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+ fetch_mask_pixblock
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q14, q7, #8
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vrshr.u16 q15, q11, #8
+ vraddhn.u16 d16, q10, q6
+ vraddhn.u16 d17, q14, q7
+ vraddhn.u16 d22, q15, q11
+ /* process_pixblock_head */
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
+ * mask in {d24, d25, d26} [B, G, R]
+ * output: updated src in {d0, d1, d2 } [B, G, R]
+ * updated mask in {d24, d25, d26} [B, G, R]
+ */
+ vmull.u8 q1, d25, d9
+ vqadd.u8 q8, q0, q8
+ vmull.u8 q0, d24, d8
+ vqadd.u8 d22, d2, d22
+ vmull.u8 q6, d26, d10
+ /*
+ * convert the result in d16, d17, d22 to r5g6b5 and store
+ * it into {d28, d29}
+ */
+ vshll.u8 q14, d22, #8
+ vshll.u8 q10, d17, #8
+ vshll.u8 q15, d16, #8
+ vmull.u8 q9, d11, d25
+ vsri.u16 q14, q10, #5
+ vmull.u8 q12, d11, d24
+ vmull.u8 q13, d11, d26
+ vsri.u16 q14, q15, #11
+ cache_preload 8, 8
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q10, q1, #8
+ vrshr.u16 q11, q6, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q10
+ vraddhn.u16 d2, q6, q11
+ vrshr.u16 q11, q12, #8
+ vrshr.u16 q8, q9, #8
+ vrshr.u16 q6, q13, #8
+ vraddhn.u16 d25, q9, q8
+ /*
+ * convert 8 r5g6b5 pixel data from {d4, d5} to planar
+ * 8-bit format and put data into d16 - blue, d17 - green,
+ * d18 - red
+ */
+ vshrn.u16 d17, q2, #3
+ vshrn.u16 d18, q2, #8
+ vraddhn.u16 d24, q12, q11
+ vraddhn.u16 d26, q13, q6
+ vsli.u16 q2, q2, #5
+ vsri.u8 d18, d18, #5
+ vsri.u8 d17, d17, #6
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in d16 - blue, d17 - green, d18 - red
+ */
+ vmvn.8 q12, q12
+ vshrn.u16 d16, q2, #2
+ vmvn.8 d26, d26
+ vmull.u8 q7, d17, d25
+ vmull.u8 q6, d16, d24
+ vmull.u8 q11, d18, d26
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_0565_ca_init, \
+ pixman_composite_over_n_8888_0565_ca_cleanup, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
.macro pixman_composite_in_n_8_process_pixblock_head
/* expecting source data in {d0, d1, d2, d3} */
/* and destination data in {d4, d5, d6, d7} */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 0a10ca10..77875ad5 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -80,6 +80,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_0565_ca,
+ uint32_t, 1, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
@@ -282,6 +284,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, neon_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, neon_composite_over_n_8888_0565_ca),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, neon_composite_over_8888_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, neon_composite_over_8888_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, neon_composite_over_8888_n_0565),