summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaekyun Kim <tkq.kim@samsung.com>2011-09-21 15:52:13 +0900
committerTaekyun Kim <tkq.kim@samsung.com>2011-10-18 13:14:28 +0900
commit4481920f405e47b3a92811a8cb06afbd37dee01b (patch)
treeedb867e0661cb5d9c78ee273529351485e1ee070
parent1cd916f3a5ebeb943f66eecf0b8ce99af0b95d11 (diff)
ARM: NEON: Instruction scheduling of bilinear over_8888_8888
Instructions are reordered to eliminate pipeline stalls and get better memory access. Performance of before/after on cortex-a8 @ 1GHz << 2000 x 2000 with scale factor close to 1.x >> before : 50.43 Mpix/s after : 61.09 Mpix/s
-rw-r--r--pixman/pixman-arm-neon-asm-bilinear.S149
1 files changed, 146 insertions, 3 deletions
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 25bcb24..82d248e 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -893,15 +893,158 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8888_process_pixblock_head
- bilinear_over_8888_8888_process_four_pixels
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+
+ vld1.32 {d22}, [TMP1], STRIDE
+ vld1.32 {d23}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmull.u8 q8, d22, d28
+ vmlal.u8 q8, d23, d29
+
+ vld1.32 {d22}, [TMP2], STRIDE
+ vld1.32 {d23}, [TMP2]
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmull.u8 q9, d22, d28
+ vmlal.u8 q9, d23, d29
+
+ vld1.32 {d22}, [TMP3], STRIDE
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+
+ vshll.u16 q1, d18, #8
+ vmlsl.u16 q1, d18, d31
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail
+ vshll.u16 q2, d20, #8
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vmovn.u16 d7, q2
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vmvn.8 d4, d4
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vrshr.u16 q1, q11, #8
+ vrshr.u16 q10, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q10, q2
+ vqadd.u8 q3, q1, q3
+ vuzp.8 d6, d7
+ vuzp.8 d6, d7
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail_head
- bilinear_over_8888_8888_process_pixblock_tail
- bilinear_over_8888_8888_process_pixblock_head
+ vshll.u16 q2, d20, #8
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vmlsl.u16 q2, d20, d30
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vld1.32 {d20}, [TMP1], STRIDE
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vmovn.u16 d7, q2
+ vld1.32 {d22}, [TMP3], STRIDE
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vld1.32 {d23}, [TMP3]
+ vmvn.8 d4, d4
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vrshr.u16 q1, q11, #8
+ vmlal.u16 q0, d17, d30
+ vrshr.u16 q8, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q8, q2
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vqadd.u8 q3, q1, q3
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vuzp.8 d6, d7
+ vshll.u16 q1, d18, #8
+ vuzp.8 d6, d7
+ vmlsl.u16 q1, d18, d31
+ vadd.u16 q12, q12, q13
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
/* over_8888_8_8888 */