summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ben Avison <bavison@riscosopen.org> 2014-05-26 19:16:25 +0100
committer: Ben Avison <bavison@riscosopen.org> 2015-10-15 13:55:52 +0100
commit: e77f84ab88fc93f6acd9b81c7e862462494f9b11 (patch)
tree: fc79f1d00612552d70757b3b006b9236b21f60d9
parent: 9aa45b6dcb75efb797b7a0f7b96569a409a8d18e (diff)
armv6: Add over_n_0565 fast path
This is used instead of the equivalent C fast path.

lowlevel-blt-bench results, compared to no fast path at all:

        Before          After
        Mean   StdDev   Mean   StdDev   Confidence   Change
L1      8.2    0.0      38.7   0.5      100.0%       +372.7%
L2      7.9    0.1      37.6   0.5      100.0%       +376.8%
M       7.3    0.0      38.5   0.1      100.0%       +425.6%
HT      6.9    0.0      26.1   0.3      100.0%       +279.9%
VT      6.8    0.0      24.5   0.3      100.0%       +258.0%
R       6.6    0.1      23.6   0.2      100.0%       +255.1%
RT      4.5    0.1      10.9   0.2      100.0%       +143.1%
-rw-r--r-- pixman/pixman-arm-simd-asm.S 114
-rw-r--r-- pixman/pixman-arm-simd.c 4
2 files changed, 118 insertions, 0 deletions
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 208f0de..fb085c6 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1782,6 +1782,120 @@ generate_composite_function \
/******************************************************************************/
+.macro over_n_0565_init /* Setup for over_n_0565: name the working registers, then derive the source channel constants, the inverse alpha and the rounding constant */
+ BITMSK5 .req Y /* 5-bit field mask; value is loaded in over_n_0565_newline */
+ BITMSK6 .req STRIDE_D /* 6-bit field mask; value is loaded in over_n_0565_newline */
+ SRCRB .req SRC /* source red/blue; same register as SRC, so the raw source word is overwritten below */
+ SRCG .req STRIDE_S /* source green, replicated into both halfwords */
+ HALF .req MASK /* rounding constant used as the mla accumulator */
+ ALPHA .req STRIDE_M /* 255 minus the top byte of the source word */
+ TMP0 .req SCRATCH /* scratch register */
+ TMP1 .req ORIG_W /* scratch register */
+ line_saved_regs Y, STRIDE_D, ORIG_W /* NOTE(review): line_saved_regs is defined elsewhere; presumably marks these registers as spilled per line (see FLAG_SPILL_LINE_VARS) - confirm */
+ ldr SRC, [sp, #ARGS_STACK_OFFSET] /* NOTE(review): presumably the solid source colour passed on the stack - confirm against caller */
+ mov ALPHA, #255
+ pkhbt SRCG, SRC, SRC, lsl #16 @ GGGGGGGGxxxxxxxxGGGGGGGGxxxxxxxx
+ sub ALPHA, ALPHA, SRC, lsr #24 /* ALPHA = 255 - (SRC >> 24) */
+ mov SRCRB, SRC, lsl #8 @ RRRRRRRRxxxxxxxxBBBBBBBBxxxxxxxx
+ ldr HALF, =0x00800080 /* 0x80 in each halfword: rounding term added before the (t + (t >> 8)) >> 8 step */
+.endm
+
+.macro over_n_0565_newline /* Load the two field masks; NOTE(review): presumably run at the start of each line because Y and STRIDE_D are listed as line-saved registers in over_n_0565_init - confirm */
+ ldr BITMSK5, =0x001f001f /* low 5 bits of each halfword */
+ ldr BITMSK6, =0xfc00fc00 /* top 6 bits of each halfword; also used shifted right by 5 to clear the green field */
+.endm
+
+.macro over_n_0565_cleanup /* Release every register alias created with .req in over_n_0565_init */
+ .unreq BITMSK5
+ .unreq BITMSK6
+ .unreq SRCRB
+ .unreq SRCG
+ .unreq HALF
+ .unreq ALPHA
+ .unreq TMP0
+ .unreq TMP1
+.endm
+
+.macro over_n_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* Head stage: only fetches destination data; cond, unaligned_src, unaligned_mask and preload are not referenced in the body */
+ pixld , numbytes, firstreg, DST, 0 /* NOTE(review): pixld is defined elsewhere; presumably loads numbytes bytes from DST into the WK registers starting at firstreg, with an empty condition - confirm */
+.endm
+
+.macro over_n_0565_1pixel dst /* Blend the solid source over the single 0565 pixel in the low halfword of WK&dst: widen each field to 8 bits, multiply by ALPHA with rounding, saturating-add the source channels, repack; dst is the WK register index; clobbers TMP0 and TMP1 */
+ mov TMP1, WK&dst, lsl #16 @ rrrrrggggggbbbbb0000000000000000
+ bic TMP1, TMP1, BITMSK6, lsr #5 @ rrrrr000000bbbbb0000000000000000
+ and TMP0, BITMSK6, WK&dst, lsl #5 @ 0000000000000000gggggg0000000000
+ orr WK&dst, TMP1, TMP1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr TMP0, TMP0, lsr #6 @ 0000000000000000gggggggggggg0000
+ pkhtb WK&dst, WK&dst, WK&dst, asr #5 @ rrrrrrrrrr0xxxxxbbbbbbbbbb000000
+ uxtb TMP0, TMP0, ror #8 @ 000000000000000000000000gggggggg
+ uxtb16 WK&dst, WK&dst, ror #8 @ 00000000rrrrrrrr00000000bbbbbbbb
+ mla TMP0, TMP0, ALPHA, HALF @ xxxxxxxxxxxxxxxxgggggggggggggggg
+ mla WK&dst, WK&dst, ALPHA, HALF @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+ uxtab TMP0, TMP0, TMP0, ror #8 @ xxxxxxxxxxxxxxxxgggggggggggggggg
+ uxtab16 WK&dst, WK&dst, WK&dst, ror #8 @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+ uqadd8 TMP0, TMP0, SRCG @ xxxxxxxxxxxxxxxxggggggggxxxxxxxx
+ uqadd8 WK&dst, WK&dst, SRCRB @ rrrrrrrrxxxxxxxxbbbbbbbbxxxxxxxx
+ and TMP0, TMP0, BITMSK6 @ xxxxxx0000000000gggggg0000000000
+ and WK&dst, BITMSK5, WK&dst, lsr #11 @ 00000000000rrrrr00000000000bbbbb
+ orr WK&dst, WK&dst, WK&dst, lsr #5 @ 00000000000xxxxxrrrrr000000bbbbb
+ orr WK&dst, WK&dst, TMP0, lsr #5 @ 00000xxxxxxxxxxxrrrrrggggggbbbbb
+.endm /* blended pixel is in the low halfword of WK&dst; the layout comments mark most of the upper halfword as don't-care */
+
+.macro over_n_0565_2pixels dst /* Two-pixel variant of over_n_0565_1pixel for both 0565 pixels packed in WK&dst: TMP0 carries both green fields, WK&dst the low pixel red/blue, TMP1 the high pixel red/blue (upper-case letters in the layout comments); clobbers TMP0 and TMP1 */
+ bic TMP1, WK&dst, BITMSK6, lsr #5 @ RRRRR000000BBBBBrrrrr000000bbbbb
+ and TMP0, BITMSK6, WK&dst, lsl #5 @ GGGGGG0000000000gggggg0000000000
+ mov WK&dst, TMP1, lsl #16 @ rrrrr000000bbbbb0000000000000000
+ orr TMP0, TMP0, lsr #6 @ GGGGGGGGGGGG0000gggggggggggg0000
+ bic TMP1, TMP1, WK&dst, lsr #16 @ RRRRR000000BBBBB0000000000000000
+ orr WK&dst, WK&dst, WK&dst, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr TMP1, TMP1, TMP1, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+ pkhtb WK&dst, WK&dst, WK&dst, asr #5 @ rrrrrrrrrr0xxxxxbbbbbbbbbb000000
+ pkhtb TMP1, TMP1, TMP1, asr #5 @ RRRRRRRRRR0xxxxxBBBBBBBBBB000000
+ uxtb16 TMP0, TMP0, ror #8 @ 00000000GGGGGGGG00000000gggggggg
+ uxtb16 WK&dst, WK&dst, ror #8 @ 00000000rrrrrrrr00000000bbbbbbbb
+ uxtb16 TMP1, TMP1, ror #8 @ 00000000RRRRRRRR00000000BBBBBBBB
+ mla TMP0, TMP0, ALPHA, HALF @ GGGGGGGGGGGGGGGGgggggggggggggggg
+ mla WK&dst, WK&dst, ALPHA, HALF @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+ mla TMP1, TMP1, ALPHA, HALF @ RRRRRRRRRRRRRRRRBBBBBBBBBBBBBBBB
+ uxtab16 TMP0, TMP0, TMP0, ror #8 @ GGGGGGGGGGGGGGGGgggggggggggggggg
+ uxtab16 WK&dst, WK&dst, WK&dst, ror #8 @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+ uxtab16 TMP1, TMP1, TMP1, ror #8 @ RRRRRRRRRRRRRRRRBBBBBBBBBBBBBBBB
+ uqadd8 TMP0, TMP0, SRCG @ GGGGGGGGxxxxxxxxggggggggxxxxxxxx
+ uqadd8 TMP1, TMP1, SRCRB @ RRRRRRRRxxxxxxxxBBBBBBBBxxxxxxxx
+ uqadd8 WK&dst, WK&dst, SRCRB @ rrrrrrrrxxxxxxxxbbbbbbbbxxxxxxxx
+ and TMP0, TMP0, BITMSK6 @ GGGGGG0000000000gggggg0000000000
+ and TMP1, BITMSK5, TMP1, lsr #11 @ 00000000000RRRRR00000000000BBBBB
+ and WK&dst, BITMSK5, WK&dst, lsr #11 @ 00000000000rrrrr00000000000bbbbb
+ orr TMP1, TMP1, TMP1, lsr #5 @ 00000000000xxxxxRRRRR000000BBBBB
+ orr WK&dst, WK&dst, WK&dst, lsr #5 @ 00000000000xxxxxrrrrr000000bbbbb
+ pkhbt TMP1, WK&dst, TMP1, LSL #16 @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr WK&dst, TMP1, TMP0, lsr #5 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+.endm /* both blended pixels are left packed in WK&dst */
+
+.macro over_n_0565_process_tail cond, numbytes, firstreg /* Tail stage: blend the destination pixels held in the WK registers starting at firstreg; cond is not referenced in the body */
+ .if numbytes == 2 /* exactly one 16-bit pixel */
+ over_n_0565_1pixel firstreg
+ .else /* otherwise process whole 32-bit registers, two pixels each */
+ .set PROCESS_REG, firstreg /* index of the WK register handled next */
+ .rept numbytes / 4 /* one iteration per 4 bytes, i.e. per WK register */
+ over_n_0565_2pixels %(PROCESS_REG) /* NOTE(review): %() presumably relies on .altmacro to pass the evaluated number rather than the symbol name - confirm */
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+ .endif
+.endm
+
+generate_composite_function /* Instantiate pixman_composite_over_n_0565_asm_armv6 from the over_n_0565_* macros above; generate_composite_function is defined elsewhere */ \
+ pixman_composite_over_n_0565_asm_armv6, 0, 0, 16, /* NOTE(review): 0, 0, 16 look like source, mask and destination bits per pixel - confirm against generate_composite_function */ \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_SPILL_LINE_VARS, \
+ 2, /* prefetch distance */ \
+ over_n_0565_init, \
+ over_n_0565_newline, \
+ over_n_0565_cleanup, \
+ over_n_0565_process_head, \
+ over_n_0565_process_tail
+
+/******************************************************************************/
+
.macro add_8888_8888_8888_init
/* Set GE[3:0] to 0101 so SEL instructions do what we want */
msr CPSR_s, #0x50000
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index c252d83..4473431 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -55,6 +55,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, armv6, over_n_8888,
uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, armv6, over_n_0565,
+ uint16_t, 1) /* Generates the binding referenced as armv6_composite_over_n_0565 in arm_simd_fast_paths[]; NOTE(review): uint16_t, 1 presumably give the destination pixel type and stride unit, and SKIP_ZERO_SRC presumably skips a zero solid source - confirm against the macro definition */
PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
uint32_t, 1)
@@ -296,6 +298,8 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, armv6_composite_over_n_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, null, a8b8g8r8, armv6_composite_over_n_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, null, x8b8g8r8, armv6_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, armv6_composite_over_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, armv6_composite_over_n_0565),
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, armv6_composite_over_reverse_n_8888),
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, armv6_composite_over_reverse_n_8888),