summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ben Avison <bavison@riscosopen.org> 2013-01-19 16:16:52 +0000
committer Siarhei Siamashka <siarhei.siamashka@gmail.com> 2013-01-29 21:47:59 +0200
commit f87dfd6f37a29c69320edd92f28aed5334b09366 (patch)
tree 539512c19e72d1e8bd08a795137e9e09340418fa
parent a0f59f3b2884b056428229363576666f158a9bb4 (diff)
ARMv6: New conversion routines
There was no previous attempt at accelerating these specifically for ARMv6.

src_x888_8888

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  96.7   0.5      270.4  2.6     100.0%      +179.5%
L2  44.6   2.7      110.6  9.7     100.0%      +148.0%
M   26.9   0.1      87.6   0.5     100.0%      +226.1%
HT  19.3   0.2      37.5   0.4     100.0%      +93.7%
VT  18.6   0.1      33.7   0.4     100.0%      +81.6%
R   18.4   0.1      32.2   0.3     100.0%      +75.2%
RT  9.2    0.2      12.1   0.3     100.0%      +31.4%

src_0565_8888

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  37.0   0.3      66.9   0.2     100.0%      +80.8%
L2  30.3   0.2      55.9   0.3     100.0%      +84.4%
M   25.9   0.0      62.3   0.2     100.0%      +140.3%
HT  15.2   0.1      33.1   0.3     100.0%      +116.9%
VT  15.1   0.1      30.7   0.3     100.0%      +103.6%
R   14.2   0.1      27.6   0.3     100.0%      +94.0%
RT  6.0    0.1      11.2   0.3     100.0%      +87.2%
-rw-r--r-- pixman/pixman-arm-simd-asm.S 123
-rw-r--r-- pixman/pixman-arm-simd.c 12
2 files changed, 135 insertions, 0 deletions
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 49993b5e..a3e2d045 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -180,3 +180,126 @@ generate_composite_function \
/******************************************************************************/
+.macro src_x888_8888_pixel, cond, reg
+ orr&cond WK&reg, WK&reg, #0xFF000000
+.endm
+
+.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
+ src_x888_8888_pixel cond, %(firstreg+0)
+ .if numbytes >= 8
+ src_x888_8888_pixel cond, %(firstreg+1)
+ .if numbytes == 16
+ src_x888_8888_pixel cond, %(firstreg+2)
+ src_x888_8888_pixel cond, %(firstreg+3)
+ .endif
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 3, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ pixman_composite_src_x888_8888_process_head, \
+ pixman_composite_src_x888_8888_process_tail
+
+/******************************************************************************/
+
+.macro src_0565_8888_init
+ /* Hold loop invariants in MASK and STRIDE_M */
+ ldr MASK, =0x07E007E0
+ mov STRIDE_M, #0xFF000000
+ /* Set GE[3:0] to 1010 so SEL instructions do what we want */
+ ldr SCRATCH, =0x80008000
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+.endm
+
+.macro src_0565_8888_2pixels, reg1, reg2
+ and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
+ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
+ bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+ orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+ pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
+ sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
+ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
+ pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
+ sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+ orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+.endm
+
+/* This version doesn't need STRIDE_M, but is one instruction longer.
+ It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
+ and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
+ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
+ bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+ mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
+ mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
+ orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
+ pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+ sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+*/
+
+.macro src_0565_8888_1pixel, reg
+ bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
+ and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
+ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ orr WK&reg, WK&reg, WK&reg, lsr #6 @ 0000000000000000gggggggggggg0000
+ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+.endm
+
+.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+ pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
+ .elseif numbytes == 8
+ pixld , 4, firstreg, SRC, unaligned_src
+ .elseif numbytes == 4
+ pixld , 2, firstreg, SRC, unaligned_src
+ .endif
+.endm
+
+.macro src_0565_8888_process_tail cond, numbytes, firstreg
+ .if numbytes == 16
+ src_0565_8888_2pixels firstreg, %(firstreg+1)
+ src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+ src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .else
+ src_0565_8888_1pixel firstreg
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+ 3, /* prefetch distance */ \
+ src_0565_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_0565_8888_process_head, \
+ src_0565_8888_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index de66e575..09a5036a 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -378,10 +378,14 @@ pixman_composite_over_n_8_8888_asm_armv6 (int32_t width,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
+ uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565,
uint16_t, 1, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8,
uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888,
+ uint16_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
uint8_t, 1, uint8_t, 1)
@@ -523,6 +527,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888),
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, armv6_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, armv6_composite_src_x888_8888),
+
PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565),
PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565),
PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565),
@@ -549,6 +556,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8),
PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888),
+
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),