diff options
author | Ben Avison <bavison@riscosopen.org> | 2013-01-19 16:16:52 +0000 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2013-01-29 21:47:59 +0200 |
commit | f87dfd6f37a29c69320edd92f28aed5334b09366 (patch) | |
tree | 539512c19e72d1e8bd08a795137e9e09340418fa | |
parent | a0f59f3b2884b056428229363576666f158a9bb4 (diff) |
ARMv6: New conversion routines
There was no previous attempt at accelerating these specifically for
ARMv6.
src_x888_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 96.7 0.5 270.4 2.6 100.0% +179.5%
L2 44.6 2.7 110.6 9.7 100.0% +148.0%
M 26.9 0.1 87.6 0.5 100.0% +226.1%
HT 19.3 0.2 37.5 0.4 100.0% +93.7%
VT 18.6 0.1 33.7 0.4 100.0% +81.6%
R 18.4 0.1 32.2 0.3 100.0% +75.2%
RT 9.2 0.2 12.1 0.3 100.0% +31.4%
src_0565_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 37.0 0.3 66.9 0.2 100.0% +80.8%
L2 30.3 0.2 55.9 0.3 100.0% +84.4%
M 25.9 0.0 62.3 0.2 100.0% +140.3%
HT 15.2 0.1 33.1 0.3 100.0% +116.9%
VT 15.1 0.1 30.7 0.3 100.0% +103.6%
R 14.2 0.1 27.6 0.3 100.0% +94.0%
RT 6.0 0.1 11.2 0.3 100.0% +87.2%
-rw-r--r-- | pixman/pixman-arm-simd-asm.S | 123 | ||||
-rw-r--r-- | pixman/pixman-arm-simd.c | 12 |
2 files changed, 135 insertions, 0 deletions
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S index 49993b5e..a3e2d045 100644 --- a/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman-arm-simd-asm.S @@ -180,3 +180,126 @@ generate_composite_function \ /******************************************************************************/ +.macro src_x888_8888_pixel, cond, reg + orr&cond WK&reg, WK&reg, #0xFF000000 +.endm + +.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + pixld cond, numbytes, firstreg, SRC, unaligned_src +.endm + +.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg + src_x888_8888_pixel cond, %(firstreg+0) + .if numbytes >= 8 + src_x888_8888_pixel cond, %(firstreg+1) + .if numbytes == 16 + src_x888_8888_pixel cond, %(firstreg+2) + src_x888_8888_pixel cond, %(firstreg+3) + .endif + .endif +.endm + +generate_composite_function \ + pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 3, /* prefetch distance */ \ + nop_macro, /* init */ \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + pixman_composite_src_x888_8888_process_head, \ + pixman_composite_src_x888_8888_process_tail + +/******************************************************************************/ + +.macro src_0565_8888_init + /* Hold loop invariants in MASK and STRIDE_M */ + ldr MASK, =0x07E007E0 + mov STRIDE_M, #0xFF000000 + /* Set GE[3:0] to 1010 so SEL instructions do what we want */ + ldr SCRATCH, =0x80008000 + uadd8 SCRATCH, SCRATCH, SCRATCH +.endm + +.macro src_0565_8888_2pixels, reg1, reg2 + and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000 + bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg + mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000 + mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG + bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ 
RRRRR000000BBBBB0000000000000000 + orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000 + orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000 + pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb-------- + sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb-------- + mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg + pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB-------- + sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB-------- + orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb + orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB +.endm + +/* This version doesn't need STRIDE_M, but is one instruction longer. + It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case? + and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000 + bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg + mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB + mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000 + bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb + mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000 + mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000 + orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB + orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb + pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB + pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb + sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB + sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb + orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB + orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb +*/ + +.macro src_0565_8888_1pixel, reg + bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb + and WK&reg, WK&reg, MASK @ 
000000000000000000000gggggg00000 + mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000 + mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000 + orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb + orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000 + pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb + sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb + orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb +.endm + +.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + .if numbytes == 16 + pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src + .elseif numbytes == 8 + pixld , 4, firstreg, SRC, unaligned_src + .elseif numbytes == 4 + pixld , 2, firstreg, SRC, unaligned_src + .endif +.endm + +.macro src_0565_8888_process_tail cond, numbytes, firstreg + .if numbytes == 16 + src_0565_8888_2pixels firstreg, %(firstreg+1) + src_0565_8888_2pixels %(firstreg+2), %(firstreg+3) + .elseif numbytes == 8 + src_0565_8888_2pixels firstreg, %(firstreg+1) + .else + src_0565_8888_1pixel firstreg + .endif +.endm + +generate_composite_function \ + pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ + 3, /* prefetch distance */ \ + src_0565_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + src_0565_8888_process_head, \ + src_0565_8888_process_tail + +/******************************************************************************/ + diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c index de66e575..09a5036a 100644 --- a/pixman/pixman-arm-simd.c +++ b/pixman/pixman-arm-simd.c @@ -378,10 +378,14 @@ pixman_composite_over_n_8_8888_asm_armv6 (int32_t width, PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888, uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888, + uint32_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, 
src_0565_0565, uint16_t, 1, uint16_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8, uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888, + uint16_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, uint8_t, 1, uint8_t, 1) @@ -523,6 +527,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888), PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, armv6_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, armv6_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565), PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565), PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565), @@ -549,6 +556,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8), PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888), |