diff options
author | Ben Avison <bavison@riscosopen.org> | 2014-06-18 00:27:16 +0100 |
---|---|---|
committer | Ben Avison <bavison@riscosopen.org> | 2015-10-15 13:52:04 +0100 |
commit | ef69752de8ce76ef38c284cb940ce50433c0c8d4 (patch) | |
tree | a10bd68163b10b6bec32941039b122e4bbea0d07 | |
parent | 0e4451de5a40504837c852d2d369c80975974f88 (diff) |
armv6: Add optimised scanline fetchers and writeback for r5g6b5 and a8
This supports r5g6b5 source and destination images, and a8 source images.
lowlevel-blt-bench results for example operations which use these because
they lack a dedicated fast path at the time of writing:
in_reverse_8_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 30.0 0.3 37.0 0.3 100.0% +23.2%
L2 23.3 0.3 29.4 0.4 100.0% +26.1%
M 24.0 0.0 31.3 0.1 100.0% +30.5%
HT 12.8 0.1 16.1 0.1 100.0% +25.8%
VT 11.9 0.1 14.8 0.1 100.0% +24.6%
R 11.7 0.1 14.6 0.1 100.0% +24.5%
RT 5.1 0.1 6.2 0.1 100.0% +20.2%
in_0565_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 22.0 0.1 28.3 0.2 100.0% +28.4%
L2 16.6 0.2 23.6 0.3 100.0% +42.2%
M 16.5 0.0 24.7 0.1 100.0% +49.5%
HT 11.0 0.1 13.7 0.1 100.0% +24.4%
VT 10.7 0.0 13.1 0.1 100.0% +22.0%
R 10.3 0.0 12.6 0.1 100.0% +22.5%
RT 5.3 0.1 5.7 0.1 100.0% +9.0%
in_reverse_8888_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 16.6 0.1 20.9 0.1 100.0% +25.5%
L2 13.1 0.1 17.7 0.3 100.0% +35.3%
M 13.2 0.0 19.2 0.0 100.0% +45.3%
HT 9.6 0.0 11.7 0.1 100.0% +21.8%
VT 9.3 0.0 11.4 0.1 100.0% +22.4%
R 9.0 0.0 10.9 0.1 100.0% +21.1%
RT 4.7 0.1 5.2 0.1 100.0% +8.7%
-rw-r--r-- | pixman/pixman-arm-common.h | 62 | ||||
-rw-r--r-- | pixman/pixman-arm-simd-asm.S | 94 | ||||
-rw-r--r-- | pixman/pixman-arm-simd.c | 22 |
3 files changed, 178 insertions, 0 deletions
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h index 6b905b6..31e75ac 100644 --- a/pixman/pixman-arm-common.h +++ b/pixman/pixman-arm-common.h @@ -444,4 +444,66 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp, \ pixman_composite_scanline_##name##_asm_##cputype (width, dest, src); \ } +/*****************************************************************************/ + +/* Support for untransformed fetchers and writeback */ + +#define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name) \ +void \ +pixman_get_scanline_##name##_asm_##cputype (int32_t w, \ + uint32_t *dst, \ + const uint32_t *src); \ + \ +uint32_t * \ +cputype##_get_scanline_##name (pixman_iter_t *iter, const uint32_t *mask) \ +{ \ + pixman_get_scanline_##name##_asm_##cputype (iter->width, iter->buffer, \ + (uint32_t *) iter->bits); \ + iter->bits += iter->stride; \ + return iter->buffer; \ +} + +#define PIXMAN_ARM_BIND_WRITE_BACK(cputype, name) \ +void \ +pixman_write_back_##name##_asm_##cputype (int32_t w, \ + uint32_t *dst, \ + const uint32_t *src); \ + \ +void \ +cputype##_write_back_##name (pixman_iter_t *iter) \ +{ \ + pixman_write_back_##name##_asm_##cputype (iter->width, \ + (uint32_t *)(iter->bits - iter->stride), \ + iter->buffer); \ +} + +#define PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER(cputype, format) \ + { PIXMAN_ ## format, \ + (FAST_PATH_STANDARD_FLAGS | \ + FAST_PATH_ID_TRANSFORM | \ + FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | \ + FAST_PATH_BITS_IMAGE), \ + ITER_NARROW | ITER_SRC, \ + _pixman_iter_init_bits_stride, \ + cputype ## _get_scanline_ ## format, \ + NULL \ + } + +#define PIXMAN_ARM_WRITEBACK(cputype, format) \ + { PIXMAN_ ## format, \ + FAST_PATH_STD_DEST_FLAGS, \ + ITER_NARROW | ITER_DEST | ITER_IGNORE_RGB | ITER_IGNORE_ALPHA, \ + _pixman_iter_init_bits_stride, \ + fast_dest_fetch_noop, \ + cputype ## _write_back_ ## format \ + }, \ + \ + { PIXMAN_ ## format, \ + FAST_PATH_STD_DEST_FLAGS, \ + ITER_NARROW | ITER_DEST, \ + _pixman_iter_init_bits_stride, \ + 
cputype ## _get_scanline_ ## format, \ + cputype ## _write_back_ ## format \ + } + #endif diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S index 9ada8a2..abd03d5 100644 --- a/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman-arm-simd-asm.S @@ -388,6 +388,16 @@ generate_composite_function \ src_0565_8888_process_head, \ src_0565_8888_process_tail +generate_composite_function_single_scanline \ + pixman_get_scanline_r5g6b5_asm_armv6, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ + 3, /* prefetch distance */ \ + src_0565_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + src_0565_8888_process_head, \ + src_0565_8888_process_tail + /******************************************************************************/ .macro src_x888_0565_init @@ -465,6 +475,90 @@ generate_composite_function \ src_x888_0565_process_head, \ src_x888_0565_process_tail +generate_composite_function_single_scanline \ + pixman_write_back_r5g6b5_asm_armv6, 32, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \ + 3, /* prefetch distance */ \ + src_x888_0565_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + src_x888_0565_process_head, \ + src_x888_0565_process_tail + +/******************************************************************************/ + +.macro src_8_8888_init + mov MASK, #0xff000000 +.endm + +.macro src_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + pixld cond, numbytes/4, firstreg, SRC, unaligned_src +.endm + +.macro src_8_8888_1pixel cond, d0 + mov&cond WK&d0, WK&d0, lsl #24 +.endm + +.macro src_8_8888_2pixels cond, d0, d1 + and&cond WK&d1, MASK, WK&d0, lsl #16 + mov&cond WK&d0, WK&d0, lsl #24 +.endm + +.macro src_8_8888_4pixels cond, d0, d1, d2, d3 + and&cond WK&d3, MASK, WK&d0 + and&cond WK&d2, MASK, WK&d0, lsl #8 + and&cond WK&d1, MASK, WK&d0, lsl #16 + mov&cond WK&d0, WK&d0, lsl #24 +.endm + 
+.macro src_8_8888_process_tail cond, numbytes, firstreg + .if numbytes == 16 + src_8_8888_4pixels cond, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3) + .elseif numbytes == 8 + src_8_8888_2pixels cond, %(firstreg+0), %(firstreg+1) + .else // numbytes == 4 + src_8_8888_1pixel cond, %(firstreg+0) + .endif +.endm + +.macro src_8_8888_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment +110: /* Length of inner loop is set to allow one preload per 32 source pixels */ + ldr STRIDE_M, [SRC], #4 + and WK3, MASK, STRIDE_M + and WK2, MASK, STRIDE_M, lsl #8 + and WK1, MASK, STRIDE_M, lsl #16 + mov WK0, STRIDE_M, lsl #24 + ldr STRIDE_M, [SRC], #4 + .rept 6 + pixst , 16, 0, DST + and WK3, MASK, STRIDE_M + and WK2, MASK, STRIDE_M, lsl #8 + and WK1, MASK, STRIDE_M, lsl #16 + mov WK0, STRIDE_M, lsl #24 + ldr STRIDE_M, [SRC], #4 + .endr + pld [SRC, SCRATCH] + pixst , 16, 0, DST + and WK3, MASK, STRIDE_M + and WK2, MASK, STRIDE_M, lsl #8 + and WK1, MASK, STRIDE_M, lsl #16 + mov WK0, STRIDE_M, lsl #24 + pixst , 16, 0, DST + subs X, X, #32 + bhs 110b +.endm + +generate_composite_function_single_scanline \ + pixman_get_scanline_a8_asm_armv6, 8, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 2, /* prefetch distance */ \ + src_8_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + src_8_8888_process_head, \ + src_8_8888_process_tail, \ + src_8_8888_inner_loop + /******************************************************************************/ .macro add_8_8_8pixels cond, dst1, dst2 diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c index 6f263eb..095e418 100644 --- a/pixman/pixman-arm-simd.c +++ b/pixman/pixman-arm-simd.c @@ -98,6 +98,17 @@ PIXMAN_ARM_BIND_COMBINE_U (armv6, out) PIXMAN_ARM_BIND_COMBINE_U (armv6, out_reverse) PIXMAN_ARM_BIND_COMBINE_U (armv6, add) +PIXMAN_ARM_BIND_GET_SCANLINE (armv6, r5g6b5) +PIXMAN_ARM_BIND_WRITE_BACK (armv6, r5g6b5) 
+PIXMAN_ARM_BIND_GET_SCANLINE (armv6, a8) + +static uint32_t * +fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask) +{ + iter->bits += iter->stride; + return iter->buffer; +} + void pixman_composite_src_n_8888_asm_armv6 (int32_t w, int32_t h, @@ -307,6 +318,16 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = { PIXMAN_OP_NONE }, }; +static const pixman_iter_info_t arm_simd_iters[] = +{ + PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (armv6, r5g6b5), + PIXMAN_ARM_WRITEBACK (armv6, r5g6b5), + + PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (armv6, a8), + + { PIXMAN_null }, +}; + pixman_implementation_t * _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback) { @@ -321,6 +342,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback) imp->combine_32[PIXMAN_OP_OUT_REVERSE] = armv6_combine_out_reverse_u; imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u; + imp->iter_info = arm_simd_iters; imp->blt = arm_simd_blt; imp->fill = arm_simd_fill; |