summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Avison <bavison@riscosopen.org>2014-06-18 00:27:16 +0100
committerBen Avison <bavison@riscosopen.org>2015-10-15 13:52:04 +0100
commitef69752de8ce76ef38c284cb940ce50433c0c8d4 (patch)
treea10bd68163b10b6bec32941039b122e4bbea0d07
parent0e4451de5a40504837c852d2d369c80975974f88 (diff)
armv6: Add optimised scanline fetchers and writeback for r5g6b5 and a8
This supports r5g6b5 source and destination images, and a8 source images.

lowlevel-blt-bench results for example operations which use these because they lack a dedicated fast path at the time of writing:

in_reverse_8_8888

      Before           After
      Mean    StdDev   Mean    StdDev   Confidence   Change
L1    30.0    0.3      37.0    0.3      100.0%       +23.2%
L2    23.3    0.3      29.4    0.4      100.0%       +26.1%
M     24.0    0.0      31.3    0.1      100.0%       +30.5%
HT    12.8    0.1      16.1    0.1      100.0%       +25.8%
VT    11.9    0.1      14.8    0.1      100.0%       +24.6%
R     11.7    0.1      14.6    0.1      100.0%       +24.5%
RT    5.1     0.1      6.2     0.1      100.0%       +20.2%

in_0565_8888

      Before           After
      Mean    StdDev   Mean    StdDev   Confidence   Change
L1    22.0    0.1      28.3    0.2      100.0%       +28.4%
L2    16.6    0.2      23.6    0.3      100.0%       +42.2%
M     16.5    0.0      24.7    0.1      100.0%       +49.5%
HT    11.0    0.1      13.7    0.1      100.0%       +24.4%
VT    10.7    0.0      13.1    0.1      100.0%       +22.0%
R     10.3    0.0      12.6    0.1      100.0%       +22.5%
RT    5.3     0.1      5.7     0.1      100.0%       +9.0%

in_reverse_8888_0565

      Before           After
      Mean    StdDev   Mean    StdDev   Confidence   Change
L1    16.6    0.1      20.9    0.1      100.0%       +25.5%
L2    13.1    0.1      17.7    0.3      100.0%       +35.3%
M     13.2    0.0      19.2    0.0      100.0%       +45.3%
HT    9.6     0.0      11.7    0.1      100.0%       +21.8%
VT    9.3     0.0      11.4    0.1      100.0%       +22.4%
R     9.0     0.0      10.9    0.1      100.0%       +21.1%
RT    4.7     0.1      5.2     0.1      100.0%       +8.7%
-rw-r--r--pixman/pixman-arm-common.h62
-rw-r--r--pixman/pixman-arm-simd-asm.S94
-rw-r--r--pixman/pixman-arm-simd.c22
3 files changed, 178 insertions, 0 deletions
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 6b905b6..31e75ac 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -444,4 +444,66 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp, \
pixman_composite_scanline_##name##_asm_##cputype (width, dest, src); \
}
+/*****************************************************************************/
+
+/* Support for untransformed fetchers and writeback */
+
+/*
+ * PIXMAN_ARM_BIND_GET_SCANLINE (cputype, name)
+ *
+ * Declares the assembly routine pixman_get_scanline_<name>_asm_<cputype>
+ * and defines <cputype>_get_scanline_<name>, a wrapper taking
+ * (pixman_iter_t *iter, const uint32_t *mask).
+ *
+ * The wrapper hands iter->width, iter->buffer and iter->bits to the
+ * assembly routine, then steps iter->bits on by iter->stride and returns
+ * iter->buffer.  The mask argument is not used.
+ *
+ * The wrapper has internal linkage: it is meant to be referenced only
+ * from an iterator table in the file that expands this macro, so it
+ * should not add a global symbol (or need a separate prototype).
+ */
+#define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name) \
+void \
+pixman_get_scanline_##name##_asm_##cputype (int32_t w, \
+                                            uint32_t *dst, \
+                                            const uint32_t *src); \
+ \
+static uint32_t * \
+cputype##_get_scanline_##name (pixman_iter_t *iter, const uint32_t *mask) \
+{ \
+    (void) mask; \
+    /* Cast keeps the const qualifier of the assembly prototype */ \
+    pixman_get_scanline_##name##_asm_##cputype (iter->width, iter->buffer, \
+                                                (const uint32_t *) iter->bits); \
+    iter->bits += iter->stride; \
+    return iter->buffer; \
+}
+
+/*
+ * PIXMAN_ARM_BIND_WRITE_BACK (cputype, name)
+ *
+ * Declares the assembly routine pixman_write_back_<name>_asm_<cputype>
+ * and defines <cputype>_write_back_<name>, a wrapper taking
+ * (pixman_iter_t *iter).
+ *
+ * The wrapper passes iter->width, the address (iter->bits - iter->stride)
+ * as destination and iter->buffer as source to the assembly routine.
+ * The stride is subtracted because the fetch functions paired with this
+ * write-back (the get_scanline wrapper, or a no-op fetch) have already
+ * advanced iter->bits by one stride for the row being written.
+ *
+ * The wrapper has internal linkage: it is meant to be referenced only
+ * from an iterator table in the file that expands this macro.
+ */
+#define PIXMAN_ARM_BIND_WRITE_BACK(cputype, name) \
+void \
+pixman_write_back_##name##_asm_##cputype (int32_t w, \
+                                          uint32_t *dst, \
+                                          const uint32_t *src); \
+ \
+static void \
+cputype##_write_back_##name (pixman_iter_t *iter) \
+{ \
+    pixman_write_back_##name##_asm_##cputype (iter->width, \
+                                              (uint32_t *)(iter->bits - iter->stride), \
+                                              iter->buffer); \
+}
+
+#define PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER(cputype, format) /* one iterator-table entry: source fetcher for PIXMAN_<format> */ \
+ { PIXMAN_ ## format, /* format this entry applies to */ \
+ (FAST_PATH_STANDARD_FLAGS | \
+ FAST_PATH_ID_TRANSFORM | \
+ FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | \
+ FAST_PATH_BITS_IMAGE), /* image flags required (slot meaning assumed from pixman_iter_info_t - confirm) */ \
+ ITER_NARROW | ITER_SRC, /* iterator flags: narrow, source side */ \
+ _pixman_iter_init_bits_stride, /* initializer, defined outside this header */ \
+ cputype ## _get_scanline_ ## format, /* wrapper produced by PIXMAN_ARM_BIND_GET_SCANLINE (cputype, format) */ \
+ NULL /* last slot empty: a source fetcher has nothing to write back */ \
+ }
+
+#define PIXMAN_ARM_WRITEBACK(cputype, format) /* expands to TWO iterator-table entries for PIXMAN_<format> destinations */ \
+ { PIXMAN_ ## format, /* entry 1: existing destination pixels are ignored */ \
+ FAST_PATH_STD_DEST_FLAGS, \
+ ITER_NARROW | ITER_DEST | ITER_IGNORE_RGB | ITER_IGNORE_ALPHA, \
+ _pixman_iter_init_bits_stride, \
+ fast_dest_fetch_noop, /* NOTE: not defined in this header; must be in scope where the macro is expanded */ \
+ cputype ## _write_back_ ## format /* wrapper produced by PIXMAN_ARM_BIND_WRITE_BACK */ \
+ }, \
+ \
+ { PIXMAN_ ## format, /* entry 2: destination is fetched, then written back */ \
+ FAST_PATH_STD_DEST_FLAGS, \
+ ITER_NARROW | ITER_DEST, \
+ _pixman_iter_init_bits_stride, \
+ cputype ## _get_scanline_ ## format, /* needs PIXMAN_ARM_BIND_GET_SCANLINE for the same format */ \
+ cputype ## _write_back_ ## format \
+ }
+
#endif
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 9ada8a2..abd03d5 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -388,6 +388,16 @@ generate_composite_function \
src_0565_8888_process_head, \
src_0565_8888_process_tail
+generate_composite_function_single_scanline \
+ pixman_get_scanline_r5g6b5_asm_armv6, 16, 0, 32, /* symbol name; 16, 0, 32 are presumably src/mask/dst bits per pixel - confirm against the generator macro */ \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+ 3, /* prefetch distance */ \
+ src_0565_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_0565_8888_process_head, /* same head/tail macros as the generate_composite_function call just above */ \
+ src_0565_8888_process_tail
+
/******************************************************************************/
.macro src_x888_0565_init
@@ -465,6 +475,90 @@ generate_composite_function \
src_x888_0565_process_head, \
src_x888_0565_process_tail
+generate_composite_function_single_scanline \
+ pixman_write_back_r5g6b5_asm_armv6, 32, 0, 16, /* symbol name; 32, 0, 16 are presumably src/mask/dst bits per pixel - confirm against the generator macro */ \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+ 3, /* prefetch distance */ \
+ src_x888_0565_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_x888_0565_process_head, /* same head/tail macros as the generate_composite_function call just above */ \
+ src_x888_0565_process_tail
+
+/******************************************************************************/
+
+.macro src_8_8888_init /* init hook passed to the a8 get_scanline generator call */
+ mov MASK, #0xff000000 /* constant ANDed in by the 2-/4-pixel macros and the inner loop to keep only bits 31:24 */
+.endm
+
+.macro src_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head hook; unaligned_mask and preload are not used here */
+ pixld cond, numbytes/4, firstreg, SRC, unaligned_src /* asks pixld for numbytes/4 bytes from SRC; presumably numbytes counts 32bpp output bytes, so this is one 8-bit byte per pixel - confirm */
+.endm
+
+.macro src_8_8888_1pixel cond, d0 /* expand one 8-bit value held in WK<d0> */
+ mov&cond WK&d0, WK&d0, lsl #24 /* bits 7:0 move to bits 31:24; bits 23:0 become zero */
+.endm
+
+.macro src_8_8888_2pixels cond, d0, d1 /* expand two 8-bit values packed in WK<d0> into WK<d0>, WK<d1> */
+ and&cond WK&d1, MASK, WK&d0, lsl #16 /* bits 15:8 of the input end up in bits 31:24 of WK<d1>; done first, while WK<d0> is still intact */
+ mov&cond WK&d0, WK&d0, lsl #24 /* bits 7:0 of the input end up in bits 31:24 of WK<d0> (overwrites the input) */
+.endm
+
+.macro src_8_8888_4pixels cond, d0, d1, d2, d3 /* expand four 8-bit values packed in WK<d0> into WK<d0>..WK<d3> */
+ and&cond WK&d3, MASK, WK&d0 /* bits 31:24 of the input stay in place in WK<d3> */
+ and&cond WK&d2, MASK, WK&d0, lsl #8 /* bits 23:16 of the input -> bits 31:24 of WK<d2> */
+ and&cond WK&d1, MASK, WK&d0, lsl #16 /* bits 15:8 of the input -> bits 31:24 of WK<d1> */
+ mov&cond WK&d0, WK&d0, lsl #24 /* bits 7:0 of the input -> bits 31:24 of WK<d0>; last, because it overwrites the input */
+.endm
+
+.macro src_8_8888_process_tail cond, numbytes, firstreg /* tail hook: pick the expansion macro matching numbytes */
+ .if numbytes == 16 /* 16 bytes -> four registers starting at WK<firstreg> */
+ src_8_8888_4pixels cond, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8 /* 8 bytes -> two registers */
+ src_8_8888_2pixels cond, %(firstreg+0), %(firstreg+1)
+ .else // numbytes == 4
+ src_8_8888_1pixel cond, %(firstreg+0)
+ .endif
+.endm
+
+.macro src_8_8888_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment /* custom inner loop; none of the five parameters is referenced in the body */
+110: /* Length of inner loop is set to allow one preload per 32 source pixels */
+ ldr STRIDE_M, [SRC], #4 /* load source word 1 of 8, post-incrementing SRC; STRIDE_M serves as a temporary here */
+ and WK3, MASK, STRIDE_M /* same expansion as src_8_8888_4pixels, with STRIDE_M as input */
+ and WK2, MASK, STRIDE_M, lsl #8
+ and WK1, MASK, STRIDE_M, lsl #16
+ mov WK0, STRIDE_M, lsl #24
+ ldr STRIDE_M, [SRC], #4 /* source word 2, loaded before the previous group is stored */
+ .rept 6 /* six copies: store the previous group, expand the loaded word, load the next (words 3..8) */
+ pixst , 16, 0, DST /* empty condition argument; presumably stores 16 bytes from WK0..WK3 to DST - see pixst */
+ and WK3, MASK, STRIDE_M
+ and WK2, MASK, STRIDE_M, lsl #8
+ and WK1, MASK, STRIDE_M, lsl #16
+ mov WK0, STRIDE_M, lsl #24
+ ldr STRIDE_M, [SRC], #4
+ .endr
+ pld [SRC, SCRATCH] /* the single preload of this iteration, at offset SCRATCH from SRC */
+ pixst , 16, 0, DST /* group 7 out */
+ and WK3, MASK, STRIDE_M /* expand word 8; no further load in this iteration */
+ and WK2, MASK, STRIDE_M, lsl #8
+ and WK1, MASK, STRIDE_M, lsl #16
+ mov WK0, STRIDE_M, lsl #24
+ pixst , 16, 0, DST /* group 8 out: 8 groups x 16 bytes per iteration */
+ subs X, X, #32 /* 8 source words = 32 source bytes consumed */
+ bhs 110b /* repeat while the subtraction did not borrow */
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_get_scanline_a8_asm_armv6, 8, 0, 32, /* symbol name; 8, 0, 32 are presumably src/mask/dst bits per pixel - confirm against the generator macro */ \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 2, /* prefetch distance */ \
+ src_8_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_8_8888_process_head, \
+ src_8_8888_process_tail, \
+ src_8_8888_inner_loop /* extra argument, absent from the two r5g6b5 calls: the hand-written inner loop */
+
/******************************************************************************/
.macro add_8_8_8pixels cond, dst1, dst2
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 6f263eb..095e418 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -98,6 +98,17 @@ PIXMAN_ARM_BIND_COMBINE_U (armv6, out)
PIXMAN_ARM_BIND_COMBINE_U (armv6, out_reverse)
PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
+PIXMAN_ARM_BIND_GET_SCANLINE (armv6, r5g6b5)
+PIXMAN_ARM_BIND_WRITE_BACK (armv6, r5g6b5)
+PIXMAN_ARM_BIND_GET_SCANLINE (armv6, a8)
+
+/*
+ * Destination "fetch" that reads nothing: it only steps iter->bits on by
+ * one stride and hands back iter->buffer untouched.  Used by the
+ * PIXMAN_ARM_WRITEBACK table entry whose iterator flags include
+ * ITER_IGNORE_RGB | ITER_IGNORE_ALPHA.  The mask argument is ignored.
+ */
+static uint32_t *
+fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *scanline = iter->buffer;
+
+    (void) mask;
+    iter->bits += iter->stride;
+
+    return scanline;
+}
+
void
pixman_composite_src_n_8888_asm_armv6 (int32_t w,
int32_t h,
@@ -307,6 +318,16 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
{ PIXMAN_OP_NONE },
};
+static const pixman_iter_info_t arm_simd_iters[] = /* iterator table; assigned to imp->iter_info in _pixman_implementation_create_arm_simd */
+{
+ PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (armv6, r5g6b5), /* r5g6b5 source fetch */
+ PIXMAN_ARM_WRITEBACK (armv6, r5g6b5), /* expands to two r5g6b5 destination entries */
+
+ PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (armv6, a8), /* a8 source fetch only; no a8 write-back is bound in this file */
+
+ { PIXMAN_null }, /* final entry; presumably the end-of-table marker - confirm against the table lookup code */
+};
+
pixman_implementation_t *
_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
{
@@ -321,6 +342,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
imp->combine_32[PIXMAN_OP_OUT_REVERSE] = armv6_combine_out_reverse_u;
imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u;
+ imp->iter_info = arm_simd_iters;
imp->blt = arm_simd_blt;
imp->fill = arm_simd_fill;