From 9eb6889b15a180cc94aad8ac97189af5b3a68b96 Mon Sep 17 00:00:00 2001
From: Ben Avison
Date: Mon, 7 Sep 2015 14:40:48 +0300
Subject: armv6: Add over_n_8888 fast path (disabled)

This new fast path is initially disabled by putting the entries in the
lookup table after the sentinel. The compiler cannot tell the new code
is not used, so it cannot eliminate the code. Also the lookup table size
will include the new fast path. When the follow-up patch then enables
the new fast path, the binary layout (alignments, size, etc.) will stay
the same compared to the disabled case.

Keeping the binary layout identical is important for benchmarking on
Raspberry Pi 1. The addresses at which functions are loaded will have a
significant impact on benchmark results, causing unexpected performance
changes. Keeping all function addresses the same across the patch
enabling a new fast path improves the reliability of benchmarks.

Benchmark results are included in the patch enabling this fast path.

[Pekka: disabled the fast path, commit message]
Signed-off-by: Pekka Paalanen
---
 pixman/pixman-arm-simd-asm.S | 41 +++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |  7 +++++++
 2 files changed, 48 insertions(+)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 7b0727b..a74a0a8 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1136,3 +1136,44 @@ generate_composite_function \
     in_reverse_8888_8888_process_tail
 
 /******************************************************************************/
+/* over_n_8888_init: one-time setup of the loop invariants held in SRC, MASK and STRIDE_M, plus the GE flags. */
+.macro over_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]   /* SRC <- word at sp+ARGS_STACK_OFFSET (presumably the solid source colour - confirm) */
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Hold multiplier for destination in STRIDE_M */
+        mov     STRIDE_M, #255
+        sub     STRIDE_M, STRIDE_M, SRC, lsr #24   /* STRIDE_M = 255 - (SRC >> 24), i.e. 255 minus the top byte of SRC */
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+.endm
+/* over_n_8888_process_head: only issues pixld on DST (presumably loads numbytes of destination into WK registers from firstreg on - confirm against pixld). */
+.macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   , numbytes, firstreg, DST, 0
+.endm
+/* over_n_8888_1pixel: WK<dst> = mul_8888_8 (WK<dst>, STRIDE_M), then SRC is added with per-byte unsigned saturation (UQADD8). */
+.macro over_n_8888_1pixel dst
+        mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK   /* presumably scales each byte of WK<dst> by STRIDE_M/255 - confirm against mul_8888_8 */
+        uqadd8  WK&dst, WK&dst, SRC
+.endm
+/* over_n_8888_process_tail: applies over_n_8888_1pixel to numbytes/4 consecutive WK registers starting at firstreg, then issues pixst on DST. */
+.macro over_n_8888_process_tail cond, numbytes, firstreg
+        .set PROCESS_REG, firstreg              /* assembly-time counter naming the WK register to process */
+        .rept numbytes / 4                      /* one expansion per 4 bytes */
+        over_n_8888_1pixel %(PROCESS_REG)
+        .set PROCESS_REG, PROCESS_REG+1
+        .endr
+        pixst   , numbytes, firstreg, DST       /* the tail itself stores; see FLAG_PROCESS_DOES_STORE below */
+.endm
+/* Instantiate pixman_composite_over_n_8888_asm_armv6 from the over_n_8888 macros; the newline and cleanup hooks are nop_macro. */
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
+    2, /* prefetch distance */ \
+    over_n_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_n_8888_process_head, \
+    over_n_8888_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index f40ff36..62c0f41 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -51,6 +51,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
 
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, armv6, over_n_8888,
+                                 uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
                                  uint32_t, 1)
 
@@ -271,6 +273,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
 
     { PIXMAN_OP_NONE },
+    /* Deliberately listed after the PIXMAN_OP_NONE sentinel: over_n_8888 is disabled for now but still occupies table space, so enabling it later does not change the binary layout. */
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, armv6_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, armv6_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8b8g8r8, armv6_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8b8g8r8, armv6_composite_over_n_8888),
 };
 
 pixman_implementation_t *
-- 
cgit v1.2.3