From 68d2f7b486a9ccc877a2214f7f5ef562e2846581 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 9 Apr 2014 16:25:31 +0300 Subject: ARMv6: Add fast path flag to force no preload of destination buffer --- pixman/pixman-arm-simd-asm.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h index 1bb8b45..0baf87a 100644 --- a/pixman/pixman-arm-simd-asm.h +++ b/pixman/pixman-arm-simd-asm.h @@ -78,6 +78,8 @@ .set FLAG_PROCESS_PRESERVES_SCRATCH, 64 .set FLAG_PROCESS_PRESERVES_WK0, 0 .set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */ +.set FLAG_PRELOAD_DST, 0 /* contributes no bits: the destination preloads guarded by FLAG_NO_PRELOAD_DST are still assembled */ +.set FLAG_NO_PRELOAD_DST, 256 /* fast path opts out of preloading the destination buffer; each destination preload guarded by "((flags) & FLAG_NO_PRELOAD_DST) == 0" is omitted */ /* * Number of bytes by which to adjust preload offset of destination @@ -445,7 +447,7 @@ preload_middle src_bpp, SRC, 0 preload_middle mask_bpp, MASK, 0 .endif - .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) + .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that * destination prefetches are 32-byte aligned. 
It's also the easiest channel to offset * preloads for, to achieve staggered prefetches for multiple channels, because there are @@ -480,7 +482,9 @@ .endif preload_trailing src_bpp, src_bpp_shift, SRC preload_trailing mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_trailing dst_r_bpp, dst_bpp_shift, DST + .endif add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp /* The remainder of the line is handled identically to the medium case */ medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask @@ -779,7 +783,9 @@ fname: newline preload_leading_step1 src_bpp, WK1, SRC preload_leading_step1 mask_bpp, WK2, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_leading_step1 dst_r_bpp, WK3, DST + .endif ands WK0, DST, #15 beq 154f @@ -787,7 +793,9 @@ fname: preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST + .endif leading_15bytes process_head, process_tail @@ -827,7 +835,9 @@ fname: newline preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ preload_line 0, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_line 0, dst_r_bpp, dst_bpp_shift, DST + .endif sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ ands WK0, DST, #15 @@ -856,7 +866,9 @@ fname: newline preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ preload_line 1, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_line 1, dst_r_bpp, dst_bpp_shift, DST + .endif .if dst_w_bpp == 8 tst DST, #3 -- cgit v1.2.3