summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Avison <bavison@riscosopen.org>2014-04-09 16:25:29 +0300
committerSøren Sandmann <ssp@redhat.com>2014-04-21 20:34:26 -0400
commit73d2f8b61ae5e320a7795c29b6041b7885cf2953 (patch)
treeaf636e5b39b9316402f2fd86ab47c03f9946995b
parent857e40f3d2bc2cfb714913e0cd7e6184cf69aca3 (diff)
ARMv6: Support for very variable-hungry composite operations
Previously, the variable ARGS_STACK_OFFSET was available to extract values from function arguments during the init macro. Now this changes dynamically around stack operations in the function as a whole so that arguments can be accessed at any point. It is also joined by LOCALS_STACK_OFFSET, which allows access to space reserved on the stack during the init macro. On top of this, composite macros now have the option of using all of WK0-WK3 registers rather than just the subset it was told to use; this requires the pixel count to be spilled to the stack over the leading pixels at the start of each line. Thus, at best, each composite operation can use 11 registers, plus any pointer registers not required for the composite type, plus as much stack space as it needs, divided up into constants and variables as necessary.
-rw-r--r--pixman/pixman-arm-simd-asm.h56
1 files changed, 53 insertions, 3 deletions
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 24b1ad2f..852a113f 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -76,6 +76,8 @@
.set FLAG_SPILL_LINE_VARS, 48
.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+.set FLAG_PROCESS_PRESERVES_WK0, 0
+.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
/*
* Offset into stack where mask and source pointer/stride can be accessed.
@@ -87,6 +89,11 @@
#endif
/*
+ * Offset into stack where space allocated during init macro can be accessed.
+ */
+.set LOCALS_STACK_OFFSET, 0
+
+/*
* Constants for selecting preferable prefetch type.
*/
.set PREFETCH_TYPE_NONE, 0
@@ -359,23 +366,41 @@
.macro test_bits_1_0_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */
+ .else
movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
+ .endif
.endm
.macro test_bits_3_2_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */
+ .else
movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */
+ .endif
.endm
.macro leading_15bytes process_head, process_tail
/* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+ .set DECREMENT_X, 1
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set DECREMENT_X, 0
+ sub X, X, WK0, lsr #dst_bpp_shift
+ str X, [sp, #LINE_SAVED_REG_COUNT*4]
+ mov X, WK0
+ .endif
/* Use unaligned loads in all cases for simplicity */
.if dst_w_bpp == 8
- conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
+ conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
.elseif dst_w_bpp == 16
test_bits_1_0_ptr
- conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
+ conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+ .endif
+ conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
.endif
- conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
.macro test_bits_3_2_pix
@@ -705,6 +730,13 @@ fname:
#endif
init
+
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ /* Reserve a word in which to store X during leading pixels */
+ sub sp, sp, #4
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
+ .endif
lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
@@ -734,6 +766,8 @@ fname:
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
151: /* New line */
newline
@@ -767,6 +801,10 @@ fname:
157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+ .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
.endif
.ltorg
@@ -776,6 +814,8 @@ fname:
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
newline
@@ -841,12 +881,22 @@ fname:
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
197:
.if (flags) & FLAG_SPILL_LINE_VARS
add sp, sp, #LINE_SAVED_REG_COUNT*4
.endif
198:
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
+ add sp, sp, #4
+ .endif
+
cleanup
#ifdef DEBUG_PARAMS