summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pixman/pixman-arm-neon-asm.h176
1 files changed, 163 insertions, 13 deletions
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index aa5e9bd..d3b506d 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -205,6 +205,100 @@
.endif
.endm
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP1, mem_operand, TMP1, asl #1
+ mov TMP2, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #1
+ vld1.16 {d&reg1&[0]}, [TMP1, :16]
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP1, mem_operand, TMP1, asl #1
+ vld1.16 {d&reg1&[1]}, [TMP2, :16]
+ mov TMP2, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #1
+ vld1.16 {d&reg1&[2]}, [TMP1, :16]
+ vld1.16 {d&reg1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP1, mem_operand, TMP1, asl #2
+ mov TMP2, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #2
+ vld1.32 {d&reg1&[0]}, [TMP1, :32]
+ vld1.32 {d&reg1&[1]}, [TMP2, :32]
+.else
+ .error "unsupported"
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP1, mem_operand, TMP1, asl #1
+ vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP1, mem_operand, TMP1, asl #2
+ vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+ pixld1_s elem_size, %(basereg+4), mem_operand
+ pixld1_s elem_size, %(basereg+5), mem_operand
+ pixld1_s elem_size, %(basereg+6), mem_operand
+ pixld1_s elem_size, %(basereg+7), mem_operand
+ pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+ pixld1_s elem_size, %(basereg+2), mem_operand
+ pixld1_s elem_size, %(basereg+3), mem_operand
+.elseif numbytes == 8
+ pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+ .if elem_size == 32
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+ .elseif elem_size == 16
+ pixld0_s elem_size, %(basereg+0), 2, mem_operand
+ pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ .else
+ pixld0_s elem_size, %(basereg+0), 4, mem_operand
+ pixld0_s elem_size, %(basereg+0), 5, mem_operand
+ pixld0_s elem_size, %(basereg+0), 6, mem_operand
+ pixld0_s elem_size, %(basereg+0), 7, mem_operand
+ .endif
+.elseif numbytes == 2
+ .if elem_size == 16
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+ .else
+ pixld0_s elem_size, %(basereg+0), 2, mem_operand
+ pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ .endif
+.elseif numbytes == 1
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+ .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+ pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
.macro vuzp8 reg1, reg2
vuzp.8 d&reg1, d&reg2
.endm
@@ -792,7 +886,8 @@ fname:
* A simplified variant of function generation template for a single
* scanline processing (for implementing pixman combine functions)
*/
-.macro generate_composite_function_single_scanline fname, \
+.macro generate_composite_function_scanline use_nearest_scaling, \
+ fname, \
src_bpp_, \
mask_bpp_, \
dst_w_bpp_, \
@@ -830,23 +925,44 @@ fname:
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_
+.if use_nearest_scaling != 0
+ /*
+ * Assign symbolic names to registers for nearest scaling
+ */
+ W .req r0
+ DST_W .req r1
+ SRC .req r2
+ VX .req r3
+ UNIT_X .req ip
+ MASK .req lr
+ TMP1 .req r4
+ TMP2 .req r5
+ DST_R .req r6
+
.macro pixld_src x:vararg
- pixld x
- .endm
- .macro fetch_src_pixblock
- pixld_src pixblock_size, src_bpp, \
- (src_basereg - pixblock_size * src_bpp / 64), SRC
+ pixld_s x
.endm
-/*
- * Assign symbolic names to registers
- */
+ ldr UNIT_X, [sp]
+ push {r4-r6, lr}
+ .if mask_bpp != 0
+ ldr MASK, [sp, #(16 + 4)]
+ .endif
+.else
+ /*
+ * Assign symbolic names to registers
+ */
W .req r0 /* width (is updated during processing) */
DST_W .req r1 /* destination buffer pointer for writes */
SRC .req r2 /* source buffer pointer */
DST_R .req ip /* destination buffer pointer for reads */
MASK .req r3 /* mask pointer */
+ .macro pixld_src x:vararg
+ pixld x
+ .endm
+.endif
+
.if (((flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.else
@@ -858,6 +974,11 @@ fname:
.set DEINTERLEAVE_32BPP_ENABLED, 0
.endif
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+
init
mov DST_R, DST_W
@@ -896,7 +1017,11 @@ fname:
process_pixblock_tail_head
cleanup
- bx lr /* exit */
+.if use_nearest_scaling != 0
+ pop {r4-r6, pc} /* exit */
+.else
+ bx lr /* exit */
+.endif
8:
/* Process the remaining trailing pixels in the scanline (dst unaligned) */
process_trailing_pixels 0, 0, \
@@ -905,19 +1030,44 @@ fname:
process_pixblock_tail_head
cleanup
- bx lr /* exit */
- .purgem fetch_src_pixblock
- .purgem pixld_src
+.if use_nearest_scaling != 0
+ pop {r4-r6, pc} /* exit */
+
+ .unreq DST_R
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq DST_W
+ .unreq MASK
+
+.else
+ bx lr /* exit */
.unreq SRC
.unreq MASK
.unreq DST_R
.unreq DST_W
.unreq W
+.endif
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
.endfunc
.endm
+.macro generate_composite_function_single_scanline x:vararg
+ generate_composite_function_scanline 0, x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+ generate_composite_function_scanline 1, x
+.endm
+
/* Default prologue/epilogue, nothing special needs to be done */
.macro default_init