diff options
-rw-r--r-- | pixman/pixman-arm-neon-asm.h | 176 |
1 files changed, 163 insertions, 13 deletions
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h index aa5e9bd..d3b506d 100644 --- a/pixman/pixman-arm-neon-asm.h +++ b/pixman/pixman-arm-neon-asm.h @@ -205,6 +205,100 @@ .endif .endm +/* + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register + * aliases to be defined) + */ +.macro pixld1_s elem_size, reg1, mem_operand +.if elem_size == 16 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #1 + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #1 + vld1.16 {d&reg1&[0]}, [TMP1, :16] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #1 + vld1.16 {d&reg1&[1]}, [TMP2, :16] + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #1 + vld1.16 {d&reg1&[2]}, [TMP1, :16] + vld1.16 {d&reg1&[3]}, [TMP2, :16] +.elseif elem_size == 32 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + vld1.32 {d&reg1&[0]}, [TMP1, :32] + vld1.32 {d&reg1&[1]}, [TMP2, :32] +.else + .error "unsupported" +.endif +.endm + +.macro pixld0_s elem_size, reg1, idx, mem_operand +.if elem_size == 16 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #1 + vld1.16 {d&reg1&[idx]}, [TMP1, :16] +.elseif elem_size == 32 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #2 + vld1.32 {d&reg1&[idx]}, [TMP1, :32] +.endif +.endm + +.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand +.if numbytes == 32 + pixld1_s elem_size, %(basereg+4), mem_operand + pixld1_s elem_size, %(basereg+5), mem_operand + pixld1_s elem_size, %(basereg+6), mem_operand + pixld1_s elem_size, %(basereg+7), mem_operand + pixdeinterleave elem_size, %(basereg+4) +.elseif numbytes == 16 + pixld1_s elem_size, %(basereg+2), mem_operand + pixld1_s elem_size, %(basereg+3), mem_operand +.elseif numbytes == 8 + pixld1_s elem_size, 
%(basereg+1), mem_operand +.elseif numbytes == 4 + .if elem_size == 32 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .elseif elem_size == 16 + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 4, mem_operand + pixld0_s elem_size, %(basereg+0), 5, mem_operand + pixld0_s elem_size, %(basereg+0), 6, mem_operand + pixld0_s elem_size, %(basereg+0), 7, mem_operand + .endif +.elseif numbytes == 2 + .if elem_size == 16 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .endif +.elseif numbytes == 1 + pixld0_s elem_size, %(basereg+0), 1, mem_operand +.else + .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld_s numpix, bpp, basereg, mem_operand +.if bpp > 0 + pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand +.endif +.endm + .macro vuzp8 reg1, reg2 vuzp.8 d&reg1, d&reg2 .endm @@ -792,7 +886,8 @@ fname: * A simplified variant of function generation template for a single * scanline processing (for implementing pixman combine functions) */ -.macro generate_composite_function_single_scanline fname, \ +.macro generate_composite_function_scanline use_nearest_scaling, \ + fname, \ src_bpp_, \ mask_bpp_, \ dst_w_bpp_, \ @@ -830,23 +925,44 @@ fname: .set src_basereg, src_basereg_ .set mask_basereg, mask_basereg_ +.if use_nearest_scaling != 0 + /* + * Assign symbolic names to registers for nearest scaling + */ + W .req r0 + DST_W .req r1 + SRC .req r2 + VX .req r3 + UNIT_X .req ip + MASK .req lr + TMP1 .req r4 + TMP2 .req r5 + DST_R .req r6 + .macro pixld_src x:vararg - pixld x - .endm - .macro fetch_src_pixblock - pixld_src pixblock_size, src_bpp, \ - (src_basereg - pixblock_size * src_bpp / 64), SRC + pixld_s x .endm -/* - * Assign symbolic names to registers - */ + ldr UNIT_X, [sp] + push {r4-r6, lr} + .if mask_bpp != 0 + ldr MASK, [sp, #(16 + 
4)] + .endif +.else + /* + * Assign symbolic names to registers + */ W .req r0 /* width (is updated during processing) */ DST_W .req r1 /* destination buffer pointer for writes */ SRC .req r2 /* source buffer pointer */ DST_R .req ip /* destination buffer pointer for reads */ MASK .req r3 /* mask pointer */ + .macro pixld_src x:vararg + pixld x + .endm +.endif + .if (((flags) & FLAG_DST_READWRITE) != 0) .set dst_r_bpp, dst_w_bpp .else @@ -858,6 +974,11 @@ fname: .set DEINTERLEAVE_32BPP_ENABLED, 0 .endif + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + init mov DST_R, DST_W @@ -896,7 +1017,11 @@ fname: process_pixblock_tail_head cleanup - bx lr /* exit */ +.if use_nearest_scaling != 0 + pop {r4-r6, pc} /* exit */ +.else + bx lr /* exit */ +.endif 8: /* Process the remaining trailing pixels in the scanline (dst unaligned) */ process_trailing_pixels 0, 0, \ @@ -905,19 +1030,44 @@ fname: process_pixblock_tail_head cleanup - bx lr /* exit */ - .purgem fetch_src_pixblock - .purgem pixld_src +.if use_nearest_scaling != 0 + pop {r4-r6, pc} /* exit */ + + .unreq DST_R + .unreq SRC + .unreq W + .unreq VX + .unreq UNIT_X + .unreq TMP1 + .unreq TMP2 + .unreq DST_W + .unreq MASK + +.else + bx lr /* exit */ .unreq SRC .unreq MASK .unreq DST_R .unreq DST_W .unreq W +.endif + + .purgem fetch_src_pixblock + .purgem pixld_src + .endfunc .endm +.macro generate_composite_function_single_scanline x:vararg + generate_composite_function_scanline 0, x +.endm + +.macro generate_composite_function_nearest_scanline x:vararg + generate_composite_function_scanline 1, x +.endm + /* Default prologue/epilogue, nothing special needs to be done */ .macro default_init |