Diffstat (limited to 'pixman/pixman-arm-simd-asm.S')
-rw-r--r-- | pixman/pixman-arm-simd-asm.S | 470
1 file changed, 236 insertions, 234 deletions
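The whole change is one mechanical rewrite: every macro body moves from GNU as's divided-syntax parameter style (bare parameter names, with '&' splicing a parameter onto an adjacent token, as in WK&reg) to explicit unified-syntax substitution (\reg, with \() as a token separator), and a pixman_syntax_unified invocation is added near the top of the file to match. The backslash spelling is the one that assemblers such as clang's integrated assembler accept. A minimal sketch of the two spellings, using a hypothetical set_alpha macro modelled on the file's src_x888_8888_pixel:

    @ Old, divided-syntax style (GNU as only): parameter names are
    @ substituted implicitly and '&' concatenates them onto tokens:
    @
    @ .macro set_alpha, cond, reg
    @         orr&cond   WK&reg, WK&reg, #0xFF000000
    @ .endm

    @ New, unified style: each substitution is marked with '\', and
    @ '\()' ends the preceding token so 'WK' and '\reg' still join:
    .macro set_alpha, cond, reg
            orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
    .endm

With WK0 aliased to a real register via .req, "set_alpha eq, 0" expands to "orreq WK0, WK0, #0xFF000000" under either spelling; only the second is understood by assemblers that lack the legacy '&' concatenation.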
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a74a0a8..34d38f1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -40,6 +40,8 @@
 #include "pixman-arm-asm.h"
 #include "pixman-arm-simd-asm.h"
 
+        pixman_syntax_unified
+
 /* A head macro should do all processing which results in an output of up to
  * 16 bytes, as far as the final load instruction. The corresponding tail macro
  * should complete the processing of the up-to-16 bytes. The calling macro will
@@ -57,7 +59,7 @@
 .endm
 
 .macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
 .endm
 
 .macro blit_inner_loop      process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
@@ -65,8 +67,8 @@
     WK5     .req    STRIDE_S
     WK6     .req    MASK
     WK7     .req    STRIDE_M
-110:    pixld   , 16, 0, SRC, unaligned_src
-        pixld   , 16, 4, SRC, unaligned_src
+110:    pixld   , 16, 0, SRC, \unaligned_src
+        pixld   , 16, 4, SRC, \unaligned_src
         pld     [SRC, SCRATCH]
         pixst   , 16, 0, DST
         pixst   , 16, 4, DST
@@ -122,7 +124,7 @@ generate_composite_function \
 
 .macro src_n_0565_init
         ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #16
+        orr     SRC, SRC, SRC, lsl #16
         mov     STRIDE_S, SRC
         mov     MASK, SRC
         mov     STRIDE_M, SRC
@@ -130,8 +132,8 @@ generate_composite_function \
 
 .macro src_n_8_init
         ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #8
-        orr     SRC, SRC, lsl #16
+        orr     SRC, SRC, SRC, lsl #8
+        orr     SRC, SRC, SRC, lsl #16
         mov     STRIDE_S, SRC
         mov     MASK, SRC
         mov     STRIDE_M, SRC
@@ -142,7 +144,7 @@ generate_composite_function \
     WK5     .req    STRIDE_S
     WK6     .req    MASK
     WK7     .req    STRIDE_M
-        pixst   cond, numbytes, 4, DST
+        pixst   \cond, \numbytes, 4, DST
     .unreq  WK4
     .unreq  WK5
     .unreq  WK6
@@ -182,20 +184,20 @@ generate_composite_function \
 /******************************************************************************/
 
 .macro src_x888_8888_pixel, cond, reg
-        orr&cond WK&reg, WK&reg, #0xFF000000
+        orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
 .endm
 
 .macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
 .endm
 
 .macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
-        src_x888_8888_pixel cond, %(firstreg+0)
- .if numbytes >= 8
-        src_x888_8888_pixel cond, %(firstreg+1)
-  .if numbytes == 16
-        src_x888_8888_pixel cond, %(firstreg+2)
-        src_x888_8888_pixel cond, %(firstreg+3)
+        src_x888_8888_pixel \cond, %(\firstreg+0)
+ .if \numbytes >= 8
+        src_x888_8888_pixel \cond, %(\firstreg+1)
+  .if \numbytes == 16
+        src_x888_8888_pixel \cond, %(\firstreg+2)
+        src_x888_8888_pixel \cond, %(\firstreg+3)
  .endif
 .endif
 .endm
@@ -222,73 +224,73 @@ generate_composite_function \
 .endm
 
 .macro src_0565_8888_2pixels, reg1, reg2
-        and     SCRATCH, WK&reg1, MASK              @ 00000GGGGGG0000000000gggggg00000
-        bic     WK&reg2, WK&reg1, MASK              @ RRRRR000000BBBBBrrrrr000000bbbbb
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6   @ 00000GGGGGGGGGGGG0000ggggggggggg
-        mov     WK&reg1, WK&reg2, lsl #16           @ rrrrr000000bbbbb0000000000000000
-        mov     SCRATCH, SCRATCH, ror #19           @ GGGG0000ggggggggggg00000GGGGGGGG
-        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16  @ RRRRR000000BBBBB0000000000000000
-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5   @ rrrrrrrrrr0bbbbbbbbbb00000000000
-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5   @ RRRRRRRRRR0BBBBBBBBBB00000000000
-        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5   @ rrrrrrrr--------bbbbbbbb--------
-        sel     WK&reg1, WK&reg1, SCRATCH           @ rrrrrrrrggggggggbbbbbbbb--------
-        mov     SCRATCH, SCRATCH, ror #16           @ ggg00000GGGGGGGGGGGG0000gggggggg
-        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5   @ RRRRRRRR--------BBBBBBBB--------
-        sel     WK&reg2, WK&reg2, SCRATCH           @ RRRRRRRRGGGGGGGGBBBBBBBB--------
-        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8  @ 11111111rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8  @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+        and     SCRATCH, WK\()\reg1, MASK                    @ 00000GGGGGG0000000000gggggg00000
+        bic     WK\()\reg2, WK\()\reg1, MASK                 @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6            @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK\()\reg1, WK\()\reg2, lsl #16              @ rrrrr000000bbbbb0000000000000000
+        mov     SCRATCH, SCRATCH, ror #19                    @ GGGG0000ggggggggggg00000GGGGGGGG
+        bic     WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16  @ RRRRR000000BBBBB0000000000000000
+        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5   @ rrrrrrrrrr0bbbbbbbbbb00000000000
+        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5   @ RRRRRRRRRR0BBBBBBBBBB00000000000
+        pkhtb   WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5   @ rrrrrrrr--------bbbbbbbb--------
+        sel     WK\()\reg1, WK\()\reg1, SCRATCH              @ rrrrrrrrggggggggbbbbbbbb--------
+        mov     SCRATCH, SCRATCH, ror #16                    @ ggg00000GGGGGGGGGGGG0000gggggggg
+        pkhtb   WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5   @ RRRRRRRR--------BBBBBBBB--------
+        sel     WK\()\reg2, WK\()\reg2, SCRATCH              @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+        orr     WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8     @ 11111111rrrrrrrrggggggggbbbbbbbb
+        orr     WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8     @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
 .endm
 
 /* This version doesn't need STRIDE_M, but is one instruction longer.
    It would however be preferable for an XRGB target, since we could
   knock off the last 2 instructions, but is that a common case?
-        and     SCRATCH, WK&reg1, MASK              @ 00000GGGGGG0000000000gggggg00000
-        bic     WK&reg1, WK&reg1, MASK              @ RRRRR000000BBBBBrrrrr000000bbbbb
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6   @ 00000GGGGGGGGGGGG0000ggggggggggg
-        mov     WK&reg2, WK&reg1, lsr #16           @ 0000000000000000RRRRR000000BBBBB
-        mov     SCRATCH, SCRATCH, ror #27           @ GGGGGGGGGGGG0000ggggggggggg00000
-        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16  @ 0000000000000000rrrrr000000bbbbb
-        mov     WK&reg2, WK&reg2, lsl #3            @ 0000000000000RRRRR000000BBBBB000
-        mov     WK&reg1, WK&reg1, lsl #3            @ 0000000000000rrrrr000000bbbbb000
-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5   @ 0000000000000RRRRRRRRRR0BBBBBBBB
-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
-        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5   @ --------RRRRRRRR--------BBBBBBBB
-        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
-        sel     WK&reg2, SCRATCH, WK&reg2           @ --------RRRRRRRRGGGGGGGGBBBBBBBB
-        sel     WK&reg1, SCRATCH, WK&reg1           @ --------rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg2, WK&reg2, #0xFF000000       @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
-        orr     WK&reg1, WK&reg1, #0xFF000000       @ 11111111rrrrrrrrggggggggbbbbbbbb
+        and     SCRATCH, WK\()\reg1, MASK                    @ 00000GGGGGG0000000000gggggg00000
+        bic     WK\()\reg1, WK\()\reg1, MASK                 @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6            @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK\()\reg2, WK\()\reg1, lsr #16              @ 0000000000000000RRRRR000000BBBBB
+        mov     SCRATCH, SCRATCH, ror #27                    @ GGGGGGGGGGGG0000ggggggggggg00000
+        bic     WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16  @ 0000000000000000rrrrr000000bbbbb
+        mov     WK\()\reg2, WK\()\reg2, lsl #3               @ 0000000000000RRRRR000000BBBBB000
+        mov     WK\()\reg1, WK\()\reg1, lsl #3               @ 0000000000000rrrrr000000bbbbb000
+        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5   @ 0000000000000RRRRRRRRRR0BBBBBBBB
+        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        pkhbt   WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5   @ --------RRRRRRRR--------BBBBBBBB
+        pkhbt   WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK\()\reg2, SCRATCH, WK\()\reg2              @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+        sel     WK\()\reg1, SCRATCH, WK\()\reg1              @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK\()\reg2, WK\()\reg2, #0xFF000000          @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+        orr     WK\()\reg1, WK\()\reg1, #0xFF000000          @ 11111111rrrrrrrrggggggggbbbbbbbb
 */
 
 .macro src_0565_8888_1pixel, reg
-        bic     SCRATCH, WK&reg, MASK               @ 0000000000000000rrrrr000000bbbbb
-        and     WK&reg, WK&reg, MASK                @ 000000000000000000000gggggg00000
-        mov     SCRATCH, SCRATCH, lsl #3            @ 0000000000000rrrrr000000bbbbb000
-        mov     WK&reg, WK&reg, lsl #5              @ 0000000000000000gggggg0000000000
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
-        orr     WK&reg, WK&reg, WK&reg, lsr #6      @ 000000000000000gggggggggggg00000
-        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
-        sel     WK&reg, WK&reg, SCRATCH             @ --------rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg, WK&reg, #0xFF000000         @ 11111111rrrrrrrrggggggggbbbbbbbb
+        bic     SCRATCH, WK\()\reg, MASK                     @ 0000000000000000rrrrr000000bbbbb
+        and     WK\()\reg, WK\()\reg, MASK                   @ 000000000000000000000gggggg00000
+        mov     SCRATCH, SCRATCH, lsl #3                     @ 0000000000000rrrrr000000bbbbb000
+        mov     WK\()\reg, WK\()\reg, lsl #5                 @ 0000000000000000gggggg0000000000
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5            @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        orr     WK\()\reg, WK\()\reg, WK\()\reg, lsr #6      @ 000000000000000gggggggggggg00000
+        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5            @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK\()\reg, WK\()\reg, SCRATCH                @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK\()\reg, WK\()\reg, #0xFF000000            @ 11111111rrrrrrrrggggggggbbbbbbbb
 .endm
 
 .macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 16
-        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
- .elseif numbytes == 8
-        pixld   , 4, firstreg, SRC, unaligned_src
- .elseif numbytes == 4
-        pixld   , 2, firstreg, SRC, unaligned_src
+ .if \numbytes == 16
+        pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
+ .elseif \numbytes == 8
+        pixld   , 4, \firstreg, SRC, \unaligned_src
+ .elseif \numbytes == 4
+        pixld   , 2, \firstreg, SRC, \unaligned_src
 .endif
 .endm
 
 .macro src_0565_8888_process_tail   cond, numbytes, firstreg
- .if numbytes == 16
-        src_0565_8888_2pixels firstreg, %(firstreg+1)
-        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
-        src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .if \numbytes == 16
+        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
+        src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
 .else
-        src_0565_8888_1pixel firstreg
+        src_0565_8888_1pixel \firstreg
 .endif
 .endm
@@ -311,23 +313,23 @@ generate_composite_function \
 .endm
 
 .macro src_x888_0565_1pixel  s, d
-        and     WK&d, MASK, WK&s, lsr #3            @ 00000000000rrrrr00000000000bbbbb
-        and     STRIDE_S, WK&s, #0xFC00             @ 0000000000000000gggggg0000000000
-        orr     WK&d, WK&d, WK&d, lsr #5            @ 00000000000-----rrrrr000000bbbbb
-        orr     WK&d, WK&d, STRIDE_S, lsr #5        @ 00000000000-----rrrrrggggggbbbbb
+        and     WK\()\d, MASK, WK\()\s, lsr #3               @ 00000000000rrrrr00000000000bbbbb
+        and     STRIDE_S, WK\()\s, #0xFC00                   @ 0000000000000000gggggg0000000000
+        orr     WK\()\d, WK\()\d, WK\()\d, lsr #5            @ 00000000000-----rrrrr000000bbbbb
+        orr     WK\()\d, WK\()\d, STRIDE_S, lsr #5           @ 00000000000-----rrrrrggggggbbbbb
         /* Top 16 bits are discarded during the following STRH */
 .endm
 
 .macro src_x888_0565_2pixels  slo, shi, d, tmp
-        and     SCRATCH, WK&shi, #0xFC00            @ 0000000000000000GGGGGG0000000000
-        and     WK&tmp, MASK, WK&shi, lsr #3        @ 00000000000RRRRR00000000000BBBBB
-        and     WK&shi, MASK, WK&slo, lsr #3        @ 00000000000rrrrr00000000000bbbbb
-        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5      @ 00000000000-----RRRRR000000BBBBB
-        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5     @ 00000000000-----RRRRRGGGGGGBBBBB
-        and     SCRATCH, WK&slo, #0xFC00            @ 0000000000000000gggggg0000000000
-        orr     WK&shi, WK&shi, WK&shi, lsr #5      @ 00000000000-----rrrrr000000bbbbb
-        orr     WK&shi, WK&shi, SCRATCH, lsr #5     @ 00000000000-----rrrrrggggggbbbbb
-        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16       @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+        and     SCRATCH, WK\()\shi, #0xFC00                  @ 0000000000000000GGGGGG0000000000
+        and     WK\()\tmp, MASK, WK\()\shi, lsr #3           @ 00000000000RRRRR00000000000BBBBB
+        and     WK\()\shi, MASK, WK\()\slo, lsr #3           @ 00000000000rrrrr00000000000bbbbb
+        orr     WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5      @ 00000000000-----RRRRR000000BBBBB
+        orr     WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5        @ 00000000000-----RRRRRGGGGGGBBBBB
+        and     SCRATCH, WK\()\slo, #0xFC00                  @ 0000000000000000gggggg0000000000
+        orr     WK\()\shi, WK\()\shi, WK\()\shi, lsr #5      @ 00000000000-----rrrrr000000bbbbb
+        orr     WK\()\shi, WK\()\shi, SCRATCH, lsr #5        @ 00000000000-----rrrrrggggggbbbbb
+        pkhbt   WK\()\d, WK\()\shi, WK\()\tmp, lsl #16       @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
 .endm
 
 .macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
@@ -335,33 +337,33 @@ generate_composite_function \
     WK5     .req    STRIDE_M
     WK6     .req    WK3
     WK7     .req    ORIG_W
- .if numbytes == 16
+ .if \numbytes == 16
         pixld   , 16, 4, SRC, 0
         src_x888_0565_2pixels  4, 5, 0, 0
         pixld   , 8, 4, SRC, 0
         src_x888_0565_2pixels  6, 7, 1, 1
         pixld   , 8, 6, SRC, 0
 .else
-        pixld   , numbytes*2, 4, SRC, 0
+        pixld   , \numbytes*2, 4, SRC, 0
 .endif
 .endm
 
 .macro src_x888_0565_process_tail   cond, numbytes, firstreg
- .if numbytes == 16
+ .if \numbytes == 16
         src_x888_0565_2pixels  4, 5, 2, 2
         src_x888_0565_2pixels  6, 7, 3, 4
- .elseif numbytes == 8
+ .elseif \numbytes == 8
         src_x888_0565_2pixels  4, 5, 1, 1
         src_x888_0565_2pixels  6, 7, 2, 2
- .elseif numbytes == 4
+ .elseif \numbytes == 4
         src_x888_0565_2pixels  4, 5, 1, 1
 .else
         src_x888_0565_1pixel   4, 1
 .endif
- .if numbytes == 16
-        pixst   , numbytes, 0, DST
+ .if \numbytes == 16
+        pixst   , \numbytes, 0, DST
 .else
-        pixst   , numbytes, 1, DST
+        pixst   , \numbytes, 1, DST
 .endif
     .unreq  WK4
     .unreq  WK5
@@ -382,37 +384,37 @@ generate_composite_function \
 /******************************************************************************/
 
 .macro add_8_8_8pixels  cond, dst1, dst2
-        uqadd8&cond WK&dst1, WK&dst1, MASK
-        uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
+        uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
+        uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
 .endm
 
 .macro add_8_8_4pixels  cond, dst
-        uqadd8&cond WK&dst, WK&dst, MASK
+        uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
 .endm
 
 .macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
     WK4     .req    MASK
     WK5     .req    STRIDE_M
- .if numbytes == 16
-        pixld   cond, 8, 4, SRC, unaligned_src
-        pixld   cond, 16, firstreg, DST, 0
-        add_8_8_8pixels cond, firstreg, %(firstreg+1)
-        pixld   cond, 8, 4, SRC, unaligned_src
+ .if \numbytes == 16
+        pixld   \cond, 8, 4, SRC, \unaligned_src
+        pixld   \cond, 16, \firstreg, DST, 0
+        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
+        pixld   \cond, 8, 4, SRC, \unaligned_src
 .else
-        pixld   cond, numbytes, 4, SRC, unaligned_src
-        pixld   cond, numbytes, firstreg, DST, 0
+        pixld   \cond, \numbytes, 4, SRC, \unaligned_src
+        pixld   \cond, \numbytes, \firstreg, DST, 0
 .endif
     .unreq  WK4
     .unreq  WK5
 .endm
 
 .macro add_8_8_process_tail  cond, numbytes, firstreg
- .if numbytes == 16
-        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
-        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .if \numbytes == 16
+        add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
 .else
-        add_8_8_4pixels cond, firstreg
+        add_8_8_4pixels \cond, \firstreg
 .endif
 .endm
 
@@ -441,8 +443,8 @@ generate_composite_function \
     WK5     .req    STRIDE_S
     WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
-        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, %(4+\firstreg), SRC, \unaligned_src
+        pixld   , \numbytes, \firstreg, DST, 0
     .unreq  WK4
     .unreq  WK5
     .unreq  WK6
@@ -451,44 +453,44 @@ generate_composite_function \
 .macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
         /* Since these colours a premultiplied by alpha, only 0 indicates
          * transparent (any other colour with 0 in the alpha byte is luminous) */
-        teq     WK&reg0, #0
- .if numbytes > 4
-        teqeq   WK&reg1, #0
-  .if numbytes > 8
-        teqeq   WK&reg2, #0
-        teqeq   WK&reg3, #0
+        teq     WK\()\reg0, #0
+ .if \numbytes > 4
+        teqeq   WK\()\reg1, #0
+  .if \numbytes > 8
+        teqeq   WK\()\reg2, #0
+        teqeq   WK\()\reg3, #0
  .endif
 .endif
 .endm
 
 .macro over_8888_8888_prepare  next
-        mov     WK&next, WK&next, lsr #24
+        mov     WK\()\next, WK\()\next, lsr #24
 .endm
 
 .macro over_8888_8888_1pixel src, dst, offset, next
         /* src = destination component multiplier */
-        rsb     WK&src, WK&src, #255
+        rsb     WK\()\src, WK\()\src, #255
         /* Split even/odd bytes of dst into SCRATCH/dst */
-        uxtb16  SCRATCH, WK&dst
-        uxtb16  WK&dst, WK&dst, ror #8
+        uxtb16  SCRATCH, WK\()\dst
+        uxtb16  WK\()\dst, WK\()\dst, ror #8
         /* Multiply through, adding 0.5 to the upper byte of result for rounding */
-        mla     SCRATCH, SCRATCH, WK&src, MASK
-        mla     WK&dst, WK&dst, WK&src, MASK
+        mla     SCRATCH, SCRATCH, WK\()\src, MASK
+        mla     WK\()\dst, WK\()\dst, WK\()\src, MASK
         /* Where we would have had a stall between the result of the first MLA and the shifter input,
          * reload the complete source pixel */
-        ldr     WK&src, [SRC, #offset]
+        ldr     WK\()\src, [SRC, #\offset]
         /* Multiply by 257/256 to approximate 256/255 */
         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
         /* In this stall, start processing the next pixel */
- .if offset < -4
-        mov     WK&next, WK&next, lsr #24
+ .if \offset < -4
+        mov     WK\()\next, WK\()\next, lsr #24
 .endif
-        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
         /* Recombine even/odd bytes of multiplied destination */
         mov     SCRATCH, SCRATCH, ror #8
-        sel     WK&dst, SCRATCH, WK&dst
+        sel     WK\()\dst, SCRATCH, WK\()\dst
         /* Saturated add of source to multiplied destination */
-        uqadd8  WK&dst, WK&dst, WK&src
+        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
 .endm
 
 .macro over_8888_8888_process_tail  cond, numbytes, firstreg
@@ -496,17 +498,17 @@ generate_composite_function \
     WK5     .req    STRIDE_S
     WK6     .req    STRIDE_M
     WK7     .req    ORIG_W
-        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+        over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
         beq     10f
-        over_8888_8888_prepare %(4+firstreg)
- .set PROCESS_REG, firstreg
- .set PROCESS_OFF, -numbytes
- .rept numbytes / 4
+        over_8888_8888_prepare %(4+\firstreg)
+ .set PROCESS_REG, \firstreg
+ .set PROCESS_OFF, -\numbytes
+ .rept \numbytes / 4
         over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 10:
     .unreq  WK4
     .unreq  WK5
@@ -536,16 +538,16 @@ generate_composite_function \
  */
 
 .macro mul_8888_8  word, byte, tmp, half
         /* Split even/odd bytes of word apart */
-        uxtb16  tmp, word
-        uxtb16  word, word, ror #8
+        uxtb16  \tmp, \word
+        uxtb16  \word, \word, ror #8
         /* Multiply bytes together with rounding, then by 257/256 */
-        mla     tmp, tmp, byte, half
-        mla     word, word, byte, half      /* 1 stall follows */
-        uxtab16 tmp, tmp, tmp, ror #8       /* 1 stall follows */
-        uxtab16 word, word, word, ror #8
+        mla     \tmp, \tmp, \byte, \half
+        mla     \word, \word, \byte, \half  /* 1 stall follows */
+        uxtab16 \tmp, \tmp, \tmp, ror #8    /* 1 stall follows */
+        uxtab16 \word, \word, \word, ror #8
         /* Recombine bytes */
-        mov     tmp, tmp, ror #8
-        sel     word, tmp, word
+        mov     \tmp, \tmp, ror #8
+        sel     \word, \tmp, \word
 .endm
 
@@ -567,8 +569,8 @@ generate_composite_function \
     WK5     .req    STRIDE_D
     WK6     .req    STRIDE_S
     WK7     .req    ORIG_W
-        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
+        pixld   , \numbytes, \firstreg, DST, 0
     .unreq  WK4
     .unreq  WK5
     .unreq  WK6
@@ -576,10 +578,10 @@ generate_composite_function \
 .endm
 
 .macro over_8888_n_8888_1pixel src, dst
-        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
-        sub     WK7, WK6, WK&src, lsr #24
-        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
-        uqadd8  WK&dst, WK&dst, WK&src
+        mul_8888_8  WK\()\src, MASK, SCRATCH, STRIDE_M
+        sub     WK7, WK6, WK\()\src, lsr #24
+        mul_8888_8  WK\()\dst, WK7, SCRATCH, STRIDE_M
+        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
 .endm
 
 .macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
@@ -587,12 +589,12 @@ generate_composite_function \
     WK5     .req    STRIDE_D
     WK6     .req    STRIDE_S
     WK7     .req    ORIG_W
-        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+        over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
         beq     10f
         mov     WK6, #255
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
-  .if numbytes == 16 && PROCESS_REG == 2
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+  .if \numbytes == 16 && PROCESS_REG == 2
         /* We're using WK6 and WK7 as temporaries, so half way through
          * 4 pixels, reload the second two source pixels but this time
          * into WK4 and WK5 */
@@ -601,7 +603,7 @@ generate_composite_function \
         over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 10:
     .unreq  WK4
     .unreq  WK5
@@ -642,13 +644,13 @@ generate_composite_function \
 
 .macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
     WK4     .req    STRIDE_M
-        pixld   , numbytes/4, 4, MASK, unaligned_mask
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes/4, 4, MASK, \unaligned_mask
+        pixld   , \numbytes, \firstreg, DST, 0
     .unreq  WK4
 .endm
 
 .macro over_n_8_8888_1pixel src, dst
-        uxtb    Y, WK4, ror #src*8
+        uxtb    Y, WK4, ror #\src*8
         /* Trailing part of multiplication of source */
         mla     SCRATCH, STRIDE_S, Y, STRIDE_D
         mla     Y, SRC, Y, STRIDE_D
@@ -659,20 +661,20 @@ generate_composite_function \
         sub     ORIG_W, ORIG_W, Y, lsr #24
         sel     Y, SCRATCH, Y
         /* Then multiply the destination */
-        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
-        uqadd8  WK&dst, WK&dst, Y
+        mul_8888_8  WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
+        uqadd8  WK\()\dst, WK\()\dst, Y
 .endm
 
 .macro over_n_8_8888_process_tail  cond, numbytes, firstreg
     WK4     .req    STRIDE_M
         teq     WK4, #0
         beq     10f
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
-        over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+        over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 10:
     .unreq  WK4
 .endm
@@ -705,14 +707,14 @@ generate_composite_function \
 .endm
 
 .macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, \firstreg, DST, 0
 .endm
 
 .macro over_reverse_n_8888_1pixel  d, is_only
-        teq     WK&d, #0
+        teq     WK\()\d, #0
         beq     8f   /* replace with source */
-        bics    ORIG_W, STRIDE_D, WK&d, lsr #24
- .if is_only == 1
+        bics    ORIG_W, STRIDE_D, WK\()\d, lsr #24
+ .if \is_only == 1
         beq     49f  /* skip store */
 .else
         beq     9f   /* write same value back */
@@ -723,36 +725,36 @@ generate_composite_function \
         uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
         mov     SCRATCH, SCRATCH, ror #8
         sel     ORIG_W, SCRATCH, ORIG_W
-        uqadd8  WK&d, WK&d, ORIG_W
+        uqadd8  WK\()\d, WK\()\d, ORIG_W
         b       9f
-8:      mov     WK&d, SRC
+8:      mov     WK\()\d, SRC
 9:
 .endm
 
 .macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
-        over_reverse_n_8888_1pixel reg1, 1
+ .if \numbytes == 4
+        over_reverse_n_8888_1pixel \reg1, 1
 .else
-        and     SCRATCH, WK&reg1, WK&reg2
-  .if numbytes == 16
-        and     SCRATCH, SCRATCH, WK&reg3
-        and     SCRATCH, SCRATCH, WK&reg4
+        and     SCRATCH, WK\()\reg1, WK\()\reg2
+  .if \numbytes == 16
+        and     SCRATCH, SCRATCH, WK\()\reg3
+        and     SCRATCH, SCRATCH, WK\()\reg4
  .endif
         mvns    SCRATCH, SCRATCH, asr #24
         beq     49f  /* skip store if all opaque */
-        over_reverse_n_8888_1pixel reg1, 0
-        over_reverse_n_8888_1pixel reg2, 0
- .if numbytes == 16
-        over_reverse_n_8888_1pixel reg3, 0
-        over_reverse_n_8888_1pixel reg4, 0
+        over_reverse_n_8888_1pixel \reg1, 0
+        over_reverse_n_8888_1pixel \reg2, 0
+ .if \numbytes == 16
+        over_reverse_n_8888_1pixel \reg3, 0
+        over_reverse_n_8888_1pixel \reg4, 0
 .endif
 .endif
-        pixst   , numbytes, reg1, DST
+        pixst   , \numbytes, \reg1, DST
 49:
 .endm
 
 .macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
-        over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+        over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
 .endm
 
 generate_composite_function \
@@ -794,20 +796,20 @@ generate_composite_function \
 
 .macro over_white_8888_8888_ca_combine  m, d
         uxtb16  TMP1, TMP0                 /* rb_notmask */
-        uxtb16  TMP2, d                    /* rb_dest; 1 stall follows */
+        uxtb16  TMP2, \d                   /* rb_dest; 1 stall follows */
         smlatt  TMP3, TMP2, TMP1, HALF     /* red */
         smlabb  TMP2, TMP2, TMP1, HALF     /* blue */
         uxtb16  TMP0, TMP0, ror #8         /* ag_notmask */
-        uxtb16  TMP1, d, ror #8            /* ag_dest; 1 stall follows */
-        smlatt  d, TMP1, TMP0, HALF        /* alpha */
+        uxtb16  TMP1, \d, ror #8           /* ag_dest; 1 stall follows */
+        smlatt  \d, TMP1, TMP0, HALF       /* alpha */
         smlabb  TMP1, TMP1, TMP0, HALF     /* green */
         pkhbt   TMP0, TMP2, TMP3, lsl #16  /* rb; 1 stall follows */
-        pkhbt   TMP1, TMP1, d, lsl #16     /* ag */
+        pkhbt   TMP1, TMP1, \d, lsl #16    /* ag */
         uxtab16 TMP0, TMP0, TMP0, ror #8
         uxtab16 TMP1, TMP1, TMP1, ror #8
         mov     TMP0, TMP0, ror #8
-        sel     d, TMP0, TMP1
-        uqadd8  d, d, m                    /* d is a late result */
+        sel     \d, TMP0, TMP1
+        uqadd8  \d, \d, \m                 /* d is a late result */
 .endm
 
 .macro over_white_8888_8888_ca_1pixel_head
@@ -853,10 +855,10 @@ generate_composite_function \
 .endm
 
 .macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 4
+ .if \numbytes == 4
         over_white_8888_8888_ca_1pixel_head
 .else
-  .if numbytes == 16
+  .if \numbytes == 16
         over_white_8888_8888_ca_2pixels_head
         over_white_8888_8888_ca_2pixels_tail
 .endif
@@ -865,7 +867,7 @@ generate_composite_function \
 .endm
 
 .macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
- .if numbytes == 4
+ .if \numbytes == 4
         over_white_8888_8888_ca_1pixel_tail
 .else
         over_white_8888_8888_ca_2pixels_tail
@@ -1004,7 +1006,7 @@ generate_composite_function \
 .endm
 
 .macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .rept (numbytes / 4) - 1
+ .rept (\numbytes / 4) - 1
         over_n_8888_8888_ca_1pixel_head
         over_n_8888_8888_ca_1pixel_tail
 .endr
@@ -1020,7 +1022,7 @@ pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
         cmp     ip, #-1
         beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
         /* else drop through... */
-        .endfunc
+pixman_end_asm_function
 generate_composite_function \
     pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
@@ -1045,84 +1047,84 @@ generate_composite_function \
 
 .macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
         ldrb    ORIG_W, [SRC], #4
- .if numbytes >= 8
-        ldrb    WK&reg1, [SRC], #4
-  .if numbytes == 16
-        ldrb    WK&reg2, [SRC], #4
-        ldrb    WK&reg3, [SRC], #4
+ .if \numbytes >= 8
+        ldrb    WK\()\reg1, [SRC], #4
+  .if \numbytes == 16
+        ldrb    WK\()\reg2, [SRC], #4
+        ldrb    WK\()\reg3, [SRC], #4
  .endif
 .endif
-        add     DST, DST, #numbytes
+        add     DST, DST, #\numbytes
 .endm
 
 .macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+        in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
 .endm
 
 .macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
- .if is_only != 1
-        movs    s, ORIG_W
-  .if offset != 0
-        ldrb    ORIG_W, [SRC, #offset]
+ .if \is_only != 1
+        movs    \s, ORIG_W
+  .if \offset != 0
+        ldrb    ORIG_W, [SRC, #\offset]
 .endif
         beq     01f
         teq     STRIDE_M, #0xFF
         beq     02f
 .endif
-        uxtb16  SCRATCH, d                 /* rb_dest */
-        uxtb16  d, d, ror #8               /* ag_dest */
-        mla     SCRATCH, SCRATCH, s, MASK
-        mla     d, d, s, MASK
+        uxtb16  SCRATCH, \d                /* rb_dest */
+        uxtb16  \d, \d, ror #8             /* ag_dest */
+        mla     SCRATCH, SCRATCH, \s, MASK
+        mla     \d, \d, \s, MASK
         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
-        uxtab16 d, d, d, ror #8
+        uxtab16 \d, \d, \d, ror #8
         mov     SCRATCH, SCRATCH, ror #8
-        sel     d, SCRATCH, d
+        sel     \d, SCRATCH, \d
         b       02f
- .if offset == 0
+ .if \offset == 0
 48:     /* Last mov d,#0 of the set - used as part of shortcut for
          * source values all 0 */
 .endif
-01:     mov     d, #0
+01:     mov     \d, #0
 02:
 .endm
 
 .macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
+ .if \numbytes == 4
         teq     ORIG_W, ORIG_W, asr #32
-        ldrne   WK&reg1, [DST, #-4]
- .elseif numbytes == 8
-        teq     ORIG_W, WK&reg1
+        ldrne   WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+        teq     ORIG_W, WK\()\reg1
         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
-        ldmnedb DST, {WK&reg1-WK&reg2}
+        ldmdbne DST, {WK\()\reg1-WK\()\reg2}
 .else
-        teq     ORIG_W, WK&reg1
-        teqeq   ORIG_W, WK&reg2
-        teqeq   ORIG_W, WK&reg3
+        teq     ORIG_W, WK\()\reg1
+        teqeq   ORIG_W, WK\()\reg2
+        teqeq   ORIG_W, WK\()\reg3
         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
-        ldmnedb DST, {WK&reg1-WK&reg4}
+        ldmdbne DST, {WK\()\reg1-WK\()\reg4}
 .endif
         cmnne   DST, #0  /* clear C if NE */
         bcs     49f      /* no writes to dest if source all -1 */
         beq     48f      /* set dest to all 0 if source all 0 */
- .if numbytes == 4
-        in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
-        str     WK&reg1, [DST, #-4]
- .elseif numbytes == 8
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
-        stmdb   DST, {WK&reg1-WK&reg2}
+ .if \numbytes == 4
+        in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
+        str     WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
+        stmdb   DST, {WK\()\reg1-WK\()\reg2}
 .else
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
-        stmdb   DST, {WK&reg1-WK&reg4}
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
+        stmdb   DST, {WK\()\reg1-WK\()\reg4}
 .endif
 49:
 .endm
 
 .macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
-        in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+        in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
 .endm
 
 generate_composite_function \
@@ -1149,21 +1151,21 @@ generate_composite_function \
 .endm
 
 .macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, \firstreg, DST, 0
 .endm
 
 .macro over_n_8888_1pixel  dst
-        mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK
-        uqadd8  WK&dst, WK&dst, SRC
+        mul_8888_8  WK\()\dst, STRIDE_M, SCRATCH, MASK
+        uqadd8  WK\()\dst, WK\()\dst, SRC
 .endm
 
 .macro over_n_8888_process_tail  cond, numbytes, firstreg
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
         over_n_8888_1pixel %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 .endm
 
 generate_composite_function \
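For reference, the arithmetic performed by the mul_8888_8 and over_8888_8888_1pixel sequences above (an MLA against a per-byte rounding bias, then the "multiply by 257/256 to approximate 256/255" step done with uxtab16 x, x, x, ror #8) is the standard exact divide-by-255 with round-to-nearest. A single-channel sketch of the same trick, written as a hypothetical standalone routine rather than anything defined in this file:

        @ r0 = x (0..255), r1 = a (0..255);
        @ returns round(x * a / 255) in r0.
        .global byte_mul_div255
    byte_mul_div255:
        mov     r2, #128
        mla     r0, r1, r0, r2      @ t = x*a + 128  (rounding bias)
        add     r0, r0, r0, lsr #8  @ t += t >> 8    (t *= 257/256...)
        mov     r0, r0, lsr #8      @ ...making t >> 8 an exact /255
        bx      lr

The SIMD macros apply the identical arithmetic to two channels at once: uxtb16 splits a pixel's even and odd bytes into halfwords, a single mla performs both multiply-plus-bias steps, uxtab16 ..., ror #8 folds in the t >> 8 correction, and mov ..., ror #8 followed by sel recombines the even and odd results.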