Diffstat (limited to 'pixman/pixman-arm-simd-asm.S')
-rw-r--r-- | pixman/pixman-arm-simd-asm.S | 470
1 file changed, 236 insertions, 234 deletions
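The whole change is one mechanical rewrite: every macro body moves from GNU as's divided-syntax parameter style (bare parameter names, with '&' splicing a parameter onto an adjacent token, as in WK&reg) to explicit unified-syntax substitution (\reg, with \() as a token separator), and a pixman_syntax_unified invocation is added near the top of the file to match. The backslash spelling is the one that assemblers such as clang's integrated assembler accept. A minimal sketch of the two spellings, using a hypothetical set_alpha macro modelled on the file's src_x888_8888_pixel:

    @ Old, divided-syntax style (GNU as only): parameter names are
    @ substituted implicitly and '&' concatenates them onto tokens:
    @
    @ .macro set_alpha, cond, reg
    @         orr&cond   WK&reg, WK&reg, #0xFF000000
    @ .endm

    @ New, unified style: each substitution is marked with '\', and
    @ '\()' ends the preceding token so 'WK' and '\reg' still join:
    .macro set_alpha, cond, reg
            orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
    .endm

With WK0 aliased to a real register via .req, "set_alpha eq, 0" expands to "orreq WK0, WK0, #0xFF000000" under either spelling; only the second is understood by assemblers that lack the legacy '&' concatenation.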
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a74a0a8..34d38f1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -40,6 +40,8 @@
 #include "pixman-arm-asm.h"
 #include "pixman-arm-simd-asm.h"
 
+        pixman_syntax_unified
+
 /* A head macro should do all processing which results in an output of up to
  * 16 bytes, as far as the final load instruction. The corresponding tail macro
  * should complete the processing of the up-to-16 bytes. The calling macro will
@@ -57,7 +59,7 @@
 .endm
 
 .macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
 .endm
 
 .macro blit_inner_loop      process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
@@ -65,8 +67,8 @@
     WK5     .req    STRIDE_S
     WK6     .req    MASK
     WK7     .req    STRIDE_M
-110:    pixld   , 16, 0, SRC, unaligned_src
-        pixld   , 16, 4, SRC, unaligned_src
+110:    pixld   , 16, 0, SRC, \unaligned_src
+        pixld   , 16, 4, SRC, \unaligned_src
         pld     [SRC, SCRATCH]
         pixst   , 16, 0, DST
         pixst   , 16, 4, DST
@@ -122,7 +124,7 @@ generate_composite_function \
 
 .macro src_n_0565_init
         ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #16
+        orr     SRC, SRC, SRC, lsl #16
         mov     STRIDE_S, SRC
         mov     MASK, SRC
         mov     STRIDE_M, SRC
@@ -130,8 +132,8 @@ generate_composite_function \
 
 .macro src_n_8_init
         ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #8
-        orr     SRC, SRC, lsl #16
+        orr     SRC, SRC, SRC, lsl #8
+        orr     SRC, SRC, SRC, lsl #16
         mov     STRIDE_S, SRC
         mov     MASK, SRC
         mov     STRIDE_M, SRC
@@ -142,7 +144,7 @@ generate_composite_function \
     WK5     .req    STRIDE_S
     WK6     .req    MASK
     WK7     .req    STRIDE_M
-        pixst   cond, numbytes, 4, DST
+        pixst   \cond, \numbytes, 4, DST
     .unreq  WK4
     .unreq  WK5
     .unreq  WK6
@@ -182,20 +184,20 @@ generate_composite_function \
 /******************************************************************************/
 
 .macro src_x888_8888_pixel, cond, reg
-        orr&cond WK&reg, WK&reg, #0xFF000000
+        orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
 .endm
 
 .macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
 .endm
 
 .macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
-        src_x888_8888_pixel cond, %(firstreg+0)
- .if numbytes >= 8
-        src_x888_8888_pixel cond, %(firstreg+1)
-  .if numbytes == 16
-        src_x888_8888_pixel cond, %(firstreg+2)
-        src_x888_8888_pixel cond, %(firstreg+3)
+        src_x888_8888_pixel \cond, %(\firstreg+0)
+ .if \numbytes >= 8
+        src_x888_8888_pixel \cond, %(\firstreg+1)
+  .if \numbytes == 16
+        src_x888_8888_pixel \cond, %(\firstreg+2)
+        src_x888_8888_pixel \cond, %(\firstreg+3)
  .endif
 .endif
 .endm
@@ -222,73 +224,73 @@ generate_composite_function \
 .endm
 
 .macro src_0565_8888_2pixels, reg1, reg2
-        and     SCRATCH, WK&reg1, MASK              @ 00000GGGGGG0000000000gggggg00000
-        bic     WK&reg2, WK&reg1, MASK              @ RRRRR000000BBBBBrrrrr000000bbbbb
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6   @ 00000GGGGGGGGGGGG0000ggggggggggg
-        mov     WK&reg1, WK&reg2, lsl #16           @ rrrrr000000bbbbb0000000000000000
-        mov     SCRATCH, SCRATCH, ror #19           @ GGGG0000ggggggggggg00000GGGGGGGG
-        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16  @ RRRRR000000BBBBB0000000000000000
-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5   @ rrrrrrrrrr0bbbbbbbbbb00000000000
-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5   @ RRRRRRRRRR0BBBBBBBBBB00000000000
-        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5   @ rrrrrrrr--------bbbbbbbb--------
-        sel     WK&reg1, WK&reg1, SCRATCH           @ rrrrrrrrggggggggbbbbbbbb--------
-        mov     SCRATCH, SCRATCH, ror #16           @ ggg00000GGGGGGGGGGGG0000gggggggg
-        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5   @ RRRRRRRR--------BBBBBBBB--------
-        sel     WK&reg2, WK&reg2, SCRATCH           @ RRRRRRRRGGGGGGGGBBBBBBBB--------
-        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8  @ 11111111rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8  @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+        and     SCRATCH, WK\()\reg1, MASK                    @ 00000GGGGGG0000000000gggggg00000
+        bic     WK\()\reg2, WK\()\reg1, MASK                 @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6            @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK\()\reg1, WK\()\reg2, lsl #16              @ rrrrr000000bbbbb0000000000000000
+        mov     SCRATCH, SCRATCH, ror #19                    @ GGGG0000ggggggggggg00000GGGGGGGG
+        bic     WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16  @ RRRRR000000BBBBB0000000000000000
+        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5   @ rrrrrrrrrr0bbbbbbbbbb00000000000
+        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5   @ RRRRRRRRRR0BBBBBBBBBB00000000000
+        pkhtb   WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5   @ rrrrrrrr--------bbbbbbbb--------
+        sel     WK\()\reg1, WK\()\reg1, SCRATCH              @ rrrrrrrrggggggggbbbbbbbb--------
+        mov     SCRATCH, SCRATCH, ror #16                    @ ggg00000GGGGGGGGGGGG0000gggggggg
+        pkhtb   WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5   @ RRRRRRRR--------BBBBBBBB--------
+        sel     WK\()\reg2, WK\()\reg2, SCRATCH              @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+        orr     WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8     @ 11111111rrrrrrrrggggggggbbbbbbbb
+        orr     WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8     @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
 .endm
 
 /* This version doesn't need STRIDE_M, but is one instruction longer.
    It would however be preferable for an XRGB target, since we could
   knock off the last 2 instructions, but is that a common case?
-        and     SCRATCH, WK&reg1, MASK              @ 00000GGGGGG0000000000gggggg00000
-        bic     WK&reg1, WK&reg1, MASK              @ RRRRR000000BBBBBrrrrr000000bbbbb
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6   @ 00000GGGGGGGGGGGG0000ggggggggggg
-        mov     WK&reg2, WK&reg1, lsr #16           @ 0000000000000000RRRRR000000BBBBB
-        mov     SCRATCH, SCRATCH, ror #27           @ GGGGGGGGGGGG0000ggggggggggg00000
-        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16  @ 0000000000000000rrrrr000000bbbbb
-        mov     WK&reg2, WK&reg2, lsl #3            @ 0000000000000RRRRR000000BBBBB000
-        mov     WK&reg1, WK&reg1, lsl #3            @ 0000000000000rrrrr000000bbbbb000
-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5   @ 0000000000000RRRRRRRRRR0BBBBBBBB
-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
-        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5   @ --------RRRRRRRR--------BBBBBBBB
-        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
-        sel     WK&reg2, SCRATCH, WK&reg2           @ --------RRRRRRRRGGGGGGGGBBBBBBBB
-        sel     WK&reg1, SCRATCH, WK&reg1           @ --------rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg2, WK&reg2, #0xFF000000       @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
-        orr     WK&reg1, WK&reg1, #0xFF000000       @ 11111111rrrrrrrrggggggggbbbbbbbb
+        and     SCRATCH, WK\()\reg1, MASK                    @ 00000GGGGGG0000000000gggggg00000
+        bic     WK\()\reg1, WK\()\reg1, MASK                 @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6            @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK\()\reg2, WK\()\reg1, lsr #16              @ 0000000000000000RRRRR000000BBBBB
+        mov     SCRATCH, SCRATCH, ror #27                    @ GGGGGGGGGGGG0000ggggggggggg00000
+        bic     WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16  @ 0000000000000000rrrrr000000bbbbb
+        mov     WK\()\reg2, WK\()\reg2, lsl #3               @ 0000000000000RRRRR000000BBBBB000
+        mov     WK\()\reg1, WK\()\reg1, lsl #3               @ 0000000000000rrrrr000000bbbbb000
+        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5   @ 0000000000000RRRRRRRRRR0BBBBBBBB
+        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        pkhbt   WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5   @ --------RRRRRRRR--------BBBBBBBB
+        pkhbt   WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK\()\reg2, SCRATCH, WK\()\reg2              @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+        sel     WK\()\reg1, SCRATCH, WK\()\reg1              @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK\()\reg2, WK\()\reg2, #0xFF000000          @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+        orr     WK\()\reg1, WK\()\reg1, #0xFF000000          @ 11111111rrrrrrrrggggggggbbbbbbbb
 */
 
 .macro src_0565_8888_1pixel, reg
-        bic     SCRATCH, WK&reg, MASK               @ 0000000000000000rrrrr000000bbbbb
-        and     WK&reg, WK&reg, MASK                @ 000000000000000000000gggggg00000
-        mov     SCRATCH, SCRATCH, lsl #3            @ 0000000000000rrrrr000000bbbbb000
-        mov     WK&reg, WK&reg, lsl #5              @ 0000000000000000gggggg0000000000
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
-        orr     WK&reg, WK&reg, WK&reg, lsr #6      @ 000000000000000gggggggggggg00000
-        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
-        sel     WK&reg, WK&reg, SCRATCH             @ --------rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg, WK&reg, #0xFF000000         @ 11111111rrrrrrrrggggggggbbbbbbbb
+        bic     SCRATCH, WK\()\reg, MASK                     @ 0000000000000000rrrrr000000bbbbb
+        and     WK\()\reg, WK\()\reg, MASK                   @ 000000000000000000000gggggg00000
+        mov     SCRATCH, SCRATCH, lsl #3                     @ 0000000000000rrrrr000000bbbbb000
+        mov     WK\()\reg, WK\()\reg, lsl #5                 @ 0000000000000000gggggg0000000000
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5            @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        orr     WK\()\reg, WK\()\reg, WK\()\reg, lsr #6      @ 000000000000000gggggggggggg00000
+        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5            @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK\()\reg, WK\()\reg, SCRATCH                @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK\()\reg, WK\()\reg, #0xFF000000            @ 11111111rrrrrrrrggggggggbbbbbbbb
 .endm
 
 .macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 16
-        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
- .elseif numbytes == 8
-        pixld   , 4, firstreg, SRC, unaligned_src
- .elseif numbytes == 4
-        pixld   , 2, firstreg, SRC, unaligned_src
+ .if \numbytes == 16
+        pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
+ .elseif \numbytes == 8
+        pixld   , 4, \firstreg, SRC, \unaligned_src
+ .elseif \numbytes == 4
+        pixld   , 2, \firstreg, SRC, \unaligned_src
 .endif
 .endm
 
 .macro src_0565_8888_process_tail   cond, numbytes, firstreg
- .if numbytes == 16
-        src_0565_8888_2pixels firstreg, %(firstreg+1)
-        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
-        src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .if \numbytes == 16
+        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
+        src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
 .else
-        src_0565_8888_1pixel firstreg
+        src_0565_8888_1pixel \firstreg
 .endif
 .endm
@@ -311,23 +313,23 @@ generate_composite_function \
 .endm
 
 .macro src_x888_0565_1pixel  s, d
-        and     WK&d, MASK, WK&s, lsr #3            @ 00000000000rrrrr00000000000bbbbb
-        and     STRIDE_S, WK&s, #0xFC00             @ 0000000000000000gggggg0000000000
-        orr     WK&d, WK&d, WK&d, lsr #5            @ 00000000000-----rrrrr000000bbbbb
-        orr     WK&d, WK&d, STRIDE_S, lsr #5        @ 00000000000-----rrrrrggggggbbbbb
+        and     WK\()\d, MASK, WK\()\s, lsr #3               @ 00000000000rrrrr00000000000bbbbb
+        and     STRIDE_S, WK\()\s, #0xFC00                   @ 0000000000000000gggggg0000000000
+        orr     WK\()\d, WK\()\d, WK\()\d, lsr #5            @ 00000000000-----rrrrr000000bbbbb
+        orr     WK\()\d, WK\()\d, STRIDE_S, lsr #5           @ 00000000000-----rrrrrggggggbbbbb
         /* Top 16 bits are discarded during the following STRH */
 .endm
 
 .macro src_x888_0565_2pixels  slo, shi, d, tmp
-        and     SCRATCH, WK&shi, #0xFC00            @ 0000000000000000GGGGGG0000000000
-        and     WK&tmp, MASK, WK&shi, lsr #3        @ 00000000000RRRRR00000000000BBBBB
-        and     WK&shi, MASK, WK&slo, lsr #3        @ 00000000000rrrrr00000000000bbbbb
-        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5      @ 00000000000-----RRRRR000000BBBBB
-        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5     @ 00000000000-----RRRRRGGGGGGBBBBB
-        and     SCRATCH, WK&slo, #0xFC00            @ 0000000000000000gggggg0000000000
-        orr     WK&shi, WK&shi, WK&shi, lsr #5      @ 00000000000-----rrrrr000000bbbbb
-        orr     WK&shi, WK&shi, SCRATCH, lsr #5     @ 00000000000-----rrrrrggggggbbbbb
-        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16       @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+        and     SCRATCH, WK\()\shi, #0xFC00                  @ 0000000000000000GGGGGG0000000000
+        and     WK\()\tmp, MASK, WK\()\shi, lsr #3           @ 00000000000RRRRR00000000000BBBBB
+        and     WK\()\shi, MASK, WK\()\slo, lsr #3           @ 00000000000rrrrr00000000000bbbbb
+        orr     WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5      @ 00000000000-----RRRRR000000BBBBB
+        orr     WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5        @ 00000000000-----RRRRRGGGGGGBBBBB
+        and     SCRATCH, WK\()\slo, #0xFC00                  @ 0000000000000000gggggg0000000000
+        orr     WK\()\shi, WK\()\shi, WK\()\shi, lsr #5      @ 00000000000-----rrrrr000000bbbbb
+        orr     WK\()\shi, WK\()\shi, SCRATCH, lsr #5        @ 00000000000-----rrrrrggggggbbbbb
+        pkhbt   WK\()\d, WK\()\shi, WK\()\tmp, lsl #16       @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
 .endm
 
 .macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
@@ -335,33 +337,33 @@ generate_composite_function \
     WK5     .req    STRIDE_M
     WK6     .req    WK3
     WK7     .req    ORIG_W
- .if numbytes == 16
+ .if \numbytes == 16
         pixld   , 16, 4, SRC, 0
         src_x888_0565_2pixels  4, 5, 0, 0
         pixld   , 8, 4, SRC, 0
         src_x888_0565_2pixels  6, 7, 1, 1
         pixld   , 8, 6, SRC, 0
 .else
-        pixld   , numbytes*2, 4, SRC, 0
+        pixld   , \numbytes*2, 4, SRC, 0
 .endif
 .endm
 
 .macro src_x888_0565_process_tail   cond, numbytes, firstreg
- .if numbytes == 16
+ .if \numbytes == 16
         src_x888_0565_2pixels  4, 5, 2, 2
         src_x888_0565_2pixels  6, 7, 3, 4
- .elseif numbytes == 8
+ .elseif \numbytes == 8
         src_x888_0565_2pixels  4, 5, 1, 1
         src_x888_0565_2pixels  6, 7, 2, 2
- .elseif numbytes == 4
+ .elseif \numbytes == 4
         src_x888_0565_2pixels  4, 5, 1, 1
 .else
         src_x888_0565_1pixel   4, 1
 .endif
- .if numbytes == 16
-        pixst   , numbytes, 0, DST
+ .if \numbytes == 16
+        pixst   , \numbytes, 0, DST
 .else
-        pixst   , numbytes, 1, DST
+        pixst   , \numbytes, 1, DST
 .endif
     .unreq  WK4
     .unreq  WK5
@@ -382,37 +384,37 @@ generate_composite_function \
 /******************************************************************************/
 
 .macro add_8_8_8pixels  cond, dst1, dst2
-        uqadd8&cond WK&dst1, WK&dst1, MASK
-        uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
+        uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
+        uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
 .endm
 
 .macro add_8_8_4pixels  cond, dst
-        uqadd8&cond WK&dst, WK&dst, MASK
+        uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
 .endm
 
 .macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
     WK4     .req    MASK
     WK5     .req    STRIDE_M
- .if numbytes == 16
-        pixld   cond, 8, 4, SRC, unaligned_src
-        pixld   cond, 16, firstreg, DST, 0
-        add_8_8_8pixels cond, firstreg, %(firstreg+1)
-        pixld   cond, 8, 4, SRC, unaligned_src
+ .if \numbytes == 16
+        pixld   \cond, 8, 4, SRC, \unaligned_src
+        pixld   \cond, 16, \firstreg, DST, 0
+        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
+        pixld   \cond, 8, 4, SRC, \unaligned_src
 .else
-        pixld   cond, numbytes, 4, SRC, unaligned_src
-        pixld   cond, numbytes, firstreg, DST, 0
+        pixld   \cond, \numbytes, 4, SRC, \unaligned_src
+        pixld   \cond, \numbytes, \firstreg, DST, 0
 .endif
     .unreq  WK4
     .unreq  WK5
 .endm
 
 .macro add_8_8_process_tail  cond, numbytes, firstreg
- .if numbytes == 16
-        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
-        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .if \numbytes == 16
+        add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
 .else
-        add_8_8_4pixels cond, firstreg
+        add_8_8_4pixels \cond, \firstreg
 .endif
 .endm
 
@@ -441,8 +443,8 @@ generate_composite_function \
     WK5     .req    STRIDE_S
     WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
-        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, %(4+\firstreg), SRC, \unaligned_src
+        pixld   , \numbytes, \firstreg, DST, 0
     .unreq  WK4
     .unreq  WK5
     .unreq  WK6
@@ -451,44 +453,44 @@ generate_composite_function \
 .macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
         /* Since these colours a premultiplied by alpha, only 0 indicates
          * transparent (any other colour with 0 in the alpha byte is luminous) */
-        teq     WK&reg0, #0
- .if numbytes > 4
-        teqeq   WK&reg1, #0
-  .if numbytes > 8
-        teqeq   WK&reg2, #0
-        teqeq   WK&reg3, #0
+        teq     WK\()\reg0, #0
+ .if \numbytes > 4
+        teqeq   WK\()\reg1, #0
+  .if \numbytes > 8
+        teqeq   WK\()\reg2, #0
+        teqeq   WK\()\reg3, #0
  .endif
 .endif
 .endm
 
 .macro over_8888_8888_prepare  next
-        mov     WK&next, WK&next, lsr #24
+        mov     WK\()\next, WK\()\next, lsr #24
 .endm
 
 .macro over_8888_8888_1pixel src, dst, offset, next
         /* src = destination component multiplier */
-        rsb     WK&src, WK&src, #255
+        rsb     WK\()\src, WK\()\src, #255
         /* Split even/odd bytes of dst into SCRATCH/dst */
-        uxtb16  SCRATCH, WK&dst
-        uxtb16  WK&dst, WK&dst, ror #8
+        uxtb16  SCRATCH, WK\()\dst
+        uxtb16  WK\()\dst, WK\()\dst, ror #8
         /* Multiply through, adding 0.5 to the upper byte of result for rounding */
-        mla     SCRATCH, SCRATCH, WK&src, MASK
-        mla     WK&dst, WK&dst, WK&src, MASK
+        mla     SCRATCH, SCRATCH, WK\()\src, MASK
+        mla     WK\()\dst, WK\()\dst, WK\()\src, MASK
         /* Where we would have had a stall between the result of the first MLA and the shifter input,
          * reload the complete source pixel */
-        ldr     WK&src, [SRC, #offset]
+        ldr     WK\()\src, [SRC, #\offset]
         /* Multiply by 257/256 to approximate 256/255 */
         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
         /* In this stall, start processing the next pixel */
- .if offset < -4
-        mov     WK&next, WK&next, lsr #24
+ .if \offset < -4
+        mov     WK\()\next, WK\()\next, lsr #24
 .endif
-        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
         /* Recombine even/odd bytes of multiplied destination */
         mov     SCRATCH, SCRATCH, ror #8
-        sel     WK&dst, SCRATCH, WK&dst
+        sel     WK\()\dst, SCRATCH, WK\()\dst
         /* Saturated add of source to multiplied destination */
-        uqadd8  WK&dst, WK&dst, WK&src
+        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
 .endm
 
 .macro over_8888_8888_process_tail  cond, numbytes, firstreg
@@ -496,17 +498,17 @@ generate_composite_function \
     WK5     .req    STRIDE_S
     WK6     .req    STRIDE_M
     WK7     .req    ORIG_W
-        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+        over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
         beq     10f
-        over_8888_8888_prepare %(4+firstreg)
- .set PROCESS_REG, firstreg
- .set PROCESS_OFF, -numbytes
- .rept numbytes / 4
+        over_8888_8888_prepare %(4+\firstreg)
+ .set PROCESS_REG, \firstreg
+ .set PROCESS_OFF, -\numbytes
+ .rept \numbytes / 4
         over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 10:
     .unreq  WK4
     .unreq  WK5
@@ -536,16 +538,16 @@ generate_composite_function \
  */
 
 .macro mul_8888_8  word, byte, tmp, half
         /* Split even/odd bytes of word apart */
-        uxtb16  tmp, word
-        uxtb16  word, word, ror #8
+        uxtb16  \tmp, \word
+        uxtb16  \word, \word, ror #8
         /* Multiply bytes together with rounding, then by 257/256 */
-        mla     tmp, tmp, byte, half
-        mla     word, word, byte, half      /* 1 stall follows */
-        uxtab16 tmp, tmp, tmp, ror #8       /* 1 stall follows */
-        uxtab16 word, word, word, ror #8
+        mla     \tmp, \tmp, \byte, \half
+        mla     \word, \word, \byte, \half  /* 1 stall follows */
+        uxtab16 \tmp, \tmp, \tmp, ror #8    /* 1 stall follows */
+        uxtab16 \word, \word, \word, ror #8
         /* Recombine bytes */
-        mov     tmp, tmp, ror #8
-        sel     word, tmp, word
+        mov     \tmp, \tmp, ror #8
+        sel     \word, \tmp, \word
 .endm
 
@@ -567,8 +569,8 @@ generate_composite_function \
     WK5     .req    STRIDE_D
     WK6     .req    STRIDE_S
     WK7     .req    ORIG_W
-        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
+        pixld   , \numbytes, \firstreg, DST, 0
     .unreq  WK4
     .unreq  WK5
     .unreq  WK6
@@ -576,10 +578,10 @@ generate_composite_function \
 .endm
 
 .macro over_8888_n_8888_1pixel src, dst
-        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
-        sub     WK7, WK6, WK&src, lsr #24
-        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
-        uqadd8  WK&dst, WK&dst, WK&src
+        mul_8888_8  WK\()\src, MASK, SCRATCH, STRIDE_M
+        sub     WK7, WK6, WK\()\src, lsr #24
+        mul_8888_8  WK\()\dst, WK7, SCRATCH, STRIDE_M
+        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
 .endm
 
 .macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
@@ -587,12 +589,12 @@ generate_composite_function \
     WK5     .req    STRIDE_D
     WK6     .req    STRIDE_S
     WK7     .req    ORIG_W
-        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+        over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
         beq     10f
         mov     WK6, #255
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
-  .if numbytes == 16 && PROCESS_REG == 2
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+  .if \numbytes == 16 && PROCESS_REG == 2
         /* We're using WK6 and WK7 as temporaries, so half way through
          * 4 pixels, reload the second two source pixels but this time
          * into WK4 and WK5 */
@@ -601,7 +603,7 @@ generate_composite_function \
         over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 10:
     .unreq  WK4
     .unreq  WK5
@@ -642,13 +644,13 @@ generate_composite_function \
 
 .macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
     WK4     .req    STRIDE_M
-        pixld   , numbytes/4, 4, MASK, unaligned_mask
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes/4, 4, MASK, \unaligned_mask
+        pixld   , \numbytes, \firstreg, DST, 0
     .unreq  WK4
 .endm
 
 .macro over_n_8_8888_1pixel src, dst
-        uxtb    Y, WK4, ror #src*8
+        uxtb    Y, WK4, ror #\src*8
         /* Trailing part of multiplication of source */
         mla     SCRATCH, STRIDE_S, Y, STRIDE_D
         mla     Y, SRC, Y, STRIDE_D
@@ -659,20 +661,20 @@ generate_composite_function \
         sub     ORIG_W, ORIG_W, Y, lsr #24
         sel     Y, SCRATCH, Y
         /* Then multiply the destination */
-        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
-        uqadd8  WK&dst, WK&dst, Y
+        mul_8888_8  WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
+        uqadd8  WK\()\dst, WK\()\dst, Y
 .endm
 
 .macro over_n_8_8888_process_tail  cond, numbytes, firstreg
     WK4     .req    STRIDE_M
         teq     WK4, #0
         beq     10f
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
-        over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+        over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 10:
     .unreq  WK4
 .endm
@@ -705,14 +707,14 @@ generate_composite_function \
 .endm
 
 .macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, \firstreg, DST, 0
 .endm
 
 .macro over_reverse_n_8888_1pixel  d, is_only
-        teq     WK&d, #0
+        teq     WK\()\d, #0
         beq     8f   /* replace with source */
-        bics    ORIG_W, STRIDE_D, WK&d, lsr #24
- .if is_only == 1
+        bics    ORIG_W, STRIDE_D, WK\()\d, lsr #24
+ .if \is_only == 1
         beq     49f  /* skip store */
 .else
         beq     9f   /* write same value back */
@@ -723,36 +725,36 @@ generate_composite_function \
         uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
         mov     SCRATCH, SCRATCH, ror #8
         sel     ORIG_W, SCRATCH, ORIG_W
-        uqadd8  WK&d, WK&d, ORIG_W
+        uqadd8  WK\()\d, WK\()\d, ORIG_W
         b       9f
-8:      mov     WK&d, SRC
+8:      mov     WK\()\d, SRC
 9:
 .endm
 
 .macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
-        over_reverse_n_8888_1pixel reg1, 1
+ .if \numbytes == 4
+        over_reverse_n_8888_1pixel \reg1, 1
 .else
-        and     SCRATCH, WK&reg1, WK&reg2
-  .if numbytes == 16
-        and     SCRATCH, SCRATCH, WK&reg3
-        and     SCRATCH, SCRATCH, WK&reg4
+        and     SCRATCH, WK\()\reg1, WK\()\reg2
+  .if \numbytes == 16
+        and     SCRATCH, SCRATCH, WK\()\reg3
+        and     SCRATCH, SCRATCH, WK\()\reg4
  .endif
         mvns    SCRATCH, SCRATCH, asr #24
         beq     49f  /* skip store if all opaque */
-        over_reverse_n_8888_1pixel reg1, 0
-        over_reverse_n_8888_1pixel reg2, 0
- .if numbytes == 16
-        over_reverse_n_8888_1pixel reg3, 0
-        over_reverse_n_8888_1pixel reg4, 0
+        over_reverse_n_8888_1pixel \reg1, 0
+        over_reverse_n_8888_1pixel \reg2, 0
+ .if \numbytes == 16
+        over_reverse_n_8888_1pixel \reg3, 0
+        over_reverse_n_8888_1pixel \reg4, 0
 .endif
 .endif
-        pixst   , numbytes, reg1, DST
+        pixst   , \numbytes, \reg1, DST
 49:
 .endm
 
 .macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
-        over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+        over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
 .endm
 
 generate_composite_function \
@@ -794,20 +796,20 @@ generate_composite_function \
 
 .macro over_white_8888_8888_ca_combine  m, d
         uxtb16  TMP1, TMP0                 /* rb_notmask */
-        uxtb16  TMP2, d                    /* rb_dest; 1 stall follows */
+        uxtb16  TMP2, \d                   /* rb_dest; 1 stall follows */
         smlatt  TMP3, TMP2, TMP1, HALF     /* red */
         smlabb  TMP2, TMP2, TMP1, HALF     /* blue */
         uxtb16  TMP0, TMP0, ror #8         /* ag_notmask */
-        uxtb16  TMP1, d, ror #8            /* ag_dest; 1 stall follows */
-        smlatt  d, TMP1, TMP0, HALF        /* alpha */
+        uxtb16  TMP1, \d, ror #8           /* ag_dest; 1 stall follows */
+        smlatt  \d, TMP1, TMP0, HALF       /* alpha */
         smlabb  TMP1, TMP1, TMP0, HALF     /* green */
         pkhbt   TMP0, TMP2, TMP3, lsl #16  /* rb; 1 stall follows */
-        pkhbt   TMP1, TMP1, d, lsl #16     /* ag */
+        pkhbt   TMP1, TMP1, \d, lsl #16    /* ag */
         uxtab16 TMP0, TMP0, TMP0, ror #8
         uxtab16 TMP1, TMP1, TMP1, ror #8
         mov     TMP0, TMP0, ror #8
-        sel     d, TMP0, TMP1
-        uqadd8  d, d, m                    /* d is a late result */
+        sel     \d, TMP0, TMP1
+        uqadd8  \d, \d, \m                 /* d is a late result */
 .endm
 
 .macro over_white_8888_8888_ca_1pixel_head
@@ -853,10 +855,10 @@ generate_composite_function \
 .endm
 
 .macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 4
+ .if \numbytes == 4
         over_white_8888_8888_ca_1pixel_head
 .else
-  .if numbytes == 16
+  .if \numbytes == 16
         over_white_8888_8888_ca_2pixels_head
         over_white_8888_8888_ca_2pixels_tail
 .endif
@@ -865,7 +867,7 @@ generate_composite_function \
 .endm
 
 .macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
- .if numbytes == 4
+ .if \numbytes == 4
         over_white_8888_8888_ca_1pixel_tail
 .else
         over_white_8888_8888_ca_2pixels_tail
@@ -1004,7 +1006,7 @@ generate_composite_function \
 .endm
 
 .macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .rept (numbytes / 4) - 1
+ .rept (\numbytes / 4) - 1
         over_n_8888_8888_ca_1pixel_head
         over_n_8888_8888_ca_1pixel_tail
 .endr
@@ -1020,7 +1022,7 @@ pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
         cmp     ip, #-1
         beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
         /* else drop through... */
-        .endfunc
+pixman_end_asm_function
 generate_composite_function \
     pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
@@ -1045,84 +1047,84 @@ generate_composite_function \
 
 .macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
         ldrb    ORIG_W, [SRC], #4
- .if numbytes >= 8
-        ldrb    WK&reg1, [SRC], #4
-  .if numbytes == 16
-        ldrb    WK&reg2, [SRC], #4
-        ldrb    WK&reg3, [SRC], #4
+ .if \numbytes >= 8
+        ldrb    WK\()\reg1, [SRC], #4
+  .if \numbytes == 16
+        ldrb    WK\()\reg2, [SRC], #4
+        ldrb    WK\()\reg3, [SRC], #4
  .endif
 .endif
-        add     DST, DST, #numbytes
+        add     DST, DST, #\numbytes
 .endm
 
 .macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+        in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
 .endm
 
 .macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
- .if is_only != 1
-        movs    s, ORIG_W
-  .if offset != 0
-        ldrb    ORIG_W, [SRC, #offset]
+ .if \is_only != 1
+        movs    \s, ORIG_W
+  .if \offset != 0
+        ldrb    ORIG_W, [SRC, #\offset]
 .endif
         beq     01f
         teq     STRIDE_M, #0xFF
         beq     02f
 .endif
-        uxtb16  SCRATCH, d                 /* rb_dest */
-        uxtb16  d, d, ror #8               /* ag_dest */
-        mla     SCRATCH, SCRATCH, s, MASK
-        mla     d, d, s, MASK
+        uxtb16  SCRATCH, \d                /* rb_dest */
+        uxtb16  \d, \d, ror #8             /* ag_dest */
+        mla     SCRATCH, SCRATCH, \s, MASK
+        mla     \d, \d, \s, MASK
         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
-        uxtab16 d, d, d, ror #8
+        uxtab16 \d, \d, \d, ror #8
         mov     SCRATCH, SCRATCH, ror #8
-        sel     d, SCRATCH, d
+        sel     \d, SCRATCH, \d
         b       02f
- .if offset == 0
+ .if \offset == 0
 48:     /* Last mov d,#0 of the set - used as part of shortcut for
          * source values all 0 */
 .endif
-01:     mov     d, #0
+01:     mov     \d, #0
 02:
 .endm
 
 .macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
+ .if \numbytes == 4
         teq     ORIG_W, ORIG_W, asr #32
-        ldrne   WK&reg1, [DST, #-4]
- .elseif numbytes == 8
-        teq     ORIG_W, WK&reg1
+        ldrne   WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+        teq     ORIG_W, WK\()\reg1
         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
-        ldmnedb DST, {WK&reg1-WK&reg2}
+        ldmdbne DST, {WK\()\reg1-WK\()\reg2}
 .else
-        teq     ORIG_W, WK&reg1
-        teqeq   ORIG_W, WK&reg2
-        teqeq   ORIG_W, WK&reg3
+        teq     ORIG_W, WK\()\reg1
+        teqeq   ORIG_W, WK\()\reg2
+        teqeq   ORIG_W, WK\()\reg3
         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
-        ldmnedb DST, {WK&reg1-WK&reg4}
+        ldmdbne DST, {WK\()\reg1-WK\()\reg4}
 .endif
         cmnne   DST, #0  /* clear C if NE */
         bcs     49f      /* no writes to dest if source all -1 */
         beq     48f      /* set dest to all 0 if source all 0 */
- .if numbytes == 4
-        in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
-        str     WK&reg1, [DST, #-4]
- .elseif numbytes == 8
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
-        stmdb   DST, {WK&reg1-WK&reg2}
+ .if \numbytes == 4
+        in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
+        str     WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
+        stmdb   DST, {WK\()\reg1-WK\()\reg2}
 .else
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
-        stmdb   DST, {WK&reg1-WK&reg4}
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
+        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
+        stmdb   DST, {WK\()\reg1-WK\()\reg4}
 .endif
 49:
 .endm
 
 .macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
-        in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+        in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
 .endm
 
 generate_composite_function \
@@ -1149,21 +1151,21 @@ generate_composite_function \
 .endm
 
 .macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, \firstreg, DST, 0
 .endm
 
 .macro over_n_8888_1pixel  dst
-        mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK
-        uqadd8  WK&dst, WK&dst, SRC
+        mul_8888_8  WK\()\dst, STRIDE_M, SCRATCH, MASK
+        uqadd8  WK\()\dst, WK\()\dst, SRC
 .endm
 
 .macro over_n_8888_process_tail  cond, numbytes, firstreg
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
         over_n_8888_1pixel %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 .endm
 
 generate_composite_function \
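For reference, the arithmetic performed by the mul_8888_8 and over_8888_8888_1pixel sequences above (an MLA against a per-byte rounding bias, then the "multiply by 257/256 to approximate 256/255" step done with uxtab16 x, x, x, ror #8) is the standard exact divide-by-255 with round-to-nearest. A single-channel sketch of the same trick, written as a hypothetical standalone routine rather than anything defined in this file:

        @ r0 = x (0..255), r1 = a (0..255);
        @ returns round(x * a / 255) in r0.
        .global byte_mul_div255
    byte_mul_div255:
        mov     r2, #128
        mla     r0, r1, r0, r2      @ t = x*a + 128  (rounding bias)
        add     r0, r0, r0, lsr #8  @ t += t >> 8    (t *= 257/256...)
        mov     r0, r0, lsr #8      @ ...making t >> 8 an exact /255
        bx      lr

The SIMD macros apply the identical arithmetic to two channels at once: uxtb16 splits a pixel's even and odd bytes into halfwords, a single mla performs both multiply-plus-bias steps, uxtab16 ..., ror #8 folds in the t >> 8 correction, and mov ..., ror #8 followed by sel recombines the even and odd results.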