author     Heiko Lewin <hlewin@worldiety.de>    2024-02-29 14:46:55 +0000
committer  Matt Turner <mattst88@gmail.com>     2024-02-29 14:46:55 +0000
commit     74130e84c577f9ce1a54be40104f43ead8b8dac3 (patch)
tree       8c6070092fb52225d9de52b9d3f2eabc06333b28
parent     63332b4e72caace85b7e57b99592e61ca12fd777 (diff)
Allow building pixman with clang on arm32
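
clang's integrated assembler only accepts the unified ARM syntax and the
"\arg" style of macro-argument references, so the hand-written arm32
assembly is adjusted accordingly: meson now probes whether the assembler
understands the ".syntax unified" directive and defines
ASM_HAVE_SYNTAX_UNIFIED, a new pixman_syntax_unified macro emits the
directive where available, macro parameters are referenced as "\name"
(with "\()" separating an argument reference from adjacent literal text)
instead of the gas-only "&" concatenation, conditional mnemonics are
spelled in their unified order (ldrbge instead of ldrgeb, subsge instead
of subges, subspl instead of subpls), and the bare ".endfunc" is replaced
by pixman_end_asm_function.

A minimal sketch of the macro-syntax change, for illustration only — the
macro and register names below are hypothetical and not taken from the
patch:

    .arch   armv7-a                    @ NEON and conditional execution available
    .fpu    neon
    .syntax unified                    @ what pixman_syntax_unified emits when
                                       @ ASM_HAVE_SYNTAX_UNIFIED is defined
    .text

    .macro store_pixels numpix, reg
    .if \numpix == 4                   @ arguments are referenced as \numpix
        vst1.32 {\reg}, [r0]!          @ ... and \reg, not as bare names
    .endif
    .endm

    .macro store fmt, numpix, reg
        store_\()\fmt \numpix, \reg    @ "\()" separates the literal prefix from
                                       @ the argument; the legacy spelling was
                                       @ store_&fmt
    .endm

    .macro maybe_load cond
        ldrb\cond r1, [r2]             @ unified order: size ("b") before the
                                       @ condition; divided syntax wrote ldrgeb
    .endm

    store      pixels, 4, d0           @ expands to: vst1.32 {d0}, [r0]!
    maybe_load ge                      @ expands to: ldrbge  r1, [r2]

The "\"-based spelling is the one described in the gas macro
documentation and is also accepted by clang's integrated assembler; the
"&" concatenation is a GNU-as-specific behavior that clang rejects, which
is what previously blocked the arm32 build with clang.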
-rw-r--r--  meson.build                            |    7
-rw-r--r--  pixman/pixman-arm-asm.h                |    6
-rw-r--r--  pixman/pixman-arm-neon-asm-bilinear.S  |  362
-rw-r--r--  pixman/pixman-arm-neon-asm.S           |  436
-rw-r--r--  pixman/pixman-arm-neon-asm.h           |  607
-rw-r--r--  pixman/pixman-arm-simd-asm-scaled.S    |   42
-rw-r--r--  pixman/pixman-arm-simd-asm.S           |  470
-rw-r--r--  pixman/pixman-arm-simd-asm.h           |  300
8 files changed, 1125 insertions(+), 1105 deletions(-)
diff --git a/meson.build b/meson.build
index 4337f93..438e6cf 100644
--- a/meson.build
+++ b/meson.build
@@ -252,6 +252,13 @@ if cc.compiles('''
config.set('ASM_HAVE_FUNC_DIRECTIVE', 1)
endif
+if cc.compiles('''
+ __asm__ (
+ ".syntax unified\n"
+ );''',
+ name : 'test for ASM .syntax unified directive')
+ config.set('ASM_HAVE_SYNTAX_UNIFIED', 1)
+endif
if cc.links('''
#include <stdint.h>
diff --git a/pixman/pixman-arm-asm.h b/pixman/pixman-arm-asm.h
index 8253906..edf8e82 100644
--- a/pixman/pixman-arm-asm.h
+++ b/pixman/pixman-arm-asm.h
@@ -50,6 +50,12 @@
#endif
.endm
+.macro pixman_syntax_unified
+#ifdef ASM_HAVE_SYNTAX_UNIFIED
+ .syntax unified
+#endif
+.endm
+
.macro pixman_end_asm_function
#ifdef ASM_HAVE_FUNC_DIRECTIVE
.endfunc
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 0fd92d6..6bd2736 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -68,6 +68,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"
+pixman_syntax_unified
+
/*
* Bilinear macros from pixman-arm-neon-asm.S
*/
@@ -82,28 +84,28 @@
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
- vld1.32 {reg1}, [TMP1], STRIDE
- vld1.32 {reg2}, [TMP1]
+ vld1.32 {\reg1}, [TMP1], STRIDE
+ vld1.32 {\reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
- vld1.32 {reg2[0]}, [TMP1], STRIDE
- vld1.32 {reg2[1]}, [TMP1]
- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+ vld1.32 {\reg2[0]}, [TMP1], STRIDE
+ vld1.32 {\reg2[1]}, [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm
.macro bilinear_load_and_vertical_interpolate_two_8888 \
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
- bilinear_load_8888 reg1, reg2, tmp1
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- bilinear_load_8888 reg3, reg4, tmp2
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_8888 \
@@ -111,9 +113,9 @@
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm
.macro bilinear_load_and_vertical_interpolate_two_0565 \
@@ -125,19 +127,19 @@
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {acc2lo[0]}, [TMP1], STRIDE
- vld1.32 {acc2hi[0]}, [TMP2], STRIDE
- vld1.32 {acc2lo[1]}, [TMP1]
- vld1.32 {acc2hi[1]}, [TMP2]
- convert_0565_to_x888 acc2, reg3, reg2, reg1
- vzip.u8 reg1, reg3
- vzip.u8 reg2, reg4
- vzip.u8 reg3, reg4
- vzip.u8 reg1, reg2
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\acc2lo[1]}, [TMP1]
+ vld1.32 {\acc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip.u8 \reg1, \reg3
+ vzip.u8 \reg2, \reg4
+ vzip.u8 \reg3, \reg4
+ vzip.u8 \reg1, \reg2
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_0565 \
@@ -150,46 +152,46 @@
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
- vld1.32 {xacc2lo[1]}, [TMP1]
- vld1.32 {xacc2hi[1]}, [TMP2]
- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\xacc2lo[1]}, [TMP1]
+ vld1.32 {\xacc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
- vzip.u8 xreg1, xreg3
- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
- vzip.u8 xreg2, xreg4
- vld1.32 {yacc2lo[1]}, [TMP1]
- vzip.u8 xreg3, xreg4
- vld1.32 {yacc2hi[1]}, [TMP2]
- vzip.u8 xreg1, xreg2
- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
- vmull.u8 xacc1, xreg1, d28
- vzip.u8 yreg1, yreg3
- vmlal.u8 xacc1, xreg2, d29
- vzip.u8 yreg2, yreg4
- vmull.u8 xacc2, xreg3, d28
- vzip.u8 yreg3, yreg4
- vmlal.u8 xacc2, xreg4, d29
- vzip.u8 yreg1, yreg2
- vmull.u8 yacc1, yreg1, d28
- vmlal.u8 yacc1, yreg2, d29
- vmull.u8 yacc2, yreg3, d28
- vmlal.u8 yacc2, yreg4, d29
+ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
+ vzip.u8 \xreg1, \xreg3
+ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
+ vzip.u8 \xreg2, \xreg4
+ vld1.32 {\yacc2lo[1]}, [TMP1]
+ vzip.u8 \xreg3, \xreg4
+ vld1.32 {\yacc2hi[1]}, [TMP2]
+ vzip.u8 \xreg1, \xreg2
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ vmull.u8 \xacc1, \xreg1, d28
+ vzip.u8 \yreg1, \yreg3
+ vmlal.u8 \xacc1, \xreg2, d29
+ vzip.u8 \yreg2, \yreg4
+ vmull.u8 \xacc2, \xreg3, d28
+ vzip.u8 \yreg3, \yreg4
+ vmlal.u8 \xacc2, \xreg4, d29
+ vzip.u8 \yreg1, \yreg2
+ vmull.u8 \yacc1, \yreg1, d28
+ vmlal.u8 \yacc1, \yreg2, d29
+ vmull.u8 \yacc2, \yreg3, d28
+ vmlal.u8 \yacc2, \yreg4, d29
.endm
.macro bilinear_store_8888 numpix, tmp1, tmp2
-.if numpix == 4
+.if \numpix == 4
vst1.32 {d0, d1}, [OUT]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d0}, [OUT]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.32 {d0[0]}, [OUT, :32]!
.else
.error bilinear_store_8888 numpix is unsupported
@@ -201,12 +203,12 @@
vuzp.u8 d2, d3
vuzp.u8 d1, d3
vuzp.u8 d0, d2
- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
-.if numpix == 4
+ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
+.if \numpix == 4
vst1.16 {d2}, [OUT]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d2[0]}, [OUT]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.16 {d2[0]}, [OUT]!
.else
.error bilinear_store_0565 numpix is unsupported
@@ -222,20 +224,20 @@
.endm
.macro bilinear_load_mask_8 numpix, mask
-.if numpix == 4
- vld1.32 {mask[0]}, [MASK]!
-.elseif numpix == 2
- vld1.16 {mask[0]}, [MASK]!
-.elseif numpix == 1
- vld1.8 {mask[0]}, [MASK]!
+.if \numpix == 4
+ vld1.32 {\mask[0]}, [MASK]!
+.elseif \numpix == 2
+ vld1.16 {\mask[0]}, [MASK]!
+.elseif \numpix == 1
+ vld1.8 {\mask[0]}, [MASK]!
.else
- .error bilinear_load_mask_8 numpix is unsupported
+ .error bilinear_load_mask_8 \numpix is unsupported
.endif
pld [MASK, #prefetch_offset]
.endm
.macro bilinear_load_mask mask_fmt, numpix, mask
- bilinear_load_mask_&mask_fmt numpix, mask
+ bilinear_load_mask_\()\mask_fmt \numpix, \mask
.endm
@@ -250,28 +252,28 @@
.endm
.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
-.if numpix == 4
- vld1.32 {dst0, dst1}, [OUT]
-.elseif numpix == 2
- vld1.32 {dst0}, [OUT]
-.elseif numpix == 1
- vld1.32 {dst0[0]}, [OUT]
+.if \numpix == 4
+ vld1.32 {\dst0, \dst1}, [OUT]
+.elseif \numpix == 2
+ vld1.32 {\dst0}, [OUT]
+.elseif \numpix == 1
+ vld1.32 {\dst0[0]}, [OUT]
.else
- .error bilinear_load_dst_8888 numpix is unsupported
+ .error bilinear_load_dst_8888 \numpix is unsupported
.endif
pld [OUT, #(prefetch_offset * 4)]
.endm
.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
+ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm
/*
@@ -290,19 +292,19 @@
.endm
.macro bilinear_duplicate_mask_8 numpix, mask
-.if numpix == 4
- vdup.32 mask, mask[0]
-.elseif numpix == 2
- vdup.16 mask, mask[0]
-.elseif numpix == 1
- vdup.8 mask, mask[0]
+.if \numpix == 4
+ vdup.32 \mask, \mask[0]
+.elseif \numpix == 2
+ vdup.16 \mask, \mask[0]
+.elseif \numpix == 1
+ vdup.8 \mask, \mask[0]
.else
.error bilinear_duplicate_mask_8 is unsupported
.endif
.endm
.macro bilinear_duplicate_mask mask_fmt, numpix, mask
- bilinear_duplicate_mask_&mask_fmt numpix, mask
+ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
.endm
/*
@@ -310,10 +312,10 @@
 * Interleave should be done when mask is enabled or operator is 'over'.
*/
.macro bilinear_interleave src0, src1, dst0, dst1
- vuzp.8 src0, src1
- vuzp.8 dst0, dst1
- vuzp.8 src0, src1
- vuzp.8 dst0, dst1
+ vuzp.8 \src0, \src1
+ vuzp.8 \dst0, \dst1
+ vuzp.8 \src0, \src1
+ vuzp.8 \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_x_src \
@@ -323,7 +325,7 @@
.macro bilinear_interleave_src_dst_x_over \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_x_add \
@@ -333,26 +335,26 @@
.macro bilinear_interleave_src_dst_8_src \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_8_over \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_8_add \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst \
mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave_src_dst_&mask_fmt&_&op \
- numpix, src0, src1, src01, dst0, dst1, dst01
+ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm
@@ -370,23 +372,23 @@
numpix, src0, src1, src01, mask, \
tmp01, tmp23, tmp45, tmp67
- vmull.u8 tmp01, src0, mask
- vmull.u8 tmp23, src1, mask
+ vmull.u8 \tmp01, \src0, \mask
+ vmull.u8 \tmp23, \src1, \mask
/* bubbles */
- vrshr.u16 tmp45, tmp01, #8
- vrshr.u16 tmp67, tmp23, #8
+ vrshr.u16 \tmp45, \tmp01, #8
+ vrshr.u16 \tmp67, \tmp23, #8
/* bubbles */
- vraddhn.u16 src0, tmp45, tmp01
- vraddhn.u16 src1, tmp67, tmp23
+ vraddhn.u16 \src0, \tmp45, \tmp01
+ vraddhn.u16 \src1, \tmp67, \tmp23
.endm
.macro bilinear_apply_mask_to_src \
mask_fmt, numpix, src0, src1, src01, mask, \
tmp01, tmp23, tmp45, tmp67
- bilinear_apply_mask_to_src_&mask_fmt \
- numpix, src0, src1, src01, mask, \
- tmp01, tmp23, tmp45, tmp67
+ bilinear_apply_mask_to_src_\()\mask_fmt \
+ \numpix, \src0, \src1, \src01, \mask, \
+ \tmp01, \tmp23, \tmp45, \tmp67
.endm
@@ -403,79 +405,79 @@
numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
- vdup.32 tmp8, src1[1]
+ vdup.32 \tmp8, \src1[1]
/* bubbles */
- vmvn.8 tmp8, tmp8
+ vmvn.8 \tmp8, \tmp8
/* bubbles */
- vmull.u8 tmp01, dst0, tmp8
+ vmull.u8 \tmp01, \dst0, \tmp8
/* bubbles */
- vmull.u8 tmp23, dst1, tmp8
+ vmull.u8 \tmp23, \dst1, \tmp8
/* bubbles */
- vrshr.u16 tmp45, tmp01, #8
- vrshr.u16 tmp67, tmp23, #8
+ vrshr.u16 \tmp45, \tmp01, #8
+ vrshr.u16 \tmp67, \tmp23, #8
/* bubbles */
- vraddhn.u16 dst0, tmp45, tmp01
- vraddhn.u16 dst1, tmp67, tmp23
+ vraddhn.u16 \dst0, \tmp45, \tmp01
+ vraddhn.u16 \dst1, \tmp67, \tmp23
/* bubbles */
- vqadd.u8 src01, dst01, src01
+ vqadd.u8 \src01, \dst01, \src01
.endm
.macro bilinear_combine_add \
numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
- vqadd.u8 src01, dst01, src01
+ vqadd.u8 \src01, \dst01, \src01
.endm
.macro bilinear_combine \
op, numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
- bilinear_combine_&op \
- numpix, src0, src1, src01, dst0, dst1, dst01, \
- tmp01, tmp23, tmp45, tmp67, tmp8
+ bilinear_combine_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
+ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
.endm
/*
* Macros for final deinterleaving of destination pixels if needed.
*/
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
- vuzp.8 dst0, dst1
+ vuzp.8 \dst0, \dst1
/* bubbles */
- vuzp.8 dst0, dst1
+ vuzp.8 \dst0, \dst1
.endm
.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
.endm
.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
.endm
.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
+ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
- bilinear_load_&src_fmt d0, d1, d2
- bilinear_load_mask mask_fmt, 1, d4
- bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
+ bilinear_load_\()\src_fmt d0, d1, d2
+ bilinear_load_mask \mask_fmt, 1, d4
+ bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
@@ -483,28 +485,28 @@
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
/* 5 cycles bubble */
- bilinear_duplicate_mask mask_fmt, 1, d4
+ bilinear_duplicate_mask \mask_fmt, 1, d4
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
bilinear_interleave_src_dst \
- mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
+ \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
- mask_fmt, 1, d0, d1, q0, d4, \
+ \mask_fmt, 1, d0, d1, q0, d4, \
q3, q8, q10, q11
bilinear_combine \
- op, 1, d0, d1, q0, d18, d19, q9, \
+ \op, 1, d0, d1, q0, d18, d19, q9, \
q3, q8, q10, q11, d5
- bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
- bilinear_store_&dst_fmt 1, q2, q3
+ bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
+ bilinear_store_\()\dst_fmt 1, q2, q3
.endm
.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
- bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23
- bilinear_load_mask mask_fmt, 2, d4
- bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
+ bilinear_load_mask \mask_fmt, 2, d4
+ bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
@@ -513,24 +515,24 @@
vmlal.u16 q10, d23, d31
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
- bilinear_duplicate_mask mask_fmt, 2, d4
+ bilinear_duplicate_mask \mask_fmt, 2, d4
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
bilinear_interleave_src_dst \
- mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
+ \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
- mask_fmt, 2, d0, d1, q0, d4, \
+ \mask_fmt, 2, d0, d1, q0, d4, \
q3, q8, q10, q11
bilinear_combine \
- op, 2, d0, d1, q0, d18, d19, q9, \
+ \op, 2, d0, d1, q0, d18, d19, q9, \
q3, q8, q10, q11, d5
- bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
- bilinear_store_&dst_fmt 2, q2, q3
+ bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
+ bilinear_store_\()\dst_fmt 2, q2, q3
.endm
.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
- bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
@@ -546,8 +548,8 @@
vmlsl.u16 q2, d6, d30
vmlal.u16 q2, d7, d30
vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
- bilinear_load_mask mask_fmt, 4, d22
- bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
+ bilinear_load_mask \mask_fmt, 4, d22
+ bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
pld [TMP1, PF_OFFS]
vmlsl.u16 q8, d18, d31
vmlal.u16 q8, d19, d31
@@ -556,21 +558,21 @@
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
- bilinear_duplicate_mask mask_fmt, 4, d22
+ bilinear_duplicate_mask \mask_fmt, 4, d22
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
bilinear_interleave_src_dst \
- mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
+ \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
bilinear_apply_mask_to_src \
- mask_fmt, 4, d0, d1, q0, d22, \
+ \mask_fmt, 4, d0, d1, q0, d22, \
q3, q8, q9, q10
bilinear_combine \
- op, 4, d0, d1, q0, d2, d3, q1, \
+ \op, 4, d0, d1, q0, d2, d3, q1, \
q3, q8, q9, q10, d23
- bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
- bilinear_store_&dst_fmt 4, q2, q3
+ bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
+ bilinear_store_\()\dst_fmt 4, q2, q3
.endm
.set BILINEAR_FLAG_USE_MASK, 1
@@ -610,14 +612,14 @@
prefetch_distance, \
flags
-pixman_asm_function fname
-.if pixblock_size == 8
-.elseif pixblock_size == 4
+pixman_asm_function \fname
+.if \pixblock_size == 8
+.elseif \pixblock_size == 4
.else
.error unsupported pixblock size
.endif
-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
OUT .req r0
TOP .req r1
BOTTOM .req r2
@@ -635,7 +637,7 @@ pixman_asm_function fname
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
ldmia ip, {WB, X, UX, WIDTH}
.else
OUT .req r0
@@ -654,17 +656,17 @@ pixman_asm_function fname
TMP4 .req r10
STRIDE .req r3
- .set prefetch_offset, prefetch_distance
+ .set prefetch_offset, \prefetch_distance
mov ip, sp
push {r4, r5, r6, r7, r8, r9, r10, ip}
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
ldmia ip, {WT, WB, X, UX, WIDTH}
.endif
mul PF_OFFS, PF_OFFS, UX
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpush {d8-d15}
.endif
@@ -683,11 +685,11 @@ pixman_asm_function fname
/* ensure good destination alignment */
cmp WIDTH, #1
blt 0f
- tst OUT, #(1 << dst_bpp_shift)
+ tst OUT, #(1 << \dst_bpp_shift)
beq 0f
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
- bilinear_process_last_pixel
+ \bilinear_process_last_pixel
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
@@ -696,53 +698,53 @@ pixman_asm_function fname
cmp WIDTH, #2
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 1))
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
beq 0f
- bilinear_process_two_pixels
+ \bilinear_process_two_pixels
sub WIDTH, WIDTH, #2
0:
-.if pixblock_size == 8
+.if \pixblock_size == 8
cmp WIDTH, #4
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 2))
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
beq 0f
- bilinear_process_four_pixels
+ \bilinear_process_four_pixels
sub WIDTH, WIDTH, #4
0:
.endif
- subs WIDTH, WIDTH, #pixblock_size
+ subs WIDTH, WIDTH, #\pixblock_size
blt 1f
- mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
- bilinear_process_pixblock_head
- subs WIDTH, WIDTH, #pixblock_size
+ mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
+ \bilinear_process_pixblock_head
+ subs WIDTH, WIDTH, #\pixblock_size
blt 5f
0:
- bilinear_process_pixblock_tail_head
- subs WIDTH, WIDTH, #pixblock_size
+ \bilinear_process_pixblock_tail_head
+ subs WIDTH, WIDTH, #\pixblock_size
bge 0b
5:
- bilinear_process_pixblock_tail
+ \bilinear_process_pixblock_tail
1:
-.if pixblock_size == 8
+.if \pixblock_size == 8
tst WIDTH, #4
beq 2f
- bilinear_process_four_pixels
+ \bilinear_process_four_pixels
2:
.endif
/* handle the remaining trailing pixels */
tst WIDTH, #2
beq 2f
- bilinear_process_two_pixels
+ \bilinear_process_two_pixels
2:
tst WIDTH, #1
beq 3f
- bilinear_process_last_pixel
+ \bilinear_process_last_pixel
3:
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpop {d8-d15}
.endif
-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
pop {r4, r5, r6, r7, r8, r9}
.else
pop {r4, r5, r6, r7, r8, r9, r10, ip}
@@ -762,11 +764,11 @@ pixman_asm_function fname
.unreq TMP3
.unreq TMP4
.unreq STRIDE
-.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
.unreq MASK
.endif
-.endfunc
+pixman_end_asm_function
.endm
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 7e949a3..0e09257 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -53,6 +53,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"
+ pixman_syntax_unified
+
/* Global configuration options and preferences */
/*
@@ -260,13 +262,13 @@
vshrn.u16 d7, q2, #3
vsli.u16 q2, q2, #5
vshll.u8 q14, d16, #8
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vshll.u8 q8, d19, #8
- PF tst PF_CTL, #0xF
+ PF tst, PF_CTL, #0xF
vsri.u8 d6, d6, #5
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vmvn.8 d3, d3
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vsri.u8 d7, d7, #6
vshrn.u16 d30, q2, #2
vmull.u8 q10, d3, d6
@@ -275,18 +277,18 @@
vmull.u8 q12, d3, d30
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vsri.u16 q14, q8, #5
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vshll.u8 q9, d18, #8
vrshr.u16 q13, q10, #8
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vrshr.u16 q3, q11, #8
vrshr.u16 q15, q12, #8
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vsri.u16 q14, q9, #11
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vraddhn.u16 d20, q10, q13
vraddhn.u16 d23, q11, q3
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vraddhn.u16 d22, q12, q15
vst1.16 {d28, d29}, [DST_W, :128]!
.endm
@@ -434,20 +436,20 @@ generate_composite_function \
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
vsri.u16 q14, q8, #5
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
fetch_src_pixblock
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vsri.u16 q14, q9, #11
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vshll.u8 q8, d1, #8
vst1.16 {d28, d29}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
vshll.u8 q14, d2, #8
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vshll.u8 q9, d0, #8
.endm
@@ -509,20 +511,20 @@ generate_composite_function \
.macro pixman_composite_add_8_8_process_pixblock_tail_head
fetch_src_pixblock
- PF add PF_X, PF_X, #32
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #32
+ PF tst, PF_CTL, #0xF
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #32
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #32
+ PF subne, PF_CTL, PF_CTL, #1
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -541,20 +543,20 @@ generate_composite_function \
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
fetch_src_pixblock
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -604,16 +606,16 @@ generate_composite_function_single_scanline \
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
fetch_src_pixblock
@@ -621,13 +623,13 @@ generate_composite_function_single_scanline \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -656,16 +658,16 @@ generate_composite_function_single_scanline \
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
@@ -675,13 +677,13 @@ generate_composite_function_single_scanline \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -742,20 +744,20 @@ generate_composite_function_single_scanline \
vraddhn.u16 d31, q3, q11
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vqadd.u8 q14, q0, q14
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0x0F
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0x0F
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vqadd.u8 q15, q1, q15
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vmull.u8 q8, d24, d4
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vmull.u8 q9, d24, d5
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d6
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d7
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
.endm
@@ -784,16 +786,16 @@ generate_composite_function \
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
@@ -802,12 +804,12 @@ generate_composite_function \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -1245,23 +1247,23 @@ generate_composite_function \
.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
fetch_mask_pixblock
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vrshrn.u16 d28, q8, #8
- PF tst PF_CTL, #0x0F
+ PF tst, PF_CTL, #0x0F
vrshrn.u16 d29, q9, #8
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vrshrn.u16 d30, q10, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vrshrn.u16 d31, q11, #8
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vmull.u8 q8, d24, d0
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q9, d24, d1
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d2
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d3
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q8, q8, #8
vrsra.u16 q9, q9, #8
@@ -1314,23 +1316,23 @@ generate_composite_function \
.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
fetch_mask_pixblock
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vrshrn.u16 d28, q0, #8
- PF tst PF_CTL, #0x0F
+ PF tst, PF_CTL, #0x0F
vrshrn.u16 d29, q1, #8
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vrshrn.u16 d30, q2, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vrshrn.u16 d31, q3, #8
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vmull.u8 q0, d24, d16
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q1, d25, d16
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q2, d26, d16
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q3, d27, d16
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q0, q0, #8
vrsra.u16 q1, q1, #8
@@ -1408,27 +1410,27 @@ generate_composite_function \
vrshr.u16 q15, q9, #8
fetch_mask_pixblock
vrshr.u16 q6, q10, #8
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vrshr.u16 q7, q11, #8
- PF tst PF_CTL, #0x0F
+ PF tst, PF_CTL, #0x0F
vraddhn.u16 d28, q14, q8
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vraddhn.u16 d29, q15, q9
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d30, q6, q10
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d31, q7, q11
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vmull.u8 q6, d24, d8
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q7, d24, d9
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d24, d10
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d24, d11
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q14, q0, q14
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vqadd.u8 q15, q1, q15
vrshr.u16 q10, q6, #8
vrshr.u16 q11, q7, #8
@@ -2425,21 +2427,21 @@ generate_composite_function \
vrshr.u16 q13, q10, #8
fetch_src_pixblock
vraddhn.u16 d30, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d29, q12, q9
vraddhn.u16 d28, q13, q10
vmull.u8 q8, d3, d0
vmull.u8 q9, d3, d1
vmull.u8 q10, d3, d2
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
@@ -2482,21 +2484,21 @@ generate_composite_function \
vrshr.u16 q13, q10, #8
fetch_src_pixblock
vraddhn.u16 d28, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d29, q12, q9
vraddhn.u16 d30, q13, q10
vmull.u8 q8, d3, d0
vmull.u8 q9, d3, d1
vmull.u8 q10, d3, d2
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
@@ -2841,28 +2843,28 @@ generate_composite_function_nearest_scanline \
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
- vld1.32 {reg1}, [TMP1], STRIDE
- vld1.32 {reg2}, [TMP1]
+ vld1.32 {\reg1}, [TMP1], STRIDE
+ vld1.32 {\reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
- vld1.32 {reg2[0]}, [TMP1], STRIDE
- vld1.32 {reg2[1]}, [TMP1]
- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+ vld1.32 {\reg2[0]}, [TMP1], STRIDE
+ vld1.32 {\reg2[1]}, [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm
.macro bilinear_load_and_vertical_interpolate_two_8888 \
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
- bilinear_load_8888 reg1, reg2, tmp1
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- bilinear_load_8888 reg3, reg4, tmp2
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_8888 \
@@ -2870,9 +2872,9 @@ generate_composite_function_nearest_scanline \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm
.macro bilinear_load_and_vertical_interpolate_two_0565 \
@@ -2884,19 +2886,19 @@ generate_composite_function_nearest_scanline \
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {acc2lo[0]}, [TMP1], STRIDE
- vld1.32 {acc2hi[0]}, [TMP2], STRIDE
- vld1.32 {acc2lo[1]}, [TMP1]
- vld1.32 {acc2hi[1]}, [TMP2]
- convert_0565_to_x888 acc2, reg3, reg2, reg1
- vzip.u8 reg1, reg3
- vzip.u8 reg2, reg4
- vzip.u8 reg3, reg4
- vzip.u8 reg1, reg2
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\acc2lo[1]}, [TMP1]
+ vld1.32 {\acc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip.u8 \reg1, \reg3
+ vzip.u8 \reg2, \reg4
+ vzip.u8 \reg3, \reg4
+ vzip.u8 \reg1, \reg2
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_0565 \
@@ -2909,49 +2911,49 @@ generate_composite_function_nearest_scanline \
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
- vld1.32 {xacc2lo[1]}, [TMP1]
- vld1.32 {xacc2hi[1]}, [TMP2]
- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\xacc2lo[1]}, [TMP1]
+ vld1.32 {\xacc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
- vzip.u8 xreg1, xreg3
- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
- vzip.u8 xreg2, xreg4
- vld1.32 {yacc2lo[1]}, [TMP1]
- vzip.u8 xreg3, xreg4
- vld1.32 {yacc2hi[1]}, [TMP2]
- vzip.u8 xreg1, xreg2
- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
- vmull.u8 xacc1, xreg1, d28
- vzip.u8 yreg1, yreg3
- vmlal.u8 xacc1, xreg2, d29
- vzip.u8 yreg2, yreg4
- vmull.u8 xacc2, xreg3, d28
- vzip.u8 yreg3, yreg4
- vmlal.u8 xacc2, xreg4, d29
- vzip.u8 yreg1, yreg2
- vmull.u8 yacc1, yreg1, d28
- vmlal.u8 yacc1, yreg2, d29
- vmull.u8 yacc2, yreg3, d28
- vmlal.u8 yacc2, yreg4, d29
+ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
+ vzip.u8 \xreg1, \xreg3
+ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
+ vzip.u8 \xreg2, \xreg4
+ vld1.32 {\yacc2lo[1]}, [TMP1]
+ vzip.u8 \xreg3, \xreg4
+ vld1.32 {\yacc2hi[1]}, [TMP2]
+ vzip.u8 \xreg1, \xreg2
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ vmull.u8 \xacc1, \xreg1, d28
+ vzip.u8 \yreg1, \yreg3
+ vmlal.u8 \xacc1, \xreg2, d29
+ vzip.u8 \yreg2, \yreg4
+ vmull.u8 \xacc2, \xreg3, d28
+ vzip.u8 \yreg3, \yreg4
+ vmlal.u8 \xacc2, \xreg4, d29
+ vzip.u8 \yreg1, \yreg2
+ vmull.u8 \yacc1, \yreg1, d28
+ vmlal.u8 \yacc1, \yreg2, d29
+ vmull.u8 \yacc2, \yreg3, d28
+ vmlal.u8 \yacc2, \yreg4, d29
.endm
.macro bilinear_store_8888 numpix, tmp1, tmp2
-.if numpix == 4
+.if \numpix == 4
vst1.32 {d0, d1}, [OUT, :128]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d0}, [OUT, :64]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.32 {d0[0]}, [OUT, :32]!
.else
- .error bilinear_store_8888 numpix is unsupported
+ .error bilinear_store_8888 \numpix is unsupported
.endif
.endm
@@ -2960,20 +2962,20 @@ generate_composite_function_nearest_scanline \
vuzp.u8 d2, d3
vuzp.u8 d1, d3
vuzp.u8 d0, d2
- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
-.if numpix == 4
+ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
+.if \numpix == 4
vst1.16 {d2}, [OUT, :64]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d2[0]}, [OUT, :32]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.16 {d2[0]}, [OUT, :16]!
.else
- .error bilinear_store_0565 numpix is unsupported
+ .error bilinear_store_0565 \numpix is unsupported
.endif
.endm
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
- bilinear_load_&src_fmt d0, d1, d2
+ bilinear_load_\()\src_fmt d0, d1, d2
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
@@ -2985,11 +2987,11 @@ generate_composite_function_nearest_scanline \
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
- bilinear_store_&dst_fmt 1, q2, q3
+ bilinear_store_\()\dst_fmt 1, q2, q3
.endm
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
- bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
@@ -3002,11 +3004,11 @@ generate_composite_function_nearest_scanline \
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
- bilinear_store_&dst_fmt 2, q2, q3
+ bilinear_store_\()\dst_fmt 2, q2, q3
.endm
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
- bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
@@ -3034,54 +3036,54 @@ generate_composite_function_nearest_scanline \
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
- bilinear_store_&dst_fmt 4, q2, q3
+ bilinear_store_\()\dst_fmt 4, q2, q3
.endm
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.endif
.endm
.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.else
- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm
@@ -3106,7 +3108,7 @@ generate_composite_function_nearest_scanline \
src_bpp_shift, dst_bpp_shift, \
prefetch_distance, flags
-pixman_asm_function fname
+pixman_asm_function \fname
OUT .req r0
TOP .req r1
BOTTOM .req r2
@@ -3124,11 +3126,11 @@ pixman_asm_function fname
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
ldmia ip, {WB, X, UX, WIDTH}
mul PF_OFFS, PF_OFFS, UX
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpush {d8-d15}
.endif
@@ -3147,11 +3149,11 @@ pixman_asm_function fname
/* ensure good destination alignment */
cmp WIDTH, #1
blt 0f
- tst OUT, #(1 << dst_bpp_shift)
+ tst OUT, #(1 << \dst_bpp_shift)
beq 0f
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
- bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
@@ -3160,64 +3162,64 @@ pixman_asm_function fname
cmp WIDTH, #2
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 1))
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
beq 0f
- bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #2
0:
-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
cmp WIDTH, #4
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 2))
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
beq 0f
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #4
0:
subs WIDTH, WIDTH, #8
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #8
blt 5f
0:
- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #8
bge 0b
5:
- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
1:
tst WIDTH, #4
beq 2f
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
subs WIDTH, WIDTH, #4
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #4
blt 5f
0:
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #4
bge 0b
5:
- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
1:
/****************************************************/
.endif
/* handle the remaining trailing pixels */
tst WIDTH, #2
beq 2f
- bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
2:
tst WIDTH, #1
beq 3f
- bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
3:
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpop {d8-d15}
.endif
pop {r4, r5, r6, r7, r8, r9}
@@ -3236,7 +3238,7 @@ pixman_asm_function fname
.unreq TMP3
.unreq TMP4
.unreq STRIDE
-.endfunc
+ pixman_end_asm_function
.endm
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index bdcf6a9..06318d9 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -74,134 +74,134 @@
*/
.macro pixldst1 op, elem_size, reg1, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]!
.else
- op&.&elem_size {d&reg1}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]!
.endif
.endm
.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!
.else
- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!
.endif
.endm
.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!
.else
- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!
.endif
.endm
.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
- op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!
.endm
.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
- op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!
.endm
.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
- op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!
.endm
.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
-.if numbytes == 32
- pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
- %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif numbytes == 16
- pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
-.elseif numbytes == 8
- pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
-.elseif numbytes == 4
- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
- pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
- .elseif elem_size == 16
- pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
- pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+.if \numbytes == 32
+ pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif \numbytes == 16
+ pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+.elseif \numbytes == 8
+ pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits
+.elseif \numbytes == 4
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
+ pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits
+ .elseif \elem_size == 16
+ pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits
+ pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits
.else
- pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+ pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits
.endif
-.elseif numbytes == 2
- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
- pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+.elseif \numbytes == 2
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
+ pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits
.else
- pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+ pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits
.endif
-.elseif numbytes == 1
- pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.elseif \numbytes == 1
+ pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
.macro pixld numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
- %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
- pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.else
- pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+ pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm
.macro pixst numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
- %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
- pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.else
- pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+ pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm
.macro pixld_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.if (\bpp * \numpix) <= 128
+ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
- pixld numpix, bpp, basereg, mem_operand, 128
+ pixld \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm
.macro pixst_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.if (\bpp * \numpix) <= 128
+ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
- pixst numpix, bpp, basereg, mem_operand, 128
+ pixst \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm
@@ -210,44 +210,44 @@
* aliases to be defined)
*/
.macro pixld1_s elem_size, reg1, mem_operand
-.if elem_size == 16
+.if \elem_size == 16
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #1
+ add TMP1, \mem_operand, TMP1, asl #1
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP2, mem_operand, TMP2, asl #1
- vld1.16 {d&reg1&[0]}, [TMP1, :16]
+ add TMP2, \mem_operand, TMP2, asl #1
+ vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #1
- vld1.16 {d&reg1&[1]}, [TMP2, :16]
+ add TMP1, \mem_operand, TMP1, asl #1
+ vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP2, mem_operand, TMP2, asl #1
- vld1.16 {d&reg1&[2]}, [TMP1, :16]
- vld1.16 {d&reg1&[3]}, [TMP2, :16]
-.elseif elem_size == 32
+ add TMP2, \mem_operand, TMP2, asl #1
+ vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
+ vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]
+.elseif \elem_size == 32
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #2
+ add TMP1, \mem_operand, TMP1, asl #2
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP2, mem_operand, TMP2, asl #2
- vld1.32 {d&reg1&[0]}, [TMP1, :32]
- vld1.32 {d&reg1&[1]}, [TMP2, :32]
+ add TMP2, \mem_operand, TMP2, asl #2
+ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
+ vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]
.else
.error "unsupported"
.endif
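
The hunk above captures the two conversions that dominate this file: every reference to a macro parameter gains a leading backslash, and the gas-only "&" concatenation used to build register names (d&reg1&[0]) becomes the "\()" separator that both gas and clang's integrated assembler accept, while conditional flag-setting arithmetic moves to unified-syntax ordering (subspl rather than subpls). A minimal sketch of the same idiom, not part of the patch, assuming an ARMv7-A/NEON target and arbitrary register choices; load_lane is a hypothetical helper that loosely mirrors pixld0_s:

    .syntax unified
    .arch   armv7-a
    .fpu    neon

    .macro  load_lane reg, idx, ptr
        vld1.32 {d\()\reg\()[\idx]}, [\ptr]  @ "\()" glues the pieces into d4[1]
    .endm

    load_lane 4, 1, r0                       @ expands to: vld1.32 {d4[1]}, [r0]

    adds    r1, r1, r2
    subspl  r1, r1, r3                       @ UAL: S suffix first, condition last
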
@@ -257,110 +257,110 @@
.if 0 /* elem_size == 32 */
mov TMP1, VX, asr #16
add VX, VX, UNIT_X, asl #1
- add TMP1, mem_operand, TMP1, asl #2
+ add TMP1, \mem_operand, TMP1, asl #2
mov TMP2, VX, asr #16
sub VX, VX, UNIT_X
- add TMP2, mem_operand, TMP2, asl #2
- vld1.32 {d&reg1&[0]}, [TMP1, :32]
+ add TMP2, \mem_operand, TMP2, asl #2
+ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
mov TMP1, VX, asr #16
add VX, VX, UNIT_X, asl #1
- add TMP1, mem_operand, TMP1, asl #2
- vld1.32 {d&reg2&[0]}, [TMP2, :32]
+ add TMP1, \mem_operand, TMP1, asl #2
+ vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]
mov TMP2, VX, asr #16
add VX, VX, UNIT_X
- add TMP2, mem_operand, TMP2, asl #2
- vld1.32 {d&reg1&[1]}, [TMP1, :32]
- vld1.32 {d&reg2&[1]}, [TMP2, :32]
+ add TMP2, \mem_operand, TMP2, asl #2
+ vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]
+ vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]
.else
- pixld1_s elem_size, reg1, mem_operand
- pixld1_s elem_size, reg2, mem_operand
+ pixld1_s \elem_size, \reg1, \mem_operand
+ pixld1_s \elem_size, \reg2, \mem_operand
.endif
.endm
.macro pixld0_s elem_size, reg1, idx, mem_operand
-.if elem_size == 16
+.if \elem_size == 16
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #1
- vld1.16 {d&reg1&[idx]}, [TMP1, :16]
-.elseif elem_size == 32
+ add TMP1, \mem_operand, TMP1, asl #1
+ vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
+.elseif \elem_size == 32
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #2
- vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+ add TMP1, \mem_operand, TMP1, asl #2
+ vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
.endif
.endm
.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
-.if numbytes == 32
- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
- pixdeinterleave elem_size, %(basereg+4)
-.elseif numbytes == 16
- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
-.elseif numbytes == 8
- pixld1_s elem_size, %(basereg+1), mem_operand
-.elseif numbytes == 4
- .if elem_size == 32
- pixld0_s elem_size, %(basereg+0), 1, mem_operand
- .elseif elem_size == 16
- pixld0_s elem_size, %(basereg+0), 2, mem_operand
- pixld0_s elem_size, %(basereg+0), 3, mem_operand
+.if \numbytes == 32
+ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
+ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
+ pixdeinterleave \elem_size, %(\basereg+4)
+.elseif \numbytes == 16
+ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
+.elseif \numbytes == 8
+ pixld1_s \elem_size, %(\basereg+1), \mem_operand
+.elseif \numbytes == 4
+ .if \elem_size == 32
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+ .elseif \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
.else
- pixld0_s elem_size, %(basereg+0), 4, mem_operand
- pixld0_s elem_size, %(basereg+0), 5, mem_operand
- pixld0_s elem_size, %(basereg+0), 6, mem_operand
- pixld0_s elem_size, %(basereg+0), 7, mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
.endif
-.elseif numbytes == 2
- .if elem_size == 16
- pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.elseif \numbytes == 2
+ .if \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
.else
- pixld0_s elem_size, %(basereg+0), 2, mem_operand
- pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
.endif
-.elseif numbytes == 1
- pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.elseif \numbytes == 1
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
.macro pixld_s numpix, bpp, basereg, mem_operand
-.if bpp > 0
- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.if \bpp > 0
+ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
.endif
.endm
.macro vuzp8 reg1, reg2
- vuzp.8 d&reg1, d&reg2
+ vuzp.8 d\()\reg1, d\()\reg2
.endm
.macro vzip8 reg1, reg2
- vzip.8 d&reg1, d&reg2
+ vzip.8 d\()\reg1, d\()\reg2
.endm
/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- vuzp8 %(basereg+0), %(basereg+1)
- vuzp8 %(basereg+2), %(basereg+3)
- vuzp8 %(basereg+1), %(basereg+3)
- vuzp8 %(basereg+0), %(basereg+2)
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vuzp8 %(\basereg+0), %(\basereg+1)
+ vuzp8 %(\basereg+2), %(\basereg+3)
+ vuzp8 %(\basereg+1), %(\basereg+3)
+ vuzp8 %(\basereg+0), %(\basereg+2)
.endif
.endm
/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- vzip8 %(basereg+0), %(basereg+2)
- vzip8 %(basereg+1), %(basereg+3)
- vzip8 %(basereg+2), %(basereg+3)
- vzip8 %(basereg+0), %(basereg+1)
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vzip8 %(\basereg+0), %(\basereg+2)
+ vzip8 %(\basereg+1), %(\basereg+3)
+ vzip8 %(\basereg+2), %(\basereg+3)
+ vzip8 %(\basereg+0), %(\basereg+1)
.endif
.endm
@@ -394,22 +394,22 @@
*/
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
- a x
+ \a \x
.endif
.endm
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
- PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+ PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
-.if std_increment != 0
- PF add PF_X, PF_X, #std_increment
+.if \std_increment != 0
+ PF add, PF_X, PF_X, #\std_increment
.endif
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #boost_increment
- PF subne PF_CTL, PF_CTL, #1
- PF cmp PF_X, ORIG_W
+ PF tst, PF_CTL, #0xF
+ PF addne, PF_X, PF_X, #\boost_increment
+ PF subne, PF_CTL, PF_CTL, #1
+ PF cmp, PF_X, ORIG_W
.if src_bpp_shift >= 0
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
@@ -419,16 +419,16 @@
.if mask_bpp_shift >= 0
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
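
The PF invocations change for a related reason: PF is declared as ".macro PF a, x:vararg" and simply re-emits "\a \x", so under unified syntax the mnemonic has to arrive as its own comma-separated argument, and the condition code moves behind the size and S suffixes (ldrbge instead of ldrgeb, subsge instead of subges). A simplified sketch, not part of the patch and without the prefetch-type guard the real macro carries:

    .syntax unified

    .macro PF a, x:vararg
        \a \x                       @ re-emit the instruction passed in
    .endm

    PF add,    r1, r1, #8           @ emits: add r1, r1, #8
    cmp        r0, #0
    PF ldrbge, r2, [r3], #1         @ emits: ldrbge r2, [r3], #1 (was ldrgeb)
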
@@ -465,21 +465,20 @@
beq 2f
.irp lowbit, 1, 2, 4, 8, 16
-local skip1
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
- tst DST_R, #lowbit
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_R, #\lowbit
beq 1f
.endif
- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
- add DST_R, DST_R, #lowbit
+ add DST_R, DST_R, #\lowbit
.endif
- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
- sub W, W, #(lowbit * 8 / dst_w_bpp)
+ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
+ sub W, W, #(\lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
@@ -487,19 +486,19 @@ local skip1
pixdeinterleave mask_bpp, mask_basereg
pixdeinterleave dst_r_bpp, dst_r_basereg
- process_pixblock_head
+ \process_pixblock_head
cache_preload 0, pixblock_size
cache_preload_simple
- process_pixblock_tail
+ \process_pixblock_tail
pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
- tst DST_W, #lowbit
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_W, #\lowbit
beq 1f
.endif
- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
@@ -530,18 +529,18 @@ local skip1
tst W, #(pixblock_size - 1)
beq 2f
.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
- tst W, #chunk_size
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
beq 1f
- pixld_src chunk_size, src_bpp, src_basereg, SRC
- pixld chunk_size, mask_bpp, mask_basereg, MASK
-.if dst_aligned_flag != 0
- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+ pixld_src \chunk_size, src_bpp, src_basereg, SRC
+ pixld \chunk_size, mask_bpp, mask_basereg, MASK
+.if \dst_aligned_flag != 0
+ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
-.if cache_preload_flag != 0
- PF add PF_X, PF_X, #chunk_size
+.if \cache_preload_flag != 0
+ PF add, PF_X, PF_X, #\chunk_size
.endif
1:
.endif
@@ -550,21 +549,21 @@ local skip1
pixdeinterleave mask_bpp, mask_basereg
pixdeinterleave dst_r_bpp, dst_r_basereg
- process_pixblock_head
-.if cache_preload_flag != 0
+ \process_pixblock_head
+.if \cache_preload_flag != 0
cache_preload 0, pixblock_size
cache_preload_simple
.endif
- process_pixblock_tail
+ \process_pixblock_tail
pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
- tst W, #chunk_size
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
beq 1f
-.if dst_aligned_flag != 0
- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.if \dst_aligned_flag != 0
+ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
@@ -604,7 +603,7 @@ local skip1
.if regs_shortage
str H, [sp, #4] /* save updated height to stack */
.endif
- bge start_of_loop_label
+ bge \start_of_loop_label
.endm
/*
@@ -631,7 +630,7 @@ local skip1
src_basereg_ = 0, \
mask_basereg_ = 24
- pixman_asm_function fname
+ pixman_asm_function \fname
push {r4-r12, lr} /* save all registers */
@@ -641,10 +640,10 @@ local skip1
* has to be used instead of ADVANCED.
*/
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
-.if prefetch_distance == 0
+.if \prefetch_distance == 0
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif
@@ -652,17 +651,17 @@ local skip1
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set pixblock_size, pixblock_size_
- .set dst_w_basereg, dst_w_basereg_
- .set dst_r_basereg, dst_r_basereg_
- .set src_basereg, src_basereg_
- .set mask_basereg, mask_basereg_
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
.macro pixld_src x:vararg
- pixld x
+ pixld \x
.endm
.macro fetch_src_pixblock
pixld_src pixblock_size, src_bpp, \
@@ -755,19 +754,19 @@ local skip1
.error "requested dst bpp (dst_w_bpp) is not supported"
.endif
-.if (((flags) & FLAG_DST_READWRITE) != 0)
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.else
.set dst_r_bpp, 0
.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
.set DEINTERLEAVE_32BPP_ENABLED, 1
.else
.set DEINTERLEAVE_32BPP_ENABLED, 0
.endif
-.if prefetch_distance < 0 || prefetch_distance > 15
- .error "invalid prefetch distance (prefetch_distance)"
+.if \prefetch_distance < 0 || \prefetch_distance > 15
+ .error "invalid prefetch distance (\prefetch_distance)"
.endif
.if src_bpp > 0
@@ -776,7 +775,7 @@ local skip1
.if mask_bpp > 0
ldr MASK, [sp, #48]
.endif
- PF mov PF_X, #0
+ PF mov, PF_X, #0
.if src_bpp > 0
ldr SRC_STRIDE, [sp, #44]
.endif
@@ -801,14 +800,14 @@ local skip1
/*
* Setup advanced prefetcher initial state
*/
- PF mov PF_SRC, SRC
- PF mov PF_DST, DST_R
- PF mov PF_MASK, MASK
+ PF mov, PF_SRC, SRC
+ PF mov, PF_DST, DST_R
+ PF mov, PF_MASK, MASK
/* PF_CTL = prefetch_distance | ((h - 1) << 4) */
- PF mov PF_CTL, H, lsl #4
- PF add PF_CTL, #(prefetch_distance - 0x10)
+ PF mov, PF_CTL, H, lsl #4
+ PF add, PF_CTL, #(\prefetch_distance - 0x10)
- init
+ \init
.if regs_shortage
push {r0, r1}
.endif
@@ -826,9 +825,9 @@ local skip1
* long scanlines
*/
0:
- ensure_destination_ptr_alignment process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
/* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
pixld_a pixblock_size, dst_r_bpp, \
@@ -836,33 +835,33 @@ local skip1
fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- PF add PF_X, PF_X, #pixblock_size
- process_pixblock_head
+ PF add, PF_X, PF_X, #pixblock_size
+ \process_pixblock_head
cache_preload 0, pixblock_size
cache_preload_simple
subs W, W, #(pixblock_size * 2)
blt 2f
1:
- process_pixblock_tail_head
+ \process_pixblock_tail_head
cache_preload_simple
subs W, W, #pixblock_size
bge 1b
2:
- process_pixblock_tail
+ \process_pixblock_tail
pixst_a pixblock_size, dst_w_bpp, \
(dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
/* Process the remaining trailing pixels in the scanline */
process_trailing_pixels 1, 1, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
advance_to_next_scanline 0b
.if regs_shortage
pop {r0, r1}
.endif
- cleanup
+ \cleanup
pop {r4-r12, pc} /* exit */
/*
* This is the start of the loop, designed to process images with small width
@@ -878,22 +877,22 @@ local skip1
fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- process_pixblock_head
- process_pixblock_tail
+ \process_pixblock_head
+ \process_pixblock_tail
pixst pixblock_size, dst_w_bpp, \
(dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
/* Process the remaining trailing pixels in the scanline */
process_trailing_pixels 0, 0, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
advance_to_next_scanline 8b
9:
.if regs_shortage
pop {r0, r1}
.endif
- cleanup
+ \cleanup
pop {r4-r12, pc} /* exit */
.purgem fetch_src_pixblock
@@ -915,7 +914,7 @@ local skip1
.unreq PF_DST
.unreq PF_MASK
.unreq DUMMY
- .endfunc
+ pixman_end_asm_function
.endm
/*
@@ -939,23 +938,23 @@ local skip1
src_basereg_ = 0, \
mask_basereg_ = 24
- pixman_asm_function fname
+ pixman_asm_function \fname
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set pixblock_size, pixblock_size_
- .set dst_w_basereg, dst_w_basereg_
- .set dst_r_basereg, dst_r_basereg_
- .set src_basereg, src_basereg_
- .set mask_basereg, mask_basereg_
-
-.if use_nearest_scaling != 0
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
+
+.if \use_nearest_scaling != 0
/*
* Assign symbolic names to registers for nearest scaling
*/
@@ -971,7 +970,7 @@ local skip1
SRC_WIDTH_FIXED .req r7
.macro pixld_src x:vararg
- pixld_s x
+ pixld_s \x
.endm
ldr UNIT_X, [sp]
@@ -991,16 +990,16 @@ local skip1
MASK .req r3 /* mask pointer */
.macro pixld_src x:vararg
- pixld x
+ pixld \x
.endm
.endif
-.if (((flags) & FLAG_DST_READWRITE) != 0)
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.else
.set dst_r_bpp, 0
.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
.set DEINTERLEAVE_32BPP_ENABLED, 1
.else
.set DEINTERLEAVE_32BPP_ENABLED, 0
@@ -1011,15 +1010,15 @@ local skip1
(src_basereg - pixblock_size * src_bpp / 64), SRC
.endm
- init
+ \init
mov DST_R, DST_W
cmp W, #pixblock_size
blt 8f
- ensure_destination_ptr_alignment process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
subs W, W, #pixblock_size
blt 7f
@@ -1030,26 +1029,26 @@ local skip1
fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- process_pixblock_head
+ \process_pixblock_head
subs W, W, #pixblock_size
blt 2f
1:
- process_pixblock_tail_head
+ \process_pixblock_tail_head
subs W, W, #pixblock_size
bge 1b
2:
- process_pixblock_tail
+ \process_pixblock_tail
pixst_a pixblock_size, dst_w_bpp, \
(dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
/* Process the remaining trailing pixels in the scanline (dst aligned) */
process_trailing_pixels 0, 1, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
- cleanup
-.if use_nearest_scaling != 0
+ \cleanup
+.if \use_nearest_scaling != 0
pop {r4-r8, pc} /* exit */
.else
bx lr /* exit */
@@ -1057,13 +1056,13 @@ local skip1
8:
/* Process the remaining trailing pixels in the scanline (dst unaligned) */
process_trailing_pixels 0, 0, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
- cleanup
+ \cleanup
-.if use_nearest_scaling != 0
+.if \use_nearest_scaling != 0
pop {r4-r8, pc} /* exit */
.unreq DST_R
@@ -1090,15 +1089,15 @@ local skip1
.purgem fetch_src_pixblock
.purgem pixld_src
- .endfunc
+ pixman_end_asm_function
.endm
.macro generate_composite_function_single_scanline x:vararg
- generate_composite_function_scanline 0, x
+ generate_composite_function_scanline 0, \x
.endm
.macro generate_composite_function_nearest_scanline x:vararg
- generate_composite_function_scanline 1, x
+ generate_composite_function_scanline 1, \x
.endm
/* Default prologue/epilogue, nothing special needs to be done */
@@ -1134,22 +1133,22 @@ local skip1
* value (in) is lost.
*/
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
- vshrn.u16 out_r, in, #8
- vshrn.u16 out_g, in, #3
- vsli.u16 in, in, #5
- vmov.u8 out_a, #255
- vsri.u8 out_r, out_r, #5
- vsri.u8 out_g, out_g, #6
- vshrn.u16 out_b, in, #2
+ vshrn.u16 \out_r, \in, #8
+ vshrn.u16 \out_g, \in, #3
+ vsli.u16 \in, \in, #5
+ vmov.u8 \out_a, #255
+ vsri.u8 \out_r, \out_r, #5
+ vsri.u8 \out_g, \out_g, #6
+ vshrn.u16 \out_b, \in, #2
.endm
.macro convert_0565_to_x888 in, out_r, out_g, out_b
- vshrn.u16 out_r, in, #8
- vshrn.u16 out_g, in, #3
- vsli.u16 in, in, #5
- vsri.u8 out_r, out_r, #5
- vsri.u8 out_g, out_g, #6
- vshrn.u16 out_b, in, #2
+ vshrn.u16 \out_r, \in, #8
+ vshrn.u16 \out_g, \in, #3
+ vsli.u16 \in, \in, #5
+ vsri.u8 \out_r, \out_r, #5
+ vsri.u8 \out_g, \out_g, #6
+ vshrn.u16 \out_b, \in, #2
.endm
/*
@@ -1159,11 +1158,11 @@ local skip1
* registers (tmp1, tmp2)
*/
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
- vshll.u8 tmp1, in_g, #8
- vshll.u8 out, in_r, #8
- vshll.u8 tmp2, in_b, #8
- vsri.u16 out, tmp1, #5
- vsri.u16 out, tmp2, #11
+ vshll.u8 \tmp1, \in_g, #8
+ vshll.u8 \out, \in_r, #8
+ vshll.u8 \tmp2, \in_b, #8
+ vsri.u16 \out, \tmp1, #5
+ vsri.u16 \out, \tmp2, #11
.endm
/*
@@ -1173,12 +1172,12 @@ local skip1
* value from 'in' is lost
*/
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
- vshl.u16 out0, in, #5 /* G top 6 bits */
- vshl.u16 tmp, in, #11 /* B top 5 bits */
- vsri.u16 in, in, #5 /* R is ready in top bits */
- vsri.u16 out0, out0, #6 /* G is ready in top bits */
- vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
- vshr.u16 out1, in, #8 /* R is in place */
- vsri.u16 out0, tmp, #8 /* G & B is in place */
- vzip.u16 out0, out1 /* everything is in place */
+ vshl.u16 \out0, \in, #5 /* G top 6 bits */
+ vshl.u16 \tmp, \in, #11 /* B top 5 bits */
+ vsri.u16 \in, \in, #5 /* R is ready in top bits */
+ vsri.u16 \out0, \out0, #6 /* G is ready in top bits */
+ vsri.u16 \tmp, \tmp, #5 /* B is ready in top bits */
+ vshr.u16 \out1, \in, #8 /* R is in place */
+ vsri.u16 \out0, \tmp, #8 /* G & B is in place */
+ vzip.u16 \out0, \out1 /* everything is in place */
.endm
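
The pixel-format helpers at the end of this header all change the same way: the instructions were already correct, they just referred to their parameters without the backslash clang requires. A standalone sketch showing one of them assembled in isolation; the _sketch suffix and the register choices in the invocation are illustrative only, not part of the patch:

    .syntax unified
    .arch   armv7-a
    .fpu    neon

    .macro convert_8888_to_0565_sketch in_r, in_g, in_b, out, tmp1, tmp2
        vshll.u8  \tmp1, \in_g, #8
        vshll.u8  \out,  \in_r, #8
        vshll.u8  \tmp2, \in_b, #8
        vsri.u16  \out,  \tmp1, #5
        vsri.u16  \out,  \tmp2, #11
    .endm

    convert_8888_to_0565_sketch d0, d1, d2, q2, q3, q8
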
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index e050292..cc62c81 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -39,6 +39,8 @@
#include "pixman-arm-asm.h"
+ pixman_syntax_unified
+
/*
* Note: This code is only using armv5te instructions (not even armv6),
* but is scheduled for ARM Cortex-A8 pipeline. So it might need to
@@ -62,7 +64,7 @@
prefetch_distance, \
prefetch_braking_distance
-pixman_asm_function fname
+pixman_asm_function \fname
W .req r0
DST .req r1
SRC .req r2
@@ -76,39 +78,39 @@ pixman_asm_function fname
ldr UNIT_X, [sp]
push {r4, r5, r6, r7, r8, r10}
- mvn VXMASK, #((1 << bpp_shift) - 1)
+ mvn VXMASK, #((1 << \bpp_shift) - 1)
ldr SRC_WIDTH_FIXED, [sp, #28]
/* define helper macro */
.macro scale_2_pixels
- ldr&t TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+ ldr\()\t TMP1, [SRC, TMP1]
+ and TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
- str&t TMP1, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
+ str\()\t TMP1, [DST], #(1 << \bpp_shift)
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
- ldr&t TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ ldr\()\t TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
- str&t TMP2, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
+ str\()\t TMP2, [DST], #(1 << \bpp_shift)
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
.endm
/* now do the scaling */
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
-9: subpls VX, VX, SRC_WIDTH_FIXED
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
- subs W, W, #(8 + prefetch_braking_distance)
+ subs W, W, #(8 + \prefetch_braking_distance)
blt 2f
/* calculate prefetch offset */
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
mla PF_OFFS, UNIT_X, PF_OFFS, VX
1: /* main loop, process 8 pixels per iteration with prefetch */
- pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
- add PF_OFFS, UNIT_X, lsl #3
+ pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)]
+ add PF_OFFS, PF_OFFS, UNIT_X, lsl #3
scale_2_pixels
scale_2_pixels
scale_2_pixels
@@ -116,7 +118,7 @@ pixman_asm_function fname
subs W, W, #8
bge 1b
2:
- subs W, W, #(4 - 8 - prefetch_braking_distance)
+ subs W, W, #(4 - 8 - \prefetch_braking_distance)
blt 2f
1: /* process the remaining pixels */
scale_2_pixels
@@ -129,8 +131,8 @@ pixman_asm_function fname
scale_2_pixels
2:
tst W, #1
- ldrne&t TMP1, [SRC, TMP1]
- strne&t TMP1, [DST]
+ ldr\()\t\()ne TMP1, [SRC, TMP1]
+ str\()\t\()ne TMP1, [DST]
/* cleanup helper macro */
.purgem scale_2_pixels
.unreq DST
@@ -146,7 +148,7 @@ pixman_asm_function fname
/* return */
pop {r4, r5, r6, r7, r8, r10}
bx lr
-.endfunc
+ pixman_end_asm_function
.endm
generate_nearest_scanline_func \
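
In the scaled fetcher the load/store width ("", h or b) is itself a macro argument, so the old spelling fused it with the condition in divided-syntax order (ldrne&t); unified syntax wants the width first and the condition last, glued with "\()". A small sketch, not part of the patch, using a made-up helper name:

    .syntax unified

    .macro cond_store t, cond
        str\()\t\()\cond r0, [r1]   @ t=h, cond=ne expands to strhne
    .endm

    cmp        r2, #0
    cond_store h, ne                @ emits: strhne r0, [r1] (divided syntax: strneh)
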
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a74a0a8..34d38f1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -40,6 +40,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"
+ pixman_syntax_unified
+
/* A head macro should do all processing which results in an output of up to
* 16 bytes, as far as the final load instruction. The corresponding tail macro
* should complete the processing of the up-to-16 bytes. The calling macro will
@@ -57,7 +59,7 @@
.endm
.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld cond, numbytes, firstreg, SRC, unaligned_src
+ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm
.macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
@@ -65,8 +67,8 @@
WK5 .req STRIDE_S
WK6 .req MASK
WK7 .req STRIDE_M
-110: pixld , 16, 0, SRC, unaligned_src
- pixld , 16, 4, SRC, unaligned_src
+110: pixld , 16, 0, SRC, \unaligned_src
+ pixld , 16, 4, SRC, \unaligned_src
pld [SRC, SCRATCH]
pixst , 16, 0, DST
pixst , 16, 4, DST
@@ -122,7 +124,7 @@ generate_composite_function \
.macro src_n_0565_init
ldrh SRC, [sp, #ARGS_STACK_OFFSET]
- orr SRC, SRC, lsl #16
+ orr SRC, SRC, SRC, lsl #16
mov STRIDE_S, SRC
mov MASK, SRC
mov STRIDE_M, SRC
@@ -130,8 +132,8 @@ generate_composite_function \
.macro src_n_8_init
ldrb SRC, [sp, #ARGS_STACK_OFFSET]
- orr SRC, SRC, lsl #8
- orr SRC, SRC, lsl #16
+ orr SRC, SRC, SRC, lsl #8
+ orr SRC, SRC, SRC, lsl #16
mov STRIDE_S, SRC
mov MASK, SRC
mov STRIDE_M, SRC
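
The two solid-fill init hunks above are genuine operand fixes rather than pure syntax escapes: gas quietly accepted the two-operand shorthand "orr SRC, SRC, lsl #16", but unified syntax (and clang) require the shifted register to be spelled out. A sketch of the replicate-a-byte pattern, not part of the patch, using r0 in place of the SRC alias:

    .syntax unified

    ldrb    r0, [sp]                @ 8-bit source value from the stack
    orr     r0, r0, r0, lsl #8      @ shifted register written explicitly
    orr     r0, r0, r0, lsl #16     @ r0 now holds that byte in all four lanes
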
@@ -142,7 +144,7 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req MASK
WK7 .req STRIDE_M
- pixst cond, numbytes, 4, DST
+ pixst \cond, \numbytes, 4, DST
.unreq WK4
.unreq WK5
.unreq WK6
@@ -182,20 +184,20 @@ generate_composite_function \
/******************************************************************************/
.macro src_x888_8888_pixel, cond, reg
- orr&cond WK&reg, WK&reg, #0xFF000000
+ orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
.endm
.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld cond, numbytes, firstreg, SRC, unaligned_src
+ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm
.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
- src_x888_8888_pixel cond, %(firstreg+0)
- .if numbytes >= 8
- src_x888_8888_pixel cond, %(firstreg+1)
- .if numbytes == 16
- src_x888_8888_pixel cond, %(firstreg+2)
- src_x888_8888_pixel cond, %(firstreg+3)
+ src_x888_8888_pixel \cond, %(\firstreg+0)
+ .if \numbytes >= 8
+ src_x888_8888_pixel \cond, %(\firstreg+1)
+ .if \numbytes == 16
+ src_x888_8888_pixel \cond, %(\firstreg+2)
+ src_x888_8888_pixel \cond, %(\firstreg+3)
.endif
.endif
.endm
@@ -222,73 +224,73 @@ generate_composite_function \
.endm
.macro src_0565_8888_2pixels, reg1, reg2
- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
- bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
- mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
- mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
- bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
- pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
- sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
- mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
- pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
- sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
- orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
- orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
+ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
+ bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+ pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
+ sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
+ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
+ pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
+ sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+ orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm
/* This version doesn't need STRIDE_M, but is one instruction longer.
It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
- bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
- mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
- mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
- bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
- mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
- mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
- pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
- pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
- sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
- sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
- orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
- orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
+ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
+ bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+ mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
+ mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
+ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
+ pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+ sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
*/
.macro src_0565_8888_1pixel, reg
- bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
- and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
- mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
- mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
- orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
- orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
- pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
- sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
- orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb
+ and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000
+ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
+ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK\()\reg, WK\()\reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm
.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 16
- pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
- .elseif numbytes == 8
- pixld , 4, firstreg, SRC, unaligned_src
- .elseif numbytes == 4
- pixld , 2, firstreg, SRC, unaligned_src
+ .if \numbytes == 16
+ pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
+ .elseif \numbytes == 8
+ pixld , 4, \firstreg, SRC, \unaligned_src
+ .elseif \numbytes == 4
+ pixld , 2, \firstreg, SRC, \unaligned_src
.endif
.endm
.macro src_0565_8888_process_tail cond, numbytes, firstreg
- .if numbytes == 16
- src_0565_8888_2pixels firstreg, %(firstreg+1)
- src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
- src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .if \numbytes == 16
+ src_0565_8888_2pixels \firstreg, %(\firstreg+1)
+ src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+ src_0565_8888_2pixels \firstreg, %(\firstreg+1)
.else
- src_0565_8888_1pixel firstreg
+ src_0565_8888_1pixel \firstreg
.endif
.endm
@@ -311,23 +313,23 @@ generate_composite_function \
.endm
.macro src_x888_0565_1pixel s, d
- and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
- and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
- orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
- orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+ and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+ and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000
+ orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+ orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
/* Top 16 bits are discarded during the following STRH */
.endm
.macro src_x888_0565_2pixels slo, shi, d, tmp
- and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
- and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
- and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
- orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
- orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
- and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
- orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
- orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
- pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+ and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
+ and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
+ and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+ orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
+ orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
+ and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000
+ orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+ orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+ pkhbt WK\()\d, WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm
.macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
@@ -335,33 +337,33 @@ generate_composite_function \
WK5 .req STRIDE_M
WK6 .req WK3
WK7 .req ORIG_W
- .if numbytes == 16
+ .if \numbytes == 16
pixld , 16, 4, SRC, 0
src_x888_0565_2pixels 4, 5, 0, 0
pixld , 8, 4, SRC, 0
src_x888_0565_2pixels 6, 7, 1, 1
pixld , 8, 6, SRC, 0
.else
- pixld , numbytes*2, 4, SRC, 0
+ pixld , \numbytes*2, 4, SRC, 0
.endif
.endm
.macro src_x888_0565_process_tail cond, numbytes, firstreg
- .if numbytes == 16
+ .if \numbytes == 16
src_x888_0565_2pixels 4, 5, 2, 2
src_x888_0565_2pixels 6, 7, 3, 4
- .elseif numbytes == 8
+ .elseif \numbytes == 8
src_x888_0565_2pixels 4, 5, 1, 1
src_x888_0565_2pixels 6, 7, 2, 2
- .elseif numbytes == 4
+ .elseif \numbytes == 4
src_x888_0565_2pixels 4, 5, 1, 1
.else
src_x888_0565_1pixel 4, 1
.endif
- .if numbytes == 16
- pixst , numbytes, 0, DST
+ .if \numbytes == 16
+ pixst , \numbytes, 0, DST
.else
- pixst , numbytes, 1, DST
+ pixst , \numbytes, 1, DST
.endif
.unreq WK4
.unreq WK5
@@ -382,37 +384,37 @@ generate_composite_function \
/******************************************************************************/
.macro add_8_8_8pixels cond, dst1, dst2
- uqadd8&cond WK&dst1, WK&dst1, MASK
- uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
+ uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
+ uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
.endm
.macro add_8_8_4pixels cond, dst
- uqadd8&cond WK&dst, WK&dst, MASK
+ uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
.endm
.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
WK4 .req MASK
WK5 .req STRIDE_M
- .if numbytes == 16
- pixld cond, 8, 4, SRC, unaligned_src
- pixld cond, 16, firstreg, DST, 0
- add_8_8_8pixels cond, firstreg, %(firstreg+1)
- pixld cond, 8, 4, SRC, unaligned_src
+ .if \numbytes == 16
+ pixld \cond, 8, 4, SRC, \unaligned_src
+ pixld \cond, 16, \firstreg, DST, 0
+ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
+ pixld \cond, 8, 4, SRC, \unaligned_src
.else
- pixld cond, numbytes, 4, SRC, unaligned_src
- pixld cond, numbytes, firstreg, DST, 0
+ pixld \cond, \numbytes, 4, SRC, \unaligned_src
+ pixld \cond, \numbytes, \firstreg, DST, 0
.endif
.unreq WK4
.unreq WK5
.endm
.macro add_8_8_process_tail cond, numbytes, firstreg
- .if numbytes == 16
- add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
- add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .if \numbytes == 16
+ add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
.else
- add_8_8_4pixels cond, firstreg
+ add_8_8_4pixels \cond, \firstreg
.endif
.endm
@@ -441,8 +443,8 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req STRIDE_M
WK7 .req ORIG_W
- pixld , numbytes, %(4+firstreg), SRC, unaligned_src
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src
+ pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.unreq WK5
.unreq WK6
@@ -451,44 +453,44 @@ generate_composite_function \
.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
/* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
- teq WK&reg0, #0
- .if numbytes > 4
- teqeq WK&reg1, #0
- .if numbytes > 8
- teqeq WK&reg2, #0
- teqeq WK&reg3, #0
+ teq WK\()\reg0, #0
+ .if \numbytes > 4
+ teqeq WK\()\reg1, #0
+ .if \numbytes > 8
+ teqeq WK\()\reg2, #0
+ teqeq WK\()\reg3, #0
.endif
.endif
.endm
.macro over_8888_8888_prepare next
- mov WK&next, WK&next, lsr #24
+ mov WK\()\next, WK\()\next, lsr #24
.endm
.macro over_8888_8888_1pixel src, dst, offset, next
/* src = destination component multiplier */
- rsb WK&src, WK&src, #255
+ rsb WK\()\src, WK\()\src, #255
/* Split even/odd bytes of dst into SCRATCH/dst */
- uxtb16 SCRATCH, WK&dst
- uxtb16 WK&dst, WK&dst, ror #8
+ uxtb16 SCRATCH, WK\()\dst
+ uxtb16 WK\()\dst, WK\()\dst, ror #8
/* Multiply through, adding 0.5 to the upper byte of result for rounding */
- mla SCRATCH, SCRATCH, WK&src, MASK
- mla WK&dst, WK&dst, WK&src, MASK
+ mla SCRATCH, SCRATCH, WK\()\src, MASK
+ mla WK\()\dst, WK\()\dst, WK\()\src, MASK
/* Where we would have had a stall between the result of the first MLA and the shifter input,
* reload the complete source pixel */
- ldr WK&src, [SRC, #offset]
+ ldr WK\()\src, [SRC, #\offset]
/* Multiply by 257/256 to approximate 256/255 */
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
/* In this stall, start processing the next pixel */
- .if offset < -4
- mov WK&next, WK&next, lsr #24
+ .if \offset < -4
+ mov WK\()\next, WK\()\next, lsr #24
.endif
- uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+ uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
/* Recombine even/odd bytes of multiplied destination */
mov SCRATCH, SCRATCH, ror #8
- sel WK&dst, SCRATCH, WK&dst
+ sel WK\()\dst, SCRATCH, WK\()\dst
/* Saturated add of source to multiplied destination */
- uqadd8 WK&dst, WK&dst, WK&src
+ uqadd8 WK\()\dst, WK\()\dst, WK\()\src
.endm
.macro over_8888_8888_process_tail cond, numbytes, firstreg
@@ -496,17 +498,17 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req STRIDE_M
WK7 .req ORIG_W
- over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+ over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
beq 10f
- over_8888_8888_prepare %(4+firstreg)
- .set PROCESS_REG, firstreg
- .set PROCESS_OFF, -numbytes
- .rept numbytes / 4
+ over_8888_8888_prepare %(4+\firstreg)
+ .set PROCESS_REG, \firstreg
+ .set PROCESS_OFF, -\numbytes
+ .rept \numbytes / 4
over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.set PROCESS_OFF, PROCESS_OFF+4
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.unreq WK5
@@ -536,16 +538,16 @@ generate_composite_function \
*/
.macro mul_8888_8 word, byte, tmp, half
/* Split even/odd bytes of word apart */
- uxtb16 tmp, word
- uxtb16 word, word, ror #8
+ uxtb16 \tmp, \word
+ uxtb16 \word, \word, ror #8
/* Multiply bytes together with rounding, then by 257/256 */
- mla tmp, tmp, byte, half
- mla word, word, byte, half /* 1 stall follows */
- uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
- uxtab16 word, word, word, ror #8
+ mla \tmp, \tmp, \byte, \half
+ mla \word, \word, \byte, \half /* 1 stall follows */
+ uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */
+ uxtab16 \word, \word, \word, ror #8
/* Recombine bytes */
- mov tmp, tmp, ror #8
- sel word, tmp, word
+ mov \tmp, \tmp, ror #8
+ sel \word, \tmp, \word
.endm
/******************************************************************************/
@@ -567,8 +569,8 @@ generate_composite_function \
WK5 .req STRIDE_D
WK6 .req STRIDE_S
WK7 .req ORIG_W
- pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
+ pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.unreq WK5
.unreq WK6
@@ -576,10 +578,10 @@ generate_composite_function \
.endm
.macro over_8888_n_8888_1pixel src, dst
- mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
- sub WK7, WK6, WK&src, lsr #24
- mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
- uqadd8 WK&dst, WK&dst, WK&src
+ mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M
+ sub WK7, WK6, WK\()\src, lsr #24
+ mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M
+ uqadd8 WK\()\dst, WK\()\dst, WK\()\src
.endm
.macro over_8888_n_8888_process_tail cond, numbytes, firstreg
@@ -587,12 +589,12 @@ generate_composite_function \
WK5 .req STRIDE_D
WK6 .req STRIDE_S
WK7 .req ORIG_W
- over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+ over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
beq 10f
mov WK6, #255
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
- .if numbytes == 16 && PROCESS_REG == 2
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+ .if \numbytes == 16 && PROCESS_REG == 2
/* We're using WK6 and WK7 as temporaries, so half way through
* 4 pixels, reload the second two source pixels but this time
* into WK4 and WK5 */
@@ -601,7 +603,7 @@ generate_composite_function \
over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.unreq WK5
@@ -642,13 +644,13 @@ generate_composite_function \
.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
WK4 .req STRIDE_M
- pixld , numbytes/4, 4, MASK, unaligned_mask
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes/4, 4, MASK, \unaligned_mask
+ pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.endm
.macro over_n_8_8888_1pixel src, dst
- uxtb Y, WK4, ror #src*8
+ uxtb Y, WK4, ror #\src*8
/* Trailing part of multiplication of source */
mla SCRATCH, STRIDE_S, Y, STRIDE_D
mla Y, SRC, Y, STRIDE_D
@@ -659,20 +661,20 @@ generate_composite_function \
sub ORIG_W, ORIG_W, Y, lsr #24
sel Y, SCRATCH, Y
/* Then multiply the destination */
- mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
- uqadd8 WK&dst, WK&dst, Y
+ mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
+ uqadd8 WK\()\dst, WK\()\dst, Y
.endm
.macro over_n_8_8888_process_tail cond, numbytes, firstreg
WK4 .req STRIDE_M
teq WK4, #0
beq 10f
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
- over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+ over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.endm
@@ -705,14 +707,14 @@ generate_composite_function \
.endm
.macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, \firstreg, DST, 0
.endm
.macro over_reverse_n_8888_1pixel d, is_only
- teq WK&d, #0
+ teq WK\()\d, #0
beq 8f /* replace with source */
- bics ORIG_W, STRIDE_D, WK&d, lsr #24
- .if is_only == 1
+ bics ORIG_W, STRIDE_D, WK\()\d, lsr #24
+ .if \is_only == 1
beq 49f /* skip store */
.else
beq 9f /* write same value back */
@@ -723,36 +725,36 @@ generate_composite_function \
uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
mov SCRATCH, SCRATCH, ror #8
sel ORIG_W, SCRATCH, ORIG_W
- uqadd8 WK&d, WK&d, ORIG_W
+ uqadd8 WK\()\d, WK\()\d, ORIG_W
b 9f
-8: mov WK&d, SRC
+8: mov WK\()\d, SRC
9:
.endm
.macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
- over_reverse_n_8888_1pixel reg1, 1
+ .if \numbytes == 4
+ over_reverse_n_8888_1pixel \reg1, 1
.else
- and SCRATCH, WK&reg1, WK&reg2
- .if numbytes == 16
- and SCRATCH, SCRATCH, WK&reg3
- and SCRATCH, SCRATCH, WK&reg4
+ and SCRATCH, WK\()\reg1, WK\()\reg2
+ .if \numbytes == 16
+ and SCRATCH, SCRATCH, WK\()\reg3
+ and SCRATCH, SCRATCH, WK\()\reg4
.endif
mvns SCRATCH, SCRATCH, asr #24
beq 49f /* skip store if all opaque */
- over_reverse_n_8888_1pixel reg1, 0
- over_reverse_n_8888_1pixel reg2, 0
- .if numbytes == 16
- over_reverse_n_8888_1pixel reg3, 0
- over_reverse_n_8888_1pixel reg4, 0
+ over_reverse_n_8888_1pixel \reg1, 0
+ over_reverse_n_8888_1pixel \reg2, 0
+ .if \numbytes == 16
+ over_reverse_n_8888_1pixel \reg3, 0
+ over_reverse_n_8888_1pixel \reg4, 0
.endif
.endif
- pixst , numbytes, reg1, DST
+ pixst , \numbytes, \reg1, DST
49:
.endm
.macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
- over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+ over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm
generate_composite_function \
@@ -794,20 +796,20 @@ generate_composite_function \
.macro over_white_8888_8888_ca_combine m, d
uxtb16 TMP1, TMP0 /* rb_notmask */
- uxtb16 TMP2, d /* rb_dest; 1 stall follows */
+ uxtb16 TMP2, \d /* rb_dest; 1 stall follows */
smlatt TMP3, TMP2, TMP1, HALF /* red */
smlabb TMP2, TMP2, TMP1, HALF /* blue */
uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
- uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
- smlatt d, TMP1, TMP0, HALF /* alpha */
+ uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */
+ smlatt \d, TMP1, TMP0, HALF /* alpha */
smlabb TMP1, TMP1, TMP0, HALF /* green */
pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
- pkhbt TMP1, TMP1, d, lsl #16 /* ag */
+ pkhbt TMP1, TMP1, \d, lsl #16 /* ag */
uxtab16 TMP0, TMP0, TMP0, ror #8
uxtab16 TMP1, TMP1, TMP1, ror #8
mov TMP0, TMP0, ror #8
- sel d, TMP0, TMP1
- uqadd8 d, d, m /* d is a late result */
+ sel \d, TMP0, TMP1
+ uqadd8 \d, \d, \m /* d is a late result */
.endm
.macro over_white_8888_8888_ca_1pixel_head
@@ -853,10 +855,10 @@ generate_composite_function \
.endm
.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 4
+ .if \numbytes == 4
over_white_8888_8888_ca_1pixel_head
.else
- .if numbytes == 16
+ .if \numbytes == 16
over_white_8888_8888_ca_2pixels_head
over_white_8888_8888_ca_2pixels_tail
.endif
@@ -865,7 +867,7 @@ generate_composite_function \
.endm
.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
- .if numbytes == 4
+ .if \numbytes == 4
over_white_8888_8888_ca_1pixel_tail
.else
over_white_8888_8888_ca_2pixels_tail
@@ -1004,7 +1006,7 @@ generate_composite_function \
.endm
.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .rept (numbytes / 4) - 1
+ .rept (\numbytes / 4) - 1
over_n_8888_8888_ca_1pixel_head
over_n_8888_8888_ca_1pixel_tail
.endr
@@ -1020,7 +1022,7 @@ pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
cmp ip, #-1
beq pixman_composite_over_white_8888_8888_ca_asm_armv6
/* else drop through... */
- .endfunc
+pixman_end_asm_function
generate_composite_function \
pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
@@ -1045,84 +1047,84 @@ generate_composite_function \
.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
ldrb ORIG_W, [SRC], #4
- .if numbytes >= 8
- ldrb WK&reg1, [SRC], #4
- .if numbytes == 16
- ldrb WK&reg2, [SRC], #4
- ldrb WK&reg3, [SRC], #4
+ .if \numbytes >= 8
+ ldrb WK\()\reg1, [SRC], #4
+ .if \numbytes == 16
+ ldrb WK\()\reg2, [SRC], #4
+ ldrb WK\()\reg3, [SRC], #4
.endif
.endif
- add DST, DST, #numbytes
+ add DST, DST, #\numbytes
.endm
.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+ in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
.endm
.macro in_reverse_8888_8888_1pixel s, d, offset, is_only
- .if is_only != 1
- movs s, ORIG_W
- .if offset != 0
- ldrb ORIG_W, [SRC, #offset]
+ .if \is_only != 1
+ movs \s, ORIG_W
+ .if \offset != 0
+ ldrb ORIG_W, [SRC, #\offset]
.endif
beq 01f
teq STRIDE_M, #0xFF
beq 02f
.endif
- uxtb16 SCRATCH, d /* rb_dest */
- uxtb16 d, d, ror #8 /* ag_dest */
- mla SCRATCH, SCRATCH, s, MASK
- mla d, d, s, MASK
+ uxtb16 SCRATCH, \d /* rb_dest */
+ uxtb16 \d, \d, ror #8 /* ag_dest */
+ mla SCRATCH, SCRATCH, \s, MASK
+ mla \d, \d, \s, MASK
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
- uxtab16 d, d, d, ror #8
+ uxtab16 \d, \d, \d, ror #8
mov SCRATCH, SCRATCH, ror #8
- sel d, SCRATCH, d
+ sel \d, SCRATCH, \d
b 02f
- .if offset == 0
+ .if \offset == 0
48: /* Last mov d,#0 of the set - used as part of shortcut for
* source values all 0 */
.endif
-01: mov d, #0
+01: mov \d, #0
02:
.endm
.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
+ .if \numbytes == 4
teq ORIG_W, ORIG_W, asr #32
- ldrne WK&reg1, [DST, #-4]
- .elseif numbytes == 8
- teq ORIG_W, WK&reg1
+ ldrne WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+ teq ORIG_W, WK\()\reg1
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
- ldmnedb DST, {WK&reg1-WK&reg2}
+ ldmdbne DST, {WK\()\reg1-WK\()\reg2}
.else
- teq ORIG_W, WK&reg1
- teqeq ORIG_W, WK&reg2
- teqeq ORIG_W, WK&reg3
+ teq ORIG_W, WK\()\reg1
+ teqeq ORIG_W, WK\()\reg2
+ teqeq ORIG_W, WK\()\reg3
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
- ldmnedb DST, {WK&reg1-WK&reg4}
+ ldmdbne DST, {WK\()\reg1-WK\()\reg4}
.endif
cmnne DST, #0 /* clear C if NE */
bcs 49f /* no writes to dest if source all -1 */
beq 48f /* set dest to all 0 if source all 0 */
- .if numbytes == 4
- in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
- str WK&reg1, [DST, #-4]
- .elseif numbytes == 8
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
- stmdb DST, {WK&reg1-WK&reg2}
+ .if \numbytes == 4
+ in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
+ str WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
+ stmdb DST, {WK\()\reg1-WK\()\reg2}
.else
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
- stmdb DST, {WK&reg1-WK&reg4}
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
+ stmdb DST, {WK\()\reg1-WK\()\reg4}
.endif
49:
.endm
.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
- in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+ in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm
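The ldmnedb to ldmdbne change in the tail macro above is not cosmetic: with these files switched to ".syntax unified", UAL requires the condition code after the addressing-mode suffix, while the old divided syntax placed it before. A short sketch of the unified spelling:

        .syntax unified
        cmp     r1, #0
        ldmdbne r0, {r4, r5}       @ UAL: addressing mode "db" first, condition "ne" last
        @ the pre-UAL spelling "ldmnedb r0, {r4, r5}" is rejected by clang's assembler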
generate_composite_function \
@@ -1149,21 +1151,21 @@ generate_composite_function \
.endm
.macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, \firstreg, DST, 0
.endm
.macro over_n_8888_1pixel dst
- mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK
- uqadd8 WK&dst, WK&dst, SRC
+ mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK
+ uqadd8 WK\()\dst, WK\()\dst, SRC
.endm
.macro over_n_8888_process_tail cond, numbytes, firstreg
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
over_n_8888_1pixel %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
.endm
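over_n_8888_process_tail unrolls one pixel per 4 bytes by stepping an assembly-time counter with .set inside .rept, passing the evaluated value on with the %() form used throughout these files. A standalone sketch of the same pattern, with a hypothetical do_pixel macro and count:

        .macro do_pixel n
        @ ... operate on WK\n ...
        .endm

        .set COUNT, 4              @ e.g. numbytes / 4
        .set IDX, 0
        .rept COUNT
        do_pixel %(IDX)            @ %(...) passes the evaluated number, not the symbol name
        .set IDX, IDX + 1
        .endr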
generate_composite_function \
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index da153c3..5ec19e0 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -112,64 +112,64 @@
*/
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
- .if numbytes == 16
- .if unaligned == 1
- op&r&cond WK&reg0, [base], #4
- op&r&cond WK&reg1, [base], #4
- op&r&cond WK&reg2, [base], #4
- op&r&cond WK&reg3, [base], #4
+ .if \numbytes == 16
+ .if \unaligned == 1
+ \op\()r\()\cond WK\()\reg0, [\base], #4
+ \op\()r\()\cond WK\()\reg1, [\base], #4
+ \op\()r\()\cond WK\()\reg2, [\base], #4
+ \op\()r\()\cond WK\()\reg3, [\base], #4
.else
- op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
.endif
- .elseif numbytes == 8
- .if unaligned == 1
- op&r&cond WK&reg0, [base], #4
- op&r&cond WK&reg1, [base], #4
+ .elseif \numbytes == 8
+ .if \unaligned == 1
+ \op\()r\()\cond WK\()\reg0, [\base], #4
+ \op\()r\()\cond WK\()\reg1, [\base], #4
.else
- op&m&cond&ia base!, {WK&reg0,WK&reg1}
- .endif
- .elseif numbytes == 4
- op&r&cond WK&reg0, [base], #4
- .elseif numbytes == 2
- op&r&cond&h WK&reg0, [base], #2
- .elseif numbytes == 1
- op&r&cond&b WK&reg0, [base], #1
+ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
+ .endif
+ .elseif \numbytes == 4
+ \op\()r\()\cond WK\()\reg0, [\base], #4
+ .elseif \numbytes == 2
+ \op\()rh\()\cond WK\()\reg0, [\base], #2
+ .elseif \numbytes == 1
+ \op\()rb\()\cond WK\()\reg0, [\base], #1
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
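The heart of this hunk is replacing the GNU-as-only "&" token pasting (op&m&cond&ia, WK&reg0) with the portable escaped form: macro parameters are referenced as \name, and \() terminates a parameter name so that literal text can follow it. A small self-contained sketch of how the pieces compose (pixldst_demo is hypothetical; r6 is chosen arbitrarily for the WK alias):

WK2     .req    r6                         @ the real code defines WK0..WK3 as .req aliases
        .macro pixldst_demo op, cond, reg0, base
        \op\()mia\()\cond \base!, {WK\()\reg0}
        .endm
        pixldst_demo ld, ne, 2, r0         @ expands to: ldmiane r0!, {WK2}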
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
- .if numbytes == 16
- stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
- .elseif numbytes == 8
- stm&cond&db base, {WK&reg0,WK&reg1}
- .elseif numbytes == 4
- str&cond WK&reg0, [base, #-4]
- .elseif numbytes == 2
- str&cond&h WK&reg0, [base, #-2]
- .elseif numbytes == 1
- str&cond&b WK&reg0, [base, #-1]
+ .if \numbytes == 16
+ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
+ .elseif \numbytes == 8
+ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
+ .elseif \numbytes == 4
+ str\()\cond WK\()\reg0, [\base, #-4]
+ .elseif \numbytes == 2
+ strh\()\cond WK\()\reg0, [\base, #-2]
+ .elseif \numbytes == 1
+ strb\()\cond WK\()\reg0, [\base, #-1]
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
.macro pixld cond, numbytes, firstreg, base, unaligned
- pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+ pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
.endm
.macro pixst cond, numbytes, firstreg, base
.if (flags) & FLAG_DST_READWRITE
- pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
.else
- pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
.endif
.endm
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
- a x
+ \a \x
.endif
.endm
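PF wraps every prefetch-related instruction so the whole prefetch strategy can be compiled out: the first argument is the mnemonic and x:vararg swallows the rest of the line, commas included, which is what lets operands such as [\base, #OFFSET] travel through a single macro parameter. Roughly:

        @ with PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD:
        PF pld, [SRC, #64]         @ expands to:  pld [SRC, #64]
        @ with any other prefetch type the same line expands to nothing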
@@ -179,11 +179,11 @@
* between 0 and prefetch_distance (inclusive) cache lines ahead so there
* are no gaps when the inner loop starts.
*/
- .if bpp > 0
- PF bic, ptr, base, #31
+ .if \bpp > 0
+ PF bic, \ptr, \base, #31
.set OFFSET, 0
.rept prefetch_distance+1
- PF pld, [ptr, #OFFSET]
+ PF pld, [\ptr, #OFFSET]
.set OFFSET, OFFSET+32
.endr
.endif
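preload_leading_step1 rounds the channel pointer down to a 32-byte cache line with BIC and then issues one PLD per line up to prefetch_distance lines ahead. A standalone sketch of the addressing pattern, written without the PF wrapper:

        bic     r3, r1, #31        @ r1 = channel base; r3 = start of its cache line
        pld     [r3]
        pld     [r3, #32]
        pld     [r3, #64]          @ ...one per line, (prefetch_distance + 1) in total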
@@ -201,42 +201,42 @@
* and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
* possible when there are 4 src bytes for every 1 dst byte).
*/
- .if bpp > 0
- .ifc base,DST
+ .if \bpp > 0
+ .ifc \base,DST
/* The test can be simplified further when preloading the destination */
- PF tst, base, #16
+ PF tst, \base, #16
PF beq, 61f
.else
- .if bpp/dst_w_bpp == 4
- PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
+ .if \bpp/dst_w_bpp == 4
+ PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
PF and, SCRATCH, SCRATCH, #31
- PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
+ PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
PF bcs, 61f
PF bpl, 60f
PF pld, [ptr, #32*(prefetch_distance+2)]
.else
- PF mov, SCRATCH, base, lsl #32-5
- PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
- PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+ PF mov, SCRATCH, \base, lsl #32-5
+ PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
+ PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
PF bls, 61f
.endif
.endif
-60: PF pld, [ptr, #32*(prefetch_distance+1)]
+60: PF pld, [\ptr, #32*(prefetch_distance+1)]
61:
.endif
.endm
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
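For a power-of-two SIZE, IS_END_OF_GROUP reduces to a trailing-ones test: INDEX & ~(INDEX+1) is a mask of the trailing 1 bits of INDEX, and AND-ing it with SIZE/2 is non-zero exactly when INDEX has at least log2(SIZE) trailing ones, i.e. when INDEX is the last index within each aligned group of SIZE. For example, with SIZE = 4 it holds for INDEX = 3, 7, 11, ... (3 & ~4 = 3, and 3 & 2 != 0) and fails for INDEX = 5 (5 & ~6 = 1, and 1 & 2 == 0). This is how preload_middle below fires only on the last STM of each prefetch group.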
.macro preload_middle bpp, base, scratch_holds_offset
- .if bpp > 0
+ .if \bpp > 0
/* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
- .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
- .if scratch_holds_offset
- PF pld, [base, SCRATCH]
+ .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
+ .if \scratch_holds_offset
+ PF pld, [\base, SCRATCH]
.else
- PF bic, SCRATCH, base, #31
+ PF bic, SCRATCH, \base, #31
PF pld, [SCRATCH, #32*prefetch_distance]
.endif
.endif
@@ -244,28 +244,28 @@
.endm
.macro preload_trailing bpp, bpp_shift, base
- .if bpp > 0
- .if bpp*pix_per_block > 256
+ .if \bpp > 0
+ .if \bpp*pix_per_block > 256
/* Calculations are more complex if more than one fetch per block */
- PF and, WK1, base, #31
- PF add, WK1, WK1, WK0, lsl #bpp_shift
- PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
- PF bic, SCRATCH, base, #31
+ PF and, WK1, \base, #31
+ PF add, WK1, WK1, WK0, lsl #\bpp_shift
+ PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
+ PF bic, SCRATCH, \base, #31
80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
PF add, SCRATCH, SCRATCH, #32
PF subs, WK1, WK1, #32
PF bhi, 80b
.else
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
- PF mov, SCRATCH, base, lsl #32-5
- PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
- PF adceqs, SCRATCH, SCRATCH, #0
+ PF mov, SCRATCH, \base, lsl #32-5
+ PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
+ PF adcseq, SCRATCH, SCRATCH, #0
/* The instruction above has two effects: ensures Z is only
* set if C was clear (so Z indicates that both shifted quantities
* were 0), and clears C if Z was set (so C indicates that the sum
 * of the shifted quantities was greater than, and not equal to, 32) */
PF beq, 82f
- PF bic, SCRATCH, base, #31
+ PF bic, SCRATCH, \base, #31
PF bcc, 81f
PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
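The adceqs to adcseq rename above is another unified-syntax requirement: in UAL the S (set-flags) suffix attaches directly to the base mnemonic and the condition comes last, whereas divided syntax interleaved them the other way round. Sketch:

        .syntax unified
        adds    r2, r2, r3
        adcseq  r2, r2, #0         @ UAL: ADCS + EQ; the old "adceqs" spelling is rejected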
@@ -288,12 +288,12 @@
* "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
* "base" - base address register of channel to preload (SRC, MASK or DST)
*/
- .if bpp > 0
- .if narrow_case && (bpp <= dst_w_bpp)
+ .if \bpp > 0
+ .if \narrow_case && (\bpp <= dst_w_bpp)
/* In these cases, each line for each channel is in either 1 or 2 cache lines */
- PF bic, WK0, base, #31
+ PF bic, WK0, \base, #31
PF pld, [WK0]
- PF add, WK1, base, X, LSL #bpp_shift
+ PF add, WK1, \base, X, LSL #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
@@ -301,9 +301,9 @@
PF pld, [WK1]
90:
.else
- PF bic, WK0, base, #31
+ PF bic, WK0, \base, #31
PF pld, [WK0]
- PF add, WK1, base, X, lsl #bpp_shift
+ PF add, WK1, \base, X, lsl #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
@@ -319,56 +319,56 @@
.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
- process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
- .if decrementx
- sub&cond X, X, #8*numbytes/dst_w_bpp
+ \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
+ .if \decrementx
+ sub\()\cond X, X, #8*\numbytes/dst_w_bpp
.endif
- process_tail cond, numbytes, firstreg
+ \process_tail \cond, \numbytes, \firstreg
.if !((flags) & FLAG_PROCESS_DOES_STORE)
- pixst cond, numbytes, firstreg, DST
+ pixst \cond, \numbytes, \firstreg, DST
.endif
.endm
.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_BRANCH_OVER
- .ifc cond,mi
+ .ifc \cond,mi
bpl 100f
.endif
- .ifc cond,cs
+ .ifc \cond,cs
bcc 100f
.endif
- .ifc cond,ne
+ .ifc \cond,ne
beq 100f
.endif
- conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
100:
.else
- conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
.endif
.endm
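conditional_process1 uses .ifc (assembly-time string comparison) on the substituted \cond to emit the opposite branch when FLAG_BRANCH_OVER forbids conditionally executing the body itself. A minimal sketch of the dispatch, with a hypothetical branch_over macro and the same 100 label number as the source:

        .macro branch_over cond
        .ifc \cond,mi
        bpl     100f               @ skip the block when the condition is false
        .endif
        .ifc \cond,cs
        bcc     100f
        .endif
        .endm

        cmp     r0, #0
        branch_over mi             @ expands to just "bpl 100f"
        @ ... block executed only when r0 is negative ...
100: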
.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
.if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
/* Can't interleave reads and writes */
- test
- conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
+ \test
+ conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
.if (flags) & FLAG_PROCESS_CORRUPTS_PSR
- test
+ \test
.endif
- conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
+ conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
.else
/* Can interleave reads and writes for better scheduling */
- test
- process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
- process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
- .if decrementx
- sub&cond1 X, X, #8*numbytes1/dst_w_bpp
- sub&cond2 X, X, #8*numbytes2/dst_w_bpp
- .endif
- process_tail cond1, numbytes1, firstreg1
- process_tail cond2, numbytes2, firstreg2
- pixst cond1, numbytes1, firstreg1, DST
- pixst cond2, numbytes2, firstreg2, DST
+ \test
+ \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
+ \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
+ .if \decrementx
+ sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
+ sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
+ .endif
+ \process_tail \cond1, \numbytes1, \firstreg1
+ \process_tail \cond2, \numbytes2, \firstreg2
+ pixst \cond1, \numbytes1, \firstreg1, DST
+ pixst \cond2, \numbytes2, \firstreg2, DST
.endif
.endm
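Throughout this header, macros such as process_head and process_tail are passed around by name and invoked through the parameter; under clang's integrated assembler that call must also carry the backslash prefix (\process_head \cond, ...), since substituting bare parameter names is a GNU as leniency. A small sketch of the pattern with hypothetical head_cb and run_stage macros:

        .macro head_cb cond, n
        @ ... load \n bytes ...
        .endm

        .macro run_stage process_head, cond, n
        \process_head \cond, \n    @ invoke the macro whose name was passed in
        .endm

        run_stage head_cb, ne, 16  @ expands to: head_cb ne, 16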
@@ -400,12 +400,12 @@
.endif
/* Use unaligned loads in all cases for simplicity */
.if dst_w_bpp == 8
- conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
+ conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
.elseif dst_w_bpp == 16
test_bits_1_0_ptr
- conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+ conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
.endif
- conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
.endif
@@ -424,12 +424,12 @@
.endm
.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
- conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
+ conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
.if dst_w_bpp == 16
test_bits_1_0_pix
- conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
+ conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
.elseif dst_w_bpp == 8
- conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
+ conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
.endif
.endm
@@ -438,7 +438,7 @@
110:
.set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
.rept pix_per_block*dst_w_bpp/128
- process_head , 16, 0, unaligned_src, unaligned_mask, 1
+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle src_bpp, SRC, 1
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -453,9 +453,9 @@
* preloads for, to achieve staggered prefetches for multiple channels, because there are
* always two STMs per prefetch, so there is always an opposite STM on which to put the
* preload. Note, no need to BIC the base register here */
- PF pld, [DST, #32*prefetch_distance - dst_alignment]
+ PF pld, [DST, #32*prefetch_distance - \dst_alignment]
.endif
- process_tail , 16, 0
+ \process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
@@ -470,11 +470,11 @@
.if dst_r_bpp > 0
tst DST, #16
bne 111f
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
@@ -487,13 +487,13 @@
.endif
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
- medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+ medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
.endm
.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
- process_head , 16, 0, unaligned_src, unaligned_mask, 0
- process_tail , 16, 0
+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0
+ \process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
@@ -501,16 +501,16 @@
bhs 120b
/* Trailing pixels */
tst X, #128/dst_w_bpp - 1
- beq exit_label
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ beq \exit_label
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm
.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
tst X, #16*8/dst_w_bpp
- conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+ conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
/* Trailing pixels */
/* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm
.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
@@ -523,37 +523,37 @@
tst SRC, #3
bne 140f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
140:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
.endif
.if mask_bpp == 8 || mask_bpp == 16
- b exit_label
+ b \exit_label
141:
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 142f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
142:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
.endif
.endif
.endm
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
- .if vars_spilled
+ .if \vars_spilled
 /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
/* This is ldmia sp,{} */
.word 0xE89D0000 | LINE_SAVED_REGS
.endif
subs Y, Y, #1
- .if vars_spilled
+ .if \vars_spilled
.if (LINE_SAVED_REGS) & (1<<1)
str Y, [sp]
.endif
@@ -565,18 +565,18 @@
.if mask_bpp > 0
add MASK, MASK, STRIDE_M
.endif
- .if restore_x
+ .if \restore_x
mov X, ORIG_W
.endif
- bhs loop_label
- .ifc "last_one",""
- .if vars_spilled
+ bhs \loop_label
+ .ifc "\last_one",""
+ .if \vars_spilled
b 197f
.else
b 198f
.endif
.else
- .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+ .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
b 198f
.endif
.endif
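The ".word 0xE89D0000 | LINE_SAVED_REGS" above (and the matching ".word 0xE92D0000 | LINE_SAVED_REGS" store later, a stmdb sp!, {}) hand-assembles the instruction because the register list is an assembly-time bitmask that GAS cannot splice into an ldm/stm mnemonic: 0xE89D0000 is the always-conditioned A32 encoding of "ldm sp, {}", and the low 16 bits are the register list. For example, assuming LINE_SAVED_REGS were (1 << 1) | (1 << 11):

        .word   0xE89D0000 | (1 << 1) | (1 << 11)   @ encodes: ldm sp, {r1, r11}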
@@ -596,17 +596,17 @@
process_tail, \
process_inner_loop
- pixman_asm_function fname
+ pixman_asm_function \fname
/*
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set flags, flags_
- .set prefetch_distance, prefetch_distance_
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set flags, \flags_
+ .set prefetch_distance, \prefetch_distance_
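Copying the macro arguments into .set symbols here is what makes them visible to the nested helper macros above, which read src_bpp, dst_w_bpp, flags and prefetch_distance directly rather than taking them as parameters; macro arguments themselves are only in scope inside their own body. A minimal sketch with hypothetical helper and generate macros:

        .macro helper
        .if flags & 1              @ reads the global assembly-time symbol
        @ ...
        .endif
        .endm

        .macro generate flags_
        .set flags, \flags_        @ promote the argument to a global symbol
        helper
        .endm

        generate 3                 @ hypothetical flags value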
/*
* Select prefetch type for this function.
@@ -732,7 +732,7 @@
sub Y, Y, #1
#endif
- init
+ \init
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
/* Reserve a word in which to store X during leading pixels */
@@ -773,7 +773,7 @@
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
151: /* New line */
- newline
+ \newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -790,7 +790,7 @@
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
.endif
- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail
154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -800,10 +800,10 @@
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.endif
- .ifc "process_inner_loop",""
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+ .ifc "\process_inner_loop",""
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
.else
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
.endif
157: /* Check for another line */
@@ -825,7 +825,7 @@
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
- newline
+ \newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -837,10 +837,10 @@
beq 164f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail
164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
- switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f
167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
@@ -856,7 +856,7 @@
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
171: /* New line */
- newline
+ \newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -868,8 +868,8 @@
beq 174f
172: subs X, X, #1
blo 177f
- process_head , 1, 0, 1, 1, 0
- process_tail , 1, 0
+ \process_head , 1, 0, 1, 1, 0
+ \process_tail , 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 1, 0, DST
.endif
@@ -880,15 +880,15 @@
beq 174f
subs X, X, #1
blo 177f
- process_head , 2, 0, 1, 1, 0
- process_tail , 2, 0
+ \process_head , 2, 0, 1, 1, 0
+ \process_tail , 2, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 2, 0, DST
.endif
.endif
174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
- switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
@@ -908,7 +908,7 @@
add sp, sp, #4
.endif
- cleanup
+ \cleanup
#ifdef DEBUG_PARAMS
add sp, sp, #9*4 /* junk the debug copy of arguments */
@@ -932,13 +932,13 @@
.unreq WK3
.unreq SCRATCH
.unreq ORIG_W
- .endfunc
+ pixman_end_asm_function
.endm
.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
- .irp SAVED_REG,x
+ .irp SAVED_REG,\x
.ifc "SAVED_REG","Y"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1