author     Matt Turner <mattst88@gmail.com>    2023-09-09 11:36:30 -0400
committer  Matt Turner <mattst88@gmail.com>    2024-02-29 10:47:07 -0500
commit     add7c8db45f7a253a3a20b0be3a2bfb304fea894 (patch)
tree       8c6070092fb52225d9de52b9d3f2eabc06333b28
parent     63ae6af9a6f1d4c62239b9883ada72f32e6b4fff (diff)
pixman-arm: Use unified syntax
Allows us to use the same assembly without a bunch of #ifdef __clang__.
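Background for the change (a sketch for illustration, not taken from the patch): GNU as's traditional "divided" ARM syntax places the condition code before the size/S suffix (ldrgeb, subges), while clang's integrated assembler only accepts the unified (UAL) ordering (ldrbge, subsge). Selecting ".syntax unified" lets a single spelling assemble under both toolchains, which is what removes the need for the __clang__ #defines. A minimal hypothetical fragment showing the two spellings of the same instructions, assuming an assembler that accepts both directives:

    @ hypothetical fragment, for illustration only
    .syntax divided
    ldrgeb  r0, [r1]          @ conditional byte load: condition (ge) before the B suffix
    subges  r2, r2, #0x10     @ flag-setting subtract if GE: condition before the S suffix

    .syntax unified
    ldrbge  r0, [r1]          @ same load in UAL: B suffix before the condition
    subsge  r2, r2, #0x10     @ same subtract in UAL: S suffix before the condition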
-rw-r--r--  meson.build                            7
-rw-r--r--  pixman/pixman-arm-asm.h                6
-rw-r--r--  pixman/pixman-arm-neon-asm-bilinear.S  2
-rw-r--r--  pixman/pixman-arm-neon-asm.S          70
-rw-r--r--  pixman/pixman-arm-neon-asm.h          24
-rw-r--r--  pixman/pixman-arm-simd-asm-scaled.S   19
-rw-r--r--  pixman/pixman-arm-simd-asm.S          17
-rw-r--r--  pixman/pixman-arm-simd-asm.h          34
8 files changed, 74 insertions, 105 deletions
diff --git a/meson.build b/meson.build
index 4337f93..438e6cf 100644
--- a/meson.build
+++ b/meson.build
@@ -252,6 +252,13 @@ if cc.compiles('''
config.set('ASM_HAVE_FUNC_DIRECTIVE', 1)
endif
+if cc.compiles('''
+ __asm__ (
+ ".syntax unified\n"
+ );''',
+ name : 'test for ASM .syntax unified directive')
+ config.set('ASM_HAVE_SYNTAX_UNIFIED', 1)
+endif
if cc.links('''
#include <stdint.h>
diff --git a/pixman/pixman-arm-asm.h b/pixman/pixman-arm-asm.h
index 8253906..edf8e82 100644
--- a/pixman/pixman-arm-asm.h
+++ b/pixman/pixman-arm-asm.h
@@ -50,6 +50,12 @@
#endif
.endm
+.macro pixman_syntax_unified
+#ifdef ASM_HAVE_SYNTAX_UNIFIED
+ .syntax unified
+#endif
+.endm
+
.macro pixman_end_asm_function
#ifdef ASM_HAVE_FUNC_DIRECTIVE
.endfunc
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index ce4d5f8..6bd2736 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -68,6 +68,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"
+pixman_syntax_unified
+
/*
* Bilinear macros from pixman-arm-neon-asm.S
*/
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 7025eba..0e09257 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -34,12 +34,6 @@
* - pixman_composite_over_n_8_0565_asm_neon
*/
-#ifdef __clang__
-#define ldrgeb ldrbge
-#define subges subsge
-#define subpls subspl
-#endif
-
/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
@@ -59,6 +53,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"
+ pixman_syntax_unified
+
/* Global configuration options and preferences */
/*
@@ -287,12 +283,12 @@
PF subge, PF_X, PF_X, ORIG_W
vrshr.u16 q3, q11, #8
vrshr.u16 q15, q12, #8
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vsri.u16 q14, q9, #11
- PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vraddhn.u16 d20, q10, q13
vraddhn.u16 d23, q11, q3
- PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vraddhn.u16 d22, q12, q15
vst1.16 {d28, d29}, [DST_W, :128]!
.endm
@@ -451,9 +447,9 @@ generate_composite_function \
vshll.u8 q8, d1, #8
vst1.16 {d28, d29}, [DST_W, :128]!
PF subge, PF_X, PF_X, ORIG_W
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vshll.u8 q14, d2, #8
- PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vshll.u8 q9, d0, #8
.endm
@@ -525,10 +521,10 @@ generate_composite_function \
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
PF subge, PF_X, PF_X, ORIG_W
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -557,10 +553,10 @@ generate_composite_function \
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
PF subge, PF_X, PF_X, ORIG_W
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -631,9 +627,9 @@ generate_composite_function_single_scanline \
vmull.u8 q8, d22, d4
PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -683,11 +679,11 @@ generate_composite_function_single_scanline \
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -759,9 +755,9 @@ generate_composite_function_single_scanline \
vmull.u8 q9, d24, d5
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d6
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d7
- PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
.endm
@@ -810,10 +806,10 @@ generate_composite_function \
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
vmull.u8 q10, d22, d6
- PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -1265,9 +1261,9 @@ generate_composite_function \
vmull.u8 q9, d24, d1
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d2
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d3
- PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q8, q8, #8
vrsra.u16 q9, q9, #8
@@ -1334,9 +1330,9 @@ generate_composite_function \
vmull.u8 q1, d25, d16
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q2, d26, d16
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q3, d27, d16
- PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q0, q0, #8
vrsra.u16 q1, q1, #8
@@ -1430,11 +1426,11 @@ generate_composite_function \
vmull.u8 q7, d24, d9
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d24, d10
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d24, d11
- PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q14, q0, q14
- PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vqadd.u8 q15, q1, q15
vrshr.u16 q10, q6, #8
vrshr.u16 q11, q7, #8
@@ -2444,8 +2440,8 @@ generate_composite_function \
PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF subge, PF_X, PF_X, ORIG_W
- PF subges, PF_CTL, PF_CTL, #0x10
- PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF subsge, PF_CTL, PF_CTL, #0x10
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
@@ -2501,8 +2497,8 @@ generate_composite_function \
PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF subge, PF_X, PF_X, ORIG_W
- PF subges, PF_CTL, PF_CTL, #0x10
- PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF subsge, PF_CTL, PF_CTL, #0x10
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index e85526d..06318d9 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -213,24 +213,24 @@
.if \elem_size == 16
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
add TMP1, \mem_operand, TMP1, asl #1
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
add TMP2, \mem_operand, TMP2, asl #1
vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
add TMP1, \mem_operand, TMP1, asl #1
vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
add TMP2, \mem_operand, TMP2, asl #1
vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
@@ -238,12 +238,12 @@
.elseif \elem_size == 32
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
add TMP1, \mem_operand, TMP1, asl #2
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
add TMP2, \mem_operand, TMP2, asl #2
vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
@@ -281,14 +281,14 @@
.if \elem_size == 16
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
add TMP1, \mem_operand, TMP1, asl #1
vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
.elseif \elem_size == 32
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
add TMP1, \mem_operand, TMP1, asl #2
vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
@@ -420,15 +420,15 @@
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
PF subge, PF_X, PF_X, ORIG_W
- PF subges, PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
- PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
- PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
- PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index 25445b8..cc62c81 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -25,10 +25,6 @@
*
*/
-#ifdef __clang__
-#define subpls subspl
-#endif
-
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
@@ -43,6 +39,8 @@
#include "pixman-arm-asm.h"
+ pixman_syntax_unified
+
/*
* Note: This code is only using armv5te instructions (not even armv6),
* but is scheduled for ARM Cortex-A8 pipeline. So it might need to
@@ -89,21 +87,21 @@ pixman_asm_function \fname
and TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
str\()\t TMP1, [DST], #(1 << \bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
ldr\()\t TMP2, [SRC, TMP2]
and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
str\()\t TMP2, [DST], #(1 << \bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
.endm
/* now do the scaling */
and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
-9: subpls VX, VX, SRC_WIDTH_FIXED
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
subs W, W, #(8 + \prefetch_braking_distance)
blt 2f
@@ -112,7 +110,7 @@ pixman_asm_function \fname
mla PF_OFFS, UNIT_X, PF_OFFS, VX
1: /* main loop, process 8 pixels per iteration with prefetch */
pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)]
- add PF_OFFS, UNIT_X, lsl #3
+ add PF_OFFS, PF_OFFS, UNIT_X, lsl #3
scale_2_pixels
scale_2_pixels
scale_2_pixels
@@ -133,13 +131,8 @@ pixman_asm_function \fname
scale_2_pixels
2:
tst W, #1
-#ifdef __clang__
ldr\()\t\()ne TMP1, [SRC, TMP1]
str\()\t\()ne TMP1, [DST]
-#else
- ldrne\()\t TMP1, [SRC, TMP1]
- strne\()\t TMP1, [DST]
-#endif
/* cleanup helper macro */
.purgem scale_2_pixels
.unreq DST
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 0c93ef4..34d38f1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -25,11 +25,6 @@
*
*/
-#ifdef __clang__
-#define adceqs adcseq
-#define ldmnedb ldmdbne
-#endif
-
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
@@ -45,6 +40,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"
+ pixman_syntax_unified
+
/* A head macro should do all processing which results in an output of up to
* 16 bytes, as far as the final load instruction. The corresponding tail macro
* should complete the processing of the up-to-16 bytes. The calling macro will
@@ -127,7 +124,7 @@ generate_composite_function \
.macro src_n_0565_init
ldrh SRC, [sp, #ARGS_STACK_OFFSET]
- orr SRC, SRC, lsl #16
+ orr SRC, SRC, SRC, lsl #16
mov STRIDE_S, SRC
mov MASK, SRC
mov STRIDE_M, SRC
@@ -135,8 +132,8 @@ generate_composite_function \
.macro src_n_8_init
ldrb SRC, [sp, #ARGS_STACK_OFFSET]
- orr SRC, SRC, lsl #8
- orr SRC, SRC, lsl #16
+ orr SRC, SRC, SRC, lsl #8
+ orr SRC, SRC, SRC, lsl #16
mov STRIDE_S, SRC
mov MASK, SRC
mov STRIDE_M, SRC
@@ -1098,13 +1095,13 @@ generate_composite_function \
.elseif \numbytes == 8
teq ORIG_W, WK\()\reg1
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
- ldmnedb DST, {WK\()\reg1-WK\()\reg2}
+ ldmdbne DST, {WK\()\reg1-WK\()\reg2}
.else
teq ORIG_W, WK\()\reg1
teqeq ORIG_W, WK\()\reg2
teqeq ORIG_W, WK\()\reg3
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
- ldmnedb DST, {WK\()\reg1-WK\()\reg4}
+ ldmdbne DST, {WK\()\reg1-WK\()\reg4}
.endif
cmnne DST, #0 /* clear C if NE */
bcs 49f /* no writes to dest if source all -1 */
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 3e78e8a..5ec19e0 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -119,37 +119,21 @@
\op\()r\()\cond WK\()\reg2, [\base], #4
\op\()r\()\cond WK\()\reg3, [\base], #4
.else
-#ifdef __clang__
\op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
-#else
- \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
-#endif
.endif
.elseif \numbytes == 8
.if \unaligned == 1
\op\()r\()\cond WK\()\reg0, [\base], #4
\op\()r\()\cond WK\()\reg1, [\base], #4
.else
-#ifdef __clang__
\op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
-#else
- \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1}
-#endif
.endif
.elseif \numbytes == 4
\op\()r\()\cond WK\()\reg0, [\base], #4
.elseif \numbytes == 2
-#ifdef __clang__
\op\()rh\()\cond WK\()\reg0, [\base], #2
-#else
- \op\()r\()\cond\()h WK\()\reg0, [\base], #2
-#endif
.elseif \numbytes == 1
-#ifdef __clang__
\op\()rb\()\cond WK\()\reg0, [\base], #1
-#else
- \op\()r\()\cond\()b WK\()\reg0, [\base], #1
-#endif
.else
.error "unsupported size: \numbytes"
.endif
@@ -157,31 +141,15 @@
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
.if \numbytes == 16
-#ifdef __clang__
stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
-#else
- stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
-#endif
.elseif \numbytes == 8
-#ifdef __clang__
stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
-#else
- stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1}
-#endif
.elseif \numbytes == 4
str\()\cond WK\()\reg0, [\base, #-4]
.elseif \numbytes == 2
-#ifdef __clang__
strh\()\cond WK\()\reg0, [\base, #-2]
-#else
- str\()\cond\()h WK\()\reg0, [\base, #-2]
-#endif
.elseif \numbytes == 1
-#ifdef __clang__
strb\()\cond WK\()\reg0, [\base, #-1]
-#else
- str\()\cond\()b WK\()\reg0, [\base, #-1]
-#endif
.else
.error "unsupported size: \numbytes"
.endif
@@ -291,7 +259,7 @@
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
PF mov, SCRATCH, \base, lsl #32-5
PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
- PF adceqs, SCRATCH, SCRATCH, #0
+ PF adcseq, SCRATCH, SCRATCH, #0
/* The instruction above has two effects: ensures Z is only
* set if C was clear (so Z indicates that both shifted quantities
* were 0), and clears C if Z was set (so C indicates that the sum