author    | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2016-04-25 13:30:55 +0300
committer | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2016-04-25 13:31:53 +0300
commit    | a10d35f2c5e2514890bbe54699e31f2a182cec94 (patch)
tree      | fba7fc5db94c562e74d7319f2e771909fc282342
parent    | e134f2f002cffd0e819450a1677b9029c74f4043 (diff)
Import the v4 of 64-bit NEON patch from Mizuki Asakura [20160425-arm64-review]
https://lists.freedesktop.org/archives/pixman/2016-April/004569.html
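Besides the new pixman-arma64-neon-asm* sources, the patch wires up a --disable-arm-a64-neon configure switch, a USE_ARM_A64_NEON define and a matching automake conditional (see the configure.ac and Makefile.am hunks below). A minimal sketch of how the new fast paths would be exercised at build time, assuming the usual autotools flow for a pixman checkout rather than anything added by this commit:

    # Sketch only, not part of this commit.
    ./autogen.sh && ./configure            # the A64 NEON fast paths are autodetected by the new assembler check
    ./configure --disable-arm-a64-neon     # or rerun configure with the fast paths explicitly disabled
    make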
-rw-r--r-- | configure.ac                             |   34
-rw-r--r-- | pixman/Makefile.am                       |   15
-rw-r--r-- | pixman/pixman-arm-neon.c                 |    2
-rw-r--r-- | pixman/pixman-arm.c                      |    6
-rw-r--r-- | pixman/pixman-arma64-neon-asm-bilinear.S | 1075
-rw-r--r-- | pixman/pixman-arma64-neon-asm.S          | 3315
-rw-r--r-- | pixman/pixman-arma64-neon-asm.h          |  664
-rw-r--r-- | pixman/pixman-private.h                  |    5
8 files changed, 2637 insertions, 2479 deletions
diff --git a/configure.ac b/configure.ac index 6b2134e..bb0192a 100644 --- a/configure.ac +++ b/configure.ac @@ -667,6 +667,40 @@ if test $enable_arm_neon = yes && test $have_arm_neon = no ; then AC_MSG_ERROR([ARM NEON intrinsics not detected]) fi +dnl ========================================================================== +dnl Check if assembler is gas compatible and supports ARM-a64 NEON instructions +have_arm_a64_neon=no +AC_MSG_CHECKING(whether to use ARM A64 NEON assembler) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="-x assembler-with-cpp $CFLAGS" +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +.text +.arch armv8-a +.altmacro +prfm pldl2strm, [x0] +xtn v0.8b, v0.8h]])], have_arm_a64_neon=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(arm-a64-neon, + [AC_HELP_STRING([--disable-arm-a64-neon], + [disable ARM A64 NEON fast paths])], + [enable_arm_a64_neon=$enableval], [enable_arm_a64_neon=auto]) + +if test $enable_arm_a64_neon = no ; then + have_arm_a64_neon=disabled +fi + +if test $have_arm_a64_neon = yes ; then + AC_DEFINE(USE_ARM_A64_NEON, 1, [use ARM A64_NEON assembly optimizations]) +fi + +AM_CONDITIONAL(USE_ARM_A64_NEON, test $have_arm_a64_neon = yes) + +AC_MSG_RESULT($have_arm_a64_neon) +if test $enable_arm_a64_neon = yes && test $have_arm_a64_neon4 = no ; then + AC_MSG_ERROR([ARM A64 NEON intrinsics not detected]) +fi + dnl =========================================================================== dnl Check for IWMMXT diff --git a/pixman/Makefile.am b/pixman/Makefile.am index 581b6f6..8de02d9 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -94,6 +94,21 @@ libpixman_1_la_LIBADD += libpixman-arm-neon.la ASM_CFLAGS_arm_neon= endif +# arm a64 neon code +if USE_ARM_A64_NEON +noinst_LTLIBRARIES += libpixman-arma64-neon.la +libpixman_arma64_neon_la_SOURCES = \ + pixman-arm-neon.c \ + pixman-arm-common.h \ + pixman-arma64-neon-asm.S \ + pixman-arma64-neon-asm-bilinear.S \ + pixman-arm-asm.h \ + pixman-arma64-neon-asm.h +libpixman_1_la_LIBADD += libpixman-arma64-neon.la + +ASM_CFLAGS_arm_neon= +endif + # iwmmxt code if USE_ARM_IWMMXT libpixman_iwmmxt_la_SOURCES = pixman-mmx.c diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index be761c9..fcb61bc 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -194,7 +194,7 @@ arm_neon_fill (pixman_implementation_t *imp, uint32_t _xor) { /* stride is always multiple of 32bit units in pixman */ - uint32_t byte_stride = stride * sizeof(uint32_t); + int32_t byte_stride = stride * sizeof(uint32_t); switch (bpp) { diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c index 23374e4..734cbea 100644 --- a/pixman/pixman-arm.c +++ b/pixman/pixman-arm.c @@ -221,5 +221,11 @@ _pixman_arm_get_implementations (pixman_implementation_t *imp) imp = _pixman_implementation_create_arm_neon (imp); #endif +#ifdef USE_ARM_A64_NEON + /* neon is a part of aarch64 */ + if (!_pixman_disabled ("arm-neon")) + imp = _pixman_implementation_create_arm_neon (imp); +#endif + return imp; } diff --git a/pixman/pixman-arma64-neon-asm-bilinear.S b/pixman/pixman-arma64-neon-asm-bilinear.S index a7d94c3..31d103d 100644 --- a/pixman/pixman-arma64-neon-asm-bilinear.S +++ b/pixman/pixman-arma64-neon-asm-bilinear.S @@ -55,18 +55,13 @@ #endif .text -.fpu neon -.arch armv7a -.object_arch armv4 -.eabi_attribute 10, 0 -.eabi_attribute 12, 0 -.arm +.arch armv8-a .altmacro .p2align 2 #include "pixman-private.h" #include "pixman-arm-asm.h" -#include "pixman-arm-neon-asm.h" +#include "pixman-arma64-neon-asm.h" /* * Bilinear macros from pixman-arm-neon-asm.S @@ 
-79,19 +74,19 @@ */ .macro bilinear_load_8888 reg1, reg2, tmp - asr TMP1, X, #16 + asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #2 - vld1.32 {reg1}, [TMP1], STRIDE - vld1.32 {reg2}, [TMP1] + ld1 {®1&.2s}, [TMP1], STRIDE + ld1 {®2&.2s}, [TMP1] .endm .macro bilinear_load_0565 reg1, reg2, tmp - asr TMP1, X, #16 + asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 - vld1.32 {reg2[0]}, [TMP1], STRIDE - vld1.32 {reg2[1]}, [TMP1] + ld1 {®2&.s}[0], [TMP1], STRIDE + ld1 {®2&.s}[1], [TMP1] convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp .endm @@ -99,11 +94,11 @@ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 bilinear_load_8888 reg1, reg2, tmp1 - vmull.u8 acc1, reg1, d28 - vmlal.u8 acc1, reg2, d29 + umull &acc1&.8h, ®1&.8b, v28.8b + umlal &acc1&.8h, ®2&.8b, v29.8b bilinear_load_8888 reg3, reg4, tmp2 - vmull.u8 acc2, reg3, d28 - vmlal.u8 acc2, reg4, d29 + umull &acc2&.8h, ®3&.8b, v28.8b + umlal &acc2&.8h, ®4&.8b, v29.8b .endm .macro bilinear_load_and_vertical_interpolate_four_8888 \ @@ -116,98 +111,109 @@ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi .endm +.macro vzip reg1, reg2 + zip1 v24.8b, reg1, reg2 + zip2 reg2, reg1, reg2 + mov reg1, v24.8b +.endm + +.macro vuzp reg1, reg2 + uzp1 v24.8b, reg1, reg2 + uzp2 reg2, reg1, reg2 + mov reg1, v24.8b +.endm + .macro bilinear_load_and_vertical_interpolate_two_0565 \ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi - - asr TMP1, X, #16 + asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 - asr TMP2, X, #16 + asr WTMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - vld1.32 {acc2lo[0]}, [TMP1], STRIDE - vld1.32 {acc2hi[0]}, [TMP2], STRIDE - vld1.32 {acc2lo[1]}, [TMP1] - vld1.32 {acc2hi[1]}, [TMP2] + ld1 {&acc2&.s}[0], [TMP1], STRIDE + ld1 {&acc2&.s}[2], [TMP2], STRIDE + ld1 {&acc2&.s}[1], [TMP1] + ld1 {&acc2&.s}[3], [TMP2] convert_0565_to_x888 acc2, reg3, reg2, reg1 - vzip.u8 reg1, reg3 - vzip.u8 reg2, reg4 - vzip.u8 reg3, reg4 - vzip.u8 reg1, reg2 - vmull.u8 acc1, reg1, d28 - vmlal.u8 acc1, reg2, d29 - vmull.u8 acc2, reg3, d28 - vmlal.u8 acc2, reg4, d29 + vzip ®1&.8b, ®3&.8b + vzip ®2&.8b, ®4&.8b + vzip ®3&.8b, ®4&.8b + vzip ®1&.8b, ®2&.8b + umull &acc1&.8h, ®1&.8b, v28.8b + umlal &acc1&.8h, ®2&.8b, v29.8b + umull &acc2&.8h, ®3&.8b, v28.8b + umlal &acc2&.8h, ®4&.8b, v29.8b .endm .macro bilinear_load_and_vertical_interpolate_four_0565 \ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi - asr TMP1, X, #16 + asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 - asr TMP2, X, #16 + asr WTMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - vld1.32 {xacc2lo[0]}, [TMP1], STRIDE - vld1.32 {xacc2hi[0]}, [TMP2], STRIDE - vld1.32 {xacc2lo[1]}, [TMP1] - vld1.32 {xacc2hi[1]}, [TMP2] + ld1 {&xacc2&.s}[0], [TMP1], STRIDE + ld1 {&xacc2&.s}[2], [TMP2], STRIDE + ld1 {&xacc2&.s}[1], [TMP1] + ld1 {&xacc2&.s}[3], [TMP2] convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 - asr TMP1, X, #16 + asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 - asr TMP2, X, #16 + asr WTMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - vld1.32 {yacc2lo[0]}, [TMP1], STRIDE - vzip.u8 xreg1, xreg3 - vld1.32 {yacc2hi[0]}, [TMP2], STRIDE - vzip.u8 xreg2, xreg4 - vld1.32 {yacc2lo[1]}, [TMP1] - vzip.u8 xreg3, xreg4 - vld1.32 {yacc2hi[1]}, [TMP2] - vzip.u8 xreg1, xreg2 + ld1 {&yacc2&.s}[0], [TMP1], STRIDE + vzip &xreg1&.8b, &xreg3&.8b + ld1 {&yacc2&.s}[2], [TMP2], STRIDE + vzip &xreg2&.8b, &xreg4&.8b + ld1 {&yacc2&.s}[1], [TMP1] + vzip &xreg3&.8b, &xreg4&.8b + ld1 
{&yacc2&.s}[3], [TMP2] + vzip &xreg1&.8b, &xreg2&.8b convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 - vmull.u8 xacc1, xreg1, d28 - vzip.u8 yreg1, yreg3 - vmlal.u8 xacc1, xreg2, d29 - vzip.u8 yreg2, yreg4 - vmull.u8 xacc2, xreg3, d28 - vzip.u8 yreg3, yreg4 - vmlal.u8 xacc2, xreg4, d29 - vzip.u8 yreg1, yreg2 - vmull.u8 yacc1, yreg1, d28 - vmlal.u8 yacc1, yreg2, d29 - vmull.u8 yacc2, yreg3, d28 - vmlal.u8 yacc2, yreg4, d29 + umull &xacc1&.8h, &xreg1&.8b, v28.8b + vzip &yreg1&.8b, &yreg3&.8b + umlal &xacc1&.8h, &xreg2&.8b, v29.8b + vzip &yreg2&.8b, &yreg4&.8b + umull &xacc2&.8h, &xreg3&.8b, v28.8b + vzip &yreg3&.8b, &yreg4&.8b + umlal &xacc2&.8h, &xreg4&.8b, v29.8b + vzip &yreg1&.8b, &yreg2&.8b + umull &yacc1&.8h, &yreg1&.8b, v28.8b + umlal &yacc1&.8h, &yreg2&.8b, v29.8b + umull &yacc2&.8h, &yreg3&.8b, v28.8b + umlal &yacc2&.8h, &yreg4&.8b, v29.8b .endm .macro bilinear_store_8888 numpix, tmp1, tmp2 .if numpix == 4 - vst1.32 {d0, d1}, [OUT]! + st1 {v0.2s, v1.2s}, [OUT], #16 .elseif numpix == 2 - vst1.32 {d0}, [OUT]! + st1 {v0.2s}, [OUT], #8 .elseif numpix == 1 - vst1.32 {d0[0]}, [OUT, :32]! + st1 {v0.s}[0], [OUT], #4 .else .error bilinear_store_8888 numpix is unsupported .endif .endm .macro bilinear_store_0565 numpix, tmp1, tmp2 - vuzp.u8 d0, d1 - vuzp.u8 d2, d3 - vuzp.u8 d1, d3 - vuzp.u8 d0, d2 - convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 + vuzp v0.8b, v1.8b + vuzp v2.8b, v3.8b + vuzp v1.8b, v3.8b + vuzp v0.8b, v2.8b + convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 .if numpix == 4 - vst1.16 {d2}, [OUT]! + st1 {v1.4h}, [OUT], #8 .elseif numpix == 2 - vst1.32 {d2[0]}, [OUT]! + st1 {v1.s}[0], [OUT], #4 .elseif numpix == 1 - vst1.16 {d2[0]}, [OUT]! + st1 {v1.h}[0], [OUT], #2 .else .error bilinear_store_0565 numpix is unsupported .endif @@ -216,22 +222,22 @@ /* * Macros for loading mask pixels into register 'mask'. - * vdup must be done in somewhere else. + * dup must be done in somewhere else. */ .macro bilinear_load_mask_x numpix, mask .endm .macro bilinear_load_mask_8 numpix, mask .if numpix == 4 - vld1.32 {mask[0]}, [MASK]! + ld1 {&mask&.s}[0], [MASK], #4 .elseif numpix == 2 - vld1.16 {mask[0]}, [MASK]! + ld1 {&mask&.h}[0], [MASK], #2 .elseif numpix == 1 - vld1.8 {mask[0]}, [MASK]! + ld1 {&mask&.b}[0], [MASK], #1 .else .error bilinear_load_mask_8 numpix is unsupported .endif - pld [MASK, #prefetch_offset] + prfm PREFETCH_MODE, [MASK, #prefetch_offset] .endm .macro bilinear_load_mask mask_fmt, numpix, mask @@ -251,15 +257,17 @@ .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 .if numpix == 4 - vld1.32 {dst0, dst1}, [OUT] + ld1 {&dst0&.2s, &dst1&.2s}, [OUT] .elseif numpix == 2 - vld1.32 {dst0}, [OUT] + ld1 {&dst0&.2s}, [OUT] .elseif numpix == 1 - vld1.32 {dst0[0]}, [OUT] + ld1 {&dst0&.s}[0], [OUT] .else .error bilinear_load_dst_8888 numpix is unsupported .endif - pld [OUT, #(prefetch_offset * 4)] + mov &dst01&.d[0], &dst0&.d[0] + mov &dst01&.d[1], &dst1&.d[0] + prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)] .endm .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 @@ -291,11 +299,11 @@ .macro bilinear_duplicate_mask_8 numpix, mask .if numpix == 4 - vdup.32 mask, mask[0] + dup &mask&.2s, &mask&.s[0] .elseif numpix == 2 - vdup.16 mask, mask[0] + dup &mask&.4h, &mask&.h[0] .elseif numpix == 1 - vdup.8 mask, mask[0] + dup &mask&.8b, &mask&.b[0] .else .error bilinear_duplicate_mask_8 is unsupported .endif @@ -309,11 +317,15 @@ * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. * Interleave should be done when maks is enabled or operator is 'over'. 
*/ -.macro bilinear_interleave src0, src1, dst0, dst1 - vuzp.8 src0, src1 - vuzp.8 dst0, dst1 - vuzp.8 src0, src1 - vuzp.8 dst0, dst1 +.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01 + vuzp &src0&.8b, &src1&.8b + vuzp &dst0&.8b, &dst1&.8b + vuzp &src0&.8b, &src1&.8b + vuzp &dst0&.8b, &dst1&.8b + mov &src01&.d[1], &src1&.d[0] + mov &src01&.d[0], &src0&.d[0] + mov &dst01&.d[1], &dst1&.d[0] + mov &dst01&.d[0], &dst0&.d[0] .endm .macro bilinear_interleave_src_dst_x_src \ @@ -323,29 +335,30 @@ .macro bilinear_interleave_src_dst_x_over \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, dst0, dst1 + bilinear_interleave src0, src1, src01, dst0, dst1, dst01 .endm .macro bilinear_interleave_src_dst_x_add \ numpix, src0, src1, src01, dst0, dst1, dst01 + bilinear_interleave src0, src1, src01, dst0, dst1, dst01 .endm .macro bilinear_interleave_src_dst_8_src \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, dst0, dst1 + bilinear_interleave src0, src1, src01, dst0, dst1, dst01 .endm .macro bilinear_interleave_src_dst_8_over \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, dst0, dst1 + bilinear_interleave src0, src1, src01, dst0, dst1, dst01 .endm .macro bilinear_interleave_src_dst_8_add \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, dst0, dst1 + bilinear_interleave src0, src1, src01, dst0, dst1, dst01 .endm .macro bilinear_interleave_src_dst \ @@ -370,14 +383,16 @@ numpix, src0, src1, src01, mask, \ tmp01, tmp23, tmp45, tmp67 - vmull.u8 tmp01, src0, mask - vmull.u8 tmp23, src1, mask + umull &tmp01&.8h, &src0&.8b, &mask&.8b + umull &tmp23&.8h, &src1&.8b, &mask&.8b /* bubbles */ - vrshr.u16 tmp45, tmp01, #8 - vrshr.u16 tmp67, tmp23, #8 + urshr &tmp45&.8h, &tmp01&.8h, #8 + urshr &tmp67&.8h, &tmp23&.8h, #8 /* bubbles */ - vraddhn.u16 src0, tmp45, tmp01 - vraddhn.u16 src1, tmp67, tmp23 + raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h + raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h + mov &src01&.d[0], &src0&.d[0] + mov &src01&.d[1], &src1&.d[0] .endm .macro bilinear_apply_mask_to_src \ @@ -403,28 +418,36 @@ numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, tmp67, tmp8 - vdup.32 tmp8, src1[1] + dup &tmp8&.2s, &src1&.s[1] /* bubbles */ - vmvn.8 tmp8, tmp8 + mvn &tmp8&.8b, &tmp8&.8b /* bubbles */ - vmull.u8 tmp01, dst0, tmp8 + umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b /* bubbles */ - vmull.u8 tmp23, dst1, tmp8 + umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b /* bubbles */ - vrshr.u16 tmp45, tmp01, #8 - vrshr.u16 tmp67, tmp23, #8 + urshr &tmp45&.8h, &tmp01&.8h, #8 + urshr &tmp67&.8h, &tmp23&.8h, #8 /* bubbles */ - vraddhn.u16 dst0, tmp45, tmp01 - vraddhn.u16 dst1, tmp67, tmp23 + raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h + raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h + mov &dst01&.d[0], &dst0&.d[0] + mov &dst01&.d[1], &dst1&.d[0] /* bubbles */ - vqadd.u8 src01, dst01, src01 + uqadd &src0&.8b, &dst0&.8b, &src0&.8b + uqadd &src1&.8b, &dst1&.8b, &src1&.8b + mov &src01&.d[0], &src0&.d[0] + mov &src01&.d[1], &src1&.d[0] .endm .macro bilinear_combine_add \ numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, tmp67, tmp8 - vqadd.u8 src01, dst01, src01 + uqadd &src0&.8b, &dst0&.8b, &src0&.8b + uqadd &src1&.8b, &dst1&.8b, &src1&.8b + mov &src01&.d[0], &src0&.d[0] + mov &src01&.d[1], &src1&.d[0] .endm .macro bilinear_combine \ @@ -440,9 +463,11 @@ * Macros for final deinterleaving of destination pixels if needed. 
*/ .macro bilinear_deinterleave numpix, dst0, dst1, dst01 - vuzp.8 dst0, dst1 + vuzp &dst0&.8b, &dst1&.8b /* bubbles */ - vuzp.8 dst0, dst1 + vuzp &dst0&.8b, &dst1&.8b + mov &dst01&.d[0], &dst0&.d[0] + mov &dst01&.d[1], &dst1&.d[0] .endm .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 @@ -453,6 +478,7 @@ .endm .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 + bilinear_deinterleave numpix, dst0, dst1, dst01 .endm .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 @@ -473,142 +499,142 @@ .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op - bilinear_load_&src_fmt d0, d1, d2 - bilinear_load_mask mask_fmt, 1, d4 - bilinear_load_dst dst_fmt, op, 1, d18, d19, q9 - vmull.u8 q1, d0, d28 - vmlal.u8 q1, d1, d29 + bilinear_load_&src_fmt v0, v1, v2 + bilinear_load_mask mask_fmt, 1, v4 + bilinear_load_dst dst_fmt, op, 1, v18, v19, v9 + umull v2.8h, v0.8b, v28.8b + umlal v2.8h, v1.8b, v29.8b /* 5 cycles bubble */ - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 + ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v2.4h, v15.h[0] + umlal2 v0.4s, v2.8h, v15.h[0] /* 5 cycles bubble */ - bilinear_duplicate_mask mask_fmt, 1, d4 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + bilinear_duplicate_mask mask_fmt, 1, v4 + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) /* 3 cycles bubble */ - vmovn.u16 d0, q0 + xtn v0.8b, v0.8h /* 1 cycle bubble */ bilinear_interleave_src_dst \ - mask_fmt, op, 1, d0, d1, q0, d18, d19, q9 + mask_fmt, op, 1, v0, v1, v0, v18, v19, v9 bilinear_apply_mask_to_src \ - mask_fmt, 1, d0, d1, q0, d4, \ - q3, q8, q10, q11 + mask_fmt, 1, v0, v1, v0, v4, \ + v3, v8, v10, v11 bilinear_combine \ - op, 1, d0, d1, q0, d18, d19, q9, \ - q3, q8, q10, q11, d5 - bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0 - bilinear_store_&dst_fmt 1, q2, q3 + op, 1, v0, v1, v0, v18, v19, v9, \ + v3, v8, v10, v11, v5 + bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0 + bilinear_store_&dst_fmt 1, v17, v18 .endm .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op bilinear_load_and_vertical_interpolate_two_&src_fmt \ - q1, q11, d0, d1, d20, d21, d22, d23 - bilinear_load_mask mask_fmt, 2, d4 - bilinear_load_dst dst_fmt, op, 2, d18, d19, q9 - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q10, d22, d31 - vmlal.u16 q10, d23, d31 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - bilinear_duplicate_mask mask_fmt, 2, d4 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vmovn.u16 d0, q0 + v1, v11, v18, v19, v20, v21, v22, v23 + bilinear_load_mask mask_fmt, 2, v4 + bilinear_load_dst dst_fmt, op, 2, v18, v19, v9 + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] + umlal2 v10.4s, v11.8h, v15.h[4] + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + bilinear_duplicate_mask mask_fmt, 2, v4 + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + xtn v0.8b, v0.8h bilinear_interleave_src_dst \ - mask_fmt, op, 2, d0, d1, q0, d18, d19, q9 + mask_fmt, op, 2, v0, v1, v0, v18, v19, v9 bilinear_apply_mask_to_src \ - mask_fmt, 
2, d0, d1, q0, d4, \ - q3, q8, q10, q11 + mask_fmt, 2, v0, v1, v0, v4, \ + v3, v8, v10, v11 bilinear_combine \ - op, 2, d0, d1, q0, d18, d19, q9, \ - q3, q8, q10, q11, d5 - bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0 - bilinear_store_&dst_fmt 2, q2, q3 + op, 2, v0, v1, v0, v18, v19, v9, \ + v3, v8, v10, v11, v5 + bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0 + bilinear_store_&dst_fmt 2, v16, v17 .endm .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op bilinear_load_and_vertical_interpolate_four_&src_fmt \ - q1, q11, d0, d1, d20, d21, d22, d23 \ - q3, q9, d4, d5, d16, d17, d18, d19 - pld [TMP1, PF_OFFS] + v1, v11, v4, v5, v6, v7, v22, v23 \ + v3, v9, v16, v17, v20, v21, v18, v19 + prfm PREFETCH_MODE, [TMP1, PF_OFFS] sub TMP1, TMP1, STRIDE - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q10, d22, d31 - vmlal.u16 q10, d23, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q2, d6, d30 - vmlal.u16 q2, d7, d30 - vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS - bilinear_load_mask mask_fmt, 4, d22 - bilinear_load_dst dst_fmt, op, 4, d2, d3, q1 - pld [TMP1, PF_OFFS] - vmlsl.u16 q8, d18, d31 - vmlal.u16 q8, d19, d31 - vadd.u16 q12, q12, q13 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) - bilinear_duplicate_mask mask_fmt, 4, d22 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vmovn.u16 d0, q0 - vmovn.u16 d1, q2 - vadd.u16 q12, q12, q13 + prfm PREFETCH_MODE, [TMP1, PF_OFFS] + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] + umlal2 v10.4s, v11.8h, v15.h[4] + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v2.4s, v3.4h, v15.h[0] + umlal2 v2.4s, v3.8h, v15.h[0] + ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v8.4s, v9.4h, v15.h[4] + umlal2 v8.4s, v9.8h, v15.h[4] + add v12.8h, v12.8h, v13.8h + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + bilinear_load_mask mask_fmt, 4, v4 + bilinear_duplicate_mask mask_fmt, 4, v4 + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + xtn v0.8b, v0.8h + xtn v1.8b, v2.8h + add v12.8h, v12.8h, v13.8h + bilinear_load_dst dst_fmt, op, 4, v2, v3, v21 bilinear_interleave_src_dst \ - mask_fmt, op, 4, d0, d1, q0, d2, d3, q1 + mask_fmt, op, 4, v0, v1, v0, v2, v3, v11 bilinear_apply_mask_to_src \ - mask_fmt, 4, d0, d1, q0, d22, \ - q3, q8, q9, q10 + mask_fmt, 4, v0, v1, v0, v4, \ + v6, v8, v9, v10 bilinear_combine \ - op, 4, d0, d1, q0, d2, d3, q1, \ - q3, q8, q9, q10, d23 - bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0 - bilinear_store_&dst_fmt 4, q2, q3 + op, 4, v0, v1, v0, v2, v3, v1, \ + v6, v8, v9, v10, v23 + bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0 + bilinear_store_&dst_fmt 4, v6, v7 .endm -.set BILINEAR_FLAG_USE_MASK, 1 -.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 +.set BILINEAR_FLAG_USE_MASK, 1 +.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 /* * Main 
template macro for generating NEON optimized bilinear scanline functions. * * Bilinear scanline generator macro take folling arguments: - * fname - name of the function to generate - * src_fmt - source color format (8888 or 0565) - * dst_fmt - destination color format (8888 or 0565) - * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes - * process_last_pixel - code block that interpolate one pixel and does not - * update horizontal weight - * process_two_pixels - code block that interpolate two pixels and update - * horizontal weight - * process_four_pixels - code block that interpolate four pixels and update - * horizontal weight - * process_pixblock_head - head part of middle loop - * process_pixblock_tail - tail part of middle loop - * process_pixblock_tail_head - tail_head of middle loop - * pixblock_size - number of pixels processed in a single middle loop - * prefetch_distance - prefetch in the source image by that many pixels ahead + * fname - name of the function to generate + * src_fmt - source color format (8888 or 0565) + * dst_fmt - destination color format (8888 or 0565) + * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes + * process_last_pixel - code block that interpolate one pixel and does not + * update horizontal weight + * process_two_pixels - code block that interpolate two pixels and update + * horizontal weight + * process_four_pixels - code block that interpolate four pixels and update + * horizontal weight + * process_pixblock_head - head part of middle loop + * process_pixblock_tail - tail part of middle loop + * process_pixblock_tail_head - tail_head of middle loop + * pixblock_size - number of pixels processed in a single middle loop + * prefetch_distance - prefetch in the source image by that many pixels ahead */ .macro generate_bilinear_scanline_func \ - fname, \ - src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \ - bilinear_process_last_pixel, \ - bilinear_process_two_pixels, \ - bilinear_process_four_pixels, \ - bilinear_process_pixblock_head, \ - bilinear_process_pixblock_tail, \ - bilinear_process_pixblock_tail_head, \ - pixblock_size, \ - prefetch_distance, \ - flags + fname, \ + src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \ + bilinear_process_last_pixel, \ + bilinear_process_two_pixels, \ + bilinear_process_four_pixels, \ + bilinear_process_pixblock_head, \ + bilinear_process_pixblock_tail, \ + bilinear_process_pixblock_tail_head, \ + pixblock_size, \ + prefetch_distance, \ + flags pixman_asm_function fname .if pixblock_size == 8 @@ -618,145 +644,186 @@ pixman_asm_function fname .endif .if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 - OUT .req r0 - TOP .req r1 - BOTTOM .req r2 - WT .req r3 - WB .req r4 - X .req r5 - UX .req r6 - WIDTH .req ip - TMP1 .req r3 - TMP2 .req r4 - PF_OFFS .req r7 - TMP3 .req r8 - TMP4 .req r9 - STRIDE .req r2 - - mov ip, sp - push {r4, r5, r6, r7, r8, r9} - mov PF_OFFS, #prefetch_distance - ldmia ip, {WB, X, UX, WIDTH} + OUT .req x0 + TOP .req x1 + BOTTOM .req x2 + WT .req x3 + WWT .req w3 + WB .req x4 + WWB .req w4 + X .req w5 + UX .req w6 + WIDTH .req x7 + TMP1 .req x10 + WTMP1 .req w10 + TMP2 .req x11 + WTMP2 .req w11 + PF_OFFS .req x12 + TMP3 .req x13 + WTMP3 .req w13 + TMP4 .req x14 + WTMP4 .req w14 + STRIDE .req x15 + DUMMY .req x30 + + stp x29, x30, [sp, -16]! 
+ mov x29, sp + sub sp, sp, 112 + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + stp x10, x11, [x29, -80] + stp x12, x13, [x29, -96] + stp x14, x15, [x29, -112] .else - OUT .req r0 - MASK .req r1 - TOP .req r2 - BOTTOM .req r3 - WT .req r4 - WB .req r5 - X .req r6 - UX .req r7 - WIDTH .req ip - TMP1 .req r4 - TMP2 .req r5 - PF_OFFS .req r8 - TMP3 .req r9 - TMP4 .req r10 - STRIDE .req r3 + OUT .req x0 + MASK .req x1 + TOP .req x2 + BOTTOM .req x3 + WT .req x4 + WWT .req w4 + WB .req x5 + WWB .req w5 + X .req w6 + UX .req w7 + WIDTH .req x8 + TMP1 .req x10 + WTMP1 .req w10 + TMP2 .req x11 + WTMP2 .req w11 + PF_OFFS .req x12 + TMP3 .req x13 + WTMP3 .req w13 + TMP4 .req x14 + WTMP4 .req w14 + STRIDE .req x15 + DUMMY .req x30 .set prefetch_offset, prefetch_distance - mov ip, sp - push {r4, r5, r6, r7, r8, r9, r10, ip} - mov PF_OFFS, #prefetch_distance - ldmia ip, {WT, WB, X, UX, WIDTH} + stp x29, x30, [sp, -16]! + mov x29, sp + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + stp x10, x11, [x29, -80] + stp x12, x13, [x29, -96] + stp x14, x15, [x29, -112] + str x8, [x29, -120] + ldr w8, [x29, 16] + sub sp, sp, 120 .endif - mul PF_OFFS, PF_OFFS, UX - -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 - vpush {d8-d15} -.endif + mov WTMP1, #prefetch_distance + umull PF_OFFS, WTMP1, UX - sub STRIDE, BOTTOM, TOP - .unreq BOTTOM + sub STRIDE, BOTTOM, TOP + .unreq BOTTOM - cmp WIDTH, #0 - ble 3f + cmp WIDTH, #0 + ble 300f - vdup.u16 q12, X - vdup.u16 q13, UX - vdup.u8 d28, WT - vdup.u8 d29, WB - vadd.u16 d25, d25, d26 + dup v12.8h, X + dup v13.8h, UX + dup v28.8b, WWT + dup v29.8b, WWB + mov v25.d[0], v12.d[1] + mov v26.d[0], v13.d[0] + add v25.4h, v25.4h, v26.4h + mov v12.d[1], v25.d[0] /* ensure good destination alignment */ cmp WIDTH, #1 - blt 0f + blt 100f tst OUT, #(1 << dst_bpp_shift) - beq 0f - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 + beq 100f + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h bilinear_process_last_pixel sub WIDTH, WIDTH, #1 -0: - vadd.u16 q13, q13, q13 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 +100: + add v13.8h, v13.8h, v13.8h + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h cmp WIDTH, #2 - blt 0f + blt 100f tst OUT, #(1 << (dst_bpp_shift + 1)) - beq 0f + beq 100f bilinear_process_two_pixels sub WIDTH, WIDTH, #2 -0: +100: .if pixblock_size == 8 cmp WIDTH, #4 - blt 0f + blt 100f tst OUT, #(1 << (dst_bpp_shift + 2)) - beq 0f + beq 100f bilinear_process_four_pixels sub WIDTH, WIDTH, #4 -0: +100: .endif subs WIDTH, WIDTH, #pixblock_size - blt 1f + blt 100f asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) bilinear_process_pixblock_head subs WIDTH, WIDTH, #pixblock_size - blt 5f + blt 500f 0: bilinear_process_pixblock_tail_head subs WIDTH, WIDTH, #pixblock_size bge 0b -5: +500: bilinear_process_pixblock_tail -1: +100: .if pixblock_size == 8 tst WIDTH, #4 - beq 2f + beq 200f bilinear_process_four_pixels -2: +200: .endif /* handle the remaining trailing pixels */ tst WIDTH, #2 - beq 2f + beq 200f bilinear_process_two_pixels -2: +200: tst WIDTH, #1 - beq 3f + beq 300f bilinear_process_last_pixel -3: -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 - vpop {d8-d15} -.endif +300: .if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 - pop {r4, r5, r6, r7, r8, r9} + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, 
v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x10, x11, [x29, -80] + ldp x12, x13, [x29, -96] + ldp x14, x15, [x29, -112] + mov sp, x29 + ldp x29, x30, [sp], 16 .else - pop {r4, r5, r6, r7, r8, r9, r10, ip} + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x10, x11, [x29, -80] + ldp x12, x13, [x29, -96] + ldp x14, x15, [x29, -112] + ldr x8, [x29, -120] + mov sp, x29 + ldp x29, x30, [sp], 16 .endif - bx lr + ret .unreq OUT .unreq TOP .unreq WT + .unreq WWT .unreq WB + .unreq WWB .unreq X .unreq UX .unreq WIDTH .unreq TMP1 + .unreq WTMP1 .unreq TMP2 .unreq PF_OFFS .unreq TMP3 @@ -884,158 +951,160 @@ pixman_asm_function fname .endm .macro bilinear_over_8888_8888_process_pixblock_head - asr TMP1, X, #16 + asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #2 - asr TMP2, X, #16 + asr WTMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #2 - vld1.32 {d22}, [TMP1], STRIDE - vld1.32 {d23}, [TMP1] - asr TMP3, X, #16 + ld1 {v22.2s}, [TMP1], STRIDE + ld1 {v23.2s}, [TMP1] + asr WTMP3, X, #16 add X, X, UX add TMP3, TOP, TMP3, lsl #2 - vmull.u8 q8, d22, d28 - vmlal.u8 q8, d23, d29 + umull v8.8h, v22.8b, v28.8b + umlal v8.8h, v23.8b, v29.8b - vld1.32 {d22}, [TMP2], STRIDE - vld1.32 {d23}, [TMP2] - asr TMP4, X, #16 + ld1 {v22.2s}, [TMP2], STRIDE + ld1 {v23.2s}, [TMP2] + asr WTMP4, X, #16 add X, X, UX add TMP4, TOP, TMP4, lsl #2 - vmull.u8 q9, d22, d28 - vmlal.u8 q9, d23, d29 + umull v9.8h, v22.8b, v28.8b + umlal v9.8h, v23.8b, v29.8b - vld1.32 {d22}, [TMP3], STRIDE - vld1.32 {d23}, [TMP3] - vmull.u8 q10, d22, d28 - vmlal.u8 q10, d23, d29 + ld1 {v22.2s}, [TMP3], STRIDE + ld1 {v23.2s}, [TMP3] + umull v10.8h, v22.8b, v28.8b + umlal v10.8h, v23.8b, v29.8b - vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d16, d30 - vmlal.u16 q0, d17, d30 + ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v8.4h, v15.h[0] + umlal2 v0.4s, v8.8h, v15.h[0] - pld [TMP4, PF_OFFS] - vld1.32 {d16}, [TMP4], STRIDE - vld1.32 {d17}, [TMP4] - pld [TMP4, PF_OFFS] - vmull.u8 q11, d16, d28 - vmlal.u8 q11, d17, d29 + prfm PREFETCH_MODE, [TMP4, PF_OFFS] + ld1 {v16.2s}, [TMP4], STRIDE + ld1 {v17.2s}, [TMP4] + prfm PREFETCH_MODE, [TMP4, PF_OFFS] + umull v11.8h, v16.8b, v28.8b + umlal v11.8h, v17.8b, v29.8b - vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q1, d18, d31 - vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 + ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v1.4s, v9.4h, v15.h[4] + umlal2 v1.4s, v9.8h, v15.h[4] + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h .endm .macro bilinear_over_8888_8888_process_pixblock_tail - vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q2, d20, d30 - vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q3, d22, d31 - vmlal.u16 q3, d23, d31 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vld1.32 {d2, d3}, [OUT, :128] - pld [OUT, #(prefetch_offset * 4)] - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) - vmovn.u16 d6, q0 - vmovn.u16 d7, q2 - vuzp.8 d6, d7 - vuzp.8 d2, d3 - vuzp.8 d6, d7 - vuzp.8 d2, d3 - vdup.32 d4, d7[1] - vmvn.8 d4, d4 - vmull.u8 q11, d2, d4 - vmull.u8 q2, d3, d4 - vrshr.u16 q1, q11, #8 - vrshr.u16 q10, q2, #8 - 
vraddhn.u16 d2, q1, q11 - vraddhn.u16 d3, q10, q2 - vqadd.u8 q3, q1, q3 - vuzp.8 d6, d7 - vuzp.8 d6, d7 - vadd.u16 q12, q12, q13 - vst1.32 {d6, d7}, [OUT, :128]! + ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v2.4s, v10.4h, v15.h[0] + umlal2 v2.4s, v10.8h, v15.h[0] + ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v3.4s, v11.4h, v15.h[4] + umlal2 v3.4s, v11.8h, v15.h[4] + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + xtn v6.8b, v0.8h + xtn v7.8b, v2.8h + ld1 {v2.2s, v3.2s}, [OUT] + prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)] + vuzp v6.8b, v7.8b + vuzp v2.8b, v3.8b + vuzp v6.8b, v7.8b + vuzp v2.8b, v3.8b + dup v4.2s, v7.s[1] + mvn v4.8b, v4.8b + umull v11.8h, v2.8b, v4.8b + umull v2.8h, v3.8b, v4.8b + urshr v1.8h, v11.8h, #8 + urshr v10.8h, v2.8h, #8 + raddhn v3.8b, v10.8h, v2.8h + raddhn v2.8b, v1.8h, v11.8h + uqadd v6.8b, v2.8b, v6.8b + uqadd v7.8b, v3.8b, v7.8b + vuzp v6.8b, v7.8b + vuzp v6.8b, v7.8b + add v12.8h, v12.8h, v13.8h + st1 {v6.2s, v7.2s}, [OUT], #16 .endm .macro bilinear_over_8888_8888_process_pixblock_tail_head - vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS - asr TMP1, X, #16 + ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS + asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #2 - vmlsl.u16 q2, d20, d30 - asr TMP2, X, #16 + umlsl v2.4s, v10.4h, v15.h[0] + asr WTMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #2 - vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS - vld1.32 {d20}, [TMP1], STRIDE - vmlsl.u16 q3, d22, d31 - vmlal.u16 q3, d23, d31 - vld1.32 {d21}, [TMP1] - vmull.u8 q8, d20, d28 - vmlal.u8 q8, d21, d29 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vld1.32 {d2, d3}, [OUT, :128] - pld [OUT, PF_OFFS] - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) - vmovn.u16 d6, q0 - vld1.32 {d23}, [TMP2] - vmull.u8 q9, d22, d28 - asr TMP3, X, #16 + umlal2 v2.4s, v10.8h, v15.h[0] + ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + ld1 {v20.2s}, [TMP1], STRIDE + umlsl v3.4s, v11.4h, v15.h[4] + umlal2 v3.4s, v11.8h, v15.h[4] + ld1 {v21.2s}, [TMP1] + umull v8.8h, v20.8b, v28.8b + umlal v8.8h, v21.8b, v29.8b + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + ld1 {v22.2s}, [TMP2], STRIDE + shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + xtn v6.8b, v0.8h + ld1 {v23.2s}, [TMP2] + umull v9.8h, v22.8b, v28.8b + asr WTMP3, X, #16 add X, X, UX add TMP3, TOP, TMP3, lsl #2 - asr TMP4, X, #16 + asr WTMP4, X, #16 add X, X, UX add TMP4, TOP, TMP4, lsl #2 - vmlal.u8 q9, d23, d29 - vmovn.u16 d7, q2 - vld1.32 {d22}, [TMP3], STRIDE - vuzp.8 d6, d7 - vuzp.8 d2, d3 - vuzp.8 d6, d7 - vuzp.8 d2, d3 - vdup.32 d4, d7[1] - vld1.32 {d23}, [TMP3] - vmvn.8 d4, d4 - vmull.u8 q10, d22, d28 - vmlal.u8 q10, d23, d29 - vmull.u8 q11, d2, d4 - vmull.u8 q2, d3, d4 - vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d16, d30 - vrshr.u16 q1, q11, #8 - vmlal.u16 q0, d17, d30 - vrshr.u16 q8, q2, #8 - vraddhn.u16 
d2, q1, q11 - vraddhn.u16 d3, q8, q2 - pld [TMP4, PF_OFFS] - vld1.32 {d16}, [TMP4], STRIDE - vqadd.u8 q3, q1, q3 - vld1.32 {d17}, [TMP4] - pld [TMP4, PF_OFFS] - vmull.u8 q11, d16, d28 - vmlal.u8 q11, d17, d29 - vuzp.8 d6, d7 - vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS - vuzp.8 d6, d7 - vmlsl.u16 q1, d18, d31 - vadd.u16 q12, q12, q13 - vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vst1.32 {d6, d7}, [OUT, :128]! + umlal v9.8h, v23.8b, v29.8b + xtn v7.8b, v2.8h + ld1 {v2.2s, v3.2s}, [OUT] + prfm PREFETCH_MODE, [OUT, PF_OFFS] + ld1 {v22.2s}, [TMP3], STRIDE + vuzp v6.8b, v7.8b + vuzp v2.8b, v3.8b + vuzp v6.8b, v7.8b + vuzp v2.8b, v3.8b + dup v4.2s, v7.s[1] + ld1 {v23.2s}, [TMP3] + mvn v4.8b, v4.8b + umull v10.8h, v22.8b, v28.8b + umlal v10.8h, v23.8b, v29.8b + umull v11.8h, v2.8b, v4.8b + umull v2.8h, v3.8b, v4.8b + ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v8.4h, v15.h[0] + urshr v1.8h, v11.8h, #8 + umlal2 v0.4s, v8.8h, v15.h[0] + urshr v8.8h, v2.8h, #8 + raddhn v3.8b, v8.8h, v2.8h + raddhn v2.8b, v1.8h, v11.8h + prfm PREFETCH_MODE, [TMP4, PF_OFFS] + ld1 {v16.2s}, [TMP4], STRIDE + uqadd v6.8b, v2.8b, v6.8b + uqadd v7.8b, v3.8b, v7.8b + ld1 {v17.2s}, [TMP4] + prfm PREFETCH_MODE, [TMP4, PF_OFFS] + umull v11.8h, v16.8b, v28.8b + umlal v11.8h, v17.8b, v29.8b + vuzp v6.8b, v7.8b + ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS + vuzp v6.8b, v7.8b + umlsl v1.4s, v9.4h, v15.h[4] + add v12.8h, v12.8h, v13.8h + umlal2 v1.4s, v9.8h, v15.h[4] + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + st1 {v6.2s, v7.2s}, [OUT], #16 .endm /* over_8888_8_8888 */ @@ -1048,173 +1117,20 @@ pixman_asm_function fname .endm .macro bilinear_over_8888_8_8888_process_four_pixels - bilinear_interpolate_four_pixels 8888, 8, 8888, over + bilinear_interpolate_two_pixels 8888, 8, 8888, over + bilinear_interpolate_two_pixels 8888, 8, 8888, over .endm .macro bilinear_over_8888_8_8888_process_pixblock_head - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #2 - vld1.32 {d0}, [TMP1], STRIDE - asr TMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #2 - vld1.32 {d1}, [TMP1] - asr TMP3, X, #16 - add X, X, UX - add TMP3, TOP, TMP3, lsl #2 - vld1.32 {d2}, [TMP2], STRIDE - asr TMP4, X, #16 - add X, X, UX - add TMP4, TOP, TMP4, lsl #2 - vld1.32 {d3}, [TMP2] - vmull.u8 q2, d0, d28 - vmull.u8 q3, d2, d28 - vmlal.u8 q2, d1, d29 - vmlal.u8 q3, d3, d29 - vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS - vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d4, d30 - vmlsl.u16 q1, d6, d31 - vmlal.u16 q0, d5, d30 - vmlal.u16 q1, d7, d31 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vld1.32 {d2}, [TMP3], STRIDE - vld1.32 {d3}, [TMP3] - pld [TMP4, PF_OFFS] - vld1.32 {d4}, [TMP4], STRIDE - vld1.32 {d5}, [TMP4] - pld [TMP4, PF_OFFS] - vmull.u8 q3, d2, d28 - vmlal.u8 q3, d3, d29 - vmull.u8 q1, d4, d28 - vmlal.u8 q1, d5, d29 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vld1.32 {d22[0]}, [MASK]! 
- pld [MASK, #prefetch_offset] - vadd.u16 q12, q12, q13 - vmovn.u16 d16, q0 + bilinear_over_8888_8_8888_process_four_pixels .endm .macro bilinear_over_8888_8_8888_process_pixblock_tail - vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS - vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q9, d6, d30 - vmlsl.u16 q10, d2, d31 - vmlal.u16 q9, d7, d30 - vmlal.u16 q10, d3, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vdup.32 d22, d22[0] - vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - vmovn.u16 d17, q9 - vld1.32 {d18, d19}, [OUT, :128] - pld [OUT, PF_OFFS] - vuzp.8 d16, d17 - vuzp.8 d18, d19 - vuzp.8 d16, d17 - vuzp.8 d18, d19 - vmull.u8 q10, d16, d22 - vmull.u8 q11, d17, d22 - vrsra.u16 q10, q10, #8 - vrsra.u16 q11, q11, #8 - vrshrn.u16 d16, q10, #8 - vrshrn.u16 d17, q11, #8 - vdup.32 d22, d17[1] - vmvn.8 d22, d22 - vmull.u8 q10, d18, d22 - vmull.u8 q11, d19, d22 - vrshr.u16 q9, q10, #8 - vrshr.u16 q0, q11, #8 - vraddhn.u16 d18, q9, q10 - vraddhn.u16 d19, q0, q11 - vqadd.u8 q9, q8, q9 - vuzp.8 d18, d19 - vuzp.8 d18, d19 - vst1.32 {d18, d19}, [OUT, :128]! .endm .macro bilinear_over_8888_8_8888_process_pixblock_tail_head - vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #2 - vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS - vld1.32 {d0}, [TMP1], STRIDE - asr TMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #2 - vmlsl.u16 q9, d6, d30 - vmlsl.u16 q10, d2, d31 - vld1.32 {d1}, [TMP1] - asr TMP3, X, #16 - add X, X, UX - add TMP3, TOP, TMP3, lsl #2 - vmlal.u16 q9, d7, d30 - vmlal.u16 q10, d3, d31 - vld1.32 {d2}, [TMP2], STRIDE - asr TMP4, X, #16 - add X, X, UX - add TMP4, TOP, TMP4, lsl #2 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vld1.32 {d3}, [TMP2] - vdup.32 d22, d22[0] - vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - vmull.u8 q2, d0, d28 - vmull.u8 q3, d2, d28 - vmovn.u16 d17, q9 - vld1.32 {d18, d19}, [OUT, :128] - pld [OUT, #(prefetch_offset * 4)] - vmlal.u8 q2, d1, d29 - vmlal.u8 q3, d3, d29 - vuzp.8 d16, d17 - vuzp.8 d18, d19 - vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS - vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS - vuzp.8 d16, d17 - vuzp.8 d18, d19 - vmlsl.u16 q0, d4, d30 - vmlsl.u16 q1, d6, d31 - vmull.u8 q10, d16, d22 - vmull.u8 q11, d17, d22 - vmlal.u16 q0, d5, d30 - vmlal.u16 q1, d7, d31 - vrsra.u16 q10, q10, #8 - vrsra.u16 q11, q11, #8 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vrshrn.u16 d16, q10, #8 - vrshrn.u16 d17, q11, #8 - vld1.32 {d2}, [TMP3], STRIDE - vdup.32 d22, d17[1] - vld1.32 {d3}, [TMP3] - vmvn.8 d22, d22 - pld [TMP4, PF_OFFS] - vld1.32 {d4}, [TMP4], STRIDE - vmull.u8 q10, d18, d22 - vmull.u8 q11, d19, d22 - vld1.32 {d5}, [TMP4] - pld [TMP4, PF_OFFS] - vmull.u8 q3, d2, d28 - vrshr.u16 q9, q10, #8 - vrshr.u16 q15, q11, #8 - vmlal.u8 q3, d3, d29 - vmull.u8 q1, d4, d28 - vraddhn.u16 d18, q9, q10 - vraddhn.u16 d19, q15, q11 - vmlal.u8 q1, d5, d29 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vqadd.u8 q9, q8, q9 - vld1.32 {d22[0]}, [MASK]! - vuzp.8 d18, d19 - vadd.u16 q12, q12, q13 - vuzp.8 d18, d19 - vmovn.u16 d16, q0 - vst1.32 {d18, d19}, [OUT, :128]! 
+ bilinear_over_8888_8_8888_process_pixblock_tail + bilinear_over_8888_8_8888_process_pixblock_head .endm /* add_8888_8888 */ @@ -1227,7 +1143,8 @@ pixman_asm_function fname .endm .macro bilinear_add_8888_8888_process_four_pixels - bilinear_interpolate_four_pixels 8888, x, 8888, add + bilinear_interpolate_two_pixels 8888, x, 8888, add + bilinear_interpolate_two_pixels 8888, x, 8888, add .endm .macro bilinear_add_8888_8888_process_pixblock_head diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S index 059b285..4849267 100644 --- a/pixman/pixman-arma64-neon-asm.S +++ b/pixman/pixman-arma64-neon-asm.S @@ -39,19 +39,15 @@ .section .note.GNU-stack,"",%progbits #endif - .text - .fpu neon - .arch armv7a - .object_arch armv4 - .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ - .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ - .arm - .altmacro - .p2align 2 +.text +.arch armv8-a + +.altmacro +.p2align 2 #include "pixman-private.h" #include "pixman-arm-asm.h" -#include "pixman-arm-neon-asm.h" +#include "pixman-arma64-neon-asm.h" /* Global configuration options and preferences */ @@ -101,26 +97,26 @@ * the data in NEON registers. * * NEON registers allocation in general is recommented to be the following: - * d0, d1, d2, d3 - contain loaded source pixel data - * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) - * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used) - * d28, d29, d30, d31 - place for storing the result (destination pixels) + * v0, v1, v2, v3 - contain loaded source pixel data + * v4, v5, v6, v7 - contain loaded destination pixels (if they are needed) + * v24, v25, v26, v27 - contain loading mask pixel data (if mask is used) + * v28, v29, v30, v31 - place for storing the result (destination pixels) * * As can be seen above, four 64-bit NEON registers are used for keeping * intermediate pixel data and up to 8 pixels can be processed in one step * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). * * This particular function uses the following registers allocation: - * d0, d1, d2, d3 - contain loaded source pixel data - * d4, d5 - contain loaded destination pixels (they are needed) - * d28, d29 - place for storing the result (destination pixels) + * v0, v1, v2, v3 - contain loaded source pixel data + * v4, v5 - contain loaded destination pixels (they are needed) + * v28, v29 - place for storing the result (destination pixels) */ /* * Step one. We need to have some code to do some arithmetics on pixel data. * This is implemented as a pair of macros: '*_head' and '*_tail'. When used - * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, - * perform all the needed calculations and write the result to {d28, d29}. + * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5}, + * perform all the needed calculations and write the result to {v28, v29}. * The rationale for having two macros and not just one will be explained * later. In practice, any single monolitic function which does the work can * be split into two parts in any arbitrary way without affecting correctness. @@ -129,62 +125,70 @@ * make our life a bit easier by doing R, G, B, A color components * deinterleaving for 32bpp pixel formats (and this feature is used in * 'pixman_composite_over_8888_0565_asm_neon' function). 
So it means that - * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we - * actually use d0 register for blue channel (a vector of eight 8-bit - * values), d1 register for green, d2 for red and d3 for alpha. This + * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we + * actually use v0 register for blue channel (a vector of eight 8-bit + * values), v1 register for green, v2 for red and v3 for alpha. This * simple conversion can be also done with a few NEON instructions: * - * Packed to planar conversion: - * vuzp.8 d0, d1 - * vuzp.8 d2, d3 - * vuzp.8 d1, d3 - * vuzp.8 d0, d2 + * Packed to planar conversion: // vuzp8 is a wrapper macro + * vuzp8 v0, v1 + * vuzp8 v2, v3 + * vuzp8 v1, v3 + * vuzp8 v0, v2 * - * Planar to packed conversion: - * vzip.8 d0, d2 - * vzip.8 d1, d3 - * vzip.8 d2, d3 - * vzip.8 d0, d1 + * Planar to packed conversion: // vzip8 is a wrapper macro + * vzip8 v0, v2 + * vzip8 v1, v3 + * vzip8 v2, v3 + * vzip8 v0, v1 * - * But pixel can be loaded directly in planar format using VLD4.8 NEON - * instruction. It is 1 cycle slower than VLD1.32, so this is not always + * But pixel can be loaded directly in planar format using LD4 / b NEON + * instruction. It is 1 cycle slower than LD1 / s, so this is not always * desirable, that's why deinterleaving is optional. * * But anyway, here is the code: */ + .macro pixman_composite_over_8888_0565_process_pixblock_head - /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format - and put data into d6 - red, d7 - green, d30 - blue */ - vshrn.u16 d6, q2, #8 - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vsri.u8 d6, d6, #5 - vmvn.8 d3, d3 /* invert source alpha */ - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 + /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format + and put data into v6 - red, v7 - green, v30 - blue */ + mov v4.d[1], v5.d[0] + shrn v6.8b, v4.8h, #8 + shrn v7.8b, v4.8h, #3 + sli v4.8h, v4.8h, #5 + sri v6.8b, v6.8b, #5 + mvn v3.8b, v3.8b /* invert source alpha */ + sri v7.8b, v7.8b, #6 + shrn v30.8b, v4.8h, #2 /* now do alpha blending, storing results in 8-bit planar format - into d16 - red, d19 - green, d18 - blue */ - vmull.u8 q10, d3, d6 - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - vrshr.u16 q13, q10, #8 - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - vraddhn.u16 d22, q12, q15 + into v20 - red, v23 - green, v22 - blue */ + umull v10.8h, v3.8b, v6.8b + umull v11.8h, v3.8b, v7.8b + umull v12.8h, v3.8b, v30.8b + urshr v17.8h, v10.8h, #8 + urshr v18.8h, v11.8h, #8 + urshr v19.8h, v12.8h, #8 + raddhn v20.8b, v10.8h, v17.8h + raddhn v23.8b, v11.8h, v18.8h + raddhn v22.8b, v12.8h, v19.8h .endm .macro pixman_composite_over_8888_0565_process_pixblock_tail /* ... 
continue alpha blending */ - vqadd.u8 d16, d2, d20 - vqadd.u8 q9, q0, q11 - /* convert the result to r5g6b5 and store it into {d28, d29} */ - vshll.u8 q14, d16, #8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 + uqadd v17.8b, v2.8b, v20.8b + uqadd v18.8b, v0.8b, v22.8b + uqadd v19.8b, v1.8b, v23.8b + /* convert the result to r5g6b5 and store it into {v14} */ + ushll v14.8h, v17.8b, #7 + sli v14.8h, v14.8h, #1 + ushll v8.8h, v19.8b, #7 + sli v8.8h, v8.8h, #1 + ushll v9.8h, v18.8b, #7 + sli v9.8h, v9.8h, #1 + sri v14.8h, v8.8h, #5 + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] .endm /* @@ -211,9 +215,9 @@ * of this macro would look like this: * * pixman_composite_over_8888_0565_process_pixblock_tail - * vst1.16 {d28, d29}, [DST_W, :128]! - * vld1.16 {d4, d5}, [DST_R, :128]! - * vld4.32 {d0, d1, d2, d3}, [SRC]! + * st1 {v28.4h, v29.4h}, [DST_W], #32 + * ld1 {v4.4h, v5.4h}, [DST_R], #16 + * ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32 * pixman_composite_over_8888_0565_process_pixblock_head * cache_preload 8, 8 * @@ -244,7 +248,7 @@ * Different instruction streams (originaling from '*_head', '*_tail' * and 'cache_preload' macro) use different indentation levels for * better readability. Actually taking the code from one of these - * indentation levels and ignoring a few VLD/VST instructions would + * indentation levels and ignoring a few LD/ST instructions would * result in exactly the code from '*_head', '*_tail' or 'cache_preload' * macro! */ @@ -252,43 +256,68 @@ #if 1 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head - vqadd.u8 d16, d2, d20 - vld1.16 {d4, d5}, [DST_R, :128]! - vqadd.u8 q9, q0, q11 - vshrn.u16 d6, q2, #8 + uqadd v17.8b, v2.8b, v20.8b + ld1 {v4.4h, v5.4h}, [DST_R], #16 + mov v4.d[1], v5.d[0] + uqadd v18.8b, v0.8b, v22.8b + uqadd v19.8b, v1.8b, v23.8b + shrn v6.8b, v4.8h, #8 fetch_src_pixblock - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vshll.u8 q14, d16, #8 + shrn v7.8b, v4.8h, #3 + sli v4.8h, v4.8h, #5 + ushll v14.8h, v17.8b, #7 + sli v14.8h, v14.8h, #1 PF add PF_X, PF_X, #8 - vshll.u8 q8, d19, #8 + ushll v8.8h, v19.8b, #7 + sli v8.8h, v8.8h, #1 PF tst PF_CTL, #0xF - vsri.u8 d6, d6, #5 - PF addne PF_X, PF_X, #8 - vmvn.8 d3, d3 - PF subne PF_CTL, PF_CTL, #1 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - vmull.u8 q10, d3, d6 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vsri.u16 q14, q8, #5 + sri v6.8b, v6.8b, #5 + PF beq 10f + PF add PF_X, PF_X, #8 +10: + mvn v3.8b, v3.8b + PF beq 10f + PF sub PF_CTL, PF_CTL, #1 +10: + sri v7.8b, v7.8b, #6 + shrn v30.8b, v4.8h, #2 + umull v10.8h, v3.8b, v6.8b + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + umull v11.8h, v3.8b, v7.8b + umull v12.8h, v3.8b, v30.8b + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + sri v14.8h, v8.8h, #5 PF cmp PF_X, ORIG_W - vshll.u8 q9, d18, #8 - vrshr.u16 q13, q10, #8 - PF subge PF_X, PF_X, ORIG_W - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - PF subges PF_CTL, PF_CTL, #0x10 - vsri.u16 q14, q9, #11 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vraddhn.u16 d22, q12, q15 - vst1.16 {d28, d29}, [DST_W, :128]! 
+ ushll v9.8h, v18.8b, #7 + sli v9.8h, v9.8h, #1 + urshr v17.8h, v10.8h, #8 + PF ble 10f + PF sub PF_X, PF_X, ORIG_W +10: + urshr v19.8h, v11.8h, #8 + urshr v18.8h, v12.8h, #8 + PF ble 10f + PF subs PF_CTL, PF_CTL, #0x10 +10: + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] + PF ble 10f + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 +10: + raddhn v20.8b, v10.8h, v17.8h + raddhn v23.8b, v11.8h, v19.8h + PF ble 10f + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_SRC, #1 +10: + raddhn v22.8b, v12.8h, v18.8h + st1 {v14.8h}, [DST_W], #16 .endm #else @@ -296,8 +325,8 @@ /* If we did not care much about the performance, we would just use this... */ .macro pixman_composite_over_8888_0565_process_pixblock_tail_head pixman_composite_over_8888_0565_process_pixblock_tail - vst1.16 {d28, d29}, [DST_W, :128]! - vld1.16 {d4, d5}, [DST_R, :128]! + st1 {v14.8h}, [DST_W], #16 + ld1 {v4.4h, v4.5h}, [DST_R], #16 fetch_src_pixblock pixman_composite_over_8888_0565_process_pixblock_head cache_preload 8, 8 @@ -352,56 +381,62 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_over_n_0565_process_pixblock_head - /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format - and put data into d6 - red, d7 - green, d30 - blue */ - vshrn.u16 d6, q2, #8 - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vsri.u8 d6, d6, #5 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 + /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format + and put data into v6 - red, v7 - green, v30 - blue */ + mov v4.d[1], v5.d[0] + shrn v6.8b, v4.8h, #8 + shrn v7.8b, v4.8h, #3 + sli v4.8h, v4.8h, #5 + sri v6.8b, v6.8b, #5 + sri v7.8b, v7.8b, #6 + shrn v30.8b, v4.8h, #2 /* now do alpha blending, storing results in 8-bit planar format - into d16 - red, d19 - green, d18 - blue */ - vmull.u8 q10, d3, d6 - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - vrshr.u16 q13, q10, #8 - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - vraddhn.u16 d22, q12, q15 + into v20 - red, v23 - green, v22 - blue */ + umull v10.8h, v3.8b, v6.8b + umull v11.8h, v3.8b, v7.8b + umull v12.8h, v3.8b, v30.8b + urshr v13.8h, v10.8h, #8 + urshr v14.8h, v11.8h, #8 + urshr v15.8h, v12.8h, #8 + raddhn v20.8b, v10.8h, v13.8h + raddhn v23.8b, v11.8h, v14.8h + raddhn v22.8b, v12.8h, v15.8h .endm .macro pixman_composite_over_n_0565_process_pixblock_tail /* ... continue alpha blending */ - vqadd.u8 d16, d2, d20 - vqadd.u8 q9, q0, q11 - /* convert the result to r5g6b5 and store it into {d28, d29} */ - vshll.u8 q14, d16, #8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 + uqadd v17.8b, v2.8b, v20.8b + uqadd v18.8b, v0.8b, v22.8b + uqadd v19.8b, v1.8b, v23.8b + /* convert the result to r5g6b5 and store it into {v14} */ + ushll v14.8h, v17.8b, #7 + sli v14.8h, v14.8h, #1 + ushll v8.8h, v19.8b, #7 + sli v8.8h, v8.8h, #1 + ushll v9.8h, v18.8b, #7 + sli v9.8h, v9.8h, #1 + sri v14.8h, v8.8h, #5 + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] .endm /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_over_n_0565_process_pixblock_tail_head pixman_composite_over_n_0565_process_pixblock_tail - vld1.16 {d4, d5}, [DST_R, :128]! - vst1.16 {d28, d29}, [DST_W, :128]! 
+ ld1 {v4.4h, v5.4h}, [DST_R], #16 + st1 {v14.8h}, [DST_W], #16 pixman_composite_over_n_0565_process_pixblock_head cache_preload 8, 8 .endm .macro pixman_composite_over_n_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] - vmvn.8 d3, d3 /* invert source alpha */ + mov v3.s[0], w4 + dup v0.8b, v3.b[0] + dup v1.8b, v3.b[1] + dup v2.8b, v3.b[2] + dup v3.8b, v3.b[3] + mvn v3.8b, v3.8b /* invert source alpha */ .endm generate_composite_function \ @@ -422,33 +457,52 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_src_8888_0565_process_pixblock_head - vshll.u8 q8, d1, #8 - vshll.u8 q14, d2, #8 - vshll.u8 q9, d0, #8 + ushll v8.8h, v1.8b, #7 + sli v8.8h, v8.8h, #1 + ushll v14.8h, v2.8b, #7 + sli v14.8h, v14.8h, #1 + ushll v9.8h, v0.8b, #7 + sli v9.8h, v9.8h, #1 .endm .macro pixman_composite_src_8888_0565_process_pixblock_tail - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 + sri v14.8h, v8.8h, #5 + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] .endm .macro pixman_composite_src_8888_0565_process_pixblock_tail_head - vsri.u16 q14, q8, #5 + sri v14.8h, v8.8h, #5 PF add PF_X, PF_X, #8 PF tst PF_CTL, #0xF fetch_src_pixblock - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vsri.u16 q14, q9, #11 + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vshll.u8 q14, d2, #8 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vshll.u8 q9, d0, #8 + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + ushll v8.8h, v1.8b, #7 + sli v8.8h, v8.8h, #1 + st1 {v14.8h}, [DST_W], #16 + PF ble 10f + PF sub PF_X, PF_X, ORIG_W + PF subs PF_CTL, PF_CTL, #0x10 +10: + ushll v14.8h, v2.8b, #7 + sli v14.8h, v14.8h, #1 + PF ble 10f + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 +10: + ushll v9.8h, v0.8b, #7 + sli v9.8h, v9.8h, #1 .endm generate_composite_function \ @@ -465,13 +519,14 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_src_0565_8888_process_pixblock_head - vshrn.u16 d30, q0, #8 - vshrn.u16 d29, q0, #3 - vsli.u16 q0, q0, #5 - vmov.u8 d31, #255 - vsri.u8 d30, d30, #5 - vsri.u8 d29, d29, #6 - vshrn.u16 d28, q0, #2 + mov v0.d[1], v1.d[0] + shrn v30.8b, v0.8h, #8 + shrn v29.8b, v0.8h, #3 + sli v0.8h, v0.8h, #5 + movi v31.8b, #255 + sri v30.8b, v30.8b, #5 + sri v29.8b, v29.8b, #6 + shrn v28.8b, v0.8h, #2 .endm .macro pixman_composite_src_0565_8888_process_pixblock_tail @@ -480,7 +535,7 @@ generate_composite_function \ /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_src_0565_8888_process_pixblock_tail_head pixman_composite_src_0565_8888_process_pixblock_tail - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 fetch_src_pixblock pixman_composite_src_0565_8888_process_pixblock_head cache_preload 8, 8 @@ -500,8 +555,10 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_add_8_8_process_pixblock_head - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 + uqadd v28.8b, v0.8b, v4.8b + uqadd v29.8b, v1.8b, v5.8b + uqadd v30.8b, v2.8b, v6.8b + uqadd v31.8b, v3.8b, v7.8b .endm .macro pixman_composite_add_8_8_process_pixblock_tail @@ -511,19 +568,33 @@ generate_composite_function \ fetch_src_pixblock PF add PF_X, PF_X, #32 PF tst PF_CTL, #0xF - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #32 - PF subne PF_CTL, PF_CTL, #1 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + PF beq 10f + PF add PF_X, PF_X, #32 + PF sub PF_CTL, PF_CTL, #1 +10: + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + PF ble 10f + PF sub PF_X, PF_X, ORIG_W + PF subs PF_CTL, PF_CTL, #0x10 +10: + uqadd v28.8b, v0.8b, v4.8b + PF ble 10f + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +10: + uqadd v29.8b, v1.8b, v5.8b + uqadd v30.8b, v2.8b, v6.8b + uqadd v31.8b, v3.8b, v7.8b .endm generate_composite_function \ @@ -543,19 +614,33 @@ generate_composite_function \ fetch_src_pixblock PF add PF_X, PF_X, #8 PF tst PF_CTL, #0xF - vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
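The prefetch bookkeeping changes the same way in every fast path above: AArch32 predicated the updates directly on the flags (addne, subne, subge, ldrgeb with writeback), while AArch64 has almost no conditional execution, so each predicated group becomes a short branch over a numbered local label, pld becomes prfm with the shifted offset computed into DUMMY first, and the auto-writeback byte loads become ldrsb plus an explicit add. In C terms, one such group reads roughly as follows (function and variable names are mine; the per-path block size constant varies):

    #include <stdint.h>

    /* What one "PF beq 10f ... 10:" group expresses for an 8-pixel block. */
    static void advance_prefetch_block (uint32_t *pf_x, uint32_t *pf_ctl)
    {
        if ((*pf_ctl & 0xf) != 0)   /* flags from "tst PF_CTL, #0xF"  */
        {
            *pf_x   += 8;           /* was "addne PF_X, PF_X, #8"     */
            *pf_ctl -= 1;           /* was "subne PF_CTL, PF_CTL, #1" */
        }
    }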
- vqadd.u8 q15, q1, q3 + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + PF ble 10f + PF sub PF_X, PF_X, ORIG_W + PF subs PF_CTL, PF_CTL, #0x10 +10: + uqadd v28.8b, v0.8b, v4.8b + PF ble 10f + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +10: + uqadd v29.8b, v1.8b, v5.8b + uqadd v30.8b, v2.8b, v6.8b + uqadd v31.8b, v3.8b, v7.8b .endm generate_composite_function \ @@ -582,53 +667,69 @@ generate_composite_function_single_scanline \ /******************************************************************************/ .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head - vmvn.8 d24, d3 /* get inverted alpha */ + mvn v24.8b, v3.8b /* get inverted alpha */ /* do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 + umull v8.8h, v24.8b, v4.8b + umull v9.8h, v24.8b, v5.8b + umull v10.8h, v24.8b, v6.8b + umull v11.8h, v24.8b, v7.8b .endm .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 + urshr v14.8h, v8.8h, #8 + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h .endm .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v14.8h, v8.8h, #8 PF add PF_X, PF_X, #8 PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
- vmull.u8 q11, d22, d7 + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + mvn v22.8b, v3.8b + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + PF ble 10f + PF sub PF_X, PF_X, ORIG_W +10: + umull v8.8h, v22.8b, v4.8b + PF ble 10f + PF subs PF_CTL, PF_CTL, #0x10 +10: + umull v9.8h, v22.8b, v5.8b + PF ble 10f + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 +10: + umull v10.8h, v22.8b, v6.8b + PF ble 10f + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +10: + umull v11.8h, v22.8b, v7.8b .endm generate_composite_function_single_scanline \ @@ -649,40 +750,60 @@ generate_composite_function_single_scanline \ .macro pixman_composite_over_8888_8888_process_pixblock_tail pixman_composite_out_reverse_8888_8888_process_pixblock_tail - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b .endm .macro pixman_composite_over_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v14.8h, v8.8h, #8 PF add PF_X, PF_X, #8 PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
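The over_8888_8888 macros above implement the plain OVER operator on premultiplied pixels: invert the source alpha with mvn, scale every destination channel by it through the same divide-by-255 triple, then uqadd the source channel back in with saturation. Per channel, a self-contained C sketch (names are mine, not pixman symbols):

    #include <stdint.h>

    /* One channel of OVER as the head/tail macros compute it. */
    static inline uint8_t over_channel (uint8_t s, uint8_t s_a, uint8_t d)
    {
        uint16_t p = (uint16_t) d * (uint8_t) (255 - s_a);  /* mvn + umull      */
        uint16_t r = (p + ((p + 128) >> 8) + 128) >> 8;     /* urshr + raddhn   */
        uint16_t t = (uint16_t) (s + r);
        return t > 255 ? 255 : (uint8_t) t;                 /* uqadd saturation */
    }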
- vmull.u8 q11, d22, d7 + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + mvn v22.8b, v3.8b + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + PF ble 10f + PF sub PF_X, PF_X, ORIG_W +10: + umull v8.8h, v22.8b, v4.8b + PF ble 10f + PF subs PF_CTL, PF_CTL, #0x10 +10: + umull v9.8h, v22.8b, v5.8b + PF ble 10f + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 +10: + umull v10.8h, v22.8b, v6.8b + PF ble 10f + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +10: + umull v11.8h, v22.8b, v7.8b .endm generate_composite_function \ @@ -709,64 +830,76 @@ generate_composite_function_single_scanline \ /******************************************************************************/ .macro pixman_composite_over_n_8888_process_pixblock_head - /* deinterleaved source pixels in {d0, d1, d2, d3} */ - /* inverted alpha in {d24} */ - /* destination pixels in {d4, d5, d6, d7} */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 + /* deinterleaved source pixels in {v0, v1, v2, v3} */ + /* inverted alpha in {v24} */ + /* destination pixels in {v4, v5, v6, v7} */ + umull v8.8h, v24.8b, v4.8b + umull v9.8h, v24.8b, v5.8b + umull v10.8h, v24.8b, v6.8b + umull v11.8h, v24.8b, v7.8b .endm .macro pixman_composite_over_n_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q2, q10, #8 - vrshr.u16 q3, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q2, q10 - vraddhn.u16 d31, q3, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 + urshr v14.8h, v8.8h, #8 + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b .endm .macro pixman_composite_over_n_8888_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q2, q10, #8 - vrshr.u16 q3, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q2, q10 - vraddhn.u16 d31, q3, q11 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vqadd.u8 q14, q0, q14 + urshr v14.8h, v8.8h, #8 + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + uqadd v28.8b, v0.8b, v28.8b PF add PF_X, PF_X, #8 PF tst PF_CTL, #0x0F - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vqadd.u8 q15, q1, q15 + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b PF cmp PF_X, ORIG_W - vmull.u8 q8, d24, d4 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vmull.u8 q9, d24, d5 - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q10, d24, d6 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q11, d24, d7 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ umull v8.8h, v24.8b, v4.8b + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + umull v9.8h, v24.8b, v5.8b + PF ble 10f + PF sub PF_X, PF_X, ORIG_W +10: + umull v10.8h, v24.8b, v6.8b + PF subs PF_CTL, PF_CTL, #0x10 + umull v11.8h, v24.8b, v7.8b + PF ble 10f + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +10: + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 .endm .macro pixman_composite_over_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] - vmvn.8 d24, d3 /* get inverted alpha */ + mov v3.s[0], w4 + dup v0.8b, v3.b[0] + dup v1.8b, v3.b[1] + dup v2.8b, v3.b[2] + dup v3.8b, v3.b[3] + mvn v24.8b, v3.8b /* get inverted alpha */ .endm generate_composite_function \ @@ -783,41 +916,53 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 + urshr v14.8h, v8.8h, #8 PF add PF_X, PF_X, #8 PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 + urshr v15.8h, v9.8h, #8 + urshr v12.8h, v10.8h, #8 + urshr v13.8h, v11.8h, #8 + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
- vmull.u8 q11, d22, d7 + raddhn v30.8b, v12.8h, v10.8h + raddhn v31.8b, v13.8h, v11.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32 + mvn v22.8b, v3.8b + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + PF blt 10f + PF sub PF_X, PF_X, ORIG_W +10: + umull v8.8h, v22.8b, v4.8b + PF blt 10f + PF subs PF_CTL, PF_CTL, #0x10 +10: + umull v9.8h, v22.8b, v5.8b + umull v10.8h, v22.8b, v6.8b + PF blt 10f + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +10: + umull v11.8h, v22.8b, v7.8b .endm .macro pixman_composite_over_reverse_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d7[0]}, [DUMMY] - vdup.8 d4, d7[0] - vdup.8 d5, d7[1] - vdup.8 d6, d7[2] - vdup.8 d7, d7[3] + mov v7.s[0], w4 + dup v4.8b, v7.b[0] + dup v5.8b, v7.b[1] + dup v6.8b, v7.b[2] + dup v7.8b, v7.b[3] .endm generate_composite_function \ @@ -838,92 +983,114 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_over_8888_8_0565_process_pixblock_head - vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ - vmull.u8 q1, d24, d9 - vmull.u8 q6, d24, d10 - vmull.u8 q7, d24, d11 - vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ - vrshr.u16 q9, q1, #8 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q6, q10 - vraddhn.u16 d3, q7, q11 - vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ - vsri.u8 d7, d7, #6 - vmvn.8 d3, d3 - vshrn.u16 d30, q2, #2 - vmull.u8 q8, d3, d6 /* now do alpha blending */ - vmull.u8 q9, d3, d7 - vmull.u8 q10, d3, d30 + umull v0.8h, v24.8b, v8.8b /* IN for SRC pixels (part1) */ + umull v1.8h, v24.8b, v9.8b + umull v2.8h, v24.8b, v10.8b + umull v3.8h, v24.8b, v11.8b + mov v4.d[1], v5.d[0] + shrn v25.8b, v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */ + shrn v26.8b, v4.8h, #3 + sli v4.8h, v4.8h, #5 + urshr v17.8h, v0.8h, #8 /* IN for SRC pixels (part2) */ + urshr v18.8h, v1.8h, #8 + urshr v19.8h, v2.8h, #8 + urshr v20.8h, v3.8h, #8 + raddhn v0.8b, v0.8h, v17.8h + raddhn v1.8b, v1.8h, v18.8h + raddhn v2.8b, v2.8h, v19.8h + raddhn v3.8b, v3.8h, v20.8h + sri v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */ + sri v26.8b, v26.8b, #6 + mvn v3.8b, v3.8b + shrn v30.8b, v4.8h, #2 + umull v18.8h, v3.8b, v25.8b /* now do alpha blending */ + umull v19.8h, v3.8b, v26.8b + umull v20.8h, v3.8b, v30.8b .endm .macro pixman_composite_over_8888_8_0565_process_pixblock_tail /* 3 cycle bubble (after vmull.u8) */ - vrshr.u16 q13, q8, #8 - vrshr.u16 q11, q9, #8 - vrshr.u16 q15, q10, #8 - vraddhn.u16 d16, q8, q13 - vraddhn.u16 d27, q9, q11 - vraddhn.u16 d26, q10, q15 - vqadd.u8 d16, d2, d16 + urshr v5.8h, v18.8h, #8 + urshr v6.8h, v19.8h, #8 + urshr v7.8h, v20.8h, #8 + raddhn v17.8b, v18.8h, v5.8h + raddhn v19.8b, v19.8h, v6.8h + raddhn v18.8b, v20.8h, v7.8h + uqadd v5.8b, v2.8b, v17.8b /* 1 cycle bubble */ - vqadd.u8 q9, q0, q13 - vshll.u8 q14, d16, #8 /* convert to 16bpp */ - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 + uqadd v6.8b, v0.8b, v18.8b + uqadd v7.8b, v1.8b, v19.8b + ushll v14.8h, v5.8b, #7 /* convert to 16bpp */ + sli v14.8h, v14.8h, #1 + ushll v18.8h, 
v7.8b, #7 + sli v18.8h, v18.8h, #1 + ushll v19.8h, v6.8b, #7 + sli v19.8h, v19.8h, #1 + sri v14.8h, v18.8h, #5 /* 1 cycle bubble */ - vsri.u16 q14, q9, #11 + sri v14.8h, v19.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] .endm .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head - vld1.16 {d4, d5}, [DST_R, :128]! - vshrn.u16 d6, q2, #8 +#if 0 + ld1 {v4.8h}, [DST_R], #16 + shrn v25.8b, v4.8h, #8 fetch_mask_pixblock - vshrn.u16 d7, q2, #3 + shrn v26.8b, v4.8h, #3 fetch_src_pixblock - vmull.u8 q6, d24, d10 - vrshr.u16 q13, q8, #8 - vrshr.u16 q11, q9, #8 - vrshr.u16 q15, q10, #8 - vraddhn.u16 d16, q8, q13 - vraddhn.u16 d27, q9, q11 - vraddhn.u16 d26, q10, q15 - vqadd.u8 d16, d2, d16 - vmull.u8 q1, d24, d9 - vqadd.u8 q9, q0, q13 - vshll.u8 q14, d16, #8 - vmull.u8 q0, d24, d8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vmull.u8 q7, d24, d11 - vsri.u16 q14, q9, #11 + umull v22.8h, v24.8b, v10.8b + urshr v13.8h, v18.8h, #8 + urshr v11.8h, v19.8h, #8 + urshr v15.8h, v20.8h, #8 + raddhn v17.8b, v18.8h, v13.8h + raddhn v19.8b, v19.8h, v11.8h + raddhn v18.8b, v20.8h, v15.8h + uqadd v17.8b, v2.8b, v17.8b + umull v21.8h, v24.8b, v9.8b + uqadd v18.8b, v0.8b, v18.8b + uqadd v19.8b, v1.8b, v19.8b + ushll v14.8h, v17.8b, #7 + sli v14.8h, v14.8h, #1 + umull v20.8h, v24.8b, v8.8b + ushll v18.8h, v18.8b, #7 + sli v18.8h, v18.8h, #1 + ushll v19.8h, v19.8b, #7 + sli v19.8h, v19.8h, #1 + sri v14.8h, v18.8h, #5 + umull v23.8h, v24.8b, v11.8b + sri v14.8h, v19.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] cache_preload 8, 8 - - vsli.u16 q2, q2, #5 - vrshr.u16 q8, q0, #8 - vrshr.u16 q9, q1, #8 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q6, q10 - vraddhn.u16 d3, q7, q11 - vsri.u8 d6, d6, #5 - vsri.u8 d7, d7, #6 - vmvn.8 d3, d3 - vshrn.u16 d30, q2, #2 - vst1.16 {d28, d29}, [DST_W, :128]! - vmull.u8 q8, d3, d6 - vmull.u8 q9, d3, d7 - vmull.u8 q10, d3, d30 + + sli v4.8h, v4.8h, #5 + urshr v16.8h, v20.8h, #8 + urshr v17.8h, v21.8h, #8 + urshr v18.8h, v22.8h, #8 + urshr v19.8h, v23.8h, #8 + raddhn v0.8b, v20.8h, v16.8h + raddhn v1.8b, v21.8h, v17.8h + raddhn v2.8b, v22.8h, v18.8h + raddhn v3.8b, v23.8h, v19.8h + sri v25.8b, v25.8b, #5 + sri v26.8b, v26.8b, #6 + mvn v3.8b, v3.8b + shrn v30.8b, v4.8h, #2 + st1 {v14.8h}, [DST_W], #16 + umull v18.8h, v3.8b, v25.8b + umull v19.8h, v3.8b, v26.8b + umull v20.8h, v3.8b, v30.8b +#else + pixman_composite_over_8888_8_0565_process_pixblock_tail + st1 {v28.4h, v29.4h}, [DST_W], #16 + ld1 {v4.4h, v5.4h}, [DST_R], #16 + fetch_mask_pixblock + fetch_src_pixblock + pixman_composite_over_8888_8_0565_process_pixblock_head +#endif .endm generate_composite_function \ @@ -954,17 +1121,14 @@ generate_composite_function \ * without introducing any problems. 
*/ .macro pixman_composite_over_n_8_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] + mov v11.s[0], w4 + dup v8.8b, v11.b[0] + dup v9.8b, v11.b[1] + dup v10.8b, v11.b[2] + dup v11.8b, v11.b[3] .endm .macro pixman_composite_over_n_8_0565_cleanup - vpop {d8-d15} .endm generate_composite_function \ @@ -976,19 +1140,20 @@ generate_composite_function \ pixman_composite_over_n_8_0565_cleanup, \ pixman_composite_over_8888_8_0565_process_pixblock_head, \ pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 24 /* mask_basereg */ /******************************************************************************/ .macro pixman_composite_over_8888_n_0565_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vpush {d8-d15} - vld1.32 {d24[0]}, [DUMMY] - vdup.8 d24, d24[3] + mov v24.s[0], w6 + dup v24.8b, v24.b[3] .endm .macro pixman_composite_over_8888_n_0565_cleanup - vpop {d8-d15} .endm generate_composite_function \ @@ -1015,7 +1180,7 @@ generate_composite_function \ .endm .macro pixman_composite_src_0565_0565_process_pixblock_tail_head - vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32 fetch_src_pixblock cache_preload 16, 16 .endm @@ -1044,17 +1209,15 @@ generate_composite_function \ .endm .macro pixman_composite_src_n_8_process_pixblock_tail_head - vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! + st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], 32 .endm .macro pixman_composite_src_n_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #8 - vsli.u64 d0, d0, #16 - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 + mov v0.s[0], w4 + dup v3.8b, v0.b[0] + dup v2.8b, v0.b[0] + dup v1.8b, v0.b[0] + dup v0.8b, v0.b[0] .endm .macro pixman_composite_src_n_8_cleanup @@ -1084,16 +1247,15 @@ generate_composite_function \ .endm .macro pixman_composite_src_n_0565_process_pixblock_tail_head - vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32 .endm .macro pixman_composite_src_n_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #16 - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 + mov v0.s[0], w4 + dup v3.4h, v0.h[0] + dup v2.4h, v0.h[0] + dup v1.4h, v0.h[0] + dup v0.4h, v0.h[0] .endm .macro pixman_composite_src_n_0565_cleanup @@ -1123,15 +1285,15 @@ generate_composite_function \ .endm .macro pixman_composite_src_n_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! + st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 .endm .macro pixman_composite_src_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 + mov v0.s[0], w4 + dup v3.2s, v0.s[0] + dup v2.2s, v0.s[0] + dup v1.2s, v0.s[0] + dup v0.2s, v0.s[0] .endm .macro pixman_composite_src_n_8888_cleanup @@ -1161,7 +1323,7 @@ generate_composite_function \ .endm .macro pixman_composite_src_8888_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 
+ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 fetch_src_pixblock cache_preload 8, 8 .endm @@ -1184,24 +1346,29 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_src_x888_8888_process_pixblock_head - vorr q0, q0, q2 - vorr q1, q1, q2 + orr v0.8b, v0.8b, v4.8b + orr v1.8b, v1.8b, v4.8b + orr v2.8b, v2.8b, v4.8b + orr v3.8b, v3.8b, v4.8b .endm .macro pixman_composite_src_x888_8888_process_pixblock_tail .endm .macro pixman_composite_src_x888_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! + st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 fetch_src_pixblock - vorr q0, q0, q2 - vorr q1, q1, q2 + orr v0.8b, v0.8b, v4.8b + orr v1.8b, v1.8b, v4.8b + orr v2.8b, v2.8b, v4.8b + orr v3.8b, v3.8b, v4.8b cache_preload 8, 8 .endm .macro pixman_composite_src_x888_8888_init - vmov.u8 q2, #0xFF - vshl.u32 q2, q2, #24 + mov w20, #0xFF + dup v4.8b, w20 + shl v4.2s, v4.2s, #24 .endm generate_composite_function \ @@ -1222,60 +1389,72 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_src_n_8_8888_process_pixblock_head - /* expecting solid source in {d0, d1, d2, d3} */ - /* mask is in d24 (d25, d26, d27 are unused) */ + /* expecting solid source in {v0, v1, v2, v3} */ + /* mask is in v24 (v25, v26, v27 are unused) */ /* in */ - vmull.u8 q8, d24, d0 - vmull.u8 q9, d24, d1 - vmull.u8 q10, d24, d2 - vmull.u8 q11, d24, d3 - vrsra.u16 q8, q8, #8 - vrsra.u16 q9, q9, #8 - vrsra.u16 q10, q10, #8 - vrsra.u16 q11, q11, #8 + umull v8.8h, v24.8b, v0.8b + umull v9.8h, v24.8b, v1.8b + umull v10.8h, v24.8b, v2.8b + umull v11.8h, v24.8b, v3.8b + ursra v8.8h, v8.8h, #8 + ursra v9.8h, v9.8h, #8 + ursra v10.8h, v10.8h, #8 + ursra v11.8h, v11.8h, #8 .endm .macro pixman_composite_src_n_8_8888_process_pixblock_tail - vrshrn.u16 d28, q8, #8 - vrshrn.u16 d29, q9, #8 - vrshrn.u16 d30, q10, #8 - vrshrn.u16 d31, q11, #8 + rshrn v28.8b, v8.8h, #8 + rshrn v29.8b, v9.8h, #8 + rshrn v30.8b, v10.8h, #8 + rshrn v31.8b, v11.8h, #8 .endm .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head fetch_mask_pixblock PF add PF_X, PF_X, #8 - vrshrn.u16 d28, q8, #8 + rshrn v28.8b, v8.8h, #8 PF tst PF_CTL, #0x0F - vrshrn.u16 d29, q9, #8 - PF addne PF_X, PF_X, #8 - vrshrn.u16 d30, q10, #8 - PF subne PF_CTL, PF_CTL, #1 - vrshrn.u16 d31, q11, #8 + rshrn v29.8b, v9.8h, #8 + PF beq 10f + PF add PF_X, PF_X, #8 +10: + rshrn v30.8b, v10.8h, #8 + PF beq 10f + PF sub PF_CTL, PF_CTL, #1 +10: + rshrn v31.8b, v11.8h, #8 PF cmp PF_X, ORIG_W - vmull.u8 q8, d24, d0 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] - vmull.u8 q9, d24, d1 - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q10, d24, d2 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q11, d24, d3 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
- vrsra.u16 q8, q8, #8 - vrsra.u16 q9, q9, #8 - vrsra.u16 q10, q10, #8 - vrsra.u16 q11, q11, #8 + umull v8.8h, v24.8b, v0.8b + PF lsl DUMMY, PF_X, #mask_bpp_shift + PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] + umull v9.8h, v24.8b, v1.8b + PF ble 10f + PF sub PF_X, PF_X, ORIG_W +10: + umull v10.8h, v24.8b, v2.8b + PF ble 10f + PF subs PF_CTL, PF_CTL, #0x10 +10: + umull v11.8h, v24.8b, v3.8b + PF ble 10f + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb DUMMY, [PF_MASK, DUMMY] + PF add PF_MASK, PF_MASK, #1 +10: + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + ursra v8.8h, v8.8h, #8 + ursra v9.8h, v9.8h, #8 + ursra v10.8h, v10.8h, #8 + ursra v11.8h, v11.8h, #8 .endm .macro pixman_composite_src_n_8_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] + mov v3.s[0], w4 + dup v0.8b, v3.b[0] + dup v1.8b, v3.b[1] + dup v2.8b, v3.b[2] + dup v3.8b, v3.b[3] .endm .macro pixman_composite_src_n_8_8888_cleanup @@ -1295,53 +1474,65 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_src_n_8_8_process_pixblock_head - vmull.u8 q0, d24, d16 - vmull.u8 q1, d25, d16 - vmull.u8 q2, d26, d16 - vmull.u8 q3, d27, d16 - vrsra.u16 q0, q0, #8 - vrsra.u16 q1, q1, #8 - vrsra.u16 q2, q2, #8 - vrsra.u16 q3, q3, #8 + umull v0.8h, v24.8b, v16.8b + umull v1.8h, v25.8b, v16.8b + umull v2.8h, v26.8b, v16.8b + umull v3.8h, v27.8b, v16.8b + ursra v0.8h, v0.8h, #8 + ursra v1.8h, v1.8h, #8 + ursra v2.8h, v2.8h, #8 + ursra v3.8h, v3.8h, #8 .endm .macro pixman_composite_src_n_8_8_process_pixblock_tail - vrshrn.u16 d28, q0, #8 - vrshrn.u16 d29, q1, #8 - vrshrn.u16 d30, q2, #8 - vrshrn.u16 d31, q3, #8 + rshrn v28.8b, v0.8h, #8 + rshrn v29.8b, v1.8h, #8 + rshrn v30.8b, v2.8h, #8 + rshrn v31.8b, v3.8h, #8 .endm .macro pixman_composite_src_n_8_8_process_pixblock_tail_head fetch_mask_pixblock PF add PF_X, PF_X, #8 - vrshrn.u16 d28, q0, #8 + rshrn v28.8b, v0.8h, #8 PF tst PF_CTL, #0x0F - vrshrn.u16 d29, q1, #8 - PF addne PF_X, PF_X, #8 - vrshrn.u16 d30, q2, #8 - PF subne PF_CTL, PF_CTL, #1 - vrshrn.u16 d31, q3, #8 + rshrn v29.8b, v1.8h, #8 + PF beq 10f + PF add PF_X, PF_X, #8 +10: + rshrn v30.8b, v2.8h, #8 + PF beq 10f + PF sub PF_CTL, PF_CTL, #1 +10: + rshrn v31.8b, v3.8h, #8 PF cmp PF_X, ORIG_W - vmull.u8 q0, d24, d16 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] - vmull.u8 q1, d25, d16 - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q2, d26, d16 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q3, d27, d16 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 
- vrsra.u16 q0, q0, #8 - vrsra.u16 q1, q1, #8 - vrsra.u16 q2, q2, #8 - vrsra.u16 q3, q3, #8 + umull v0.8h, v24.8b, v16.8b + PF lsl DUMMY, PF_X, mask_bpp_shift + PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] + umull v1.8h, v25.8b, v16.8b + PF ble 10f + PF sub PF_X, PF_X, ORIG_W +10: + umull v2.8h, v26.8b, v16.8b + PF ble 10f + PF subs PF_CTL, PF_CTL, #0x10 +10: + umull v3.8h, v27.8b, v16.8b + PF ble 10f + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb DUMMY, [PF_MASK, DUMMY] + PF add PF_MASK, PF_MASK, #1 +10: + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + ursra v0.8h, v0.8h, #8 + ursra v1.8h, v1.8h, #8 + ursra v2.8h, v2.8h, #8 + ursra v3.8h, v3.8h, #8 .endm .macro pixman_composite_src_n_8_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d16[0]}, [DUMMY] - vdup.8 d16, d16[3] + mov v16.s[0], w4 + dup v16.8b, v16.b[3] .endm .macro pixman_composite_src_n_8_8_cleanup @@ -1361,103 +1552,122 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_over_n_8_8888_process_pixblock_head - /* expecting deinterleaved source data in {d8, d9, d10, d11} */ - /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ - /* and destination data in {d4, d5, d6, d7} */ - /* mask is in d24 (d25, d26, d27 are unused) */ + /* expecting deinterleaved source data in {v8, v9, v10, v11} */ + /* v8 - blue, v9 - green, v10 - red, v11 - alpha */ + /* and destination data in {v4, v5, v6, v7} */ + /* mask is in v24 (v25, v26, v27 are unused) */ /* in */ - vmull.u8 q6, d24, d8 - vmull.u8 q7, d24, d9 - vmull.u8 q8, d24, d10 - vmull.u8 q9, d24, d11 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vrshr.u16 q12, q8, #8 - vrshr.u16 q13, q9, #8 - vraddhn.u16 d0, q6, q10 - vraddhn.u16 d1, q7, q11 - vraddhn.u16 d2, q8, q12 - vraddhn.u16 d3, q9, q13 - vmvn.8 d25, d3 /* get inverted alpha */ - /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ - /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ + umull v12.8h, v24.8b, v8.8b + umull v13.8h, v24.8b, v9.8b + umull v14.8h, v24.8b, v10.8b + umull v15.8h, v24.8b, v11.8b + urshr v16.8h, v12.8h, #8 + urshr v17.8h, v13.8h, #8 + urshr v18.8h, v14.8h, #8 + urshr v19.8h, v15.8h, #8 + raddhn v0.8b, v12.8h, v16.8h + raddhn v1.8b, v13.8h, v17.8h + raddhn v2.8b, v14.8h, v18.8h + raddhn v3.8b, v15.8h, v19.8h + mvn v25.8b, v3.8b /* get inverted alpha */ + /* source: v0 - blue, v1 - green, v2 - red, v3 - alpha */ + /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */ /* now do alpha blending */ - vmull.u8 q8, d25, d4 - vmull.u8 q9, d25, d5 - vmull.u8 q10, d25, d6 - vmull.u8 q11, d25, d7 + umull v12.8h, v25.8b, v4.8b + umull v13.8h, v25.8b, v5.8b + umull v14.8h, v25.8b, v6.8b + umull v15.8h, v25.8b, v7.8b .endm .macro pixman_composite_over_n_8_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q6, q10, #8 - vrshr.u16 q7, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q6, q10 - vraddhn.u16 d31, q7, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 + urshr v16.8h, v12.8h, #8 + urshr v17.8h, v13.8h, #8 + urshr v18.8h, v14.8h, #8 + urshr v19.8h, v15.8h, #8 + raddhn v28.8b, v16.8h, v12.8h + raddhn v29.8b, v17.8h, v13.8h + raddhn v30.8b, v18.8h, v14.8h + raddhn v31.8b, v19.8h, v15.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b .endm .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - vld4.8 {d4, d5, d6, d7}, 
[DST_R, :128]! - vrshr.u16 q15, q9, #8 + urshr v16.8h, v12.8h, #8 + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v17.8h, v13.8h, #8 fetch_mask_pixblock - vrshr.u16 q6, q10, #8 + urshr v18.8h, v14.8h, #8 PF add PF_X, PF_X, #8 - vrshr.u16 q7, q11, #8 + urshr v19.8h, v15.8h, #8 PF tst PF_CTL, #0x0F - vraddhn.u16 d28, q14, q8 - PF addne PF_X, PF_X, #8 - vraddhn.u16 d29, q15, q9 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d30, q6, q10 + raddhn v28.8b, v16.8h, v12.8h + PF beq 10f + PF add PF_X, PF_X, #8 +10: + raddhn v29.8b, v17.8h, v13.8h + PF beq 10f + PF sub PF_CTL, PF_CTL, #1 +10: + raddhn v30.8b, v18.8h, v14.8h PF cmp PF_X, ORIG_W - vraddhn.u16 d31, q7, q11 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vmull.u8 q6, d24, d8 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] - vmull.u8 q7, d24, d9 - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d24, d10 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d24, d11 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q14, q0, q14 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - vqadd.u8 q15, q1, q15 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vrshr.u16 q12, q8, #8 - vrshr.u16 q13, q9, #8 - vraddhn.u16 d0, q6, q10 - vraddhn.u16 d1, q7, q11 - vraddhn.u16 d2, q8, q12 - vraddhn.u16 d3, q9, q13 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - vmvn.8 d25, d3 - vmull.u8 q8, d25, d4 - vmull.u8 q9, d25, d5 - vmull.u8 q10, d25, d6 - vmull.u8 q11, d25, d7 + raddhn v31.8b, v19.8h, v15.8h + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + umull v16.8h, v24.8b, v8.8b + PF lsl DUMMY, PF_X, #mask_bpp_shift + PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] + umull v17.8h, v24.8b, v9.8b + PF ble 10f + PF sub PF_X, PF_X, ORIG_W +10: + umull v18.8h, v24.8b, v10.8b + PF ble 10f + PF subs PF_CTL, PF_CTL, #0x10 +10: + umull v19.8h, v24.8b, v11.8b + PF ble 10f + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +10: + uqadd v28.8b, v0.8b, v28.8b + PF ble 10f + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb DUMMY, [PF_MASK, DUMMY] + PF add PF_MASK, PF_MASK, #1 +10: + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + urshr v12.8h, v16.8h, #8 + urshr v13.8h, v17.8h, #8 + urshr v14.8h, v18.8h, #8 + urshr v15.8h, v19.8h, #8 + raddhn v0.8b, v16.8h, v12.8h + raddhn v1.8b, v17.8h, v13.8h + raddhn v2.8b, v18.8h, v14.8h + raddhn v3.8b, v19.8h, v15.8h + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + mvn v25.8b, v3.8b + umull v12.8h, v25.8b, v4.8b + umull v13.8h, v25.8b, v5.8b + umull v14.8h, v25.8b, v6.8b + umull v15.8h, v25.8b, v7.8b .endm .macro pixman_composite_over_n_8_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] + mov v11.s[0], w4 + dup v8.8b, v11.b[0] + dup v9.8b, v11.b[1] + dup v10.8b, v11.b[2] + dup v11.8b, v11.b[3] .endm .macro pixman_composite_over_n_8_8888_cleanup - vpop {d8-d15} .endm generate_composite_function \ @@ -1474,58 +1684,59 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_over_n_8_8_process_pixblock_head - vmull.u8 q0, d24, d8 - vmull.u8 q1, d25, d8 - vmull.u8 q6, d26, d8 - vmull.u8 q7, d27, d8 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - 
vraddhn.u16 d3, q7, q13 - vmvn.8 q12, q0 - vmvn.8 q13, q1 - vmull.u8 q8, d24, d4 - vmull.u8 q9, d25, d5 - vmull.u8 q10, d26, d6 - vmull.u8 q11, d27, d7 + umull v0.8h, v24.8b, v8.8b + umull v1.8h, v25.8b, v8.8b + umull v2.8h, v26.8b, v8.8b + umull v3.8h, v27.8b, v8.8b + urshr v10.8h, v0.8h, #8 + urshr v11.8h, v1.8h, #8 + urshr v12.8h, v2.8h, #8 + urshr v13.8h, v3.8h, #8 + raddhn v0.8b, v0.8h, v10.8h + raddhn v1.8b, v1.8h, v11.8h + raddhn v2.8b, v2.8h, v12.8h + raddhn v3.8b, v3.8h, v13.8h + mvn v24.8b, v0.8b + mvn v25.8b, v1.8b + mvn v26.8b, v2.8b + mvn v27.8b, v3.8b + umull v10.8h, v24.8b, v4.8b + umull v11.8h, v25.8b, v5.8b + umull v12.8h, v26.8b, v6.8b + umull v13.8h, v27.8b, v7.8b .endm .macro pixman_composite_over_n_8_8_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 + urshr v14.8h, v10.8h, #8 + urshr v15.8h, v11.8h, #8 + urshr v16.8h, v12.8h, #8 + urshr v17.8h, v13.8h, #8 + raddhn v28.8b, v14.8h, v10.8h + raddhn v29.8b, v15.8h, v11.8h + raddhn v30.8b, v16.8h, v12.8h + raddhn v31.8b, v17.8h, v13.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b .endm /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_over_n_8_8_process_pixblock_tail_head - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 pixman_composite_over_n_8_8_process_pixblock_tail fetch_mask_pixblock cache_preload 32, 32 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 pixman_composite_over_n_8_8_process_pixblock_head .endm .macro pixman_composite_over_n_8_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d8[0]}, [DUMMY] - vdup.8 d8, d8[3] + mov v8.s[0], w4 + dup v8.8b, v8.b[3] .endm .macro pixman_composite_over_n_8_8_cleanup - vpop {d8-d15} .endm generate_composite_function \ @@ -1545,91 +1756,93 @@ generate_composite_function \ /* * 'combine_mask_ca' replacement * - * input: solid src (n) in {d8, d9, d10, d11} - * dest in {d4, d5, d6, d7 } - * mask in {d24, d25, d26, d27} - * output: updated src in {d0, d1, d2, d3 } - * updated mask in {d24, d25, d26, d3 } + * input: solid src (n) in {v8, v9, v10, v11} + * dest in {v4, v5, v6, v7 } + * mask in {v24, v25, v26, v27} + * output: updated src in {v0, v1, v2, v3 } + * updated mask in {v24, v25, v26, v3 } */ - vmull.u8 q0, d24, d8 - vmull.u8 q1, d25, d9 - vmull.u8 q6, d26, d10 - vmull.u8 q7, d27, d11 - vmull.u8 q9, d11, d25 - vmull.u8 q12, d11, d24 - vmull.u8 q13, d11, d26 - vrshr.u16 q8, q0, #8 - vrshr.u16 q10, q1, #8 - vrshr.u16 q11, q6, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q10 - vraddhn.u16 d2, q6, q11 - vrshr.u16 q11, q12, #8 - vrshr.u16 q8, q9, #8 - vrshr.u16 q6, q13, #8 - vrshr.u16 q10, q7, #8 - vraddhn.u16 d24, q12, q11 - vraddhn.u16 d25, q9, q8 - vraddhn.u16 d26, q13, q6 - vraddhn.u16 d3, q7, q10 + umull v0.8h, v24.8b, v8.8b + umull v1.8h, v25.8b, v9.8b + umull v2.8h, v26.8b, v10.8b + umull v3.8h, v27.8b, v11.8b + umull v12.8h, v11.8b, v25.8b + umull v13.8h, v11.8b, v24.8b + umull v14.8h, v11.8b, v26.8b + urshr v15.8h, v0.8h, #8 + urshr v16.8h, v1.8h, #8 + urshr v17.8h, v2.8h, #8 + raddhn v0.8b, v0.8h, v15.8h + raddhn v1.8b, v1.8h, v16.8h + raddhn v2.8b, v2.8h, v17.8h + urshr v15.8h, v13.8h, #8 + urshr v16.8h, v12.8h, #8 + 
urshr v17.8h, v14.8h, #8 + urshr v18.8h, v3.8h, #8 + raddhn v24.8b, v13.8h, v15.8h + raddhn v25.8b, v12.8h, v16.8h + raddhn v26.8b, v14.8h, v17.8h + raddhn v3.8b, v3.8h, v18.8h /* * 'combine_over_ca' replacement * - * output: updated dest in {d28, d29, d30, d31} + * output: updated dest in {v28, v29, v30, v31} */ - vmvn.8 q12, q12 - vmvn.8 d26, d26 - vmull.u8 q8, d24, d4 - vmull.u8 q9, d25, d5 - vmvn.8 d27, d3 - vmull.u8 q10, d26, d6 - vmull.u8 q11, d27, d7 + mvn v24.8b, v24.8b + mvn v25.8b, v25.8b + mvn v26.8b, v26.8b + mvn v27.8b, v3.8b + umull v12.8h, v24.8b, v4.8b + umull v13.8h, v25.8b, v5.8b + umull v14.8h, v26.8b, v6.8b + umull v15.8h, v27.8b, v7.8b .endm .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail /* ... continue 'combine_over_ca' replacement */ - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q6, q10, #8 - vrshr.u16 q7, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q6, q10 - vraddhn.u16 d31, q7, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 + urshr v16.8h, v12.8h, #8 + urshr v17.8h, v13.8h, #8 + urshr v18.8h, v14.8h, #8 + urshr v19.8h, v15.8h, #8 + raddhn v28.8b, v16.8h, v12.8h + raddhn v29.8b, v17.8h, v13.8h + raddhn v30.8b, v18.8h, v14.8h + raddhn v31.8b, v19.8h, v15.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b .endm .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q6, q10, #8 - vrshr.u16 q7, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q6, q10 - vraddhn.u16 d31, q7, q11 + urshr v16.8h, v12.8h, #8 + urshr v17.8h, v13.8h, #8 + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v18.8h, v14.8h, #8 + urshr v19.8h, v15.8h, #8 + raddhn v28.8b, v16.8h, v12.8h + raddhn v29.8b, v17.8h, v13.8h + raddhn v30.8b, v18.8h, v14.8h + raddhn v31.8b, v19.8h, v15.8h fetch_mask_pixblock - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b cache_preload 8, 8 pixman_composite_over_n_8888_8888_ca_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 .endm .macro pixman_composite_over_n_8888_8888_ca_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] + mov v13.s[0], w4 + dup v8.8b, v13.b[0] + dup v9.8b, v13.b[1] + dup v10.8b, v13.b[2] + dup v11.8b, v13.b[3] .endm .macro pixman_composite_over_n_8888_8888_ca_cleanup - vpop {d8-d15} .endm generate_composite_function \ @@ -1649,156 +1862,170 @@ generate_composite_function \ /* * 'combine_mask_ca' replacement * - * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] - * mask in {d24, d25, d26} [B, G, R] - * output: updated src in {d0, d1, d2 } [B, G, R] - * updated mask in {d24, d25, d26} [B, G, R] + * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A] + * mask in {v24, v25, v26} [B, G, R] + * output: updated src in {v0, v1, v2 } [B, G, R] + * updated mask in {v24, v25, v26} [B, G, R] */ - vmull.u8 q0, d24, d8 - vmull.u8 q1, d25, d9 - vmull.u8 q6, d26, d10 - vmull.u8 q9, d11, d25 - vmull.u8 q12, d11, d24 - vmull.u8 q13, d11, d26 - vrshr.u16 q8, q0, #8 - vrshr.u16 q10, q1, #8 - vrshr.u16 q11, q6, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q10 - vraddhn.u16 d2, q6, q11 - vrshr.u16 q11, q12, #8 - vrshr.u16 q8, q9, #8 - vrshr.u16 q6, q13, #8 - vraddhn.u16 d24, q12, q11 - vraddhn.u16 d25, q9, q8 + umull v0.8h, v24.8b, v8.8b + umull v1.8h, v25.8b, v9.8b + umull v2.8h, v26.8b, v10.8b + umull v12.8h, v11.8b, v24.8b + umull v13.8h, v11.8b, v25.8b + umull v14.8h, v11.8b, v26.8b + urshr v15.8h, v0.8h, #8 + urshr v16.8h, v1.8h, #8 + urshr v17.8h, v2.8h, #8 + raddhn v0.8b, v0.8h, v15.8h + raddhn v1.8b, v1.8h, v16.8h + raddhn v2.8b, v2.8h, v17.8h + urshr v19.8h, v12.8h, #8 + urshr v20.8h, v13.8h, #8 + urshr v21.8h, v14.8h, #8 + raddhn v24.8b, v12.8h, v19.8h + raddhn v25.8b, v13.8h, v20.8h /* - * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format - * and put data into d16 - blue, d17 - green, d18 - red + * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format + * and put data into v16 - blue, v17 - green, v18 - red */ - vshrn.u16 d17, q2, #3 - vshrn.u16 d18, q2, #8 - vraddhn.u16 d26, q13, q6 - vsli.u16 q2, q2, #5 - vsri.u8 d18, d18, #5 - vsri.u8 d17, d17, #6 + mov v4.d[1], v5.d[0] + shrn v17.8b, v4.8h, #3 + shrn v18.8b, v4.8h, #8 + raddhn v26.8b, v14.8h, v21.8h + sli v4.8h, v4.8h, #5 + sri v18.8b, v18.8b, #5 + sri v17.8b, v17.8b, #6 /* * 'combine_over_ca' replacement * - * output: updated dest in d16 - blue, d17 - green, d18 - red + * output: updated dest in v16 - blue, v17 - green, v18 - red */ - vmvn.8 q12, q12 - vshrn.u16 d16, q2, #2 - vmvn.8 d26, d26 - vmull.u8 q6, d16, d24 - vmull.u8 q7, d17, d25 - vmull.u8 q11, d18, d26 + mvn v24.8b, v24.8b + mvn v25.8b, v25.8b + shrn v16.8b, v4.8h, #2 + mvn v26.8b, v26.8b + umull v5.8h, v16.8b, v24.8b + umull v6.8h, v17.8b, v25.8b + umull v7.8h, v18.8b, v26.8b .endm .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail /* ... 
continue 'combine_over_ca' replacement */ - vrshr.u16 q10, q6, #8 - vrshr.u16 q14, q7, #8 - vrshr.u16 q15, q11, #8 - vraddhn.u16 d16, q10, q6 - vraddhn.u16 d17, q14, q7 - vraddhn.u16 d18, q15, q11 - vqadd.u8 q8, q0, q8 - vqadd.u8 d18, d2, d18 + urshr v13.8h, v5.8h, #8 + urshr v14.8h, v6.8h, #8 + urshr v15.8h, v7.8h, #8 + raddhn v16.8b, v13.8h, v5.8h + raddhn v17.8b, v14.8h, v6.8h + raddhn v18.8b, v15.8h, v7.8h + uqadd v16.8b, v0.8b, v16.8b + uqadd v17.8b, v1.8b, v17.8b + uqadd v18.8b, v2.8b, v18.8b /* - * convert the results in d16, d17, d18 to r5g6b5 and store - * them into {d28, d29} + * convert the results in v16, v17, v18 to r5g6b5 and store + * them into {v14} */ - vshll.u8 q14, d18, #8 - vshll.u8 q10, d17, #8 - vshll.u8 q15, d16, #8 - vsri.u16 q14, q10, #5 - vsri.u16 q14, q15, #11 + ushll v14.8h, v18.8b, #7 + sli v14.8h, v14.8h, #1 + ushll v12.8h, v17.8b, #7 + sli v12.8h, v12.8h, #1 + ushll v13.8h, v16.8b, #7 + sli v13.8h, v13.8h, #1 + sri v14.8h, v12.8h, #5 + sri v14.8h, v13.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] .endm .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head fetch_mask_pixblock - vrshr.u16 q10, q6, #8 - vrshr.u16 q14, q7, #8 - vld1.16 {d4, d5}, [DST_R, :128]! - vrshr.u16 q15, q11, #8 - vraddhn.u16 d16, q10, q6 - vraddhn.u16 d17, q14, q7 - vraddhn.u16 d22, q15, q11 + urshr v13.8h, v5.8h, #8 + urshr v14.8h, v6.8h, #8 + ld1 {v4.8h}, [DST_R], #16 + urshr v15.8h, v7.8h, #8 + raddhn v16.8b, v13.8h, v5.8h + raddhn v17.8b, v14.8h, v6.8h + raddhn v18.8b, v15.8h, v7.8h + mov v5.d[0], v4.d[1] /* process_pixblock_head */ /* * 'combine_mask_ca' replacement * - * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] - * mask in {d24, d25, d26} [B, G, R] - * output: updated src in {d0, d1, d2 } [B, G, R] - * updated mask in {d24, d25, d26} [B, G, R] + * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A] + * mask in {v24, v25, v26} [B, G, R] + * output: updated src in {v0, v1, v2 } [B, G, R] + * updated mask in {v24, v25, v26} [B, G, R] */ - vmull.u8 q6, d26, d10 - vqadd.u8 q8, q0, q8 - vmull.u8 q0, d24, d8 - vqadd.u8 d22, d2, d22 - vmull.u8 q1, d25, d9 + uqadd v16.8b, v0.8b, v16.8b + uqadd v17.8b, v1.8b, v17.8b + uqadd v18.8b, v2.8b, v18.8b + umull v0.8h, v24.8b, v8.8b + umull v1.8h, v25.8b, v9.8b + umull v2.8h, v26.8b, v10.8b /* - * convert the result in d16, d17, d22 to r5g6b5 and store - * it into {d28, d29} + * convert the result in v16, v17, v18 to r5g6b5 and store + * it into {v14} */ - vshll.u8 q14, d22, #8 - vshll.u8 q10, d17, #8 - vshll.u8 q15, d16, #8 - vmull.u8 q9, d11, d25 - vsri.u16 q14, q10, #5 - vmull.u8 q12, d11, d24 - vmull.u8 q13, d11, d26 - vsri.u16 q14, q15, #11 + ushll v14.8h, v18.8b, #7 + sli v14.8h, v14.8h, #1 + ushll v18.8h, v16.8b, #7 + sli v18.8h, v18.8h, #1 + ushll v19.8h, v17.8b, #7 + sli v19.8h, v19.8h, #1 + umull v12.8h, v11.8b, v24.8b + sri v14.8h, v19.8h, #5 + umull v13.8h, v11.8b, v25.8b + umull v15.8h, v11.8b, v26.8b + sri v14.8h, v18.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] cache_preload 8, 8 - vrshr.u16 q8, q0, #8 - vrshr.u16 q10, q1, #8 - vrshr.u16 q11, q6, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q10 - vraddhn.u16 d2, q6, q11 - vrshr.u16 q11, q12, #8 - vrshr.u16 q8, q9, #8 - vrshr.u16 q6, q13, #8 - vraddhn.u16 d24, q12, q11 - vraddhn.u16 d25, q9, q8 + urshr v16.8h, v0.8h, #8 + urshr v17.8h, v1.8h, #8 + urshr v18.8h, v2.8h, #8 + raddhn v0.8b, v0.8h, v16.8h + raddhn v1.8b, v1.8h, v17.8h + raddhn v2.8b, v2.8h, v18.8h + urshr v19.8h, v12.8h, #8 + urshr v20.8h, v13.8h, #8 + urshr 
v21.8h, v15.8h, #8 + raddhn v24.8b, v12.8h, v19.8h + raddhn v25.8b, v13.8h, v20.8h /* - * convert 8 r5g6b5 pixel data from {d4, d5} to planar - * 8-bit format and put data into d16 - blue, d17 - green, - * d18 - red + * convert 8 r5g6b5 pixel data from {v4, v5} to planar + * 8-bit format and put data into v16 - blue, v17 - green, + * v18 - red */ - vshrn.u16 d17, q2, #3 - vshrn.u16 d18, q2, #8 - vraddhn.u16 d26, q13, q6 - vsli.u16 q2, q2, #5 - vsri.u8 d17, d17, #6 - vsri.u8 d18, d18, #5 + mov v4.d[1], v5.d[0] + shrn v17.8b, v4.8h, #3 + shrn v18.8b, v4.8h, #8 + raddhn v26.8b, v15.8h, v21.8h + sli v4.8h, v4.8h, #5 + sri v17.8b, v17.8b, #6 + sri v18.8b, v18.8b, #5 /* * 'combine_over_ca' replacement * - * output: updated dest in d16 - blue, d17 - green, d18 - red + * output: updated dest in v16 - blue, v17 - green, v18 - red */ - vmvn.8 q12, q12 - vshrn.u16 d16, q2, #2 - vmvn.8 d26, d26 - vmull.u8 q7, d17, d25 - vmull.u8 q6, d16, d24 - vmull.u8 q11, d18, d26 - vst1.16 {d28, d29}, [DST_W, :128]! + mvn v24.8b, v24.8b + mvn v25.8b, v25.8b + shrn v16.8b, v4.8h, #2 + mvn v26.8b, v26.8b + umull v5.8h, v16.8b, v24.8b + umull v6.8h, v17.8b, v25.8b + umull v7.8h, v18.8b, v26.8b + st1 {v14.8h}, [DST_W], #16 .endm .macro pixman_composite_over_n_8888_0565_ca_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] + mov v13.s[0], w4 + dup v8.8b, v13.b[0] + dup v9.8b, v13.b[1] + dup v10.8b, v13.b[2] + dup v11.8b, v13.b[3] .endm .macro pixman_composite_over_n_8888_0565_ca_cleanup - vpop {d8-d15} .endm generate_composite_function \ @@ -1815,37 +2042,36 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_in_n_8_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* and destination data in {d4, d5, d6, d7} */ - vmull.u8 q8, d4, d3 - vmull.u8 q9, d5, d3 - vmull.u8 q10, d6, d3 - vmull.u8 q11, d7, d3 + /* expecting source data in {v0, v1, v2, v3} */ + /* and destination data in {v4, v5, v6, v7} */ + umull v8.8h, v4.8b, v3.8b + umull v9.8h, v5.8b, v3.8b + umull v10.8h, v6.8b, v3.8b + umull v11.8h, v7.8b, v3.8b .endm .macro pixman_composite_in_n_8_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q8, q14 - vraddhn.u16 d29, q9, q15 - vraddhn.u16 d30, q10, q12 - vraddhn.u16 d31, q11, q13 + urshr v14.8h, v8.8h, #8 + urshr v15.8h, v9.8h, #8 + urshr v12.8h, v10.8h, #8 + urshr v13.8h, v11.8h, #8 + raddhn v28.8b, v8.8h, v14.8h + raddhn v29.8b, v9.8h, v15.8h + raddhn v30.8b, v10.8h, v12.8h + raddhn v31.8b, v11.8h, v13.8h .endm .macro pixman_composite_in_n_8_process_pixblock_tail_head pixman_composite_in_n_8_process_pixblock_tail - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 cache_preload 32, 32 pixman_composite_in_n_8_process_pixblock_head - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 .endm .macro pixman_composite_in_n_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d3, d3[3] + mov v3.s[0], w4 + dup v3.8b, v3.b[3] .endm .macro pixman_composite_in_n_8_cleanup @@ -1867,24 +2093,26 @@ generate_composite_function \ 24 /* mask_basereg */ .macro pixman_composite_add_n_8_8_process_pixblock_head - /* expecting source data in {d8, d9, d10, d11} */ - /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ - /* and destination data in {d4, d5, d6, d7} */ - /* mask is in d24, d25, d26, d27 */ - vmull.u8 q0, d24, d11 - vmull.u8 q1, d25, d11 - vmull.u8 q6, d26, d11 - vmull.u8 q7, d27, d11 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 + /* expecting source data in {v8, v9, v10, v11} */ + /* v8 - blue, v9 - green, v10 - red, v11 - alpha */ + /* and destination data in {v4, v5, v6, v7} */ + /* mask is in v24, v25, v26, v27 */ + umull v0.8h, v24.8b, v11.8b + umull v1.8h, v25.8b, v11.8b + umull v2.8h, v26.8b, v11.8b + umull v3.8h, v27.8b, v11.8b + urshr v12.8h, v0.8h, #8 + urshr v13.8h, v1.8h, #8 + urshr v14.8h, v2.8h, #8 + urshr v15.8h, v3.8h, #8 + raddhn v0.8b, v0.8h, v12.8h + raddhn v1.8b, v1.8h, v13.8h + raddhn v2.8b, v2.8h, v14.8h + raddhn v3.8b, v3.8h, v15.8h + uqadd v28.8b, v0.8b, v4.8b + uqadd v29.8b, v1.8b, v5.8b + uqadd v30.8b, v2.8b, v6.8b + uqadd v31.8b, v3.8b, v7.8b .endm .macro pixman_composite_add_n_8_8_process_pixblock_tail @@ -1893,22 +2121,19 @@ generate_composite_function \ /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_add_n_8_8_process_pixblock_tail_head pixman_composite_add_n_8_8_process_pixblock_tail - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 fetch_mask_pixblock cache_preload 32, 32 pixman_composite_add_n_8_8_process_pixblock_head .endm .macro pixman_composite_add_n_8_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d11, d11[3] + mov v11.s[0], w4 + dup v11.8b, v11.b[3] .endm .macro pixman_composite_add_n_8_8_cleanup - vpop {d8-d15} .endm generate_composite_function \ @@ -1925,23 +2150,25 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_add_8_8_8_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* mask in {d24, d25, d26, d27} */ - vmull.u8 q8, d24, d0 - vmull.u8 q9, d25, d1 - vmull.u8 q10, d26, d2 - vmull.u8 q11, d27, d3 - vrshr.u16 q0, q8, #8 - vrshr.u16 q1, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q12, q10 - vraddhn.u16 d3, q13, q11 - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 + /* expecting source data in {v0, v1, v2, v3} */ + /* destination data in {v4, v5, v6, v7} */ + /* mask in {v24, v25, v26, v27} */ + umull v8.8h, v24.8b, v0.8b + umull v9.8h, v25.8b, v1.8b + umull v10.8h, v26.8b, v2.8b + umull v11.8h, v27.8b, v3.8b + urshr v0.8h, v8.8h, #8 + urshr v1.8h, v9.8h, #8 + urshr v12.8h, v10.8h, #8 + urshr v13.8h, v11.8h, #8 + raddhn v0.8b, v0.8h, v8.8h + raddhn v1.8b, v1.8h, v9.8h + raddhn v2.8b, v12.8h, v10.8h + raddhn v3.8b, v13.8h, v11.8h + uqadd v28.8b, v0.8b, v4.8b + uqadd v29.8b, v1.8b, v5.8b + uqadd v30.8b, v2.8b, v6.8b + uqadd v31.8b, v3.8b, v7.8b .endm .macro pixman_composite_add_8_8_8_process_pixblock_tail @@ -1950,8 +2177,8 @@ generate_composite_function \ /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_add_8_8_8_process_pixblock_tail_head pixman_composite_add_8_8_8_process_pixblock_tail - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 fetch_mask_pixblock fetch_src_pixblock cache_preload 32, 32 @@ -1978,53 +2205,56 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_add_8888_8888_8888_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* mask in {d24, d25, d26, d27} */ - vmull.u8 q8, d27, d0 - vmull.u8 q9, d27, d1 - vmull.u8 q10, d27, d2 - vmull.u8 q11, d27, d3 + /* expecting source data in {v0, v1, v2, v3} */ + /* destination data in {v4, v5, v6, v7} */ + /* mask in {v24, v25, v26, v27} */ + umull v8.8h, v27.8b, v0.8b + umull v9.8h, v27.8b, v1.8b + umull v10.8h, v27.8b, v2.8b + umull v11.8h, v27.8b, v3.8b /* 1 cycle bubble */ - vrsra.u16 q8, q8, #8 - vrsra.u16 q9, q9, #8 - vrsra.u16 q10, q10, #8 - vrsra.u16 q11, q11, #8 + ursra v8.8h, v8.8h, #8 + ursra v9.8h, v9.8h, #8 + ursra v10.8h, v10.8h, #8 + ursra v11.8h, v11.8h, #8 .endm .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail /* 2 cycle bubble */ - vrshrn.u16 d28, q8, #8 - vrshrn.u16 d29, q9, #8 - vrshrn.u16 d30, q10, #8 - vrshrn.u16 d31, q11, #8 - vqadd.u8 q14, q2, q14 - /* 1 cycle bubble */ - vqadd.u8 q15, q3, q15 + rshrn v28.8b, v8.8h, #8 + rshrn v29.8b, v9.8h, #8 + rshrn v30.8b, v10.8h, #8 + rshrn v31.8b, v11.8h, #8 + uqadd v28.8b, v4.8b, v28.8b + uqadd v29.8b, v5.8b, v29.8b + uqadd v30.8b, v6.8b, v30.8b + uqadd v31.8b, v7.8b, v31.8b .endm .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head fetch_src_pixblock - vrshrn.u16 d28, q8, #8 + rshrn v28.8b, v8.8h, #8 fetch_mask_pixblock - vrshrn.u16 d29, q9, #8 - vmull.u8 q8, d27, d0 - vrshrn.u16 d30, q10, #8 - vmull.u8 q9, d27, d1 - vrshrn.u16 d31, q11, #8 - vmull.u8 q10, d27, d2 - vqadd.u8 q14, q2, q14 - vmull.u8 q11, d27, d3 - vqadd.u8 q15, q3, q15 - vrsra.u16 q8, q8, #8 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrsra.u16 q9, q9, #8 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
- vrsra.u16 q10, q10, #8 + rshrn v29.8b, v9.8h, #8 + umull v8.8h, v27.8b, v0.8b + rshrn v30.8b, v10.8h, #8 + umull v9.8h, v27.8b, v1.8b + rshrn v31.8b, v11.8h, #8 + umull v10.8h, v27.8b, v2.8b + umull v11.8h, v27.8b, v3.8b + uqadd v28.8b, v4.8b, v28.8b + uqadd v29.8b, v5.8b, v29.8b + uqadd v30.8b, v6.8b, v30.8b + uqadd v31.8b, v7.8b, v31.8b + ursra v8.8h, v8.8h, #8 + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + ursra v9.8h, v9.8h, #8 + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + ursra v10.8h, v10.8h, #8 cache_preload 8, 8 - vrsra.u16 q11, q11, #8 + ursra v11.8h, v11.8h, #8 .endm generate_composite_function \ @@ -2036,7 +2266,11 @@ generate_composite_function \ default_cleanup, \ pixman_composite_add_8888_8888_8888_process_pixblock_head, \ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ generate_composite_function_single_scanline \ pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ @@ -2046,7 +2280,11 @@ generate_composite_function_single_scanline \ default_cleanup, \ pixman_composite_add_8888_8888_8888_process_pixblock_head, \ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ /******************************************************************************/ @@ -2068,12 +2306,11 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_add_n_8_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] + mov v3.s[0], w4 + dup v0.8b, v3.b[0] + dup v1.8b, v3.b[1] + dup v2.8b, v3.b[2] + dup v3.8b, v3.b[3] .endm .macro pixman_composite_add_n_8_8888_cleanup @@ -2097,9 +2334,8 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_add_8888_n_8888_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vld1.32 {d27[0]}, [DUMMY] - vdup.8 d27, d27[3] + mov v27.s[0], w6 + dup v27.8b, v27.b[3] .endm .macro pixman_composite_add_8888_n_8888_cleanup @@ -2123,51 +2359,51 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* solid mask is in d15 */ + /* expecting source data in {v0, v1, v2, v3} */ + /* destination data in {v4, v5, v6, v7} */ + /* solid mask is in v15 */ /* 'in' */ - vmull.u8 q8, d15, d3 - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vrshr.u16 q13, q8, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d3, q8, q13 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 - vmvn.8 d24, d3 /* get inverted alpha */ + umull v11.8h, v15.8b, v3.8b + umull v10.8h, v15.8b, v2.8b + umull v9.8h, v15.8b, v1.8b + umull v8.8h, v15.8b, v0.8b + urshr v16.8h, v11.8h, #8 + urshr v14.8h, v10.8h, #8 + urshr v13.8h, v9.8h, #8 + urshr v12.8h, v8.8h, #8 + 
raddhn v3.8b, v11.8h, v16.8h + raddhn v2.8b, v10.8h, v14.8h + raddhn v1.8b, v9.8h, v13.8h + raddhn v0.8b, v8.8h, v12.8h + mvn v24.8b, v3.8b /* get inverted alpha */ /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 + umull v8.8h, v24.8b, v4.8b + umull v9.8h, v24.8b, v5.8b + umull v10.8h, v24.8b, v6.8b + umull v11.8h, v24.8b, v7.8b .endm .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 + urshr v16.8h, v8.8h, #8 + urshr v17.8h, v9.8h, #8 + urshr v18.8h, v10.8h, #8 + urshr v19.8h, v11.8h, #8 + raddhn v28.8b, v16.8h, v8.8h + raddhn v29.8b, v17.8h, v9.8h + raddhn v30.8b, v18.8h, v10.8h + raddhn v31.8b, v19.8h, v11.8h .endm /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail fetch_src_pixblock cache_preload 8, 8 fetch_mask_pixblock pixman_composite_out_reverse_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 .endm generate_composite_function_single_scanline \ @@ -2192,29 +2428,28 @@ generate_composite_function_single_scanline \ .macro pixman_composite_over_8888_n_8888_process_pixblock_tail pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b .endm /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 pixman_composite_over_8888_n_8888_process_pixblock_tail fetch_src_pixblock cache_preload 8, 8 pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 .endm .macro pixman_composite_over_8888_n_8888_init - add DUMMY, sp, #48 - vpush {d8-d15} - vld1.32 {d15[0]}, [DUMMY] - vdup.8 d15, d15[3] + mov v15.s[0], w6 + dup v15.8b, v15.b[3] .endm .macro pixman_composite_over_8888_n_8888_cleanup - vpop {d8-d15} .endm generate_composite_function \ @@ -2226,19 +2461,23 @@ generate_composite_function \ pixman_composite_over_8888_n_8888_cleanup, \ pixman_composite_over_8888_n_8888_process_pixblock_head, \ pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail_head + pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ /******************************************************************************/ /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 pixman_composite_over_8888_n_8888_process_pixblock_tail fetch_src_pixblock cache_preload 8, 8 fetch_mask_pixblock pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 .endm generate_composite_function \ @@ -2274,13 +2513,13 @@ generate_composite_function_single_scanline \ /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 pixman_composite_over_8888_n_8888_process_pixblock_tail fetch_src_pixblock cache_preload 8, 8 fetch_mask_pixblock pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 .endm generate_composite_function \ @@ -2307,7 +2546,7 @@ generate_composite_function \ .endm .macro pixman_composite_src_0888_0888_process_pixblock_tail_head - vst3.8 {d0, d1, d2}, [DST_W]! + st3 {v0.8b, v1.8b, v2.8b}, [DST_W], #24 fetch_src_pixblock cache_preload 8, 8 .endm @@ -2330,21 +2569,25 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_src_0888_8888_rev_process_pixblock_head - vswp d0, d2 + mov v31.8b, v2.8b + mov v2.8b, v0.8b + mov v0.8b, v31.8b .endm .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail .endm .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head - vst4.8 {d0, d1, d2, d3}, [DST_W]! + st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32 fetch_src_pixblock - vswp d0, d2 + mov v31.8b, v2.8b + mov v2.8b, v0.8b + mov v0.8b, v31.8b cache_preload 8, 8 .endm .macro pixman_composite_src_0888_8888_rev_init - veor d3, d3, d3 + eor v3.8b, v3.8b, v3.8b .endm generate_composite_function \ @@ -2365,24 +2608,34 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_src_0888_0565_rev_process_pixblock_head - vshll.u8 q8, d1, #8 - vshll.u8 q9, d2, #8 + ushll v8.8h, v1.8b, #7 + sli v8.8h, v8.8h, #1 + ushll v9.8h, v2.8b, #7 + sli v9.8h, v9.8h, #1 .endm .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail - vshll.u8 q14, d0, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 + ushll v14.8h, v0.8b, #7 + sli v14.8h, v14.8h, #1 + sri v14.8h, v8.8h, #5 + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] .endm .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head - vshll.u8 q14, d0, #8 + ushll v14.8h, v0.8b, #7 + sli v14.8h, v14.8h, #1 fetch_src_pixblock - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! 
- vshll.u8 q9, d2, #8 + sri v14.8h, v8.8h, #5 + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] + ushll v8.8h, v1.8b, #7 + sli v8.8h, v8.8h, #1 + st1 {v14.8h}, [DST_W], #16 + ushll v9.8h, v2.8b, #7 + sli v9.8h, v9.8h, #1 .endm generate_composite_function \ @@ -2403,43 +2656,55 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_src_pixbuf_8888_process_pixblock_head - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 + umull v8.8h, v3.8b, v0.8b + umull v9.8h, v3.8b, v1.8b + umull v10.8h, v3.8b, v2.8b .endm .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - vraddhn.u16 d30, q11, q8 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d28, q13, q10 + urshr v11.8h, v8.8h, #8 + mov v30.8b, v31.8b + mov v31.8b, v3.8b + mov v3.8b, v30.8b + urshr v12.8h, v9.8h, #8 + urshr v13.8h, v10.8h, #8 + raddhn v30.8b, v11.8h, v8.8h + raddhn v29.8b, v12.8h, v9.8h + raddhn v28.8b, v13.8h, v10.8h .endm .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 + urshr v11.8h, v8.8h, #8 + mov v30.8b, v31.8b + mov v31.8b, v3.8b + mov v3.8b, v31.8b + urshr v12.8h, v9.8h, #8 + urshr v13.8h, v10.8h, #8 fetch_src_pixblock - vraddhn.u16 d30, q11, q8 + raddhn v30.8b, v11.8h, v8.8h PF add PF_X, PF_X, #8 PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d28, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + raddhn v29.8b, v12.8h, v9.8h + raddhn v28.8b, v13.8h, v10.8h + umull v8.8h, v3.8b, v0.8b + umull v9.8h, v3.8b, v1.8b + umull v10.8h, v3.8b, v2.8b + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
+ PF lsl DUMMY, PF_X, src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + PF ble 10f + PF sub PF_X, PF_X, ORIG_W + PF subs PF_CTL, PF_CTL, #0x10 + PF ble 10f + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 +10: .endm generate_composite_function \ @@ -2460,43 +2725,55 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 + umull v8.8h, v3.8b, v0.8b + umull v9.8h, v3.8b, v1.8b + umull v10.8h, v3.8b, v2.8b .endm .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - vraddhn.u16 d28, q11, q8 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d30, q13, q10 + urshr v11.8h, v8.8h, #8 + mov v30.8b, v31.8b + mov v31.8b, v3.8b + mov v3.8b, v30.8b + urshr v12.8h, v9.8h, #8 + urshr v13.8h, v10.8h, #8 + raddhn v28.8b, v11.8h, v8.8h + raddhn v29.8b, v12.8h, v9.8h + raddhn v30.8b, v13.8h, v10.8h .endm .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 + urshr v11.8h, v8.8h, #8 + mov v30.8b, v31.8b + mov v31.8b, v3.8b + mov v3.8b, v30.8b + urshr v12.8h, v9.8h, #8 + urshr v13.8h, v10.8h, #8 fetch_src_pixblock - vraddhn.u16 d28, q11, q8 + raddhn v28.8b, v11.8h, v8.8h PF add PF_X, PF_X, #8 PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d30, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + raddhn v29.8b, v12.8h, v9.8h + raddhn v30.8b, v13.8h, v10.8h + umull v8.8h, v3.8b, v0.8b + umull v9.8h, v3.8b, v1.8b + umull v10.8h, v3.8b, v2.8b + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
+ PF lsl DUMMY, PF_X, src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + PF ble 10f + PF sub PF_X, PF_X, ORIG_W + PF subs PF_CTL, PF_CTL, #0x10 + PF ble 10f + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 +10: .endm generate_composite_function \ @@ -2517,37 +2794,44 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_over_0565_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q4, d2, d1, d0 - convert_0565_to_x888 q5, d6, d5, d4 - /* source pixel data is in {d0, d1, d2, XX} */ - /* destination pixel data is in {d4, d5, d6, XX} */ - vmvn.8 d7, d15 - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vmull.u8 q8, d7, d4 - vmull.u8 q9, d7, d5 - vmull.u8 q13, d7, d6 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 + /* mask is in v15 */ + mov v4.d[0], v8.d[0] + mov v4.d[1], v9.d[0] + mov v13.d[0], v10.d[0] + mov v13.d[1], v11.d[0] + convert_0565_to_x888 v4, v2, v1, v0 + convert_0565_to_x888 v13, v6, v5, v4 + /* source pixel data is in {v0, v1, v2, XX} */ + /* destination pixel data is in {v4, v5, v6, XX} */ + mvn v7.8b, v15.8b + umull v10.8h, v15.8b, v2.8b + umull v9.8h, v15.8b, v1.8b + umull v8.8h, v15.8b, v0.8b + umull v11.8h, v7.8b, v4.8b + umull v12.8h, v7.8b, v5.8b + umull v13.8h, v7.8b, v6.8b + urshr v19.8h, v10.8h, #8 + urshr v18.8h, v9.8h, #8 + urshr v17.8h, v8.8h, #8 + raddhn v2.8b, v10.8h, v19.8h + raddhn v1.8b, v9.8h, v18.8h + raddhn v0.8b, v8.8h, v17.8h .endm .macro pixman_composite_over_0565_8_0565_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q13, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q13 - vqadd.u8 q0, q0, q14 - vqadd.u8 q1, q1, q15 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 + urshr v17.8h, v11.8h, #8 + urshr v18.8h, v12.8h, #8 + urshr v19.8h, v13.8h, #8 + raddhn v28.8b, v17.8h, v11.8h + raddhn v29.8b, v18.8h, v12.8h + raddhn v30.8b, v19.8h, v13.8h + uqadd v0.8b, v0.8b, v28.8b + uqadd v1.8b, v1.8b, v29.8b + uqadd v2.8b, v2.8b, v30.8b + /* 32bpp result is in {v0, v1, v2, XX} */ + convert_8888_to_0565 v2, v1, v0, v14, v30, v13 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] .endm /* TODO: expand macros and do better instructions scheduling */ @@ -2555,10 +2839,10 @@ generate_composite_function \ fetch_mask_pixblock pixman_composite_over_0565_8_0565_process_pixblock_tail fetch_src_pixblock - vld1.16 {d10, d11}, [DST_R, :128]! + ld1 {v10.4h, v11.4h}, [DST_R], #16 cache_preload 8, 8 pixman_composite_over_0565_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! 
+ st1 {v14.8h}, [DST_W], #16 .endm generate_composite_function \ @@ -2579,14 +2863,11 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_over_0565_n_0565_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vpush {d8-d15} - vld1.32 {d15[0]}, [DUMMY] - vdup.8 d15, d15[3] + mov v15.s[0], w6 + dup v15.8b, v15.b[3] .endm .macro pixman_composite_over_0565_n_0565_cleanup - vpop {d8-d15} .endm generate_composite_function \ @@ -2600,34 +2881,41 @@ generate_composite_function \ pixman_composite_over_0565_8_0565_process_pixblock_tail, \ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ + 10, /* dst_r_basereg */ \ 8, /* src_basereg */ \ 15 /* mask_basereg */ /******************************************************************************/ .macro pixman_composite_add_0565_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q4, d2, d1, d0 - convert_0565_to_x888 q5, d6, d5, d4 - /* source pixel data is in {d0, d1, d2, XX} */ - /* destination pixel data is in {d4, d5, d6, XX} */ - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 + /* mask is in v15 */ + mov v4.d[0], v8.d[0] + mov v4.d[1], v9.d[0] + mov v13.d[0], v10.d[0] + mov v13.d[1], v11.d[0] + convert_0565_to_x888 v4, v2, v1, v0 + convert_0565_to_x888 v13, v6, v5, v4 + /* source pixel data is in {v0, v1, v2, XX} */ + /* destination pixel data is in {v4, v5, v6, XX} */ + umull v9.8h, v15.8b, v2.8b + umull v8.8h, v15.8b, v1.8b + umull v7.8h, v15.8b, v0.8b + urshr v12.8h, v9.8h, #8 + urshr v11.8h, v8.8h, #8 + urshr v10.8h, v7.8h, #8 + raddhn v2.8b, v9.8h, v12.8h + raddhn v1.8b, v8.8h, v11.8h + raddhn v0.8b, v7.8h, v10.8h .endm .macro pixman_composite_add_0565_8_0565_process_pixblock_tail - vqadd.u8 q0, q0, q2 - vqadd.u8 q1, q1, q3 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 + uqadd v0.8b, v0.8b, v4.8b + uqadd v1.8b, v1.8b, v5.8b + uqadd v2.8b, v2.8b, v6.8b + /* 32bpp result is in {v0, v1, v2, XX} */ + convert_8888_to_0565 v2, v1, v0, v14, v30, v13 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] .endm /* TODO: expand macros and do better instructions scheduling */ @@ -2635,10 +2923,10 @@ generate_composite_function \ fetch_mask_pixblock pixman_composite_add_0565_8_0565_process_pixblock_tail fetch_src_pixblock - vld1.16 {d10, d11}, [DST_R, :128]! + ld1 {v10.4h, v11.4h}, [DST_R], #16 cache_preload 8, 8 pixman_composite_add_0565_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! 
+ st1 {v14.8h}, [DST_W], #16 .endm generate_composite_function \ @@ -2659,35 +2947,39 @@ generate_composite_function \ /******************************************************************************/ .macro pixman_composite_out_reverse_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q5, d6, d5, d4 - /* destination pixel data is in {d4, d5, d6, xx} */ - vmvn.8 d24, d15 /* get inverted alpha */ + /* mask is in v15 */ + mov v12.d[0], v10.d[0] + mov v12.d[1], v11.d[0] + convert_0565_to_x888 v12, v6, v5, v4 + /* destination pixel data is in {v4, v5, v6, xx} */ + mvn v24.8b, v15.8b /* get inverted alpha */ /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 + umull v8.8h, v24.8b, v4.8b + umull v9.8h, v24.8b, v5.8b + umull v10.8h, v24.8b, v6.8b .endm .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vraddhn.u16 d0, q14, q8 - vraddhn.u16 d1, q15, q9 - vraddhn.u16 d2, q12, q10 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 + urshr v11.8h, v8.8h, #8 + urshr v12.8h, v9.8h, #8 + urshr v13.8h, v10.8h, #8 + raddhn v0.8b, v11.8h, v8.8h + raddhn v1.8b, v12.8h, v9.8h + raddhn v2.8b, v13.8h, v10.8h + /* 32bpp result is in {v0, v1, v2, XX} */ + convert_8888_to_0565 v2, v1, v0, v14, v12, v3 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] .endm /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head fetch_src_pixblock pixman_composite_out_reverse_8_0565_process_pixblock_tail - vld1.16 {d10, d11}, [DST_R, :128]! + ld1 {v10.4h, v11.4h}, [DST_R], #16 cache_preload 8, 8 pixman_composite_out_reverse_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! 
+ st1 {v14.8h}, [DST_W], #16 .endm generate_composite_function \ @@ -2701,43 +2993,43 @@ generate_composite_function \ pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ + 10, /* dst_r_basereg */ \ 15, /* src_basereg */ \ 0 /* mask_basereg */ /******************************************************************************/ .macro pixman_composite_out_reverse_8_8888_process_pixblock_head - /* src is in d0 */ - /* destination pixel data is in {d4, d5, d6, d7} */ - vmvn.8 d1, d0 /* get inverted alpha */ + /* src is in v0 */ + /* destination pixel data is in {v4, v5, v6, v7} */ + mvn v1.8b, v0.8b /* get inverted alpha */ /* now do alpha blending */ - vmull.u8 q8, d1, d4 - vmull.u8 q9, d1, d5 - vmull.u8 q10, d1, d6 - vmull.u8 q11, d1, d7 + umull v8.8h, v1.8b, v4.8b + umull v9.8h, v1.8b, v5.8b + umull v10.8h, v1.8b, v6.8b + umull v11.8h, v1.8b, v7.8b .endm .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - /* 32bpp result is in {d28, d29, d30, d31} */ + urshr v14.8h, v8.8h, #8 + urshr v15.8h, v9.8h, #8 + urshr v12.8h, v10.8h, #8 + urshr v13.8h, v11.8h, #8 + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v12.8h, v10.8h + raddhn v31.8b, v13.8h, v11.8h + /* 32bpp result is in {v28, v29, v30, v31} */ .endm /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head fetch_src_pixblock pixman_composite_out_reverse_8_8888_process_pixblock_tail - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 cache_preload 8, 8 pixman_composite_out_reverse_8_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
.endm
generate_composite_function \
@@ -2754,7 +3046,7 @@ generate_composite_function \
4, /* dst_r_basereg */ \
0, /* src_basereg */ \
0 /* mask_basereg */
-
+
/******************************************************************************/
generate_composite_function_nearest_scanline \
@@ -2789,8 +3081,8 @@ generate_composite_function_nearest_scanline \
default_cleanup, \
pixman_composite_src_8888_0565_process_pixblock_head, \
pixman_composite_src_8888_0565_process_pixblock_tail, \
- pixman_composite_src_8888_0565_process_pixblock_tail_head
-
+ pixman_composite_src_8888_0565_process_pixblock_tail_head, \
+
generate_composite_function_nearest_scanline \
pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
@@ -2841,16 +3133,16 @@ generate_composite_function_nearest_scanline \
asr TMP1, X, #16
add X, X, UX
add TMP1, TOP, TMP1, lsl #2
- vld1.32 {reg1}, [TMP1], STRIDE
- vld1.32 {reg2}, [TMP1]
+ ld1 {&reg1&.2s}, [TMP1], STRIDE
+ ld1 {&reg2&.2s}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
asr TMP1, X, #16
add X, X, UX
add TMP1, TOP, TMP1, lsl #1
- vld1.32 {reg2[0]}, [TMP1], STRIDE
- vld1.32 {reg2[1]}, [TMP1]
+ ld1 {&reg2&.s}[0], [TMP1], STRIDE
+ ld1 {&reg2&.s}[1], [TMP1]
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
.endm
@@ -2858,11 +3150,11 @@ generate_composite_function_nearest_scanline \
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
bilinear_load_8888 reg1, reg2, tmp1
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
+ umull &acc1&.8h, &reg1&.8b, v28.8b
+ umlal &acc1&.8h, &reg2&.8b, v29.8b
bilinear_load_8888 reg3, reg4, tmp2
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ umull &acc2&.8h, &reg3&.8b, v28.8b
+ umlal &acc2&.8h, &reg4&.8b, v29.8b
.endm
.macro bilinear_load_and_vertical_interpolate_four_8888 \
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
@@ -2875,44 +3167,58 @@ generate_composite_function_nearest_scanline \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
.endm
+.macro vzip reg1, reg2
+ umov TMP4, v31.d[0]
+ zip1 v31.8b, reg1, reg2
+ zip2 reg2, reg1, reg2
+ mov reg1, v31.8b
+ mov v31.d[0], TMP4
+.endm
+
+.macro vuzp reg1, reg2
+ umov TMP4, v31.d[0]
+ uzp1 v31.8b, reg1, reg2
+ uzp2 reg2, reg1, reg2
+ mov reg1, v31.8b
+ mov v31.d[0], TMP4
+.endm
+
.macro bilinear_load_and_vertical_interpolate_two_0565 \
acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
-
asr TMP1, X, #16
add X, X, UX
add TMP1, TOP, TMP1, lsl #1
asr TMP2, X, #16
add X, X, UX
add TMP2, TOP, TMP2, lsl #1
- vld1.32 {acc2lo[0]}, [TMP1], STRIDE
- vld1.32 {acc2hi[0]}, [TMP2], STRIDE
- vld1.32 {acc2lo[1]}, [TMP1]
- vld1.32 {acc2hi[1]}, [TMP2]
+ ld1 {&acc2&.s}[0], [TMP1], STRIDE
+ ld1 {&acc2&.s}[2], [TMP2], STRIDE
+ ld1 {&acc2&.s}[1], [TMP1]
+ ld1 {&acc2&.s}[3], [TMP2]
convert_0565_to_x888 acc2, reg3, reg2, reg1
- vzip.u8 reg1, reg3
- vzip.u8 reg2, reg4
- vzip.u8 reg3, reg4
- vzip.u8 reg1, reg2
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ vzip &reg1&.8b, &reg3&.8b
+ vzip &reg2&.8b, &reg4&.8b
+ vzip &reg3&.8b, &reg4&.8b
+ vzip &reg1&.8b, &reg2&.8b
+ umull &acc1&.8h, &reg1&.8b, v28.8b
+ umlal &acc1&.8h, &reg2&.8b, v29.8b
+ umull &acc2&.8h, &reg3&.8b, v28.8b
+ umlal &acc2&.8h, &reg4&.8b, v29.8b
.endm
.macro bilinear_load_and_vertical_interpolate_four_0565 \
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-
asr TMP1, X, #16
add X, X, UX
add TMP1, TOP, TMP1, lsl #1
asr TMP2, X, #16
add X, X, UX
add TMP2, TOP, TMP2, lsl #1
- vld1.32
{xacc2lo[0]}, [TMP1], STRIDE - vld1.32 {xacc2hi[0]}, [TMP2], STRIDE - vld1.32 {xacc2lo[1]}, [TMP1] - vld1.32 {xacc2hi[1]}, [TMP2] + ld1 {&xacc2&.s}[0], [TMP1], STRIDE + ld1 {&xacc2&.s}[2], [TMP2], STRIDE + ld1 {&xacc2&.s}[1], [TMP1] + ld1 {&xacc2&.s}[3], [TMP2] convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 asr TMP1, X, #16 add X, X, UX @@ -2920,121 +3226,121 @@ generate_composite_function_nearest_scanline \ asr TMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - vld1.32 {yacc2lo[0]}, [TMP1], STRIDE - vzip.u8 xreg1, xreg3 - vld1.32 {yacc2hi[0]}, [TMP2], STRIDE - vzip.u8 xreg2, xreg4 - vld1.32 {yacc2lo[1]}, [TMP1] - vzip.u8 xreg3, xreg4 - vld1.32 {yacc2hi[1]}, [TMP2] - vzip.u8 xreg1, xreg2 + ld1 {&yacc2&.s}[0], [TMP1], STRIDE + vzip &xreg1&.8b, &xreg3&.8b + ld1 {&yacc2&.s}[2], [TMP2], STRIDE + vzip &xreg2&.8b, &xreg4&.8b + ld1 {&yacc2&.s}[1], [TMP1] + vzip &xreg3&.8b, &xreg4&.8b + ld1 {&yacc2&.s}[3], [TMP2] + vzip &xreg1&.8b, &xreg2&.8b convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 - vmull.u8 xacc1, xreg1, d28 - vzip.u8 yreg1, yreg3 - vmlal.u8 xacc1, xreg2, d29 - vzip.u8 yreg2, yreg4 - vmull.u8 xacc2, xreg3, d28 - vzip.u8 yreg3, yreg4 - vmlal.u8 xacc2, xreg4, d29 - vzip.u8 yreg1, yreg2 - vmull.u8 yacc1, yreg1, d28 - vmlal.u8 yacc1, yreg2, d29 - vmull.u8 yacc2, yreg3, d28 - vmlal.u8 yacc2, yreg4, d29 + umull &xacc1&.8h, &xreg1&.8b, v28.8b + vzip &yreg1&.8b, &yreg3&.8b + umlal &xacc1&.8h, &xreg2&.8b, v29.8b + vzip &yreg2&.8b, &yreg4&.8b + umull &xacc2&.8h, &xreg3&.8b, v28.8b + vzip &yreg3&.8b, &yreg4&.8b + umlal &xacc2&.8h, &xreg4&.8b, v29.8b + vzip &yreg1&.8b, &yreg2&.8b + umull &yacc1&.8h, &yreg1&.8b, v28.8b + umlal &yacc1&.8h, &yreg2&.8b, v29.8b + umull &yacc2&.8h, &yreg3&.8b, v28.8b + umlal &yacc2&.8h, &yreg4&.8b, v29.8b .endm .macro bilinear_store_8888 numpix, tmp1, tmp2 .if numpix == 4 - vst1.32 {d0, d1}, [OUT, :128]! + st1 {v0.2s, v1.2s}, [OUT], #16 .elseif numpix == 2 - vst1.32 {d0}, [OUT, :64]! + st1 {v0.2s}, [OUT], #8 .elseif numpix == 1 - vst1.32 {d0[0]}, [OUT, :32]! + st1 {v0.s}[0], [OUT], #4 .else .error bilinear_store_8888 numpix is unsupported .endif .endm .macro bilinear_store_0565 numpix, tmp1, tmp2 - vuzp.u8 d0, d1 - vuzp.u8 d2, d3 - vuzp.u8 d1, d3 - vuzp.u8 d0, d2 - convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 + vuzp v0.8b, v1.8b + vuzp v2.8b, v3.8b + vuzp v1.8b, v3.8b + vuzp v0.8b, v2.8b + convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 .if numpix == 4 - vst1.16 {d2}, [OUT, :64]! + st1 {v1.4h}, [OUT], #8 .elseif numpix == 2 - vst1.32 {d2[0]}, [OUT, :32]! + st1 {v1.s}[0], [OUT], #4 .elseif numpix == 1 - vst1.16 {d2[0]}, [OUT, :16]! 
+ st1 {v1.h}[0], [OUT], #2 .else .error bilinear_store_0565 numpix is unsupported .endif .endm .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt - bilinear_load_&src_fmt d0, d1, d2 - vmull.u8 q1, d0, d28 - vmlal.u8 q1, d1, d29 + bilinear_load_&src_fmt v0, v1, v2 + umull v2.8h, v0.8b, v28.8b + umlal v2.8h, v1.8b, v29.8b /* 5 cycles bubble */ - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 + ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v2.4h, v15.h[0] + umlal2 v0.4s, v2.8h, v15.h[0] /* 5 cycles bubble */ - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) /* 3 cycles bubble */ - vmovn.u16 d0, q0 + xtn v0.8b, v0.8h /* 1 cycle bubble */ - bilinear_store_&dst_fmt 1, q2, q3 + bilinear_store_&dst_fmt 1, v3, v4 .endm .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt bilinear_load_and_vertical_interpolate_two_&src_fmt \ - q1, q11, d0, d1, d20, d21, d22, d23 - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q10, d22, d31 - vmlal.u16 q10, d23, d31 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vmovn.u16 d0, q0 - bilinear_store_&dst_fmt 2, q2, q3 + v1, v11, v2, v3, v20, v21, v22, v23 + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] + umlal2 v10.4s, v11.8h, v15.h[4] + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + xtn v0.8b, v0.8h + bilinear_store_&dst_fmt 2, v3, v4 .endm .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt bilinear_load_and_vertical_interpolate_four_&src_fmt \ - q1, q11, d0, d1, d20, d21, d22, d23 \ - q3, q9, d4, d5, d16, d17, d18, d19 - pld [TMP1, PF_OFFS] + v1, v11, v14, v20, v16, v17, v22, v23 \ + v3, v9, v24, v25, v26, v27, v18, v19 + prfm PREFETCH_MODE, [TMP1, PF_OFFS] sub TMP1, TMP1, STRIDE - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q10, d22, d31 - vmlal.u16 q10, d23, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q2, d6, d30 - vmlal.u16 q2, d7, d30 - vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS - pld [TMP2, PF_OFFS] - vmlsl.u16 q8, d18, d31 - vmlal.u16 q8, d19, d31 - vadd.u16 q12, q12, q13 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vmovn.u16 d0, q0 - vmovn.u16 d1, q2 - vadd.u16 q12, q12, q13 - bilinear_store_&dst_fmt 4, q2, q3 + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] + umlal2 v10.4s, v11.8h, v15.h[4] + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + ushll v2.4s, v3.4h, 
#BILINEAR_INTERPOLATION_BITS + umlsl v2.4s, v3.4h, v15.h[0] + umlal2 v2.4s, v3.8h, v15.h[0] + ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS + prfm PREFETCH_MODE, [TMP2, PF_OFFS] + umlsl v8.4s, v9.4h, v15.h[4] + umlal2 v8.4s, v9.8h, v15.h[4] + add v12.8h, v12.8h, v13.8h + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + xtn v0.8b, v0.8h + xtn v1.8b, v2.8h + add v12.8h, v12.8h, v13.8h + bilinear_store_&dst_fmt 4, v3, v4 .endm .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt @@ -3107,121 +3413,137 @@ generate_composite_function_nearest_scanline \ prefetch_distance, flags pixman_asm_function fname - OUT .req r0 - TOP .req r1 - BOTTOM .req r2 - WT .req r3 - WB .req r4 - X .req r5 - UX .req r6 - WIDTH .req ip - TMP1 .req r3 - TMP2 .req r4 - PF_OFFS .req r7 - TMP3 .req r8 - TMP4 .req r9 - STRIDE .req r2 - - mov ip, sp - push {r4, r5, r6, r7, r8, r9} + OUT .req x0 + TOP .req x1 + BOTTOM .req x2 + WT .req x3 + WB .req x4 + X .req x5 + UX .req x6 + WIDTH .req x7 + TMP1 .req x8 + TMP2 .req x9 + PF_OFFS .req x10 + TMP3 .req x11 + TMP4 .req x12 + STRIDE .req x13 + + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 + sxtw x6, w6 + sxtw x7, w7 + + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 112 /* push all registers */ + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + stp x8, x9, [x29, -80] + stp x10, x11, [x29, -96] + stp x12, x13, [x29, -112] + mov PF_OFFS, #prefetch_distance - ldmia ip, {WB, X, UX, WIDTH} mul PF_OFFS, PF_OFFS, UX -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 - vpush {d8-d15} -.endif - - sub STRIDE, BOTTOM, TOP + subs STRIDE, BOTTOM, TOP .unreq BOTTOM cmp WIDTH, #0 - ble 3f + ble 300f - vdup.u16 q12, X - vdup.u16 q13, UX - vdup.u8 d28, WT - vdup.u8 d29, WB - vadd.u16 d25, d25, d26 + dup v12.8h, w5 + dup v13.8h, w6 + dup v28.8b, w3 + dup v29.8b, w4 + mov v25.d[0], v12.d[1] + mov v26.d[0], v13.d[0] + add v25.4h, v25.4h, v26.4h + mov v12.d[1], v25.d[0] /* ensure good destination alignment */ cmp WIDTH, #1 - blt 0f + blt 100f tst OUT, #(1 << dst_bpp_shift) - beq 0f - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 + beq 100f + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h bilinear_interpolate_last_pixel src_fmt, dst_fmt sub WIDTH, WIDTH, #1 -0: - vadd.u16 q13, q13, q13 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 +100: + add v13.8h, v13.8h, v13.8h + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h cmp WIDTH, #2 - blt 0f + blt 100f tst OUT, #(1 << (dst_bpp_shift + 1)) - beq 0f + beq 100f bilinear_interpolate_two_pixels src_fmt, dst_fmt sub WIDTH, WIDTH, #2 -0: +100: .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 /*********** 8 pixels per iteration *****************/ cmp WIDTH, #4 - blt 0f + blt 100f tst OUT, #(1 << (dst_bpp_shift + 2)) - beq 0f + beq 100f bilinear_interpolate_four_pixels src_fmt, dst_fmt sub WIDTH, WIDTH, #4 -0: +100: subs WIDTH, WIDTH, #8 - blt 1f + blt 100f asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt subs WIDTH, WIDTH, #8 - blt 5f -0: + blt 500f +1000: bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt subs WIDTH, WIDTH, #8 - bge 0b -5: + 
bge 1000b +500: bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt -1: +100: tst WIDTH, #4 - beq 2f + beq 200f bilinear_interpolate_four_pixels src_fmt, dst_fmt -2: +200: .else /*********** 4 pixels per iteration *****************/ subs WIDTH, WIDTH, #4 - blt 1f + blt 100f asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) bilinear_interpolate_four_pixels_head src_fmt, dst_fmt subs WIDTH, WIDTH, #4 - blt 5f -0: + blt 500f +1000: bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt subs WIDTH, WIDTH, #4 - bge 0b -5: + bge 1000b +500: bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt -1: +100: /****************************************************/ .endif /* handle the remaining trailing pixels */ tst WIDTH, #2 - beq 2f + beq 200f bilinear_interpolate_two_pixels src_fmt, dst_fmt -2: +200: tst WIDTH, #1 - beq 3f + beq 300f bilinear_interpolate_last_pixel src_fmt, dst_fmt -3: -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 - vpop {d8-d15} -.endif - pop {r4, r5, r6, r7, r8, r9} - bx lr +300: + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -104] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret .unreq OUT .unreq TOP @@ -3252,61 +3574,61 @@ pixman_asm_function fname add X, X, UX add TMP2, TOP, TMP2, lsl #2 - vld1.32 {d22}, [TMP1], STRIDE - vld1.32 {d23}, [TMP1] + ld1 {v22.2s}, [TMP1], STRIDE + ld1 {v23.2s}, [TMP1] asr TMP3, X, #16 add X, X, UX add TMP3, TOP, TMP3, lsl #2 - vmull.u8 q8, d22, d28 - vmlal.u8 q8, d23, d29 + umull v8.8h, v22.8b, v28.8b + umlal v8.8h, v23.8b, v29.8b - vld1.32 {d22}, [TMP2], STRIDE - vld1.32 {d23}, [TMP2] + ld1 {v22.2s}, [TMP2], STRIDE + ld1 {v23.2s}, [TMP2] asr TMP4, X, #16 add X, X, UX add TMP4, TOP, TMP4, lsl #2 - vmull.u8 q9, d22, d28 - vmlal.u8 q9, d23, d29 + umull v9.8h, v22.8b, v28.8b + umlal v9.8h, v23.8b, v29.8b - vld1.32 {d22}, [TMP3], STRIDE - vld1.32 {d23}, [TMP3] - vmull.u8 q10, d22, d28 - vmlal.u8 q10, d23, d29 + ld1 {v22.2s}, [TMP3], STRIDE + ld1 {v23.2s}, [TMP3] + umull v10.8h, v22.8b, v28.8b + umlal v10.8h, v23.8b, v29.8b - vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d16, d30 - vmlal.u16 q0, d17, d30 + ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v8.4h, v15.h[0] + umlal2 v0.4s, v8.8h, v15.h[0] - pld [TMP4, PF_OFFS] - vld1.32 {d16}, [TMP4], STRIDE - vld1.32 {d17}, [TMP4] - pld [TMP4, PF_OFFS] - vmull.u8 q11, d16, d28 - vmlal.u8 q11, d17, d29 + prfm PREFETCH_MODE, [TMP4, PF_OFFS] + ld1 {v16.2s}, [TMP4], STRIDE + ld1 {v17.2s}, [TMP4] + prfm PREFETCH_MODE, [TMP4, PF_OFFS] + umull v11.8h, v16.8b, v28.8b + umlal v11.8h, v17.8b, v29.8b - vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q1, d18, d31 + ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v1.4s, v9.4h, v15.h[4] .endm .macro bilinear_interpolate_four_pixels_8888_8888_tail - vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q2, d20, d30 - vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q3, d22, d31 - vmlal.u16 q3, d23, d31 - vadd.u16 q12, q12, q13 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) - vmovn.u16 d6, q0 - vmovn.u16 d7, q2 - vadd.u16 
q12, q12, q13 - vst1.32 {d6, d7}, [OUT, :128]! + umlal2 v1.4s, v9.8h, v15.h[4] + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v2.4s, v10.4h, v15.h[0] + umlal2 v2.4s, v10.8h, v15.h[0] + ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v3.4s, v11.4h, v15.h[4] + umlal2 v3.4s, v11.8h, v15.h[4] + add v12.8h, v12.8h, v13.8h + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + xtn v6.8b, v0.8h + xtn v7.8b, v2.8h + add v12.8h, v12.8h, v13.8h + st1 {v6.2s, v7.2s}, [OUT], #16 .endm .macro bilinear_interpolate_four_pixels_8888_8888_tail_head @@ -3316,300 +3638,57 @@ pixman_asm_function fname asr TMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #2 - vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q2, d20, d30 - vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS - vld1.32 {d20}, [TMP1], STRIDE - vmlsl.u16 q3, d22, d31 - vmlal.u16 q3, d23, d31 - vld1.32 {d21}, [TMP1] - vmull.u8 q8, d20, d28 - vmlal.u8 q8, d21, d29 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vld1.32 {d23}, [TMP2] - vmull.u8 q9, d22, d28 + umlal2 v1.4s, v9.8h, v15.h[4] + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v2.4s, v10.4h, v15.h[0] + umlal2 v2.4s, v10.8h, v15.h[0] + ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + ld1 {v20.2s}, [TMP1], STRIDE + umlsl v3.4s, v11.4h, v15.h[4] + umlal2 v3.4s, v11.8h, v15.h[4] + ld1 {v21.2s}, [TMP1] + umull v8.8h, v20.8b, v28.8b + umlal v8.8h, v21.8b, v29.8b + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ld1 {v22.2s}, [TMP2], STRIDE + shrn2 v4.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + ld1 {v23.2s}, [TMP2] + umull v9.8h, v22.8b, v28.8b asr TMP3, X, #16 add X, X, UX add TMP3, TOP, TMP3, lsl #2 asr TMP4, X, #16 add X, X, UX add TMP4, TOP, TMP4, lsl #2 - vmlal.u8 q9, d23, d29 - vld1.32 {d22}, [TMP3], STRIDE - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vld1.32 {d23}, [TMP3] - vmull.u8 q10, d22, d28 - vmlal.u8 q10, d23, d29 - vmovn.u16 d6, q0 - vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS - vmovn.u16 d7, q2 - vmlsl.u16 q0, d16, d30 - vmlal.u16 q0, d17, d30 - pld [TMP4, PF_OFFS] - vld1.32 {d16}, [TMP4], STRIDE - vadd.u16 q12, q12, q13 - vld1.32 {d17}, [TMP4] - pld [TMP4, PF_OFFS] - vmull.u8 q11, d16, d28 - vmlal.u8 q11, d17, d29 - vst1.32 {d6, d7}, [OUT, :128]! 
- vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q1, d18, d31 + umlal v9.8h, v23.8b, v29.8b + ld1 {v22.2s}, [TMP3], STRIDE + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + ld1 {v23.2s}, [TMP3] + umull v10.8h, v22.8b, v28.8b + umlal v10.8h, v23.8b, v29.8b + xtn v6.8b, v0.8h + ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS + xtn v7.8b, v4.8h + umlsl v0.4s, v8.4h, v15.h[0] + umlal2 v0.4s, v8.8h, v15.h[0] + prfm PREFETCH_MODE, [TMP4, PF_OFFS] + ld1 {v16.2s}, [TMP4], STRIDE + add v12.8h, v12.8h, v13.8h + ld1 {v17.2s}, [TMP4] + prfm PREFETCH_MODE, [TMP4, PF_OFFS] + umull v11.8h, v16.8b, v28.8b + umlal v11.8h, v17.8b, v29.8b + st1 {v6.2s, v7.2s}, [OUT], #16 + ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v1.4s, v9.4h, v15.h[4] .endm /*****************************************************************************/ -.set have_bilinear_interpolate_eight_pixels_8888_0565, 1 - -.macro bilinear_interpolate_eight_pixels_8888_0565_head - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #2 - asr TMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #2 - vld1.32 {d20}, [TMP1], STRIDE - vld1.32 {d21}, [TMP1] - vmull.u8 q8, d20, d28 - vmlal.u8 q8, d21, d29 - vld1.32 {d22}, [TMP2], STRIDE - vld1.32 {d23}, [TMP2] - vmull.u8 q9, d22, d28 - asr TMP3, X, #16 - add X, X, UX - add TMP3, TOP, TMP3, lsl #2 - asr TMP4, X, #16 - add X, X, UX - add TMP4, TOP, TMP4, lsl #2 - vmlal.u8 q9, d23, d29 - vld1.32 {d22}, [TMP3], STRIDE - vld1.32 {d23}, [TMP3] - vmull.u8 q10, d22, d28 - vmlal.u8 q10, d23, d29 - vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d16, d30 - vmlal.u16 q0, d17, d30 - pld [TMP4, PF_OFFS] - vld1.32 {d16}, [TMP4], STRIDE - vld1.32 {d17}, [TMP4] - pld [TMP4, PF_OFFS] - vmull.u8 q11, d16, d28 - vmlal.u8 q11, d17, d29 - vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q1, d18, d31 - - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #2 - asr TMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #2 - vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q2, d20, d30 - vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS - vld1.32 {d20}, [TMP1], STRIDE - vmlsl.u16 q3, d22, d31 - vmlal.u16 q3, d23, d31 - vld1.32 {d21}, [TMP1] - vmull.u8 q8, d20, d28 - vmlal.u8 q8, d21, d29 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vld1.32 {d23}, [TMP2] - vmull.u8 q9, d22, d28 - asr TMP3, X, #16 - add X, X, UX - add TMP3, TOP, TMP3, lsl #2 - asr TMP4, X, #16 - add X, X, UX - add TMP4, TOP, TMP4, lsl #2 - vmlal.u8 q9, d23, d29 - vld1.32 {d22}, [TMP3], STRIDE - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vld1.32 {d23}, [TMP3] - vmull.u8 q10, d22, d28 - vmlal.u8 q10, d23, d29 - vmovn.u16 d8, q0 - vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS - vmovn.u16 d9, q2 - vmlsl.u16 q0, d16, d30 - vmlal.u16 q0, d17, d30 - pld [TMP4, PF_OFFS] - vld1.32 {d16}, [TMP4], STRIDE - vadd.u16 q12, q12, q13 - vld1.32 {d17}, [TMP4] - pld [TMP4, PF_OFFS] - vmull.u8 q11, d16, d28 - vmlal.u8 q11, d17, d29 - vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q1, d18, d31 -.endm - -.macro bilinear_interpolate_eight_pixels_8888_0565_tail - vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #(16 - 
BILINEAR_INTERPOLATION_BITS) - vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q2, d20, d30 - vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q3, d22, d31 - vmlal.u16 q3, d23, d31 - vadd.u16 q12, q12, q13 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) - vmovn.u16 d10, q0 - vmovn.u16 d11, q2 - vadd.u16 q12, q12, q13 - - vuzp.u8 d8, d9 - vuzp.u8 d10, d11 - vuzp.u8 d9, d11 - vuzp.u8 d8, d10 - vshll.u8 q6, d9, #8 - vshll.u8 q5, d10, #8 - vshll.u8 q7, d8, #8 - vsri.u16 q5, q6, #5 - vsri.u16 q5, q7, #11 - vst1.32 {d10, d11}, [OUT, :128]! -.endm - -.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #2 - asr TMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #2 - vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vuzp.u8 d8, d9 - vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q2, d20, d30 - vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS - vld1.32 {d20}, [TMP1], STRIDE - vmlsl.u16 q3, d22, d31 - vmlal.u16 q3, d23, d31 - vld1.32 {d21}, [TMP1] - vmull.u8 q8, d20, d28 - vmlal.u8 q8, d21, d29 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vld1.32 {d23}, [TMP2] - vmull.u8 q9, d22, d28 - asr TMP3, X, #16 - add X, X, UX - add TMP3, TOP, TMP3, lsl #2 - asr TMP4, X, #16 - add X, X, UX - add TMP4, TOP, TMP4, lsl #2 - vmlal.u8 q9, d23, d29 - vld1.32 {d22}, [TMP3], STRIDE - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vld1.32 {d23}, [TMP3] - vmull.u8 q10, d22, d28 - vmlal.u8 q10, d23, d29 - vmovn.u16 d10, q0 - vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS - vmovn.u16 d11, q2 - vmlsl.u16 q0, d16, d30 - vmlal.u16 q0, d17, d30 - pld [TMP4, PF_OFFS] - vld1.32 {d16}, [TMP4], STRIDE - vadd.u16 q12, q12, q13 - vld1.32 {d17}, [TMP4] - pld [TMP4, PF_OFFS] - vmull.u8 q11, d16, d28 - vmlal.u8 q11, d17, d29 - vuzp.u8 d10, d11 - vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q1, d18, d31 - - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #2 - asr TMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #2 - vmlal.u16 q1, d19, d31 - vuzp.u8 d9, d11 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS - vuzp.u8 d8, d10 - vmlsl.u16 q2, d20, d30 - vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS - vld1.32 {d20}, [TMP1], STRIDE - vmlsl.u16 q3, d22, d31 - vmlal.u16 q3, d23, d31 - vld1.32 {d21}, [TMP1] - vmull.u8 q8, d20, d28 - vmlal.u8 q8, d21, d29 - vshll.u8 q6, d9, #8 - vshll.u8 q5, d10, #8 - vshll.u8 q7, d8, #8 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vsri.u16 q5, q6, #5 - vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) - vsri.u16 q5, q7, #11 - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vld1.32 {d23}, [TMP2] - vmull.u8 q9, d22, d28 - asr TMP3, X, #16 - add X, X, UX - add TMP3, TOP, TMP3, lsl #2 - asr TMP4, X, #16 - add X, X, UX 
- add TMP4, TOP, TMP4, lsl #2
- vmlal.u8 q9, d23, d29
- vld1.32 {d22}, [TMP3], STRIDE
- vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
- vld1.32 {d23}, [TMP3]
- vmull.u8 q10, d22, d28
- vmlal.u8 q10, d23, d29
- vmovn.u16 d8, q0
- vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
- vmovn.u16 d9, q2
- vmlsl.u16 q0, d16, d30
- vmlal.u16 q0, d17, d30
- pld [TMP4, PF_OFFS]
- vld1.32 {d16}, [TMP4], STRIDE
- vadd.u16 q12, q12, q13
- vld1.32 {d17}, [TMP4]
- pld [TMP4, PF_OFFS]
- vmull.u8 q11, d16, d28
- vmlal.u8 q11, d17, d29
- vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
- vst1.32 {d10, d11}, [OUT, :128]!
- vmlsl.u16 q1, d18, d31
-.endm
-/*****************************************************************************/
-
generate_bilinear_scanline_func \
pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
2, 2, 28, BILINEAR_FLAG_UNROLL_4
diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
index 6a5a5fe..16c8d16 100644
--- a/pixman/pixman-arma64-neon-asm.h
+++ b/pixman/pixman-arma64-neon-asm.h
@@ -38,7 +38,7 @@
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros, which should implement basic code chunks responsible for
- * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
+ * pixels processing. See 'pixman-arma64-neon-asm.S' file for the usage
 * examples.
 *
 * TODO:
@@ -56,12 +56,6 @@
.set FLAG_DEINTERLEAVE_32BPP, 2
/*
- * Offset in stack where mask and source pointer/stride can be accessed
- * from 'init' macro. This is useful for doing special handling for solid mask.
- */
-.set ARGS_STACK_OFFSET, 40
-
-/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */
@@ -69,75 +63,95 @@
.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
/*
+ * prefetch mode
+ * available modes are:
+ * pldl1keep
+ * pldl1strm
+ * pldl2keep
+ * pldl2strm
+ * pldl3keep
+ * pldl3strm
+ */
+#define PREFETCH_MODE pldl1keep
+
+/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store of
 * pixel data).
 */
.macro pixldst1 op, elem_size, reg1, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
-.else
- op&.&elem_size {d&reg1}, [&mem_operand&]!
-.endif
+ op {v&reg1&.&elem_size}, [&mem_operand&], #8
.endm
.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
-.else
- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
-.endif
+ op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
.endm
.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
-.else
- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
-.endif
+ op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
.endm
-.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
- op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
+ op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
.endm
.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
- op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+ op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
.endm
.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
- op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+ op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
.endm
.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
- pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+ .if elem_size==32
+ pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
 %(basereg+6), %(basereg+7), mem_operand, abits
+ .elseif elem_size==16
+ pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
+ %(basereg+6), %(basereg+7), mem_operand, abits
+ .else
+ pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
+ %(basereg+6), %(basereg+7), mem_operand, abits
+ .endif
.elseif numbytes == 16
- pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+ .if elem_size==32
+ pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
+ .elseif elem_size==16
+ pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
+ .else
+ pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
+ .endif
.elseif numbytes == 8
- pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+ .if elem_size==32
+ pixldst1 op, 2s, %(basereg+1), mem_operand, abits
+ .elseif elem_size==16
+ pixldst1 op, 4h, %(basereg+1), mem_operand, abits
+ .else
+ pixldst1 op, 8b, %(basereg+1), mem_operand, abits
+ .endif
.elseif numbytes == 4
.if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
- pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+ pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
.elseif elem_size == 16
- pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
- pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+ pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
+ pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
.else
- pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+ pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
+ pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
+ pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
+ pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
.endif
.elseif numbytes == 2
.if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
- pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+ pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
.else
- pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+ pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
+ pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
.endif
.elseif numbytes == 1
- pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+ pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
.else
.error "unsupported size: numbytes"
.endif
@@ -146,22 +160,22 @@
.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+ pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
 %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
- pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+ pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
%(basereg+2), 7, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand .elseif (bpp == 24) && (numpix == 2) - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand .elseif (bpp == 24) && (numpix == 1) - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand .else - pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits + pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits .endif .endif .endm @@ -169,22 +183,22 @@ .macro pixst numpix, bpp, basereg, mem_operand, abits=0 .if bpp > 0 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) - pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ + pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \ %(basereg+6), %(basereg+7), mem_operand, abits .elseif (bpp == 24) && (numpix == 8) - pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand + pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand .elseif (bpp == 24) && (numpix == 4) - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand .elseif (bpp == 24) && (numpix == 2) - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand .elseif (bpp == 24) && (numpix == 1) - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand .else - pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits + pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits .endif .endif .endm @@ -213,41 +227,53 @@ .if elem_size == 16 asr TMP1, VX, #16 adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b +55: add TMP1, mem_operand, TMP1, lsl #1 asr TMP2, VX, #16 adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b +55: add TMP2, mem_operand, TMP2, lsl #1 - vld1.16 {d®1&[0]}, [TMP1, :16] + ld1 {v®1&.h}[0], [TMP1] asr TMP1, VX, #16 adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b +55: add TMP1, mem_operand, TMP1, lsl #1 - vld1.16 {d®1&[1]}, [TMP2, :16] + ld1 {v®1&.h}[1], 
[TMP2] asr TMP2, VX, #16 adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b +55: add TMP2, mem_operand, TMP2, lsl #1 - vld1.16 {d®1&[2]}, [TMP1, :16] - vld1.16 {d®1&[3]}, [TMP2, :16] + ld1 {v®1&.h}[2], [TMP1] + ld1 {v®1&.h}[3], [TMP2] .elseif elem_size == 32 asr TMP1, VX, #16 adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b +55: add TMP1, mem_operand, TMP1, lsl #2 asr TMP2, VX, #16 adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b +55: add TMP2, mem_operand, TMP2, lsl #2 - vld1.32 {d®1&[0]}, [TMP1, :32] - vld1.32 {d®1&[1]}, [TMP2, :32] + ld1 {v®1&.s}[0], [TMP1] + ld1 {v®1&.s}[1], [TMP2] .else .error "unsupported" .endif @@ -255,22 +281,22 @@ .macro pixld2_s elem_size, reg1, reg2, mem_operand .if 0 /* elem_size == 32 */ - asr TMP1, VX, #16 - add VX, VX, UNIT_X, lsl #1 - add TMP1, mem_operand, TMP1, lsl #2 - asr TMP2, VX, #16 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 sub VX, VX, UNIT_X - add TMP2, mem_operand, TMP2, lsl #2 - vld1.32 {d®1&[0]}, [TMP1, :32] - asr TMP1, VX, #16 - add VX, VX, UNIT_X, lsl #1 - add TMP1, mem_operand, TMP1, lsl #2 - vld1.32 {d®2&[0]}, [TMP2, :32] - asr TMP2, VX, #16 + add TMP2, mem_operand, TMP2, asl #2 + ld1 {v®1&.s}[0], [TMP1] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + ld1 {v®2&.s}[0], [TMP2, :32] + mov TMP2, VX, asr #16 add VX, VX, UNIT_X - add TMP2, mem_operand, TMP2, lsl #2 - vld1.32 {d®1&[1]}, [TMP1, :32] - vld1.32 {d®2&[1]}, [TMP2, :32] + add TMP2, mem_operand, TMP2, asl #2 + ld1 {v®1&.s}[1], [TMP1] + ld1 {v®2&.s}[1], [TMP2] .else pixld1_s elem_size, reg1, mem_operand pixld1_s elem_size, reg2, mem_operand @@ -281,17 +307,22 @@ .if elem_size == 16 asr TMP1, VX, #16 adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b +55: add TMP1, mem_operand, TMP1, lsl #1 - vld1.16 {d®1&[idx]}, [TMP1, :16] + ld1 {v®1&.h}[idx], [TMP1] .elseif elem_size == 32 - asr TMP1, VX, #16 + asr DUMMY, VX, #16 + mov TMP1, DUMMY adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b +55: add TMP1, mem_operand, TMP1, lsl #2 - vld1.32 {d®1&[idx]}, [TMP1, :32] + ld1 {v®1&.s}[idx], [TMP1] .endif .endm @@ -337,11 +368,19 @@ .endm .macro vuzp8 reg1, reg2 - vuzp.8 d®1, d®2 + umov DUMMY, v16.d[0] + uzp1 v16.8b, v®1&.8b, v®2&.8b + uzp2 v®2&.8b, v®1&.8b, v®2&.8b + mov v®1&.8b, v16.8b + mov v16.d[0], DUMMY .endm .macro vzip8 reg1, reg2 - vzip.8 d®1, d®2 + umov DUMMY, v16.d[0] + zip1 v16.8b, v®1&.8b, v®2&.8b + zip2 v®2&.8b, v®1&.8b, v®2&.8b + mov v®1&.8b, v16.8b + mov v16.d[0], DUMMY .endm /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ @@ -400,49 +439,61 @@ .macro cache_preload std_increment, boost_increment .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) -.if regs_shortage - PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ -.endif .if std_increment != 0 PF add PF_X, PF_X, #std_increment .endif PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #boost_increment - PF subne PF_CTL, PF_CTL, #1 + PF beq 71f + PF add PF_X, PF_X, #boost_increment + PF sub PF_CTL, PF_CTL, #1 +71: PF cmp PF_X, ORIG_W .if src_bpp_shift >= 0 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] .endif .if 
dst_r_bpp != 0 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] .endif .if mask_bpp_shift >= 0 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] -.endif - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 + PF lsl DUMMY, PF_X, #mask_bpp_shift + PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] +.endif + PF ble 71f + PF sub PF_X, PF_X, ORIG_W + PF subs PF_CTL, PF_CTL, #0x10 +71: + PF ble 72f .if src_bpp_shift >= 0 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 .endif .if dst_r_bpp != 0 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 .endif .if mask_bpp_shift >= 0 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb DUMMY, [PF_MASK, DUMMY] + PF add PF_MASK, PF_MASK, #1 .endif +72: .endif .endm .macro cache_preload_simple .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) .if src_bpp > 0 - pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] + prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] .endif .if dst_r_bpp > 0 - pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] + prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] .endif .if mask_bpp > 0 - pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] + prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] .endif .endif .endm @@ -462,14 +513,13 @@ process_pixblock_tail_head .if dst_w_bpp != 24 tst DST_R, #0xF - beq 2f - + beq 52f .irp lowbit, 1, 2, 4, 8, 16 local skip1 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) .if lowbit < 16 /* we don't need more than 16-byte alignment */ tst DST_R, #lowbit - beq 1f + beq 51f .endif pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK @@ -480,7 +530,7 @@ local skip1 .endif PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) sub W, W, #(lowbit * 8 / dst_w_bpp) -1: +51: .endif .endr pixdeinterleave src_bpp, src_basereg @@ -493,18 +543,19 @@ local skip1 process_pixblock_tail pixinterleave dst_w_bpp, dst_w_basereg + .irp lowbit, 1, 2, 4, 8, 16 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) .if lowbit < 16 /* we don't need more than 16-byte alignment */ tst DST_W, #lowbit - beq 1f + beq 51f .endif pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W -1: +51: .endif .endr .endif -2: +52: .endm /* @@ -528,11 +579,11 @@ local skip1 process_pixblock_tail, \ process_pixblock_tail_head tst W, #(pixblock_size - 1) - beq 2f + beq 52f .irp chunk_size, 16, 8, 4, 2, 1 .if pixblock_size > chunk_size tst W, #chunk_size - beq 1f + beq 51f pixld_src chunk_size, src_bpp, src_basereg, SRC pixld chunk_size, mask_bpp, mask_basereg, MASK .if dst_aligned_flag != 0 @@ -543,7 +594,7 @@ local skip1 .if cache_preload_flag != 0 PF add PF_X, PF_X, #chunk_size .endif -1: +51: .endif .endr pixdeinterleave src_bpp, src_basereg @@ -560,16 +611,16 @@ local skip1 .irp chunk_size, 16, 8, 4, 2, 1 .if pixblock_size > chunk_size tst W, #chunk_size - beq 1f + beq 51f .if dst_aligned_flag != 0 pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W .else pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W .endif -1: +51: .endif .endr -2: +52: .endm /* @@ -578,11 +629,7 @@ local 
skip1 * are already processed. */ .macro advance_to_next_scanline start_of_loop_label -.if regs_shortage - ldrd W, [sp] /* load W and H (width and height) from stack */ -.else mov W, ORIG_W -.endif add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift .if src_bpp != 0 add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift @@ -601,18 +648,15 @@ local skip1 .endif subs H, H, #1 mov DST_R, DST_W -.if regs_shortage - str H, [sp, #4] /* save updated height to stack */ -.endif bge start_of_loop_label .endm /* * Registers are allocated in the following way by default: - * d0, d1, d2, d3 - reserved for loading source pixel data - * d4, d5, d6, d7 - reserved for loading destination pixel data - * d24, d25, d26, d27 - reserved for loading mask pixel data - * d28, d29, d30, d31 - final destination pixel data for writeback to memory + * v0, v1, v2, v3 - reserved for loading source pixel data + * v4, v5, v6, v7 - reserved for loading destination pixel data + * v24, v25, v26, v27 - reserved for loading mask pixel data + * v28, v29, v30, v31 - final destination pixel data for writeback to memory */ .macro generate_composite_function fname, \ src_bpp_, \ @@ -632,8 +676,23 @@ local skip1 mask_basereg_ = 24 pixman_asm_function fname - - push {r4-r12, lr} /* save all registers */ + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 232 /* push all registers */ + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + stp x8, x9, [x29, -80] + stp x10, x11, [x29, -96] + stp x12, x13, [x29, -112] + stp x14, x15, [x29, -128] + stp x16, x17, [x29, -144] + stp x18, x19, [x29, -160] + stp x20, x21, [x29, -176] + stp x22, x23, [x29, -192] + stp x24, x25, [x29, -208] + stp x26, x27, [x29, -224] + str x28, [x29, -232] /* * Select prefetch type for this function. 
If prefetch distance is @@ -671,52 +730,36 @@ local skip1 /* * Assign symbolic names to registers */ - W .req r0 /* width (is updated during processing) */ - H .req r1 /* height (is updated during processing) */ - DST_W .req r2 /* destination buffer pointer for writes */ - DST_STRIDE .req r3 /* destination image stride */ - SRC .req r4 /* source buffer pointer */ - SRC_STRIDE .req r5 /* source image stride */ - DST_R .req r6 /* destination buffer pointer for reads */ - - MASK .req r7 /* mask pointer */ - MASK_STRIDE .req r8 /* mask stride */ - - PF_CTL .req r9 /* combined lines counter and prefetch */ + W .req x0 /* width (is updated during processing) */ + H .req x1 /* height (is updated during processing) */ + DST_W .req x2 /* destination buffer pointer for writes */ + DST_STRIDE .req x3 /* destination image stride */ + SRC .req x4 /* source buffer pointer */ + SRC_STRIDE .req x5 /* source image stride */ + MASK .req x6 /* mask pointer */ + MASK_STRIDE .req x7 /* mask stride */ + + DST_R .req x8 /* destination buffer pointer for reads */ + + PF_CTL .req x9 /* combined lines counter and prefetch */ /* distance increment counter */ - PF_X .req r10 /* pixel index in a scanline for current */ + PF_X .req x10 /* pixel index in a scanline for current */ /* pretetch position */ - PF_SRC .req r11 /* pointer to source scanline start */ + PF_SRC .req x11 /* pointer to source scanline start */ /* for prefetch purposes */ - PF_DST .req r12 /* pointer to destination scanline start */ + PF_DST .req x12 /* pointer to destination scanline start */ /* for prefetch purposes */ - PF_MASK .req r14 /* pointer to mask scanline start */ + PF_MASK .req x13 /* pointer to mask scanline start */ /* for prefetch purposes */ -/* - * Check whether we have enough registers for all the local variables. - * If we don't have enough registers, original width and height are - * kept on top of stack (and 'regs_shortage' variable is set to indicate - * this for the rest of code). Even if there are enough registers, the - * allocation scheme may be a bit different depending on whether source - * or mask is not used. 
- */ -.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) - ORIG_W .req r10 /* saved original width */ - DUMMY .req r12 /* temporary register */ - .set regs_shortage, 0 -.elseif mask_bpp == 0 - ORIG_W .req r7 /* saved original width */ - DUMMY .req r8 /* temporary register */ - .set regs_shortage, 0 -.elseif src_bpp == 0 - ORIG_W .req r4 /* saved original width */ - DUMMY .req r5 /* temporary register */ - .set regs_shortage, 0 -.else - ORIG_W .req r1 /* saved original width */ - DUMMY .req r1 /* temporary register */ - .set regs_shortage, 1 -.endif + + ORIG_W .req x14 /* saved original width */ + DUMMY .req x15 /* temporary register */ + + sxtw x0, w0 + sxtw x1, w1 + sxtw x3, w3 + sxtw x5, w5 + sxtw x7, w7 .set mask_bpp_shift, -1 .if src_bpp == 32 @@ -770,19 +813,7 @@ local skip1 .error "invalid prefetch distance (prefetch_distance)" .endif -.if src_bpp > 0 - ldr SRC, [sp, #40] -.endif -.if mask_bpp > 0 - ldr MASK, [sp, #48] -.endif PF mov PF_X, #0 -.if src_bpp > 0 - ldr SRC_STRIDE, [sp, #44] -.endif -.if mask_bpp > 0 - ldr MASK_STRIDE, [sp, #52] -.endif mov DST_R, DST_W .if src_bpp == 24 @@ -805,22 +836,16 @@ local skip1 PF mov PF_DST, DST_R PF mov PF_MASK, MASK /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ - PF mov PF_CTL, H, lsl #4 - PF add PF_CTL, #(prefetch_distance - 0x10) + PF lsl DUMMY, H, #4 + PF mov PF_CTL, DUMMY + PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10) init -.if regs_shortage - push {r0, r1} -.endif subs H, H, #1 -.if regs_shortage - str H, [sp, #4] /* save updated height to stack */ -.else mov ORIG_W, W -.endif blt 9f cmp W, #(pixblock_size * 2) - blt 8f + blt 800f /* * This is the start of the pipelined loop, which if optimized for * long scanlines @@ -841,13 +866,15 @@ local skip1 cache_preload 0, pixblock_size cache_preload_simple subs W, W, #(pixblock_size * 2) - blt 2f -1: + blt 200f + +100: process_pixblock_tail_head cache_preload_simple subs W, W, #pixblock_size - bge 1b -2: + bge 100b + +200: process_pixblock_tail pixst_a pixblock_size, dst_w_bpp, \ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W @@ -859,20 +886,35 @@ local skip1 process_pixblock_tail_head advance_to_next_scanline 0b -.if regs_shortage - pop {r0, r1} -.endif cleanup - pop {r4-r12, pc} /* exit */ +1000: + /* pop all registers */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -112] + ldp x14, x15, [x29, -128] + ldp x16, x17, [x29, -144] + ldp x18, x19, [x29, -160] + ldp x20, x21, [x29, -176] + ldp x22, x23, [x29, -192] + ldp x24, x25, [x29, -208] + ldp x26, x27, [x29, -224] + ldr x28, [x29, -232] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ /* * This is the start of the loop, designed to process images with small width * (less than pixblock_size * 2 pixels). In this case neither pipelining * nor prefetch are used. 
*/ -8: +800: /* Process exactly pixblock_size pixels if needed */ tst W, #pixblock_size - beq 1f + beq 100f pixld pixblock_size, dst_r_bpp, \ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R fetch_src_pixblock @@ -882,19 +924,33 @@ local skip1 process_pixblock_tail pixst pixblock_size, dst_w_bpp, \ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W -1: +100: /* Process the remaining trailing pixels in the scanline */ process_trailing_pixels 0, 0, \ process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head - advance_to_next_scanline 8b + advance_to_next_scanline 800b 9: -.if regs_shortage - pop {r0, r1} -.endif cleanup - pop {r4-r12, pc} /* exit */ + /* pop all registers */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -112] + ldp x14, x15, [x29, -128] + ldp x16, x17, [x29, -144] + ldp x18, x19, [x29, -160] + ldp x20, x21, [x29, -176] + ldp x22, x23, [x29, -192] + ldp x24, x25, [x29, -208] + ldp x26, x27, [x29, -224] + ldr x28, [x29, -232] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ .purgem fetch_src_pixblock .purgem pixld_src @@ -940,8 +996,8 @@ local skip1 mask_basereg_ = 24 pixman_asm_function fname - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + /* * Make some macro arguments globally visible and accessible * from other macros @@ -954,45 +1010,63 @@ local skip1 .set dst_r_basereg, dst_r_basereg_ .set src_basereg, src_basereg_ .set mask_basereg, mask_basereg_ - + .if use_nearest_scaling != 0 /* * Assign symbolic names to registers for nearest scaling */ - W .req r0 - DST_W .req r1 - SRC .req r2 - VX .req r3 - UNIT_X .req ip - MASK .req lr - TMP1 .req r4 - TMP2 .req r5 - DST_R .req r6 - SRC_WIDTH_FIXED .req r7 + W .req x0 + DST_W .req x1 + SRC .req x2 + VX .req x3 + UNIT_X .req x4 + SRC_WIDTH_FIXED .req x5 + MASK .req x6 + TMP1 .req x8 + TMP2 .req x9 + DST_R .req x10 + DUMMY .req x30 .macro pixld_src x:vararg pixld_s x .endm - ldr UNIT_X, [sp] - push {r4-r8, lr} - ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] - .if mask_bpp != 0 - ldr MASK, [sp, #(24 + 8)] - .endif + sxtw x0, w0 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 + + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 88 + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + stp x8, x9, [x29, -80] + str x10, [x29, -88] .else /* * Assign symbolic names to registers */ - W .req r0 /* width (is updated during processing) */ - DST_W .req r1 /* destination buffer pointer for writes */ - SRC .req r2 /* source buffer pointer */ - DST_R .req ip /* destination buffer pointer for reads */ - MASK .req r3 /* mask pointer */ + W .req x0 /* width (is updated during processing) */ + DST_W .req x1 /* destination buffer pointer for writes */ + SRC .req x2 /* source buffer pointer */ + MASK .req x3 /* mask pointer */ + DST_R .req x4 /* destination buffer pointer for reads */ + DUMMY .req x30 .macro pixld_src x:vararg pixld x .endm + + sxtw x0, w0 + + stp x29, x30, [sp, -16]! 
+ mov x29, sp + sub sp, sp, 64 + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 .endif .if (((flags) & FLAG_DST_READWRITE) != 0) @@ -1015,14 +1089,14 @@ local skip1 mov DST_R, DST_W cmp W, #pixblock_size - blt 8f + blt 800f ensure_destination_ptr_alignment process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head subs W, W, #pixblock_size - blt 7f + blt 700f /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ pixld_a pixblock_size, dst_r_bpp, \ @@ -1032,16 +1106,16 @@ local skip1 (mask_basereg - pixblock_size * mask_bpp / 64), MASK process_pixblock_head subs W, W, #pixblock_size - blt 2f -1: + blt 200f +100: process_pixblock_tail_head subs W, W, #pixblock_size - bge 1b -2: + bge 100b +200: process_pixblock_tail pixst_a pixblock_size, dst_w_bpp, \ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W -7: +700: /* Process the remaining trailing pixels in the scanline (dst aligned) */ process_trailing_pixels 0, 1, \ process_pixblock_head, \ @@ -1050,11 +1124,23 @@ local skip1 cleanup .if use_nearest_scaling != 0 - pop {r4-r8, pc} /* exit */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldr x10, [x29, -96] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ .else - bx lr /* exit */ -.endif -8: + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ +.endif +800: /* Process the remaining trailing pixels in the scanline (dst unaligned) */ process_trailing_pixels 0, 0, \ process_pixblock_head, \ @@ -1062,10 +1148,17 @@ local skip1 process_pixblock_tail_head cleanup - .if use_nearest_scaling != 0 - pop {r4-r8, pc} /* exit */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldr x10, [x29, -88] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ + .unreq DUMMY .unreq DST_R .unreq SRC .unreq W @@ -1078,8 +1171,14 @@ local skip1 .unreq SRC_WIDTH_FIXED .else - bx lr /* exit */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ + .unreq DUMMY .unreq SRC .unreq MASK .unreq DST_R @@ -1110,17 +1209,15 @@ local skip1 .endm /* - * Prologue/epilogue variant which additionally saves/restores d8-d15 + * Prologue/epilogue variant which additionally saves/restores v8-v15 * registers (they need to be saved/restored by callee according to ABI). * This is required if the code needs to use all the NEON registers. */ .macro default_init_need_all_regs - vpush {d8-d15} .endm .macro default_cleanup_need_all_regs - vpop {d8-d15} .endm /******************************************************************************/ @@ -1134,22 +1231,22 @@ local skip1 * value (in) is lost. 
*/ .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b - vshrn.u16 out_r, in, #8 - vshrn.u16 out_g, in, #3 - vsli.u16 in, in, #5 - vmov.u8 out_a, #255 - vsri.u8 out_r, out_r, #5 - vsri.u8 out_g, out_g, #6 - vshrn.u16 out_b, in, #2 + shrn &out_r&.8b, &in&.8h, #8 + shrn &out_g&.8b, &in&.8h, #3 + sli &in&.8h, &in&.8h, #5 + movi &out_a&.8b, #255 + sri &out_r&.8b, &out_r&.8b, #5 + sri &out_g&.8b, &out_g&.8b, #6 + shrn &out_b&.8b, &in&.8h, #2 .endm .macro convert_0565_to_x888 in, out_r, out_g, out_b - vshrn.u16 out_r, in, #8 - vshrn.u16 out_g, in, #3 - vsli.u16 in, in, #5 - vsri.u8 out_r, out_r, #5 - vsri.u8 out_g, out_g, #6 - vshrn.u16 out_b, in, #2 + shrn &out_r&.8b, &in&.8h, #8 + shrn &out_g&.8b, &in&.8h, #3 + sli &in&.8h, &in&.8h, #5 + sri &out_r&.8b, &out_r&.8b, #5 + sri &out_g&.8b, &out_g&.8b, #6 + shrn &out_b&.8b, &in&.8h, #2 .endm /* @@ -1159,11 +1256,14 @@ local skip1 * registers (tmp1, tmp2) */ .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 - vshll.u8 tmp1, in_g, #8 - vshll.u8 out, in_r, #8 - vshll.u8 tmp2, in_b, #8 - vsri.u16 out, tmp1, #5 - vsri.u16 out, tmp2, #11 + ushll &tmp1&.8h, &in_g&.8b, #7 + shl &tmp1&.8h, &tmp1&.8h, #1 + ushll &out&.8h, &in_r&.8b, #7 + shl &out&.8h, &out&.8h, #1 + ushll &tmp2&.8h, &in_b&.8b, #7 + shl &tmp2&.8h, &tmp2&.8h, #1 + sri &out&.8h, &tmp1&.8h, #5 + sri &out&.8h, &tmp2&.8h, #11 .endm /* @@ -1173,12 +1273,14 @@ local skip1 * value from 'in' is lost */ .macro convert_four_0565_to_x888_packed in, out0, out1, tmp - vshl.u16 out0, in, #5 /* G top 6 bits */ - vshl.u16 tmp, in, #11 /* B top 5 bits */ - vsri.u16 in, in, #5 /* R is ready in top bits */ - vsri.u16 out0, out0, #6 /* G is ready in top bits */ - vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ - vshr.u16 out1, in, #8 /* R is in place */ - vsri.u16 out0, tmp, #8 /* G & B is in place */ - vzip.u16 out0, out1 /* everything is in place */ + shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */ + shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */ + sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */ + sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */ + sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */ + ushr &out1&.4h, &in&.4h, #8 /* R is in place */ + sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */ + zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */ + zip2 &out1&.4h, &out0&.4h, &out1&.4h + mov &out0&.d[0], &tmp&.d[0] .endm diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h index 73a5414..81e0f23 100644 --- a/pixman/pixman-private.h +++ b/pixman/pixman-private.h @@ -607,6 +607,11 @@ pixman_implementation_t * _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); #endif +#ifdef USE_ARM_A64_NEON +pixman_implementation_t * +_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); +#endif + #ifdef USE_MIPS_DSPR2 pixman_implementation_t * _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback); |
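
For reference, the convert_0565_to_8888 / convert_8888_to_0565 macros near the end of pixman-arma64-neon-asm.h implement the usual r5g6b5 <-> a8r8g8b8 packing: expansion replicates the top bits of each channel into the freed low bits (the sri #5 / sri #6 steps), while packing simply truncates each channel to its top 5/6/5 bits (the sri #5 / sri #11 steps). The C below is a minimal scalar sketch of that behaviour, not part of the patch; the function names are illustrative only.

    /* Scalar reference for the 0565 <-> 8888 conversions above (illustrative only).
     * Expansion replicates the top channel bits into the low bits, as the
     * shrn/sri sequence does; packing truncates to the top 5/6/5 bits. */
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t ref_0565_to_8888 (uint16_t in)
    {
        uint32_t r = (in >> 11) & 0x1f;
        uint32_t g = (in >> 5)  & 0x3f;
        uint32_t b =  in        & 0x1f;

        r = (r << 3) | (r >> 2);   /* replicate top bits (sri #5) */
        g = (g << 2) | (g >> 4);   /* replicate top bits (sri #6) */
        b = (b << 3) | (b >> 2);

        return (0xffu << 24) | (r << 16) | (g << 8) | b;   /* alpha forced to 255 (movi #255) */
    }

    static uint16_t ref_8888_to_0565 (uint32_t in)
    {
        uint32_t r = (in >> 16) & 0xff;
        uint32_t g = (in >> 8)  & 0xff;
        uint32_t b =  in        & 0xff;

        /* keep only the top 5/6/5 bits, as the sri #5 / sri #11 sequence does */
        return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }

    int main (void)
    {
        uint16_t px = 0xf81f;   /* pure magenta in r5g6b5 */
        uint32_t wide = ref_0565_to_8888 (px);
        printf ("%04x -> %08x -> %04x\n", px, wide, ref_8888_to_0565 (wide));
        return 0;
    }

Round-tripping an r5g6b5 value through these two helpers returns the original pixel, which matches the NEON macros: truncation on pack, bit replication (rather than zero fill) on unpack.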