summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Avison <bavison@riscosopen.org>2014-07-28 14:02:25 +0100
committerBen Avison <bavison@riscosopen.org>2015-10-15 15:04:04 +0100
commit3a6ced6399c7b5497690f6ca7fe033a88933b25f (patch)
treef96c6092ba70e466e85407078b1109373c95aaad
parent190ac85ddd8a0f90440b4e5cb4d39042d7399b3c (diff)
armv6: Add fetcher for a8r8g8b8 nearest-neighbour transformed images
This is constrained to support X increments in the positive X direction only, so this means scaled images (except those reflected in the Y axis) plus parallelogram transformations which preserve the direction of the X axis. It also doesn't attempt to support any form of image repeat. With this optimisation, some operations constructed from fetcher and combiner calls using general_composite_rect() now outperform the versions constructed from FAST_NEAREST macros in pixman-fast-path.c, but unfortunately the FAST_NEAREST ones have higher priority in fast path lookup. Here are some benchmarks for the in_reverse_8888_8888 operation, which is not affected by that fast-path-priority issue: lowlevel-blt-bench -n : Before After Mean StdDev Mean StdDev Confidence Change L1 10.2 0.0 27.1 0.2 100.0% +164.8% L2 8.2 0.1 23.0 0.4 100.0% +179.2% M 8.3 0.0 24.8 0.0 100.0% +200.3% HT 5.5 0.0 12.7 0.0 100.0% +129.9% VT 5.4 0.0 12.1 0.0 100.0% +123.2% R 5.4 0.0 11.9 0.1 100.0% +122.7% RT 2.8 0.0 5.4 0.1 100.0% +91.9% affine-bench for 5 different scaling factors: Before After Mean StdDev Mean StdDev Confidence Change 0.5 11.1 0.0 28.3 0.0 100.0% +155.1% 0.75 10.5 0.0 26.4 0.0 100.0% +152.2% 1.0 9.9 0.0 24.6 0.0 100.0% +147.5% 1.5 9.0 0.0 21.8 0.0 100.0% +141.4% 2.0 8.3 0.0 19.7 0.0 100.0% +138.4%
-rw-r--r--pixman/pixman-arm-common.h84
-rw-r--r--pixman/pixman-arm-simd-asm-scaled.S9
-rw-r--r--pixman/pixman-arm-simd-asm-scaled.h367
-rw-r--r--pixman/pixman-arm-simd.c19
4 files changed, 479 insertions, 0 deletions
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 31e75ac..cc27848 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -446,6 +446,90 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp, \
/*****************************************************************************/
+/* Support for nearest scaled fetchers and fast paths */
+
+#define PIXMAN_ARM_IMAGE_GET_SCALED(image, unscaled_x, unscaled_y, type, stride, out_bits, scaled_x, scaled_y, uxx, uxy, uyy) /* Map the centre of dest pixel (unscaled_x, unscaled_y) through the image's transform and emit the source pointer/stride, the 16.16 fixed-point source coordinate, and the per-pixel increments. Assumes matrix[1][0] == 0 (see FAST_PATH_Y_UNIT_ZERO below). */ \
+ do \
+ { \
+ pixman_image_t *__image__ = (image); \
+ pixman_fixed_t __offset__ = pixman_int_to_fixed (unscaled_x) + pixman_fixed_1 / 2; /* sample at the pixel centre, not the corner */ \
+ pixman_fixed_t __line__ = pixman_int_to_fixed (unscaled_y) + pixman_fixed_1 / 2; \
+ pixman_fixed_t __x__, __y__; \
+ int64_t __x64__, __y64__; \
+ pixman_fixed_t (*__matrix__)[3] = __image__->common.transform->matrix; \
+ \
+ __x64__ = (int64_t) __matrix__[0][0] * (__offset__ & 0xFFFF); /* fractional parts multiplied at 64-bit precision... */ \
+ __x64__ += (int64_t) __matrix__[0][1] * (__line__ & 0xFFFF); \
+ __x__ = (__x64__ + 0x8000) >> 16; /* ...then rounded to nearest before narrowing to 16.16 */ \
+ __x__ += __matrix__[0][0] * (__offset__ >> 16); /* integer parts folded in with plain 32-bit arithmetic */ \
+ __x__ += __matrix__[0][1] * (__line__ >> 16); \
+ __x__ += __matrix__[0][2]; \
+ __y64__ = (int64_t) __matrix__[1][1] * (__line__ & 0xFFFF); /* no matrix[1][0] term: x contributes nothing to y here */ \
+ __y__ = (__y64__ + 0x8000) >> 16; \
+ __y__ += __matrix__[1][1] * (__line__ >> 16); \
+ __y__ += __matrix__[1][2]; \
+ \
+ (stride) = __image__->bits.rowstride * (int) sizeof (uint32_t) / (int) sizeof (type); /* rowstride is stored in uint32_t units; convert to pixels of 'type' */ \
+ (out_bits) = (type *)__image__->bits.bits; \
+ (scaled_x) = __x__; \
+ (scaled_y) = __y__; \
+ (uxx) = __matrix__[0][0]; /* source x step per dest x step */ \
+ (uxy) = __matrix__[0][1]; /* source x step per dest y step */ \
+ (uyy) = __matrix__[1][1]; /* source y step per dest y step */ \
+ } while (0)
+
+#define PIXMAN_ARM_BIND_GET_SCANLINE_NEAREST_SCALED_COVER(cputype, name, alias, type) /* Define a source-iterator get_scanline callback that delegates to the assembly fetcher declared via DECLARE_NEAREST_SCALED_SCANLINE_FUNCTION. */ \
+ \
+DECLARE_NEAREST_SCALED_SCANLINE_FUNCTION (cputype, name, alias, type) \
+ \
+static uint32_t * \
+cputype##_get_scanline_nearest_scaled_cover_##name (pixman_iter_t *iter, \
+ const uint32_t *mask) \
+{ \
+ int stride; \
+ type *bits, *source; \
+ pixman_fixed_t x, y, uxx, uxy, uyy; \
+ \
+ PIXMAN_ARM_IMAGE_GET_SCALED (iter->image, iter->x, iter->y++, type, /* y++: each call fetches one scanline, then advances */ \
+ stride, bits, x, y, uxx, uxy, uyy); \
+ \
+ (void) uxy; /* unused: this path only steps x within a scanline */ \
+ (void) uyy; \
+ source = bits + stride * pixman_fixed_to_int (y - pixman_fixed_e); /* pixman_fixed_e bias gives pixman's nearest-sample rounding */ \
+ \
+ CALL_NEAREST_SCALED_SCANLINE_FUNCTION ( \
+ cputype, name, alias, \
+ iter->width, x - pixman_fixed_e, uxx, \
+ iter->buffer, source, mask, iter->image->bits.width); \
+ \
+ return iter->buffer; \
+}
+
+#define PIXMAN_ARM_NEAREST_AFFINE_FLAGS /* image flags that must all be set for a nearest-filtered affine fetcher to apply */ \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NARROW_FORMAT | \
+ FAST_PATH_NEAREST_FILTER | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM)
+
+#define PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS /* extra constraints matching the assembly fetcher's limitations */ \
+ (PIXMAN_ARM_NEAREST_AFFINE_FLAGS | \
+ FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | /* all samples in-bounds, so no repeat handling needed */ \
+ FAST_PATH_X_UNIT_POSITIVE | /* asm only supports positive x increments */ \
+ FAST_PATH_Y_UNIT_ZERO) /* matrix[1][0] == 0, as assumed by PIXMAN_ARM_IMAGE_GET_SCALED */
+
+#define PIXMAN_ARM_NEAREST_SCALED_COVER_FETCHER(cputype, format) /* pixman_iter_info_t entry wiring the fetcher above into iterator lookup */ \
+ { PIXMAN_ ## format, \
+ PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS, \
+ ITER_NARROW | ITER_SRC, \
+ NULL, /* no custom iterator init */ \
+ cputype ## _get_scanline_nearest_scaled_cover_ ## format, \
+ NULL /* no write-back: source iterator only */ \
+ }
+
+/*****************************************************************************/
+
/* Support for untransformed fetchers and writeback */
#define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name) \
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index e050292..2c7e091 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -38,6 +38,7 @@
.p2align 2
#include "pixman-arm-asm.h"
+#include "pixman-arm-simd-asm-scaled.h"
/*
* Note: This code is only using armv5te instructions (not even armv6),
@@ -154,3 +155,11 @@ generate_nearest_scanline_func \
generate_nearest_scanline_func \
pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
+
+/******************************************************************************/
+
+generate_nearest_scaled_cover_function \
+ pixman_get_scanline_nearest_scaled_cover_a8r8g8b8_asm_armv6, 32, \
+ 3, 3 /* prefetch distances */, nop_macro, nop_macro
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd-asm-scaled.h b/pixman/pixman-arm-simd-asm-scaled.h
new file mode 100644
index 0000000..fb6eb44
--- /dev/null
+++ b/pixman/pixman-arm-simd-asm-scaled.h
@@ -0,0 +1,367 @@
+/*
+ * Copyright © 2014 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ */
+
.set log2_32, 5 @ log2_<bpp> lookup, used to convert pixel counts to byte shifts
.set log2_16, 4
.set log2_8, 3
.set log2_4, 2
.set log2_2, 1
.set log2_1, 0
+
+.macro ldrx bpp, cond, tail @ load one \bpp-bit pixel; \cond is an optional condition code, \tail is <reg, addressing-mode>
+ .if \bpp == 32
+ ldr\cond \tail
+ .elseif \bpp == 16
+ ldr\cond\()h \tail @ halfword load
+ .elseif \bpp == 8
+ ldr\cond\()b \tail @ byte load
+ .else
+ .error "Input bits per pixel not supported"
+ .endif
+.endm
+
+.macro branch cond, label1, label2 @ b<cond> to \label1 when supplied, else to \label2 — lets callers redirect an inner loop's default exit
+ .ifnc "\label1", ""
+ b\cond \label1
+ .else
+ b\cond \label2
+ .endif
+.endm
+
+.macro nearest_scaled_cover_enlarge_mask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store @ one output pixel, enlarge case (UX < 1.0) with mask; PIXEL is loaded lazily and VALID tracks whether it holds data
+ .ifnc \mask_hint, mask_is_0
+ teq VALID, #1 @ is the cached PIXEL already loaded?
+ .ifc \convert, nop_macro
+ .ifnc \mask_hint, mask_is_non_0
+ ittt ne @ IT blocks for Thumb-2 assembly; ignored in ARM state
+ teqne \reg, #0 @ also skip the load when the mask word is zero
+ .else
+ itt ne
+ .endif
+ ldrx \bpp, ne, <PIXEL, [SRC]>
+ movne VALID, #1
+ .else
+ .ifnc \mask_hint, mask_is_non_0
+ it ne
+ teqne \reg, #0
+ .endif
+ beq 1101f @ already valid (or mask 0): skip load + convert
+ ldrx \bpp,, <PIXEL, [SRC]>
+ mov VALID, #1
+ \convert PIXEL, TMP
+1101:
+ .endif
+ .endif
+ adds ACCUM, ACCUM, UX @ step source x; carry set means we move on to the next source pixel
+ .ifnc \mask_hint, mask_is_0
+ mov \reg, PIXEL
+ .endif
+ \store
+ branch cc, \exit_label, 1103f @ no carry: the same source pixel will be reused
+ add SRC, SRC, #\bpp/8 @ advance source pointer...
+ mov VALID, #0 @ ...and invalidate the cached PIXEL
+ tst SRC, #31
+ branch ne, \exit_label, 1103f @ only prefetch when crossing a 32-byte cacheline
+ subs PLDS, PLDS, #32 @ PLDS = remaining preload "braking distance"
+ branch lt, \exit_label, 1103f @ near the end of the scanline: stop prefetching
+ pld [SRC, #prefetch_distance_src*32]
+1103:
+.endm
+
+.macro nearest_scaled_cover_enlarge_nomask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store @ one output pixel, enlarge case, no mask: PIXEL always holds the current (converted) source pixel
+ adds ACCUM, ACCUM, UX @ carry set = advance to the next source pixel
+ mov \reg, PIXEL
+ \store
+ branch cc, \exit_label, 1203f @ same source pixel again: done with this iteration
+ .ifnc "\may_be_final",""
+ teq COUNT, #0 @ on the final pixel, don't read past the last source pixel
+ ldrx \bpp, ne, <PIXEL, [SRC, #\bpp/8]!!> @ !! collapses to a single ! (writeback) under the assembler's <...> quoting — NOTE(review): assumes altmacro-style escaping; verify against the .S framework
+ .else
+ ldrx \bpp,, <PIXEL, [SRC, #\bpp/8]!!>
+ .endif
+ tst SRC, #31
+ \convert PIXEL, TMP
+ branch ne, \exit_label, 1203f @ only prefetch on a 32-byte cacheline crossing
+ subs PLDS, PLDS, #32 @ braking distance: stop prefetching near the scanline end
+ branch lt, \exit_label, 1203f
+ pld [SRC, #prefetch_distance_src*32]
+1203:
+.endm
+
+.macro nearest_scaled_cover_reduce_mask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store @ one output pixel, reduce case (UX >= 1.0) with mask; x is kept as 48 bits split across XHI:XLO
+ add PTR, SRC, XHI, lsl #log2_\bpp - 3 @ PTR = address of the current source pixel
+ mov TMP, XHI @ remember old integer x for the cacheline-crossing test below
+ adds XLO, XLO, UX, lsl #16 @ 48-bit add of UX to XHI:XLO
+ adc XHI, XHI, UX, lsr #16
+ .ifc "\mask_hint",""
+ teq \reg, #0 @ mask word zero: the pixel value is never used
+ .ifnc \convert, nop_macro
+ beq 1301f
+ ldrx \bpp,, <\reg, [PTR]>
+ .else
+ ldrx \bpp, ne, <\reg, [PTR]> @ conditional load: skip when masked out
+ .endif
+ eor TMP, TMP, XHI @ Z set iff old and new x are in the same 32-byte source line
+ bics TMP, TMP, #255/\bpp
+ \convert \reg, TMP
+ .ifnc \convert, nop_macro
+ b 1302f
+1301: eor TMP, TMP, XHI @ masked-out path still needs the cacheline test
+ bics TMP, TMP, #255/\bpp
+1302:
+ .endif
+ .else
+ .ifc \mask_hint, mask_is_non_0
+ ldrx \bpp,, <\reg, [PTR]>
+ .endif
+ eor TMP, TMP, XHI @ same cacheline test as above
+ bics TMP, TMP, #255/\bpp
+ .ifc \mask_hint, mask_is_non_0
+ \convert \reg, TMP
+ .endif
+ .endif
+ \store
+ branch eq, \exit_label, 1303f @ still in the same source cacheline: no prefetch needed
+ subs PLDS, PLDS, #32 @ braking distance: stop prefetching near the scanline end
+ branch lt, \exit_label, 1303f
+ bic PTR, PTR, #31 @ base of *previous* cacheline
+ pld [PTR, #(prefetch_distance_src+1)*32]
+1303:
+.endm
+
+.macro nearest_scaled_cover_reduce_nomask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store @ one output pixel, reduce case, no mask: unconditional load of each sampled source pixel
+ add PTR, SRC, XHI, lsl #log2_\bpp - 3 @ PTR = address of the current source pixel
+ mov TMP, XHI @ remember old integer x for the cacheline-crossing test
+ adds XLO, XLO, UX, lsl #16 @ 48-bit add of UX to XHI:XLO
+ adc XHI, XHI, UX, lsr #16
+ ldrx \bpp,, <\reg, [PTR]>
+ eor TMP, TMP, XHI @ Z set iff old and new x are in the same 32-byte source line
+ bics TMP, TMP, #255/\bpp
+ \convert \reg, TMP
+ \store
+ branch eq, \exit_label, 1403f @ no cacheline crossing: skip prefetch
+ subs PLDS, PLDS, #32 @ braking distance: stop prefetching near the scanline end
+ branch lt, \exit_label, 1403f
+ bic PTR, PTR, #31 @ base of *previous* cacheline
+ pld [PTR, #(prefetch_distance_src+1)*32]
+1403:
+.endm
+
+.macro process1 bpp, has_mask, disable_prefetch, inner_loop, convert @ emit one dest pixel, specialising the inner loop on whether the mask word is zero
+ .if \has_mask
+ ldr WK0, [MASK], #4
+ .if !\disable_prefetch
+ tst MASK, #31 @ prefetch the mask once per 32-byte line
+ bne 1501f
+ pld [MASK, #prefetch_distance_mask*32]
+1501:
+ .endif
+ teq WK0, #0
+ bne 1502f
+ \inner_loop \bpp, WK0, \convert, mask_is_0, 1, 1503f, <add DST, DST, #4> @ mask word 0: skip the store, just step DST
+ b 1503f
+ .endif
+1502: \inner_loop \bpp, WK0, \convert, mask_is_non_0, 1,, <str WK0, [DST], #4>
+1503:
+.endm
+
+.macro process4 bpp, has_mask, disable_mask_prefetch, inner_loop, convert @ emit four dest pixels; fast path taken when all four mask words are zero
+ .if \has_mask
+ ldmia MASK!, {WK0-WK3}
+ .if !\disable_mask_prefetch
+ bic TMP, MASK, #31 @ cacheline-aligned mask prefetch
+ pld [TMP, #prefetch_distance_mask*32]
+ .endif
+ orr WK0, WK0, WK1 @ OR the four mask words together to test them all at once
+ orr WK2, WK2, WK3
+ orrs WK0, WK0, WK2
+ bne 1601f
+ \inner_loop \bpp, WK0, \convert, mask_is_0 @ all masked out: advance coordinates, store nothing
+ \inner_loop \bpp, WK1, \convert, mask_is_0
+ \inner_loop \bpp, WK2, \convert, mask_is_0
+ \inner_loop \bpp, WK3, \convert, mask_is_0, 1, 1602f, <add DST, DST, #4*4>
+ b 1602f
+ .endif
+1601: \inner_loop \bpp, WK0, \convert @ general case: gather four pixels then store as one burst
+ \inner_loop \bpp, WK1, \convert
+ \inner_loop \bpp, WK2, \convert
+ \inner_loop \bpp, WK3, \convert,, 1,, <stmia DST!!, {WK0,WK1,WK2,WK3}>
+1602:
+.endm
+
+.macro process bpp, has_mask, inner_loop, convert @ whole-scanline driver: align DST, run 4-pixel groups, mop up the tail, then return
+ cmp COUNT, #2 * 4 - 1 - 1 @ guaranteed at least one aligned half-cacheline output?
+ blo 1706f
+ tst DST, #15 @ single-pixel steps until DST is 16-byte aligned
+ beq 1702f
+1701: process1 \bpp, \has_mask, 0, \inner_loop, \convert
+ sub COUNT, COUNT, #1
+ tst DST, #15
+ bne 1701b
+1702: sub COUNT, COUNT, #4 - 1 @ bias COUNT so the 4-at-a-time loop can test with carry
+ tst MASK, #16 @ alternate mask prefetch on/off so it fires once per 32-byte line
+ beq 1704f
+1703: process4 \bpp, \has_mask, 0, \inner_loop, \convert
+ subs COUNT, COUNT, #4
+ bcc 1705f
+1704: process4 \bpp, \has_mask, 1, \inner_loop, \convert
+ subs COUNT, COUNT, #4
+ bcs 1703b
+1705: adds COUNT, COUNT, #4 - 1 @ undo the bias; remaining 0..3 pixels handled singly
+ bcc 1707f
+ @ drop through...
+1706: process1 \bpp, \has_mask, 1, \inner_loop, \convert
+ subs COUNT, COUNT, #1
+ bcs 1706b
+1707: pop {r4-r11, pc} @ restore callee-saved registers and return
+.endm
+
+.macro generate_nearest_scaled_cover_function fname, \
+ bpp, \
+ prefetch_distance_src_, \
+ prefetch_distance_mask_, \
+ init, \
+ convert
+
+/* void fname(uint32_t width,
+ * pixman_fixed_t x,
+ * pixman_fixed_t ux,
+ * uint32_t *dest,
+ * const uint32_t *source,
+ * const uint32_t *mask);
+ */
+pixman_asm_function fname
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set prefetch_distance_src, prefetch_distance_src_
+ .set prefetch_distance_mask, prefetch_distance_mask_
+
+/*
+ * Assign symbolic names to registers
+ */
+COUNT .req a1
+X .req a2
+ACCUM .req a2 @ enlarge only
+XLO .req a2 @ reduce only
+UX .req a3
+DST .req a4
+SRC .req v1
+MASK .req v2
+PLDS .req v3
+PIXEL .req v4 @ enlarge only
+XHI .req v4 @ reduce only
+WK0 .req v5
+WK1 .req v6
+WK2 .req sl
+WK3 .req fp
+VALID .req ip @ enlarge-with-mask only
+PTR .req ip @ reduce only
+TMP .req lr
+
+ mov ip, sp @ keep old sp so stacked arguments can be fetched after the push
+ push {r4-r11, lr} /* save all registers */
+ ldmia ip, {SRC, MASK} @ fetch the 5th/6th arguments from the caller's stack
+ subs COUNT, COUNT, #1
+ blo 1807f-4 @ width was 0: jump straight to the final pop (the instruction 4 bytes before 1807)
+ \init
+ mla WK2, COUNT, UX, X @ WK2 = x coordinate of the last pixel
+ bics WK0, MASK, #31 @ skip mask preload when there is no mask
+ beq 1801f
+ @ Use a simplified preload process for the mask,
+ @ without a braking distance.
+ .set OFFSET, 0
+ .rept prefetch_distance_mask + 1
+ pld [WK0, #OFFSET]
+ .set OFFSET, OFFSET + 32
+ .endr
+1801:
+ add WK0, SRC, X, lsr #16 - (log2_\bpp - 3) @ first source cacheline touched
+ bic WK0, WK0, #31
+ pld [WK0]
+ add WK2, SRC, WK2, lsr #16 - (log2_\bpp - 3) @ last source cacheline touched
+ bic WK2, WK2, #31
+ add WK1, WK0, #prefetch_distance_src*32
+ subs PLDS, WK2, WK1 @ PLDS = braking distance for the inner-loop prefetches
+ movcc WK1, WK2
+1802: add WK0, WK0, #32 @ preload the initial prefetch_distance_src cachelines
+ cmp WK0, WK1
+ bhi 1803f
+ pld [WK0]
+ b 1802b
+1803:
+ cmp UX, #0x10000 @ less than one source pixel per dest pixel?
+ bhs 1805f
+ @ Enlarge
+ add SRC, X, lsr #16 - (log2_\bpp - 3) @ two-operand form: SRC += integer part of X, in bytes
+ mov ACCUM, X, lsl #16 @ keep only the fractional accumulator...
+ mov UX, UX, lsl #16 @ ...and scale UX to match, so carry-out signals a new source pixel
+ bic SRC, SRC, #(\bpp-1)/8 @ align SRC to a whole pixel
+ teq MASK, #0
+ beq 1804f
+ mov VALID, #0 @ no pixel cached yet
+ process \bpp, 1, nearest_scaled_cover_enlarge_mask_innerloop, \convert
+1804:
+ ldrx \bpp,, <PIXEL, [SRC]> @ nomask variant keeps PIXEL always loaded: prime it here
+ \convert PIXEL, TMP
+ process \bpp, 0, nearest_scaled_cover_enlarge_nomask_innerloop, \convert
+
+1805: @ Reduce
+ and TMP, SRC, #31 @ split SRC into cacheline base + pixel offset...
+ bic SRC, SRC, #31
+ mov XHI, X, lsr #16 @ ...and carry x as a 48-bit value in XHI:XLO
+ mov XLO, X, lsl #16
+ add XHI, XHI, TMP, lsr #log2_\bpp - 3
+ teq MASK, #0
+ beq 1806f
+ process \bpp, 1, nearest_scaled_cover_reduce_mask_innerloop, \convert
+1806: process \bpp, 0, nearest_scaled_cover_reduce_nomask_innerloop, \convert
+1807:
+
+ .unreq COUNT
+ .unreq X
+ .unreq ACCUM
+ .unreq XLO
+ .unreq UX
+ .unreq DST
+ .unreq SRC
+ .unreq MASK
+ .unreq PLDS
+ .unreq PIXEL
+ .unreq XHI
+ .unreq WK0
+ .unreq WK1
+ .unreq WK2
+ .unreq WK3
+ .unreq VALID
+ .unreq PTR
+ .unreq TMP
+.endfunc
+.endm
+
+.macro nop_macro x:vararg @ no-op default for the init/convert hook arguments of generate_nearest_scaled_cover_function
+.endm
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 4e4daa0..579935d 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -124,6 +124,23 @@ fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
return iter->buffer;
}
+#define DECLARE_NEAREST_SCALED_SCANLINE_FUNCTION(cputype, name, alias, type) /* prototype for the scanline fetcher generated in pixman-arm-simd-asm-scaled.S */ \
+void \
+pixman_get_scanline_nearest_scaled_cover_##name##_asm_##cputype ( \
+ int32_t width, \
+ pixman_fixed_t x, \
+ pixman_fixed_t ux, \
+ uint32_t *dest, \
+ const type *source, \
+ const uint32_t *mask);
+
+#define CALL_NEAREST_SCALED_SCANLINE_FUNCTION( \
+ cputype, name, alias, width, x, ux, dest, source, mask, source_width) /* source_width accepted for interface parity but unused: COVER flags guarantee no repeat */ \
+ pixman_get_scanline_nearest_scaled_cover_##name##_asm_##cputype ( \
+ width, x, ux, dest, source, mask);
+
+PIXMAN_ARM_BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, a8r8g8b8, 8888, uint32_t) /* instantiate the a8r8g8b8 cover fetcher for armv6 */
+
void
pixman_composite_src_n_8888_asm_armv6 (int32_t w,
int32_t h,
@@ -353,6 +370,8 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
static const pixman_iter_info_t arm_simd_iters[] =
{
+ PIXMAN_ARM_NEAREST_SCALED_COVER_FETCHER (armv6, a8r8g8b8),
+
PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (armv6, x8r8g8b8),
PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (armv6, r5g6b5),