diff options
author | Ben Avison <bavison@riscosopen.org> | 2014-07-28 14:02:25 +0100 |
---|---|---|
committer | Ben Avison <bavison@riscosopen.org> | 2015-10-15 15:04:04 +0100 |
commit | 3a6ced6399c7b5497690f6ca7fe033a88933b25f (patch) | |
tree | f96c6092ba70e466e85407078b1109373c95aaad | |
parent | 190ac85ddd8a0f90440b4e5cb4d39042d7399b3c (diff) |
armv6: Add fetcher for a8r8g8b8 nearest-neighbour transformed images
This is constrained to support X increments in the positive X direction only,
so this means scaled images (except those reflected in the Y axis) plus
parallelogram transformations which preserve the direction of the X axis.
It also doesn't attempt to support any form of image repeat.
With this optimisation, some operations constructed from fetcher and combiner
calls using general_composite_rect() now outperform the versions constructed
from FAST_NEAREST macros in pixman-fast-path.c, but unfortunately the
FAST_NEAREST ones have higher priority in fast path lookup. Here are some
benchmarks for the in_reverse_8888_8888 operation, which is not affected:
lowlevel-blt-bench -n :
Before After
Mean StdDev Mean StdDev Confidence Change
L1 10.2 0.0 27.1 0.2 100.0% +164.8%
L2 8.2 0.1 23.0 0.4 100.0% +179.2%
M 8.3 0.0 24.8 0.0 100.0% +200.3%
HT 5.5 0.0 12.7 0.0 100.0% +129.9%
VT 5.4 0.0 12.1 0.0 100.0% +123.2%
R 5.4 0.0 11.9 0.1 100.0% +122.7%
RT 2.8 0.0 5.4 0.1 100.0% +91.9%
affine-bench for 5 different scaling factors:
Before After
Mean StdDev Mean StdDev Confidence Change
0.5 11.1 0.0 28.3 0.0 100.0% +155.1%
0.75 10.5 0.0 26.4 0.0 100.0% +152.2%
1.0 9.9 0.0 24.6 0.0 100.0% +147.5%
1.5 9.0 0.0 21.8 0.0 100.0% +141.4%
2.0 8.3 0.0 19.7 0.0 100.0% +138.4%
-rw-r--r-- | pixman/pixman-arm-common.h | 84 | ||||
-rw-r--r-- | pixman/pixman-arm-simd-asm-scaled.S | 9 | ||||
-rw-r--r-- | pixman/pixman-arm-simd-asm-scaled.h | 367 | ||||
-rw-r--r-- | pixman/pixman-arm-simd.c | 19 |
4 files changed, 479 insertions, 0 deletions
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h index 31e75ac..cc27848 100644 --- a/pixman/pixman-arm-common.h +++ b/pixman/pixman-arm-common.h @@ -446,6 +446,90 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp, \ /*****************************************************************************/ +/* Support for nearest scaled fetchers and fast paths */ + +#define PIXMAN_ARM_IMAGE_GET_SCALED(image, unscaled_x, unscaled_y, type, stride, out_bits, scaled_x, scaled_y, uxx, uxy, uyy) \ + do \ + { \ + pixman_image_t *__image__ = (image); \ + pixman_fixed_t __offset__ = pixman_int_to_fixed (unscaled_x) + pixman_fixed_1 / 2; \ + pixman_fixed_t __line__ = pixman_int_to_fixed (unscaled_y) + pixman_fixed_1 / 2; \ + pixman_fixed_t __x__, __y__; \ + int64_t __x64__, __y64__; \ + pixman_fixed_t (*__matrix__)[3] = __image__->common.transform->matrix; \ + \ + __x64__ = (int64_t) __matrix__[0][0] * (__offset__ & 0xFFFF); \ + __x64__ += (int64_t) __matrix__[0][1] * (__line__ & 0xFFFF); \ + __x__ = (__x64__ + 0x8000) >> 16; \ + __x__ += __matrix__[0][0] * (__offset__ >> 16); \ + __x__ += __matrix__[0][1] * (__line__ >> 16); \ + __x__ += __matrix__[0][2]; \ + __y64__ = (int64_t) __matrix__[1][1] * (__line__ & 0xFFFF); \ + __y__ = (__y64__ + 0x8000) >> 16; \ + __y__ += __matrix__[1][1] * (__line__ >> 16); \ + __y__ += __matrix__[1][2]; \ + \ + (stride) = __image__->bits.rowstride * (int) sizeof (uint32_t) / (int) sizeof (type); \ + (out_bits) = (type *)__image__->bits.bits; \ + (scaled_x) = __x__; \ + (scaled_y) = __y__; \ + (uxx) = __matrix__[0][0]; \ + (uxy) = __matrix__[0][1]; \ + (uyy) = __matrix__[1][1]; \ + } while (0) + +#define PIXMAN_ARM_BIND_GET_SCANLINE_NEAREST_SCALED_COVER(cputype, name, alias, type) \ + \ +DECLARE_NEAREST_SCALED_SCANLINE_FUNCTION (cputype, name, alias, type) \ + \ +static uint32_t * \ +cputype##_get_scanline_nearest_scaled_cover_##name (pixman_iter_t *iter, \ + const uint32_t *mask) \ +{ \ + int stride; \ + type *bits, 
*source; \ + pixman_fixed_t x, y, uxx, uxy, uyy; \ + \ + PIXMAN_ARM_IMAGE_GET_SCALED (iter->image, iter->x, iter->y++, type, \ + stride, bits, x, y, uxx, uxy, uyy); \ + \ + (void) uxy; \ + (void) uyy; \ + source = bits + stride * pixman_fixed_to_int (y - pixman_fixed_e); \ + \ + CALL_NEAREST_SCALED_SCANLINE_FUNCTION ( \ + cputype, name, alias, \ + iter->width, x - pixman_fixed_e, uxx, \ + iter->buffer, source, mask, iter->image->bits.width); \ + \ + return iter->buffer; \ +} + +#define PIXMAN_ARM_NEAREST_AFFINE_FLAGS \ + (FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_NARROW_FORMAT | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_HAS_TRANSFORM | \ + FAST_PATH_AFFINE_TRANSFORM) + +#define PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS \ + (PIXMAN_ARM_NEAREST_AFFINE_FLAGS | \ + FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | \ + FAST_PATH_X_UNIT_POSITIVE | \ + FAST_PATH_Y_UNIT_ZERO) + +#define PIXMAN_ARM_NEAREST_SCALED_COVER_FETCHER(cputype, format) \ + { PIXMAN_ ## format, \ + PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS, \ + ITER_NARROW | ITER_SRC, \ + NULL, \ + cputype ## _get_scanline_nearest_scaled_cover_ ## format, \ + NULL \ + } + +/*****************************************************************************/ + /* Support for untransformed fetchers and writeback */ #define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name) \ diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S index e050292..2c7e091 100644 --- a/pixman/pixman-arm-simd-asm-scaled.S +++ b/pixman/pixman-arm-simd-asm-scaled.S @@ -38,6 +38,7 @@ .p2align 2 #include "pixman-arm-asm.h" +#include "pixman-arm-simd-asm-scaled.h" /* * Note: This code is only using armv5te instructions (not even armv6), @@ -154,3 +155,11 @@ generate_nearest_scanline_func \ generate_nearest_scanline_func \ pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32 + +/******************************************************************************/ + +generate_nearest_scaled_cover_function \ + 
pixman_get_scanline_nearest_scaled_cover_a8r8g8b8_asm_armv6, 32, \ + 3, 3 /* prefetch distances */, nop_macro, nop_macro + +/******************************************************************************/ diff --git a/pixman/pixman-arm-simd-asm-scaled.h b/pixman/pixman-arm-simd-asm-scaled.h new file mode 100644 index 0000000..fb6eb44 --- /dev/null +++ b/pixman/pixman-arm-simd-asm-scaled.h @@ -0,0 +1,367 @@ +/* + * Copyright © 2014 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + * Author: Ben Avison (bavison@riscosopen.org) + */ + +.set log2_32, 5 +.set log2_16, 4 +.set log2_8, 3 +.set log2_4, 2 +.set log2_2, 1 +.set log2_1, 0 + +.macro ldrx bpp, cond, tail + .if \bpp == 32 + ldr\cond \tail + .elseif \bpp == 16 + ldr\cond\()h \tail + .elseif \bpp == 8 + ldr\cond\()b \tail + .else + .error "Input bits per pixel not supported" + .endif +.endm + +.macro branch cond, label1, label2 + .ifnc "\label1", "" + b\cond \label1 + .else + b\cond \label2 + .endif +.endm + +.macro nearest_scaled_cover_enlarge_mask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store + .ifnc \mask_hint, mask_is_0 + teq VALID, #1 + .ifc \convert, nop_macro + .ifnc \mask_hint, mask_is_non_0 + ittt ne + teqne \reg, #0 + .else + itt ne + .endif + ldrx \bpp, ne, <PIXEL, [SRC]> + movne VALID, #1 + .else + .ifnc \mask_hint, mask_is_non_0 + it ne + teqne \reg, #0 + .endif + beq 1101f + ldrx \bpp,, <PIXEL, [SRC]> + mov VALID, #1 + \convert PIXEL, TMP +1101: + .endif + .endif + adds ACCUM, ACCUM, UX + .ifnc \mask_hint, mask_is_0 + mov \reg, PIXEL + .endif + \store + branch cc, \exit_label, 1103f + add SRC, SRC, #\bpp/8 + mov VALID, #0 + tst SRC, #31 + branch ne, \exit_label, 1103f + subs PLDS, PLDS, #32 + branch lt, \exit_label, 1103f + pld [SRC, #prefetch_distance_src*32] +1103: +.endm + +.macro nearest_scaled_cover_enlarge_nomask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store + adds ACCUM, ACCUM, UX + mov \reg, PIXEL + \store + branch cc, \exit_label, 1203f + .ifnc "\may_be_final","" + teq COUNT, #0 + ldrx \bpp, ne, <PIXEL, [SRC, #\bpp/8]!!> + .else + ldrx \bpp,, <PIXEL, [SRC, #\bpp/8]!!> + .endif + tst SRC, #31 + \convert PIXEL, TMP + branch ne, \exit_label, 1203f + subs PLDS, PLDS, #32 + branch lt, \exit_label, 1203f + pld [SRC, #prefetch_distance_src*32] +1203: +.endm + +.macro nearest_scaled_cover_reduce_mask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store + add PTR, SRC, XHI, lsl #log2_\bpp - 3 + mov 
TMP, XHI + adds XLO, XLO, UX, lsl #16 + adc XHI, XHI, UX, lsr #16 + .ifc "\mask_hint","" + teq \reg, #0 + .ifnc \convert, nop_macro + beq 1301f + ldrx \bpp,, <\reg, [PTR]> + .else + ldrx \bpp, ne, <\reg, [PTR]> + .endif + eor TMP, TMP, XHI + bics TMP, TMP, #255/\bpp + \convert \reg, TMP + .ifnc \convert, nop_macro + b 1302f +1301: eor TMP, TMP, XHI + bics TMP, TMP, #255/\bpp +1302: + .endif + .else + .ifc \mask_hint, mask_is_non_0 + ldrx \bpp,, <\reg, [PTR]> + .endif + eor TMP, TMP, XHI + bics TMP, TMP, #255/\bpp + .ifc \mask_hint, mask_is_non_0 + \convert \reg, TMP + .endif + .endif + \store + branch eq, \exit_label, 1303f + subs PLDS, PLDS, #32 + branch lt, \exit_label, 1303f + bic PTR, PTR, #31 @ base of *previous* cacheline + pld [PTR, #(prefetch_distance_src+1)*32] +1303: +.endm + +.macro nearest_scaled_cover_reduce_nomask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store + add PTR, SRC, XHI, lsl #log2_\bpp - 3 + mov TMP, XHI + adds XLO, XLO, UX, lsl #16 + adc XHI, XHI, UX, lsr #16 + ldrx \bpp,, <\reg, [PTR]> + eor TMP, TMP, XHI + bics TMP, TMP, #255/\bpp + \convert \reg, TMP + \store + branch eq, \exit_label, 1403f + subs PLDS, PLDS, #32 + branch lt, \exit_label, 1403f + bic PTR, PTR, #31 @ base of *previous* cacheline + pld [PTR, #(prefetch_distance_src+1)*32] +1403: +.endm + +.macro process1 bpp, has_mask, disable_prefetch, inner_loop, convert + .if \has_mask + ldr WK0, [MASK], #4 + .if !\disable_prefetch + tst MASK, #31 + bne 1501f + pld [MASK, #prefetch_distance_mask*32] +1501: + .endif + teq WK0, #0 + bne 1502f + \inner_loop \bpp, WK0, \convert, mask_is_0, 1, 1503f, <add DST, DST, #4> + b 1503f + .endif +1502: \inner_loop \bpp, WK0, \convert, mask_is_non_0, 1,, <str WK0, [DST], #4> +1503: +.endm + +.macro process4 bpp, has_mask, disable_mask_prefetch, inner_loop, convert + .if \has_mask + ldmia MASK!, {WK0-WK3} + .if !\disable_mask_prefetch + bic TMP, MASK, #31 + pld [TMP, #prefetch_distance_mask*32] + .endif + orr WK0, WK0, WK1 + 
orr WK2, WK2, WK3 + orrs WK0, WK0, WK2 + bne 1601f + \inner_loop \bpp, WK0, \convert, mask_is_0 + \inner_loop \bpp, WK1, \convert, mask_is_0 + \inner_loop \bpp, WK2, \convert, mask_is_0 + \inner_loop \bpp, WK3, \convert, mask_is_0, 1, 1602f, <add DST, DST, #4*4> + b 1602f + .endif +1601: \inner_loop \bpp, WK0, \convert + \inner_loop \bpp, WK1, \convert + \inner_loop \bpp, WK2, \convert + \inner_loop \bpp, WK3, \convert,, 1,, <stmia DST!!, {WK0,WK1,WK2,WK3}> +1602: +.endm + +.macro process bpp, has_mask, inner_loop, convert + cmp COUNT, #2 * 4 - 1 - 1 @ guaranteed at least one aligned half-cacheline output? + blo 1706f + tst DST, #15 + beq 1702f +1701: process1 \bpp, \has_mask, 0, \inner_loop, \convert + sub COUNT, COUNT, #1 + tst DST, #15 + bne 1701b +1702: sub COUNT, COUNT, #4 - 1 + tst MASK, #16 + beq 1704f +1703: process4 \bpp, \has_mask, 0, \inner_loop, \convert + subs COUNT, COUNT, #4 + bcc 1705f +1704: process4 \bpp, \has_mask, 1, \inner_loop, \convert + subs COUNT, COUNT, #4 + bcs 1703b +1705: adds COUNT, COUNT, #4 - 1 + bcc 1707f + @ drop through... 
+1706: process1 \bpp, \has_mask, 1, \inner_loop, \convert + subs COUNT, COUNT, #1 + bcs 1706b +1707: pop {r4-r11, pc} +.endm + +.macro generate_nearest_scaled_cover_function fname, \ + bpp, \ + prefetch_distance_src_, \ + prefetch_distance_mask_, \ + init, \ + convert + +/* void fname(uint32_t width, + * pixman_fixed_t x, + * pixman_fixed_t ux, + * uint32_t *dest, + * const uint32_t *source, + * const uint32_t *mask); + */ +pixman_asm_function fname + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set prefetch_distance_src, prefetch_distance_src_ + .set prefetch_distance_mask, prefetch_distance_mask_ + +/* + * Assign symbolic names to registers + */ +COUNT .req a1 +X .req a2 +ACCUM .req a2 @ enlarge only +XLO .req a2 @ reduce only +UX .req a3 +DST .req a4 +SRC .req v1 +MASK .req v2 +PLDS .req v3 +PIXEL .req v4 @ enlarge only +XHI .req v4 @ reduce only +WK0 .req v5 +WK1 .req v6 +WK2 .req sl +WK3 .req fp +VALID .req ip @ enlarge-with-mask only +PTR .req ip @ reduce only +TMP .req lr + + mov ip, sp + push {r4-r11, lr} /* save all registers */ + ldmia ip, {SRC, MASK} + subs COUNT, COUNT, #1 + blo 1807f-4 + \init + mla WK2, COUNT, UX, X + bics WK0, MASK, #31 + beq 1801f + @ Use a simplified preload process for the mask, + @ without a braking distance. 
+ .set OFFSET, 0 + .rept prefetch_distance_mask + 1 + pld [WK0, #OFFSET] + .set OFFSET, OFFSET + 32 + .endr +1801: + add WK0, SRC, X, lsr #16 - (log2_\bpp - 3) + bic WK0, WK0, #31 + pld [WK0] + add WK2, SRC, WK2, lsr #16 - (log2_\bpp - 3) + bic WK2, WK2, #31 + add WK1, WK0, #prefetch_distance_src*32 + subs PLDS, WK2, WK1 + movcc WK1, WK2 +1802: add WK0, WK0, #32 + cmp WK0, WK1 + bhi 1803f + pld [WK0] + b 1802b +1803: + cmp UX, #0x10000 + bhs 1805f + @ Enlarge + add SRC, X, lsr #16 - (log2_\bpp - 3) + mov ACCUM, X, lsl #16 + mov UX, UX, lsl #16 + bic SRC, SRC, #(\bpp-1)/8 + teq MASK, #0 + beq 1804f + mov VALID, #0 + process \bpp, 1, nearest_scaled_cover_enlarge_mask_innerloop, \convert +1804: + ldrx \bpp,, <PIXEL, [SRC]> + \convert PIXEL, TMP + process \bpp, 0, nearest_scaled_cover_enlarge_nomask_innerloop, \convert + +1805: @ Reduce + and TMP, SRC, #31 + bic SRC, SRC, #31 + mov XHI, X, lsr #16 + mov XLO, X, lsl #16 + add XHI, XHI, TMP, lsr #log2_\bpp - 3 + teq MASK, #0 + beq 1806f + process \bpp, 1, nearest_scaled_cover_reduce_mask_innerloop, \convert +1806: process \bpp, 0, nearest_scaled_cover_reduce_nomask_innerloop, \convert +1807: + + .unreq COUNT + .unreq X + .unreq ACCUM + .unreq XLO + .unreq UX + .unreq DST + .unreq SRC + .unreq MASK + .unreq PLDS + .unreq PIXEL + .unreq XHI + .unreq WK0 + .unreq WK1 + .unreq WK2 + .unreq WK3 + .unreq VALID + .unreq PTR + .unreq TMP +.endfunc +.endm + +.macro nop_macro x:vararg +.endm diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c index 4e4daa0..579935d 100644 --- a/pixman/pixman-arm-simd.c +++ b/pixman/pixman-arm-simd.c @@ -124,6 +124,23 @@ fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask) return iter->buffer; } +#define DECLARE_NEAREST_SCALED_SCANLINE_FUNCTION(cputype, name, alias, type) \ +void \ +pixman_get_scanline_nearest_scaled_cover_##name##_asm_##cputype ( \ + int32_t width, \ + pixman_fixed_t x, \ + pixman_fixed_t ux, \ + uint32_t *dest, \ + const type *source, \ + const uint32_t 
*mask); + +#define CALL_NEAREST_SCALED_SCANLINE_FUNCTION( \ + cputype, name, alias, width, x, ux, dest, source, mask, source_width) \ + pixman_get_scanline_nearest_scaled_cover_##name##_asm_##cputype ( \ + width, x, ux, dest, source, mask); + +PIXMAN_ARM_BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, a8r8g8b8, 8888, uint32_t) + void pixman_composite_src_n_8888_asm_armv6 (int32_t w, int32_t h, @@ -353,6 +370,8 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = static const pixman_iter_info_t arm_simd_iters[] = { + PIXMAN_ARM_NEAREST_SCALED_COVER_FETCHER (armv6, a8r8g8b8), + PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (armv6, x8r8g8b8), PIXMAN_ARM_UNTRANSFORMED_COVER_FETCHER (armv6, r5g6b5), |