diff options
author | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2012-12-03 17:07:31 +0200 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2013-01-27 20:48:26 +0200 |
commit | e66fd5ccb6b69dfa1acde36220dc3c3c44026890 (patch) | |
tree | 27fb68ca45d4896229561e703436ff399107a809 | |
parent | a9f66694163da9e8e41a69497acbadd630e0cb51 (diff) |
Faster write-back for the C variant of r5g6b5 dest iterator
Unrolling loops improves performance, so just use it here.
Also GCC can't properly optimize this code for RISC processors and
allocate 0x1F001F constant in a register. Because this constant is
too large to be represented as an immediate operand in instructions,
GCC inserts some redundant arithmetics. This problem can be workarounded
by explicitly using a variable for 0x1F001F constant and also initializing
it by a read from another volatile variable. In this case GCC is forced
to allocate a register for it, because it is not seen as a constant anymore.
The speedup relative to the generic store_scanline_r5g6b5() from
"pixman-access.c" (pixman was compiled with gcc 4.7.2):
MIPS 74K 480MHz : 33.22 MPix/s -> 43.42 MPix/s
ARM11 700MHz : 50.16 MPix/s -> 78.23 MPix/s
ARM Cortex-A8 1000MHz : 117.75 MPix/s -> 196.34 MPix/s
ARM Cortex-A9 1700MHz : 177.04 MPix/s -> 320.32 MPix/s
ARM Cortex-A15 1700MHz : 231.44 MPix/s -> 261.64 MPix/s
IBM Cell PPU 3200MHz : 130.25 MPix/s -> 145.61 MPix/s
Intel Core i7 2800MHz : 502.21 MPix/s -> 721.73 MPix/s
That's the performance for C code (SIMD and assembly optimizations
are disabled via PIXMAN_DISABLE environment variable).
-rw-r--r-- | pixman/pixman-fast-path.c | 38 |
1 files changed, 35 insertions, 3 deletions
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c index cbe34bb..02a5119 100644 --- a/pixman/pixman-fast-path.c +++ b/pixman/pixman-fast-path.c @@ -2186,17 +2186,49 @@ fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask) return iter->buffer; } +/* Helper function for a workaround, which tries to ensure that 0x1F001F + * constant is always allocated in a register on RISC architectures. + */ +static force_inline uint32_t +convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F) +{ + uint32_t a, b; + a = (s >> 3) & x1F001F; + b = s & 0xFC00; + a |= a >> 5; + a |= b >> 5; + return a; +} + static void fast_write_back_r5g6b5 (pixman_iter_t *iter) { int32_t w = iter->width; uint16_t *dst = (uint16_t *)(iter->bits - iter->stride); const uint32_t *src = iter->buffer; + /* Workaround to ensure that x1F001F variable is allocated in a register */ + static volatile uint32_t volatile_x1F001F = 0x1F001F; + uint32_t x1F001F = volatile_x1F001F; - while (w > 0) + while ((w -= 4) >= 0) { - *dst++ = convert_8888_to_0565 (*src++); - w--; + uint32_t s1 = *src++; + uint32_t s2 = *src++; + uint32_t s3 = *src++; + uint32_t s4 = *src++; + *dst++ = convert_8888_to_0565_workaround (s1, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s2, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s3, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s4, x1F001F); + } + if (w & 2) + { + *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); + *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); + } + if (w & 1) + { + *dst = convert_8888_to_0565_workaround (*src, x1F001F); } } |