summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSiarhei Siamashka <siarhei.siamashka@gmail.com>2012-12-03 17:07:31 +0200
committerSiarhei Siamashka <siarhei.siamashka@gmail.com>2013-01-27 20:48:26 +0200
commite66fd5ccb6b69dfa1acde36220dc3c3c44026890 (patch)
tree27fb68ca45d4896229561e703436ff399107a809
parenta9f66694163da9e8e41a69497acbadd630e0cb51 (diff)
Faster write-back for the C variant of r5g6b5 dest iterator
Unrolling loops improves performance, so just use it here.

Also, GCC can't properly optimize this code for RISC processors and allocate the 0x1F001F constant in a register. Because this constant is too large to be represented as an immediate operand in instructions, GCC inserts some redundant arithmetic. This problem can be worked around by explicitly using a variable for the 0x1F001F constant and also initializing it by a read from another volatile variable. In this case GCC is forced to allocate a register for it, because it is not seen as a constant anymore.

The speedup relative to the generic store_scanline_r5g6b5() from "pixman-access.c" (pixman was compiled with gcc 4.7.2):

MIPS 74K 480MHz : 33.22 MPix/s -> 43.42 MPix/s
ARM11 700MHz : 50.16 MPix/s -> 78.23 MPix/s
ARM Cortex-A8 1000MHz : 117.75 MPix/s -> 196.34 MPix/s
ARM Cortex-A9 1700MHz : 177.04 MPix/s -> 320.32 MPix/s
ARM Cortex-A15 1700MHz : 231.44 MPix/s -> 261.64 MPix/s
IBM Cell PPU 3200MHz : 130.25 MPix/s -> 145.61 MPix/s
Intel Core i7 2800MHz : 502.21 MPix/s -> 721.73 MPix/s

That's the performance for C code (SIMD and assembly optimizations are disabled via the PIXMAN_DISABLE environment variable).
-rw-r--r--pixman/pixman-fast-path.c38
1 files changed, 35 insertions, 3 deletions
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index cbe34bb..02a5119 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -2186,17 +2186,49 @@ fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
return iter->buffer;
}
+/* Packs 8888 pixel 's' into 0565 form in the low 16 bits of the result (bits
+ * above 15 are not cleared). Workaround helper: the 0x1F001F mask is passed in
+ * so that it is always allocated in a register on RISC architectures. */
+static force_inline uint32_t
+convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F)
+{
+ uint32_t a, b;
+ a = (s >> 3) & x1F001F; /* top 5 bits of byte 0 -> bits 0-4, of byte 2 -> bits 16-20 */
+ b = s & 0xFC00; /* top 6 bits of byte 1, still at bits 10-15 */
+ a |= a >> 5; /* copy the byte 2 field down to bits 11-15 */
+ a |= b >> 5; /* place the byte 1 field at bits 5-10 */
+ return a;
+}
+
static void
fast_write_back_r5g6b5 (pixman_iter_t *iter)
{
int32_t w = iter->width;
uint16_t *dst = (uint16_t *)(iter->bits - iter->stride);
const uint32_t *src = iter->buffer;
+ /* Workaround: the volatile read keeps x1F001F from being seen as a constant, so GCC allocates a register for it */
+ static volatile uint32_t volatile_x1F001F = 0x1F001F;
+ uint32_t x1F001F = volatile_x1F001F;
- while (w > 0)
+ while ((w -= 4) >= 0) /* unrolled: four pixels per iteration */
{
- *dst++ = convert_8888_to_0565 (*src++);
- w--;
+ uint32_t s1 = *src++;
+ uint32_t s2 = *src++;
+ uint32_t s3 = *src++;
+ uint32_t s4 = *src++;
+ *dst++ = convert_8888_to_0565_workaround (s1, x1F001F);
+ *dst++ = convert_8888_to_0565_workaround (s2, x1F001F);
+ *dst++ = convert_8888_to_0565_workaround (s3, x1F001F);
+ *dst++ = convert_8888_to_0565_workaround (s4, x1F001F);
+ }
+ if (w & 2) /* w is now in [-4,-1]; for a non-negative width its low two bits are the leftover pixel count */
+ {
+ *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+ *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+ }
+ if (w & 1) /* one final leftover pixel */
+ {
+ *dst = convert_8888_to_0565_workaround (*src, x1F001F);
}
}