summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Siarhei Siamashka <siarhei.siamashka@gmail.com> 2012-12-03 03:01:21 +0200
committer Siarhei Siamashka <siarhei.siamashka@gmail.com> 2012-12-18 20:45:57 +0200
commit f9a41703b2d46c988b9e4e378d27396f718006ae (patch)
tree 9b02b72c21711625798c1826f70545f9840b145c
parent 3922e90c400fca3ac43dc77b8dd0c0591e7e4fbc (diff)
Faster conversion from a8r8g8b8 to r5g6b5 in C code
This change reduces 3 shifts, 3 ANDs and 2 ORs (total 8 arithmetic operations) to 3 shifts, 2 ANDs and 2 ORs (total 7 arithmetic operations). We get garbage in the high 16 bits of the result, which might need to be cleared when casting to uint16_t (it would bring us back to total 8 arithmetic operations). However, if the result of the a8r8g8b8->r5g6b5 conversion is immediately stored to memory, no extra instructions for clearing these garbage bits are needed.

This allows the a8r8g8b8->r5g6b5 conversion code to be compiled into 4 instructions for ARM instead of 5 (assuming a good optimizing compiler), which has no pipeline stalls on ARM11 as an additional bonus.

The change in benchmark results for 'lowlevel-blt-bench src_8888_0565' with PIXMAN_DISABLE="arm-simd arm-neon mips-dspr2 mmx sse2" and pixman compiled by gcc-4.7.2:

    MIPS 74K 480MHz        :  40.44 MPix/s ->  40.13 MPix/s
    ARM11 700MHz           :  50.28 MPix/s ->  62.85 MPix/s
    ARM Cortex-A8 1000MHz  : 124.38 MPix/s -> 141.85 MPix/s
    ARM Cortex-A15 1700MHz : 281.07 MPix/s -> 303.29 MPix/s
    Intel Core i7 2800MHz  : 515.92 MPix/s -> 531.16 MPix/s

The same trick was used in xomap (X server for Nokia N800/N810):
    http://repository.maemo.org/pool/diablo/free/x/xorg-server/xorg-server_1.3.99.0~git20070321-0osso20083801.tar.gz
-rw-r--r-- pixman/pixman-private.h 10
1 files changed, 7 insertions, 3 deletions
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index ce71bbd..ea447aa 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -887,9 +887,13 @@ pixman_list_move_to_front (pixman_list_t *list, pixman_link_t *link)
static force_inline uint16_t
convert_8888_to_0565 (uint32_t s)
{
- return ((((s) >> 3) & 0x001f) |
- (((s) >> 5) & 0x07e0) |
- (((s) >> 8) & 0xf800));
+ /* The following code can be compiled into just 4 instructions on ARM */
+ uint32_t a, b;
+ a = (s >> 3) & 0x1F001F;
+ b = s & 0xFC00;
+ a |= a >> 5;
+ a |= b >> 5;
+ return (uint16_t)a;
}
static force_inline uint32_t