diff options
author | Søren Sandmann Pedersen <ssp@redhat.com> | 2010-12-20 16:11:48 -0500 |
---|---|---|
committer | Søren Sandmann Pedersen <ssp@redhat.com> | 2010-12-20 19:37:11 -0500 |
commit | a484a9c49c98dfad0d74af4440039f61bef24d48 (patch) | |
tree | 1a882587570756a87465ab514583311ce8e4c7dc | |
parent | 2610323545cb5ee3dff0b7d7da505a1cd1e01b73 (diff) |
sse2: Skip src pixels that are zero in sse2_composite_over_8888_n_8888()
This is a big speed-up in the SVG helicopter game:
http://ie.microsoft.com/testdrive/Performance/Helicopter/Default.xhtml
when rendered by Firefox 4 since it is compositing big images
consisting almost entirely of zeros.
-rw-r--r-- | pixman/pixman-sse2.c | 75 |
1 files changed, 44 insertions, 31 deletions
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index 5907de05..032f13be 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -3051,37 +3051,45 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 15) { uint32_t s = *src++; - uint32_t d = *dst; - - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expand_alpha_1x64 (ms); - __m64 dest = _mm_movepi64_pi64 (xmm_mask); - __m64 alpha_dst = unpack_32_1x64 (d); - - *dst++ = pack_1x64_32 ( - in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + if (s) + { + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 dest = _mm_movepi64_pi64 (xmm_mask); + __m64 alpha_dst = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + } + dst++; w--; } while (w >= 4) { xmm_src = load_128_unaligned ((__m128i*)src); - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_mask, &xmm_mask, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + dst += 4; src += 4; w -= 4; @@ -3090,16 +3098,21 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, while (w) { uint32_t s = *src++; - uint32_t d = *dst; - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expand_alpha_1x64 (ms); - __m64 mask = _mm_movepi64_pi64 (xmm_mask); - __m64 dest = unpack_32_1x64 (d); - - *dst++ = pack_1x64_32 ( - in_over_1x64 (&ms, &alpha, &mask, &dest)); + if (s) + { + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &mask, &dest)); + } + dst++; w--; } } |