diff options
author | Søren Sandmann Pedersen <ssp@redhat.com> | 2010-03-28 20:02:24 -0400 |
---|---|---|
committer | Søren Sandmann Pedersen <ssp@redhat.com> | 2010-03-28 20:02:24 -0400 |
commit | 1a9a285724757725b0279c037bf012537b254de5 (patch) | |
tree | aa02e8a7f628641abbd6de57a3089c805a1010cd | |
parent | efd41c62875d97c5127233cb6a4c353b4d495531 (diff) |
sse2: Add sse2_composite_over_reverse_n_8888 (over-reverse)
This is a barely-measurable speed-up for the poppler benchmark:
Before:
[ # ] backend test min(s) median(s) stddev. count
[ 0] image poppler 4.443 4.474 0.31% 6/6
After:
[ # ] backend test min(s) median(s) stddev. count
[ 0] image poppler 4.224 4.248 0.42% 6/6
-rw-r--r-- | pixman/pixman-sse2.c | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 946e7ba3..212148da 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5765,6 +5765,128 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
     _mm_empty ();
 }
 
+/*
+ * Fast path for PIXMAN_OP_OVER_REVERSE with a solid source, no mask and a
+ * 32 bpp destination (registered in sse2_fast_paths for a8r8g8b8 and
+ * a8b8g8r8).
+ *
+ * Each destination pixel d is replaced by over (d, alpha (d), solid).
+ * Assuming the over_* helpers take (source, source alpha, destination),
+ * this keeps the existing destination on top of the solid color.  When the
+ * solid is 0 the function returns without touching the destination.
+ *
+ * Every scanline is handled in three phases: single pixels until dst is
+ * 16-byte aligned, four pixels per iteration with aligned 128-bit loads and
+ * stores, then single pixels for whatever is left.
+ *
+ * imp, op, mask_image, src_x, src_y, mask_x and mask_y are unused.
+ */
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+                                    pixman_op_t              op,
+                                    pixman_image_t *         src_image,
+                                    pixman_image_t *         mask_image,
+                                    pixman_image_t *         dst_image,
+                                    int32_t                  src_x,
+                                    int32_t                  src_y,
+                                    int32_t                  mask_x,
+                                    int32_t                  mask_y,
+                                    int32_t                  dest_x,
+                                    int32_t                  dest_y,
+                                    int32_t                  width,
+                                    int32_t                  height)
+{
+    uint32_t src;
+    uint32_t *dst_line, *dst;
+    __m128i xmm_src;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_dsta_hi, xmm_dsta_lo;
+    int dst_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+    if (src == 0)
+        return;
+
+    PIXMAN_IMAGE_GET_LINE (
+        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    while (height--)
+    {
+        dst = dst_line;
+
+        /* call prefetch hint to optimize cache load */
+        cache_prefetch ((__m128i*)dst);
+
+        dst_line += dst_stride;
+        w = width;
+
+        /* head: one pixel at a time until dst is 16-byte aligned */
+        while (w && (unsigned long)dst & 15)
+        {
+            __m64 vd;
+
+            vd = unpack_32_1x64 (*dst);
+
+            *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
+                                            _mm_movepi64_pi64 (xmm_src)));
+            w--;
+            dst++;
+        }
+
+        cache_prefetch ((__m128i*)dst);
+
+        /* body: four pixels per iteration; dst is 16-byte aligned here */
+        while (w >= 4)
+        {
+            __m128i tmp_lo, tmp_hi;
+
+            /* fill cache line with next memory */
+            cache_prefetch_next ((__m128i*)(dst + 4));
+
+            xmm_dst = load_128_aligned ((__m128i*)dst);
+
+            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+            expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
+
+            /* The result is read back from tmp_lo/tmp_hi, so start from
+             * copies of the solid and leave xmm_src intact. */
+            tmp_lo = xmm_src;
+            tmp_hi = xmm_src;
+
+            over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                        &xmm_dsta_lo, &xmm_dsta_hi,
+                        &tmp_lo, &tmp_hi);
+
+            save_128_aligned (
+                (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
+
+            w -= 4;
+            dst += 4;
+        }
+
+        /* tail: the remaining 0-3 pixels of the scanline */
+        while (w)
+        {
+            __m64 vd;
+
+            vd = unpack_32_1x64 (*dst);
+
+            *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
+                                            _mm_movepi64_pi64 (xmm_src)));
+            w--;
+            /* Advance to the next pixel; without this every tail
+             * iteration would rewrite the same address. */
+            dst++;
+        }
+
+    }
+    _mm_empty ();
+}
+
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     /* PIXMAN_OP_OVER */
@@ -5814,6 +5936,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
 
+    /* PIXMAN_OP_OVER_REVERSE */
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
     /* PIXMAN_OP_ADD */
     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),