summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSøren Sandmann Pedersen <ssp@redhat.com>2010-03-28 20:02:24 -0400
committerSøren Sandmann Pedersen <ssp@redhat.com>2010-03-28 20:02:24 -0400
commit1a9a285724757725b0279c037bf012537b254de5 (patch)
treeaa02e8a7f628641abbd6de57a3089c805a1010cd
parentefd41c62875d97c5127233cb6a4c353b4d495531 (diff)
sse2: Add sse2_composite_over_reverse_n_8888over-reverse
This is a barely-measurable speed-up for the poppler benchmark: Before: [ # ] backend test min(s) median(s) stddev. count [ 0] image poppler 4.443 4.474 0.31% 6/6 After: [ # ] backend test min(s) median(s) stddev. count [ 0] image poppler 4.224 4.248 0.42% 6/6
-rw-r--r--pixman/pixman-sse2.c102
1 files changed, 102 insertions, 0 deletions
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 946e7ba3..212148da 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5765,6 +5765,104 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
_mm_empty ();
}
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint32_t *dst_line, *dst;
+ __m128i xmm_src;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_dsta_hi, xmm_dsta_lo;
+ int dst_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+
+ while (height--)
+ {
+ dst = dst_line;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ __m64 vd;
+
+ vd = unpack_32_1x64 (*dst);
+
+ *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
+ _mm_movepi64_pi64 (xmm_src)));
+ w--;
+ dst++;
+ }
+
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 4)
+ {
+ __m128i tmp_lo, tmp_hi;
+
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)(dst + 4));
+
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
+
+ tmp_lo = xmm_src;
+ tmp_hi = xmm_src;
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dsta_lo, &xmm_dsta_hi,
+ &tmp_lo, &tmp_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
+
+ w -= 4;
+ dst += 4;
+ }
+
+ while (w)
+ {
+ __m64 vd;
+
+ vd = unpack_32_1x64 (*dst);
+
+ *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
+ _mm_movepi64_pi64 (xmm_src)));
+ w--;
+ }
+
+ }
+ _mm_empty ();
+}
+
static const pixman_fast_path_t sse2_fast_paths[] =
{
/* PIXMAN_OP_OVER */
@@ -5814,6 +5912,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+ /* PIXMAN_OP_OVER_REVERSE */
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
/* PIXMAN_OP_ADD */
PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),