sse2: add src_x888_0565

Port of 2ddd1c498b to SSE2. Uses the pmadd technique described in http://software.intel.com/sites/landingpage/legacy/mmx/MMX_App_24-16_Bit_Conversion.pdf Works around lack of packusdw instruction by first sign extending the values. fast: src_8888_0565 = L1: 681.40 L2: 689.20 M: 644.76 ( 25.51%) HT:404.42 VT:288.04 R:306.07 RT:150.80 (1619Kops/s) mmx: src_8888_0565 = L1:2056.03 L2:1985.44 M:1574.91 ( 61.87%) HT:533.10 VT:376.35 R:416.10 RT:178.79 (1833Kops/s) sse2: src_8888_0565 = L1:3793.42 L2:3653.44 M:1878.83 ( 73.94%) HT:535.03 VT:407.96 R:421.46 RT:163.31 (1727Kops/s) and for reference, using packusdw sse4: src_8888_0565 = L1:4396.18 L2:4229.25 M:1904.04 ( 75.18%) HT:559.79 VT:427.96 R:440.06 RT:165.71 (1744Kops/s) Notice that MMX is faster in the RT case because it can operate on 8-bytes instead of the current 16-bytes for SSE2.
author: Matt Turner <mattst88@gmail.com> 2012-05-02 23:13:43 -0400
committer: Matt Turner <mattst88@gmail.com> 2012-06-16 16:00:00 -0400
commit: 21077e1b83912b5e895b160bbbcd9b4664191506 (patch)
tree: 4c2384c3ddf14d9d0aa42cbea99b94633646a985
parent: 7db07cb731e3689328d9ecbdafffe99d7d38388e (diff)
1 files changed, 83 insertions, 0 deletions
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 7af06db5..0604254a 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -53,6 +53,9 @@ static __m128i mask_blue;
 static __m128i mask_565_fix_rb;
 static __m128i mask_565_fix_g;
 
+static __m128i mask_565_rb;
+static __m128i mask_565_pack_multiplier;
+
 static force_inline __m128i
 unpack_32_1x128 (uint32_t data)
 {
@@ -121,6 +124,29 @@ pack_2x128_128 (__m128i lo, __m128i hi)
 }
 
 static force_inline __m128i
+pack_565_2packedx128_128 (__m128i lo, __m128i hi)
+{
+    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
+    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
+
+    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
+    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
+
+    __m128i g0 = _mm_and_si128 (lo, mask_green);
+    __m128i g1 = _mm_and_si128 (hi, mask_green);
+
+    t0 = _mm_or_si128 (t0, g0);
+    t1 = _mm_or_si128 (t1, g1);
+
+    /* Simulates _mm_packus_epi32 */
+    t0 = _mm_slli_epi32 (t0, 16 - 5);
+    t1 = _mm_slli_epi32 (t1, 16 - 5);
+    t0 = _mm_srai_epi32 (t0, 16);
+    t1 = _mm_srai_epi32 (t1, 16);
+    return _mm_packs_epi32 (t0, t1);
+}
+
+__m128i
 pack_565_2x128_128 (__m128i lo, __m128i hi)
 {
     __m128i data;
@@ -2832,6 +2858,57 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 }
 
 static void
+sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (unsigned long)dst & 15)
+	{
+	    s = *src++;
+	    *dst = CONVERT_8888_TO_0565 (s);
+	    dst++;
+	    w--;
+	}
+
+	while (w >= 8)
+	{
+	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
+	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
+
+	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
+
+	    w -= 8;
+	    src += 8;
+	    dst += 8;
+	}
+
+	while (w)
+	{
+	    s = *src++;
+	    *dst = CONVERT_8888_TO_0565 (s);
+	    dst++;
+	    w--;
+	}
+    }
+}
+
+static void
 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
 			      pixman_composite_info_t *info)
 {
@@ -5728,6 +5805,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
@@ -6030,6 +6111,8 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
     mask_ffff = create_mask_16_128 (0xffff);
     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
+    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
 
     /* Set up function pointers */
     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
author	Matt Turner <mattst88@gmail.com>	2012-05-02 23:13:43 -0400
committer	Matt Turner <mattst88@gmail.com>	2012-06-16 16:00:00 -0400
commit	21077e1b83912b5e895b160bbbcd9b4664191506 (patch)
tree	4c2384c3ddf14d9d0aa42cbea99b94633646a985
parent	7db07cb731e3689328d9ecbdafffe99d7d38388e (diff)