author     Matt Turner <mattst88@gmail.com>    2012-02-19 18:10:03 -0500
committer  Matt Turner <mattst88@gmail.com>    2012-02-20 18:32:39 -0500
commit     118815aab5b9d4736202f3bf104ee8173a690c13
tree       1ae1e7a797713943f2c11fdfdd6c7077332fd712
parent     0a516164e3c4b5ff0213e74b886e4cf8c37af8a7
mmx: Use _mm_shuffle_pi16
The pshufw x86 instruction is part of Extended 3DNow! and SSE. The
equivalent ARM wshufh instruction has been available since the first
version of the iwMMXt instruction set.
This instruction is already used in the SSE2 code.
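For reference, the shuffle's semantics can be modelled in scalar C:
each 2-bit field of the 8-bit selector picks one of the four 16-bit
source lanes, so _MM_SHUFFLE (3, 3, 3, 3) broadcasts the alpha lane,
which is all expand_alpha() needs. The sketch below is illustrative
only and is not part of the patch.

#include <stdint.h>

/* Scalar model of pshufw/wshufh: destination lane i receives the
 * source lane named by bits [2i+1:2i] of the selector. */
static uint64_t
shuffle_pi16_ref (uint64_t src, int sel)
{
    uint64_t dst = 0;
    int i;

    for (i = 0; i < 4; i++)
    {
	int pick = (sel >> (2 * i)) & 3;               /* 2-bit lane index */
	uint64_t lane = (src >> (16 * pick)) & 0xffff; /* selected lane */

	dst |= lane << (16 * i);
    }
    return dst;
}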
Reduces code size by ~9%.
amd64
   text    data     bss     dec     hex  filename
  29925    2240       0   32165    7da5  .libs/libpixman_mmx_la-pixman-mmx.o (before)
  27237    2240       0   29477    7325  .libs/libpixman_mmx_la-pixman-mmx.o (after)

x86
   text    data     bss     dec     hex  filename
  27677    1792       0   29469    731d  .libs/libpixman_mmx_la-pixman-mmx.o (before)
  24959    1792       0   26751    687f  .libs/libpixman_mmx_la-pixman-mmx.o (after)

arm
   text    data     bss     dec     hex  filename
  30176    1792       0   31968    7ce0  .libs/libpixman_iwmmxt_la-pixman-mmx.o (before)
  27384    1792       0   29176    71f8  .libs/libpixman_iwmmxt_la-pixman-mmx.o (after)
Signed-off-by: Matt Turner <mattst88@gmail.com>
 pixman/pixman-mmx.c | 48
 1 file changed, 12 insertions(+), 36 deletions(-)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index c12a7df3..ec569026 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -65,8 +65,17 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
 {
     return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
 }
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __A, int const __N)
+{
+    return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
+}
 #endif
 
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
 /* Notes about writing mmx code
  *
  * give memory operands as the second operand. If you give it as the
@@ -245,52 +254,19 @@ pix_add (__m64 a, __m64 b)
 static force_inline __m64
 expand_alpha (__m64 pixel)
 {
-    __m64 t1, t2;
-
-    t1 = shift (pixel, -48);
-    t2 = shift (t1, 16);
-    t1 = _mm_or_si64 (t1, t2);
-    t2 = shift (t1, 32);
-    t1 = _mm_or_si64 (t1, t2);
-
-    return t1;
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
 }
 
 static force_inline __m64
 expand_alpha_rev (__m64 pixel)
 {
-    __m64 t1, t2;
-
-    /* move alpha to low 16 bits and zero the rest */
-    t1 = shift (pixel, 48);
-    t1 = shift (t1, -48);
-
-    t2 = shift (t1, 16);
-    t1 = _mm_or_si64 (t1, t2);
-    t2 = shift (t1, 32);
-    t1 = _mm_or_si64 (t1, t2);
-
-    return t1;
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
 }
 
 static force_inline __m64
 invert_colors (__m64 pixel)
 {
-    __m64 x, y, z;
-
-    x = y = z = pixel;
-
-    x = _mm_and_si64 (x, MC (ffff0000ffff0000));
-    y = _mm_and_si64 (y, MC (000000000000ffff));
-    z = _mm_and_si64 (z, MC (0000ffff00000000));
-
-    y = shift (y, 32);
-    z = shift (z, -32);
-
-    x = _mm_or_si64 (x, y);
-    x = _mm_or_si64 (x, z);
-
-    return x;
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
 }
 
 static force_inline __m64
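The three selectors can be sanity-checked with a standalone program
(hypothetical test, not part of the patch; build with e.g.
gcc -O2 -msse on x86-64, where the _mm_cvtsi64_m64/_mm_cvtm64_si64
conversions are available):

#include <assert.h>
#include <stdint.h>
#include <mmintrin.h>
#include <xmmintrin.h>

int
main (void)
{
    /* One a8r8g8b8 pixel expanded to 16 bits per channel:
     * lane 3 = alpha, lane 2 = red, lane 1 = green, lane 0 = blue. */
    __m64 pixel = _mm_cvtsi64_m64 (0x00ff123456789abcULL);

    uint64_t a = _mm_cvtm64_si64 (_mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3)));
    uint64_t r = _mm_cvtm64_si64 (_mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0)));
    uint64_t i = _mm_cvtm64_si64 (_mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2)));

    assert (a == 0x00ff00ff00ff00ffULL); /* expand_alpha: broadcast lane 3 */
    assert (r == 0x9abc9abc9abc9abcULL); /* expand_alpha_rev: broadcast lane 0 */
    assert (i == 0x00ff9abc56781234ULL); /* invert_colors: swap red and blue */

    _mm_empty (); /* leave the MMX/x87 state clean */
    return 0;
}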