diff options
author | Oded Gabbay <oded.gabbay@gmail.com> | 2015-06-28 09:42:08 +0300 |
---|---|---|
committer | Oded Gabbay <oded.gabbay@gmail.com> | 2015-07-16 16:13:35 +0300 |
commit | c12ee95089e7d281a29a24bf56b81f5c16dec6ee (patch) | |
tree | aab0113df529f3f2860ddb73320194b2e98d7434 | |
parent | 034149537be94862b43fb09699b8c2149bfe948d (diff) |
vmx: add helper functions
This patch adds the following helper functions to enable code reuse,
hide BE/LE differences, and improve maintainability.
All of the functions were defined as static force_inline.
Names were copied from pixman-sse2.c so that converting fast paths between
sse2 and vmx will be easier from now on. For that reason, I tried to keep the
input/output of the functions as close as possible to the sse2
definitions.
The functions are:
- load_128_aligned : load 128-bit from a 16-byte aligned memory
address into a vector
- load_128_unaligned : load 128-bit from memory into a vector,
without guarantee of alignment for the
source pointer
- save_128_aligned : save 128-bit vector into a 16-byte aligned
memory address
- create_mask_16_128 : take a 16-bit value and fill a new
vector with it
- create_mask_1x32_128 : take a 32-bit pointer and fill a new
vector with the 32-bit value from that pointer
- create_mask_32_128 : take a 32-bit value and fill a new
vector with it
- unpack_32_1x128 : unpack 32-bit value into a vector
- unpacklo_128_16x8 : unpack the eight low 8-bit values of a vector
- unpackhi_128_16x8 : unpack the eight high 8-bit values of a vector
- unpacklo_128_8x16 : unpack the four low 16-bit values of a vector
- unpackhi_128_8x16 : unpack the four high 16-bit values of a vector
- unpack_128_2x128 : unpack the eight low 8-bit values of a vector
into one vector and the eight high 8-bit
values into another vector
- unpack_128_2x128_16 : unpack the four low 16-bit values of a vector
into one vector and the four high 16-bit
values into another vector
- unpack_565_to_8888 : unpack an RGB_565 vector to 8888 vector
- pack_1x128_32 : pack a vector and return the LSB 32-bit of it
- pack_2x128_128 : pack two vectors into one and return it
- negate_2x128 : xor two vectors with mask_00ff (separately)
- is_opaque : returns whether all the pixels contained in
the vector are opaque
- is_zero : returns whether the vector equals 0
- is_transparent : returns whether all the pixels
contained in the vector are transparent
- expand_pixel_8_1x128 : expand an 8-bit pixel into the lower 8 bytes of a
vector
- expand_alpha_1x128 : expand alpha from vector and return the new
vector
- expand_alpha_2x128 : expand alpha from one vector and another alpha
from a second vector
- expand_alpha_rev_2x128 : expand a reversed alpha from one vector and
another reversed alpha from a second vector
- pix_multiply_2x128 : do pix_multiply for two vectors (separately)
- over_2x128 : perform over op. on two vectors
- in_over_2x128 : perform in-over op. on two vectors
v2: removed expand_pixel_32_1x128 as it was not used by any function and
its implementation was erroneous
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
-rw-r--r-- | pixman/pixman-vmx.c | 476 |
1 files changed, 476 insertions, 0 deletions
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 880a19a..39d1a06 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -30,10 +30,19 @@ #endif #include "pixman-private.h" #include "pixman-combine32.h" +#include "pixman-inlines.h" #include <altivec.h> #define AVV(x...) {x} +static vector unsigned int mask_00ff; +static vector unsigned int mask_ff000000; +static vector unsigned int mask_red; +static vector unsigned int mask_green; +static vector unsigned int mask_blue; +static vector unsigned int mask_565_fix_rb; +static vector unsigned int mask_565_fix_g; + static force_inline vector unsigned int splat_alpha (vector unsigned int pix) { @@ -233,6 +242,464 @@ do \ #define STORE_VECTOR(dest) \ vec_st ((vector unsigned int) v ## dest, 0, dest); +/* load 4 pixels from a 16-byte boundary aligned address */ +static force_inline vector unsigned int +load_128_aligned (const uint32_t* src) +{ + return *((vector unsigned int *) src); +} + +/* load 4 pixels from a unaligned address */ +static force_inline vector unsigned int +load_128_unaligned (const uint32_t* src) +{ + vector unsigned int vsrc; + DECLARE_SRC_MASK_VAR; + + COMPUTE_SHIFT_MASK (src); + LOAD_VECTOR (src); + + return vsrc; +} + +/* save 4 pixels on a 16-byte boundary aligned address */ +static force_inline void +save_128_aligned (uint32_t* data, + vector unsigned int vdata) +{ + STORE_VECTOR(data) +} + +static force_inline vector unsigned int +create_mask_16_128 (uint16_t mask) +{ + uint16_t* src; + vector unsigned short vsrc; + DECLARE_SRC_MASK_VAR; + + src = &mask; + + COMPUTE_SHIFT_MASK (src); + LOAD_VECTOR (src); + return (vector unsigned int) vec_splat(vsrc, 0); +} + +static force_inline vector unsigned int +create_mask_1x32_128 (const uint32_t *src) +{ + vector unsigned int vsrc; + DECLARE_SRC_MASK_VAR; + + COMPUTE_SHIFT_MASK (src); + LOAD_VECTOR (src); + return vec_splat(vsrc, 0); +} + +static force_inline vector unsigned int +create_mask_32_128 (uint32_t mask) +{ + return 
create_mask_1x32_128(&mask); +} + +static force_inline vector unsigned int +unpack_32_1x128 (uint32_t data) +{ + vector unsigned int vdata = {0, 0, 0, data}; + vector unsigned short lo; + + lo = (vector unsigned short) +#ifdef WORDS_BIGENDIAN + vec_mergel ((vector unsigned char) AVV(0), + (vector unsigned char) vdata); +#else + vec_mergel ((vector unsigned char) vdata, + (vector unsigned char) AVV(0)); +#endif + + return (vector unsigned int) lo; +} + +static force_inline vector unsigned int +unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2) +{ + vector unsigned char lo; + + /* unpack to short */ + lo = (vector unsigned char) +#ifdef WORDS_BIGENDIAN + vec_mergel ((vector unsigned char) data2, + (vector unsigned char) data1); +#else + vec_mergel ((vector unsigned char) data1, + (vector unsigned char) data2); +#endif + + return (vector unsigned int) lo; +} + +static force_inline vector unsigned int +unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2) +{ + vector unsigned char hi; + + /* unpack to short */ + hi = (vector unsigned char) +#ifdef WORDS_BIGENDIAN + vec_mergeh ((vector unsigned char) data2, + (vector unsigned char) data1); +#else + vec_mergeh ((vector unsigned char) data1, + (vector unsigned char) data2); +#endif + + return (vector unsigned int) hi; +} + +static force_inline vector unsigned int +unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2) +{ + vector unsigned short lo; + + /* unpack to char */ + lo = (vector unsigned short) +#ifdef WORDS_BIGENDIAN + vec_mergel ((vector unsigned short) data2, + (vector unsigned short) data1); +#else + vec_mergel ((vector unsigned short) data1, + (vector unsigned short) data2); +#endif + + return (vector unsigned int) lo; +} + +static force_inline vector unsigned int +unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2) +{ + vector unsigned short hi; + + /* unpack to char */ + hi = (vector unsigned short) +#ifdef WORDS_BIGENDIAN + 
vec_mergeh ((vector unsigned short) data2, + (vector unsigned short) data1); +#else + vec_mergeh ((vector unsigned short) data1, + (vector unsigned short) data2); +#endif + + return (vector unsigned int) hi; +} + +static force_inline void +unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2, + vector unsigned int* data_lo, vector unsigned int* data_hi) +{ + *data_lo = unpacklo_128_16x8(data1, data2); + *data_hi = unpackhi_128_16x8(data1, data2); +} + +static force_inline void +unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2, + vector unsigned int* data_lo, vector unsigned int* data_hi) +{ + *data_lo = unpacklo_128_8x16(data1, data2); + *data_hi = unpackhi_128_8x16(data1, data2); +} + +static force_inline vector unsigned int +unpack_565_to_8888 (vector unsigned int lo) +{ + vector unsigned int r, g, b, rb, t; + + r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red); + g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green); + b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue); + + rb = vec_or (r, b); + t = vec_and (rb, mask_565_fix_rb); + t = vec_sr (t, create_mask_32_128(5)); + rb = vec_or (rb, t); + + t = vec_and (g, mask_565_fix_g); + t = vec_sr (t, create_mask_32_128(6)); + g = vec_or (g, t); + + return vec_or (rb, g); +} + +static force_inline uint32_t +pack_1x128_32 (vector unsigned int data) +{ + vector unsigned char vpack; + + vpack = vec_packsu((vector unsigned short) data, + (vector unsigned short) AVV(0)); + + return vec_extract((vector unsigned int) vpack, 1); +} + +static force_inline vector unsigned int +pack_2x128_128 (vector unsigned int lo, vector unsigned int hi) +{ + vector unsigned char vpack; + + vpack = vec_packsu((vector unsigned short) hi, + (vector unsigned short) lo); + + return (vector unsigned int) vpack; +} + +static force_inline void +negate_2x128 (vector unsigned int data_lo, + vector unsigned int data_hi, + vector unsigned int* neg_lo, + vector unsigned int* neg_hi) +{ + 
*neg_lo = vec_xor (data_lo, mask_00ff); + *neg_hi = vec_xor (data_hi, mask_00ff); +} + +static force_inline int +is_opaque (vector unsigned int x) +{ + uint32_t cmp_result; + vector bool int ffs = vec_cmpeq(x, x); + + cmp_result = vec_all_eq(x, ffs); + + return (cmp_result & 0x8888) == 0x8888; +} + +static force_inline int +is_zero (vector unsigned int x) +{ + uint32_t cmp_result; + + cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0)); + + return cmp_result == 0xffff; +} + +static force_inline int +is_transparent (vector unsigned int x) +{ + uint32_t cmp_result; + + cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0)); + return (cmp_result & 0x8888) == 0x8888; +} + +static force_inline vector unsigned int +expand_pixel_8_1x128 (uint8_t data) +{ + vector unsigned int vdata; + + vdata = unpack_32_1x128 ((uint32_t) data); + +#ifdef WORDS_BIGENDIAN + return vec_perm (vdata, vdata, + (vector unsigned char)AVV ( + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F)); +#else + return vec_perm (vdata, vdata, + (vector unsigned char)AVV ( + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09)); +#endif +} + +static force_inline vector unsigned int +expand_alpha_1x128 (vector unsigned int data) +{ +#ifdef WORDS_BIGENDIAN + return vec_perm (data, data, + (vector unsigned char)AVV ( + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09)); +#else + return vec_perm (data, data, + (vector unsigned char)AVV ( + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F)); +#endif +} + +static force_inline void +expand_alpha_2x128 (vector unsigned int data_lo, + vector unsigned int data_hi, + vector unsigned int* alpha_lo, + vector unsigned int* alpha_hi) +{ + + *alpha_lo = expand_alpha_1x128(data_lo); + *alpha_hi = expand_alpha_1x128(data_hi); +} + +static force_inline void +expand_alpha_rev_2x128 
(vector unsigned int data_lo, + vector unsigned int data_hi, + vector unsigned int* alpha_lo, + vector unsigned int* alpha_hi) +{ +#ifdef WORDS_BIGENDIAN + *alpha_lo = vec_perm (data_lo, data_lo, + (vector unsigned char)AVV ( + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F)); + + *alpha_hi = vec_perm (data_hi, data_hi, + (vector unsigned char)AVV ( + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F)); +#else + *alpha_lo = vec_perm (data_lo, data_lo, + (vector unsigned char)AVV ( + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09)); + + *alpha_hi = vec_perm (data_hi, data_hi, + (vector unsigned char)AVV ( + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09)); +#endif +} + +static force_inline void +pix_multiply_2x128 (vector unsigned int* data_lo, + vector unsigned int* data_hi, + vector unsigned int* alpha_lo, + vector unsigned int* alpha_hi, + vector unsigned int* ret_lo, + vector unsigned int* ret_hi) +{ + *ret_lo = pix_multiply(*data_lo, *alpha_lo); + *ret_hi = pix_multiply(*data_hi, *alpha_hi); +} + +static force_inline void +over_2x128 (vector unsigned int* src_lo, + vector unsigned int* src_hi, + vector unsigned int* alpha_lo, + vector unsigned int* alpha_hi, + vector unsigned int* dst_lo, + vector unsigned int* dst_hi) +{ + vector unsigned int t1, t2; + + negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); + + pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); + + *dst_lo = (vector unsigned int) + vec_adds ((vector unsigned char) *src_lo, + (vector unsigned char) *dst_lo); + + *dst_hi = (vector unsigned int) + vec_adds ((vector unsigned char) *src_hi, + (vector unsigned char) *dst_hi); +} + +static force_inline void +in_over_2x128 (vector unsigned int* src_lo, + vector unsigned int* src_hi, + vector unsigned int* alpha_lo, + vector unsigned int* alpha_hi, + 
vector unsigned int* mask_lo, + vector unsigned int* mask_hi, + vector unsigned int* dst_lo, + vector unsigned int* dst_hi) +{ + vector unsigned int s_lo, s_hi; + vector unsigned int a_lo, a_hi; + + pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); + pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); + + over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); +} + +static force_inline uint32_t +core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst) +{ + uint8_t a; + vector unsigned int vmxs; + + a = src >> 24; + + if (a == 0xff) + { + return src; + } + else if (src) + { + vmxs = unpack_32_1x128 (src); + return pack_1x128_32( + over(vmxs, expand_alpha_1x128 (vmxs), unpack_32_1x128 (dst))); + } + + return dst; +} + +static force_inline uint32_t +combine1 (const uint32_t *ps, const uint32_t *pm) +{ + uint32_t s = *ps; + + if (pm) + { + vector unsigned int ms, mm; + + mm = unpack_32_1x128 (*pm); + mm = expand_alpha_1x128 (mm); + + ms = unpack_32_1x128 (s); + ms = pix_multiply (ms, mm); + + s = pack_1x128_32 (ms); + } + + return s; +} + +static force_inline vector unsigned int +combine4 (const uint32_t* ps, const uint32_t* pm) +{ + vector unsigned int vmx_src_lo, vmx_src_hi; + vector unsigned int vmx_msk_lo, vmx_msk_hi; + vector unsigned int s; + + if (pm) + { + vmx_msk_lo = load_128_unaligned(pm); + + if (is_transparent(vmx_msk_lo)) + return (vector unsigned int) AVV(0); + } + + s = load_128_unaligned(ps); + + if (pm) + { + unpack_128_2x128(s, (vector unsigned int) AVV(0), + &vmx_src_lo, &vmx_src_hi); + + unpack_128_2x128(vmx_msk_lo, (vector unsigned int) AVV(0), + &vmx_msk_lo, &vmx_msk_hi); + + expand_alpha_2x128(vmx_msk_lo, vmx_msk_hi, &vmx_msk_lo, &vmx_msk_hi); + + pix_multiply_2x128(&vmx_src_lo, &vmx_src_hi, + &vmx_msk_lo, &vmx_msk_hi, + &vmx_src_lo, &vmx_src_hi); + + s = pack_2x128_128(vmx_src_lo, vmx_src_hi); + } + + return s; +} + static void vmx_combine_over_u_no_mask (uint32_t * dest, const uint32_t *src, @@ 
-2080,6 +2547,15 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback) { pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths); + /* VMX constants */ + mask_00ff = create_mask_16_128 (0x00ff); + mask_ff000000 = create_mask_32_128 (0xff000000); + mask_red = create_mask_32_128 (0x00f80000); + mask_green = create_mask_32_128 (0x0000fc00); + mask_blue = create_mask_32_128 (0x000000f8); + mask_565_fix_rb = create_mask_32_128 (0x00e000e0); + mask_565_fix_g = create_mask_32_128 (0x0000c000); + /* Set up function pointers */ imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u; |