gradientssse2-gradients

author: Søren Sandmann Pedersen <ssp@redhat.com> 2011-09-27 11:23:49 -0400
committer: Søren Sandmann Pedersen <ssp@redhat.com> 2011-09-27 11:23:49 -0400
commit: 76c13dac657b1bde44a4fd01acc43d80f8971198 (patch)
tree: ae32292ea05e117622a851854d8ec0dcf0709cc7
parent: 57fd8c37aa3148b1d70bad65e1a49721e9a47d7e (diff)
4 files changed, 885 insertions, 2 deletions
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 2421a4f9..a4dc03cd 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -50,8 +50,12 @@ endif
 # sse2 code
 if USE_SSE2
 noinst_LTLIBRARIES += libpixman-sse2.la
-libpixman_sse2_la_SOURCES = \
-	pixman-sse2.c
+libpixman_sse2_la_SOURCES =		\
+	pixman-sse2.c			\
+	pixman-sse2-linear-gradient.c	\
+	pixman-sse2-radial-gradient.c	\
+	pixman-sse2-gradient.h		\
+	pixman-sse2-gradient-walker.c
 libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS)
 libpixman_sse2_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
diff --git a/pixman/pixman-sse2-gradient.h b/pixman/pixman-sse2-gradient.h
new file mode 100644
index 00000000..d356b9c4
--- /dev/null
+++ b/pixman/pixman-sse2-gradient.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright © 2010 Novell, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Chris Toshok (toshok@novell.com)
+ */
+
+#ifndef PIXMAN_GRADIENT_SSE2_H
+#define PIXMAN_GRADIENT_SSE2_H
+
+#define CLAMP_MAX(v,max) ((v) < 0 ? 0 : (v) > (max) ? (max) : (v)) /* clamp v to [0, max]; v and max can be evaluated more than once */
+
+#define SSE2_USE_GRADIENT_WALKER(_ts,_buffer,_lut,_shift,_storeop)	\
+  do { /* store _pixman_sse2_gradient_walker_pixel() of _ts into _buffer; `walker` comes from the caller's scope, _lut and _shift are ignored */ \
+    _mm_##_storeop##_si128 ((__m128i*)_buffer, _pixman_sse2_gradient_walker_pixel (&walker, _ts)); \
+  } while (0)
+
+#define SSE2_LUT_REPEAT_NORMAL(_ts,_buffer,_lut,_shift,_storeop)	\
+  do {									\
+    __m128i x = _ts;							\
+    /* Per 32-bit lane of _ts (mask_0xffff comes from the caller's scope): */ \
+    /* t = abs(t); */							\
+    /* t = t & 0xFFFF; */						\
+    /* t >>= shift; */							\
+    /* abs() hack from http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs */ \
+    __m128i mask = _mm_srai_epi32 (x, 31);				\
+    x = _mm_xor_si128 (_mm_add_epi32 (x, mask), mask);			\
+    									\
+    x = _mm_and_si128 (x, mask_0xffff);					\
+    x = _mm_srai_epi32 (x, _shift);					\
+    									\
+    _mm_##_storeop##_si128 ((__m128i*)_buffer, x);			\
+    /* replace each of the four stored indices by its _lut entry */	\
+    _buffer[0] = _lut[_buffer[0]];					\
+    _buffer[1] = _lut[_buffer[1]];					\
+    _buffer[2] = _lut[_buffer[2]];					\
+    _buffer[3] = _lut[_buffer[3]];					\
+} while (0)
+
+#define SSE2_LUT_REPEAT_REFLECT(_ts,_buffer,_lut,_shift,_storeop)	\
+  do {									\
+    __m128i x = _ts;							\
+    /* REFLECT repeat, per 32-bit lane of _ts (only read through x): */ \
+    /* s = t << 15 >> 31;  t = (t ^ s) & 0xFFFF; */			\
+    /* t >>= shift */							\
+    __m128i ss =  _mm_srai_epi32 (_mm_slli_epi32 (x, 15), 31);		\
+    x = _mm_xor_si128 (ss, x);						\
+    x = _mm_and_si128 (x, mask_0xffff);					\
+    x = _mm_srai_epi32 (x, _shift);					\
+    /* mask_0xffff above is taken from the caller's scope */		\
+    _mm_##_storeop##_si128 ((__m128i*)_buffer, x);			\
+    /* replace each of the four stored indices by its _lut entry */	\
+    _buffer[0] = _lut[_buffer[0]];					\
+    _buffer[1] = _lut[_buffer[1]];					\
+    _buffer[2] = _lut[_buffer[2]];					\
+    _buffer[3] = _lut[_buffer[3]];					\
+} while (0)
+
+#define SSE2_LUT_REPEAT_NONE(_ts,_buffer,_lut,_shift,_storeop)		\
+  do {									\
+    __m128i x = _ts;							\
+    /* NONE repeat: lanes of _ts outside [0, 0xffff] become 0, the others _lut[t >> shift] */ \
+    __m128i cmp_lt0 = _mm_cmplt_epi32 (x, _mm_set1_epi32 (0));		\
+    __m128i cmp_gt0xffff = _mm_cmpgt_epi32 (x, mask_0xffff);		\
+    									\
+    /* cmp_mask_in_range = 0xffffffff where values are >= 0 and <= 0xffff, */ \
+    /* cmp_mask_in_range = 0x00000000 where values are < 0 or > 0xffff */ \
+    									\
+    /* cmp_mask_out_of_range = 0xffffffff where values are < 0 or > 0xffff, */ \
+    /* cmp_mask_out_of_range = 0x00000000 where values are >= 0 and <= 0xffff */ \
+    __m128i cmp_mask_out_of_range = _mm_or_si128 (cmp_lt0, cmp_gt0xffff); \
+    __m128i cmp_mask_in_range = _mm_andnot_si128 (cmp_mask_out_of_range, \
+						  _mm_set1_epi32 (0xffffffff)); \
+    									\
+    /* take care of the shift now, since we don't want to shift the sentinel (0xffffffff) */ \
+    x = _mm_srai_epi32 (x, _shift);					\
+    									\
+    /* or together the masked in range x values and the out of range sentinel values */ \
+    x = _mm_or_si128 (_mm_and_si128 (cmp_mask_in_range, x),		\
+		      _mm_and_si128 (cmp_mask_out_of_range, _mm_set1_epi32 (0xffffffff))); \
+    									\
+    _mm_##_storeop##_si128 ((__m128i*)_buffer, x);			\
+    /* sentinel lanes become 0 (transparent black), the others are looked up in _lut */ \
+    _buffer[0] = _buffer[0] == 0xffffffff ? 0 : _lut[_buffer[0]];	\
+    _buffer[1] = _buffer[1] == 0xffffffff ? 0 : _lut[_buffer[1]];	\
+    _buffer[2] = _buffer[2] == 0xffffffff ? 0 : _lut[_buffer[2]];	\
+    _buffer[3] = _buffer[3] == 0xffffffff ? 0 : _lut[_buffer[3]];	\
+} while (0)
+
+#define SSE2_LUT_REPEAT_PAD(_ts,_buffer,_lut,_shift,_storeop)		\
+  do {									\
+    /* PAD repeat, per 32-bit lane of _ts; mask_0xffff comes from the caller's scope: */ \
+    /* t = t < 0 ? 0 : t > 0xffff ? 0xffff : t */			\
+    /* t >>= shift */							\
+    									\
+    __m128i x = _ts;							\
+    __m128i cmp;							\
+    									\
+    /* handle t = t < 0 ? 0 */						\
+    cmp = _mm_cmpgt_epi32 (x, _mm_set1_epi32 (0));			\
+    x = _mm_and_si128 (x, cmp);						\
+    									\
+    /* handle t > 0xffff ? 0xffff */					\
+    cmp = _mm_cmpgt_epi32 (x, mask_0xffff);				\
+    x = _mm_andnot_si128 (cmp, x);					\
+    x = _mm_add_epi32 (x, _mm_and_si128 (cmp, mask_0xffff));		\
+    									\
+    x = _mm_srai_epi32 (x, _shift);					\
+    									\
+    _mm_##_storeop##_si128 ((__m128i*)_buffer, x);			\
+    /* replace each of the four stored indices by its _lut entry */	\
+    _buffer[0] = _lut[_buffer[0]];					\
+    _buffer[1] = _lut[_buffer[1]];					\
+    _buffer[2] = _lut[_buffer[2]];					\
+    _buffer[3] = _lut[_buffer[3]];					\
+} while (0)
+
+#endif
diff --git a/pixman/pixman-sse2-linear-gradient.c b/pixman/pixman-sse2-linear-gradient.c
new file mode 100644
index 00000000..b47d7c52
--- /dev/null
+++ b/pixman/pixman-sse2-linear-gradient.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2010 Novell, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h>
+
+#include <assert.h>
+
+#include "pixman-sse2-gradient.h"
+
+#include "pixman-sse2-gradient-walker.c"
+
+#define LIKELY(x)       __builtin_expect ((x),1)
+#define UNLIKELY(x)     __builtin_expect ((x),0)
+
+#define SSE2_LINEAR_GRADIENT(lut_macro)					\
+    do									\
+    {									\
+	/* Fill [buffer, end) with gradient colors, four pixels */	\
+	/* per step.  ts holds t for the next four pixels and */	\
+	/* lut_macro (ts, dst, lut, shift, storeop) writes their */	\
+	/* colors to dst.  buffer, end, ts, incs, inc, tsf, lut */	\
+	/* and shift are taken from the expansion site; buffer */	\
+	/* and ts are advanced. */					\
+	int count;							\
+									\
+	if ((int)buffer & 0x0f)						\
+	{								\
+	    /* we aren't aligned to a 16 byte boundary. */		\
+	    /* process the at most 3 pixels until we get */		\
+	    /* there, but never write past end */			\
+	    int initial = (16 - ((int)buffer & 0x0f)) >> 2;		\
+	    int i;							\
+	    								\
+	    /* a scanline shorter than the gap to the boundary */	\
+	    /* must not be overrun */					\
+	    if (initial > end - buffer)					\
+		initial = (int)(end - buffer);				\
+	    								\
+	    /* colors for the next four pixels, in tsf */		\
+	    lut_macro (ts, tsf, lut, shift, storeu);			\
+	    								\
+	    /* lane i of tsf belongs to the i-th pixel */		\
+	    for (i = 0; i < initial; i++)				\
+		*buffer++ = tsf[i];					\
+	    								\
+	    /* step ts past the pixels just written */			\
+	    ts = _mm_add_epi32 (ts, _mm_set1_epi32 (inc * initial));	\
+	}								\
+									\
+	count = end - buffer;						\
+	while (count >= 4)						\
+	{								\
+	    lut_macro (ts, buffer, lut, shift, store);			\
+	    								\
+	    buffer += 4;						\
+	    /* t += 4 * inc */						\
+	    ts = _mm_add_epi32 (ts, incs);				\
+	    								\
+	    count -= 4;							\
+	}								\
+									\
+	if (buffer < end)						\
+	{								\
+	    /* we're at most 3 pixels off, so unroll/inline it here */	\
+	    int rem = end - buffer;					\
+	    								\
+	    lut_macro (ts, tsf, lut, shift, storeu);			\
+	    								\
+	    if (rem-- > 0)						\
+		buffer[0] = tsf[0];					\
+	    if (rem-- > 0)						\
+		buffer[1] = tsf[1];					\
+	    if (rem-- > 0)						\
+		buffer[2] = tsf[2];					\
+	}								\
+    }									\
+    while (0)
+
+
+/* Write the linear gradient colors for `width` pixels starting at (x, y)
+ * into buffer.  Only l == 0 and affine transforms are handled here: the
+ * projective path below is compiled out (#if 0), so in that case buffer is
+ * left untouched.  Returns early if pixman_transform_point_3d () fails.
+ * mask and mask_bits are only referenced by the compiled-out code. */
+void
+_pixman_sse2_linear_gradient_get_scanline_32 (pixman_image_t *image,
+					      int             x,
+					      int             y,
+					      int             width,
+					      uint32_t *      buffer,
+					      const uint32_t *mask,
+					      uint32_t        mask_bits)
+{
+    pixman_vector_t v, unit;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy, a, b, off;
+    gradient_t *gradient = (gradient_t *)image;
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+    
+    if (gradient->common.transform)
+    {
+	if (!pixman_transform_point_3d (gradient->common.transform, &v))
+	    return;
+	
+	unit.vector[0] = gradient->common.transform->matrix[0][0];
+	unit.vector[1] = gradient->common.transform->matrix[1][0];
+	unit.vector[2] = gradient->common.transform->matrix[2][0];
+    }
+    else
+    {
+	unit.vector[0] = pixman_fixed_1;
+	unit.vector[1] = 0;
+	unit.vector[2] = 0;
+    }
+    
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+    
+    l = dx * dx + dy * dy;
+    
+    if (l != 0)
+    {
+	a = (dx << 32) / l;
+	b = (dy << 32) / l;
+	off = (-a * linear->p1.x
+	       -b * linear->p1.y) >> 16;
+    }
+    
+    if (l == 0 || (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1))
+    {
+	pixman_fixed_48_16_t inc, t;
+	
+	/* affine transformation only */
+	if (l == 0)
+	{
+	    t = 0;
+	    inc = 0;
+	}
+	else
+	{
+	    t = ((a * v.vector[0] + b * v.vector[1]) >> 16) + off;
+	    inc = (a * unit.vector[0] + b * unit.vector[1]) >> 16;
+	}
+	
+	if (gradient->class == SOURCE_IMAGE_CLASS_VERTICAL)
+	{
+	    uint32_t *lut = gradient->color_lut;
+	    uint32_t color;
+	    __m128i colors;
+	    
+	    if (lut != NULL)
+	    {
+		unsigned int repeat = gradient->common.repeat;
+		int shift = 16 - gradient->color_lut_bits;
+		
+		if (repeat == PIXMAN_REPEAT_NORMAL)
+		{
+		    /* color = _pixman_gradient_walker_pixel_lut_repeat_normal (&walker, t); */
+		    if (t < 0)
+		        t = -t;
+		    t = t & 0xFFFF;
+		    
+		    color = lut[t >> shift];
+		}
+		else if (repeat == PIXMAN_REPEAT_PAD)
+		{
+		    /* color = _pixman_gradient_walker_pixel_lut_repeat_pad (&walker, t); */
+		    
+		    t = CLAMP_MAX (t, 0xFFFF);
+		    color = lut[t >> shift];
+		}
+		else if (repeat == PIXMAN_REPEAT_REFLECT)
+		{
+		    /* flip the low 16 bits when bit 16 is set (same as SSE2_LUT_REPEAT_REFLECT) */
+		    if (t & 0x10000)
+			t = ~t;
+		    t &= 0xFFFF;
+		    color = lut[t >> shift];
+		}
+		else /* PIXMAN_REPEAT_NONE */
+		{
+		    /* color = _pixman_gradient_walker_pixel_lut_repeat_none (&walker, t); */
+		    if (t < 0 || t > 0xFFFF)
+			color = 0 /* transparent black */;
+		    else
+			color = lut[t >> shift];
+		}
+		
+		colors = _mm_set1_epi32 (color);
+	    }
+	    else
+	    {
+	        pixman_sse2_gradient_walker_t walker;
+		
+		_pixman_sse2_gradient_walker_init (&walker, gradient, gradient->common.repeat);
+		
+	        __m128i ts = _mm_set1_epi32 (t);
+	        colors = _pixman_sse2_gradient_walker_pixel (&walker, ts);
+		color = _mm_cvtsi128_si32 (colors);
+	    }
+	    
+	    /* leading pixels up to the first 16 byte boundary, or end */
+	    while (buffer < end && ((int)buffer & 0x0f))
+		*buffer++ = color;
+	    
+	    /* whole groups of four pixels with aligned stores */
+	    while (end - buffer >= 4)
+	    {
+		_mm_store_si128 ((__m128i*)buffer, colors);
+		buffer += 4;
+	    }
+	    
+	    /* at most 3 trailing pixels */
+	    while (buffer < end)
+		*buffer++ = color;
+	}
+	else
+	{
+	    uint32_t *lut = gradient->color_lut;
+	    int shift = 16 - gradient->color_lut_bits;
+	    
+	    __m128i incs = _mm_set1_epi32 (inc);
+	    __m128i ts = _mm_set_epi32 (t + 3 * inc, t + 2 * inc, t + inc, t);
+	    __m128i mask_0xffff =  _mm_set1_epi32 (0xffff);
+	    
+	    pixman_fixed_16_16_t tsf[4];
+	    
+	    incs = _mm_slli_epi32 (incs, 2);
+	    
+	    if (gradient->color_lut != NULL && (((int)buffer & 0x03) == 0))
+	    {
+		unsigned int repeat = gradient->common.repeat;
+		
+		if (repeat == PIXMAN_REPEAT_NORMAL)
+		{
+		    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_NORMAL);
+		}
+		else if (repeat == PIXMAN_REPEAT_PAD)
+		{
+		    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_PAD);
+		}
+		else if (repeat == PIXMAN_REPEAT_REFLECT)
+		{
+		    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_REFLECT);
+		}
+		else /* PIXMAN_REPEAT_NONE */
+		{
+		    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_NONE);
+		}
+	    }
+	    else
+	    {
+	        pixman_sse2_gradient_walker_t walker;
+		_pixman_sse2_gradient_walker_init (&walker, gradient, gradient->common.repeat);
+		
+		SSE2_LINEAR_GRADIENT (SSE2_USE_GRADIENT_WALKER);
+	    }
+	}
+    }
+#if 0
+    else
+    {
+	/* projective transformation */
+	pixman_fixed_48_16_t t;
+	
+	if (gradient->class == SOURCE_IMAGE_CLASS_VERTICAL)
+	{
+	    register uint32_t color;
+	    
+	    if (v.vector[2] == 0)
+	    {
+		t = 0;
+	    }
+	    else
+	    {
+		pixman_fixed_48_16_t x, y;
+		
+		x = ((pixman_fixed_48_16_t) v.vector[0] << 16) / v.vector[2];
+		y = ((pixman_fixed_48_16_t) v.vector[1] << 16) / v.vector[2];
+		t = ((a * x + b * y) >> 16) + off;
+	    }
+	    
+	    color = _pixman_gradient_walker_pixel (&walker, t);
+	    
+	    while (buffer < end)
+		*buffer++ = color;
+	}
+	else
+	{
+	    while (buffer < end)
+	    {
+		if (!mask || *mask++ & mask_bits)
+		{
+		    if (v.vector[2] == 0)
+		    {
+			t = 0;
+		    }
+		    else
+		    {
+			pixman_fixed_48_16_t x, y;
+			x = ((pixman_fixed_48_16_t)v.vector[0] << 16) / v.vector[2];
+			y = ((pixman_fixed_48_16_t)v.vector[1] << 16) / v.vector[2];
+			t = ((a * x + b * y) >> 16) + off;
+		    }
+		    
+		    *buffer = _pixman_gradient_walker_pixel (&walker, t);
+		}
+		
+		++buffer;
+		
+		v.vector[0] += unit.vector[0];
+		v.vector[1] += unit.vector[1];
+		v.vector[2] += unit.vector[2];
+	    }
+	}
+    }
+#endif
+}
diff --git a/pixman/pixman-sse2-radial-gradient.c b/pixman/pixman-sse2-radial-gradient.c
new file mode 100644
index 00000000..dbb20eef
--- /dev/null
+++ b/pixman/pixman-sse2-radial-gradient.c
@@ -0,0 +1,387 @@
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2010 Novell, Inc.
+ *
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h>
+
+#include "pixman-sse2-gradient.h"
+
+#include "pixman-sse2-gradient-walker.c"
+
+#define LIKELY(x)       __builtin_expect((x),1)
+#define UNLIKELY(x)     __builtin_expect((x),0)
+
+#define SSE2_RADIAL_GRADIENT(lut_macro) do {		\
+    /* Fill [buffer, end) four pixels at a time.  Uses buffer, end, */	\
+    /* tsf, lut, shift and the *_ SSE state of the expansion site. */	\
+    int count;							\
+    								\
+    if ((int)buffer & 0x0f)						\
+    {									\
+	/* we aren't aligned to a 16 byte boundary. */			\
+	/* process the at most 3 pixels until we get */			\
+	/* there, but never write past end */				\
+	int initial = (16 - ((int)buffer & 0x0f)) >> 2;			\
+	int i;								\
+	float scale;							\
+									\
+	__m128i ts = calc_radial_det4 (B_, A4_, invA_,			\
+				       pdx_, pdy_, r1sq_, invert_);	\
+									\
+	if (initial > end - buffer)					\
+	    initial = (int)(end - buffer);				\
+									\
+	lut_macro(ts, tsf, lut, shift, storeu);				\
+									\
+	/* lane i of tsf belongs to the i-th pixel */			\
+	for (i = 0; i < initial; i++)					\
+	    *buffer++ = tsf[i];						\
+									\
+	/* advance the per-lane state by the pixels just written */	\
+	scale = initial;						\
+									\
+	pdx_ = _mm_add_ps (pdx_, _mm_mul_ps (cx_, _mm_load1_ps (&scale))); \
+	pdy_ = _mm_add_ps (pdy_, _mm_mul_ps (cy_, _mm_load1_ps (&scale))); \
+	B_ = _mm_add_ps (B_, _mm_mul_ps (cB_, _mm_load1_ps (&scale)));	\
+    }									\
+    									\
+    count = end - buffer;						\
+    while (count >= 4)							\
+    {									\
+	__m128i ts = calc_radial_det4 (B_, A4_, invA_,			\
+				       pdx_, pdy_, r1sq_, invert_);	\
+									\
+	lut_macro(ts, buffer, lut, shift, store);			\
+									\
+	buffer += 4;							\
+									\
+	pdx_ = _mm_add_ps (pdx_, cx_4);					\
+	pdy_ = _mm_add_ps (pdy_, cy_4);					\
+	B_ = _mm_add_ps (B_, cB_4);					\
+									\
+	count -= 4;							\
+    }									\
+    									\
+    if (buffer < end)							\
+    {									\
+	/* we're at most 3 pixels off, so unroll/inline it here */	\
+	int rem = end - buffer;						\
+									\
+	__m128i ts = calc_radial_det4 (B_, A4_, invA_,			\
+				       pdx_, pdy_, r1sq_, invert_);	\
+									\
+	lut_macro(ts, tsf, lut, shift, storeu);				\
+									\
+	if (rem-- > 0) buffer[0] = tsf[0];				\
+	if (rem-- > 0) buffer[1] = tsf[1];				\
+	if (rem-- > 0) buffer[2] = tsf[2];				\
+    }									\
+  } while (0)
+
+/* Per lane: invA * (B + invert * sqrt (discriminant)), with negative discriminants replaced by 0, converted to 32-bit integers. */
+static force_inline __m128i
+calc_radial_det4 (__m128 B, __m128 A4, __m128 invA, __m128 pdx, __m128 pdy, __m128 r1sq, __m128 invert)
+{
+    __m128 Bsq = _mm_mul_ps (B, B);
+
+    /* discriminant = A4 * (pdx*pdx + pdy * pdy - r1sq) + Bsq */
+    __m128 discriminant = _mm_add_ps (_mm_mul_ps (A4,
+						  _mm_sub_ps (_mm_add_ps (_mm_mul_ps (pdx, pdx),
+									  _mm_mul_ps (pdy, pdy)),
+							      r1sq)),
+				      Bsq);
+
+    /* find the negative discriminants: lt0_mask has all bits set in */
+    /* the lanes whose discriminant is less than 0, and none elsewhere. */
+    __m128 lt0_mask = _mm_cmplt_ps (discriminant, _mm_setzero_ps());
+
+    /* zero out the negative ones */
+    discriminant = _mm_andnot_ps (lt0_mask, discriminant);
+
+    /* sqrt them all */
+    discriminant = _mm_sqrt_ps (discriminant);
+
+    discriminant = _mm_mul_ps (invA,
+			       _mm_add_ps (B,
+					   _mm_mul_ps (invert,
+						       discriminant)));
+
+    return  _mm_cvtps_epi32 (discriminant);
+}
+
+void
+_pixman_sse2_radial_gradient_get_scanline_32 (pixman_image_t *image,
+					      int             x,
+					      int             y,
+					      int             width,
+					      uint32_t *      buffer,
+					      const uint32_t *mask,
+					      uint32_t        mask_bits)
+{
+    /* Fill buffer[0..width) with the radial gradient for the scanline at (x, y).
+     * In the radial gradient problem we are given two circles (c₁,r₁) and
+     * (c₂,r₂) that define the gradient itself. Then, for any point p, we
+     * must compute the value(s) of t within [0.0, 1.0] representing the
+     * circle(s) that would color the point.
+     *
+     * There are potentially two values of t since the point p can be
+     * colored by both sides of the circle, (which happens whenever one
+     * circle is not entirely contained within the other).
+     *
+     * If we solve for a value of t that is outside of [0.0, 1.0] then we
+     * use the extend mode (NONE, REPEAT, REFLECT, or PAD) to map to a
+     * value within [0.0, 1.0].
+     *
+     * Here is an illustration of the problem:
+     *
+     *              p₂
+     *           p  •
+     *           •   ╲
+     *        ·       ╲r₂
+     *  p₁ ·           ╲
+     *  •              θ╲
+     *   ╲             ╌╌•
+     *    ╲r₁        ·   c₂
+     *    θ╲    ·
+     *    ╌╌•
+     *      c₁
+     *
+     * Given (c₁,r₁), (c₂,r₂) and p, we must find an angle θ such that two
+     * points p₁ and p₂ on the two circles are collinear with p. Then, the
+     * desired value of t is the ratio of the length of p₁p to the length
+     * of p₁p₂.
+     *
+     * So, we have six unknown values: (p₁x, p₁y), (p₂x, p₂y), θ and t.
+     * We can also write six equations that constrain the problem:
+     *
+     * Point p₁ is a distance r₁ from c₁ at an angle of θ:
+     *
+     *	1. p₁x = c₁x + r₁·cos θ
+     *	2. p₁y = c₁y + r₁·sin θ
+     *
+     * Point p₂ is a distance r₂ from c₂ at an angle of θ:
+     *
+     *	3. p₂x = c₂x + r₂·cos θ
+     *	4. p₂y = c₂y + r₂·sin θ
+     *
+     * Point p lies at a fraction t along the line segment p₁p₂:
+     *
+     *	5. px = t·p₂x + (1-t)·p₁x
+     *	6. py = t·p₂y + (1-t)·p₁y
+     *
+     * To solve, first substitute 1-4 into 5 and 6:
+     *
+     * px = t·(c₂x + r₂·cos θ) + (1-t)·(c₁x + r₁·cos θ)
+     * py = t·(c₂y + r₂·sin θ) + (1-t)·(c₁y + r₁·sin θ)
+     *
+     * Then solve each for cos θ and sin θ expressed as a function of t:
+     *
+     * cos θ = (-(c₂x - c₁x)·t + (px - c₁x)) / ((r₂-r₁)·t + r₁)
+     * sin θ = (-(c₂y - c₁y)·t + (py - c₁y)) / ((r₂-r₁)·t + r₁)
+     *
+     * To simplify this a bit, we define new variables for several of the
+     * common terms as shown below:
+     *
+     *              p₂
+     *           p  •
+     *           •   ╲
+     *        ·  ┆    ╲r₂
+     *  p₁ ·     ┆     ╲
+     *  •     pdy┆      ╲
+     *   ╲       ┆       •c₂
+     *    ╲r₁    ┆   ·   ┆
+     *     ╲    ·┆       ┆cdy
+     *      •╌╌╌╌┴╌╌╌╌╌╌╌┘
+     *    c₁  pdx   cdx
+     *
+     * cdx = (c₂x - c₁x)
+     * cdy = (c₂y - c₁y)
+     *  dr =  r₂-r₁
+     * pdx =  px - c₁x
+     * pdy =  py - c₁y
+     *
+     * Note that cdx, cdy, and dr do not depend on point p at all, so can
+     * be pre-computed for the entire gradient. The simplified equations
+     * are now:
+     *
+     * cos θ = (-cdx·t + pdx) / (dr·t + r₁)
+     * sin θ = (-cdy·t + pdy) / (dr·t + r₁)
+     *
+     * Finally, to get a single function of t and eliminate the last
+     * unknown θ, we use the identity sin²θ + cos²θ = 1. First, square
+     * each equation, (we knew a quadratic was coming since it must be
+     * possible to obtain two solutions in some cases):
+     *
+     * cos²θ = (cdx²t² - 2·cdx·pdx·t + pdx²) / (dr²·t² + 2·r₁·dr·t + r₁²)
+     * sin²θ = (cdy²t² - 2·cdy·pdy·t + pdy²) / (dr²·t² + 2·r₁·dr·t + r₁²)
+     *
+     * Then add both together, set the result equal to 1, and express as a
+     * standard quadratic equation in t of the form At² + Bt + C = 0
+     *
+     * (cdx² + cdy² - dr²)·t² - 2·(cdx·pdx + cdy·pdy + r₁·dr)·t + (pdx² + pdy² - r₁²) = 0
+     *
+     * In other words:
+     *
+     * A = cdx² + cdy² - dr²
+     * B = -2·(pdx·cdx + pdy·cdy + r₁·dr)
+     * C = pdx² + pdy² - r₁²
+     *
+     * And again, notice that A does not depend on p, so can be
+     * precomputed. From here we just use the quadratic formula to solve
+     * for t:
+     *
+     * t = (-B ± ⎷(B² - 4·A·C)) / (2·A)
+     */
+    /* NOTE(review): cz and rz are assigned below but never read, so projective transforms look unsupported here; confirm. */
+    gradient_t *gradient = (gradient_t *)image;
+    source_image_t *source = (source_image_t *)image;
+    radial_gradient_t *radial = (radial_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    float cx = 1.;
+    float cy = 0.;
+    float cz = 0.;
+    float rx = x + 0.5;
+    float ry = y + 0.5;
+    float rz = 1.;
+
+    if (source->common.transform)
+    {
+	pixman_vector_t v;
+	/* reference point is the center of the pixel */
+	v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+	v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+	v.vector[2] = pixman_fixed_1;
+	
+	if (!pixman_transform_point_3d (source->common.transform, &v))
+	    return;
+
+	cx = source->common.transform->matrix[0][0] / 65536.;
+	cy = source->common.transform->matrix[1][0] / 65536.;
+	cz = source->common.transform->matrix[2][0] / 65536.;
+	
+	rx = v.vector[0] / 65536.;
+	ry = v.vector[1] / 65536.;
+	rz = v.vector[2] / 65536.;
+    }
+
+    /* When computing t over a scanline, we notice that some expressions
+     * are constant so we can compute them just once. Given:
+     *
+     * t = (-B ± ⎷(B² - 4·A·C)) / (2·A)
+     *
+     * where
+     *
+     * A = cdx² + cdy² - dr² [precomputed as radial->A]
+     * B = -2·(pdx·cdx + pdy·cdy + r₁·dr)
+     * C = pdx² + pdy² - r₁²
+     *
+     * Since we have an affine transformation, we know that (pdx, pdy)
+     * increase linearly with each pixel,
+     *
+     * pdx = pdx₀ + n·cx,
+     * pdy = pdy₀ + n·cy,
+     *
+     * we can then express B in terms of a linear increment along
+     * the scanline:
+     *
+     * B = B₀ + n·cB, with
+     * B₀ = -2·(pdx₀·cdx + pdy₀·cdy + r₁·dr) and
+     * cB = -2·(cx·cdx + cy·cdy)
+     *
+     * Thus we can replace the full evaluation of B per-pixel (4 multiplies,
+     * 2 additions) with a single addition.
+     */
+    float r1   = radial->c1.radius / 65536.;
+    float r1sq = r1 * r1;
+    float pdx  = rx - radial->c1.x / 65536.;
+    float pdy  = ry - radial->c1.y / 65536.;
+    float A = radial->A;
+    float invA = -65536. / (2. * A);
+    float A4 = -4. * A;
+    float B  = -2. * (pdx*radial->cdx + pdy*radial->cdy + r1*radial->dr);
+    float cB = -2. *  (cx*radial->cdx +  cy*radial->cdy);
+    pixman_bool_t invert = A * radial->dr < 0;
+
+    uint32_t *lut = gradient->color_lut;
+
+    int shift = 16 - gradient->color_lut_bits;
+
+    __m128i mask_0xffff =  _mm_set1_epi32(0xffff);
+    __m128 invert_ = _mm_set1_ps (invert ? 1.0 : -1.0);
+    __m128 A4_ = _mm_set1_ps (A4);
+    __m128 invA_ = _mm_set1_ps (invA);
+    __m128 r1sq_ = _mm_set1_ps (r1sq);
+    __m128 pdx_ = _mm_set_ps (pdx + 3 * cx, pdx + 2 * cx, pdx + cx, pdx);
+    __m128 pdy_ = _mm_set_ps (pdy + 3 * cy, pdy + 2 * cy, pdy + cy, pdy);
+    __m128 B_ = _mm_set_ps (B + 3 * cB, B + 2 * cB, B + cB, B);	
+    __m128 cx_ = _mm_set1_ps (cx);
+    __m128 cy_ = _mm_set1_ps (cy);
+    __m128 cB_ = _mm_set1_ps (cB);
+    __m128 cx_4 = _mm_mul_ps (cx_, _mm_set1_ps (4.0));
+    __m128 cy_4 = _mm_mul_ps (cy_, _mm_set1_ps (4.0));
+    __m128 cB_4 = _mm_mul_ps (cB_, _mm_set1_ps (4.0));
+
+    pixman_fixed_16_16_t tsf[4];
+
+    if (gradient->color_lut)
+    {
+        unsigned int repeat = source->common.repeat;
+        if (repeat == PIXMAN_REPEAT_NORMAL)
+	{
+	    SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_NORMAL);
+	}
+	else if (repeat == PIXMAN_REPEAT_PAD)
+	{
+	    SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_PAD);
+	}
+	else if (repeat == PIXMAN_REPEAT_REFLECT)
+	{
+	    SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_REFLECT);
+	}
+	else /* PIXMAN_REPEAT_NONE */
+	{
+	    SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_NONE);
+	}
+    }
+    else
+    {
+        pixman_sse2_gradient_walker_t walker;
+	_pixman_sse2_gradient_walker_init (&walker, gradient, source->common.repeat);
+
+	SSE2_RADIAL_GRADIENT(SSE2_USE_GRADIENT_WALKER);
+    }
+}
author	Søren Sandmann Pedersen <ssp@redhat.com>	2011-09-27 11:23:49 -0400
committer	Søren Sandmann Pedersen <ssp@redhat.com>	2011-09-27 11:23:49 -0400
commit	76c13dac657b1bde44a4fd01acc43d80f8971198 (patch)
tree	ae32292ea05e117622a851854d8ec0dcf0709cc7
parent	57fd8c37aa3148b1d70bad65e1a49721e9a47d7e (diff)