summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSøren Sandmann Pedersen <ssp@redhat.com>2011-09-27 11:23:49 -0400
committerSøren Sandmann Pedersen <ssp@redhat.com>2011-09-27 11:23:49 -0400
commit76c13dac657b1bde44a4fd01acc43d80f8971198 (patch)
treeae32292ea05e117622a851854d8ec0dcf0709cc7
parent57fd8c37aa3148b1d70bad65e1a49721e9a47d7e (diff)
-rw-r--r--pixman/Makefile.am8
-rw-r--r--pixman/pixman-sse2-gradient.h136
-rw-r--r--pixman/pixman-sse2-linear-gradient.c356
-rw-r--r--pixman/pixman-sse2-radial-gradient.c387
4 files changed, 885 insertions, 2 deletions
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 2421a4f9..a4dc03cd 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -50,8 +50,12 @@ endif
# sse2 code
if USE_SSE2
noinst_LTLIBRARIES += libpixman-sse2.la
-libpixman_sse2_la_SOURCES = \
- pixman-sse2.c
+libpixman_sse2_la_SOURCES = \
+ pixman-sse2.c \
+ pixman-sse2-linear-gradient.c \
+ pixman-sse2-radial-gradient.c \
+ pixman-sse2-gradient.h \
+ pixman-sse2-gradient-walker.c
libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS)
libpixman_sse2_la_LIBADD = $(DEP_LIBS)
libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
diff --git a/pixman/pixman-sse2-gradient.h b/pixman/pixman-sse2-gradient.h
new file mode 100644
index 00000000..d356b9c4
--- /dev/null
+++ b/pixman/pixman-sse2-gradient.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright © 2010 Novell, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Chris Toshok (toshok@novell.com)
+ */
+
+#ifndef PIXMAN_GRADIENT_SSE2_H
+#define PIXMAN_GRADIENT_SSE2_H
+
+/* Clamp v to the closed range [0, max].  Both arguments may be evaluated */
+/* more than once, so do not pass expressions with side effects.          */
+#define CLAMP_MAX(v,max) (((v) < 0) ? 0 : (((v) > (max)) ? (max) : (v)))
+
+/*
+ * Stand-in for the SSE2_LUT_REPEAT_* macros when no colour table is used:
+ * passes the four t values in _ts to _pixman_sse2_gradient_walker_pixel()
+ * and writes the returned 128-bit value to _buffer with
+ * _mm_<_storeop>_si128.  _lut and _shift are accepted only so that all
+ * "lut macros" share one signature; they are not used here.
+ *
+ * Relies on a variable named `walker` being in scope where it is expanded.
+ */
+#define SSE2_USE_GRADIENT_WALKER(_ts,_buffer,_lut,_shift,_storeop) \
+    do \
+    { \
+        __m128i walked_ = _pixman_sse2_gradient_walker_pixel (&walker, _ts); \
+        \
+        _mm_##_storeop##_si128 ((__m128i*)_buffer, walked_); \
+    } \
+    while (0)
+
+/*
+ * Colour-table lookup for four 32-bit t values (_ts) in REPEAT_NORMAL mode:
+ *
+ *     t = abs (t);  t &= 0xFFFF;  t >>= _shift;  pixel = _lut[t];
+ *
+ * The four table indices are written to _buffer with _mm_<_storeop>_si128
+ * and then overwritten in place with the looked-up values.
+ *
+ * Relies on `mask_0xffff` being in scope where it is expanded.
+ */
+#define SSE2_LUT_REPEAT_NORMAL(_ts,_buffer,_lut,_shift,_storeop) \
+    do \
+    { \
+        __m128i idx_ = _ts; \
+        /* abs() hack from */ \
+        /* http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs: */ \
+        /* sign_ is all ones in negative lanes, (v + sign_) ^ sign_ == |v| */ \
+        __m128i sign_ = _mm_srai_epi32 (idx_, 31); \
+        int lane_; \
+        \
+        idx_ = _mm_add_epi32 (idx_, sign_); \
+        idx_ = _mm_xor_si128 (idx_, sign_); \
+        \
+        idx_ = _mm_and_si128 (idx_, mask_0xffff); \
+        idx_ = _mm_srai_epi32 (idx_, _shift); \
+        \
+        _mm_##_storeop##_si128 ((__m128i*)_buffer, idx_); \
+        \
+        /* replace each stored index by its table entry */ \
+        for (lane_ = 0; lane_ < 4; lane_++) \
+            _buffer[lane_] = _lut[_buffer[lane_]]; \
+    } \
+    while (0)
+
+/*
+ * Colour-table lookup for four 32-bit t values (_ts) in REPEAT_REFLECT mode:
+ *
+ *     s = t << 15 >> 31;        (all ones when bit 16 of t is set)
+ *     t = (t ^ s) & 0xFFFF;
+ *     t >>= _shift;  pixel = _lut[t];
+ *
+ * The four table indices are written to _buffer with _mm_<_storeop>_si128
+ * and then overwritten in place with the looked-up values.
+ *
+ * Relies on `mask_0xffff` being in scope where it is expanded.  The input is
+ * taken only from the _ts parameter; the macro no longer reaches for a
+ * caller variable that happens to be called `ts`.
+ */
+#define SSE2_LUT_REPEAT_REFLECT(_ts,_buffer,_lut,_shift,_storeop) \
+    do \
+    { \
+        __m128i x = _ts; \
+        /* move bit 16 into the sign bit, then smear it over the lane */ \
+        __m128i ss = _mm_srai_epi32 (_mm_slli_epi32 (x, 15), 31); \
+        \
+        x = _mm_xor_si128 (ss, x); \
+        x = _mm_and_si128 (x, mask_0xffff); \
+        x = _mm_srai_epi32 (x, _shift); \
+        \
+        _mm_##_storeop##_si128 ((__m128i*)_buffer, x); \
+        \
+        _buffer[0] = _lut[_buffer[0]]; \
+        _buffer[1] = _lut[_buffer[1]]; \
+        _buffer[2] = _lut[_buffer[2]]; \
+        _buffer[3] = _lut[_buffer[3]]; \
+    } \
+    while (0)
+
+/*
+ * Colour-table lookup for four 32-bit t values (_ts) in REPEAT_NONE mode:
+ * lanes with t < 0 or t > 0xffff become 0 (transparent black), every other
+ * lane becomes _lut[t >> _shift].
+ *
+ * Out-of-range lanes are tagged with the sentinel 0xffffffff before the
+ * values are written to _buffer with _mm_<_storeop>_si128; the scalar pass
+ * afterwards turns the sentinel into 0 and looks everything else up.
+ *
+ * Relies on `mask_0xffff` being in scope where it is expanded.  The input is
+ * taken only from the _ts parameter.
+ */
+#define SSE2_LUT_REPEAT_NONE(_ts,_buffer,_lut,_shift,_storeop) \
+    do \
+    { \
+        __m128i x = _ts; \
+        \
+        __m128i cmp_lt0 = _mm_cmplt_epi32 (x, _mm_set1_epi32 (0)); \
+        __m128i cmp_gt0xffff = _mm_cmpgt_epi32 (x, mask_0xffff); \
+        \
+        /* 0xffffffff where the value is < 0 or > 0xffff, */ \
+        /* 0x00000000 where it is >= 0 and <= 0xffff      */ \
+        __m128i cmp_mask_out_of_range = _mm_or_si128 (cmp_lt0, cmp_gt0xffff); \
+        \
+        /* shift first: in-range values are non-negative, so the shifted */ \
+        /* index can never collide with the sentinel                     */ \
+        x = _mm_srai_epi32 (x, _shift); \
+        \
+        /* the mask itself is the sentinel: OR-ing it in leaves in-range */ \
+        /* lanes untouched and forces out-of-range lanes to 0xffffffff   */ \
+        x = _mm_or_si128 (x, cmp_mask_out_of_range); \
+        \
+        _mm_##_storeop##_si128 ((__m128i*)_buffer, x); \
+        \
+        _buffer[0] = _buffer[0] == 0xffffffff ? 0 : _lut[_buffer[0]]; \
+        _buffer[1] = _buffer[1] == 0xffffffff ? 0 : _lut[_buffer[1]]; \
+        _buffer[2] = _buffer[2] == 0xffffffff ? 0 : _lut[_buffer[2]]; \
+        _buffer[3] = _buffer[3] == 0xffffffff ? 0 : _lut[_buffer[3]]; \
+    } \
+    while (0)
+
+/*
+ * Colour-table lookup for four 32-bit t values (_ts) in REPEAT_PAD mode:
+ *
+ *     t = t < 0 ? 0 : t > 0xffff ? 0xffff : t;
+ *     t >>= _shift;  pixel = _lut[t];
+ *
+ * The four table indices are written to _buffer with _mm_<_storeop>_si128
+ * and then overwritten in place with the looked-up values.
+ *
+ * Relies on `mask_0xffff` being in scope where it is expanded.  The input is
+ * taken only from the _ts parameter.
+ */
+#define SSE2_LUT_REPEAT_PAD(_ts,_buffer,_lut,_shift,_storeop) \
+    do \
+    { \
+        __m128i x = _ts; \
+        __m128i cmp; \
+        \
+        /* t < 0 ? 0 : keep only the lanes that are > 0 */ \
+        cmp = _mm_cmpgt_epi32 (x, _mm_set1_epi32 (0)); \
+        x = _mm_and_si128 (x, cmp); \
+        \
+        /* t > 0xffff ? 0xffff : clear those lanes, then add 0xffff back */ \
+        cmp = _mm_cmpgt_epi32 (x, mask_0xffff); \
+        x = _mm_andnot_si128 (cmp, x); \
+        x = _mm_add_epi32 (x, _mm_and_si128 (cmp, mask_0xffff)); \
+        \
+        x = _mm_srai_epi32 (x, _shift); \
+        \
+        _mm_##_storeop##_si128 ((__m128i*)_buffer, x); \
+        \
+        _buffer[0] = _lut[_buffer[0]]; \
+        _buffer[1] = _lut[_buffer[1]]; \
+        _buffer[2] = _lut[_buffer[2]]; \
+        _buffer[3] = _lut[_buffer[3]]; \
+    } \
+    while (0)
+
+#endif
diff --git a/pixman/pixman-sse2-linear-gradient.c b/pixman/pixman-sse2-linear-gradient.c
new file mode 100644
index 00000000..b47d7c52
--- /dev/null
+++ b/pixman/pixman-sse2-linear-gradient.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2010 Novell, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h>
+
+#include <assert.h>
+
+#include "pixman-sse2-gradient.h"
+
+#include "pixman-sse2-gradient-walker.c"
+
+/* Branch-prediction hints built on the compiler's __builtin_expect.  Neither */
+/* macro is referenced in the part of this file visible here.                 */
+#define LIKELY(x) __builtin_expect ((x),1)
+#define UNLIKELY(x) __builtin_expect ((x),0)
+
+/*
+ * Fills [buffer, end) for the non-vertical linear gradient path.
+ *
+ * lut_macro is one of SSE2_LUT_REPEAT_* or SSE2_USE_GRADIENT_WALKER; it turns
+ * the four t values in `ts` into four pixels.  The macro works in three
+ * stages:
+ *
+ *   1. if `buffer` is not 16-byte aligned, emit the leading pixels one at a
+ *      time (lanes 0, 1, 2 of one unaligned evaluation, in that order) and
+ *      advance `ts` by one `inc` per pixel emitted;
+ *   2. emit groups of four pixels with aligned stores, advancing `ts` by
+ *      `incs` (4 * inc) each time;
+ *   3. emit the at most three trailing pixels from one more unaligned
+ *      evaluation.
+ *
+ * Stage 1 is clamped to the scanline length, so a scanline shorter than the
+ * distance to the next 16-byte boundary is never overrun.
+ *
+ * Uses these names from the expansion site: buffer, end, ts, tsf, lut,
+ * shift, inc, incs (plus whatever lut_macro itself needs).
+ */
+#define SSE2_LINEAR_GRADIENT(lut_macro) \
+    do \
+    { \
+        /* we can process 4 pixels at a time */ \
+        int count; \
+        \
+        if ((unsigned long)buffer & 0x0f) \
+        { \
+            /* we aren't aligned to a 16 byte boundary. */ \
+            /* process the at most 3 pixels until we get */ \
+            /* there */ \
+            int initial = (int)((16 - ((unsigned long)buffer & 0x0f)) >> 2); \
+            int lead; \
+            \
+            /* never write more pixels than the scanline holds */ \
+            if (initial > end - buffer) \
+                initial = (int)(end - buffer); \
+            \
+            lut_macro (ts, tsf, lut, shift, storeu); \
+            \
+            /* lane k of ts belongs to pixel k, so emit lanes in order */ \
+            for (lead = 0; lead < initial; lead++) \
+                *buffer++ = tsf[lead]; \
+            \
+            /* step every lane past the pixels just written */ \
+            ts = _mm_add_epi32 (ts, _mm_set1_epi32 (inc * initial)); \
+        } \
+        \
+        count = end - buffer; \
+        while (count >= 4) \
+        { \
+            lut_macro (ts, buffer, lut, shift, store); \
+            \
+            buffer += 4; \
+            /* t += 4 * inc */ \
+            ts = _mm_add_epi32 (ts, incs); \
+            \
+            count -= 4; \
+        } \
+        \
+        if (buffer < end) \
+        { \
+            /* we're at most 3 pixels off, so finish them from one */ \
+            /* unaligned evaluation */ \
+            int rem = end - buffer; \
+            int tail; \
+            \
+            lut_macro (ts, tsf, lut, shift, storeu); \
+            \
+            for (tail = 0; tail < rem; tail++) \
+                buffer[tail] = tsf[tail]; \
+        } \
+    } \
+    while (0)
+
+
+/*
+ * SSE2 scanline fetcher for linear gradients.
+ *
+ * Writes `width` 32-bit pixels of the gradient `image`, starting at
+ * destination pixel (x, y), into `buffer`.
+ *
+ * Only the affine case (or a degenerate gradient with p1 == p2) is
+ * implemented; for a projective transform the function returns without
+ * touching `buffer` (the scalar code for that case is kept under #if 0
+ * below).  It also returns early, without writing, when
+ * pixman_transform_point_3d() fails.
+ *
+ * `mask` and `mask_bits` are not consulted by the active code: every pixel
+ * of the scanline is computed.
+ */
+void
+_pixman_sse2_linear_gradient_get_scanline_32 (pixman_image_t *image,
+                                              int             x,
+                                              int             y,
+                                              int             width,
+                                              uint32_t *      buffer,
+                                              const uint32_t *mask,
+                                              uint32_t        mask_bits)
+{
+    pixman_vector_t v, unit;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy;
+    /* only meaningful when l != 0; zeroed so they are never read uninitialised */
+    pixman_fixed_48_16_t a = 0, b = 0, off = 0;
+    gradient_t *gradient = (gradient_t *)image;
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    uint32_t *end = buffer + width;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (gradient->common.transform)
+    {
+        if (!pixman_transform_point_3d (gradient->common.transform, &v))
+            return;
+
+        unit.vector[0] = gradient->common.transform->matrix[0][0];
+        unit.vector[1] = gradient->common.transform->matrix[1][0];
+        unit.vector[2] = gradient->common.transform->matrix[2][0];
+    }
+    else
+    {
+        unit.vector[0] = pixman_fixed_1;
+        unit.vector[1] = 0;
+        unit.vector[2] = 0;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l != 0)
+    {
+        a = (dx << 32) / l;
+        b = (dy << 32) / l;
+        off = (-a * linear->p1.x
+               -b * linear->p1.y) >> 16;
+    }
+
+    if (l == 0 || (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1))
+    {
+        pixman_fixed_48_16_t inc, t;
+
+        /* affine transformation only */
+        if (l == 0)
+        {
+            t = 0;
+            inc = 0;
+        }
+        else
+        {
+            t = ((a * v.vector[0] + b * v.vector[1]) >> 16) + off;
+            inc = (a * unit.vector[0] + b * unit.vector[1]) >> 16;
+        }
+
+        if (gradient->class == SOURCE_IMAGE_CLASS_VERTICAL)
+        {
+            /* t is the same for every pixel of the scanline, so work out */
+            /* one colour and replicate it                                */
+            uint32_t *lut = gradient->color_lut;
+            uint32_t color;
+            __m128i colors;
+
+            if (lut != NULL)
+            {
+                unsigned int repeat = gradient->common.repeat;
+                int shift = 16 - gradient->color_lut_bits;
+
+                if (repeat == PIXMAN_REPEAT_NORMAL)
+                {
+                    if (t < 0)
+                        t = -t;
+                    t = t & 0xFFFF;
+
+                    color = lut[t >> shift];
+                }
+                else if (repeat == PIXMAN_REPEAT_PAD)
+                {
+                    t = CLAMP_MAX (t, 0xFFFF);
+                    color = lut[t >> shift];
+                }
+                else if (repeat == PIXMAN_REPEAT_REFLECT)
+                {
+                    /* Same result as the 32-bit "s = t << 15 >> 31;       */
+                    /* t = (t ^ s) & 0xFFFF" used per lane by              */
+                    /* SSE2_LUT_REPEAT_REFLECT: when bit 16 is set the low */
+                    /* 16 bits are inverted.  Written out explicitly       */
+                    /* because t is 64 bits wide here and the shift pair   */
+                    /* would not produce that mask.                        */
+                    if (t & 0x10000)
+                        t = ~t;
+                    t = t & 0xFFFF;
+
+                    color = lut[t >> shift];
+                }
+                else /* PIXMAN_REPEAT_NONE */
+                {
+                    if (t < 0 || t > 0xFFFF)
+                        color = 0 /* transparent black */;
+                    else
+                        color = lut[t >> shift];
+                }
+
+                colors = _mm_set1_epi32 (color);
+            }
+            else
+            {
+                pixman_sse2_gradient_walker_t walker;
+                __m128i ts;
+
+                _pixman_sse2_gradient_walker_init (&walker, gradient, gradient->common.repeat);
+
+                ts = _mm_set1_epi32 (t);
+                colors = _pixman_sse2_gradient_walker_pixel (&walker, ts);
+                color = _mm_cvtsi128_si32 (colors);
+            }
+
+            /* scalar stores up to the next 16-byte boundary, but never */
+            /* past the end of the scanline                             */
+            while (buffer < end && ((unsigned long)buffer & 0x0f))
+                *buffer++ = color;
+
+            /* aligned stores, four pixels at a time */
+            while (end - buffer >= 4)
+            {
+                _mm_store_si128 ((__m128i *)buffer, colors);
+                buffer += 4;
+            }
+
+            /* at most three trailing pixels */
+            while (buffer < end)
+                *buffer++ = color;
+        }
+        else
+        {
+            /* The names below (lut, shift, incs, ts, mask_0xffff, tsf, */
+            /* inc, walker, buffer, end) are used by name inside        */
+            /* SSE2_LINEAR_GRADIENT and the SSE2_LUT_* macros.          */
+            uint32_t *lut = gradient->color_lut;
+            int shift = 16 - gradient->color_lut_bits;
+
+            __m128i incs = _mm_set1_epi32 (inc);
+            /* lane k holds t for pixel k of the current group of four */
+            __m128i ts = _mm_set_epi32 (t + 3 * inc, t + 2 * inc, t + inc, t);
+            __m128i mask_0xffff = _mm_set1_epi32 (0xffff);
+
+            /* scratch for the unaligned head and tail of the scanline */
+            pixman_fixed_16_16_t tsf[4];
+
+            /* per-iteration step: 4 * inc */
+            incs = _mm_slli_epi32 (incs, 2);
+
+            if (gradient->color_lut != NULL && (((unsigned long)buffer & 0x03) == 0))
+            {
+                unsigned int repeat = gradient->common.repeat;
+
+                if (repeat == PIXMAN_REPEAT_NORMAL)
+                {
+                    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_NORMAL);
+                }
+                else if (repeat == PIXMAN_REPEAT_PAD)
+                {
+                    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_PAD);
+                }
+                else if (repeat == PIXMAN_REPEAT_REFLECT)
+                {
+                    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_REFLECT);
+                }
+                else /* PIXMAN_REPEAT_NONE */
+                {
+                    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_NONE);
+                }
+            }
+            else
+            {
+                pixman_sse2_gradient_walker_t walker;
+                _pixman_sse2_gradient_walker_init (&walker, gradient, gradient->common.repeat);
+
+                SSE2_LINEAR_GRADIENT (SSE2_USE_GRADIENT_WALKER);
+            }
+        }
+    }
+#if 0
+    /* Scalar code for the projective case, kept for reference only: it is */
+    /* not compiled and refers to a `walker` that does not exist here.     */
+    else
+    {
+        /* projective transformation */
+        pixman_fixed_48_16_t t;
+
+        if (gradient->class == SOURCE_IMAGE_CLASS_VERTICAL)
+        {
+            register uint32_t color;
+
+            if (v.vector[2] == 0)
+            {
+                t = 0;
+            }
+            else
+            {
+                pixman_fixed_48_16_t x, y;
+
+                x = ((pixman_fixed_48_16_t) v.vector[0] << 16) / v.vector[2];
+                y = ((pixman_fixed_48_16_t) v.vector[1] << 16) / v.vector[2];
+                t = ((a * x + b * y) >> 16) + off;
+            }
+
+            color = _pixman_gradient_walker_pixel (&walker, t);
+
+            while (buffer < end)
+                *buffer++ = color;
+        }
+        else
+        {
+            while (buffer < end)
+            {
+                if (!mask || *mask++ & mask_bits)
+                {
+                    if (v.vector[2] == 0)
+                    {
+                        t = 0;
+                    }
+                    else
+                    {
+                        pixman_fixed_48_16_t x, y;
+                        x = ((pixman_fixed_48_16_t)v.vector[0] << 16) / v.vector[2];
+                        y = ((pixman_fixed_48_16_t)v.vector[1] << 16) / v.vector[2];
+                        t = ((a * x + b * y) >> 16) + off;
+                    }
+
+                    *buffer = _pixman_gradient_walker_pixel (&walker, t);
+                }
+
+                ++buffer;
+
+                v.vector[0] += unit.vector[0];
+                v.vector[1] += unit.vector[1];
+                v.vector[2] += unit.vector[2];
+            }
+        }
+    }
+#endif
+}
diff --git a/pixman/pixman-sse2-radial-gradient.c b/pixman/pixman-sse2-radial-gradient.c
new file mode 100644
index 00000000..dbb20eef
--- /dev/null
+++ b/pixman/pixman-sse2-radial-gradient.c
@@ -0,0 +1,387 @@
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2010 Novell, Inc.
+ *
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h>
+
+#include "pixman-sse2-gradient.h"
+
+#include "pixman-sse2-gradient-walker.c"
+
+/* Branch-prediction hints built on the compiler's __builtin_expect.  Neither */
+/* macro is referenced in the part of this file visible here.                 */
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+
+/*
+ * Fills [buffer, end) for a radial gradient.
+ *
+ * lut_macro is one of SSE2_LUT_REPEAT_* or SSE2_USE_GRADIENT_WALKER; it turns
+ * the four t values produced by calc_radial_det4() into four pixels.  The
+ * macro works in three stages:
+ *
+ *   1. if `buffer` is not 16-byte aligned, emit the leading pixels one at a
+ *      time (lanes 0, 1, 2 of one unaligned evaluation, in that order) and
+ *      advance pdx_, pdy_ and B_ by that many per-pixel steps;
+ *   2. emit groups of four pixels with aligned stores, advancing by the
+ *      4-pixel steps cx_4, cy_4 and cB_4;
+ *   3. emit the at most three trailing pixels from one more unaligned
+ *      evaluation.
+ *
+ * Stage 1 is clamped to the scanline length, so a scanline shorter than the
+ * distance to the next 16-byte boundary is never overrun.
+ *
+ * Uses these names from the expansion site: buffer, end, tsf, lut, shift,
+ * B_, A4_, invA_, pdx_, pdy_, r1sq_, invert_, cx_, cy_, cB_, cx_4, cy_4,
+ * cB_4 (plus whatever lut_macro itself needs).
+ */
+#define SSE2_RADIAL_GRADIENT(lut_macro) do { \
+        /* we can process 4 pixels at a time */ \
+        \
+        int count; \
+        \
+        if ((unsigned long)buffer & 0x0f) \
+        { \
+            /* we aren't aligned to a 16 byte boundary. */ \
+            /* process the at most 3 pixels until we get */ \
+            /* there */ \
+            int initial = (int)((16 - ((unsigned long)buffer & 0x0f)) >> 2); \
+            int lead; \
+            float scale; \
+            __m128i ts; \
+            \
+            /* never write more pixels than the scanline holds */ \
+            if (initial > end - buffer) \
+                initial = (int)(end - buffer); \
+            \
+            ts = calc_radial_det4 (B_, A4_, invA_, \
+                                   pdx_, pdy_, r1sq_, invert_); \
+            \
+            lut_macro(ts, tsf, lut, shift, storeu); \
+            \
+            /* lane k belongs to pixel k, so emit lanes in order */ \
+            for (lead = 0; lead < initial; lead++) \
+                *buffer++ = tsf[lead]; \
+            \
+            /* step every lane past the pixels just written */ \
+            scale = initial; \
+            \
+            pdx_ = _mm_add_ps (pdx_, _mm_mul_ps (cx_, _mm_load1_ps (&scale))); \
+            pdy_ = _mm_add_ps (pdy_, _mm_mul_ps (cy_, _mm_load1_ps (&scale))); \
+            B_ = _mm_add_ps (B_, _mm_mul_ps (cB_, _mm_load1_ps (&scale))); \
+        } \
+        \
+        count = end - buffer; \
+        while (count >= 4) \
+        { \
+            __m128i ts = calc_radial_det4 (B_, A4_, invA_, \
+                                           pdx_, pdy_, r1sq_, invert_); \
+            \
+            lut_macro(ts, buffer, lut, shift, store); \
+            \
+            buffer += 4; \
+            \
+            pdx_ = _mm_add_ps (pdx_, cx_4); \
+            pdy_ = _mm_add_ps (pdy_, cy_4); \
+            B_ = _mm_add_ps (B_, cB_4); \
+            \
+            count -= 4; \
+        } \
+        \
+        if (buffer < end) \
+        { \
+            /* we're at most 3 pixels off, so finish them from one */ \
+            /* unaligned evaluation */ \
+            int rem = end - buffer; \
+            int tail; \
+            \
+            __m128i ts = calc_radial_det4 (B_, A4_, invA_, \
+                                           pdx_, pdy_, r1sq_, invert_); \
+            \
+            lut_macro(ts, tsf, lut, shift, storeu); \
+            \
+            for (tail = 0; tail < rem; tail++) \
+                buffer[tail] = tsf[tail]; \
+        } \
+    } while (0)
+
+
+/*
+ * Evaluate, for four pixels at once,
+ *
+ *     discriminant = A4 * (pdx*pdx + pdy*pdy - r1sq) + B*B
+ *     result       = invA * (B + invert * sqrt (max-with-zero discriminant))
+ *
+ * and convert the four results to 32-bit integers with _mm_cvtps_epi32.
+ * Lanes whose discriminant is negative are treated as having a zero
+ * discriminant.
+ */
+static force_inline __m128i
+calc_radial_det4 (__m128 B, __m128 A4, __m128 invA, __m128 pdx, __m128 pdy, __m128 r1sq, __m128 invert)
+{
+    /* pdx*pdx + pdy*pdy - r1sq */
+    __m128 dist_sq = _mm_add_ps (_mm_mul_ps (pdx, pdx), _mm_mul_ps (pdy, pdy));
+    __m128 c_term = _mm_sub_ps (dist_sq, r1sq);
+
+    /* A4 * c_term + B*B */
+    __m128 det = _mm_add_ps (_mm_mul_ps (A4, c_term), _mm_mul_ps (B, B));
+
+    /* all-ones bit mask in the lanes whose discriminant is below zero ... */
+    __m128 negative = _mm_cmplt_ps (det, _mm_setzero_ps());
+
+    /* ... which and-not then clears to +0.0 before the square root */
+    __m128 root = _mm_sqrt_ps (_mm_andnot_ps (negative, det));
+
+    /* invA * (B + invert * root) */
+    __m128 signed_root = _mm_mul_ps (invert, root);
+    __m128 t = _mm_mul_ps (invA, _mm_add_ps (B, signed_root));
+
+    return _mm_cvtps_epi32 (t);
+}
+
+/*
+ * SSE2 scanline fetcher for radial gradients.
+ *
+ * Writes `width` 32-bit pixels of the gradient `image`, starting at
+ * destination pixel (x, y), into `buffer`.  Returns early, without writing,
+ * when pixman_transform_point_3d() fails.
+ *
+ * NOTE(review): `mask` and `mask_bits` are never referenced in this function,
+ * and `cz` / `rz` are computed but never read, so any projective component of
+ * the transform appears to be ignored -- confirm that callers only route
+ * affine cases here.
+ */
+void
+_pixman_sse2_radial_gradient_get_scanline_32 (pixman_image_t *image,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer,
+ const uint32_t *mask,
+ uint32_t mask_bits)
+{
+ /*
+ * In the radial gradient problem we are given two circles (c₁,r₁) and
+ * (c₂,r₂) that define the gradient itself. Then, for any point p, we
+ * must compute the value(s) of t within [0.0, 1.0] representing the
+ * circle(s) that would color the point.
+ *
+ * There are potentially two values of t since the point p can be
+ * colored by both sides of the circle, (which happens whenever one
+ * circle is not entirely contained within the other).
+ *
+ * If we solve for a value of t that is outside of [0.0, 1.0] then we
+ * use the extend mode (NONE, REPEAT, REFLECT, or PAD) to map to a
+ * value within [0.0, 1.0].
+ *
+ * Here is an illustration of the problem:
+ *
+ *              p₂
+ *           p  •
+ *           •   ╲
+ *        ·       ╲r₂
+ *  p₁ ·           ╲
+ *  •              θ╲
+ *   ╲             ╌╌•
+ *    ╲r₁        ·   c₂
+ *    θ╲    ·
+ *    ╌╌•
+ *      c₁
+ *
+ * Given (c₁,r₁), (c₂,r₂) and p, we must find an angle θ such that two
+ * points p₁ and p₂ on the two circles are collinear with p. Then, the
+ * desired value of t is the ratio of the length of p₁p to the length
+ * of p₁p₂.
+ *
+ * So, we have six unknown values: (p₁x, p₁y), (p₂x, p₂y), θ and t.
+ * We can also write six equations that constrain the problem:
+ *
+ * Point p₁ is a distance r₁ from c₁ at an angle of θ:
+ *
+ * 1. p₁x = c₁x + r₁·cos θ
+ * 2. p₁y = c₁y + r₁·sin θ
+ *
+ * Point p₂ is a distance r₂ from c₂ at an angle of θ:
+ *
+ * 3. p₂x = c₂x + r₂·cos θ
+ * 4. p₂y = c₂y + r₂·sin θ
+ *
+ * Point p lies at a fraction t along the line segment p₁p₂:
+ *
+ * 5. px = t·p₂x + (1-t)·p₁x
+ * 6. py = t·p₂y + (1-t)·p₁y
+ *
+ * To solve, first substitute 1-4 into 5 and 6:
+ *
+ * px = t·(c₂x + r₂·cos θ) + (1-t)·(c₁x + r₁·cos θ)
+ * py = t·(c₂y + r₂·sin θ) + (1-t)·(c₁y + r₁·sin θ)
+ *
+ * Then solve each for cos θ and sin θ expressed as a function of t:
+ *
+ * cos θ = (-(c₂x - c₁x)·t + (px - c₁x)) / ((r₂-r₁)·t + r₁)
+ * sin θ = (-(c₂y - c₁y)·t + (py - c₁y)) / ((r₂-r₁)·t + r₁)
+ *
+ * To simplify this a bit, we define new variables for several of the
+ * common terms as shown below:
+ *
+ *              p₂
+ *           p  •
+ *           •   ╲
+ *        ·  ┆    ╲r₂
+ *  p₁ ·     ┆     ╲
+ *  •     pdy┆      ╲
+ *   ╲       ┆       •c₂
+ *    ╲r₁    ┆   ·   ┆
+ *     ╲    ·┆       ┆cdy
+ *      •╌╌╌╌┴╌╌╌╌╌╌╌┘
+ *    c₁  pdx   cdx
+ *
+ * cdx = (c₂x - c₁x)
+ * cdy = (c₂y - c₁y)
+ * dr = r₂-r₁
+ * pdx = px - c₁x
+ * pdy = py - c₁y
+ *
+ * Note that cdx, cdy, and dr do not depend on point p at all, so can
+ * be pre-computed for the entire gradient. The simplified equations
+ * are now:
+ *
+ * cos θ = (-cdx·t + pdx) / (dr·t + r₁)
+ * sin θ = (-cdy·t + pdy) / (dr·t + r₁)
+ *
+ * Finally, to get a single function of t and eliminate the last
+ * unknown θ, we use the identity sin²θ + cos²θ = 1. First, square
+ * each equation, (we knew a quadratic was coming since it must be
+ * possible to obtain two solutions in some cases):
+ *
+ * cos²θ = (cdx²t² - 2·cdx·pdx·t + pdx²) / (dr²·t² + 2·r₁·dr·t + r₁²)
+ * sin²θ = (cdy²t² - 2·cdy·pdy·t + pdy²) / (dr²·t² + 2·r₁·dr·t + r₁²)
+ *
+ * Then add both together, set the result equal to 1, and express as a
+ * standard quadratic equation in t of the form At² + Bt + C = 0
+ *
+ * (cdx² + cdy² - dr²)·t² - 2·(cdx·pdx + cdy·pdy + r₁·dr)·t + (pdx² + pdy² - r₁²) = 0
+ *
+ * In other words:
+ *
+ * A = cdx² + cdy² - dr²
+ * B = -2·(pdx·cdx + pdy·cdy + r₁·dr)
+ * C = pdx² + pdy² - r₁²
+ *
+ * And again, notice that A does not depend on p, so can be
+ * precomputed. From here we just use the quadratic formula to solve
+ * for t:
+ *
+ * t = (-B ± ⎷(B² - 4·A·C)) / (2·A)
+ */
+
+ gradient_t *gradient = (gradient_t *)image;
+ source_image_t *source = (source_image_t *)image;
+ radial_gradient_t *radial = (radial_gradient_t *)image;
+ uint32_t *end = buffer + width;
+ /* per-pixel step (cx, cy, cz) and start point (rx, ry, rz) in gradient */
+ /* space; these defaults are used when there is no transform           */
+ float cx = 1.;
+ float cy = 0.;
+ float cz = 0.;
+ float rx = x + 0.5;
+ float ry = y + 0.5;
+ float rz = 1.;
+
+ if (source->common.transform)
+ {
+ pixman_vector_t v;
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (source->common.transform, &v))
+ return;
+
+ /* 16.16 fixed point -> float */
+ cx = source->common.transform->matrix[0][0] / 65536.;
+ cy = source->common.transform->matrix[1][0] / 65536.;
+ cz = source->common.transform->matrix[2][0] / 65536.;
+
+ rx = v.vector[0] / 65536.;
+ ry = v.vector[1] / 65536.;
+ rz = v.vector[2] / 65536.;
+ }
+
+ /* When computing t over a scanline, we notice that some expressions
+ * are constant so we can compute them just once. Given:
+ *
+ * t = (-B ± ⎷(B² - 4·A·C)) / (2·A)
+ *
+ * where
+ *
+ * A = cdx² + cdy² - dr² [precomputed as radial->A]
+ * B = -2·(pdx·cdx + pdy·cdy + r₁·dr)
+ * C = pdx² + pdy² - r₁²
+ *
+ * Since we have an affine transformation, we know that (pdx, pdy)
+ * increase linearly with each pixel,
+ *
+ * pdx = pdx₀ + n·cx,
+ * pdy = pdy₀ + n·cy,
+ *
+ * we can then express B in terms of a linear increment along
+ * the scanline:
+ *
+ * B = B₀ + n·cB, with
+ * B₀ = -2·(pdx₀·cdx + pdy₀·cdy + r₁·dr) and
+ * cB = -2·(cx·cdx + cy·cdy)
+ *
+ * Thus we can replace the full evaluation of B per-pixel (4 multiplies,
+ * 2 additions) with a single addition.
+ */
+ float r1 = radial->c1.radius / 65536.;
+ float r1sq = r1 * r1;
+ float pdx = rx - radial->c1.x / 65536.;
+ float pdy = ry - radial->c1.y / 65536.;
+ float A = radial->A;
+ /* calc_radial_det4() multiplies (B ± sqrt) by invA, so the sign flip   */
+ /* supplies the "-B" of the formula and 65536 scales t to 16.16.        */
+ /* NOTE(review): A == 0 is not guarded here and would divide by zero.   */
+ float invA = -65536. / (2. * A);
+ /* calc_radial_det4() forms A4 * (pdx² + pdy² - r1sq) + B², i.e. B² - 4·A·C */
+ float A4 = -4. * A;
+ float B = -2. * (pdx*radial->cdx + pdy*radial->cdy + r1*radial->dr);
+ float cB = -2. * (cx*radial->cdx + cy*radial->cdy);
+ /* selects which sign of the square root calc_radial_det4() adds to B */
+ pixman_bool_t invert = A * radial->dr < 0;
+
+ uint32_t *lut = gradient->color_lut;
+
+ int shift = 16 - gradient->color_lut_bits;
+
+ /* The variables below are picked up by name inside SSE2_RADIAL_GRADIENT */
+ /* and the SSE2_LUT_* macros, so they must keep exactly these names.     */
+ /* In pdx_, pdy_ and B_, lane n holds the value for pixel n of the       */
+ /* current group of four (_mm_set_ps takes the highest lane first).      */
+ __m128i mask_0xffff = _mm_set1_epi32(0xffff);
+ __m128 invert_ = _mm_set1_ps (invert ? 1.0 : -1.0);
+ __m128 A4_ = _mm_set1_ps (A4);
+ __m128 invA_ = _mm_set1_ps (invA);
+ __m128 r1sq_ = _mm_set1_ps (r1sq);
+ __m128 pdx_ = _mm_set_ps (pdx + 3 * cx, pdx + 2 * cx, pdx + cx, pdx);
+ __m128 pdy_ = _mm_set_ps (pdy + 3 * cy, pdy + 2 * cy, pdy + cy, pdy);
+ __m128 B_ = _mm_set_ps (B + 3 * cB, B + 2 * cB, B + cB, B);
+ __m128 cx_ = _mm_set1_ps (cx);
+ __m128 cy_ = _mm_set1_ps (cy);
+ __m128 cB_ = _mm_set1_ps (cB);
+ /* steps for advancing a whole group of four pixels */
+ __m128 cx_4 = _mm_mul_ps (cx_, _mm_set1_ps (4.0));
+ __m128 cy_4 = _mm_mul_ps (cy_, _mm_set1_ps (4.0));
+ __m128 cB_4 = _mm_mul_ps (cB_, _mm_set1_ps (4.0));
+
+ /* scratch for the unaligned head and tail of the scanline */
+ pixman_fixed_16_16_t tsf[4];
+
+ if (gradient->color_lut)
+ {
+ /* colour table available: pick the lookup macro for the repeat mode */
+ unsigned int repeat = source->common.repeat;
+ if (repeat == PIXMAN_REPEAT_NORMAL)
+ {
+ SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_NORMAL);
+ }
+ else if (repeat == PIXMAN_REPEAT_PAD)
+ {
+ SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_PAD);
+ }
+ else if (repeat == PIXMAN_REPEAT_REFLECT)
+ {
+ SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_REFLECT);
+ }
+ else /* PIXMAN_REPEAT_NONE */
+ {
+ SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_NONE);
+ }
+ }
+ else
+ {
+ /* no colour table: let the SSE2 gradient walker produce the pixels */
+ pixman_sse2_gradient_walker_t walker;
+ _pixman_sse2_gradient_walker_init (&walker, gradient, source->common.repeat);
+
+ SSE2_RADIAL_GRADIENT(SSE2_USE_GRADIENT_WALKER);
+ }
+}