diff options
author | Søren Sandmann Pedersen <ssp@redhat.com> | 2011-09-27 11:23:49 -0400 |
---|---|---|
committer | Søren Sandmann Pedersen <ssp@redhat.com> | 2011-09-27 11:23:49 -0400 |
commit | 76c13dac657b1bde44a4fd01acc43d80f8971198 (patch) | |
tree | ae32292ea05e117622a851854d8ec0dcf0709cc7 | |
parent | 57fd8c37aa3148b1d70bad65e1a49721e9a47d7e (diff) |
gradientssse2-gradients
-rw-r--r-- | pixman/Makefile.am | 8 | ||||
-rw-r--r-- | pixman/pixman-sse2-gradient.h | 136 | ||||
-rw-r--r-- | pixman/pixman-sse2-linear-gradient.c | 356 | ||||
-rw-r--r-- | pixman/pixman-sse2-radial-gradient.c | 387 |
4 files changed, 885 insertions, 2 deletions
diff --git a/pixman/Makefile.am b/pixman/Makefile.am index 2421a4f9..a4dc03cd 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -50,8 +50,12 @@ endif # sse2 code if USE_SSE2 noinst_LTLIBRARIES += libpixman-sse2.la -libpixman_sse2_la_SOURCES = \ - pixman-sse2.c +libpixman_sse2_la_SOURCES = \ + pixman-sse2.c \ + pixman-sse2-linear-gradient.c \ + pixman-sse2-radial-gradient.c \ + pixman-sse2-gradient.h \ + pixman-sse2-gradient-walker.c libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS) libpixman_sse2_la_LIBADD = $(DEP_LIBS) libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS) diff --git a/pixman/pixman-sse2-gradient.h b/pixman/pixman-sse2-gradient.h new file mode 100644 index 00000000..d356b9c4 --- /dev/null +++ b/pixman/pixman-sse2-gradient.h @@ -0,0 +1,136 @@ +/* + * Copyright © 2010 Novell, Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ *
+ * Author: Chris Toshok (toshok@novell.com)
+ */
+
+#ifndef PIXMAN_GRADIENT_SSE2_H
+#define PIXMAN_GRADIENT_SSE2_H
+
+/* Clamp v to the closed range [0, max].  Arguments are evaluated more than
+ * once, so they must be free of side effects.
+ */
+#define CLAMP_MAX(v,max) ((v) < 0 ? 0 : (v) > (max) ? (max) : (v))
+
+/*
+ * Per-pixel colour helpers shared by the SSE2 linear and radial gradient
+ * scanline templates.  All of them take the same argument list so that any
+ * one can be handed to a template as its `lut_macro':
+ *
+ *   _ts       __m128i holding four signed 32-bit gradient positions
+ *   _buffer   array of four 32-bit values that receives the four colours
+ *   _lut      colour table, indexed by (position >> _shift)
+ *   _shift    right shift turning a 0..0xffff position into a table index
+ *   _storeop  `store' when _buffer is 16-byte aligned, `storeu' otherwise
+ *
+ * The SSE2_LUT_REPEAT_* macros expect an __m128i named mask_0xffff, holding
+ * 0xffff in every lane, to be in scope where they are expanded.
+ */
+
+/* Ignores _lut and _shift and fetches the colours from the gradient walker
+ * instead.  A variable named `walker' must be in scope at the expansion
+ * site.
+ */
+#define SSE2_USE_GRADIENT_WALKER(_ts,_buffer,_lut,_shift,_storeop) \
+    do { \
+        _mm_##_storeop##_si128 ((__m128i*)(_buffer), \
+                                _pixman_sse2_gradient_walker_pixel (&walker, (_ts))); \
+    } while (0)
+
+/* NOTE(review): taking abs() before masking mirrors negative positions
+ * around zero instead of wrapping them; confirm this is the intended
+ * behaviour for PIXMAN_REPEAT_NORMAL.
+ */
+#define SSE2_LUT_REPEAT_NORMAL(_ts,_buffer,_lut,_shift,_storeop) \
+    do { \
+        __m128i lut_t_ = (_ts); \
+        \
+        /* t = abs(t); */ \
+        /* t = t & 0xFFFF; */ \
+        \
+        /* abs() hack from http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs */ \
+        __m128i lut_sign_ = _mm_srai_epi32 (lut_t_, 31); \
+        lut_t_ = _mm_xor_si128 (_mm_add_epi32 (lut_t_, lut_sign_), lut_sign_); \
+        \
+        lut_t_ = _mm_and_si128 (lut_t_, mask_0xffff); \
+        lut_t_ = _mm_srai_epi32 (lut_t_, _shift); \
+        \
+        _mm_##_storeop##_si128 ((__m128i*)(_buffer), lut_t_); \
+        \
+        /* the stored indices are replaced in place by their colours */ \
+        (_buffer)[0] = (_lut)[(_buffer)[0]]; \
+        (_buffer)[1] = (_lut)[(_buffer)[1]]; \
+        (_buffer)[2] = (_lut)[(_buffer)[2]]; \
+        (_buffer)[3] = (_lut)[(_buffer)[3]]; \
+    } while (0)
+
+#define SSE2_LUT_REPEAT_REFLECT(_ts,_buffer,_lut,_shift,_storeop) \
+    do { \
+        __m128i lut_t_ = (_ts); \
+        /* s = t << 15 >> 31; */ \
+        /* t = (t ^ s) & 0xFFFF; */ \
+        /* t >>= shift */ \
+        /* lut_s_ is all ones in lanes where bit 16 of t is set */ \
+        __m128i lut_s_ = _mm_srai_epi32 (_mm_slli_epi32 (lut_t_, 15), 31); \
+        lut_t_ = _mm_xor_si128 (lut_s_, lut_t_); \
+        lut_t_ = _mm_and_si128 (lut_t_, mask_0xffff); \
+        lut_t_ = _mm_srai_epi32 (lut_t_, _shift); \
+        \
+        _mm_##_storeop##_si128 ((__m128i*)(_buffer), lut_t_); \
+        \
+        (_buffer)[0] = (_lut)[(_buffer)[0]]; \
+        (_buffer)[1] = (_lut)[(_buffer)[1]]; \
+        (_buffer)[2] = (_lut)[(_buffer)[2]]; \
+        (_buffer)[3] = (_lut)[(_buffer)[3]]; \
+    } while (0)
+
+/* Positions outside 0..0xffff produce 0 (transparent black). */
+#define SSE2_LUT_REPEAT_NONE(_ts,_buffer,_lut,_shift,_storeop) \
+    do { \
+        __m128i lut_t_ = (_ts); \
+        \
+        __m128i lut_lt0_ = _mm_cmplt_epi32 (lut_t_, _mm_setzero_si128 ()); \
+        __m128i lut_gt0xffff_ = _mm_cmpgt_epi32 (lut_t_, mask_0xffff); \
+        \
+        /* lut_out_ = 0xffffffff where values are < 0 or > 0xffff, */ \
+        /* lut_out_ = 0x00000000 where values are >= 0 and <= 0xffff */ \
+        __m128i lut_out_ = _mm_or_si128 (lut_lt0_, lut_gt0xffff_); \
+        \
+        /* take care of the shift now, since we don't want to shift the sentinel (0xffffffff) */ \
+        lut_t_ = _mm_srai_epi32 (lut_t_, _shift); \
+        \
+        /* keep the in-range indices; out-of-range lanes become the */ \
+        /* all-ones sentinel, which is the out-of-range mask itself */ \
+        lut_t_ = _mm_or_si128 (_mm_andnot_si128 (lut_out_, lut_t_), lut_out_); \
+        \
+        _mm_##_storeop##_si128 ((__m128i*)(_buffer), lut_t_); \
+        \
+        (_buffer)[0] = (_buffer)[0] == 0xffffffff ? 0 : (_lut)[(_buffer)[0]]; \
+        (_buffer)[1] = (_buffer)[1] == 0xffffffff ? 0 : (_lut)[(_buffer)[1]]; \
+        (_buffer)[2] = (_buffer)[2] == 0xffffffff ? 0 : (_lut)[(_buffer)[2]]; \
+        (_buffer)[3] = (_buffer)[3] == 0xffffffff ? 0 : (_lut)[(_buffer)[3]]; \
+    } while (0)
+
+#define SSE2_LUT_REPEAT_PAD(_ts,_buffer,_lut,_shift,_storeop) \
+    do { \
+        \
+        /* t = t < 0 ? 0 : t > 0xffff ? 0xffff : t */ \
+        /* t >>= shift */ \
+        \
+        __m128i lut_t_ = (_ts); \
+        __m128i lut_cmp_; \
+        \
+        /* handle t = t < 0 ? 0 */ \
+        lut_cmp_ = _mm_cmpgt_epi32 (lut_t_, _mm_setzero_si128 ()); \
+        lut_t_ = _mm_and_si128 (lut_t_, lut_cmp_); \
+        \
+        /* handle t > 0xffff ? 0xffff */ \
+        lut_cmp_ = _mm_cmpgt_epi32 (lut_t_, mask_0xffff); \
+        lut_t_ = _mm_andnot_si128 (lut_cmp_, lut_t_); \
+        lut_t_ = _mm_add_epi32 (lut_t_, _mm_and_si128 (lut_cmp_, mask_0xffff)); \
+        \
+        lut_t_ = _mm_srai_epi32 (lut_t_, _shift); \
+        \
+        _mm_##_storeop##_si128 ((__m128i*)(_buffer), lut_t_); \
+        \
+        (_buffer)[0] = (_lut)[(_buffer)[0]]; \
+        (_buffer)[1] = (_lut)[(_buffer)[1]]; \
+        (_buffer)[2] = (_lut)[(_buffer)[2]]; \
+        (_buffer)[3] = (_lut)[(_buffer)[3]]; \
+    } while (0)
+
+#endif
diff --git a/pixman/pixman-sse2-linear-gradient.c b/pixman/pixman-sse2-linear-gradient.c
new file mode 100644
index 00000000..b47d7c52
--- /dev/null
+++ b/pixman/pixman-sse2-linear-gradient.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2010 Novell, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h>
+
+#include <assert.h>
+
+#include "pixman-sse2-gradient.h"
+
+#include "pixman-sse2-gradient-walker.c"
+
+#define LIKELY(x) __builtin_expect ((x),1)
+#define UNLIKELY(x) __builtin_expect ((x),0)
+
+/*
+ * Scanline template for the affine, non-vertical case.  It expands inside
+ * _pixman_sse2_linear_gradient_get_scanline_32 and uses that function's
+ * buffer, end, ts, incs, inc, tsf, lut and shift (plus whatever lut_macro
+ * itself needs, e.g. mask_0xffff or walker).
+ *
+ *   1. scalar stores of 1-3 pixels until buffer is 16-byte aligned
+ *   2. aligned stores of 4 pixels at a time
+ *   3. scalar stores of the 0-3 pixels that are left
+ *
+ * On entry lane i of ts holds the position of pixel buffer[i]; this is
+ * kept true after every step.
+ */
+#define SSE2_LINEAR_GRADIENT(lut_macro) \
+    do \
+    { \
+        /* we can process 4 pixels at a time */ \
+        int count; \
+        \
+        if ((unsigned long)buffer & 0x0f) \
+        { \
+            /* we aren't aligned to a 16 byte boundary. */ \
+            /* process the at most 3 pixels until we get */ \
+            /* there */ \
+            int initial = (16 - (int)((unsigned long)buffer & 0x0f)) >> 2; \
+            \
+            /* never run past the end of a short scanline */ \
+            if (initial > end - buffer) \
+                initial = (int)(end - buffer); \
+            \
+            lut_macro (ts, tsf, lut, shift, storeu); \
+            \
+            /* tsf[i] is the colour of the i-th pixel from buffer */ \
+            if (initial >= 1) \
+                *buffer++ = tsf[0]; \
+            if (initial >= 2) \
+                *buffer++ = tsf[1]; \
+            if (initial >= 3) \
+                *buffer++ = tsf[2]; \
+            \
+            /* step every lane past the pixels just written */ \
+            ts = _mm_add_epi32 (ts, _mm_set1_epi32 ((int)(initial * inc))); \
+        } \
+        \
+        count = end - buffer; \
+        while (count >= 4) \
+        { \
+            lut_macro (ts, buffer, lut, shift, store); \
+            \
+            buffer += 4; \
+            /* t += 4 * inc */ \
+            ts = _mm_add_epi32 (ts, incs); \
+            \
+            count -= 4; \
+        } \
+        \
+        if (buffer < end) \
+        { \
+            /* we're at most 3 pixels off, so unroll/inline it here */ \
+            int rem = end - buffer; \
+            \
+            lut_macro (ts, tsf, lut, shift, storeu); \
+            \
+            if (rem-- > 0) \
+                buffer[0] = tsf[0]; \
+            if (rem-- > 0) \
+                buffer[1] = tsf[1]; \
+            if (rem-- > 0) \
+                buffer[2] = tsf[2]; \
+        } \
+    } \
+    while (0)
+
+
+/*
+ * Fill buffer[0..width) with the colours of a linear gradient for the
+ * scanline that starts at pixel (x, y).
+ *
+ * Only the affine case is implemented: when the image transform has a
+ * projective component the function returns without writing anything
+ * (that code path is compiled out below).  mask and mask_bits are not
+ * consulted, so every pixel of the scanline is computed.
+ *
+ * Colours come from gradient->color_lut when it is set and from the SSE2
+ * gradient walker otherwise.
+ */
+void
+_pixman_sse2_linear_gradient_get_scanline_32 (pixman_image_t *image,
+                                              int             x,
+                                              int             y,
+                                              int             width,
+                                              uint32_t *      buffer,
+                                              const uint32_t *mask,
+                                              uint32_t        mask_bits)
+{
+    pixman_vector_t v, unit;
+    pixman_fixed_32_32_t l;
+    /* a, b and off are only meaningful when l != 0 */
+    pixman_fixed_48_16_t dx, dy, a = 0, b = 0, off = 0;
+    gradient_t *gradient = (gradient_t *)image;
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    uint32_t *end = buffer + width;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (gradient->common.transform)
+    {
+        if (!pixman_transform_point_3d (gradient->common.transform, &v))
+            return;
+
+        unit.vector[0] = gradient->common.transform->matrix[0][0];
+        unit.vector[1] = gradient->common.transform->matrix[1][0];
+        unit.vector[2] = gradient->common.transform->matrix[2][0];
+    }
+    else
+    {
+        unit.vector[0] = pixman_fixed_1;
+        unit.vector[1] = 0;
+        unit.vector[2] = 0;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l != 0)
+    {
+        a = (dx << 32) / l;
+        b = (dy << 32) / l;
+        off = (-a * linear->p1.x + -b * linear->p1.y) >> 16;
+    }
+
+    if (l == 0 || (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1))
+    {
+        pixman_fixed_48_16_t inc, t;
+
+        /* affine transformation only */
+        if (l == 0)
+        {
+            t = 0;
+            inc = 0;
+        }
+        else
+        {
+            t = ((a * v.vector[0] + b * v.vector[1]) >> 16) + off;
+            inc = (a * unit.vector[0] + b * unit.vector[1]) >> 16;
+        }
+
+        if (gradient->class == SOURCE_IMAGE_CLASS_VERTICAL)
+        {
+            /* every pixel of the scanline has the same colour */
+            uint32_t *lut = gradient->color_lut;
+            uint32_t color;
+            __m128i colors;
+
+            if (lut != NULL)
+            {
+                unsigned int repeat = gradient->common.repeat;
+                int shift = 16 - gradient->color_lut_bits;
+
+                if (repeat == PIXMAN_REPEAT_NORMAL)
+                {
+                    /* color = _pixman_gradient_walker_pixel_lut_repeat_normal (&walker, t); */
+                    /* NOTE(review): abs() mirrors negative t instead of
+                     * wrapping it; kept to match SSE2_LUT_REPEAT_NORMAL. */
+                    if (t < 0)
+                        t = -t;
+                    t = t & 0xFFFF;
+
+                    color = lut[t >> shift];
+                }
+                else if (repeat == PIXMAN_REPEAT_PAD)
+                {
+                    /* color = _pixman_gradient_walker_pixel_lut_repeat_pad (&walker, t); */
+
+                    t = CLAMP_MAX (t, 0xFFFF);
+                    color = lut[t >> shift];
+                }
+                else if (repeat == PIXMAN_REPEAT_REFLECT)
+                {
+                    /* color = _pixman_gradient_walker_pixel_lut_repeat_reflect (&walker, t); */
+
+                    /* The shift pair has to be done on 32 bits so that s
+                     * becomes 0 or -1 depending on bit 16 of t; t itself
+                     * is a 64-bit type. */
+                    int32_t s = (int32_t)((uint32_t)t << 15) >> 31;
+                    t = (t ^ s) & 0xFFFF;
+
+                    color = lut[t >> shift];
+                }
+                else /* PIXMAN_REPEAT_NONE */
+                {
+                    /* color = _pixman_gradient_walker_pixel_lut_repeat_none (&walker, t); */
+                    if (t < 0 || t > 0xFFFF)
+                        color = 0 /* transparent black */;
+                    else
+                        color = lut[t >> shift];
+                }
+
+                colors = _mm_set1_epi32 (color);
+            }
+            else
+            {
+                pixman_sse2_gradient_walker_t walker;
+
+                _pixman_sse2_gradient_walker_init (&walker, gradient, gradient->common.repeat);
+
+                colors = _pixman_sse2_gradient_walker_pixel (&walker, _mm_set1_epi32 (t));
+                color = _mm_cvtsi128_si32 (colors);
+            }
+
+            /* Scalar stores up to the first 16-byte boundary.  Testing
+             * against end on every store keeps short scanlines, and
+             * buffers that are already aligned, from being overrun. */
+            while (buffer < end && ((unsigned long)buffer & 0x0f))
+                *buffer++ = color;
+
+            /* aligned stores while a whole vector still fits */
+            while (end - buffer >= 4)
+            {
+                _mm_store_si128 ((__m128i*)buffer, colors);
+                buffer += 4;
+            }
+
+            /* at most 3 trailing pixels */
+            while (buffer < end)
+                *buffer++ = color;
+        }
+        else
+        {
+            uint32_t *lut = gradient->color_lut;
+            int shift = 16 - gradient->color_lut_bits;
+
+            /* lane i of ts is the position of pixel buffer[i] */
+            __m128i incs = _mm_set1_epi32 (inc);
+            __m128i ts = _mm_set_epi32 (t + 3 * inc, t + 2 * inc, t + inc, t);
+            __m128i mask_0xffff = _mm_set1_epi32 (0xffff);
+
+            /* scratch space for the unaligned head and tail pixels */
+            pixman_fixed_16_16_t tsf[4];
+
+            /* incs = 4 * inc, the step between two groups of 4 pixels */
+            incs = _mm_slli_epi32 (incs, 2);
+
+            if (gradient->color_lut != NULL && (((unsigned long)buffer & 0x03) == 0))
+            {
+                unsigned int repeat = gradient->common.repeat;
+
+                if (repeat == PIXMAN_REPEAT_NORMAL)
+                {
+                    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_NORMAL);
+                }
+                else if (repeat == PIXMAN_REPEAT_PAD)
+                {
+                    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_PAD);
+                }
+                else if (repeat == PIXMAN_REPEAT_REFLECT)
+                {
+                    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_REFLECT);
+                }
+                else /* PIXMAN_REPEAT_NONE */
+                {
+                    SSE2_LINEAR_GRADIENT (SSE2_LUT_REPEAT_NONE);
+                }
+            }
+            else
+            {
+                pixman_sse2_gradient_walker_t walker;
+                _pixman_sse2_gradient_walker_init (&walker, gradient, gradient->common.repeat);
+
+                SSE2_LINEAR_GRADIENT (SSE2_USE_GRADIENT_WALKER);
+            }
+        }
+    }
+    /* The projective path below is compiled out: it relies on a scalar
+     * `walker' that this function does not declare. */
+#if 0
+    else
+    {
+        /* projective transformation */
+        pixman_fixed_48_16_t t;
+
+        if (gradient->class == SOURCE_IMAGE_CLASS_VERTICAL)
+        {
+            register uint32_t color;
+
+            if (v.vector[2] == 0)
+            {
+                t = 0;
+            }
+            else
+            {
+                pixman_fixed_48_16_t x, y;
+
+                x = ((pixman_fixed_48_16_t) v.vector[0] << 16) / v.vector[2];
+                y = ((pixman_fixed_48_16_t) v.vector[1] << 16) / v.vector[2];
+                t = ((a * x + b * y) >> 16) + off;
+            }
+
+            color = _pixman_gradient_walker_pixel (&walker, t);
+
+            while (buffer < end)
+                *buffer++ = color;
+        }
+        else
+        {
+            while (buffer < end)
+            {
+                if (!mask || *mask++ & mask_bits)
+                {
+                    if (v.vector[2] == 0)
+                    {
+                        t = 0;
+                    }
+                    else
+                    {
+                        pixman_fixed_48_16_t x, y;
+                        x = ((pixman_fixed_48_16_t)v.vector[0] << 16) / v.vector[2];
+                        y = ((pixman_fixed_48_16_t)v.vector[1] << 16) / v.vector[2];
+                        t = ((a * x + b * y) >> 16) + off;
+                    }
+
+                    *buffer = _pixman_gradient_walker_pixel (&walker, t);
+                }
+
+                ++buffer;
+
+                v.vector[0] += unit.vector[0];
+                v.vector[1] += unit.vector[1];
+                v.vector[2] += unit.vector[2];
+            }
+        }
+    }
+#endif
+}
diff --git a/pixman/pixman-sse2-radial-gradient.c b/pixman/pixman-sse2-radial-gradient.c
new file mode 100644
index 00000000..dbb20eef
--- /dev/null
+++ b/pixman/pixman-sse2-radial-gradient.c
@@ -0,0 +1,387 @@
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2010 Novell, Inc.
+ *
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h>
+
+#include "pixman-sse2-gradient.h"
+
+#include "pixman-sse2-gradient-walker.c"
+
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+
+/*
+ * Scanline template.  It expands inside
+ * _pixman_sse2_radial_gradient_get_scanline_32 and uses that function's
+ * buffer, end, tsf, lut, shift and the __m128 state B_, A4_, invA_, pdx_,
+ * pdy_, r1sq_, invert_, cx_, cy_, cB_, cx_4, cy_4 and cB_4 (plus whatever
+ * lut_macro itself needs, e.g. mask_0xffff or walker).
+ *
+ *   1. scalar stores of 1-3 pixels until buffer is 16-byte aligned
+ *   2. aligned stores of 4 pixels at a time
+ *   3. scalar stores of the 0-3 pixels that are left
+ *
+ * Lane i of pdx_, pdy_ and B_ always describes pixel buffer[i].
+ */
+#define SSE2_RADIAL_GRADIENT(lut_macro) do { \
+    /* we can process 4 pixels at a time */ \
+    \
+    int count; \
+    \
+    if ((unsigned long)buffer & 0x0f) \
+    { \
+        /* we aren't aligned to a 16 byte boundary. */ \
+        /* process the at most 3 pixels until we get */ \
+        /* there */ \
+        int initial = (16 - (int)((unsigned long)buffer & 0x0f)) >> 2; \
+        float scale; \
+        \
+        __m128i ts = calc_radial_det4 (B_, A4_, invA_, \
+                                       pdx_, pdy_, r1sq_, invert_); \
+        \
+        lut_macro(ts, tsf, lut, shift, storeu); \
+        \
+        /* never run past the end of a short scanline */ \
+        if (initial > end - buffer) \
+            initial = (int)(end - buffer); \
+        \
+        /* tsf[i] is the colour of the i-th pixel from buffer */ \
+        if (initial >= 1) \
+            *buffer++ = tsf[0]; \
+        if (initial >= 2) \
+            *buffer++ = tsf[1]; \
+        if (initial >= 3) \
+            *buffer++ = tsf[2]; \
+        \
+        /* step every lane past the pixels just written */ \
+        scale = initial; \
+        \
+        pdx_ = _mm_add_ps (pdx_, _mm_mul_ps (cx_, _mm_load1_ps (&scale))); \
+        pdy_ = _mm_add_ps (pdy_, _mm_mul_ps (cy_, _mm_load1_ps (&scale))); \
+        B_ = _mm_add_ps (B_, _mm_mul_ps (cB_, _mm_load1_ps (&scale))); \
+    } \
+    \
+    count = end - buffer; \
+    while (count >= 4) \
+    { \
+        __m128i ts = calc_radial_det4 (B_, A4_, invA_, \
+                                       pdx_, pdy_, r1sq_, invert_); \
+        \
+        lut_macro(ts, buffer, lut, shift, store); \
+        \
+        buffer += 4; \
+        \
+        pdx_ = _mm_add_ps (pdx_, cx_4); \
+        pdy_ = _mm_add_ps (pdy_, cy_4); \
+        B_ = _mm_add_ps (B_, cB_4); \
+        \
+        count -= 4; \
+    } \
+    \
+    if (buffer < end) \
+    { \
+        /* we're at most 3 pixels off, so unroll/inline it here */ \
+        int rem = end - buffer; \
+        \
+        __m128i ts = calc_radial_det4 (B_, A4_, invA_, \
+                                       pdx_, pdy_, r1sq_, invert_); \
+        \
+        lut_macro(ts, tsf, lut, shift, storeu); \
+        \
+        if (rem-- > 0) buffer[0] = tsf[0]; \
+        if (rem-- > 0) buffer[1] = tsf[1]; \
+        if (rem-- > 0) buffer[2] = tsf[2]; \
+    } \
+  } while (0)
+
+
+/*
+ * Evaluate, for four pixels at once,
+ *
+ *     invA * (B + invert * sqrt (max (0, A4 * (pdx² + pdy² - r1sq) + B²)))
+ *
+ * and return the four results converted to 32-bit integers with
+ * _mm_cvtps_epi32.  A negative discriminant is treated as zero.
+ */
+static force_inline __m128i
+calc_radial_det4 (__m128 B, __m128 A4, __m128 invA, __m128 pdx, __m128 pdy, __m128 r1sq, __m128 invert)
+{
+    __m128 Bsq = _mm_mul_ps (B, B);
+
+    /* discriminant = A4 * (pdx*pdx + pdy * pdy - r1sq) + Bsq */
+    __m128 discriminant = _mm_add_ps (_mm_mul_ps (A4,
+                                                  _mm_sub_ps (_mm_add_ps (_mm_mul_ps (pdx, pdx),
+                                                                          _mm_mul_ps (pdy, pdy)),
+                                                              r1sq)),
+                                      Bsq);
+
+    /* all bits set in the lanes whose discriminant is less than 0 */
+    __m128 lt0_mask = _mm_cmplt_ps (discriminant, _mm_setzero_ps());
+
+    /* zero out the negative ones */
+    discriminant = _mm_andnot_ps (lt0_mask, discriminant);
+
+    /* sqrt them all */
+    discriminant = _mm_sqrt_ps (discriminant);
+
+    discriminant = _mm_mul_ps (invA,
+                               _mm_add_ps (B,
+                                           _mm_mul_ps (invert,
+                                                       discriminant)));
+
+    return _mm_cvtps_epi32 (discriminant);
+}
+
+/*
+ * Fill buffer[0..width) with the colours of a radial gradient for the
+ * scanline that starts at pixel (x, y).
+ *
+ * Only the affine part of the image transform is used; mask and mask_bits
+ * are not consulted, so every pixel of the scanline is computed.  Colours
+ * come from gradient->color_lut when it is set and from the SSE2 gradient
+ * walker otherwise.
+ */
+void
+_pixman_sse2_radial_gradient_get_scanline_32 (pixman_image_t *image,
+                                              int             x,
+                                              int             y,
+                                              int             width,
+                                              uint32_t *      buffer,
+                                              const uint32_t *mask,
+                                              uint32_t        mask_bits)
+{
+    /*
+     * In the radial gradient problem we are given two circles (c₁,r₁) and
+     * (c₂,r₂) that define the gradient itself. Then, for any point p, we
+     * must compute the value(s) of t within [0.0, 1.0] representing the
+     * circle(s) that would color the point.
+     *
+     * There are potentially two values of t since the point p can be
+     * colored by both sides of the circle, (which happens whenever one
+     * circle is not entirely contained within the other).
+     *
+     * If we solve for a value of t that is outside of [0.0, 1.0] then we
+     * use the extend mode (NONE, REPEAT, REFLECT, or PAD) to map to a
+     * value within [0.0, 1.0].
+     *
+     * Here is an illustration of the problem:
+     *
+     *              p₂
+     *           p  •
+     *           •   ╲
+     *        ·       ╲r₂
+     *  p₁ ·           ╲
+     *  •              θ╲
+     *   ╲             ╌╌•
+     *    ╲r₁        ·   c₂
+     *    θ╲    ·
+     *    ╌╌•
+     *      c₁
+     *
+     * Given (c₁,r₁), (c₂,r₂) and p, we must find an angle θ such that two
+     * points p₁ and p₂ on the two circles are collinear with p. Then, the
+     * desired value of t is the ratio of the length of p₁p to the length
+     * of p₁p₂.
+     *
+     * So, we have six unknown values: (p₁x, p₁y), (p₂x, p₂y), θ and t.
+     * We can also write six equations that constrain the problem:
+     *
+     * Point p₁ is a distance r₁ from c₁ at an angle of θ:
+     *
+     *    1. p₁x = c₁x + r₁·cos θ
+     *    2. p₁y = c₁y + r₁·sin θ
+     *
+     * Point p₂ is a distance r₂ from c₂ at an angle of θ:
+     *
+     *    3. p₂x = c₂x + r₂·cos θ
+     *    4. p₂y = c₂y + r₂·sin θ
+     *
+     * Point p lies at a fraction t along the line segment p₁p₂:
+     *
+     *    5. px = t·p₂x + (1-t)·p₁x
+     *    6. py = t·p₂y + (1-t)·p₁y
+     *
+     * To solve, first substitute 1-4 into 5 and 6:
+     *
+     *    px = t·(c₂x + r₂·cos θ) + (1-t)·(c₁x + r₁·cos θ)
+     *    py = t·(c₂y + r₂·sin θ) + (1-t)·(c₁y + r₁·sin θ)
+     *
+     * Then solve each for cos θ and sin θ expressed as a function of t:
+     *
+     *    cos θ = (-(c₂x - c₁x)·t + (px - c₁x)) / ((r₂-r₁)·t + r₁)
+     *    sin θ = (-(c₂y - c₁y)·t + (py - c₁y)) / ((r₂-r₁)·t + r₁)
+     *
+     * To simplify this a bit, we define new variables for several of the
+     * common terms as shown below:
+     *
+     *              p₂
+     *           p  •
+     *           •   ╲
+     *        ·  ┆    ╲r₂
+     *  p₁ ·     ┆     ╲
+     *  •     pdy┆      ╲
+     *   ╲       ┆       •c₂
+     *    ╲r₁    ┆   ·   ┆
+     *     ╲    ·┆       ┆cdy
+     *      •╌╌╌╌┴╌╌╌╌╌╌╌┘
+     *    c₁  pdx   cdx
+     *
+     *    cdx = (c₂x - c₁x)
+     *    cdy = (c₂y - c₁y)
+     *     dr = r₂-r₁
+     *    pdx = px - c₁x
+     *    pdy = py - c₁y
+     *
+     * Note that cdx, cdy, and dr do not depend on point p at all, so can
+     * be pre-computed for the entire gradient. The simplified equations
+     * are now:
+     *
+     *    cos θ = (-cdx·t + pdx) / (dr·t + r₁)
+     *    sin θ = (-cdy·t + pdy) / (dr·t + r₁)
+     *
+     * Finally, to get a single function of t and eliminate the last
+     * unknown θ, we use the identity sin²θ + cos²θ = 1. First, square
+     * each equation, (we knew a quadratic was coming since it must be
+     * possible to obtain two solutions in some cases):
+     *
+     *    cos²θ = (cdx²t² - 2·cdx·pdx·t + pdx²) / (dr²·t² + 2·r₁·dr·t + r₁²)
+     *    sin²θ = (cdy²t² - 2·cdy·pdy·t + pdy²) / (dr²·t² + 2·r₁·dr·t + r₁²)
+     *
+     * Then add both together, set the result equal to 1, and express as a
+     * standard quadratic equation in t of the form At² + Bt + C = 0
+     *
+     *    (cdx² + cdy² - dr²)·t² - 2·(cdx·pdx + cdy·pdy + r₁·dr)·t + (pdx² + pdy² - r₁²) = 0
+     *
+     * In other words:
+     *
+     *    A = cdx² + cdy² - dr²
+     *    B = -2·(pdx·cdx + pdy·cdy + r₁·dr)
+     *    C = pdx² + pdy² - r₁²
+     *
+     * And again, notice that A does not depend on p, so can be
+     * precomputed. From here we just use the quadratic formula to solve
+     * for t:
+     *
+     *    t = (-B ± ⎷(B² - 4·A·C)) / 2·A
+     */
+
+    gradient_t *gradient = (gradient_t *)image;
+    source_image_t *source = (source_image_t *)image;
+    radial_gradient_t *radial = (radial_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    /* (cx, cy): step per pixel along the scanline; (rx, ry): first pixel
+     * centre.  The third (projective) component of both is not used. */
+    float cx = 1.;
+    float cy = 0.;
+    float rx = x + 0.5;
+    float ry = y + 0.5;
+
+    if (source->common.transform)
+    {
+        pixman_vector_t v;
+        /* reference point is the center of the pixel */
+        v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+        v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+        v.vector[2] = pixman_fixed_1;
+
+        if (!pixman_transform_point_3d (source->common.transform, &v))
+            return;
+
+        cx = source->common.transform->matrix[0][0] / 65536.;
+        cy = source->common.transform->matrix[1][0] / 65536.;
+
+        rx = v.vector[0] / 65536.;
+        ry = v.vector[1] / 65536.;
+    }
+
+    /* When computing t over a scanline, we notice that some expressions
+     * are constant so we can compute them just once. Given:
+     *
+     *    t = (-B ± ⎷(B² - 4·A·C)) / 2·A
+     *
+     * where
+     *
+     *    A = cdx² + cdy² - dr² [precomputed as radial->A]
+     *    B = -2·(pdx·cdx + pdy·cdy + r₁·dr)
+     *    C = pdx² + pdy² - r₁²
+     *
+     * Since we have an affine transformation, we know that (pdx, pdy)
+     * increase linearly with each pixel,
+     *
+     *    pdx = pdx₀ + n·cx,
+     *    pdy = pdy₀ + n·cy,
+     *
+     * we can then express B in terms of a linear increment along
+     * the scanline:
+     *
+     *    B = B₀ + n·cB, with
+     *    B₀ = -2·(pdx₀·cdx + pdy₀·cdy + r₁·dr) and
+     *    cB = -2·(cx·cdx + cy·cdy)
+     *
+     * Thus we can replace the full evaluation of B per-pixel (4 multiplies,
+     * 2 additions) with a single addition.
+     */
+    float r1 = radial->c1.radius / 65536.;
+    float r1sq = r1 * r1;
+    float pdx = rx - radial->c1.x / 65536.;
+    float pdy = ry - radial->c1.y / 65536.;
+    float A = radial->A;
+    /* invA and A4 carry the sign flips, so that
+     * invA * (B ± sqrt (A4 * C + B²)) equals 65536 * (-B ∓ sqrt (B² - 4·A·C)) / 2·A.
+     * NOTE(review): A == 0 is not special-cased here. */
+    float invA = -65536. / (2. * A);
+    float A4 = -4. * A;
+    float B = -2. * (pdx*radial->cdx + pdy*radial->cdy + r1*radial->dr);
+    float cB = -2. * (cx*radial->cdx + cy*radial->cdy);
+    pixman_bool_t invert = A * radial->dr < 0;
+
+    uint32_t *lut = gradient->color_lut;
+
+    int shift = 16 - gradient->color_lut_bits;
+
+    /* lane i of pdx_, pdy_ and B_ describes pixel buffer[i] */
+    __m128i mask_0xffff = _mm_set1_epi32(0xffff);
+    __m128 invert_ = _mm_set1_ps (invert ? 1.0 : -1.0);
+    __m128 A4_ = _mm_set1_ps (A4);
+    __m128 invA_ = _mm_set1_ps (invA);
+    __m128 r1sq_ = _mm_set1_ps (r1sq);
+    __m128 pdx_ = _mm_set_ps (pdx + 3 * cx, pdx + 2 * cx, pdx + cx, pdx);
+    __m128 pdy_ = _mm_set_ps (pdy + 3 * cy, pdy + 2 * cy, pdy + cy, pdy);
+    __m128 B_ = _mm_set_ps (B + 3 * cB, B + 2 * cB, B + cB, B);
+    __m128 cx_ = _mm_set1_ps (cx);
+    __m128 cy_ = _mm_set1_ps (cy);
+    __m128 cB_ = _mm_set1_ps (cB);
+    __m128 cx_4 = _mm_mul_ps (cx_, _mm_set1_ps (4.0));
+    __m128 cy_4 = _mm_mul_ps (cy_, _mm_set1_ps (4.0));
+    __m128 cB_4 = _mm_mul_ps (cB_, _mm_set1_ps (4.0));
+
+    /* scratch space for the unaligned head and tail pixels */
+    pixman_fixed_16_16_t tsf[4];
+
+    if (gradient->color_lut)
+    {
+        unsigned int repeat = source->common.repeat;
+        if (repeat == PIXMAN_REPEAT_NORMAL)
+        {
+            SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_NORMAL);
+        }
+        else if (repeat == PIXMAN_REPEAT_PAD)
+        {
+            SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_PAD);
+        }
+        else if (repeat == PIXMAN_REPEAT_REFLECT)
+        {
+            SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_REFLECT);
+        }
+        else /* PIXMAN_REPEAT_NONE */
+        {
+            SSE2_RADIAL_GRADIENT(SSE2_LUT_REPEAT_NONE);
+        }
+    }
+    else
+    {
+        pixman_sse2_gradient_walker_t walker;
+        _pixman_sse2_gradient_walker_init (&walker, gradient, source->common.repeat);
+
+        SSE2_RADIAL_GRADIENT(SSE2_USE_GRADIENT_WALKER);
+    }
+}