summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSøren Sandmann Pedersen <ssp@redhat.com>2013-06-06 16:15:39 -0400
committerSøren Sandmann Pedersen <ssp@redhat.com>2013-09-16 16:50:35 -0400
commit58a79dfe6d1fd62c2b66c69fdb64f6b8ecf61da5 (patch)
tree23a3dcb9022c74946c19fccb4291a6eb56b2b0ea
parentf1792b32215d3b62084ee99fca5c448f1c7f8e1d (diff)
ssse3: Add iterator for separable bilinear scaling
This new iterator uses the SSSE3 instructions pmaddubsw and pabsw to implement a fast iterator for bilinear scaling. There is a graph here recording the per-pixel time for various bilinear scaling algorithms as reported by scaling-bench: http://people.freedesktop.org/~sandmann/ssse3.v2/ssse3.v2.png As the graph shows, this new iterator is clearly faster than the existing C iterator, and when used with an SSE2 combiner, it is also faster than the existing SSE2 fast paths for upscaling, though not for downscaling. Another graph: http://people.freedesktop.org/~sandmann/ssse3.v2/movdqu.png shows the difference between writing to iter->buffer with movdqa, movdqu on an aligned buffer, and movdqu on a deliberately unaligned buffer. Since the differences are very small, the patch here avoids using movdqa because imposing alignment restrictions on iter->buffer may interfere with other optimizations, such as writing directly to the destination image. The data was measured with scaling-bench on a Sandy Bridge Core i3-2350M @ 2.3GHz and is available in this directory: http://people.freedesktop.org/~sandmann/ssse3.v2/ where there is also a Gnumeric spreadsheet ssse3.v2.gnumeric containing the per-pixel values and the graph. V2: - Use uintptr_t instead of unsigned long in the ALIGN macro - Use _mm_storel_epi64 instead of _mm_cvtsi128_si64 as the latter form is not available on x86-32. - Use _mm_storeu_si128() instead of _mm_store_si128() to avoid imposing alignment requirements on iter->buffer
-rw-r--r--pixman/pixman-ssse3.c312
1 files changed, 312 insertions, 0 deletions
diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c
index 19d71e7f..34763e20 100644
--- a/pixman/pixman-ssse3.c
+++ b/pixman/pixman-ssse3.c
@@ -35,6 +35,316 @@
#include "pixman-private.h"
#include "pixman-inlines.h"
+typedef struct
+{
+ int y;
+ uint64_t * buffer;
+} line_t;
+
+typedef struct
+{
+ line_t line0;
+ line_t line1;
+ pixman_fixed_t y;
+ pixman_fixed_t x;
+ uint64_t data[1];
+} bilinear_info_t;
+
+static void
+ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
+ int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
+{
+ uint32_t *bits = image->bits + y * image->rowstride;
+ __m128i vx = _mm_set_epi16 (
+ - (x + 1), x, - (x + 1), x,
+ - (x + ux + 1), x + ux, - (x + ux + 1), x + ux);
+ __m128i vux = _mm_set_epi16 (
+ - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
+ - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
+ __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
+ __m128i *b = (__m128i *)line->buffer;
+ __m128i vrl0, vrl1;
+
+ while ((n -= 2) >= 0)
+ {
+ __m128i vw, vr, s;
+
+ vrl1 = _mm_loadl_epi64 (
+ (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
+ /* vrl1: R1, L1 */
+
+ final_pixel:
+ vrl0 = _mm_loadl_epi64 (
+ (__m128i *)(bits + pixman_fixed_to_int (x)));
+ /* vrl0: R0, L0 */
+
+ /* The weights are based on vx which is a vector of
+ *
+ * - (x + 1), x, - (x + 1), x,
+ * - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
+ *
+ * so the 16 bit weights end up like this:
+ *
+ * iw0, w0, iw0, w0, iw1, w1, iw1, w1
+ *
+ * and after shifting and packing, we get these bytes:
+ *
+ * iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+ * iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+ *
+ * which means the first and the second input pixel
+ * have to be interleaved like this:
+ *
+ * la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+ * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+ *
+ * before maddubsw can be used.
+ */
+
+ vw = _mm_add_epi16 (
+ vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
+ /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
+ */
+
+ vw = _mm_packus_epi16 (vw, vw);
+ /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+ * iw0, w0, iw0, w0, iw1, w1, iw1, w1
+ */
+ vx = _mm_add_epi16 (vx, vux);
+
+ x += 2 * ux;
+
+ vr = _mm_unpacklo_epi16 (vrl1, vrl0);
+ /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
+
+ s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
+ /* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
+
+ vr = _mm_unpackhi_epi8 (vr, s);
+ /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+ * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+ */
+
+ vr = _mm_maddubs_epi16 (vr, vw);
+
+ /* When the weight is 0, the inverse weight is
+ * 128 which can't be represented in a signed byte.
+ * As a result maddubsw computes the following:
+ *
+ * r = l * -128 + r * 0
+ *
+ * rather than the desired
+ *
+ * r = l * 128 + r * 0
+ *
+ * We fix this by taking the absolute value of the
+ * result.
+ */
+ vr = _mm_abs_epi16 (vr);
+
+ /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
+ _mm_store_si128 (b++, vr);
+ }
+
+ if (n == -1)
+ {
+ vrl1 = _mm_setzero_si128();
+ goto final_pixel;
+ }
+
+ line->y = y;
+}
+
+static uint32_t *
+ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
+{
+ pixman_fixed_t fx, ux;
+ bilinear_info_t *info = iter->data;
+ line_t *line0, *line1;
+ int y0, y1;
+ int32_t dist_y;
+ __m128i vw;
+ int i;
+
+ fx = info->x;
+ ux = iter->image->common.transform->matrix[0][0];
+
+ y0 = pixman_fixed_to_int (info->y);
+ y1 = y0 + 1;
+
+ line0 = &info->line0;
+ line1 = &info->line1;
+
+ if (line0->y != y0 || line1->y != y1)
+ {
+ if (line0->y == y1 || line1->y == y0)
+ {
+ line_t tmp = *line0;
+ *line0 = *line1;
+ *line1 = tmp;
+ }
+
+ if (line0->y != y0)
+ {
+ ssse3_fetch_horizontal (
+ &iter->image->bits, line0, y0, fx, ux, iter->width);
+ }
+
+ if (line1->y != y1)
+ {
+ ssse3_fetch_horizontal (
+ &iter->image->bits, line1, y1, fx, ux, iter->width);
+ }
+ }
+
+ dist_y = pixman_fixed_to_bilinear_weight (info->y);
+ dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
+
+ vw = _mm_set_epi16 (
+ dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
+
+ for (i = 0; i + 3 < iter->width; i += 4)
+ {
+ __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+ __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+ __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
+ __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
+ __m128i r0, r1, tmp, p;
+
+ r0 = _mm_mulhi_epu16 (
+ _mm_sub_epi16 (bot0, top0), vw);
+ tmp = _mm_cmplt_epi16 (bot0, top0);
+ tmp = _mm_and_si128 (tmp, vw);
+ r0 = _mm_sub_epi16 (r0, tmp);
+ r0 = _mm_add_epi16 (r0, top0);
+ r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+ /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
+ r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+ /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
+
+ r1 = _mm_mulhi_epu16 (
+ _mm_sub_epi16 (bot1, top1), vw);
+ tmp = _mm_cmplt_epi16 (bot1, top1);
+ tmp = _mm_and_si128 (tmp, vw);
+ r1 = _mm_sub_epi16 (r1, tmp);
+ r1 = _mm_add_epi16 (r1, top1);
+ r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
+ r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
+ /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
+
+ p = _mm_packus_epi16 (r0, r1);
+
+ _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
+ }
+
+ while (i < iter->width)
+ {
+ __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+ __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+ __m128i r0, tmp, p;
+
+ r0 = _mm_mulhi_epu16 (
+ _mm_sub_epi16 (bot0, top0), vw);
+ tmp = _mm_cmplt_epi16 (bot0, top0);
+ tmp = _mm_and_si128 (tmp, vw);
+ r0 = _mm_sub_epi16 (r0, tmp);
+ r0 = _mm_add_epi16 (r0, top0);
+ r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+ /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
+ r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+ /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
+
+ p = _mm_packus_epi16 (r0, r0);
+
+ if (iter->width - i == 1)
+ {
+ *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
+ i++;
+ }
+ else
+ {
+ _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
+ i += 2;
+ }
+ }
+
+ info->y += iter->image->common.transform->matrix[1][1];
+
+ return iter->buffer;
+}
+
+static void
+ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
+{
+ free (iter->data);
+}
+
+static void
+ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
+{
+ int width = iter->width;
+ bilinear_info_t *info;
+ pixman_vector_t v;
+
+ /* Reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (iter->image->common.transform, &v))
+ goto fail;
+
+ info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
+ if (!info)
+ goto fail;
+
+ info->x = v.vector[0] - pixman_fixed_1 / 2;
+ info->y = v.vector[1] - pixman_fixed_1 / 2;
+
+#define ALIGN(addr) \
+ ((void *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+ /* It is safe to set the y coordinates to -1 initially
+ * because COVER_CLIP_BILINEAR ensures that we will only
+ * be asked to fetch lines in the [0, height) interval
+ */
+ info->line0.y = -1;
+ info->line0.buffer = ALIGN (&(info->data[0]));
+ info->line1.y = -1;
+ info->line1.buffer = ALIGN (info->line0.buffer + width);
+
+ iter->get_scanline = ssse3_fetch_bilinear_cover;
+ iter->fini = ssse3_bilinear_cover_iter_fini;
+
+ iter->data = info;
+ return;
+
+fail:
+ /* Something went wrong, either a bad matrix or OOM; in such cases,
+ * we don't guarantee any particular rendering.
+ */
+ _pixman_log_error (
+ FUNC, "Allocation failure or bad matrix, skipping rendering\n");
+
+ iter->get_scanline = _pixman_iter_get_scanline_noop;
+ iter->fini = NULL;
+}
+
+static const pixman_iter_info_t ssse3_iters[] =
+{
+ { PIXMAN_a8r8g8b8,
+ (FAST_PATH_STANDARD_FLAGS |
+ FAST_PATH_SCALE_TRANSFORM |
+ FAST_PATH_BILINEAR_FILTER |
+ FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
+ ITER_NARROW | ITER_SRC,
+ ssse3_bilinear_cover_iter_init,
+ NULL, NULL
+ },
+
+ { PIXMAN_null },
+};
+
static const pixman_fast_path_t ssse3_fast_paths[] =
{
{ PIXMAN_OP_NONE },
@@ -46,5 +356,7 @@ _pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
pixman_implementation_t *imp =
_pixman_implementation_create (fallback, ssse3_fast_paths);
+ imp->iter_info = ssse3_iters;
+
return imp;
}