Add fast paths for separable convolution

Similar to the fast paths for general affine access, add some fast paths for the separable filter for all combinations of formats x8r8g8b8, a8r8g8b8, r5g6b5, a8 with the four repeat modes. It is easy to see the speedup in the demos/scale program.
author: Søren Sandmann Pedersen <ssp@redhat.com> 2012-11-26 14:27:34 -0500
committer: Søren Sandmann Pedersen <ssp@redhat.com> 2012-12-08 12:38:58 -0500
commit: 8ca4e144724ba2041bc5ef077ccf6d24e7cf4d1f (patch)
tree: 015ff0428069a2e70168467702a16246b49fc226
parent: 4f18ba30cea56331e30992242201b20954c8f7f2 (diff)
3 files changed, 184 insertions, 3 deletions
diff --git a/pixman/pixman-bits-image.c b/pixman/pixman-bits-image.c
index e76b78c3..86d80c3f 100644
--- a/pixman/pixman-bits-image.c
+++ b/pixman/pixman-bits-image.c
@@ -720,11 +720,155 @@ bits_image_fetch_general (pixman_iter_t  *iter,
     return buffer;
 }
 
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
 typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
 
 static force_inline void
+bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
+					       int              offset,
+					       int              line,
+					       int              width,
+					       uint32_t *       buffer,
+					       const uint32_t * mask,
+
+					       convert_pixel_t	convert_pixel,
+					       pixman_format_code_t	format,
+					       pixman_repeat_t	repeat_mode)
+{
+    bits_image_t *bits = &image->bits;
+    pixman_fixed_t *params = image->common.filter_params;
+    int cwidth = pixman_fixed_to_int (params[0]);
+    int cheight = pixman_fixed_to_int (params[1]);
+    int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
+    int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
+    int x_phase_bits = pixman_fixed_to_int (params[2]);
+    int y_phase_bits = pixman_fixed_to_int (params[3]);
+    int x_phase_shift = 16 - x_phase_bits;
+    int y_phase_shift = 16 - y_phase_bits;
+    pixman_fixed_t vx, vy;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    int k;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    vx = v.vector[0];
+    vy = v.vector[1];
+
+    for (k = 0; k < width; ++k)
+    {
+	pixman_fixed_t *y_params;
+	int satot, srtot, sgtot, sbtot;
+	pixman_fixed_t x, y;
+	int32_t x1, x2, y1, y2;
+	int32_t px, py;
+	int i, j;
+
+	if (mask && !mask[k])
+	    goto next;
+
+	/* Round x and y to the middle of the closest phase before continuing. This
+	 * ensures that the convolution matrix is aligned right, since it was
+	 * positioned relative to a particular phase (and not relative to whatever
+	 * exact fraction we happen to get here).
+	 */
+	x = ((vx >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1);
+	y = ((vy >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1);
+
+	px = (x & 0xffff) >> x_phase_shift;
+	py = (y & 0xffff) >> y_phase_shift;
+
+	x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+	y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+	x2 = x1 + cwidth;
+	y2 = y1 + cheight;
+
+	satot = srtot = sgtot = sbtot = 0;
+
+	y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight;
+
+	for (i = y1; i < y2; ++i)
+	{
+	    pixman_fixed_t fy = *y_params++;
+
+	    if (fy)
+	    {
+		pixman_fixed_t *x_params = params + 4 + px * cwidth;
+
+		for (j = x1; j < x2; ++j)
+		{
+		    pixman_fixed_t fx = *x_params++;
+		    int rx = j;
+		    int ry = i;
+		    
+		    if (fx)
+		    {
+			pixman_fixed_t f;
+			uint32_t pixel, mask;
+			uint8_t *row;
+
+			mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+			if (repeat_mode != PIXMAN_REPEAT_NONE)
+			{
+			    repeat (repeat_mode, &rx, bits->width);
+			    repeat (repeat_mode, &ry, bits->height);
+
+			    row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+			    pixel = convert_pixel (row, rx) | mask;
+			}
+			else
+			{
+			    if (rx < 0 || ry < 0 || rx >= bits->width || ry >= bits->height)
+			    {
+				pixel = 0;
+			    }
+			    else
+			    {
+				row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+				pixel = convert_pixel (row, rx) | mask;
+			    }
+			}
+
+			f = ((pixman_fixed_32_32_t)fx * fy + 0x8000) >> 16;
+			srtot += (int)RED_8 (pixel) * f;
+			sgtot += (int)GREEN_8 (pixel) * f;
+			sbtot += (int)BLUE_8 (pixel) * f;
+			satot += (int)ALPHA_8 (pixel) * f;
+		    }
+		}
+	    }
+	}
+
+	satot = (satot + 0x8000) >> 16;
+	srtot = (srtot + 0x8000) >> 16;
+	sgtot = (sgtot + 0x8000) >> 16;
+	sbtot = (sbtot + 0x8000) >> 16;
+
+	satot = CLIP (satot, 0, 0xff);
+	srtot = CLIP (srtot, 0, 0xff);
+	sgtot = CLIP (sgtot, 0, 0xff);
+	sbtot = CLIP (sbtot, 0, 0xff);
+
+	buffer[k] = (satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot << 0);
+
+    next:
+	vx += ux;
+	vy += uy;
+    }
+}
+
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static force_inline void
 bits_image_fetch_bilinear_affine (pixman_image_t * image,
 				  int              offset,
 				  int              line,
@@ -973,6 +1117,23 @@ convert_r5g6b5 (const uint8_t *row, int x)
     return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
 }
 
+#define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode)  \
+    static uint32_t *							\
+    bits_image_fetch_separable_convolution_affine_ ## name (pixman_iter_t   *iter, \
+							    const uint32_t * mask) \
+    {									\
+	bits_image_fetch_separable_convolution_affine (                 \
+	    iter->image,                                                \
+	    iter->x, iter->y++,                                         \
+	    iter->width,                                                \
+	    iter->buffer, mask,                                         \
+	    convert_ ## format,                                         \
+	    PIXMAN_ ## format,                                          \
+	    repeat_mode);                                               \
+									\
+	return iter->buffer;                                            \
+    }
+
 #define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
     static uint32_t *							\
     bits_image_fetch_bilinear_affine_ ## name (pixman_iter_t   *iter,	\
@@ -1005,7 +1166,8 @@ convert_r5g6b5 (const uint8_t *row, int x)
 
 #define MAKE_FETCHERS(name, format, repeat_mode)			\
     MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
-    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
+    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)			\
+    MAKE_SEPARABLE_CONVOLUTION_FETCHER (name, format, repeat_mode)
 
 MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
 MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
@@ -1255,6 +1417,20 @@ static const fetcher_info_t fetcher_info[] =
      FAST_PATH_AFFINE_TRANSFORM		|				\
      FAST_PATH_NEAREST_FILTER)
 
+#define GENERAL_SEPARABLE_CONVOLUTION_FLAGS				\
+    (FAST_PATH_NO_ALPHA_MAP            |				\
+     FAST_PATH_NO_ACCESSORS            |				\
+     FAST_PATH_HAS_TRANSFORM           |				\
+     FAST_PATH_AFFINE_TRANSFORM        |				\
+     FAST_PATH_SEPARABLE_CONVOLUTION_FILTER)
+    
+#define SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)   \
+    { PIXMAN_ ## format,                                               \
+      GENERAL_SEPARABLE_CONVOLUTION_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
+      bits_image_fetch_separable_convolution_affine_ ## name,          \
+      _pixman_image_get_scanline_generic_float			       \
+    },
+
 #define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
     { PIXMAN_ ## format,						\
       GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
@@ -1270,6 +1446,7 @@ static const fetcher_info_t fetcher_info[] =
     },
 
 #define AFFINE_FAST_PATHS(name, format, repeat)				\
+    SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)	\
     BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
     NEAREST_AFFINE_FAST_PATH(name, format, repeat)
     
diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
index 93ed17e3..6f076d5c 100644
--- a/pixman/pixman-image.c
+++ b/pixman/pixman-image.c
@@ -371,7 +371,10 @@ compute_image_info (pixman_image_t *image)
 	break;
 
     case PIXMAN_FILTER_CONVOLUTION:
+	break;
+
     case PIXMAN_FILTER_SEPARABLE_CONVOLUTION:
+	flags |= FAST_PATH_SEPARABLE_CONVOLUTION_FILTER;
 	break;
 
     default:
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index c0a6bc0a..99125a17 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -687,6 +687,7 @@ _pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask);
 #define FAST_PATH_SAMPLES_COVER_CLIP_NEAREST	(1 << 23)
 #define FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR	(1 << 24)
 #define FAST_PATH_BITS_IMAGE			(1 << 25)
+#define FAST_PATH_SEPARABLE_CONVOLUTION_FILTER  (1 << 26)
 
 #define FAST_PATH_PAD_REPEAT						\
     (FAST_PATH_NO_NONE_REPEAT		|				\
author	Søren Sandmann Pedersen <ssp@redhat.com>	2012-11-26 14:27:34 -0500
committer	Søren Sandmann Pedersen <ssp@redhat.com>	2012-12-08 12:38:58 -0500
commit	8ca4e144724ba2041bc5ef077ccf6d24e7cf4d1f (patch)
tree	015ff0428069a2e70168467702a16246b49fc226
parent	4f18ba30cea56331e30992242201b20954c8f7f2 (diff)