PAD repeat support for fast scaling with nearest filter

When processing pixels from the left and right padding, the same scanline function is used with 'unit_x' set to 0. Actually appears that gcc can handle this quite efficiently. When using 'restrict' keyword, it is able to optimize the whole operation performed on left or right padding pixels to a small unrolled loop (the code is reduced to a simple fill implementation): 9b30: 89 08 mov %ecx,(%rax) 9b32: 89 48 04 mov %ecx,0x4(%rax) 9b35: 48 83 c0 08 add $0x8,%rax 9b39: 49 39 c0 cmp %rax,%r8 9b3c: 75 f2 jne 9b30 Without 'restrict' keyword, there is one instruction more: reloading source pixel data from memory in the beginning of each iteration. That is slower, but also acceptable.
author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-09-16 17:10:40 +0300
committer: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-09-21 13:32:11 +0300
commit: 45833d5b198507e9e69b918459eaaf6088e5de00 (patch)
tree: 2b9d0d4faf7880c48952a05784d6f4de1d111bb8
parent: 3db0cc5c75a4a764726059511fa6d67082fbeb64 (diff)
2 files changed, 105 insertions, 4 deletions
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index c0607496..5b10d65c 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1388,14 +1388,19 @@ fast_composite_src_memcpy (pixman_implementation_t *imp,
 }
 
 FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER);
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD);
 FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL);
 FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER);
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD);
 FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL);
 FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER);
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL);
 FAST_NEAREST (565_565_cover, 0565, 0565, uint16_t, uint16_t, SRC, COVER);
+FAST_NEAREST (565_565_pad, 0565, 0565, uint16_t, uint16_t, SRC, PAD);
 FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL);
 FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER);
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL);
 
 static force_inline uint32_t
diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h
index 287b7535..7c14379b 100644
--- a/pixman/pixman-fast-path.h
+++ b/pixman/pixman-fast-path.h
@@ -58,6 +58,63 @@ repeat (pixman_repeat_t repeat, int *c, int size)
     return TRUE;
 }
 
+/*
+ * For each scanline fetched from source image with PAD repeat:
+ * - calculate how many pixels need to be padded on the left side
+ * - calculate how many pixels need to be padded on the right side
+ * - update width to only count pixels which are fetched from the image
+ * All this information is returned via 'width', 'left_pad', 'right_pad'
+ * arguments. The code is assuming that 'unit_x' is positive.
+ *
+ * Note: 64-bit math is used in order to avoid potential overflows, which
+ *       is probably excessive in many cases. This particular function
+ *       may need its own correctness test and performance tuning.
+ */
+static force_inline void
+pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+				pixman_fixed_t  vx,
+				pixman_fixed_t  unit_x,
+				int32_t *       width,
+				int32_t *       left_pad,
+				int32_t *       right_pad)
+{
+    int64_t max_vx = (int64_t) source_image_width << 16;
+    int64_t tmp;
+    if (vx < 0)
+    {
+	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
+	if (tmp > *width)
+	{
+	    *left_pad = *width;
+	    *width = 0;
+	}
+	else
+	{
+	    *left_pad = (int32_t) tmp;
+	    *width -= (int32_t) tmp;
+	}
+    }
+    else
+    {
+	*left_pad = 0;
+    }
+    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
+    if (tmp < 0)
+    {
+	*right_pad = *width;
+	*width = 0;
+    }
+    else if (tmp >= *width)
+    {
+	*right_pad = 0;
+    }
+    else
+    {
+	*right_pad = *width - (int32_t) tmp;
+	*width = (int32_t) tmp;
+    }
+}
+
 /* A macroified version of specialized nearest scalers for some
  * common 8888 and 565 formats. It supports SRC and OVER ops.
  *
@@ -213,6 +270,7 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
     pixman_vector_t v;										\
     pixman_fixed_t vx, vy;									\
     pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, right_pad;								\
 												\
     src_type_t *src;										\
     dst_type_t *dst;										\
@@ -251,6 +309,13 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
     }												\
 												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+    {												\
+	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
+					&width, &left_pad, &right_pad);				\
+	vx += left_pad * unit_x;								\
+    }												\
+												\
     while (--height >= 0)									\
     {												\
 	dst = dst_line;										\
@@ -260,10 +325,29 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 	vy += unit_y;										\
 	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
 	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-												\
-	src = src_first_line + src_stride * y;							\
-												\
-	scanline_func (dst, src, width, vx, unit_x, max_vx);					\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (dst, src, left_pad, 0, 0, 0);					\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst + left_pad, src, width, vx, unit_x, 0);			\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (dst + left_pad + width, src + src_image->bits.width - 1,		\
+			        right_pad, 0, 0, 0);						\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    src = src_first_line + src_stride * y;						\
+	    scanline_func (dst, src, width, vx, unit_x, max_vx);				\
+	}											\
     }												\
 }
 
@@ -295,6 +379,17 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
     }
 
+#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
 #define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
     {   PIXMAN_OP_ ## op,						\
 	PIXMAN_ ## s,							\
@@ -307,6 +402,7 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 /* Prefer the use of 'cover' variant, because it is faster */
 #define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
     SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
     SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
 
 #endif
author	Siarhei Siamashka <siarhei.siamashka@nokia.com>	2010-09-16 17:10:40 +0300
committer	Siarhei Siamashka <siarhei.siamashka@nokia.com>	2010-09-21 13:32:11 +0300
commit	45833d5b198507e9e69b918459eaaf6088e5de00 (patch)
tree	2b9d0d4faf7880c48952a05784d6f4de1d111bb8
parent	3db0cc5c75a4a764726059511fa6d67082fbeb64 (diff)