Add specialized fast nearest scalers

This is a macroized version of SRC/OVER repeat normal/unneeded nearest neighbour scaling instantiated for some common 8888 and 565 formats. Based on work by Siarhei Siamashka
author: Alexander Larsson <alexl@redhat.com> 2010-03-12 15:45:04 +0100
committer: Søren Sandmann Pedersen <ssp@redhat.com> 2010-03-17 11:03:05 -0400
commit: 3b92b711d031a7752e06d0a5f688f4c54f50a1e6 (patch)
tree: e00d5fd8417d1c3f82228b7a6d9d31080a162048
parent: 5750408e48259f42373a5233231104d9bd3eb35a (diff)
1 files changed, 243 insertions, 0 deletions
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 4d26b0f..6607a47 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -27,6 +27,7 @@
 #include <config.h>
 #endif
 #include <string.h>
+#include <stdlib.h>
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 
@@ -1373,6 +1374,208 @@ repeat (pixman_repeat_t repeat, int *c, int size)
     return TRUE;
 }
 
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, while other processors can hide
+ * instruction latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+/* GET_0565_ALPHA is never reached at run time (no OVER scaler is instantiated
+ * with a 565 source), but FAST_NEAREST pastes the name, so it must exist. */
+#define GET_0565_ALPHA(s) 0xff
+/* Expands to the static function fast_composite_scaled_nearest_<scale_func_name>_<OP>; do_repeat selects NORMAL-repeat wrapping. */
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,					\
+		     src_type_t, dst_type_t, OP, do_repeat)					\
+static void											\
+fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementation_t *imp,	\
+							      pixman_op_t              op,      \
+							      pixman_image_t *         src_image, \
+							      pixman_image_t *         mask_image, \
+							      pixman_image_t *         dst_image, \
+							      int32_t                  src_x,   \
+							      int32_t                  src_y,   \
+							      int32_t                  mask_x,  \
+							      int32_t                  mask_y,  \
+							      int32_t                  dst_x,   \
+							      int32_t                  dst_y,   \
+							      int32_t                  width,   \
+							      int32_t                  height)  \
+{												\
+    dst_type_t *dst_line;									\
+    src_type_t *src_first_line;									\
+    uint32_t   d;										\
+    src_type_t s1, s2;										\
+    uint8_t   a1, a2;										\
+    int       w;										\
+    int       x1, x2, y;									\
+    pixman_fixed_t orig_vx;									\
+    pixman_fixed_t max_vx, max_vy;								\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    /* imp, op and the mask_* arguments are not referenced in this body. */			\
+    src_type_t *src;										\
+    dst_type_t *dst;										\
+    int       src_stride, dst_stride;								\
+    /* OP is pasted from the macro argument; only SRC and OVER instantiations get past this. */	\
+    if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
+	abort();										\
+												\
+    PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1);	\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+    /* Per-pixel steps come from the transform's diagonal; no other matrix entry is read here. */	\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
+    v.vector[0] -= pixman_fixed_e;								\
+    v.vector[1] -= pixman_fixed_e;								\
+												\
+    vx = v.vector[0];										\
+    vy = v.vector[1];										\
+    /* max_vx/max_vy are assigned only in this branch and read only under do_repeat. */		\
+    if (do_repeat)										\
+    {												\
+	/* Clamp repeating positions inside the actual samples */				\
+	max_vx = src_image->bits.width << 16;							\
+	max_vy = src_image->bits.height << 16;							\
+												\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+    }												\
+												\
+    orig_vx = vx;										\
+    /* One destination scanline per iteration. */						\
+    while (--height >= 0)									\
+    {												\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	/* Pick the source row for this scanline, then step vy for the next one. */		\
+	y = vy >> 16;										\
+	vy += unit_y;										\
+	if (do_repeat)										\
+	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+												\
+	src = src_first_line + src_stride * y;							\
+	/* Main loop: two destination pixels per iteration, restarting from the row's first x. */	\
+	w = width;										\
+	vx = orig_vx;										\
+	while ((w -= 2) >= 0)									\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (do_repeat)									\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);					\
+	    s1 = src[x1];									\
+												\
+	    x2 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (do_repeat)									\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);					\
+	    s2 = src[x2];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
+		/* alpha 0xff: store converted source; s == 0: leave dst; else combine using alpha ^ 0xff. */ \
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+												\
+		if (a2 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+		}										\
+		else if (s2)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    a2 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		    *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		    *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+	    }											\
+	}											\
+	/* For non-negative width, w is now -1 (odd) or -2 (even); bit 0 flags a leftover pixel. */ \
+	if (w & 1)										\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (do_repeat)									\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);					\
+	    s1 = src[x1];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		    *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+	    }											\
+	}											\
+    }												\
+}
+/* One scaler per (format pair, repeat mode, operator). No ';' follows: each expansion is already a complete function definition. */
+FAST_NEAREST(x888_x888_none, 8888, 8888, uint32_t, uint32_t, SRC, /*repeat: */ 0)
+FAST_NEAREST(x888_x888_normal, 8888, 8888, uint32_t, uint32_t, SRC, /*repeat: */ 1)
+FAST_NEAREST(x888_x888_none, 8888, 8888, uint32_t, uint32_t, OVER, /*repeat: */ 0)
+FAST_NEAREST(x888_x888_normal, 8888, 8888, uint32_t, uint32_t, OVER, /*repeat: */ 1)
+FAST_NEAREST(x888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, /*repeat: */ 0)
+FAST_NEAREST(x888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, /*repeat: */ 1)
+FAST_NEAREST(565_565_none, 0565, 0565, uint16_t, uint16_t, SRC, /*repeat: */ 0)
+FAST_NEAREST(565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, /*repeat: */ 1)
+FAST_NEAREST(8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, /*repeat: */ 0)
+FAST_NEAREST(8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, /*repeat: */ 1)
+
 static force_inline uint32_t
 fetch_nearest (pixman_repeat_t src_repeat,
 	       pixman_format_code_t format,
@@ -1595,6 +1798,46 @@ static const pixman_fast_path_t c_fast_paths[] =
      FAST_PATH_NO_ACCESSORS	|					\
      FAST_PATH_NO_WIDE_FORMAT)
 
+#define HAS_NORMAL_REPEAT_FLAGS	/* REFLECT, PAD and NONE all excluded -- presumably leaving only NORMAL */	\
+    (FAST_PATH_NO_REFLECT_REPEAT |					\
+     FAST_PATH_NO_PAD_REPEAT     |					\
+     FAST_PATH_NO_NONE_REPEAT)
+
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)	/* expands to two table entries */	\
+    {   PIXMAN_OP_ ## op,	/* entry 1: needs HAS_NORMAL_REPEAT_FLAGS, calls the _normal scaler */	\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | HAS_NORMAL_REPEAT_FLAGS | FAST_PATH_16BIT_SAFE,	\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    },									\
+    {   PIXMAN_OP_ ## op,	/* entry 2: needs FAST_PATH_SAMPLES_COVER_CLIP, calls the _none scaler */	\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }	/* no trailing comma: each use in the table supplies its own */
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, x888_x888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, x888_x888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, x888_565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, x888_565),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, x888_x888),
+    /* The scalers never reorder channels, so an r5g6b5 destination needs an RGB-ordered (a8r8g8b8) source. */
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+
 #define NEAREST_FAST_PATH(op,s,d)		\
     {   PIXMAN_OP_ ## op,			\
 	PIXMAN_ ## s, SCALED_NEAREST_FLAGS,	\
author	Alexander Larsson <alexl@redhat.com>	2010-03-12 15:45:04 +0100
committer	Søren Sandmann Pedersen <ssp@redhat.com>	2010-03-17 11:03:05 -0400
commit	3b92b711d031a7752e06d0a5f688f4c54f50a1e6 (patch)
tree	e00d5fd8417d1c3f82228b7a6d9d31080a162048
parent	5750408e48259f42373a5233231104d9bd3eb35a (diff)