From 8d628e413fcbf79568ff0bfa065617ab991a0012 Mon Sep 17 00:00:00 2001 From: Taekyun Kim Date: Mon, 29 Aug 2011 19:03:55 +0900 Subject: Simple repeat: Extend too short source scanline into temporary buffer Too-short scanlines can cause repeat-handling overhead, and since optimized pixman composite functions usually process a bunch of pixels in a single loop iteration, it might be beneficial to pre-extend source scanlines. The temporary buffers will usually reside in cache, so accessing them should be quite efficient. --- pixman/pixman-arm-neon.c | 46 +++++++++++++++++++------------------- pixman/pixman-arm-simd.c | 6 ++--- pixman/pixman-fast-path.c | 22 +++++++++---------- pixman/pixman-inlines.h | 56 +++++++++++++++++++++++++++++++++++++++++++---- pixman/pixman-sse2.c | 26 +++++++++++----------- 5 files changed, 102 insertions(+), 54 deletions(-) diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 5d5c754..c17e939 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -263,29 +263,29 @@ pixman_blt_neon (uint32_t *src_bits, } /* Simple repeat fast path functions. 
*/ -FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_src_0565_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_src_8888_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_src_0565_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_src_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_src_x888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_n_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_n_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_over_0565_n_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_8_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_8_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_over_0565_8_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, neon_composite_add_8_8_8) -FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_add_0565_8_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_add_8888_8_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_add_8888_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_add_8888_n_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, neon_composite_add_8_8) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_add_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, neon_composite_out_reverse_8_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, neon_composite_out_reverse_8_8888) +FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_src_0565_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_src_8888_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_src_0565_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_src_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT 
(uint32_t, neon_composite_src_x888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_n_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_n_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_over_0565_n_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_8_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_8_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_over_0565_8_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_over_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, neon_composite_add_8_8_8, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, neon_composite_add_0565_8_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_add_8888_8_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_add_8888_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_add_8888_n_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, neon_composite_add_8_8, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, neon_composite_add_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, neon_composite_out_reverse_8_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, neon_composite_out_reverse_8_8888, 32) static const pixman_fast_path_t arm_neon_fast_paths[] = { diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c index 59c00b1..7a4d899 100644 --- a/pixman/pixman-arm-simd.c +++ b/pixman/pixman-arm-simd.c @@ -393,9 +393,9 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC, uint32_t, uint32_t) /* Simple repeat fast path functions. 
*/ -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, armv6_composite_over_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, armv6_composite_over_8888_n_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, armv6_composite_add_8_8) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, armv6_composite_over_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, armv6_composite_over_8888_n_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, armv6_composite_add_8_8, 32) static const pixman_fast_path_t arm_simd_fast_paths[] = { diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c index 61597a7..351098e 100644 --- a/pixman/pixman-fast-path.c +++ b/pixman/pixman-fast-path.c @@ -1686,17 +1686,17 @@ FAST_SIMPLE_ROTATE (565, uint16_t) FAST_SIMPLE_ROTATE (8888, uint32_t) /* Simple repeat fast path functions. */ -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_over_x888_8_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_over_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_over_8888_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_add_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, fast_composite_add_8_8) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_src_x888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_src_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, fast_composite_src_0565_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, fast_composite_src_8_8) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_src_x888_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, fast_composite_in_8_8) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_over_x888_8_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_over_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_over_8888_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_add_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, fast_composite_add_8_8, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, 
fast_composite_src_x888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_src_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, fast_composite_src_0565_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, fast_composite_src_8_8, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, fast_composite_src_x888_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, fast_composite_in_8_8, 32) static const pixman_fast_path_t c_fast_paths[] = { diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h index fef8fbe..0513f44 100644 --- a/pixman/pixman-inlines.h +++ b/pixman/pixman-inlines.h @@ -233,7 +233,7 @@ pad_repeat_get_scanline_bounds (int32_t source_image_width, * We can stitch source scanlines horizontally to produce best memory access patterns. * By giving height of 1 to composite functions, we can use them as scanline functions. */ -#define FAST_COMPOSITE_SIMPLE_REPEAT(src_type, composite_func) \ +#define FAST_COMPOSITE_SIMPLE_REPEAT(src_type, composite_func, REPEAT_MIN_WIDTH) \ static void \ fast_composite_simple_repeat_##composite_func (pixman_implementation_t *imp, \ pixman_composite_info_t *info) \ @@ -243,6 +243,36 @@ fast_composite_simple_repeat_##composite_func (pixman_implementation_t *imp, \ int32_t width_remain; \ int32_t num_pixels; \ pixman_composite_info_t info2 = *info; \ + int32_t src_width; \ + int32_t i, j; \ + pixman_image_t extended_src_image; \ + src_type extended_src[REPEAT_MIN_WIDTH*2]; \ + pixman_bool_t need_src_extension; \ + src_type *src_line; \ + int32_t src_stride; \ + \ + if (src_image->bits.width < REPEAT_MIN_WIDTH) \ + { \ + sx = src_x; \ + repeat (PIXMAN_REPEAT_NORMAL, &sx, src_image->bits.width); \ + sx += width; \ + src_width = 0; \ + \ + while (src_width < REPEAT_MIN_WIDTH && src_width <= sx) \ + src_width += src_image->bits.width; \ + \ + need_src_extension = TRUE; \ + extended_src_image.bits = src_image->bits; \ + extended_src_image.bits.bits = (uint32_t*)(&extended_src[0]); \ + extended_src_image.bits.width = 
src_width; \ + extended_src_image.bits.height = 1; \ + info2.src_image = &extended_src_image; \ + } \ + else \ + { \ + src_width = src_image->bits.width; \ + need_src_extension = FALSE; \ + } \ \ sx = src_x; \ sy = src_y; \ @@ -250,19 +280,37 @@ fast_composite_simple_repeat_##composite_func (pixman_implementation_t *imp, \ while (--height >= 0) \ { \ repeat (PIXMAN_REPEAT_NORMAL, &sy, src_image->bits.height); \ - repeat (PIXMAN_REPEAT_NORMAL, &sx, src_image->bits.width); \ + repeat (PIXMAN_REPEAT_NORMAL, &sx, src_width); \ + \ + PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, src_type, src_stride, src_line, 1); \ + \ + if (need_src_extension) \ + { \ + for (i=0; i<src_width; ) \ + { \ + for (j=0; j<src_image->bits.width; j++, i++) \ + { \ + extended_src[i] = src_line[j]; \ + } \ + } \ + \ + info2.src_y = 0; \ + } \ + else \ + { \ + info2.src_y = sy; \ + } \ \ width_remain = width; \ \ while (width_remain > 0) \ { \ - num_pixels = src_image->bits.width - sx; \ + num_pixels = src_width - sx; \ \ if (num_pixels > width_remain) \ num_pixels = width_remain; \ \ info2.src_x = sx; \ - info2.src_y = sy; \ info2.width = num_pixels; \ info2.height = 1; \ \ diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index 022ed88..4a57abf 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -5414,19 +5414,19 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, NORMAL, FLAG_NONE) /* Tiled repeat fast path functions. 
*/ -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_8_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_x888_8_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_x888_n_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_n_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_src_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, sse2_composite_src_0565_0565) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_src_x888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, sse2_composite_add_8_8) -FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_add_8888_8888) -FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, sse2_composite_in_8_8) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_8_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_x888_8_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_x888_n_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_over_8888_n_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_src_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint16_t, sse2_composite_src_0565_0565, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_src_x888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, sse2_composite_add_8_8, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint32_t, sse2_composite_add_8888_8888, 32) +FAST_COMPOSITE_SIMPLE_REPEAT (uint8_t, sse2_composite_in_8_8, 32) static const pixman_fast_path_t sse2_fast_paths[] = { -- cgit v1.2.3