From d67c0b883daeeaacf3f21f1ddbdcf9ecf94fac43 Mon Sep 17 00:00:00 2001 From: Taekyun Kim Date: Wed, 7 Sep 2011 22:51:46 +0900 Subject: sse2: Macros for assembling bilinear interpolation code fractions Primitive bilinear interpolation code is reusable to implement other bilinear functions. BILINEAR_DECLARE_VARIABLES - Declare variables needed to interpolate src pixels. BILINEAR_INTERPOLATE_ONE_PIXEL - Interpolate one pixel and advance to next pixel BILINEAR_SKIP_ONE_PIXEL - Skip interpolation and just advance to next pixel This is useful for skipping zero mask --- pixman/pixman-sse2.c | 157 +++++++++++++++++++++++++-------------------------- 1 file changed, 77 insertions(+), 80 deletions(-) diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index 6689c53..0bfd26b 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -5287,83 +5287,53 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, scaled_nearest_scanline_sse2_8888_n_8888_OVER, uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) -static void -bilinear_interpolate_line_sse2 (uint32_t * out, - const uint32_t * top, - const uint32_t * bottom, - int wt, - int wb, - pixman_fixed_t x, - pixman_fixed_t ux, - int width) -{ - const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); - const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); - const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff); - const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); - const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux); - const __m128i xmm_zero = _mm_setzero_si128 (); - __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x); - uint32_t pix1, pix2, pix3, pix4; - - #define INTERPOLATE_ONE_PIXEL(pix) \ - do { \ - __m128i xmm_wh, xmm_lo, xmm_hi, a; \ - /* fetch 2x2 pixel block into sse2 register */ \ - uint32_t tl = top [pixman_fixed_to_int (x)]; \ - uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \ - uint32_t bl = bottom [pixman_fixed_to_int (x)]; \ - uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \ - a = _mm_set_epi32 (tr, tl, br, bl); \ - x += ux; \ - /* vertical interpolation */ \ - a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \ - xmm_wt), \ - _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \ - xmm_wb)); \ - /* calculate horizontal weights */ \ - xmm_wh = _mm_add_epi16 (xmm_addc, \ - _mm_xor_si128 (xmm_xorc, \ - _mm_srli_epi16 (xmm_x, 8))); \ - xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ - /* horizontal interpolation */ \ - xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ - xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ - a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ - _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ - /* shift and pack the result */ \ - a = _mm_srli_epi32 (a, 16); \ - a = _mm_packs_epi32 (a, a); \ - a = _mm_packus_epi16 (a, a); \ - pix = _mm_cvtsi128_si32 (a); \ - } while (0) - - while ((width -= 4) >= 0) - { - INTERPOLATE_ONE_PIXEL (pix1); - INTERPOLATE_ONE_PIXEL (pix2); - INTERPOLATE_ONE_PIXEL (pix3); - INTERPOLATE_ONE_PIXEL (pix4); - *out++ = pix1; - *out++ = pix2; - *out++ = pix3; - *out++ = pix4; - } - if (width & 2) - { - INTERPOLATE_ONE_PIXEL (pix1); - INTERPOLATE_ONE_PIXEL (pix2); - *out++ = pix1; - *out++ = pix2; - } - if (width & 1) - { - INTERPOLATE_ONE_PIXEL (pix1); - *out = pix1; - } - - #undef INTERPOLATE_ONE_PIXEL -} +#define BILINEAR_DECLARE_VARIABLES \ + const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ + const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ + const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\ + const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ + const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \ + unit_x, unit_x, unit_x, unit_x); \ + const __m128i xmm_zero = _mm_setzero_si128 (); \ + __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx) + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ +do { \ + __m128i xmm_wh, xmm_lo, xmm_hi, a; \ + /* fetch 2x2 pixel block into sse2 register */ \ + uint32_t tl = src_top [pixman_fixed_to_int (vx)]; \ + uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1]; \ + uint32_t bl = src_bottom [pixman_fixed_to_int (vx)]; \ + uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1]; \ + a = _mm_set_epi32 (tr, tl, br, bl); \ + vx += unit_x; \ + /* vertical interpolation */ \ + a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \ + xmm_wt), \ + _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \ + xmm_wb)); \ + /* calculate horizontal weights */ \ + xmm_wh = _mm_add_epi16 (xmm_addc, \ + _mm_xor_si128 (xmm_xorc, \ + _mm_srli_epi16 (xmm_x, 8))); \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ + /* horizontal interpolation */ \ + xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ + xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ + a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ + _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ + /* shift and pack the result */ \ + a = _mm_srli_epi32 (a, 16); \ + a = _mm_packs_epi32 (a, a); \ + a = _mm_packus_epi16 (a, a); \ + pix = _mm_cvtsi128_si32 (a); \ +} while (0) + +#define BILINEAR_SKIP_ONE_PIXEL() \ +do { \ + vx += unit_x; \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ +} while(0) static force_inline void scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, @@ -5378,8 +5348,35 @@ scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, pixman_fixed_t max_vx, pixman_bool_t zero_src) { - bilinear_interpolate_line_sse2 (dst, src_top, src_bottom, - wt, wb, vx, unit_x, w); + BILINEAR_DECLARE_VARIABLES; + uint32_t pix1, pix2, pix3, pix4; + + while ((w -= 4) >= 0) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); + *dst++ = pix1; + *dst++ = pix2; + *dst++ = pix3; + *dst++ = pix4; + } + + if (w & 2) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + *dst++ = pix1; + *dst++ = pix2; + } + + if (w & 1) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + *dst = pix1; + } + } FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, @@ -5399,7 +5396,6 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, uint32_t, uint32_t, uint32_t, NORMAL, FLAG_NONE) - static const pixman_fast_path_t sse2_fast_paths[] = { /* PIXMAN_OP_OVER */ @@ -5508,6 +5504,7 @@ static const pixman_fast_path_t sse2_fast_paths[] = { PIXMAN_OP_NONE }, }; + static pixman_bool_t sse2_blt (pixman_implementation_t *imp, uint32_t * src_bits, -- cgit v1.2.3