summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Turner <mattst88@gmail.com>2019-04-01 13:35:20 -0700
committerMatt Turner <mattst88@gmail.com>2019-04-01 15:04:36 -0700
commite00154be111863e194977206c797eff7fdd40943 (patch)
treea728015c900f14deae4e52cb47374737285b34e7
parent01da2e415aaecc2518314fb3ef4d5bfe04a15abe (diff)
wip
-rw-r--r--configure.ac2
-rw-r--r--pixman/pixman-avx2.c68
2 files changed, 47 insertions, 23 deletions
diff --git a/configure.ac b/configure.ac
index 9fb8e64..ee0c84c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,7 +54,7 @@ AC_PREREQ([2.57])
m4_define([pixman_major], 0)
m4_define([pixman_minor], 38)
-m4_define([pixman_micro], 0)
+m4_define([pixman_micro], 1)
m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
diff --git a/pixman/pixman-avx2.c b/pixman/pixman-avx2.c
index d50f78d..257f1de 100644
--- a/pixman/pixman-avx2.c
+++ b/pixman/pixman-avx2.c
@@ -85,11 +85,24 @@ avx2_composite_add_8888_8888 (pixman_implementation_t *imp,
}
}
+#define eqmask(a, b) (_mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xffff)
+
#if BILINEAR_INTERPOLATION_BITS < 8
#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
-#define BILINEAR_DECLARE_VARIABLES \
+#define BILINEAR_DECLARE_YMM_VARIABLES \
+ const __m256i ymm_wtb = _mm256_set_epi8 (wt, wb, wt, wb, wt, wb, wt, wb, \
+ wt, wb, wt, wb, wt, wb, wt, wb, \
+ wt, wb, wt, wb, wt, wb, wt, wb, \
+ wt, wb, wt, wb, wt, wb, wt, wb); \
+ const __m256i ymm_xorc7 = _mm256_set1_epi32 (BMSK); \
+ const __m256i ymm_addc7 = _mm256_set1_epi32 (1); \
+ const __m256i ymm_ux = _mm256_set1_epi16 (2 * unit_x); \
+ __m256i ymm_x = _mm256_set_m128i (_mm_set1_epi16 (vx + unit_x), \
+ _mm_set1_epi16 (vx))
+
+#define BILINEAR_DECLARE_XMM_VARIABLES \
const __m128i xmm_wtb = _mm_set_epi8 (wt, wb, wt, wb, wt, wb, wt, wb, \
wt, wb, wt, wb, wt, wb, wt, wb); \
const __m128i xmm_xorc7 = _mm_set1_epi32 (BMSK); \
@@ -122,11 +135,35 @@ do { \
pix = _mm_cvtsi128_si32 (a); \
} while (0)
-#define BILINEAR_SKIP_ONE_PIXEL() \
+#define BILINEAR_INTERPOLATE_TWO_PIXELS(pix1, pix2) \
do { \
+ __m256i ymm_wh, a; \
+ /* fetch two 2x2 pixel blocks into avx2 registers */ \
+ __m256i mask = _mm256_set_epi64x (0, 1ULL << 63, 0, 1ULL << 63); \
+ __m128i vindex = _mm_set_epi32 (0, \
+ pixman_fixed_to_int (vx + unit_x), \
+ 0, \
+ pixman_fixed_to_int (vx)); \
+ __m256i tltr = _mm256_mask_i32gather_epi64 (_mm256_undefined_si256 (), (const long long *)src_top, vindex, mask, 4); \
+ __m256i blbr = _mm256_mask_i32gather_epi64 (_mm256_undefined_si256 (), (const long long *)src_bottom, vindex, mask, 4); \
vx += unit_x; \
- xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
-} while(0)
+ vx += unit_x; \
+ /* vertical interpolation */ \
+ a = _mm256_maddubs_epi16 (_mm256_unpacklo_epi8 (blbr, tltr), ymm_wtb); \
+ /* calculate horizontal weights */ \
+ ymm_wh = _mm256_add_epi16 (ymm_addc7, _mm256_xor_si256 (ymm_xorc7, \
+ _mm256_srli_epi16 (ymm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
+ ymm_x = _mm256_add_epi16 (ymm_x, ymm_ux); \
+ /* horizontal interpolation */ \
+ a = _mm256_madd_epi16 (_mm256_unpackhi_epi16 (_mm256_shuffle_epi32 ( \
+ a, _MM_SHUFFLE (1, 0, 3, 2)), a), ymm_wh); \
+ /* shift and pack the result */ \
+ a = _mm256_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \
+ a = _mm256_packs_epi32 (a, a); \
+ a = _mm256_packus_epi16 (a, a); \
+ pix1 = _mm256_extract_epi32(a, 0); \
+ pix2 = _mm256_extract_epi32(a, 4); \
+} while (0)
static force_inline void
scaled_bilinear_scanline_avx2_8888_8888_SRC (uint32_t * dst,
@@ -141,35 +178,22 @@ scaled_bilinear_scanline_avx2_8888_8888_SRC (uint32_t * dst,
pixman_fixed_t max_vx,
pixman_bool_t zero_src)
{
- BILINEAR_DECLARE_VARIABLES;
- uint32_t pix1, pix2, pix3, pix4;
+ uint32_t pix1, pix2;
- while ((w -= 4) >= 0)
+ while ((w -= 2) >= 0)
{
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
- *dst++ = pix1;
- *dst++ = pix2;
- *dst++ = pix3;
- *dst++ = pix4;
- }
-
- if (w & 2)
- {
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+ BILINEAR_DECLARE_YMM_VARIABLES;
+ BILINEAR_INTERPOLATE_TWO_PIXELS (pix1, pix2);
*dst++ = pix1;
*dst++ = pix2;
}
if (w & 1)
{
+ BILINEAR_DECLARE_XMM_VARIABLES;
BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
*dst = pix1;
}
-
}
FAST_BILINEAR_MAINLOOP_COMMON (avx2_8888_8888_cover_SRC,