summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Matt Turner <mattst88@gmail.com> 2013-01-02 13:52:43 -0800
committer Matt Turner <mattst88@gmail.com> 2014-03-13 19:45:52 -0700
commit 437ca6d0e29bf39537cf536647ffb828f6919eab (patch)
tree 6f1f8a63d46874308e4bd54b02848b93cd9bf6f1
parent d725d93b9a94ff1c5cb0fe51b050cb36366cd25b (diff)
mmx: Don't unpack+repack when not needed
Nearest:
over_8888_8888 = L1: 225.75 L2: 230.91 M:217.17 ( 11.54%) HT:266.81 VT:212.29 R:184.76 RT: 86.19 ( 752Kops/s)
over_8888_8888 = L1: 235.79 L2: 243.24 M:225.78 ( 11.84%) HT:305.29 VT:242.82 R:210.29 RT: 99.14 ( 818Kops/s)

Bilinear:
over_8888_8888 = L1: 111.66 L2: 112.01 M:108.58 ( 5.69%) HT:118.60 VT:109.76 R: 95.89 RT: 55.55 ( 547Kops/s)
over_8888_8888 = L1: 121.62 L2: 122.41 M:118.91 ( 6.29%) HT:126.99 VT:117.31 R:101.50 RT: 57.56 ( 561Kops/s)
-rw-r--r-- pixman/pixman-mmx.c 35
1 files changed, 27 insertions, 8 deletions
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index c7fd503..57a2223 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -203,6 +203,9 @@ typedef struct
mmxdatafield mmx_mask_3;
#endif
mmxdatafield mmx_full_alpha;
+#ifdef USE_LOONGSON_MMI
+ mmxdatafield mmx_full_alpha_packed;
+#endif
mmxdatafield mmx_4x0101;
mmxdatafield mmx_ff000000;
} mmx_data_t;
@@ -237,6 +240,9 @@ static const mmx_data_t c =
MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
#endif
MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
+#ifdef USE_LOONGSON_MMI
+ MMXDATA_INIT (.mmx_full_alpha_packed, 0x00000000ff000000),
+#endif
MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),
};
@@ -512,6 +518,18 @@ is_opaque (__m64 v)
}
static force_inline pixman_bool_t
+is_opaque_packed (__m64 v)
+{
+#ifdef USE_LOONGSON_MMI
+ return is_equal (_mm_and_si64 (v, MC (full_alpha_packed)),
+ MC (full_alpha_packed));
+#else
+ __m64 ffs = _mm_cmpeq_pi8 (v, v);
+ return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x08);
+#endif
+}
+
+static force_inline pixman_bool_t
is_zero (__m64 v)
{
return is_equal (v, _mm_setzero_si64 ());
@@ -728,19 +746,20 @@ combine (const uint32_t *src, const uint32_t *mask)
static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
- vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
-
- if (is_opaque (vsrc))
+ if (is_opaque_packed (vsrc))
{
return vsrc;
}
else if (!is_zero (vsrc))
{
- return over (vsrc, expand_alpha (vsrc),
- _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
+ vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
+
+ return pack8888 (over (vsrc, expand_alpha (vsrc),
+ _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ())),
+ _mm_setzero_si64 ());
}
- return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
+ return vdst;
}
static void
@@ -3575,7 +3594,7 @@ scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t* pd,
while (vx >= 0)
vx -= src_width_fixed;
- store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
+ store (pd, core_combine_over_u_pixel_mmx (s, d));
pd++;
w--;
@@ -3763,7 +3782,7 @@ scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst,
if (!is_zero (pix1))
{
pix2 = load (dst);
- store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
+ store (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
}
w--;