summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Turner <mattst88@gmail.com>2012-04-15 13:35:07 -0400
committerMatt Turner <mattst88@gmail.com>2012-04-15 13:54:19 -0400
commit6645d05eb1c2be39a2e03681fc15bcdeefa01de3 (patch)
treea21ee6dfb1b309ba4ea990b4a2ea2f43b5bde33f
parent2b482911d0b58b13519a3de008fce1c9aaaf12ee (diff)
mmx: optimize over_n_8_0565
Load mask into vector register and improve instruction-level parallelism by calculating independent intermediate results together.

Loongson:
  before: over_n_8_0565 = L1: 14.35 L2: 14.26 M: 12.00 ( 16.00%) HT: 11.30 VT: 10.78 R: 10.34 RT: 7.12 ( 75Kops/s)
  after:  over_n_8_0565 = L1: 23.13 L2: 23.35 M: 18.03 ( 24.28%) HT: 16.06 VT: 15.52 R: 14.08 RT: 8.29 ( 83Kops/s)
-rw-r--r--  pixman/pixman-mmx.c | 46
1 file changed, 23 insertions(+), 23 deletions(-)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 08eb25d..d526d8a 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -2246,35 +2246,35 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
while (w >= 4)
{
- uint64_t m0, m1, m2, m3;
- m0 = *mask;
- m1 = *(mask + 1);
- m2 = *(mask + 2);
- m3 = *(mask + 3);
+ __m64 vmask = load8888u ((uint32_t *)mask);
- if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
+ if (srca == 0xff && is_equal (vmask, MC (4x00ff)))
{
*(__m64 *)dst = srcsrcsrcsrc;
}
- else if (m0 | m1 | m2 | m3)
+ else if (!is_zero (vmask))
{
- __m64 vdest;
- __m64 vm0, vm1, vm2, vm3;
-
- vdest = *(__m64 *)dst;
+ __m64 vdest = *(__m64 *)dst;
- vm0 = to_m64 (m0);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
- expand565 (vdest, 0)), vdest, 0);
- vm1 = to_m64 (m1);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
- expand565 (vdest, 1)), vdest, 1);
- vm2 = to_m64 (m2);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
- expand565 (vdest, 2)), vdest, 2);
- vm3 = to_m64 (m3);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
- expand565 (vdest, 3)), vdest, 3);
+ const __m64 vm0 = _mm_shuffle_pi16 (vmask, _MM_SHUFFLE (0, 0, 0, 0));
+ const __m64 vm1 = _mm_shuffle_pi16 (vmask, _MM_SHUFFLE (1, 1, 1, 1));
+ const __m64 vm2 = _mm_shuffle_pi16 (vmask, _MM_SHUFFLE (2, 2, 2, 2));
+ const __m64 vm3 = _mm_shuffle_pi16 (vmask, _MM_SHUFFLE (3, 3, 3, 3));
+
+ const __m64 vd0 = expand565 (vdest, 0);
+ const __m64 vd1 = expand565 (vdest, 1);
+ const __m64 vd2 = expand565 (vdest, 2);
+ const __m64 vd3 = expand565 (vdest, 3);
+
+ const __m64 io0 = in_over (vsrc, vsrca, vm0, vd0);
+ const __m64 io1 = in_over (vsrc, vsrca, vm1, vd1);
+ const __m64 io2 = in_over (vsrc, vsrca, vm2, vd2);
+ const __m64 io3 = in_over (vsrc, vsrca, vm3, vd3);
+
+ vdest = pack_565 (io0, vdest, 0);
+ vdest = pack_565 (io1, vdest, 1);
+ vdest = pack_565 (io2, vdest, 2);
+ vdest = pack_565 (io3, vdest, 3);
*(__m64 *)dst = vdest;
}