diff options
author | Matt Turner <mattst88@gmail.com> | 2012-04-15 13:35:07 -0400 |
---|---|---|
committer | Matt Turner <mattst88@gmail.com> | 2012-04-15 13:54:19 -0400 |
commit | 6645d05eb1c2be39a2e03681fc15bcdeefa01de3 (patch) | |
tree | a21ee6dfb1b309ba4ea990b4a2ea2f43b5bde33f | |
parent | 2b482911d0b58b13519a3de008fce1c9aaaf12ee (diff) |
mmx: optimize over_n_8_0565
Load mask into vector register and improve instruction-level parallelism
by calculating independent intermediate results together.
Loongson:
before: over_n_8_0565 = L1: 14.35 L2: 14.26 M: 12.00 ( 16.00%) HT: 11.30 VT: 10.78 R: 10.34 RT: 7.12 ( 75Kops/s)
after:  over_n_8_0565 = L1: 23.13 L2: 23.35 M: 18.03 ( 24.28%) HT: 16.06 VT: 15.52 R: 14.08 RT: 8.29 ( 83Kops/s)
-rw-r--r-- | pixman/pixman-mmx.c | 46 |
1 file changed, 23 insertions, 23 deletions
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index 08eb25d..d526d8a 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -2246,35 +2246,35 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, while (w >= 4) { - uint64_t m0, m1, m2, m3; - m0 = *mask; - m1 = *(mask + 1); - m2 = *(mask + 2); - m3 = *(mask + 3); + __m64 vmask = load8888u ((uint32_t *)mask); - if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) + if (srca == 0xff && is_equal (vmask, MC (4x00ff))) { *(__m64 *)dst = srcsrcsrcsrc; } - else if (m0 | m1 | m2 | m3) + else if (!is_zero (vmask)) { - __m64 vdest; - __m64 vm0, vm1, vm2, vm3; - - vdest = *(__m64 *)dst; + __m64 vdest = *(__m64 *)dst; - vm0 = to_m64 (m0); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0), - expand565 (vdest, 0)), vdest, 0); - vm1 = to_m64 (m1); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1), - expand565 (vdest, 1)), vdest, 1); - vm2 = to_m64 (m2); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2), - expand565 (vdest, 2)), vdest, 2); - vm3 = to_m64 (m3); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3), - expand565 (vdest, 3)), vdest, 3); + const __m64 vm0 = _mm_shuffle_pi16 (vmask, _MM_SHUFFLE (0, 0, 0, 0)); + const __m64 vm1 = _mm_shuffle_pi16 (vmask, _MM_SHUFFLE (1, 1, 1, 1)); + const __m64 vm2 = _mm_shuffle_pi16 (vmask, _MM_SHUFFLE (2, 2, 2, 2)); + const __m64 vm3 = _mm_shuffle_pi16 (vmask, _MM_SHUFFLE (3, 3, 3, 3)); + + const __m64 vd0 = expand565 (vdest, 0); + const __m64 vd1 = expand565 (vdest, 1); + const __m64 vd2 = expand565 (vdest, 2); + const __m64 vd3 = expand565 (vdest, 3); + + const __m64 io0 = in_over (vsrc, vsrca, vm0, vd0); + const __m64 io1 = in_over (vsrc, vsrca, vm1, vd1); + const __m64 io2 = in_over (vsrc, vsrca, vm2, vd2); + const __m64 io3 = in_over (vsrc, vsrca, vm3, vd3); + + vdest = pack_565 (io0, vdest, 0); + vdest = pack_565 (io1, vdest, 1); + vdest = pack_565 (io2, vdest, 2); + vdest = pack_565 (io3, vdest, 3); 
*(__m64 *)dst = vdest; } |