author    | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-09-14 22:52:29 +0300
committer | Joonas Pihlaja <jpihlaja@cc.helsinki.fi> | 2009-09-15 00:35:14 +0300
commit    | ec7c1affcc66c12af1fc29fd000f9885a5d48320 (patch)
tree      | e07e434adf36f6f9adac1f9437f2cbccf17c2392
parent    | 1b5269a585d8dcdb8f5ff9f71113bcf2d5efab26 (diff)
Don't prefetch from NULL in the SSE2 fast paths.
On an Athlon64 box, prefetching from NULL slows down
the rgba OVER rgba fast path for predominantly solid sources
by up to 3.5x in the one-rounded-rectangle test case
when run using a tiling polygon renderer. This patch
conditionalises the prefetches of the mask everywhere
where the mask pointer may be NULL in a fast path.
-rw-r--r-- | pixman/pixman-sse2.c | 85
1 file changed, 49 insertions(+), 36 deletions(-)
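The whole patch boils down to guarding the mask prefetch behind a NULL check. As a standalone illustration of the pattern (a minimal sketch in plain C with SSE intrinsics, not the pixman sources: pixman's `force_inline`, `cache_prefetch` and combiner helpers are replaced here by direct `_mm_prefetch` calls and a hypothetical `combine_row`):

```c
#include <stdint.h>
#include <emmintrin.h>   /* SSE2, __m128i */
#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_T0 */

/* Prefetching through a NULL pointer never faults, but on some CPUs
 * (e.g. the Athlon64 mentioned above) it is very slow, so only issue
 * the hint when there is actually a mask buffer to read. */
static inline void
maybe_prefetch (const __m128i *addr)
{
    if (addr)
        _mm_prefetch ((const char *) addr, _MM_HINT_T0);
}

/* Example call site: the mask pointer pm may be NULL for maskless
 * operations; the source and destination pointers never are. */
static void
combine_row (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    _mm_prefetch ((const char *) ps, _MM_HINT_T0);
    _mm_prefetch ((const char *) pd, _MM_HINT_T0);
    maybe_prefetch ((const __m128i *) pm);

    /* ... per-pixel compositing work ... */
    (void) w;
}
```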
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index fc787658..56fda4d8 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -368,6 +368,22 @@ cache_prefetch_next (__m128i* addr)
     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
 }
 
+/* prefetching NULL is very slow on some systems. don't do that. */
+
+static force_inline void
+maybe_prefetch (__m128i* addr)
+{
+    if (addr)
+        cache_prefetch (addr);
+}
+
+static force_inline void
+maybe_prefetch_next (__m128i* addr)
+{
+    if (addr)
+        cache_prefetch_next (addr);
+}
+
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -629,8 +645,7 @@ core_combine_over_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    if (pm)
-        cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
@@ -648,16 +663,14 @@ core_combine_over_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    if (pm)
-        cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        if (pm)
-            cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         /* I'm loading unaligned because I'm not sure about
          * the address alignment.
@@ -723,7 +736,7 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     /* Align dst on a 16-byte boundary */
     while (w &&
@@ -742,14 +755,14 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         /* I'm loading unaligned because I'm not sure
          * about the address alignment.
@@ -825,7 +838,7 @@ core_combine_in_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -842,14 +855,14 @@ core_combine_in_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
@@ -899,7 +912,7 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -916,14 +929,14 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
@@ -968,7 +981,7 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -989,7 +1002,7 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
@@ -999,7 +1012,7 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1050,7 +1063,7 @@ core_combine_out_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1070,7 +1083,7 @@ core_combine_out_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
@@ -1080,7 +1093,7 @@ core_combine_out_u_sse2 (uint32_t* pd,
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1150,7 +1163,7 @@ core_combine_atop_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1167,14 +1180,14 @@ core_combine_atop_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1247,7 +1260,7 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1264,14 +1277,14 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1348,7 +1361,7 @@ core_combine_xor_u_sse2 (uint32_t* dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1365,14 +1378,14 @@ core_combine_xor_u_sse2 (uint32_t* dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
         xmm_dst = load_128_aligned ((__m128i*) pd);
@@ -1433,7 +1446,7 @@ core_combine_add_u_sse2 (uint32_t* dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && (unsigned long)pd & 15)
     {
@@ -1451,7 +1464,7 @@ core_combine_add_u_sse2 (uint32_t* dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
@@ -1460,7 +1473,7 @@ core_combine_add_u_sse2 (uint32_t* dst,
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         s = combine4 ((__m128i*)ps, (__m128i*)pm);
 
@@ -1519,7 +1532,7 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && (unsigned long)pd & 15)
     {
@@ -1536,14 +1549,14 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_dst = load_128_aligned ((__m128i*)pd);
         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
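The 3.5x figure in the commit message is specific to that Athlon64 box, but the underlying effect is easy to check on any machine by timing the prefetch hint against a real buffer and against NULL. A rough, hypothetical micro-benchmark sketch follows (uses POSIX `clock_gettime`; this is not part of the patch or of pixman's test suite):

```c
#include <stdio.h>
#include <time.h>
#include <xmmintrin.h>

/* Issue n prefetch hints against addr and return the elapsed wall time.
 * The hint never faults, even for NULL, but some CPUs stall on it. */
static double
time_prefetch (const char *addr, long n)
{
    struct timespec t0, t1;
    long i;

    clock_gettime (CLOCK_MONOTONIC, &t0);
    for (i = 0; i < n; i++)
        _mm_prefetch (addr, _MM_HINT_T0);
    clock_gettime (CLOCK_MONOTONIC, &t1);

    return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int
main (void)
{
    static char buf[64];
    long n = 100000000L;

    printf ("prefetch buffer: %.3f s\n", time_prefetch (buf, n));
    printf ("prefetch NULL:   %.3f s\n", time_prefetch (NULL, n));
    return 0;
}
```

If the NULL case turns out dramatically slower on a given CPU, the extra branch added by `maybe_prefetch` is cheap by comparison, which is the trade-off this patch makes in the maskless fast paths.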