diff options
author | sewardj <sewardj@a5019735-40e9-0310-863c-91ae7b9d1cf9> | 2006-05-03 22:13:57 +0000 |
---|---|---|
committer | sewardj <sewardj@a5019735-40e9-0310-863c-91ae7b9d1cf9> | 2006-05-03 22:13:57 +0000 |
commit | f2184914a9113d3d09315db8d71ed5a018cbbe5d (patch) | |
tree | 14bb4abfedee42d99bc29563b0ac86ea3d38f500 /memcheck/mc_main.c | |
parent | d7aca4cf89036cbaa080e217c1d2347cb995db6c (diff) |
Vectorise copy_address_range_perms for common cases. This gives about
40% speedup on artificial programs which just do realloc() and nothing
else, and about a 3-4% speedup on starting kpresenter-1.5.0 and
loading a 16-slide presentation.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5880 a5019735-40e9-0310-863c-91ae7b9d1cf9
Diffstat (limited to 'memcheck/mc_main.c')
-rw-r--r-- | memcheck/mc_main.c | 99 |
1 files changed, 82 insertions, 17 deletions
diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c index 0ee073b2..b28e80eb 100644 --- a/memcheck/mc_main.c +++ b/memcheck/mc_main.c @@ -589,6 +589,28 @@ UChar get_vabits2 ( Addr a ) return extract_vabits2_from_vabits8(a, vabits8); } +// *** WARNING! *** +// Any time this function is called, if it is possible that any of the +// 4 2-bit fields in vabits8 are equal to VA_BITS2_PARTDEFINED, then the +// corresponding entry(s) in the sec-V-bits table must also be set! +static INLINE +UChar get_vabits8_for_aligned_word32 ( Addr a ) +{ + SecMap* sm = get_secmap_for_reading(a); + UWord sm_off = SM_OFF(a); + UChar vabits8 = sm->vabits8[sm_off]; + return vabits8; +} + +static INLINE +void set_vabits8_for_aligned_word32 ( Addr a, UChar vabits8 ) +{ + SecMap* sm = get_secmap_for_writing(a); + UWord sm_off = SM_OFF(a); + sm->vabits8[sm_off] = vabits8; +} + + // Forward declarations static UWord get_sec_vbits8(Addr a); static void set_sec_vbits8(Addr a, UWord vbits8); @@ -1227,35 +1249,81 @@ static void make_mem_defined_if_addressable ( Addr a, SizeT len ) void MC_(copy_address_range_state) ( Addr src, Addr dst, SizeT len ) { SizeT i, j; - UChar vabits2; + UChar vabits2, vabits8; + Bool aligned, nooverlap; DEBUG("MC_(copy_address_range_state)\n"); PROF_EVENT(50, "MC_(copy_address_range_state)"); - if (len == 0) + if (len == 0 || src == dst) return; - if (src < dst) { - for (i = 0, j = len-1; i < len; i++, j--) { - PROF_EVENT(51, "MC_(copy_address_range_state)(loop)"); - vabits2 = get_vabits2( src+j ); - set_vabits2( dst+j, vabits2 ); - if (VA_BITS2_PARTDEFINED == vabits2) { - set_sec_vbits8( dst+j, get_sec_vbits8( src+j ) ); + aligned = VG_IS_4_ALIGNED(src) && VG_IS_4_ALIGNED(dst); + nooverlap = src+len <= dst || dst+len <= src; + + if (nooverlap && aligned) { + + /* Vectorised fast case, when no overlap and suitably aligned */ + /* vector loop */ + i = 0; + while (len >= 4) { + vabits8 = get_vabits8_for_aligned_word32( src+i ); + set_vabits8_for_aligned_word32( dst+i, vabits8 ); + if (EXPECTED_TAKEN(VA_BITS8_DEFINED == vabits8 + || VA_BITS8_UNDEFINED == vabits8 + || VA_BITS8_NOACCESS == vabits8)) { + /* do nothing */ + } else { + /* have to copy secondary map info */ + if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+0 )) + set_sec_vbits8( dst+i+0, get_sec_vbits8( src+i+0 ) ); + if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+1 )) + set_sec_vbits8( dst+i+1, get_sec_vbits8( src+i+1 ) ); + if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+2 )) + set_sec_vbits8( dst+i+2, get_sec_vbits8( src+i+2 ) ); + if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+3 )) + set_sec_vbits8( dst+i+3, get_sec_vbits8( src+i+3 ) ); } + i += 4; + len -= 4; } - } - - if (src > dst) { - for (i = 0; i < len; i++) { - PROF_EVENT(52, "MC_(copy_address_range_state)(loop)"); + /* fixup loop */ + while (len >= 1) { vabits2 = get_vabits2( src+i ); set_vabits2( dst+i, vabits2 ); if (VA_BITS2_PARTDEFINED == vabits2) { set_sec_vbits8( dst+i, get_sec_vbits8( src+i ) ); } + i++; + len--; + } + + } else { + + /* We have to do things the slow way */ + if (src < dst) { + for (i = 0, j = len-1; i < len; i++, j--) { + PROF_EVENT(51, "MC_(copy_address_range_state)(loop)"); + vabits2 = get_vabits2( src+j ); + set_vabits2( dst+j, vabits2 ); + if (VA_BITS2_PARTDEFINED == vabits2) { + set_sec_vbits8( dst+j, get_sec_vbits8( src+j ) ); + } + } + } + + if (src > dst) { + for (i = 0; i < len; i++) { + PROF_EVENT(52, "MC_(copy_address_range_state)(loop)"); + vabits2 = get_vabits2( src+i ); + set_vabits2( dst+i, vabits2 ); + if (VA_BITS2_PARTDEFINED == vabits2) { + set_sec_vbits8( dst+i, get_sec_vbits8( src+i ) ); + } + } } } + } @@ -4422,6 +4490,3 @@ VG_DETERMINE_INTERFACE_VERSION(mc_pre_clo_init) /*--------------------------------------------------------------------*/ /*--- end ---*/ /*--------------------------------------------------------------------*/ - - - |