| author | Jose Fonseca <jfonseca@vmware.com> | 2017-03-23 11:05:20 +0000 |
|---|---|---|
| committer | Jose Fonseca <jfonseca@vmware.com> | 2017-04-08 08:24:55 +0100 |
| commit | a684e9e18aaf3e67cd43a516227a568ee1d5c102 (patch) | |
| tree | 2c76ec05858e76e573f932b298680370a2194433 /thirdparty | |
| parent | e8ee4f3999a36e043224980b2343ead3d658008a (diff) | |
snappy: Update to 1.1.4.
Diffstat (limited to 'thirdparty')
| -rw-r--r-- | thirdparty/snappy/NEWS | 8 |
| -rw-r--r-- | thirdparty/snappy/README | 16 |
| -rw-r--r-- | thirdparty/snappy/snappy-internal.h | 109 |
| -rw-r--r-- | thirdparty/snappy/snappy-stubs-internal.h | 45 |
| -rw-r--r-- | thirdparty/snappy/snappy-stubs-public.h.in | 2 |
| -rw-r--r-- | thirdparty/snappy/snappy-test.h | 2 |
| -rw-r--r-- | thirdparty/snappy/snappy.cc | 561 |
| -rw-r--r-- | thirdparty/snappy/snappy_unittest.cc | 127 |
8 files changed, 512 insertions, 358 deletions
diff --git a/thirdparty/snappy/NEWS b/thirdparty/snappy/NEWS index 4eb7a1d1..3bc0f4b0 100644 --- a/thirdparty/snappy/NEWS +++ b/thirdparty/snappy/NEWS @@ -1,3 +1,11 @@ +Snappy v1.1.4, January 25th 2017: + + * Fix a 1% performance regression when snappy is used in PIE executables. + + * Improve compression performance by 5%. + + * Improve decompression performance by 20%. + Snappy v1.1.3, July 6th 2015: This is the first release to be done from GitHub, which means that diff --git a/thirdparty/snappy/README b/thirdparty/snappy/README index 3bc8888f..c60dab9a 100644 --- a/thirdparty/snappy/README +++ b/thirdparty/snappy/README @@ -29,7 +29,7 @@ and the like. Performance =========== - + Snappy is intended to be fast. On a single core of a Core i7 processor in 64-bit mode, it compresses at about 250 MB/sec or more and decompresses at about 500 MB/sec or more. (These numbers are for the slowest inputs in our @@ -67,7 +67,7 @@ Usage Note that Snappy, both the implementation and the main interface, is written in C++. However, several third-party bindings to other languages -are available; see the Google Code page at http://code.google.com/p/snappy/ +are available; see the home page at http://google.github.io/snappy/ for more information. Also, if you want to use Snappy from C code, you can use the included C bindings in snappy-c.h. @@ -102,12 +102,12 @@ tests to verify you have not broken anything. Note that if you have the Google Test library installed, unit test behavior (especially failures) will be significantly more user-friendly. You can find Google Test at - http://code.google.com/p/googletest/ + http://github.com/google/googletest You probably also want the gflags library for handling of command-line flags; you can find it at - http://code.google.com/p/google-gflags/ + http://gflags.github.io/gflags/ In addition to the unit tests, snappy contains microbenchmarks used to tune compression and decompression performance. These are automatically run @@ -129,7 +129,11 @@ test.) Contact ======= -Snappy is distributed through Google Code. For the latest version, a bug tracker, +Snappy is distributed through GitHub. For the latest version, a bug tracker, and other information, see - http://code.google.com/p/snappy/ + http://google.github.io/snappy/ + +or the repository at + + https://github.com/google/snappy diff --git a/thirdparty/snappy/snappy-internal.h b/thirdparty/snappy/snappy-internal.h index 0653dc65..b8355c08 100644 --- a/thirdparty/snappy/snappy-internal.h +++ b/thirdparty/snappy/snappy-internal.h @@ -70,11 +70,12 @@ char* CompressFragment(const char* input, uint16* table, const int table_size); -// Return the largest n such that +// Find the largest n such that // // s1[0,n-1] == s2[0,n-1] // and n <= (s2_limit - s2). // +// Return make_pair(n, n < 8). // Does not read *s2_limit or beyond. // Does not read *(s1 + (s2_limit - s2)) or beyond. // Requires that s2_limit >= s2. @@ -82,11 +83,28 @@ char* CompressFragment(const char* input, // Separate implementation for x86_64, for speed. Uses the fact that // x86_64 is little endian. #if defined(ARCH_K8) -static inline int FindMatchLength(const char* s1, - const char* s2, - const char* s2_limit) { +static inline std::pair<size_t, bool> FindMatchLength(const char* s1, + const char* s2, + const char* s2_limit) { assert(s2_limit >= s2); - int matched = 0; + size_t matched = 0; + + // This block isn't necessary for correctness; we could just start looping + // immediately. As an optimization though, it is useful. 
It creates some not + // uncommon code paths that determine, without extra effort, whether the match + // length is less than 8. In short, we are hoping to avoid a conditional + // branch, and perhaps get better code layout from the C++ compiler. + if (PREDICT_TRUE(s2 <= s2_limit - 8)) { + uint64 a1 = UNALIGNED_LOAD64(s1); + uint64 a2 = UNALIGNED_LOAD64(s2); + if (a1 != a2) { + return std::pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3, + true); + } else { + matched = 8; + s2 += 8; + } + } // Find out how long the match is. We loop over the data 64 bits at a // time until we find a 64-bit block that doesn't match; then we find @@ -97,14 +115,11 @@ static inline int FindMatchLength(const char* s1, s2 += 8; matched += 8; } else { - // On current (mid-2008) Opteron models there is a 3% more - // efficient code sequence to find the first non-matching byte. - // However, what follows is ~10% better on Intel Core 2 and newer, - // and we expect AMD's bsf instruction to improve. uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched); int matching_bits = Bits::FindLSBSetNonZero64(x); matched += matching_bits >> 3; - return matched; + assert(matched >= 8); + return std::pair<size_t, bool>(matched, false); } } while (PREDICT_TRUE(s2 < s2_limit)) { @@ -112,15 +127,15 @@ static inline int FindMatchLength(const char* s1, ++s2; ++matched; } else { - return matched; + return std::pair<size_t, bool>(matched, matched < 8); } } - return matched; + return std::pair<size_t, bool>(matched, matched < 8); } #else -static inline int FindMatchLength(const char* s1, - const char* s2, - const char* s2_limit) { +static inline std::pair<size_t, bool> FindMatchLength(const char* s1, + const char* s2, + const char* s2_limit) { // Implementation based on the x86-64 version, above. assert(s2_limit >= s2); int matched = 0; @@ -140,10 +155,72 @@ static inline int FindMatchLength(const char* s1, ++matched; } } - return matched; + return std::pair<size_t, bool>(matched, matched < 8); } #endif +// Lookup tables for decompression code. Give --snappy_dump_decompression_table +// to the unit test to recompute char_table. + +enum { + LITERAL = 0, + COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode + COPY_2_BYTE_OFFSET = 2, + COPY_4_BYTE_OFFSET = 3 +}; +static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset. 
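The rewritten FindMatchLength above works eight bytes at a time: on the first mismatching 64-bit block it XORs the two words and divides the index of the lowest set bit by eight to get the byte-level match length, and it now also returns whether the match is shorter than eight bytes so the caller can choose the copy encoding without re-testing the length. A minimal standalone sketch of the same idea, using __builtin_ctzll (GCC/Clang) in place of Bits::FindLSBSetNonZero64 and memcpy in place of UNALIGNED_LOAD64; the function name is invented and this is not the library's exact code:

    #include <cstdint>
    #include <cstring>
    #include <utility>

    // Assumes a little-endian target, as the x86-64 path above does.
    static inline std::pair<size_t, bool> MatchLengthSketch(const char* s1,
                                                            const char* s2,
                                                            const char* s2_limit) {
      size_t matched = 0;
      while (s2 + 8 <= s2_limit) {
        uint64_t a1, a2;
        memcpy(&a1, s1 + matched, 8);  // unaligned 64-bit loads
        memcpy(&a2, s2, 8);
        if (a1 != a2) {
          // The XOR is non-zero at the first differing byte; the index of its
          // lowest set bit, divided by 8, is the number of matching bytes.
          matched += __builtin_ctzll(a1 ^ a2) >> 3;
          return std::make_pair(matched, matched < 8);
        }
        s2 += 8;
        matched += 8;
      }
      while (s2 < s2_limit && s1[matched] == *s2) {  // finish byte by byte
        ++s2;
        ++matched;
      }
      return std::make_pair(matched, matched < 8);
    }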
+ +// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits +static const uint32 wordmask[] = { + 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu +}; + +// Data stored per entry in lookup table: +// Range Bits-used Description +// ------------------------------------ +// 1..64 0..7 Literal/copy length encoded in opcode byte +// 0..7 8..10 Copy offset encoded in opcode byte / 256 +// 0..4 11..13 Extra bytes after opcode +// +// We use eight bits for the length even though 7 would have sufficed +// because of efficiency reasons: +// (1) Extracting a byte is faster than a bit-field +// (2) It properly aligns copy offset so we do not need a <<8 +static const uint16 char_table[256] = { + 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, + 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, + 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, + 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, + 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, + 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, + 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e, + 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, + 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, + 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, + 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, + 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, + 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, + 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, + 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, + 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, + 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, + 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, + 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, + 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, + 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, + 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, + 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, + 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, + 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, + 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034, + 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, + 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, + 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, + 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, + 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, + 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 +}; + } // end namespace internal } // end namespace snappy diff --git a/thirdparty/snappy/snappy-stubs-internal.h b/thirdparty/snappy/snappy-stubs-internal.h index ddca1a8b..1954c63d 100644 --- a/thirdparty/snappy/snappy-stubs-internal.h +++ b/thirdparty/snappy/snappy-stubs-internal.h @@ -116,6 +116,15 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL); // sub-architectures. // // This is a mess, but there's not much we can do about it. +// +// To further complicate matters, only LDR instructions (single reads) are +// allowed to be unaligned, not LDRD (two reads) or LDM (many reads). Unless we +// explicitly tell the compiler that these accesses can be unaligned, it can and +// will combine accesses. 
On armcc, the way to signal this is done by accessing +// through the type (uint32 __packed *), but GCC has no such attribute +// (it ignores __attribute__((packed)) on individual variables). However, +// we can tell it that a _struct_ is unaligned, which has the same effect, +// so we do that. #elif defined(__arm__) && \ !defined(__ARM_ARCH_4__) && \ @@ -131,11 +140,39 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL); !defined(__ARM_ARCH_6ZK__) && \ !defined(__ARM_ARCH_6T2__) -#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p)) -#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p)) +#if __GNUC__ +#define ATTRIBUTE_PACKED __attribute__((__packed__)) +#else +#define ATTRIBUTE_PACKED +#endif -#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val)) -#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val)) +namespace base { +namespace internal { + +struct Unaligned16Struct { + uint16 value; + uint8 dummy; // To make the size non-power-of-two. +} ATTRIBUTE_PACKED; + +struct Unaligned32Struct { + uint32 value; + uint8 dummy; // To make the size non-power-of-two. +} ATTRIBUTE_PACKED; + +} // namespace internal +} // namespace base + +#define UNALIGNED_LOAD16(_p) \ + ((reinterpret_cast<const ::snappy::base::internal::Unaligned16Struct *>(_p))->value) +#define UNALIGNED_LOAD32(_p) \ + ((reinterpret_cast<const ::snappy::base::internal::Unaligned32Struct *>(_p))->value) + +#define UNALIGNED_STORE16(_p, _val) \ + ((reinterpret_cast< ::snappy::base::internal::Unaligned16Struct *>(_p))->value = \ + (_val)) +#define UNALIGNED_STORE32(_p, _val) \ + ((reinterpret_cast< ::snappy::base::internal::Unaligned32Struct *>(_p))->value = \ + (_val)) // TODO(user): NEON supports unaligned 64-bit loads and stores. // See if that would be more efficient on platforms supporting it, diff --git a/thirdparty/snappy/snappy-stubs-public.h.in b/thirdparty/snappy/snappy-stubs-public.h.in index ebe676cc..96989ac3 100644 --- a/thirdparty/snappy/snappy-stubs-public.h.in +++ b/thirdparty/snappy/snappy-stubs-public.h.in @@ -80,9 +80,11 @@ typedef unsigned long long uint64; typedef std::string string; +#ifndef DISALLOW_COPY_AND_ASSIGN #define DISALLOW_COPY_AND_ASSIGN(TypeName) \ TypeName(const TypeName&); \ void operator=(const TypeName&) +#endif #if !@ac_cv_have_sys_uio_h@ // Windows does not have an iovec type, yet the concept is universally useful. 
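On the ARM variants targeted by the hunk above, only single loads (LDR) may be unaligned; if the compiler merges neighbouring plain loads into LDRD or LDM, an unaligned address faults. Reading and writing through a packed struct tells GCC that every access may be unaligned, so it keeps them as individual LDR/STR instructions. A hedged sketch of the same technique in isolation; the struct and helper names are invented and are not snappy's macros:

    #include <cstdint>

    struct UnalignedU32 {
      uint32_t value;
      uint8_t dummy;  // makes the size a non-power-of-two
    } __attribute__((__packed__));  // GCC/Clang attribute

    // The packed struct tells the compiler the load may be unaligned, so it
    // emits a plain LDR rather than combining accesses into LDRD/LDM.
    static inline uint32_t LoadU32(const void* p) {
      return reinterpret_cast<const UnalignedU32*>(p)->value;
    }

    static inline void StoreU32(void* p, uint32_t v) {
      reinterpret_cast<UnalignedU32*>(p)->value = v;
    }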
diff --git a/thirdparty/snappy/snappy-test.h b/thirdparty/snappy/snappy-test.h index dbc55b98..5fb09c78 100644 --- a/thirdparty/snappy/snappy-test.h +++ b/thirdparty/snappy/snappy-test.h @@ -196,6 +196,7 @@ void Test_Snappy_RandomData(); void Test_Snappy_FourByteOffset(); void Test_SnappyCorruption_TruncatedVarint(); void Test_SnappyCorruption_UnterminatedVarint(); +void Test_SnappyCorruption_OverflowingVarint(); void Test_Snappy_ReadPastEndOfBuffer(); void Test_Snappy_FindMatchLength(); void Test_Snappy_FindMatchLengthRandom(); @@ -500,6 +501,7 @@ static inline int RUN_ALL_TESTS() { snappy::Test_Snappy_FourByteOffset(); snappy::Test_SnappyCorruption_TruncatedVarint(); snappy::Test_SnappyCorruption_UnterminatedVarint(); + snappy::Test_SnappyCorruption_OverflowingVarint(); snappy::Test_Snappy_ReadPastEndOfBuffer(); snappy::Test_Snappy_FindMatchLength(); snappy::Test_Snappy_FindMatchLengthRandom(); diff --git a/thirdparty/snappy/snappy.cc b/thirdparty/snappy/snappy.cc index b6ca7ece..4bcea0b4 100644 --- a/thirdparty/snappy/snappy.cc +++ b/thirdparty/snappy/snappy.cc @@ -30,6 +30,9 @@ #include "snappy-internal.h" #include "snappy-sinksource.h" +#if defined(__x86_64__) || defined(_M_X64) +#include <emmintrin.h> +#endif #include <stdio.h> #include <algorithm> @@ -39,6 +42,13 @@ namespace snappy { +using internal::COPY_1_BYTE_OFFSET; +using internal::COPY_2_BYTE_OFFSET; +using internal::LITERAL; +using internal::char_table; +using internal::kMaximumTagLength; +using internal::wordmask; + // Any hash function will produce a valid compressed bitstream, but a good // hash function reduces the number of collisions and thus yields better // compression for compressible input, and more speed for incompressible @@ -76,79 +86,125 @@ size_t MaxCompressedLength(size_t source_len) { return 32 + source_len + source_len/6; } -enum { - LITERAL = 0, - COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode - COPY_2_BYTE_OFFSET = 2, - COPY_4_BYTE_OFFSET = 3 -}; -static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset. - -// Copy "len" bytes from "src" to "op", one byte at a time. Used for -// handling COPY operations where the input and output regions may -// overlap. For example, suppose: -// src == "ab" -// op == src + 2 -// len == 20 -// After IncrementalCopy(src, op, len), the result will have -// eleven copies of "ab" -// ababababababababababab -// Note that this does not match the semantics of either memcpy() -// or memmove(). -static inline void IncrementalCopy(const char* src, char* op, ssize_t len) { - assert(len > 0); - do { - *op++ = *src++; - } while (--len > 0); +namespace { + +void UnalignedCopy64(const void* src, void* dst) { + memcpy(dst, src, 8); } -// Equivalent to IncrementalCopy except that it can write up to ten extra -// bytes after the end of the copy, and that it is faster. -// -// The main part of this loop is a simple copy of eight bytes at a time until -// we've copied (at least) the requested amount of bytes. However, if op and -// src are less than eight bytes apart (indicating a repeating pattern of -// length < 8), we first need to expand the pattern in order to get the correct -// results. 
For instance, if the buffer looks like this, with the eight-byte -// <src> and <op> patterns marked as intervals: -// -// abxxxxxxxxxxxx -// [------] src -// [------] op -// -// a single eight-byte copy from <src> to <op> will repeat the pattern once, -// after which we can move <op> two bytes without moving <src>: -// -// ababxxxxxxxxxx -// [------] src -// [------] op -// -// and repeat the exercise until the two no longer overlap. -// -// This allows us to do very well in the special case of one single byte -// repeated many times, without taking a big hit for more general cases. -// -// The worst case of extra writing past the end of the match occurs when -// op - src == 1 and len == 1; the last copy will read from byte positions -// [0..7] and write to [4..11], whereas it was only supposed to write to -// position 1. Thus, ten excess bytes. +void UnalignedCopy128(const void* src, void* dst) { + // TODO(alkis): Remove this when we upgrade to a recent compiler that emits + // SSE2 moves for memcpy(dst, src, 16). +#ifdef __SSE2__ + __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src)); + _mm_storeu_si128(static_cast<__m128i*>(dst), x); +#else + memcpy(dst, src, 16); +#endif +} -namespace { +// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used +// for handling COPY operations where the input and output regions may overlap. +// For example, suppose: +// src == "ab" +// op == src + 2 +// op_limit == op + 20 +// After IncrementalCopySlow(src, op, op_limit), the result will have eleven +// copies of "ab" +// ababababababababababab +// Note that this does not match the semantics of either memcpy() or memmove(). +inline char* IncrementalCopySlow(const char* src, char* op, + char* const op_limit) { + while (op < op_limit) { + *op++ = *src++; + } + return op_limit; +} -const int kMaxIncrementCopyOverflow = 10; +// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than +// IncrementalCopySlow. buf_limit is the address past the end of the writable +// region of the buffer. +inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, + char* const buf_limit) { + // Terminology: + // + // slop = buf_limit - op + // pat = op - src + // len = limit - op + assert(src < op); + assert(op_limit <= buf_limit); + // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that + // to optimize this function but we have to also handle these cases in case + // the input does not satisfy these conditions. + + size_t pattern_size = op - src; + // The cases are split into different branches to allow the branch predictor, + // FDO, and static prediction hints to work better. For each input we list the + // ratio of invocations that match each condition. + // + // input slop < 16 pat < 8 len > 16 + // ------------------------------------------ + // html|html4|cp 0% 1.01% 27.73% + // urls 0% 0.88% 14.79% + // jpg 0% 64.29% 7.14% + // pdf 0% 2.56% 58.06% + // txt[1-4] 0% 0.23% 0.97% + // pb 0% 0.96% 13.88% + // bin 0.01% 22.27% 41.17% + // + // It is very rare that we don't have enough slop for doing block copies. It + // is also rare that we need to expand a pattern. Small patterns are common + // for incompressible formats and for those we are plenty fast already. + // Lengths are normally not greater than 16 but they vary depending on the + // input. In general if we always predict len <= 16 it would be an ok + // prediction. 
+ // + // In order to be fast we want a pattern >= 8 bytes and an unrolled loop + // copying 2x 8 bytes at a time. + + // Handle the uncommon case where pattern is less than 8 bytes. + if (PREDICT_FALSE(pattern_size < 8)) { + // Expand pattern to at least 8 bytes. The worse case scenario in terms of + // buffer usage is when the pattern is size 3. ^ is the original position + // of op. x are irrelevant bytes copied by the last UnalignedCopy64. + // + // abc + // abcabcxxxxx + // abcabcabcabcxxxxx + // ^ + // The last x is 14 bytes after ^. + if (PREDICT_TRUE(op <= buf_limit - 14)) { + while (pattern_size < 8) { + UnalignedCopy64(src, op); + op += pattern_size; + pattern_size *= 2; + } + if (PREDICT_TRUE(op >= op_limit)) return op_limit; + } else { + return IncrementalCopySlow(src, op, op_limit); + } + } + assert(pattern_size >= 8); -inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) { - while (PREDICT_FALSE(op - src < 8)) { + // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single + // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe + // because expanding the pattern to at least 8 bytes guarantees that + // op - src >= 8. + while (op <= buf_limit - 16) { UnalignedCopy64(src, op); - len -= op - src; - op += op - src; - } - while (len > 0) { + UnalignedCopy64(src + 8, op + 8); + src += 16; + op += 16; + if (PREDICT_TRUE(op >= op_limit)) return op_limit; + } + // We only take this branch if we didn't have enough slop and we can do a + // single 8 byte copy. + if (PREDICT_FALSE(op <= buf_limit - 8)) { UnalignedCopy64(src, op); src += 8; op += 8; - len -= 8; } + return IncrementalCopySlow(src, op, op_limit); } } // namespace @@ -157,26 +213,29 @@ static inline char* EmitLiteral(char* op, const char* literal, int len, bool allow_fast_path) { - int n = len - 1; // Zero-length literals are disallowed - if (n < 60) { + // The vast majority of copies are below 16 bytes, for which a + // call to memcpy is overkill. This fast path can sometimes + // copy up to 15 bytes too much, but that is okay in the + // main loop, since we have a bit to go on for both sides: + // + // - The input will always have kInputMarginBytes = 15 extra + // available bytes, as long as we're in the main loop, and + // if not, allow_fast_path = false. + // - The output will always have 32 spare bytes (see + // MaxCompressedLength). + assert(len > 0); // Zero-length literals are disallowed + int n = len - 1; + if (allow_fast_path && len <= 16) { // Fits in tag byte *op++ = LITERAL | (n << 2); - // The vast majority of copies are below 16 bytes, for which a - // call to memcpy is overkill. This fast path can sometimes - // copy up to 15 bytes too much, but that is okay in the - // main loop, since we have a bit to go on for both sides: - // - // - The input will always have kInputMarginBytes = 15 extra - // available bytes, as long as we're in the main loop, and - // if not, allow_fast_path = false. - // - The output will always have 32 spare bytes (see - // MaxCompressedLength). 
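The new IncrementalCopy above first widens any repeating pattern shorter than eight bytes by doubling it with 8-byte block copies; once op - src >= 8, the main loop can stream 16 bytes per iteration without the copies stepping on each other. A simplified sketch of just that expansion step; the function name is invented and the IncrementalCopySlow fallback for tight buffers is omitted:

    #include <cstddef>
    #include <cstring>

    // Precondition: op == src + pattern_size, pattern_size < 8, and there are
    // at least 14 writable bytes after op (the worst case for a 3-byte
    // pattern, as noted in the comment above).
    static inline char* ExpandPatternSketch(const char* src, char* op,
                                            size_t pattern_size) {
      while (pattern_size < 8) {
        memcpy(op, src, 8);   // stands in for UnalignedCopy64
        op += pattern_size;   // the pattern now repeats one more time
        pattern_size *= 2;    // "abc" -> "abcabc" -> "abcabcabcabc"
      }
      return op;  // op - src >= 8, so 8- and 16-byte block copies are now safe
    }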
- if (allow_fast_path && len <= 16) { - UnalignedCopy64(literal, op); - UnalignedCopy64(literal + 8, op + 8); - return op + len; - } + UnalignedCopy128(literal, op); + return op + len; + } + + if (n < 60) { + // Fits in tag byte + *op++ = LITERAL | (n << 2); } else { // Encode in upcoming bytes char* base = op; @@ -195,42 +254,54 @@ static inline char* EmitLiteral(char* op, return op + len; } -static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) { +static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len, + bool len_less_than_12) { assert(len <= 64); assert(len >= 4); assert(offset < 65536); + assert(len_less_than_12 == (len < 12)); - if ((len < 12) && (offset < 2048)) { - size_t len_minus_4 = len - 4; - assert(len_minus_4 < 8); // Must fit in 3 bits - *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5); + if (len_less_than_12 && PREDICT_TRUE(offset < 2048)) { + // offset fits in 11 bits. The 3 highest go in the top of the first byte, + // and the rest go in the second byte. + *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0); *op++ = offset & 0xff; } else { - *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2); - LittleEndian::Store16(op, offset); - op += 2; + // Write 4 bytes, though we only care about 3 of them. The output buffer + // is required to have some slack, so the extra byte won't overrun it. + uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8); + LittleEndian::Store32(op, u); + op += 3; } return op; } -static inline char* EmitCopy(char* op, size_t offset, int len) { - // Emit 64 byte copies but make sure to keep at least four bytes reserved - while (PREDICT_FALSE(len >= 68)) { - op = EmitCopyLessThan64(op, offset, 64); - len -= 64; - } +static inline char* EmitCopy(char* op, size_t offset, size_t len, + bool len_less_than_12) { + assert(len_less_than_12 == (len < 12)); + if (len_less_than_12) { + return EmitCopyAtMost64(op, offset, len, true); + } else { + // A special case for len <= 64 might help, but so far measurements suggest + // it's in the noise. - // Emit an extra 60 byte copy if have too much data to fit in one copy - if (len > 64) { - op = EmitCopyLessThan64(op, offset, 60); - len -= 60; - } + // Emit 64 byte copies but make sure to keep at least four bytes reserved. + while (PREDICT_FALSE(len >= 68)) { + op = EmitCopyAtMost64(op, offset, 64, false); + len -= 64; + } - // Emit remainder - op = EmitCopyLessThan64(op, offset, len); - return op; -} + // One or two copies will now finish the job. + if (len > 64) { + op = EmitCopyAtMost64(op, offset, 60, false); + len -= 60; + } + // Emit remainder. + op = EmitCopyAtMost64(op, offset, len, len < 12); + return op; + } +} bool GetUncompressedLength(const char* start, size_t n, size_t* result) { uint32 v = 0; @@ -364,9 +435,9 @@ char* CompressFragment(const char* input, // // Heuristic match skipping: If 32 bytes are scanned with no matches // found, start looking only at every other byte. If 32 more bytes are - // scanned, look at every third byte, etc.. When a match is found, - // immediately go back to looking at every byte. This is a small loss - // (~5% performance, ~0.1% density) for compressible data due to more + // scanned (or skipped), look at every third byte, etc.. When a match is + // found, immediately go back to looking at every byte. 
This is a small + // loss (~5% performance, ~0.1% density) for compressible data due to more // bookkeeping, but for non-compressible data (such as JPEG) it's a huge // win since the compressor quickly "realizes" the data is incompressible // and doesn't bother looking for matches everywhere. @@ -382,7 +453,8 @@ char* CompressFragment(const char* input, ip = next_ip; uint32 hash = next_hash; assert(hash == Hash(ip, shift)); - uint32 bytes_between_hash_lookups = skip++ >> 5; + uint32 bytes_between_hash_lookups = skip >> 5; + skip += bytes_between_hash_lookups; next_ip = ip + bytes_between_hash_lookups; if (PREDICT_FALSE(next_ip > ip_limit)) { goto emit_remainder; @@ -417,19 +489,21 @@ char* CompressFragment(const char* input, // We have a 4-byte match at ip, and no need to emit any // "literal bytes" prior to ip. const char* base = ip; - int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end); + std::pair<size_t, bool> p = + FindMatchLength(candidate + 4, ip + 4, ip_end); + size_t matched = 4 + p.first; ip += matched; size_t offset = base - candidate; assert(0 == memcmp(base, candidate, matched)); - op = EmitCopy(op, offset, matched); - // We could immediately start working at ip now, but to improve - // compression we first update table[Hash(ip - 1, ...)]. - const char* insert_tail = ip - 1; + op = EmitCopy(op, offset, matched, p.second); next_emit = ip; if (PREDICT_FALSE(ip >= ip_limit)) { goto emit_remainder; } - input_bytes = GetEightBytesAt(insert_tail); + // We are now looking for a 4-byte match again. We read + // table[Hash(ip, shift)] for that. To improve compression, + // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)]. + input_bytes = GetEightBytesAt(ip - 1); uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift); table[prev_hash] = ip - base_ip - 1; uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift); @@ -493,162 +567,6 @@ char* CompressFragment(const char* input, // bool TryFastAppend(const char* ip, size_t available, size_t length); // }; -// ----------------------------------------------------------------------- -// Lookup table for decompression code. Generated by ComputeTable() below. 
-// ----------------------------------------------------------------------- - -// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits -static const uint32 wordmask[] = { - 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu -}; - -// Data stored per entry in lookup table: -// Range Bits-used Description -// ------------------------------------ -// 1..64 0..7 Literal/copy length encoded in opcode byte -// 0..7 8..10 Copy offset encoded in opcode byte / 256 -// 0..4 11..13 Extra bytes after opcode -// -// We use eight bits for the length even though 7 would have sufficed -// because of efficiency reasons: -// (1) Extracting a byte is faster than a bit-field -// (2) It properly aligns copy offset so we do not need a <<8 -static const uint16 char_table[256] = { - 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, - 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, - 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, - 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, - 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, - 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, - 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e, - 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, - 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, - 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, - 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, - 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, - 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, - 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, - 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, - 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, - 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, - 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, - 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, - 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, - 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, - 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, - 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, - 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, - 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, - 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034, - 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, - 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, - 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, - 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, - 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, - 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 -}; - -// In debug mode, allow optional computation of the table at startup. -// Also, check that the decompression table is correct. 
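The wordmask and char_table arrays removed here now live in snappy-internal.h (see the earlier hunk in this diff) so that the unit test can verify the table. Each 16-bit entry packs the decoded length into bits 0..7, copy_offset/256 into bits 8..10 and the number of extra tag bytes into bits 11..13. A hedged sketch of unpacking one entry, with invented struct and function names:

    #include <cstdint>

    struct TagInfo {
      uint32_t length;       // 1..64, literal/copy length from the opcode byte
      uint32_t copy_offset;  // 0..7, copy offset divided by 256
      uint32_t extra_bytes;  // 0..4, bytes that follow the opcode
    };

    static inline TagInfo DecodeEntrySketch(uint16_t entry) {
      TagInfo info;
      info.length      = entry & 0xff;         // bits 0..7
      info.copy_offset = (entry >> 8) & 0x7;   // bits 8..10
      info.extra_bytes = (entry >> 11) & 0x7;  // bits 11..13
      return info;
    }

    // Example: entry 0x0804 (opcode 0x01, a COPY_1_BYTE_OFFSET tag) decodes to
    // length 4, copy_offset 0 and one extra byte holding offset % 256.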
-#ifndef NDEBUG -DEFINE_bool(snappy_dump_decompression_table, false, - "If true, we print the decompression table at startup."); - -static uint16 MakeEntry(unsigned int extra, - unsigned int len, - unsigned int copy_offset) { - // Check that all of the fields fit within the allocated space - assert(extra == (extra & 0x7)); // At most 3 bits - assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits - assert(len == (len & 0x7f)); // At most 7 bits - return len | (copy_offset << 8) | (extra << 11); -} - -static void ComputeTable() { - uint16 dst[256]; - - // Place invalid entries in all places to detect missing initialization - int assigned = 0; - for (int i = 0; i < 256; i++) { - dst[i] = 0xffff; - } - - // Small LITERAL entries. We store (len-1) in the top 6 bits. - for (unsigned int len = 1; len <= 60; len++) { - dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0); - assigned++; - } - - // Large LITERAL entries. We use 60..63 in the high 6 bits to - // encode the number of bytes of length info that follow the opcode. - for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) { - // We set the length field in the lookup table to 1 because extra - // bytes encode len-1. - dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0); - assigned++; - } - - // COPY_1_BYTE_OFFSET. - // - // The tag byte in the compressed data stores len-4 in 3 bits, and - // offset/256 in 5 bits. offset%256 is stored in the next byte. - // - // This format is used for length in range [4..11] and offset in - // range [0..2047] - for (unsigned int len = 4; len < 12; len++) { - for (unsigned int offset = 0; offset < 2048; offset += 256) { - dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] = - MakeEntry(1, len, offset>>8); - assigned++; - } - } - - // COPY_2_BYTE_OFFSET. - // Tag contains len-1 in top 6 bits, and offset in next two bytes. - for (unsigned int len = 1; len <= 64; len++) { - dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0); - assigned++; - } - - // COPY_4_BYTE_OFFSET. - // Tag contents len-1 in top 6 bits, and offset in next four bytes. - for (unsigned int len = 1; len <= 64; len++) { - dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0); - assigned++; - } - - // Check that each entry was initialized exactly once. - if (assigned != 256) { - fprintf(stderr, "ComputeTable: assigned only %d of 256\n", assigned); - abort(); - } - for (int i = 0; i < 256; i++) { - if (dst[i] == 0xffff) { - fprintf(stderr, "ComputeTable: did not assign byte %d\n", i); - abort(); - } - } - - if (FLAGS_snappy_dump_decompression_table) { - printf("static const uint16 char_table[256] = {\n "); - for (int i = 0; i < 256; i++) { - printf("0x%04x%s", - dst[i], - ((i == 255) ? "\n" : (((i%8) == 7) ? 
",\n " : ", "))); - } - printf("};\n"); - } - - // Check that computed table matched recorded table - for (int i = 0; i < 256; i++) { - if (dst[i] != char_table[i]) { - fprintf(stderr, "ComputeTable: byte %d: computed (%x), expect (%x)\n", - i, static_cast<int>(dst[i]), static_cast<int>(char_table[i])); - abort(); - } - } -} -#endif /* !NDEBUG */ // Helper class for decompression class SnappyDecompressor { @@ -701,7 +619,9 @@ class SnappyDecompressor { if (n == 0) return false; const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip)); reader_->Skip(1); - *result |= static_cast<uint32>(c & 0x7f) << shift; + uint32 val = c & 0x7f; + if (((val << shift) >> shift) != val) return false; + *result |= val << shift; if (c < 128) { break; } @@ -715,6 +635,10 @@ class SnappyDecompressor { template <class Writer> void DecompressAllTags(Writer* writer) { const char* ip = ip_; + // For position-independent executables, accessing global arrays can be + // slow. Move wordmask array onto the stack to mitigate this. + uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)]; + memcpy(wordmask, internal::wordmask, sizeof(wordmask)); // We could have put this refill fragment only at the beginning of the loop. // However, duplicating it at the end of each branch gives the compiler more @@ -731,7 +655,19 @@ class SnappyDecompressor { for ( ;; ) { const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++)); - if ((c & 0x3) == LITERAL) { + // Ratio of iterations that have LITERAL vs non-LITERAL for different + // inputs. + // + // input LITERAL NON_LITERAL + // ----------------------------------- + // html|html4|cp 23% 77% + // urls 36% 64% + // jpg 47% 53% + // pdf 19% 81% + // txt[1-4] 25% 75% + // pb 24% 76% + // bin 24% 76% + if (PREDICT_FALSE((c & 0x3) == LITERAL)) { size_t literal_length = (c >> 2) + 1u; if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) { assert(literal_length < 61); @@ -767,15 +703,15 @@ class SnappyDecompressor { ip += literal_length; MAYBE_REFILL(); } else { - const uint32 entry = char_table[c]; - const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11]; - const uint32 length = entry & 0xff; + const size_t entry = char_table[c]; + const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11]; + const size_t length = entry & 0xff; ip += entry >> 11; // copy_offset/256 is encoded in bits 8..10. By just fetching // those bits, we get copy_offset (since the bit-field starts at // bit 8). - const uint32 copy_offset = entry & 0x700; + const size_t copy_offset = entry & 0x700; if (!writer->AppendFromSelf(copy_offset + trailer, length)) { return; } @@ -795,10 +731,8 @@ bool SnappyDecompressor::RefillTag() { size_t n; ip = reader_->Peek(&n); peeked_ = n; - if (n == 0) { - eof_ = true; - return false; - } + eof_ = (n == 0); + if (eof_) return false; ip_limit_ = ip + n; } @@ -966,7 +900,7 @@ class SnappyIOVecWriter { const size_t output_iov_count_; // We are currently writing into output_iov_[curr_iov_index_]. - int curr_iov_index_; + size_t curr_iov_index_; // Bytes written to output_iov_[curr_iov_index_] so far. size_t curr_iov_written_; @@ -977,7 +911,7 @@ class SnappyIOVecWriter { // Maximum number of bytes that will be decompressed into output_iov_. 
size_t output_limit_; - inline char* GetIOVecPointer(int index, size_t offset) { + inline char* GetIOVecPointer(size_t index, size_t offset) { return reinterpret_cast<char*>(output_iov_[index].iov_base) + offset; } @@ -1038,8 +972,7 @@ class SnappyIOVecWriter { output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) { // Fast path, used for the majority (about 95%) of invocations. char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_); - UnalignedCopy64(ip, ptr); - UnalignedCopy64(ip + 8, ptr + 8); + UnalignedCopy128(ip, ptr); curr_iov_written_ += len; total_written_ += len; return true; @@ -1058,7 +991,7 @@ class SnappyIOVecWriter { } // Locate the iovec from which we need to start the copy. - int from_iov_index = curr_iov_index_; + size_t from_iov_index = curr_iov_index_; size_t from_iov_offset = curr_iov_written_; while (offset > 0) { if (from_iov_offset >= offset) { @@ -1067,8 +1000,8 @@ class SnappyIOVecWriter { } offset -= from_iov_offset; + assert(from_iov_index > 0); --from_iov_index; - assert(from_iov_index >= 0); from_iov_offset = output_iov_[from_iov_index].iov_len; } @@ -1103,9 +1036,10 @@ class SnappyIOVecWriter { if (to_copy > len) { to_copy = len; } - IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset), - GetIOVecPointer(curr_iov_index_, curr_iov_written_), - to_copy); + IncrementalCopySlow( + GetIOVecPointer(from_iov_index, from_iov_offset), + GetIOVecPointer(curr_iov_index_, curr_iov_written_), + GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy); curr_iov_written_ += to_copy; from_iov_offset += to_copy; total_written_ += to_copy; @@ -1175,8 +1109,7 @@ class SnappyArrayWriter { const size_t space_left = op_limit_ - op; if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) { // Fast path, used for the majority (about 95%) of invocations. - UnalignedCopy64(ip, op); - UnalignedCopy64(ip + 8, op + 8); + UnalignedCopy128(ip, op); op_ = op + len; return true; } else { @@ -1185,8 +1118,7 @@ class SnappyArrayWriter { } inline bool AppendFromSelf(size_t offset, size_t len) { - char* op = op_; - const size_t space_left = op_limit_ - op; + char* const op_end = op_ + len; // Check if we try to append from before the start of the buffer. // Normally this would just be a check for "produced < offset", @@ -1195,30 +1127,13 @@ class SnappyArrayWriter { // to a very big number. This is convenient, as offset==0 is another // invalid case that we also want to catch, so that we do not go // into an infinite loop. - assert(op >= base_); - size_t produced = op - base_; - if (produced <= offset - 1u) { - return false; - } - if (len <= 16 && offset >= 8 && space_left >= 16) { - // Fast path, used for the majority (70-80%) of dynamic invocations. 
- UnalignedCopy64(op - offset, op); - UnalignedCopy64(op - offset + 8, op + 8); - } else { - if (space_left >= len + kMaxIncrementCopyOverflow) { - IncrementalCopyFastPath(op - offset, op, len); - } else { - if (space_left < len) { - return false; - } - IncrementalCopy(op - offset, op, len); - } - } + if (Produced() <= offset - 1u || op_end > op_limit_) return false; + op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_); - op_ = op + len; return true; } inline size_t Produced() const { + assert(op_ >= base_); return op_ - base_; } inline void Flush() {} @@ -1304,7 +1219,7 @@ void RawCompress(const char* input, size_t Compress(const char* input, size_t input_length, string* compressed) { // Pre-grow the buffer to the max length of the compressed output - compressed->resize(MaxCompressedLength(input_length)); + STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length)); size_t compressed_length; RawCompress(input, input_length, string_as_array(compressed), @@ -1327,7 +1242,7 @@ class SnappyScatteredWriter { // We need random access into the data generated so far. Therefore // we keep track of all of the generated data as an array of blocks. // All of the blocks except the last have length kBlockSize. - vector<char*> blocks_; + std::vector<char*> blocks_; size_t expected_; // Total size of all fully generated blocks so far @@ -1386,8 +1301,7 @@ class SnappyScatteredWriter { if (length <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) { // Fast path, used for the majority (about 95%) of invocations. - UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip)); - UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8)); + UnalignedCopy128(ip, op); op_ptr_ = op + length; return true; } else { @@ -1396,16 +1310,13 @@ class SnappyScatteredWriter { } inline bool AppendFromSelf(size_t offset, size_t len) { + char* const op_end = op_ptr_ + len; // See SnappyArrayWriter::AppendFromSelf for an explanation of // the "offset - 1u" trick. - if (offset - 1u < op_ptr_ - op_base_) { - const size_t space_left = op_limit_ - op_ptr_; - if (space_left >= len + kMaxIncrementCopyOverflow) { - // Fast path: src and dst in current block. - IncrementalCopyFastPath(op_ptr_ - offset, op_ptr_, len); - op_ptr_ += len; - return true; - } + if (PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ && op_end <= op_limit_)) { + // Fast path: src and dst in current block. 
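Both writers above now validate a copy-from-self with the single unsigned comparison offset - 1u < produced: subtracting 1 in unsigned arithmetic makes offset == 0 wrap to a huge value, so one test rejects both a zero offset and an offset that reaches before the start of the output. A small hedged sketch of that check; the function name is invented:

    #include <cstddef>

    // A copy-from-self is valid only when 1 <= offset <= produced.
    static inline bool IsValidSelfCopy(size_t produced, size_t offset) {
      // offset == 0 wraps to SIZE_MAX, so the comparison fails for it too.
      return offset - 1u < produced;
    }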
+ op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_); + return true; } return SlowAppendFromSelf(offset, len); } @@ -1510,7 +1421,7 @@ class SnappySinkAllocator { } Sink* dest_; - vector<Datablock> blocks_; + std::vector<Datablock> blocks_; // Note: copying this object is allowed }; diff --git a/thirdparty/snappy/snappy_unittest.cc b/thirdparty/snappy/snappy_unittest.cc index 4a80f2ad..ca95e0d8 100644 --- a/thirdparty/snappy/snappy_unittest.cc +++ b/thirdparty/snappy/snappy_unittest.cc @@ -64,6 +64,9 @@ DEFINE_bool(write_compressed, false, DEFINE_bool(write_uncompressed, false, "Write uncompressed versions of each file to <file>.uncomp"); +DEFINE_bool(snappy_dump_decompression_table, false, + "If true, we print the decompression table during tests."); + namespace snappy { @@ -390,10 +393,10 @@ static void Measure(const char* data, { // Chop the input into blocks int num_blocks = (length + block_size - 1) / block_size; - vector<const char*> input(num_blocks); - vector<size_t> input_length(num_blocks); - vector<string> compressed(num_blocks); - vector<string> output(num_blocks); + std::vector<const char*> input(num_blocks); + std::vector<size_t> input_length(num_blocks); + std::vector<string> compressed(num_blocks); + std::vector<string> output(num_blocks); for (int b = 0; b < num_blocks; b++) { int input_start = b * block_size; int input_limit = min<int>((b+1)*block_size, length); @@ -1004,6 +1007,20 @@ TEST(SnappyCorruption, UnterminatedVarint) { &uncompressed)); } +TEST(SnappyCorruption, OverflowingVarint) { + string compressed, uncompressed; + size_t ulength; + compressed.push_back('\xfb'); + compressed.push_back('\xff'); + compressed.push_back('\xff'); + compressed.push_back('\xff'); + compressed.push_back('\x7f'); + CHECK(!CheckUncompressedLength(compressed, &ulength)); + CHECK(!snappy::IsValidCompressedBuffer(compressed.data(), compressed.size())); + CHECK(!snappy::Uncompress(compressed.data(), compressed.size(), + &uncompressed)); +} + TEST(Snappy, ReadPastEndOfBuffer) { // Check that we do not read past end of input @@ -1037,7 +1054,10 @@ TEST(Snappy, ZeroOffsetCopyValidation) { namespace { int TestFindMatchLength(const char* s1, const char *s2, unsigned length) { - return snappy::internal::FindMatchLength(s1, s2, s2 + length); + std::pair<size_t, bool> p = + snappy::internal::FindMatchLength(s1, s2, s2 + length); + CHECK_EQ(p.first < 8, p.second); + return p.first; } } // namespace @@ -1147,8 +1167,7 @@ TEST(Snappy, FindMatchLengthRandom) { } DataEndingAtUnreadablePage u(s); DataEndingAtUnreadablePage v(t); - int matched = snappy::internal::FindMatchLength( - u.data(), v.data(), v.data() + t.size()); + int matched = TestFindMatchLength(u.data(), v.data(), t.size()); if (matched == t.size()) { EXPECT_EQ(s, t); } else { @@ -1160,6 +1179,100 @@ TEST(Snappy, FindMatchLengthRandom) { } } +static uint16 MakeEntry(unsigned int extra, + unsigned int len, + unsigned int copy_offset) { + // Check that all of the fields fit within the allocated space + assert(extra == (extra & 0x7)); // At most 3 bits + assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits + assert(len == (len & 0x7f)); // At most 7 bits + return len | (copy_offset << 8) | (extra << 11); +} + +// Check that the decompression table is correct, and optionally print out +// the computed one. 
+TEST(Snappy, VerifyCharTable) { + using snappy::internal::LITERAL; + using snappy::internal::COPY_1_BYTE_OFFSET; + using snappy::internal::COPY_2_BYTE_OFFSET; + using snappy::internal::COPY_4_BYTE_OFFSET; + using snappy::internal::char_table; + using snappy::internal::wordmask; + + uint16 dst[256]; + + // Place invalid entries in all places to detect missing initialization + int assigned = 0; + for (int i = 0; i < 256; i++) { + dst[i] = 0xffff; + } + + // Small LITERAL entries. We store (len-1) in the top 6 bits. + for (unsigned int len = 1; len <= 60; len++) { + dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0); + assigned++; + } + + // Large LITERAL entries. We use 60..63 in the high 6 bits to + // encode the number of bytes of length info that follow the opcode. + for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) { + // We set the length field in the lookup table to 1 because extra + // bytes encode len-1. + dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0); + assigned++; + } + + // COPY_1_BYTE_OFFSET. + // + // The tag byte in the compressed data stores len-4 in 3 bits, and + // offset/256 in 5 bits. offset%256 is stored in the next byte. + // + // This format is used for length in range [4..11] and offset in + // range [0..2047] + for (unsigned int len = 4; len < 12; len++) { + for (unsigned int offset = 0; offset < 2048; offset += 256) { + dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] = + MakeEntry(1, len, offset>>8); + assigned++; + } + } + + // COPY_2_BYTE_OFFSET. + // Tag contains len-1 in top 6 bits, and offset in next two bytes. + for (unsigned int len = 1; len <= 64; len++) { + dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0); + assigned++; + } + + // COPY_4_BYTE_OFFSET. + // Tag contents len-1 in top 6 bits, and offset in next four bytes. + for (unsigned int len = 1; len <= 64; len++) { + dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0); + assigned++; + } + + // Check that each entry was initialized exactly once. + EXPECT_EQ(256, assigned) << "Assigned only " << assigned << " of 256"; + for (int i = 0; i < 256; i++) { + EXPECT_NE(0xffff, dst[i]) << "Did not assign byte " << i; + } + + if (FLAGS_snappy_dump_decompression_table) { + printf("static const uint16 char_table[256] = {\n "); + for (int i = 0; i < 256; i++) { + printf("0x%04x%s", + dst[i], + ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n " : ", "))); + } + printf("};\n"); + } + + // Check that computed table matched recorded table. + for (int i = 0; i < 256; i++) { + EXPECT_EQ(dst[i], char_table[i]) << "Mismatch in byte " << i; + } +} + static void CompressFile(const char* fname) { string fullinput; CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults())); |