author     Jose Fonseca <jfonseca@vmware.com>    2017-03-23 11:05:20 +0000
committer  Jose Fonseca <jfonseca@vmware.com>    2017-04-08 08:24:55 +0100
commit     a684e9e18aaf3e67cd43a516227a568ee1d5c102 (patch)
tree       2c76ec05858e76e573f932b298680370a2194433 /thirdparty
parent     e8ee4f3999a36e043224980b2343ead3d658008a (diff)
snappy: Update to 1.1.4.
Diffstat (limited to 'thirdparty')
-rw-r--r--  thirdparty/snappy/NEWS                       |   8
-rw-r--r--  thirdparty/snappy/README                     |  16
-rw-r--r--  thirdparty/snappy/snappy-internal.h          | 109
-rw-r--r--  thirdparty/snappy/snappy-stubs-internal.h    |  45
-rw-r--r--  thirdparty/snappy/snappy-stubs-public.h.in   |   2
-rw-r--r--  thirdparty/snappy/snappy-test.h              |   2
-rw-r--r--  thirdparty/snappy/snappy.cc                  | 561
-rw-r--r--  thirdparty/snappy/snappy_unittest.cc         | 127
8 files changed, 512 insertions, 358 deletions
diff --git a/thirdparty/snappy/NEWS b/thirdparty/snappy/NEWS
index 4eb7a1d1..3bc0f4b0 100644
--- a/thirdparty/snappy/NEWS
+++ b/thirdparty/snappy/NEWS
@@ -1,3 +1,11 @@
+Snappy v1.1.4, January 25th 2017:
+
+ * Fix a 1% performance regression when snappy is used in PIE executables.
+
+ * Improve compression performance by 5%.
+
+ * Improve decompression performance by 20%.
+
Snappy v1.1.3, July 6th 2015:
This is the first release to be done from GitHub, which means that
diff --git a/thirdparty/snappy/README b/thirdparty/snappy/README
index 3bc8888f..c60dab9a 100644
--- a/thirdparty/snappy/README
+++ b/thirdparty/snappy/README
@@ -29,7 +29,7 @@ and the like.
Performance
===========
-
+
Snappy is intended to be fast. On a single core of a Core i7 processor
in 64-bit mode, it compresses at about 250 MB/sec or more and decompresses at
about 500 MB/sec or more. (These numbers are for the slowest inputs in our
@@ -67,7 +67,7 @@ Usage
Note that Snappy, both the implementation and the main interface,
is written in C++. However, several third-party bindings to other languages
-are available; see the Google Code page at http://code.google.com/p/snappy/
+are available; see the home page at http://google.github.io/snappy/
for more information. Also, if you want to use Snappy from C code, you can
use the included C bindings in snappy-c.h.
@@ -102,12 +102,12 @@ tests to verify you have not broken anything. Note that if you have the
Google Test library installed, unit test behavior (especially failures) will be
significantly more user-friendly. You can find Google Test at
- http://code.google.com/p/googletest/
+ http://github.com/google/googletest
You probably also want the gflags library for handling of command-line flags;
you can find it at
- http://code.google.com/p/google-gflags/
+ http://gflags.github.io/gflags/
In addition to the unit tests, snappy contains microbenchmarks used to
tune compression and decompression performance. These are automatically run
@@ -129,7 +129,11 @@ test.)
Contact
=======
-Snappy is distributed through Google Code. For the latest version, a bug tracker,
+Snappy is distributed through GitHub. For the latest version, a bug tracker,
and other information, see
- http://code.google.com/p/snappy/
+ http://google.github.io/snappy/
+
+or the repository at
+
+ https://github.com/google/snappy
diff --git a/thirdparty/snappy/snappy-internal.h b/thirdparty/snappy/snappy-internal.h
index 0653dc65..b8355c08 100644
--- a/thirdparty/snappy/snappy-internal.h
+++ b/thirdparty/snappy/snappy-internal.h
@@ -70,11 +70,12 @@ char* CompressFragment(const char* input,
uint16* table,
const int table_size);
-// Return the largest n such that
+// Find the largest n such that
//
// s1[0,n-1] == s2[0,n-1]
// and n <= (s2_limit - s2).
//
+// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
@@ -82,11 +83,28 @@ char* CompressFragment(const char* input,
// Separate implementation for x86_64, for speed. Uses the fact that
// x86_64 is little endian.
#if defined(ARCH_K8)
-static inline int FindMatchLength(const char* s1,
- const char* s2,
- const char* s2_limit) {
+static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
+ const char* s2,
+ const char* s2_limit) {
assert(s2_limit >= s2);
- int matched = 0;
+ size_t matched = 0;
+
+ // This block isn't necessary for correctness; we could just start looping
+ // immediately. As an optimization though, it is useful. It creates some not
+ // uncommon code paths that determine, without extra effort, whether the match
+ // length is less than 8. In short, we are hoping to avoid a conditional
+ // branch, and perhaps get better code layout from the C++ compiler.
+ if (PREDICT_TRUE(s2 <= s2_limit - 8)) {
+ uint64 a1 = UNALIGNED_LOAD64(s1);
+ uint64 a2 = UNALIGNED_LOAD64(s2);
+ if (a1 != a2) {
+ return std::pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3,
+ true);
+ } else {
+ matched = 8;
+ s2 += 8;
+ }
+ }
// Find out how long the match is. We loop over the data 64 bits at a
// time until we find a 64-bit block that doesn't match; then we find
@@ -97,14 +115,11 @@ static inline int FindMatchLength(const char* s1,
s2 += 8;
matched += 8;
} else {
- // On current (mid-2008) Opteron models there is a 3% more
- // efficient code sequence to find the first non-matching byte.
- // However, what follows is ~10% better on Intel Core 2 and newer,
- // and we expect AMD's bsf instruction to improve.
uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
int matching_bits = Bits::FindLSBSetNonZero64(x);
matched += matching_bits >> 3;
- return matched;
+ assert(matched >= 8);
+ return std::pair<size_t, bool>(matched, false);
}
}
while (PREDICT_TRUE(s2 < s2_limit)) {
@@ -112,15 +127,15 @@ static inline int FindMatchLength(const char* s1,
++s2;
++matched;
} else {
- return matched;
+ return std::pair<size_t, bool>(matched, matched < 8);
}
}
- return matched;
+ return std::pair<size_t, bool>(matched, matched < 8);
}
#else
-static inline int FindMatchLength(const char* s1,
- const char* s2,
- const char* s2_limit) {
+static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
+ const char* s2,
+ const char* s2_limit) {
// Implementation based on the x86-64 version, above.
assert(s2_limit >= s2);
int matched = 0;
@@ -140,10 +155,72 @@ static inline int FindMatchLength(const char* s1,
++matched;
}
}
- return matched;
+ return std::pair<size_t, bool>(matched, matched < 8);
}
#endif
+// Lookup tables for decompression code. Give --snappy_dump_decompression_table
+// to the unit test to recompute char_table.
+
+enum {
+ LITERAL = 0,
+ COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
+ COPY_2_BYTE_OFFSET = 2,
+ COPY_4_BYTE_OFFSET = 3
+};
+static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset.
+
+// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
+static const uint32 wordmask[] = {
+ 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
+};
+
+// Data stored per entry in lookup table:
+// Range Bits-used Description
+// ------------------------------------
+// 1..64 0..7 Literal/copy length encoded in opcode byte
+// 0..7 8..10 Copy offset encoded in opcode byte / 256
+// 0..4 11..13 Extra bytes after opcode
+//
+// We use eight bits for the length even though 7 would have sufficed
+// because of efficiency reasons:
+// (1) Extracting a byte is faster than a bit-field
+// (2) It properly aligns copy offset so we do not need a <<8
+static const uint16 char_table[256] = {
+ 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
+ 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
+ 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
+ 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
+ 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
+ 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
+ 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
+ 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
+ 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
+ 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
+ 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
+ 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
+ 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
+ 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
+ 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
+ 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
+ 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
+ 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
+ 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
+ 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
+ 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
+ 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
+ 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
+ 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
+ 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
+ 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
+ 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
+ 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
+ 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
+ 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
+ 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
+ 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+};
+
} // end namespace internal
} // end namespace snappy
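
The refactored FindMatchLength above now returns both the match length and a flag for whether the match was shorter than eight bytes; callers such as EmitCopy use that flag to pick the cheap tag encoding without re-testing the length. A byte-at-a-time sketch of that contract, ignoring the 64-bit fast paths (the names here are illustrative, not snappy's):

    #include <cassert>
    #include <cstddef>
    #include <utility>

    // Simplified stand-in for snappy::internal::FindMatchLength: returns
    // (n, n < 8) where n is the length of the common prefix of s1 and s2,
    // never reading s2_limit or beyond.
    static std::pair<size_t, bool> FindMatchLengthSketch(const char* s1,
                                                         const char* s2,
                                                         const char* s2_limit) {
      assert(s2_limit >= s2);
      size_t matched = 0;
      while (s2 < s2_limit && s1[matched] == *s2) {
        ++s2;
        ++matched;
      }
      return std::make_pair(matched, matched < 8);
    }

    int main() {
      const char a[] = "abcdefgh12345678";
      const char b[] = "abcdefgh1234XXXX";
      std::pair<size_t, bool> p = FindMatchLengthSketch(a, b, b + 16);
      assert(p.first == 12);       // twelve leading bytes agree
      assert(p.second == false);   // 12 >= 8, so the "short match" flag is false
      return 0;
    }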
diff --git a/thirdparty/snappy/snappy-stubs-internal.h b/thirdparty/snappy/snappy-stubs-internal.h
index ddca1a8b..1954c63d 100644
--- a/thirdparty/snappy/snappy-stubs-internal.h
+++ b/thirdparty/snappy/snappy-stubs-internal.h
@@ -116,6 +116,15 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
// sub-architectures.
//
// This is a mess, but there's not much we can do about it.
+//
+// To further complicate matters, only LDR instructions (single reads) are
+// allowed to be unaligned, not LDRD (two reads) or LDM (many reads). Unless we
+// explicitly tell the compiler that these accesses can be unaligned, it can and
+// will combine accesses. On armcc, the way to signal this is done by accessing
+// through the type (uint32 __packed *), but GCC has no such attribute
+// (it ignores __attribute__((packed)) on individual variables). However,
+// we can tell it that a _struct_ is unaligned, which has the same effect,
+// so we do that.
#elif defined(__arm__) && \
!defined(__ARM_ARCH_4__) && \
@@ -131,11 +140,39 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
!defined(__ARM_ARCH_6ZK__) && \
!defined(__ARM_ARCH_6T2__)
-#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
-#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
+#if __GNUC__
+#define ATTRIBUTE_PACKED __attribute__((__packed__))
+#else
+#define ATTRIBUTE_PACKED
+#endif
-#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
-#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
+namespace base {
+namespace internal {
+
+struct Unaligned16Struct {
+ uint16 value;
+ uint8 dummy; // To make the size non-power-of-two.
+} ATTRIBUTE_PACKED;
+
+struct Unaligned32Struct {
+ uint32 value;
+ uint8 dummy; // To make the size non-power-of-two.
+} ATTRIBUTE_PACKED;
+
+} // namespace internal
+} // namespace base
+
+#define UNALIGNED_LOAD16(_p) \
+ ((reinterpret_cast<const ::snappy::base::internal::Unaligned16Struct *>(_p))->value)
+#define UNALIGNED_LOAD32(_p) \
+ ((reinterpret_cast<const ::snappy::base::internal::Unaligned32Struct *>(_p))->value)
+
+#define UNALIGNED_STORE16(_p, _val) \
+ ((reinterpret_cast< ::snappy::base::internal::Unaligned16Struct *>(_p))->value = \
+ (_val))
+#define UNALIGNED_STORE32(_p, _val) \
+ ((reinterpret_cast< ::snappy::base::internal::Unaligned32Struct *>(_p))->value = \
+ (_val))
// TODO(user): NEON supports unaligned 64-bit loads and stores.
// See if that would be more efficient on platforms supporting it,
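
The hunk above works around GCC ignoring __attribute__((packed)) on individual variables: wrapping the value in a packed struct (with a dummy byte so the size is not a power of two) makes the compiler emit loads and stores that are safe at unaligned addresses on ARM. A minimal standalone illustration of the same trick, assuming a GCC-compatible compiler (Unaligned32 and LoadUnaligned32 are made-up names):

    #include <cstdint>
    #include <cstdio>

    #if defined(__GNUC__)
    #define ATTRIBUTE_PACKED __attribute__((__packed__))
    #else
    #define ATTRIBUTE_PACKED
    #endif

    // The dummy byte makes the struct size a non-power-of-two, which keeps
    // the compiler from assuming natural alignment for the whole object.
    struct Unaligned32 {
      uint32_t value;
      uint8_t dummy;
    } ATTRIBUTE_PACKED;

    static uint32_t LoadUnaligned32(const void* p) {
      return reinterpret_cast<const Unaligned32*>(p)->value;
    }

    int main() {
      unsigned char buf[8] = {0, 0x78, 0x56, 0x34, 0x12, 0, 0, 0};
      // buf + 1 is deliberately misaligned for a 4-byte read.
      uint32_t v = LoadUnaligned32(buf + 1);
      std::printf("0x%08x\n", v);  // 0x12345678 on a little-endian machine
      return 0;
    }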
diff --git a/thirdparty/snappy/snappy-stubs-public.h.in b/thirdparty/snappy/snappy-stubs-public.h.in
index ebe676cc..96989ac3 100644
--- a/thirdparty/snappy/snappy-stubs-public.h.in
+++ b/thirdparty/snappy/snappy-stubs-public.h.in
@@ -80,9 +80,11 @@ typedef unsigned long long uint64;
typedef std::string string;
+#ifndef DISALLOW_COPY_AND_ASSIGN
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&); \
void operator=(const TypeName&)
+#endif
#if !@ac_cv_have_sys_uio_h@
// Windows does not have an iovec type, yet the concept is universally useful.
diff --git a/thirdparty/snappy/snappy-test.h b/thirdparty/snappy/snappy-test.h
index dbc55b98..5fb09c78 100644
--- a/thirdparty/snappy/snappy-test.h
+++ b/thirdparty/snappy/snappy-test.h
@@ -196,6 +196,7 @@ void Test_Snappy_RandomData();
void Test_Snappy_FourByteOffset();
void Test_SnappyCorruption_TruncatedVarint();
void Test_SnappyCorruption_UnterminatedVarint();
+void Test_SnappyCorruption_OverflowingVarint();
void Test_Snappy_ReadPastEndOfBuffer();
void Test_Snappy_FindMatchLength();
void Test_Snappy_FindMatchLengthRandom();
@@ -500,6 +501,7 @@ static inline int RUN_ALL_TESTS() {
snappy::Test_Snappy_FourByteOffset();
snappy::Test_SnappyCorruption_TruncatedVarint();
snappy::Test_SnappyCorruption_UnterminatedVarint();
+ snappy::Test_SnappyCorruption_OverflowingVarint();
snappy::Test_Snappy_ReadPastEndOfBuffer();
snappy::Test_Snappy_FindMatchLength();
snappy::Test_Snappy_FindMatchLengthRandom();
diff --git a/thirdparty/snappy/snappy.cc b/thirdparty/snappy/snappy.cc
index b6ca7ece..4bcea0b4 100644
--- a/thirdparty/snappy/snappy.cc
+++ b/thirdparty/snappy/snappy.cc
@@ -30,6 +30,9 @@
#include "snappy-internal.h"
#include "snappy-sinksource.h"
+#if defined(__x86_64__) || defined(_M_X64)
+#include <emmintrin.h>
+#endif
#include <stdio.h>
#include <algorithm>
@@ -39,6 +42,13 @@
namespace snappy {
+using internal::COPY_1_BYTE_OFFSET;
+using internal::COPY_2_BYTE_OFFSET;
+using internal::LITERAL;
+using internal::char_table;
+using internal::kMaximumTagLength;
+using internal::wordmask;
+
// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
// compression for compressible input, and more speed for incompressible
@@ -76,79 +86,125 @@ size_t MaxCompressedLength(size_t source_len) {
return 32 + source_len + source_len/6;
}
-enum {
- LITERAL = 0,
- COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
- COPY_2_BYTE_OFFSET = 2,
- COPY_4_BYTE_OFFSET = 3
-};
-static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset.
-
-// Copy "len" bytes from "src" to "op", one byte at a time. Used for
-// handling COPY operations where the input and output regions may
-// overlap. For example, suppose:
-// src == "ab"
-// op == src + 2
-// len == 20
-// After IncrementalCopy(src, op, len), the result will have
-// eleven copies of "ab"
-// ababababababababababab
-// Note that this does not match the semantics of either memcpy()
-// or memmove().
-static inline void IncrementalCopy(const char* src, char* op, ssize_t len) {
- assert(len > 0);
- do {
- *op++ = *src++;
- } while (--len > 0);
+namespace {
+
+void UnalignedCopy64(const void* src, void* dst) {
+ memcpy(dst, src, 8);
}
-// Equivalent to IncrementalCopy except that it can write up to ten extra
-// bytes after the end of the copy, and that it is faster.
-//
-// The main part of this loop is a simple copy of eight bytes at a time until
-// we've copied (at least) the requested amount of bytes. However, if op and
-// src are less than eight bytes apart (indicating a repeating pattern of
-// length < 8), we first need to expand the pattern in order to get the correct
-// results. For instance, if the buffer looks like this, with the eight-byte
-// <src> and <op> patterns marked as intervals:
-//
-// abxxxxxxxxxxxx
-// [------] src
-// [------] op
-//
-// a single eight-byte copy from <src> to <op> will repeat the pattern once,
-// after which we can move <op> two bytes without moving <src>:
-//
-// ababxxxxxxxxxx
-// [------] src
-// [------] op
-//
-// and repeat the exercise until the two no longer overlap.
-//
-// This allows us to do very well in the special case of one single byte
-// repeated many times, without taking a big hit for more general cases.
-//
-// The worst case of extra writing past the end of the match occurs when
-// op - src == 1 and len == 1; the last copy will read from byte positions
-// [0..7] and write to [4..11], whereas it was only supposed to write to
-// position 1. Thus, ten excess bytes.
+void UnalignedCopy128(const void* src, void* dst) {
+ // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
+ // SSE2 moves for memcpy(dst, src, 16).
+#ifdef __SSE2__
+ __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
+ _mm_storeu_si128(static_cast<__m128i*>(dst), x);
+#else
+ memcpy(dst, src, 16);
+#endif
+}
-namespace {
+// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
+// for handling COPY operations where the input and output regions may overlap.
+// For example, suppose:
+// src == "ab"
+// op == src + 2
+// op_limit == op + 20
+// After IncrementalCopySlow(src, op, op_limit), the result will have eleven
+// copies of "ab"
+// ababababababababababab
+// Note that this does not match the semantics of either memcpy() or memmove().
+inline char* IncrementalCopySlow(const char* src, char* op,
+ char* const op_limit) {
+ while (op < op_limit) {
+ *op++ = *src++;
+ }
+ return op_limit;
+}
-const int kMaxIncrementCopyOverflow = 10;
+// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
+// IncrementalCopySlow. buf_limit is the address past the end of the writable
+// region of the buffer.
+inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
+ char* const buf_limit) {
+ // Terminology:
+ //
+ // slop = buf_limit - op
+ // pat = op - src
+ // len = limit - op
+ assert(src < op);
+ assert(op_limit <= buf_limit);
+ // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
+ // to optimize this function but we have to also handle these cases in case
+ // the input does not satisfy these conditions.
+
+ size_t pattern_size = op - src;
+ // The cases are split into different branches to allow the branch predictor,
+ // FDO, and static prediction hints to work better. For each input we list the
+ // ratio of invocations that match each condition.
+ //
+ // input slop < 16 pat < 8 len > 16
+ // ------------------------------------------
+ // html|html4|cp 0% 1.01% 27.73%
+ // urls 0% 0.88% 14.79%
+ // jpg 0% 64.29% 7.14%
+ // pdf 0% 2.56% 58.06%
+ // txt[1-4] 0% 0.23% 0.97%
+ // pb 0% 0.96% 13.88%
+ // bin 0.01% 22.27% 41.17%
+ //
+ // It is very rare that we don't have enough slop for doing block copies. It
+ // is also rare that we need to expand a pattern. Small patterns are common
+ // for incompressible formats and for those we are plenty fast already.
+ // Lengths are normally not greater than 16 but they vary depending on the
+ // input. In general if we always predict len <= 16 it would be an ok
+ // prediction.
+ //
+ // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
+ // copying 2x 8 bytes at a time.
+
+ // Handle the uncommon case where pattern is less than 8 bytes.
+ if (PREDICT_FALSE(pattern_size < 8)) {
+ // Expand pattern to at least 8 bytes. The worse case scenario in terms of
+ // buffer usage is when the pattern is size 3. ^ is the original position
+ // of op. x are irrelevant bytes copied by the last UnalignedCopy64.
+ //
+ // abc
+ // abcabcxxxxx
+ // abcabcabcabcxxxxx
+ // ^
+ // The last x is 14 bytes after ^.
+ if (PREDICT_TRUE(op <= buf_limit - 14)) {
+ while (pattern_size < 8) {
+ UnalignedCopy64(src, op);
+ op += pattern_size;
+ pattern_size *= 2;
+ }
+ if (PREDICT_TRUE(op >= op_limit)) return op_limit;
+ } else {
+ return IncrementalCopySlow(src, op, op_limit);
+ }
+ }
+ assert(pattern_size >= 8);
-inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) {
- while (PREDICT_FALSE(op - src < 8)) {
+ // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
+ // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
+ // because expanding the pattern to at least 8 bytes guarantees that
+ // op - src >= 8.
+ while (op <= buf_limit - 16) {
UnalignedCopy64(src, op);
- len -= op - src;
- op += op - src;
- }
- while (len > 0) {
+ UnalignedCopy64(src + 8, op + 8);
+ src += 16;
+ op += 16;
+ if (PREDICT_TRUE(op >= op_limit)) return op_limit;
+ }
+ // We only take this branch if we didn't have enough slop and we can do a
+ // single 8 byte copy.
+ if (PREDICT_FALSE(op <= buf_limit - 8)) {
UnalignedCopy64(src, op);
src += 8;
op += 8;
- len -= 8;
}
+ return IncrementalCopySlow(src, op, op_limit);
}
} // namespace
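
The pattern-expansion loop above is the core of the rewritten IncrementalCopy: when op - src is less than 8, a few overlapping copies double the pattern ("abc" becomes "abcabc", then "abcabcabcabc") until plain 8/16-byte block copies are safe. A simplified byte-level sketch of that doubling, without the slop and limit handling of the real function (ExpandPattern is an illustrative name):

    #include <cassert>
    #include <cstddef>
    #include <cstring>

    // Expand the pattern [src, op) until it is at least 'min_size' bytes long
    // by copying the existing bytes forward, doubling the pattern each round.
    // Returns the new write position.  This mirrors the doubling loop in
    // IncrementalCopy, but uses non-overlapping memcpy calls for clarity.
    static char* ExpandPattern(const char* src, char* op, size_t min_size) {
      size_t pattern_size = static_cast<size_t>(op - src);
      while (pattern_size < min_size) {
        std::memcpy(op, src, pattern_size);  // source and destination are disjoint
        op += pattern_size;
        pattern_size *= 2;
      }
      return op;
    }

    int main() {
      char buf[64] = "abc";              // a 3-byte pattern at the start
      char* op = ExpandPattern(buf, buf + 3, 8);
      assert(op - buf == 12);            // 3 -> 6 -> 12 bytes written
      assert(std::memcmp(buf, "abcabcabcabc", 12) == 0);
      return 0;
    }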
@@ -157,26 +213,29 @@ static inline char* EmitLiteral(char* op,
const char* literal,
int len,
bool allow_fast_path) {
- int n = len - 1; // Zero-length literals are disallowed
- if (n < 60) {
+ // The vast majority of copies are below 16 bytes, for which a
+ // call to memcpy is overkill. This fast path can sometimes
+ // copy up to 15 bytes too much, but that is okay in the
+ // main loop, since we have a bit to go on for both sides:
+ //
+ // - The input will always have kInputMarginBytes = 15 extra
+ // available bytes, as long as we're in the main loop, and
+ // if not, allow_fast_path = false.
+ // - The output will always have 32 spare bytes (see
+ // MaxCompressedLength).
+ assert(len > 0); // Zero-length literals are disallowed
+ int n = len - 1;
+ if (allow_fast_path && len <= 16) {
// Fits in tag byte
*op++ = LITERAL | (n << 2);
- // The vast majority of copies are below 16 bytes, for which a
- // call to memcpy is overkill. This fast path can sometimes
- // copy up to 15 bytes too much, but that is okay in the
- // main loop, since we have a bit to go on for both sides:
- //
- // - The input will always have kInputMarginBytes = 15 extra
- // available bytes, as long as we're in the main loop, and
- // if not, allow_fast_path = false.
- // - The output will always have 32 spare bytes (see
- // MaxCompressedLength).
- if (allow_fast_path && len <= 16) {
- UnalignedCopy64(literal, op);
- UnalignedCopy64(literal + 8, op + 8);
- return op + len;
- }
+ UnalignedCopy128(literal, op);
+ return op + len;
+ }
+
+ if (n < 60) {
+ // Fits in tag byte
+ *op++ = LITERAL | (n << 2);
} else {
// Encode in upcoming bytes
char* base = op;
@@ -195,42 +254,54 @@ static inline char* EmitLiteral(char* op,
return op + len;
}
-static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
+static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
+ bool len_less_than_12) {
assert(len <= 64);
assert(len >= 4);
assert(offset < 65536);
+ assert(len_less_than_12 == (len < 12));
- if ((len < 12) && (offset < 2048)) {
- size_t len_minus_4 = len - 4;
- assert(len_minus_4 < 8); // Must fit in 3 bits
- *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
+ if (len_less_than_12 && PREDICT_TRUE(offset < 2048)) {
+ // offset fits in 11 bits. The 3 highest go in the top of the first byte,
+ // and the rest go in the second byte.
+ *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
*op++ = offset & 0xff;
} else {
- *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
- LittleEndian::Store16(op, offset);
- op += 2;
+ // Write 4 bytes, though we only care about 3 of them. The output buffer
+ // is required to have some slack, so the extra byte won't overrun it.
+ uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
+ LittleEndian::Store32(op, u);
+ op += 3;
}
return op;
}
-static inline char* EmitCopy(char* op, size_t offset, int len) {
- // Emit 64 byte copies but make sure to keep at least four bytes reserved
- while (PREDICT_FALSE(len >= 68)) {
- op = EmitCopyLessThan64(op, offset, 64);
- len -= 64;
- }
+static inline char* EmitCopy(char* op, size_t offset, size_t len,
+ bool len_less_than_12) {
+ assert(len_less_than_12 == (len < 12));
+ if (len_less_than_12) {
+ return EmitCopyAtMost64(op, offset, len, true);
+ } else {
+ // A special case for len <= 64 might help, but so far measurements suggest
+ // it's in the noise.
- // Emit an extra 60 byte copy if have too much data to fit in one copy
- if (len > 64) {
- op = EmitCopyLessThan64(op, offset, 60);
- len -= 60;
- }
+ // Emit 64 byte copies but make sure to keep at least four bytes reserved.
+ while (PREDICT_FALSE(len >= 68)) {
+ op = EmitCopyAtMost64(op, offset, 64, false);
+ len -= 64;
+ }
- // Emit remainder
- op = EmitCopyLessThan64(op, offset, len);
- return op;
-}
+ // One or two copies will now finish the job.
+ if (len > 64) {
+ op = EmitCopyAtMost64(op, offset, 60, false);
+ len -= 60;
+ }
+ // Emit remainder.
+ op = EmitCopyAtMost64(op, offset, len, len < 12);
+ return op;
+ }
+}
bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
uint32 v = 0;
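
EmitCopyAtMost64 above emits either a two-byte tag (lengths 4-11 with offsets under 2048) or a three-byte tag (lengths up to 64 with a 16-bit offset); the new version writes the three-byte form with one 32-bit store and advances by three. A sketch of just the bit packing, under the same preconditions the caller guarantees (EncodeCopyTag is a made-up name; the tag constants match the enum in snappy-internal.h):

    #include <cassert>
    #include <cstdint>
    #include <cstddef>

    enum { COPY_1_BYTE_OFFSET = 1, COPY_2_BYTE_OFFSET = 2 };

    // Encode a copy of 'len' bytes at distance 'offset'.  Mirrors the two
    // cases in EmitCopyAtMost64; returns the number of tag bytes (2 or 3).
    // Assumes 4 <= len <= 64 and offset < 65536, as the caller guarantees.
    static int EncodeCopyTag(uint8_t* op, size_t offset, size_t len) {
      assert(len >= 4 && len <= 64 && offset < 65536);
      if (len < 12 && offset < 2048) {
        // 2-byte tag: 3 bits of length-4, top 3 bits of the 11-bit offset.
        op[0] = static_cast<uint8_t>(COPY_1_BYTE_OFFSET + ((len - 4) << 2) +
                                     ((offset >> 3) & 0xe0));
        op[1] = static_cast<uint8_t>(offset & 0xff);
        return 2;
      }
      // 3-byte tag: 6 bits of length-1, then a little-endian 16-bit offset.
      op[0] = static_cast<uint8_t>(COPY_2_BYTE_OFFSET + ((len - 1) << 2));
      op[1] = static_cast<uint8_t>(offset & 0xff);
      op[2] = static_cast<uint8_t>(offset >> 8);
      return 3;
    }

    int main() {
      uint8_t buf[4];
      assert(EncodeCopyTag(buf, 100, 7) == 2);    // short copy, small offset
      assert((buf[0] & 0x3) == COPY_1_BYTE_OFFSET);
      assert(EncodeCopyTag(buf, 5000, 60) == 3);  // offset needs 16 bits
      assert((buf[0] & 0x3) == COPY_2_BYTE_OFFSET);
      return 0;
    }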
@@ -364,9 +435,9 @@ char* CompressFragment(const char* input,
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
// found, start looking only at every other byte. If 32 more bytes are
- // scanned, look at every third byte, etc.. When a match is found,
- // immediately go back to looking at every byte. This is a small loss
- // (~5% performance, ~0.1% density) for compressible data due to more
+ // scanned (or skipped), look at every third byte, etc.. When a match is
+ // found, immediately go back to looking at every byte. This is a small
+ // loss (~5% performance, ~0.1% density) for compressible data due to more
// bookkeeping, but for non-compressible data (such as JPEG) it's a huge
// win since the compressor quickly "realizes" the data is incompressible
// and doesn't bother looking for matches everywhere.
@@ -382,7 +453,8 @@ char* CompressFragment(const char* input,
ip = next_ip;
uint32 hash = next_hash;
assert(hash == Hash(ip, shift));
- uint32 bytes_between_hash_lookups = skip++ >> 5;
+ uint32 bytes_between_hash_lookups = skip >> 5;
+ skip += bytes_between_hash_lookups;
next_ip = ip + bytes_between_hash_lookups;
if (PREDICT_FALSE(next_ip > ip_limit)) {
goto emit_remainder;
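
The two-line change above alters the skip heuristic: instead of bumping `skip` by one per probe, it now grows by the current step, so the distance between hash lookups widens faster on data that refuses to match. A tiny standalone sketch of the resulting probe schedule, assuming the same starting value of 32 that CompressFragment uses:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Probe schedule after the change: skip starts at 32 and grows by the
      // current step, so the gap between hash probes widens the longer no
      // match is found.
      uint32_t skip = 32;
      uint64_t pos = 0;
      int probes = 0;
      while (pos < 65536) {  // how many hash probes to cross 64 KB of junk?
        uint32_t bytes_between_hash_lookups = skip >> 5;
        skip += bytes_between_hash_lookups;
        pos += bytes_between_hash_lookups;
        ++probes;
      }
      std::printf("%d probes to scan 64 KB, final step %u bytes\n",
                  probes, skip >> 5);
      return 0;
    }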
@@ -417,19 +489,21 @@ char* CompressFragment(const char* input,
// We have a 4-byte match at ip, and no need to emit any
// "literal bytes" prior to ip.
const char* base = ip;
- int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
+ std::pair<size_t, bool> p =
+ FindMatchLength(candidate + 4, ip + 4, ip_end);
+ size_t matched = 4 + p.first;
ip += matched;
size_t offset = base - candidate;
assert(0 == memcmp(base, candidate, matched));
- op = EmitCopy(op, offset, matched);
- // We could immediately start working at ip now, but to improve
- // compression we first update table[Hash(ip - 1, ...)].
- const char* insert_tail = ip - 1;
+ op = EmitCopy(op, offset, matched, p.second);
next_emit = ip;
if (PREDICT_FALSE(ip >= ip_limit)) {
goto emit_remainder;
}
- input_bytes = GetEightBytesAt(insert_tail);
+ // We are now looking for a 4-byte match again. We read
+ // table[Hash(ip, shift)] for that. To improve compression,
+ // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
+ input_bytes = GetEightBytesAt(ip - 1);
uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
table[prev_hash] = ip - base_ip - 1;
uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
@@ -493,162 +567,6 @@ char* CompressFragment(const char* input,
// bool TryFastAppend(const char* ip, size_t available, size_t length);
// };
-// -----------------------------------------------------------------------
-// Lookup table for decompression code. Generated by ComputeTable() below.
-// -----------------------------------------------------------------------
-
-// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
-static const uint32 wordmask[] = {
- 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
-};
-
-// Data stored per entry in lookup table:
-// Range Bits-used Description
-// ------------------------------------
-// 1..64 0..7 Literal/copy length encoded in opcode byte
-// 0..7 8..10 Copy offset encoded in opcode byte / 256
-// 0..4 11..13 Extra bytes after opcode
-//
-// We use eight bits for the length even though 7 would have sufficed
-// because of efficiency reasons:
-// (1) Extracting a byte is faster than a bit-field
-// (2) It properly aligns copy offset so we do not need a <<8
-static const uint16 char_table[256] = {
- 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
- 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
- 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
- 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
- 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
- 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
- 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
- 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
- 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
- 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
- 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
- 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
- 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
- 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
- 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
- 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
- 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
- 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
- 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
- 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
- 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
- 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
- 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
- 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
- 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
- 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
- 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
- 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
- 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
- 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
- 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
- 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
-};
-
-// In debug mode, allow optional computation of the table at startup.
-// Also, check that the decompression table is correct.
-#ifndef NDEBUG
-DEFINE_bool(snappy_dump_decompression_table, false,
- "If true, we print the decompression table at startup.");
-
-static uint16 MakeEntry(unsigned int extra,
- unsigned int len,
- unsigned int copy_offset) {
- // Check that all of the fields fit within the allocated space
- assert(extra == (extra & 0x7)); // At most 3 bits
- assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits
- assert(len == (len & 0x7f)); // At most 7 bits
- return len | (copy_offset << 8) | (extra << 11);
-}
-
-static void ComputeTable() {
- uint16 dst[256];
-
- // Place invalid entries in all places to detect missing initialization
- int assigned = 0;
- for (int i = 0; i < 256; i++) {
- dst[i] = 0xffff;
- }
-
- // Small LITERAL entries. We store (len-1) in the top 6 bits.
- for (unsigned int len = 1; len <= 60; len++) {
- dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
- assigned++;
- }
-
- // Large LITERAL entries. We use 60..63 in the high 6 bits to
- // encode the number of bytes of length info that follow the opcode.
- for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
- // We set the length field in the lookup table to 1 because extra
- // bytes encode len-1.
- dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
- assigned++;
- }
-
- // COPY_1_BYTE_OFFSET.
- //
- // The tag byte in the compressed data stores len-4 in 3 bits, and
- // offset/256 in 5 bits. offset%256 is stored in the next byte.
- //
- // This format is used for length in range [4..11] and offset in
- // range [0..2047]
- for (unsigned int len = 4; len < 12; len++) {
- for (unsigned int offset = 0; offset < 2048; offset += 256) {
- dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
- MakeEntry(1, len, offset>>8);
- assigned++;
- }
- }
-
- // COPY_2_BYTE_OFFSET.
- // Tag contains len-1 in top 6 bits, and offset in next two bytes.
- for (unsigned int len = 1; len <= 64; len++) {
- dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
- assigned++;
- }
-
- // COPY_4_BYTE_OFFSET.
- // Tag contents len-1 in top 6 bits, and offset in next four bytes.
- for (unsigned int len = 1; len <= 64; len++) {
- dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
- assigned++;
- }
-
- // Check that each entry was initialized exactly once.
- if (assigned != 256) {
- fprintf(stderr, "ComputeTable: assigned only %d of 256\n", assigned);
- abort();
- }
- for (int i = 0; i < 256; i++) {
- if (dst[i] == 0xffff) {
- fprintf(stderr, "ComputeTable: did not assign byte %d\n", i);
- abort();
- }
- }
-
- if (FLAGS_snappy_dump_decompression_table) {
- printf("static const uint16 char_table[256] = {\n ");
- for (int i = 0; i < 256; i++) {
- printf("0x%04x%s",
- dst[i],
- ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n " : ", ")));
- }
- printf("};\n");
- }
-
- // Check that computed table matched recorded table
- for (int i = 0; i < 256; i++) {
- if (dst[i] != char_table[i]) {
- fprintf(stderr, "ComputeTable: byte %d: computed (%x), expect (%x)\n",
- i, static_cast<int>(dst[i]), static_cast<int>(char_table[i]));
- abort();
- }
- }
-}
-#endif /* !NDEBUG */
// Helper class for decompression
class SnappyDecompressor {
@@ -701,7 +619,9 @@ class SnappyDecompressor {
if (n == 0) return false;
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
reader_->Skip(1);
- *result |= static_cast<uint32>(c & 0x7f) << shift;
+ uint32 val = c & 0x7f;
+ if (((val << shift) >> shift) != val) return false;
+ *result |= val << shift;
if (c < 128) {
break;
}
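
The added `((val << shift) >> shift) != val` check rejects varints whose high-order groups would shift bits past bit 31, which is exactly what the new OverflowingVarint corruption test feeds in. A self-contained sketch of the same guard over a plain byte buffer (DecodeVarint32 here is illustrative, not snappy's reader-based API):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Decode a little-endian base-128 varint into *result.  Returns false on
    // truncation or if the encoded value does not fit in 32 bits.
    static bool DecodeVarint32(const uint8_t* p, size_t n, uint32_t* result) {
      uint32_t value = 0;
      for (uint32_t shift = 0; shift <= 28 && n > 0; shift += 7, ++p, --n) {
        uint32_t val = *p & 0x7f;
        // Same overflow guard as the patched decoder: if shifting the 7-bit
        // group up and back down changes it, bits were lost.
        if (((val << shift) >> shift) != val) return false;
        value |= val << shift;
        if (*p < 128) {          // high bit clear: last byte of the varint
          *result = value;
          return true;
        }
      }
      return false;              // ran out of input or too many bytes
    }

    int main() {
      uint32_t v = 0;
      const uint8_t ok[] = {0xfb, 0xff, 0xff, 0xff, 0x0f};  // 0xfffffffb, fits
      assert(DecodeVarint32(ok, sizeof(ok), &v) && v == 0xfffffffbu);

      // Same bytes as the new OverflowingVarint test: needs more than 32 bits.
      const uint8_t bad[] = {0xfb, 0xff, 0xff, 0xff, 0x7f};
      assert(!DecodeVarint32(bad, sizeof(bad), &v));
      return 0;
    }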
@@ -715,6 +635,10 @@ class SnappyDecompressor {
template <class Writer>
void DecompressAllTags(Writer* writer) {
const char* ip = ip_;
+ // For position-independent executables, accessing global arrays can be
+ // slow. Move wordmask array onto the stack to mitigate this.
+ uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
+ memcpy(wordmask, internal::wordmask, sizeof(wordmask));
// We could have put this refill fragment only at the beginning of the loop.
// However, duplicating it at the end of each branch gives the compiler more
@@ -731,7 +655,19 @@ class SnappyDecompressor {
for ( ;; ) {
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
- if ((c & 0x3) == LITERAL) {
+ // Ratio of iterations that have LITERAL vs non-LITERAL for different
+ // inputs.
+ //
+ // input LITERAL NON_LITERAL
+ // -----------------------------------
+ // html|html4|cp 23% 77%
+ // urls 36% 64%
+ // jpg 47% 53%
+ // pdf 19% 81%
+ // txt[1-4] 25% 75%
+ // pb 24% 76%
+ // bin 24% 76%
+ if (PREDICT_FALSE((c & 0x3) == LITERAL)) {
size_t literal_length = (c >> 2) + 1u;
if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
assert(literal_length < 61);
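
PREDICT_TRUE and PREDICT_FALSE, used heavily in this patch (for example on the LITERAL branch above, guided by the ratio table in the comment), come from snappy-stubs-internal.h and are thin wrappers over __builtin_expect where the compiler supports it. A simplified sketch of that pattern, assuming a GCC-style compiler; the real header hides this behind a configure check:

    #include <cstdio>

    // Simplified version of snappy's branch-prediction hint macros.
    #if defined(__GNUC__)
    #define PREDICT_TRUE(x)  (__builtin_expect(!!(x), 1))
    #define PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))
    #else
    #define PREDICT_TRUE(x)  (x)
    #define PREDICT_FALSE(x) (x)
    #endif

    int main(int argc, char**) {
      // The hint only affects code layout; the branch behaves the same either way.
      if (PREDICT_FALSE(argc > 100)) {
        std::printf("unlikely path\n");
      } else {
        std::printf("likely path\n");
      }
      return 0;
    }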
@@ -767,15 +703,15 @@ class SnappyDecompressor {
ip += literal_length;
MAYBE_REFILL();
} else {
- const uint32 entry = char_table[c];
- const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
- const uint32 length = entry & 0xff;
+ const size_t entry = char_table[c];
+ const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ const size_t length = entry & 0xff;
ip += entry >> 11;
// copy_offset/256 is encoded in bits 8..10. By just fetching
// those bits, we get copy_offset (since the bit-field starts at
// bit 8).
- const uint32 copy_offset = entry & 0x700;
+ const size_t copy_offset = entry & 0x700;
if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
return;
}
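
The decompressor above unpacks each char_table entry with one shift and two masks, following the layout documented where the table is defined in snappy-internal.h: length in bits 0-7, copy offset divided by 256 in bits 8-10 (left in place), extra tag bytes in bits 11-13. A small sketch that decodes two entries taken from that table the same way (TagInfo is an illustrative name):

    #include <cassert>
    #include <cstdint>

    struct TagInfo {
      uint32_t length;       // literal/copy length encoded in the opcode byte
      uint32_t copy_offset;  // copy offset encoded in the opcode, already * 256
      uint32_t extra_bytes;  // bytes that follow the opcode
    };

    // Unpack one 16-bit char_table entry exactly as DecompressAllTags does.
    static TagInfo DecodeEntry(uint16_t entry) {
      TagInfo t;
      t.length      = entry & 0xff;
      t.copy_offset = entry & 0x700;  // bits 8..10, left shifted in place
      t.extra_bytes = entry >> 11;
      return t;
    }

    int main() {
      // 0x0001: tag 0x00 is a 1-byte literal with no extra bytes.
      TagInfo lit = DecodeEntry(0x0001);
      assert(lit.length == 1 && lit.extra_bytes == 0 && lit.copy_offset == 0);

      // 0x0804: tag 0x01 is a COPY_1_BYTE_OFFSET of length 4 with one extra
      // offset byte; bits 8..10 are zero, so the offset high part is 0.
      TagInfo copy = DecodeEntry(0x0804);
      assert(copy.length == 4 && copy.extra_bytes == 1 && copy.copy_offset == 0);
      return 0;
    }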
@@ -795,10 +731,8 @@ bool SnappyDecompressor::RefillTag() {
size_t n;
ip = reader_->Peek(&n);
peeked_ = n;
- if (n == 0) {
- eof_ = true;
- return false;
- }
+ eof_ = (n == 0);
+ if (eof_) return false;
ip_limit_ = ip + n;
}
@@ -966,7 +900,7 @@ class SnappyIOVecWriter {
const size_t output_iov_count_;
// We are currently writing into output_iov_[curr_iov_index_].
- int curr_iov_index_;
+ size_t curr_iov_index_;
// Bytes written to output_iov_[curr_iov_index_] so far.
size_t curr_iov_written_;
@@ -977,7 +911,7 @@ class SnappyIOVecWriter {
// Maximum number of bytes that will be decompressed into output_iov_.
size_t output_limit_;
- inline char* GetIOVecPointer(int index, size_t offset) {
+ inline char* GetIOVecPointer(size_t index, size_t offset) {
return reinterpret_cast<char*>(output_iov_[index].iov_base) +
offset;
}
@@ -1038,8 +972,7 @@ class SnappyIOVecWriter {
output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
- UnalignedCopy64(ip, ptr);
- UnalignedCopy64(ip + 8, ptr + 8);
+ UnalignedCopy128(ip, ptr);
curr_iov_written_ += len;
total_written_ += len;
return true;
@@ -1058,7 +991,7 @@ class SnappyIOVecWriter {
}
// Locate the iovec from which we need to start the copy.
- int from_iov_index = curr_iov_index_;
+ size_t from_iov_index = curr_iov_index_;
size_t from_iov_offset = curr_iov_written_;
while (offset > 0) {
if (from_iov_offset >= offset) {
@@ -1067,8 +1000,8 @@ class SnappyIOVecWriter {
}
offset -= from_iov_offset;
+ assert(from_iov_index > 0);
--from_iov_index;
- assert(from_iov_index >= 0);
from_iov_offset = output_iov_[from_iov_index].iov_len;
}
@@ -1103,9 +1036,10 @@ class SnappyIOVecWriter {
if (to_copy > len) {
to_copy = len;
}
- IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- to_copy);
+ IncrementalCopySlow(
+ GetIOVecPointer(from_iov_index, from_iov_offset),
+ GetIOVecPointer(curr_iov_index_, curr_iov_written_),
+ GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
curr_iov_written_ += to_copy;
from_iov_offset += to_copy;
total_written_ += to_copy;
@@ -1175,8 +1109,7 @@ class SnappyArrayWriter {
const size_t space_left = op_limit_ - op;
if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
- UnalignedCopy64(ip, op);
- UnalignedCopy64(ip + 8, op + 8);
+ UnalignedCopy128(ip, op);
op_ = op + len;
return true;
} else {
@@ -1185,8 +1118,7 @@ class SnappyArrayWriter {
}
inline bool AppendFromSelf(size_t offset, size_t len) {
- char* op = op_;
- const size_t space_left = op_limit_ - op;
+ char* const op_end = op_ + len;
// Check if we try to append from before the start of the buffer.
// Normally this would just be a check for "produced < offset",
@@ -1195,30 +1127,13 @@ class SnappyArrayWriter {
// to a very big number. This is convenient, as offset==0 is another
// invalid case that we also want to catch, so that we do not go
// into an infinite loop.
- assert(op >= base_);
- size_t produced = op - base_;
- if (produced <= offset - 1u) {
- return false;
- }
- if (len <= 16 && offset >= 8 && space_left >= 16) {
- // Fast path, used for the majority (70-80%) of dynamic invocations.
- UnalignedCopy64(op - offset, op);
- UnalignedCopy64(op - offset + 8, op + 8);
- } else {
- if (space_left >= len + kMaxIncrementCopyOverflow) {
- IncrementalCopyFastPath(op - offset, op, len);
- } else {
- if (space_left < len) {
- return false;
- }
- IncrementalCopy(op - offset, op, len);
- }
- }
+ if (Produced() <= offset - 1u || op_end > op_limit_) return false;
+ op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);
- op_ = op + len;
return true;
}
inline size_t Produced() const {
+ assert(op_ >= base_);
return op_ - base_;
}
inline void Flush() {}
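
The `Produced() <= offset - 1u` comparison kept in the rewritten AppendFromSelf relies on unsigned wrap-around: for offset == 0 the right-hand side becomes the largest size_t, so a single branch rejects both a zero offset and an offset reaching before the start of the output, as the surrounding comment explains. A standalone illustration of that trick (CopyOffsetIsValid is a made-up name):

    #include <cassert>
    #include <cstddef>

    // Returns true when a back-reference of 'offset' bytes is valid given
    // that 'produced' bytes have already been written.  One unsigned
    // comparison covers both offset == 0 and offset > produced, because
    // 0 - 1u wraps to SIZE_MAX.
    static bool CopyOffsetIsValid(size_t produced, size_t offset) {
      return produced > offset - 1u;
    }

    int main() {
      assert(!CopyOffsetIsValid(10, 0));   // offset 0 wraps to SIZE_MAX: rejected
      assert(!CopyOffsetIsValid(10, 11));  // reaches before the buffer start
      assert(CopyOffsetIsValid(10, 10));   // exactly the whole output so far: ok
      assert(CopyOffsetIsValid(10, 1));    // normal small back-reference
      return 0;
    }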
@@ -1304,7 +1219,7 @@ void RawCompress(const char* input,
size_t Compress(const char* input, size_t input_length, string* compressed) {
// Pre-grow the buffer to the max length of the compressed output
- compressed->resize(MaxCompressedLength(input_length));
+ STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
size_t compressed_length;
RawCompress(input, input_length, string_as_array(compressed),
@@ -1327,7 +1242,7 @@ class SnappyScatteredWriter {
// We need random access into the data generated so far. Therefore
// we keep track of all of the generated data as an array of blocks.
// All of the blocks except the last have length kBlockSize.
- vector<char*> blocks_;
+ std::vector<char*> blocks_;
size_t expected_;
// Total size of all fully generated blocks so far
@@ -1386,8 +1301,7 @@ class SnappyScatteredWriter {
if (length <= 16 && available >= 16 + kMaximumTagLength &&
space_left >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
- UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
- UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+ UnalignedCopy128(ip, op);
op_ptr_ = op + length;
return true;
} else {
@@ -1396,16 +1310,13 @@ class SnappyScatteredWriter {
}
inline bool AppendFromSelf(size_t offset, size_t len) {
+ char* const op_end = op_ptr_ + len;
// See SnappyArrayWriter::AppendFromSelf for an explanation of
// the "offset - 1u" trick.
- if (offset - 1u < op_ptr_ - op_base_) {
- const size_t space_left = op_limit_ - op_ptr_;
- if (space_left >= len + kMaxIncrementCopyOverflow) {
- // Fast path: src and dst in current block.
- IncrementalCopyFastPath(op_ptr_ - offset, op_ptr_, len);
- op_ptr_ += len;
- return true;
- }
+ if (PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ && op_end <= op_limit_)) {
+ // Fast path: src and dst in current block.
+ op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
+ return true;
}
return SlowAppendFromSelf(offset, len);
}
@@ -1510,7 +1421,7 @@ class SnappySinkAllocator {
}
Sink* dest_;
- vector<Datablock> blocks_;
+ std::vector<Datablock> blocks_;
// Note: copying this object is allowed
};
diff --git a/thirdparty/snappy/snappy_unittest.cc b/thirdparty/snappy/snappy_unittest.cc
index 4a80f2ad..ca95e0d8 100644
--- a/thirdparty/snappy/snappy_unittest.cc
+++ b/thirdparty/snappy/snappy_unittest.cc
@@ -64,6 +64,9 @@ DEFINE_bool(write_compressed, false,
DEFINE_bool(write_uncompressed, false,
"Write uncompressed versions of each file to <file>.uncomp");
+DEFINE_bool(snappy_dump_decompression_table, false,
+ "If true, we print the decompression table during tests.");
+
namespace snappy {
@@ -390,10 +393,10 @@ static void Measure(const char* data,
{
// Chop the input into blocks
int num_blocks = (length + block_size - 1) / block_size;
- vector<const char*> input(num_blocks);
- vector<size_t> input_length(num_blocks);
- vector<string> compressed(num_blocks);
- vector<string> output(num_blocks);
+ std::vector<const char*> input(num_blocks);
+ std::vector<size_t> input_length(num_blocks);
+ std::vector<string> compressed(num_blocks);
+ std::vector<string> output(num_blocks);
for (int b = 0; b < num_blocks; b++) {
int input_start = b * block_size;
int input_limit = min<int>((b+1)*block_size, length);
@@ -1004,6 +1007,20 @@ TEST(SnappyCorruption, UnterminatedVarint) {
&uncompressed));
}
+TEST(SnappyCorruption, OverflowingVarint) {
+ string compressed, uncompressed;
+ size_t ulength;
+ compressed.push_back('\xfb');
+ compressed.push_back('\xff');
+ compressed.push_back('\xff');
+ compressed.push_back('\xff');
+ compressed.push_back('\x7f');
+ CHECK(!CheckUncompressedLength(compressed, &ulength));
+ CHECK(!snappy::IsValidCompressedBuffer(compressed.data(), compressed.size()));
+ CHECK(!snappy::Uncompress(compressed.data(), compressed.size(),
+ &uncompressed));
+}
+
TEST(Snappy, ReadPastEndOfBuffer) {
// Check that we do not read past end of input
@@ -1037,7 +1054,10 @@ TEST(Snappy, ZeroOffsetCopyValidation) {
namespace {
int TestFindMatchLength(const char* s1, const char *s2, unsigned length) {
- return snappy::internal::FindMatchLength(s1, s2, s2 + length);
+ std::pair<size_t, bool> p =
+ snappy::internal::FindMatchLength(s1, s2, s2 + length);
+ CHECK_EQ(p.first < 8, p.second);
+ return p.first;
}
} // namespace
@@ -1147,8 +1167,7 @@ TEST(Snappy, FindMatchLengthRandom) {
}
DataEndingAtUnreadablePage u(s);
DataEndingAtUnreadablePage v(t);
- int matched = snappy::internal::FindMatchLength(
- u.data(), v.data(), v.data() + t.size());
+ int matched = TestFindMatchLength(u.data(), v.data(), t.size());
if (matched == t.size()) {
EXPECT_EQ(s, t);
} else {
@@ -1160,6 +1179,100 @@ TEST(Snappy, FindMatchLengthRandom) {
}
}
+static uint16 MakeEntry(unsigned int extra,
+ unsigned int len,
+ unsigned int copy_offset) {
+ // Check that all of the fields fit within the allocated space
+ assert(extra == (extra & 0x7)); // At most 3 bits
+ assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits
+ assert(len == (len & 0x7f)); // At most 7 bits
+ return len | (copy_offset << 8) | (extra << 11);
+}
+
+// Check that the decompression table is correct, and optionally print out
+// the computed one.
+TEST(Snappy, VerifyCharTable) {
+ using snappy::internal::LITERAL;
+ using snappy::internal::COPY_1_BYTE_OFFSET;
+ using snappy::internal::COPY_2_BYTE_OFFSET;
+ using snappy::internal::COPY_4_BYTE_OFFSET;
+ using snappy::internal::char_table;
+ using snappy::internal::wordmask;
+
+ uint16 dst[256];
+
+ // Place invalid entries in all places to detect missing initialization
+ int assigned = 0;
+ for (int i = 0; i < 256; i++) {
+ dst[i] = 0xffff;
+ }
+
+ // Small LITERAL entries. We store (len-1) in the top 6 bits.
+ for (unsigned int len = 1; len <= 60; len++) {
+ dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
+ assigned++;
+ }
+
+ // Large LITERAL entries. We use 60..63 in the high 6 bits to
+ // encode the number of bytes of length info that follow the opcode.
+ for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
+ // We set the length field in the lookup table to 1 because extra
+ // bytes encode len-1.
+ dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
+ assigned++;
+ }
+
+ // COPY_1_BYTE_OFFSET.
+ //
+ // The tag byte in the compressed data stores len-4 in 3 bits, and
+ // offset/256 in 5 bits. offset%256 is stored in the next byte.
+ //
+ // This format is used for length in range [4..11] and offset in
+ // range [0..2047]
+ for (unsigned int len = 4; len < 12; len++) {
+ for (unsigned int offset = 0; offset < 2048; offset += 256) {
+ dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
+ MakeEntry(1, len, offset>>8);
+ assigned++;
+ }
+ }
+
+ // COPY_2_BYTE_OFFSET.
+ // Tag contains len-1 in top 6 bits, and offset in next two bytes.
+ for (unsigned int len = 1; len <= 64; len++) {
+ dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
+ assigned++;
+ }
+
+ // COPY_4_BYTE_OFFSET.
+ // Tag contents len-1 in top 6 bits, and offset in next four bytes.
+ for (unsigned int len = 1; len <= 64; len++) {
+ dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
+ assigned++;
+ }
+
+ // Check that each entry was initialized exactly once.
+ EXPECT_EQ(256, assigned) << "Assigned only " << assigned << " of 256";
+ for (int i = 0; i < 256; i++) {
+ EXPECT_NE(0xffff, dst[i]) << "Did not assign byte " << i;
+ }
+
+ if (FLAGS_snappy_dump_decompression_table) {
+ printf("static const uint16 char_table[256] = {\n ");
+ for (int i = 0; i < 256; i++) {
+ printf("0x%04x%s",
+ dst[i],
+ ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n " : ", ")));
+ }
+ printf("};\n");
+ }
+
+ // Check that computed table matched recorded table.
+ for (int i = 0; i < 256; i++) {
+ EXPECT_EQ(dst[i], char_table[i]) << "Mismatch in byte " << i;
+ }
+}
+
static void CompressFile(const char* fname) {
string fullinput;
CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults()));