author     Jose Fonseca <jfonseca@vmware.com>    2017-03-23 11:05:20 +0000
committer  Jose Fonseca <jfonseca@vmware.com>    2017-04-08 08:24:55 +0100
commit     a684e9e18aaf3e67cd43a516227a568ee1d5c102 (patch)
tree       2c76ec05858e76e573f932b298680370a2194433 /thirdparty
parent     e8ee4f3999a36e043224980b2343ead3d658008a (diff)
snappy: Update to 1.1.4.
Diffstat (limited to 'thirdparty')
-rw-r--r--  thirdparty/snappy/NEWS                       |   8
-rw-r--r--  thirdparty/snappy/README                     |  16
-rw-r--r--  thirdparty/snappy/snappy-internal.h          | 109
-rw-r--r--  thirdparty/snappy/snappy-stubs-internal.h    |  45
-rw-r--r--  thirdparty/snappy/snappy-stubs-public.h.in   |   2
-rw-r--r--  thirdparty/snappy/snappy-test.h              |   2
-rw-r--r--  thirdparty/snappy/snappy.cc                  | 561
-rw-r--r--  thirdparty/snappy/snappy_unittest.cc         | 127
8 files changed, 512 insertions, 358 deletions
diff --git a/thirdparty/snappy/NEWS b/thirdparty/snappy/NEWS
index 4eb7a1d1..3bc0f4b0 100644
--- a/thirdparty/snappy/NEWS
+++ b/thirdparty/snappy/NEWS
@@ -1,3 +1,11 @@
+Snappy v1.1.4, January 25th 2017:
+
+ * Fix a 1% performance regression when snappy is used in PIE executables.
+
+ * Improve compression performance by 5%.
+
+ * Improve decompression performance by 20%.
+
Snappy v1.1.3, July 6th 2015:
This is the first release to be done from GitHub, which means that
diff --git a/thirdparty/snappy/README b/thirdparty/snappy/README
index 3bc8888f..c60dab9a 100644
--- a/thirdparty/snappy/README
+++ b/thirdparty/snappy/README
@@ -29,7 +29,7 @@ and the like.
Performance
===========
-
+
Snappy is intended to be fast. On a single core of a Core i7 processor
in 64-bit mode, it compresses at about 250 MB/sec or more and decompresses at
about 500 MB/sec or more. (These numbers are for the slowest inputs in our
@@ -67,7 +67,7 @@ Usage
Note that Snappy, both the implementation and the main interface,
is written in C++. However, several third-party bindings to other languages
-are available; see the Google Code page at http://code.google.com/p/snappy/
+are available; see the home page at http://google.github.io/snappy/
for more information. Also, if you want to use Snappy from C code, you can
use the included C bindings in snappy-c.h.
@@ -102,12 +102,12 @@ tests to verify you have not broken anything. Note that if you have the
Google Test library installed, unit test behavior (especially failures) will be
significantly more user-friendly. You can find Google Test at
- http://code.google.com/p/googletest/
+ http://github.com/google/googletest
You probably also want the gflags library for handling of command-line flags;
you can find it at
- http://code.google.com/p/google-gflags/
+ http://gflags.github.io/gflags/
In addition to the unit tests, snappy contains microbenchmarks used to
tune compression and decompression performance. These are automatically run
@@ -129,7 +129,11 @@ test.)
Contact
=======
-Snappy is distributed through Google Code. For the latest version, a bug tracker,
+Snappy is distributed through GitHub. For the latest version, a bug tracker,
and other information, see
- http://code.google.com/p/snappy/
+ http://google.github.io/snappy/
+
+or the repository at
+
+ https://github.com/google/snappy
diff --git a/thirdparty/snappy/snappy-internal.h b/thirdparty/snappy/snappy-internal.h
index 0653dc65..b8355c08 100644
--- a/thirdparty/snappy/snappy-internal.h
+++ b/thirdparty/snappy/snappy-internal.h
@@ -70,11 +70,12 @@ char* CompressFragment(const char* input,
uint16* table,
const int table_size);
-// Return the largest n such that
+// Find the largest n such that
//
// s1[0,n-1] == s2[0,n-1]
// and n <= (s2_limit - s2).
//
+// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
@@ -82,11 +83,28 @@ char* CompressFragment(const char* input,
// Separate implementation for x86_64, for speed. Uses the fact that
// x86_64 is little endian.
#if defined(ARCH_K8)
-static inline int FindMatchLength(const char* s1,
- const char* s2,
- const char* s2_limit) {
+static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
+ const char* s2,
+ const char* s2_limit) {
assert(s2_limit >= s2);
- int matched = 0;
+ size_t matched = 0;
+
+ // This block isn't necessary for correctness; we could just start looping
+ // immediately. As an optimization though, it is useful. It creates some not
+ // uncommon code paths that determine, without extra effort, whether the match
+ // length is less than 8. In short, we are hoping to avoid a conditional
+ // branch, and perhaps get better code layout from the C++ compiler.
+ if (PREDICT_TRUE(s2 <= s2_limit - 8)) {
+ uint64 a1 = UNALIGNED_LOAD64(s1);
+ uint64 a2 = UNALIGNED_LOAD64(s2);
+ if (a1 != a2) {
+ return std::pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3,
+ true);
+ } else {
+ matched = 8;
+ s2 += 8;
+ }
+ }
// Find out how long the match is. We loop over the data 64 bits at a
// time until we find a 64-bit block that doesn't match; then we find
@@ -97,14 +115,11 @@ static inline int FindMatchLength(const char* s1,
s2 += 8;
matched += 8;
} else {
- // On current (mid-2008) Opteron models there is a 3% more
- // efficient code sequence to find the first non-matching byte.
- // However, what follows is ~10% better on Intel Core 2 and newer,
- // and we expect AMD's bsf instruction to improve.
uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
int matching_bits = Bits::FindLSBSetNonZero64(x);
matched += matching_bits >> 3;
- return matched;
+ assert(matched >= 8);
+ return std::pair<size_t, bool>(matched, false);
}
}
while (PREDICT_TRUE(s2 < s2_limit)) {
@@ -112,15 +127,15 @@ static inline int FindMatchLength(const char* s1,
++s2;
++matched;
} else {
- return matched;
+ return std::pair<size_t, bool>(matched, matched < 8);
}
}
- return matched;
+ return std::pair<size_t, bool>(matched, matched < 8);
}
#else
-static inline int FindMatchLength(const char* s1,
- const char* s2,
- const char* s2_limit) {
+static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
+ const char* s2,
+ const char* s2_limit) {
// Implementation based on the x86-64 version, above.
assert(s2_limit >= s2);
int matched = 0;
@@ -140,10 +155,72 @@ static inline int FindMatchLength(const char* s1,
++matched;
}
}
- return matched;
+ return std::pair<size_t, bool>(matched, matched < 8);
}
#endif
+// Lookup tables for decompression code. Give --snappy_dump_decompression_table
+// to the unit test to recompute char_table.
+
+enum {
+ LITERAL = 0,
+ COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
+ COPY_2_BYTE_OFFSET = 2,
+ COPY_4_BYTE_OFFSET = 3
+};
+static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset.
+
+// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
+static const uint32 wordmask[] = {
+ 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
+};
+
+// Data stored per entry in lookup table:
+// Range Bits-used Description
+// ------------------------------------
+// 1..64 0..7 Literal/copy length encoded in opcode byte
+// 0..7 8..10 Copy offset encoded in opcode byte / 256
+// 0..4 11..13 Extra bytes after opcode
+//
+// We use eight bits for the length even though 7 would have sufficed
+// because of efficiency reasons:
+// (1) Extracting a byte is faster than a bit-field
+// (2) It properly aligns copy offset so we do not need a <<8
+static const uint16 char_table[256] = {
+ 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
+ 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
+ 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
+ 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
+ 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
+ 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
+ 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
+ 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
+ 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
+ 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
+ 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
+ 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
+ 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
+ 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
+ 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
+ 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
+ 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
+ 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
+ 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
+ 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
+ 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
+ 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
+ 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
+ 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
+ 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
+ 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
+ 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
+ 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
+ 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
+ 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
+ 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
+ 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+};
+
} // end namespace internal
} // end namespace snappy
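
The refactored FindMatchLength above now returns both the match length and a flag for whether the match was shorter than eight bytes; callers such as EmitCopy use that flag to pick the cheap tag encoding without re-testing the length. A byte-at-a-time sketch of that contract, ignoring the 64-bit fast paths (the names here are illustrative, not snappy's):

    #include <cassert>
    #include <cstddef>
    #include <utility>

    // Simplified stand-in for snappy::internal::FindMatchLength: returns
    // (n, n < 8) where n is the length of the common prefix of s1 and s2,
    // never reading s2_limit or beyond.
    static std::pair<size_t, bool> FindMatchLengthSketch(const char* s1,
                                                         const char* s2,
                                                         const char* s2_limit) {
      assert(s2_limit >= s2);
      size_t matched = 0;
      while (s2 < s2_limit && s1[matched] == *s2) {
        ++s2;
        ++matched;
      }
      return std::make_pair(matched, matched < 8);
    }

    int main() {
      const char a[] = "abcdefgh12345678";
      const char b[] = "abcdefgh1234XXXX";
      std::pair<size_t, bool> p = FindMatchLengthSketch(a, b, b + 16);
      assert(p.first == 12);       // twelve leading bytes agree
      assert(p.second == false);   // 12 >= 8, so the "short match" flag is false
      return 0;
    }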
diff --git a/thirdparty/snappy/snappy-stubs-internal.h b/thirdparty/snappy/snappy-stubs-internal.h
index ddca1a8b..1954c63d 100644
--- a/thirdparty/snappy/snappy-stubs-internal.h
+++ b/thirdparty/snappy/snappy-stubs-internal.h
@@ -116,6 +116,15 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
// sub-architectures.
//
// This is a mess, but there's not much we can do about it.
+//
+// To further complicate matters, only LDR instructions (single reads) are
+// allowed to be unaligned, not LDRD (two reads) or LDM (many reads). Unless we
+// explicitly tell the compiler that these accesses can be unaligned, it can and
+// will combine accesses. On armcc, the way to signal this is done by accessing
+// through the type (uint32 __packed *), but GCC has no such attribute
+// (it ignores __attribute__((packed)) on individual variables). However,
+// we can tell it that a _struct_ is unaligned, which has the same effect,
+// so we do that.
#elif defined(__arm__) && \
!defined(__ARM_ARCH_4__) && \
@@ -131,11 +140,39 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
!defined(__ARM_ARCH_6ZK__) && \
!defined(__ARM_ARCH_6T2__)
-#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
-#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
+#if __GNUC__
+#define ATTRIBUTE_PACKED __attribute__((__packed__))
+#else
+#define ATTRIBUTE_PACKED
+#endif
-#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
-#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
+namespace base {
+namespace internal {
+
+struct Unaligned16Struct {
+ uint16 value;
+ uint8 dummy; // To make the size non-power-of-two.
+} ATTRIBUTE_PACKED;
+
+struct Unaligned32Struct {
+ uint32 value;
+ uint8 dummy; // To make the size non-power-of-two.
+} ATTRIBUTE_PACKED;
+
+} // namespace internal
+} // namespace base
+
+#define UNALIGNED_LOAD16(_p) \
+ ((reinterpret_cast<const ::snappy::base::internal::Unaligned16Struct *>(_p))->value)
+#define UNALIGNED_LOAD32(_p) \
+ ((reinterpret_cast<const ::snappy::base::internal::Unaligned32Struct *>(_p))->value)
+
+#define UNALIGNED_STORE16(_p, _val) \
+ ((reinterpret_cast< ::snappy::base::internal::Unaligned16Struct *>(_p))->value = \
+ (_val))
+#define UNALIGNED_STORE32(_p, _val) \
+ ((reinterpret_cast< ::snappy::base::internal::Unaligned32Struct *>(_p))->value = \
+ (_val))
// TODO(user): NEON supports unaligned 64-bit loads and stores.
// See if that would be more efficient on platforms supporting it,
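
The hunk above works around GCC ignoring __attribute__((packed)) on individual variables: wrapping the value in a packed struct (with a dummy byte so the size is not a power of two) makes the compiler emit loads and stores that are safe at unaligned addresses on ARM. A minimal standalone illustration of the same trick, assuming a GCC-compatible compiler (Unaligned32 and LoadUnaligned32 are made-up names):

    #include <cstdint>
    #include <cstdio>

    #if defined(__GNUC__)
    #define ATTRIBUTE_PACKED __attribute__((__packed__))
    #else
    #define ATTRIBUTE_PACKED
    #endif

    // The dummy byte makes the struct size a non-power-of-two, which keeps
    // the compiler from assuming natural alignment for the whole object.
    struct Unaligned32 {
      uint32_t value;
      uint8_t dummy;
    } ATTRIBUTE_PACKED;

    static uint32_t LoadUnaligned32(const void* p) {
      return reinterpret_cast<const Unaligned32*>(p)->value;
    }

    int main() {
      unsigned char buf[8] = {0, 0x78, 0x56, 0x34, 0x12, 0, 0, 0};
      // buf + 1 is deliberately misaligned for a 4-byte read.
      uint32_t v = LoadUnaligned32(buf + 1);
      std::printf("0x%08x\n", v);  // 0x12345678 on a little-endian machine
      return 0;
    }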
diff --git a/thirdparty/snappy/snappy-stubs-public.h.in b/thirdparty/snappy/snappy-stubs-public.h.in
index ebe676cc..96989ac3 100644
--- a/thirdparty/snappy/snappy-stubs-public.h.in
+++ b/thirdparty/snappy/snappy-stubs-public.h.in
@@ -80,9 +80,11 @@ typedef unsigned long long uint64;
typedef std::string string;
+#ifndef DISALLOW_COPY_AND_ASSIGN
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&); \
void operator=(const TypeName&)
+#endif
#if !@ac_cv_have_sys_uio_h@
// Windows does not have an iovec type, yet the concept is universally useful.
diff --git a/thirdparty/snappy/snappy-test.h b/thirdparty/snappy/snappy-test.h
index dbc55b98..5fb09c78 100644
--- a/thirdparty/snappy/snappy-test.h
+++ b/thirdparty/snappy/snappy-test.h
@@ -196,6 +196,7 @@ void Test_Snappy_RandomData();
void Test_Snappy_FourByteOffset();
void Test_SnappyCorruption_TruncatedVarint();
void Test_SnappyCorruption_UnterminatedVarint();
+void Test_SnappyCorruption_OverflowingVarint();
void Test_Snappy_ReadPastEndOfBuffer();
void Test_Snappy_FindMatchLength();
void Test_Snappy_FindMatchLengthRandom();
@@ -500,6 +501,7 @@ static inline int RUN_ALL_TESTS() {
snappy::Test_Snappy_FourByteOffset();
snappy::Test_SnappyCorruption_TruncatedVarint();
snappy::Test_SnappyCorruption_UnterminatedVarint();
+ snappy::Test_SnappyCorruption_OverflowingVarint();
snappy::Test_Snappy_ReadPastEndOfBuffer();
snappy::Test_Snappy_FindMatchLength();
snappy::Test_Snappy_FindMatchLengthRandom();
diff --git a/thirdparty/snappy/snappy.cc b/thirdparty/snappy/snappy.cc
index b6ca7ece..4bcea0b4 100644
--- a/thirdparty/snappy/snappy.cc
+++ b/thirdparty/snappy/snappy.cc
@@ -30,6 +30,9 @@
#include "snappy-internal.h"
#include "snappy-sinksource.h"
+#if defined(__x86_64__) || defined(_M_X64)
+#include <emmintrin.h>
+#endif
#include <stdio.h>
#include <algorithm>
@@ -39,6 +42,13 @@
namespace snappy {
+using internal::COPY_1_BYTE_OFFSET;
+using internal::COPY_2_BYTE_OFFSET;
+using internal::LITERAL;
+using internal::char_table;
+using internal::kMaximumTagLength;
+using internal::wordmask;
+
// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
// compression for compressible input, and more speed for incompressible
@@ -76,79 +86,125 @@ size_t MaxCompressedLength(size_t source_len) {
return 32 + source_len + source_len/6;
}
-enum {
- LITERAL = 0,
- COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
- COPY_2_BYTE_OFFSET = 2,
- COPY_4_BYTE_OFFSET = 3
-};
-static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset.
-
-// Copy "len" bytes from "src" to "op", one byte at a time. Used for
-// handling COPY operations where the input and output regions may
-// overlap. For example, suppose:
-// src == "ab"
-// op == src + 2
-// len == 20
-// After IncrementalCopy(src, op, len), the result will have
-// eleven copies of "ab"
-// ababababababababababab
-// Note that this does not match the semantics of either memcpy()
-// or memmove().
-static inline void IncrementalCopy(const char* src, char* op, ssize_t len) {
- assert(len > 0);
- do {
- *op++ = *src++;
- } while (--len > 0);
+namespace {
+
+void UnalignedCopy64(const void* src, void* dst) {
+ memcpy(dst, src, 8);
}
-// Equivalent to IncrementalCopy except that it can write up to ten extra
-// bytes after the end of the copy, and that it is faster.
-//
-// The main part of this loop is a simple copy of eight bytes at a time until
-// we've copied (at least) the requested amount of bytes. However, if op and
-// src are less than eight bytes apart (indicating a repeating pattern of
-// length < 8), we first need to expand the pattern in order to get the correct
-// results. For instance, if the buffer looks like this, with the eight-byte
-// <src> and <op> patterns marked as intervals:
-//
-// abxxxxxxxxxxxx
-// [------] src
-// [------] op
-//
-// a single eight-byte copy from <src> to <op> will repeat the pattern once,
-// after which we can move <op> two bytes without moving <src>:
-//
-// ababxxxxxxxxxx
-// [------] src
-// [------] op
-//
-// and repeat the exercise until the two no longer overlap.
-//
-// This allows us to do very well in the special case of one single byte
-// repeated many times, without taking a big hit for more general cases.
-//
-// The worst case of extra writing past the end of the match occurs when
-// op - src == 1 and len == 1; the last copy will read from byte positions
-// [0..7] and write to [4..11], whereas it was only supposed to write to
-// position 1. Thus, ten excess bytes.
+void UnalignedCopy128(const void* src, void* dst) {
+ // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
+ // SSE2 moves for memcpy(dst, src, 16).
+#ifdef __SSE2__
+ __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
+ _mm_storeu_si128(static_cast<__m128i*>(dst), x);
+#else
+ memcpy(dst, src, 16);
+#endif
+}
-namespace {
+// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
+// for handling COPY operations where the input and output regions may overlap.
+// For example, suppose:
+// src == "ab"
+// op == src + 2
+// op_limit == op + 20
+// After IncrementalCopySlow(src, op, op_limit), the result will have eleven
+// copies of "ab"
+// ababababababababababab
+// Note that this does not match the semantics of either memcpy() or memmove().
+inline char* IncrementalCopySlow(const char* src, char* op,
+ char* const op_limit) {
+ while (op < op_limit) {
+ *op++ = *src++;
+ }
+ return op_limit;
+}
-const int kMaxIncrementCopyOverflow = 10;
+// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
+// IncrementalCopySlow. buf_limit is the address past the end of the writable
+// region of the buffer.
+inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
+ char* const buf_limit) {
+ // Terminology:
+ //
+ // slop = buf_limit - op
+ // pat = op - src
+ // len = limit - op
+ assert(src < op);
+ assert(op_limit <= buf_limit);
+ // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
+ // to optimize this function but we have to also handle these cases in case
+ // the input does not satisfy these conditions.
+
+ size_t pattern_size = op - src;
+ // The cases are split into different branches to allow the branch predictor,
+ // FDO, and static prediction hints to work better. For each input we list the
+ // ratio of invocations that match each condition.
+ //
+ // input slop < 16 pat < 8 len > 16
+ // ------------------------------------------
+ // html|html4|cp 0% 1.01% 27.73%
+ // urls 0% 0.88% 14.79%
+ // jpg 0% 64.29% 7.14%
+ // pdf 0% 2.56% 58.06%
+ // txt[1-4] 0% 0.23% 0.97%
+ // pb 0% 0.96% 13.88%
+ // bin 0.01% 22.27% 41.17%
+ //
+ // It is very rare that we don't have enough slop for doing block copies. It
+ // is also rare that we need to expand a pattern. Small patterns are common
+ // for incompressible formats and for those we are plenty fast already.
+ // Lengths are normally not greater than 16 but they vary depending on the
+ // input. In general if we always predict len <= 16 it would be an ok
+ // prediction.
+ //
+ // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
+ // copying 2x 8 bytes at a time.
+
+ // Handle the uncommon case where pattern is less than 8 bytes.
+ if (PREDICT_FALSE(pattern_size < 8)) {
+ // Expand pattern to at least 8 bytes. The worse case scenario in terms of
+ // buffer usage is when the pattern is size 3. ^ is the original position
+ // of op. x are irrelevant bytes copied by the last UnalignedCopy64.
+ //
+ // abc
+ // abcabcxxxxx
+ // abcabcabcabcxxxxx
+ // ^
+ // The last x is 14 bytes after ^.
+ if (PREDICT_TRUE(op <= buf_limit - 14)) {
+ while (pattern_size < 8) {
+ UnalignedCopy64(src, op);
+ op += pattern_size;
+ pattern_size *= 2;
+ }
+ if (PREDICT_TRUE(op >= op_limit)) return op_limit;
+ } else {
+ return IncrementalCopySlow(src, op, op_limit);
+ }
+ }
+ assert(pattern_size >= 8);
-inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) {
- while (PREDICT_FALSE(op - src < 8)) {
+ // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
+ // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
+ // because expanding the pattern to at least 8 bytes guarantees that
+ // op - src >= 8.
+ while (op <= buf_limit - 16) {
UnalignedCopy64(src, op);
- len -= op - src;
- op += op - src;
- }
- while (len > 0) {
+ UnalignedCopy64(src + 8, op + 8);
+ src += 16;
+ op += 16;
+ if (PREDICT_TRUE(op >= op_limit)) return op_limit;
+ }
+ // We only take this branch if we didn't have enough slop and we can do a
+ // single 8 byte copy.
+ if (PREDICT_FALSE(op <= buf_limit - 8)) {
UnalignedCopy64(src, op);
src += 8;
op += 8;
- len -= 8;
}
+ return IncrementalCopySlow(src, op, op_limit);
}
} // namespace
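
The pattern-expansion loop above is the core of the rewritten IncrementalCopy: when op - src is less than 8, a few overlapping copies double the pattern ("abc" becomes "abcabc", then "abcabcabcabc") until plain 8/16-byte block copies are safe. A simplified byte-level sketch of that doubling, without the slop and limit handling of the real function (ExpandPattern is an illustrative name):

    #include <cassert>
    #include <cstddef>
    #include <cstring>

    // Expand the pattern [src, op) until it is at least 'min_size' bytes long
    // by copying the existing bytes forward, doubling the pattern each round.
    // Returns the new write position.  This mirrors the doubling loop in
    // IncrementalCopy, but uses non-overlapping memcpy calls for clarity.
    static char* ExpandPattern(const char* src, char* op, size_t min_size) {
      size_t pattern_size = static_cast<size_t>(op - src);
      while (pattern_size < min_size) {
        std::memcpy(op, src, pattern_size);  // source and destination are disjoint
        op += pattern_size;
        pattern_size *= 2;
      }
      return op;
    }

    int main() {
      char buf[64] = "abc";              // a 3-byte pattern at the start
      char* op = ExpandPattern(buf, buf + 3, 8);
      assert(op - buf == 12);            // 3 -> 6 -> 12 bytes written
      assert(std::memcmp(buf, "abcabcabcabc", 12) == 0);
      return 0;
    }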
@@ -157,26 +213,29 @@ static inline char* EmitLiteral(char* op,
const char* literal,
int len,
bool allow_fast_path) {
- int n = len - 1; // Zero-length literals are disallowed
- if (n < 60) {
+ // The vast majority of copies are below 16 bytes, for which a
+ // call to memcpy is overkill. This fast path can sometimes
+ // copy up to 15 bytes too much, but that is okay in the
+ // main loop, since we have a bit to go on for both sides:
+ //
+ // - The input will always have kInputMarginBytes = 15 extra
+ // available bytes, as long as we're in the main loop, and
+ // if not, allow_fast_path = false.
+ // - The output will always have 32 spare bytes (see
+ // MaxCompressedLength).
+ assert(len > 0); // Zero-length literals are disallowed
+ int n = len - 1;
+ if (allow_fast_path && len <= 16) {
// Fits in tag byte
*op++ = LITERAL | (n << 2);
- // The vast majority of copies are below 16 bytes, for which a
- // call to memcpy is overkill. This fast path can sometimes
- // copy up to 15 bytes too much, but that is okay in the
- // main loop, since we have a bit to go on for both sides:
- //
- // - The input will always have kInputMarginBytes = 15 extra
- // available bytes, as long as we're in the main loop, and
- // if not, allow_fast_path = false.
- // - The output will always have 32 spare bytes (see
- // MaxCompressedLength).
- if (allow_fast_path && len <= 16) {
- UnalignedCopy64(literal, op);
- UnalignedCopy64(literal + 8, op + 8);
- return op + len;
- }
+ UnalignedCopy128(literal, op);
+ return op + len;
+ }
+
+ if (n < 60) {
+ // Fits in tag byte
+ *op++ = LITERAL | (n << 2);
} else {
// Encode in upcoming bytes
char* base = op;
@@ -195,42 +254,54 @@ static inline char* EmitLiteral(char* op,
return op + len;
}
-static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
+static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
+ bool len_less_than_12) {
assert(len <= 64);
assert(len >= 4);
assert(offset < 65536);
+ assert(len_less_than_12 == (len < 12));
- if ((len < 12) && (offset < 2048)) {
- size_t len_minus_4 = len - 4;
- assert(len_minus_4 < 8); // Must fit in 3 bits
- *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
+ if (len_less_than_12 && PREDICT_TRUE(offset < 2048)) {
+ // offset fits in 11 bits. The 3 highest go in the top of the first byte,
+ // and the rest go in the second byte.
+ *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
*op++ = offset & 0xff;
} else {
- *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
- LittleEndian::Store16(op, offset);
- op += 2;
+ // Write 4 bytes, though we only care about 3 of them. The output buffer
+ // is required to have some slack, so the extra byte won't overrun it.
+ uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
+ LittleEndian::Store32(op, u);
+ op += 3;
}
return op;
}
-static inline char* EmitCopy(char* op, size_t offset, int len) {
- // Emit 64 byte copies but make sure to keep at least four bytes reserved
- while (PREDICT_FALSE(len >= 68)) {
- op = EmitCopyLessThan64(op, offset, 64);
- len -= 64;
- }
+static inline char* EmitCopy(char* op, size_t offset, size_t len,
+ bool len_less_than_12) {
+ assert(len_less_than_12 == (len < 12));
+ if (len_less_than_12) {
+ return EmitCopyAtMost64(op, offset, len, true);
+ } else {
+ // A special case for len <= 64 might help, but so far measurements suggest
+ // it's in the noise.
- // Emit an extra 60 byte copy if have too much data to fit in one copy
- if (len > 64) {
- op = EmitCopyLessThan64(op, offset, 60);
- len -= 60;
- }
+ // Emit 64 byte copies but make sure to keep at least four bytes reserved.
+ while (PREDICT_FALSE(len >= 68)) {
+ op = EmitCopyAtMost64(op, offset, 64, false);
+ len -= 64;
+ }
- // Emit remainder
- op = EmitCopyLessThan64(op, offset, len);
- return op;
-}
+ // One or two copies will now finish the job.
+ if (len > 64) {
+ op = EmitCopyAtMost64(op, offset, 60, false);
+ len -= 60;
+ }
+ // Emit remainder.
+ op = EmitCopyAtMost64(op, offset, len, len < 12);
+ return op;
+ }
+}
bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
uint32 v = 0;
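
EmitCopyAtMost64 above emits either a two-byte tag (lengths 4-11 with offsets under 2048) or a three-byte tag (lengths up to 64 with a 16-bit offset); the new version writes the three-byte form with one 32-bit store and advances by three. A sketch of just the bit packing, under the same preconditions the caller guarantees (EncodeCopyTag is a made-up name; the tag constants match the enum in snappy-internal.h):

    #include <cassert>
    #include <cstdint>
    #include <cstddef>

    enum { COPY_1_BYTE_OFFSET = 1, COPY_2_BYTE_OFFSET = 2 };

    // Encode a copy of 'len' bytes at distance 'offset'.  Mirrors the two
    // cases in EmitCopyAtMost64; returns the number of tag bytes (2 or 3).
    // Assumes 4 <= len <= 64 and offset < 65536, as the caller guarantees.
    static int EncodeCopyTag(uint8_t* op, size_t offset, size_t len) {
      assert(len >= 4 && len <= 64 && offset < 65536);
      if (len < 12 && offset < 2048) {
        // 2-byte tag: 3 bits of length-4, top 3 bits of the 11-bit offset.
        op[0] = static_cast<uint8_t>(COPY_1_BYTE_OFFSET + ((len - 4) << 2) +
                                     ((offset >> 3) & 0xe0));
        op[1] = static_cast<uint8_t>(offset & 0xff);
        return 2;
      }
      // 3-byte tag: 6 bits of length-1, then a little-endian 16-bit offset.
      op[0] = static_cast<uint8_t>(COPY_2_BYTE_OFFSET + ((len - 1) << 2));
      op[1] = static_cast<uint8_t>(offset & 0xff);
      op[2] = static_cast<uint8_t>(offset >> 8);
      return 3;
    }

    int main() {
      uint8_t buf[4];
      assert(EncodeCopyTag(buf, 100, 7) == 2);    // short copy, small offset
      assert((buf[0] & 0x3) == COPY_1_BYTE_OFFSET);
      assert(EncodeCopyTag(buf, 5000, 60) == 3);  // offset needs 16 bits
      assert((buf[0] & 0x3) == COPY_2_BYTE_OFFSET);
      return 0;
    }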
@@ -364,9 +435,9 @@ char* CompressFragment(const char* input,
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
// found, start looking only at every other byte. If 32 more bytes are
- // scanned, look at every third byte, etc.. When a match is found,
- // immediately go back to looking at every byte. This is a small loss
- // (~5% performance, ~0.1% density) for compressible data due to more
+ // scanned (or skipped), look at every third byte, etc.. When a match is
+ // found, immediately go back to looking at every byte. This is a small
+ // loss (~5% performance, ~0.1% density) for compressible data due to more
// bookkeeping, but for non-compressible data (such as JPEG) it's a huge
// win since the compressor quickly "realizes" the data is incompressible
// and doesn't bother looking for matches everywhere.
@@ -382,7 +453,8 @@ char* CompressFragment(const char* input,
ip = next_ip;
uint32 hash = next_hash;
assert(hash == Hash(ip, shift));
- uint32 bytes_between_hash_lookups = skip++ >> 5;
+ uint32 bytes_between_hash_lookups = skip >> 5;
+ skip += bytes_between_hash_lookups;
next_ip = ip + bytes_between_hash_lookups;
if (PREDICT_FALSE(next_ip > ip_limit)) {
goto emit_remainder;
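
The two-line change above alters the skip heuristic: instead of bumping `skip` by one per probe, it now grows by the current step, so the distance between hash lookups widens faster on data that refuses to match. A tiny standalone sketch of the resulting probe schedule, assuming the same starting value of 32 that CompressFragment uses:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Probe schedule after the change: skip starts at 32 and grows by the
      // current step, so the gap between hash probes widens the longer no
      // match is found.
      uint32_t skip = 32;
      uint64_t pos = 0;
      int probes = 0;
      while (pos < 65536) {  // how many hash probes to cross 64 KB of junk?
        uint32_t bytes_between_hash_lookups = skip >> 5;
        skip += bytes_between_hash_lookups;
        pos += bytes_between_hash_lookups;
        ++probes;
      }
      std::printf("%d probes to scan 64 KB, final step %u bytes\n",
                  probes, skip >> 5);
      return 0;
    }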
@@ -417,19 +489,21 @@ char* CompressFragment(const char* input,
// We have a 4-byte match at ip, and no need to emit any
// "literal bytes" prior to ip.
const char* base = ip;
- int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
+ std::pair<size_t, bool> p =
+ FindMatchLength(candidate + 4, ip + 4, ip_end);
+ size_t matched = 4 + p.first;
ip += matched;
size_t offset = base - candidate;
assert(0 == memcmp(base, candidate, matched));
- op = EmitCopy(op, offset, matched);
- // We could immediately start working at ip now, but to improve
- // compression we first update table[Hash(ip - 1, ...)].
- const char* insert_tail = ip - 1;
+ op = EmitCopy(op, offset, matched, p.second);
next_emit = ip;
if (PREDICT_FALSE(ip >= ip_limit)) {
goto emit_remainder;
}
- input_bytes = GetEightBytesAt(insert_tail);
+ // We are now looking for a 4-byte match again. We read
+ // table[Hash(ip, shift)] for that. To improve compression,
+ // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
+ input_bytes = GetEightBytesAt(ip - 1);
uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
table[prev_hash] = ip - base_ip - 1;
uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
@@ -493,162 +567,6 @@ char* CompressFragment(const char* input,
// bool TryFastAppend(const char* ip, size_t available, size_t length);
// };
-// -----------------------------------------------------------------------
-// Lookup table for decompression code. Generated by ComputeTable() below.
-// -----------------------------------------------------------------------
-
-// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
-static const uint32 wordmask[] = {
- 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
-};
-
-// Data stored per entry in lookup table:
-// Range Bits-used Description
-// ------------------------------------
-// 1..64 0..7 Literal/copy length encoded in opcode byte
-// 0..7 8..10 Copy offset encoded in opcode byte / 256
-// 0..4 11..13 Extra bytes after opcode
-//
-// We use eight bits for the length even though 7 would have sufficed
-// because of efficiency reasons:
-// (1) Extracting a byte is faster than a bit-field
-// (2) It properly aligns copy offset so we do not need a <<8
-static const uint16 char_table[256] = {
- 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
- 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
- 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
- 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
- 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
- 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
- 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
- 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
- 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
- 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
- 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
- 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
- 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
- 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
- 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
- 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
- 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
- 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
- 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
- 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
- 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
- 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
- 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
- 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
- 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
- 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
- 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
- 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
- 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
- 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
- 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
- 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
-};
-
-// In debug mode, allow optional computation of the table at startup.
-// Also, check that the decompression table is correct.
-#ifndef NDEBUG
-DEFINE_bool(snappy_dump_decompression_table, false,
- "If true, we print the decompression table at startup.");
-
-static uint16 MakeEntry(unsigned int extra,
- unsigned int len,
- unsigned int copy_offset) {
- // Check that all of the fields fit within the allocated space
- assert(extra == (extra & 0x7)); // At most 3 bits
- assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits
- assert(len == (len & 0x7f)); // At most 7 bits
- return len | (copy_offset << 8) | (extra << 11);
-}
-
-static void ComputeTable() {
- uint16 dst[256];
-
- // Place invalid entries in all places to detect missing initialization
- int assigned = 0;
- for (int i = 0; i < 256; i++) {
- dst[i] = 0xffff;
- }
-
- // Small LITERAL entries. We store (len-1) in the top 6 bits.
- for (unsigned int len = 1; len <= 60; len++) {
- dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
- assigned++;
- }
-
- // Large LITERAL entries. We use 60..63 in the high 6 bits to
- // encode the number of bytes of length info that follow the opcode.
- for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
- // We set the length field in the lookup table to 1 because extra
- // bytes encode len-1.
- dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
- assigned++;
- }
-
- // COPY_1_BYTE_OFFSET.
- //
- // The tag byte in the compressed data stores len-4 in 3 bits, and
- // offset/256 in 5 bits. offset%256 is stored in the next byte.
- //
- // This format is used for length in range [4..11] and offset in
- // range [0..2047]
- for (unsigned int len = 4; len < 12; len++) {
- for (unsigned int offset = 0; offset < 2048; offset += 256) {
- dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
- MakeEntry(1, len, offset>>8);
- assigned++;
- }
- }
-
- // COPY_2_BYTE_OFFSET.
- // Tag contains len-1 in top 6 bits, and offset in next two bytes.
- for (unsigned int len = 1; len <= 64; len++) {
- dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
- assigned++;
- }
-
- // COPY_4_BYTE_OFFSET.
- // Tag contents len-1 in top 6 bits, and offset in next four bytes.
- for (unsigned int len = 1; len <= 64; len++) {
- dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
- assigned++;
- }
-
- // Check that each entry was initialized exactly once.
- if (assigned != 256) {
- fprintf(stderr, "ComputeTable: assigned only %d of 256\n", assigned);
- abort();
- }
- for (int i = 0; i < 256; i++) {
- if (dst[i] == 0xffff) {
- fprintf(stderr, "ComputeTable: did not assign byte %d\n", i);
- abort();
- }
- }
-
- if (FLAGS_snappy_dump_decompression_table) {
- printf("static const uint16 char_table[256] = {\n ");
- for (int i = 0; i < 256; i++) {
- printf("0x%04x%s",
- dst[i],
- ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n " : ", ")));
- }
- printf("};\n");
- }
-
- // Check that computed table matched recorded table
- for (int i = 0; i < 256; i++) {
- if (dst[i] != char_table[i]) {
- fprintf(stderr, "ComputeTable: byte %d: computed (%x), expect (%x)\n",
- i, static_cast<int>(dst[i]), static_cast<int>(char_table[i]));
- abort();
- }
- }
-}
-#endif /* !NDEBUG */
// Helper class for decompression
class SnappyDecompressor {
@@ -701,7 +619,9 @@ class SnappyDecompressor {
if (n == 0) return false;
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
reader_->Skip(1);
- *result |= static_cast<uint32>(c & 0x7f) << shift;
+ uint32 val = c & 0x7f;
+ if (((val << shift) >> shift) != val) return false;
+ *result |= val << shift;
if (c < 128) {
break;
}
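
The added `((val << shift) >> shift) != val` check rejects varints whose high-order groups would shift bits past bit 31, which is exactly what the new OverflowingVarint corruption test feeds in. A self-contained sketch of the same guard over a plain byte buffer (DecodeVarint32 here is illustrative, not snappy's reader-based API):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Decode a little-endian base-128 varint into *result.  Returns false on
    // truncation or if the encoded value does not fit in 32 bits.
    static bool DecodeVarint32(const uint8_t* p, size_t n, uint32_t* result) {
      uint32_t value = 0;
      for (uint32_t shift = 0; shift <= 28 && n > 0; shift += 7, ++p, --n) {
        uint32_t val = *p & 0x7f;
        // Same overflow guard as the patched decoder: if shifting the 7-bit
        // group up and back down changes it, bits were lost.
        if (((val << shift) >> shift) != val) return false;
        value |= val << shift;
        if (*p < 128) {          // high bit clear: last byte of the varint
          *result = value;
          return true;
        }
      }
      return false;              // ran out of input or too many bytes
    }

    int main() {
      uint32_t v = 0;
      const uint8_t ok[] = {0xfb, 0xff, 0xff, 0xff, 0x0f};  // 0xfffffffb, fits
      assert(DecodeVarint32(ok, sizeof(ok), &v) && v == 0xfffffffbu);

      // Same bytes as the new OverflowingVarint test: needs more than 32 bits.
      const uint8_t bad[] = {0xfb, 0xff, 0xff, 0xff, 0x7f};
      assert(!DecodeVarint32(bad, sizeof(bad), &v));
      return 0;
    }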
@@ -715,6 +635,10 @@ class SnappyDecompressor {
template <class Writer>
void DecompressAllTags(Writer* writer) {
const char* ip = ip_;
+ // For position-independent executables, accessing global arrays can be
+ // slow. Move wordmask array onto the stack to mitigate this.
+ uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
+ memcpy(wordmask, internal::wordmask, sizeof(wordmask));
// We could have put this refill fragment only at the beginning of the loop.
// However, duplicating it at the end of each branch gives the compiler more
@@ -731,7 +655,19 @@ class SnappyDecompressor {
for ( ;; ) {
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
- if ((c & 0x3) == LITERAL) {
+ // Ratio of iterations that have LITERAL vs non-LITERAL for different
+ // inputs.
+ //
+ // input LITERAL NON_LITERAL
+ // -----------------------------------
+ // html|html4|cp 23% 77%
+ // urls 36% 64%
+ // jpg 47% 53%
+ // pdf 19% 81%
+ // txt[1-4] 25% 75%
+ // pb 24% 76%
+ // bin 24% 76%
+ if (PREDICT_FALSE((c & 0x3) == LITERAL)) {
size_t literal_length = (c >> 2) + 1u;
if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
assert(literal_length < 61);
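
PREDICT_TRUE and PREDICT_FALSE, used heavily in this patch (for example on the LITERAL branch above, guided by the ratio table in the comment), come from snappy-stubs-internal.h and are thin wrappers over __builtin_expect where the compiler supports it. A simplified sketch of that pattern, assuming a GCC-style compiler; the real header hides this behind a configure check:

    #include <cstdio>

    // Simplified version of snappy's branch-prediction hint macros.
    #if defined(__GNUC__)
    #define PREDICT_TRUE(x)  (__builtin_expect(!!(x), 1))
    #define PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))
    #else
    #define PREDICT_TRUE(x)  (x)
    #define PREDICT_FALSE(x) (x)
    #endif

    int main(int argc, char**) {
      // The hint only affects code layout; the branch behaves the same either way.
      if (PREDICT_FALSE(argc > 100)) {
        std::printf("unlikely path\n");
      } else {
        std::printf("likely path\n");
      }
      return 0;
    }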
@@ -767,15 +703,15 @@ class SnappyDecompressor {
ip += literal_length;
MAYBE_REFILL();
} else {
- const uint32 entry = char_table[c];
- const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
- const uint32 length = entry & 0xff;
+ const size_t entry = char_table[c];
+ const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ const size_t length = entry & 0xff;
ip += entry >> 11;
// copy_offset/256 is encoded in bits 8..10. By just fetching
// those bits, we get copy_offset (since the bit-field starts at
// bit 8).
- const uint32 copy_offset = entry & 0x700;
+ const size_t copy_offset = entry & 0x700;
if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
return;
}
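
The decompressor above unpacks each char_table entry with one shift and two masks, following the layout documented where the table is defined in snappy-internal.h: length in bits 0-7, copy offset divided by 256 in bits 8-10 (left in place), extra tag bytes in bits 11-13. A small sketch that decodes two entries taken from that table the same way (TagInfo is an illustrative name):

    #include <cassert>
    #include <cstdint>

    struct TagInfo {
      uint32_t length;       // literal/copy length encoded in the opcode byte
      uint32_t copy_offset;  // copy offset encoded in the opcode, already * 256
      uint32_t extra_bytes;  // bytes that follow the opcode
    };

    // Unpack one 16-bit char_table entry exactly as DecompressAllTags does.
    static TagInfo DecodeEntry(uint16_t entry) {
      TagInfo t;
      t.length      = entry & 0xff;
      t.copy_offset = entry & 0x700;  // bits 8..10, left shifted in place
      t.extra_bytes = entry >> 11;
      return t;
    }

    int main() {
      // 0x0001: tag 0x00 is a 1-byte literal with no extra bytes.
      TagInfo lit = DecodeEntry(0x0001);
      assert(lit.length == 1 && lit.extra_bytes == 0 && lit.copy_offset == 0);

      // 0x0804: tag 0x01 is a COPY_1_BYTE_OFFSET of length 4 with one extra
      // offset byte; bits 8..10 are zero, so the offset high part is 0.
      TagInfo copy = DecodeEntry(0x0804);
      assert(copy.length == 4 && copy.extra_bytes == 1 && copy.copy_offset == 0);
      return 0;
    }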
@@ -795,10 +731,8 @@ bool SnappyDecompressor::RefillTag() {
size_t n;
ip = reader_->Peek(&n);
peeked_ = n;
- if (n == 0) {
- eof_ = true;
- return false;
- }
+ eof_ = (n == 0);
+ if (eof_) return false;
ip_limit_ = ip + n;
}
@@ -966,7 +900,7 @@ class SnappyIOVecWriter {
const size_t output_iov_count_;
// We are currently writing into output_iov_[curr_iov_index_].
- int curr_iov_index_;
+ size_t curr_iov_index_;
// Bytes written to output_iov_[curr_iov_index_] so far.
size_t curr_iov_written_;
@@ -977,7 +911,7 @@ class SnappyIOVecWriter {
// Maximum number of bytes that will be decompressed into output_iov_.
size_t output_limit_;
- inline char* GetIOVecPointer(int index, size_t offset) {
+ inline char* GetIOVecPointer(size_t index, size_t offset) {
return reinterpret_cast<char*>(output_iov_[index].iov_base) +
offset;
}
@@ -1038,8 +972,7 @@ class SnappyIOVecWriter {
output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
- UnalignedCopy64(ip, ptr);
- UnalignedCopy64(ip + 8, ptr + 8);
+ UnalignedCopy128(ip, ptr);
curr_iov_written_ += len;
total_written_ += len;
return true;
@@ -1058,7 +991,7 @@ class SnappyIOVecWriter {
}
// Locate the iovec from which we need to start the copy.
- int from_iov_index = curr_iov_index_;
+ size_t from_iov_index = curr_iov_index_;
size_t from_iov_offset = curr_iov_written_;
while (offset > 0) {
if (from_iov_offset >= offset) {
@@ -1067,8 +1000,8 @@ class SnappyIOVecWriter {
}
offset -= from_iov_offset;
+ assert(from_iov_index > 0);
--from_iov_index;
- assert(from_iov_index >= 0);
from_iov_offset = output_iov_[from_iov_index].iov_len;
}
@@ -1103,9 +1036,10 @@ class SnappyIOVecWriter {
if (to_copy > len) {
to_copy = len;
}
- IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- to_copy);
+ IncrementalCopySlow(
+ GetIOVecPointer(from_iov_index, from_iov_offset),
+ GetIOVecPointer(curr_iov_index_, curr_iov_written_),
+ GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
curr_iov_written_ += to_copy;
from_iov_offset += to_copy;
total_written_ += to_copy;
@@ -1175,8 +1109,7 @@ class SnappyArrayWriter {
const size_t space_left = op_limit_ - op;
if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
- UnalignedCopy64(ip, op);
- UnalignedCopy64(ip + 8, op + 8);
+ UnalignedCopy128(ip, op);
op_ = op + len;
return true;
} else {
@@ -1185,8 +1118,7 @@ class SnappyArrayWriter {
}
inline bool AppendFromSelf(size_t offset, size_t len) {
- char* op = op_;
- const size_t space_left = op_limit_ - op;
+ char* const op_end = op_ + len;
// Check if we try to append from before the start of the buffer.
// Normally this would just be a check for "produced < offset",
@@ -1195,30 +1127,13 @@ class SnappyArrayWriter {
// to a very big number. This is convenient, as offset==0 is another
// invalid case that we also want to catch, so that we do not go
// into an infinite loop.
- assert(op >= base_);
- size_t produced = op - base_;
- if (produced <= offset - 1u) {
- return false;
- }
- if (len <= 16 && offset >= 8 && space_left >= 16) {
- // Fast path, used for the majority (70-80%) of dynamic invocations.
- UnalignedCopy64(op - offset, op);
- UnalignedCopy64(op - offset + 8, op + 8);
- } else {
- if (space_left >= len + kMaxIncrementCopyOverflow) {
- IncrementalCopyFastPath(op - offset, op, len);
- } else {
- if (space_left < len) {
- return false;
- }
- IncrementalCopy(op - offset, op, len);
- }
- }
+ if (Produced() <= offset - 1u || op_end > op_limit_) return false;
+ op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);
- op_ = op + len;
return true;
}
inline size_t Produced() const {
+ assert(op_ >= base_);
return op_ - base_;
}
inline void Flush() {}
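
The `Produced() <= offset - 1u` comparison kept in the rewritten AppendFromSelf relies on unsigned wrap-around: for offset == 0 the right-hand side becomes the largest size_t, so a single branch rejects both a zero offset and an offset reaching before the start of the output, as the surrounding comment explains. A standalone illustration of that trick (CopyOffsetIsValid is a made-up name):

    #include <cassert>
    #include <cstddef>

    // Returns true when a back-reference of 'offset' bytes is valid given
    // that 'produced' bytes have already been written.  One unsigned
    // comparison covers both offset == 0 and offset > produced, because
    // 0 - 1u wraps to SIZE_MAX.
    static bool CopyOffsetIsValid(size_t produced, size_t offset) {
      return produced > offset - 1u;
    }

    int main() {
      assert(!CopyOffsetIsValid(10, 0));   // offset 0 wraps to SIZE_MAX: rejected
      assert(!CopyOffsetIsValid(10, 11));  // reaches before the buffer start
      assert(CopyOffsetIsValid(10, 10));   // exactly the whole output so far: ok
      assert(CopyOffsetIsValid(10, 1));    // normal small back-reference
      return 0;
    }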
@@ -1304,7 +1219,7 @@ void RawCompress(const char* input,
size_t Compress(const char* input, size_t input_length, string* compressed) {
// Pre-grow the buffer to the max length of the compressed output
- compressed->resize(MaxCompressedLength(input_length));
+ STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
size_t compressed_length;
RawCompress(input, input_length, string_as_array(compressed),
@@ -1327,7 +1242,7 @@ class SnappyScatteredWriter {
// We need random access into the data generated so far. Therefore
// we keep track of all of the generated data as an array of blocks.
// All of the blocks except the last have length kBlockSize.
- vector<char*> blocks_;
+ std::vector<char*> blocks_;
size_t expected_;
// Total size of all fully generated blocks so far
@@ -1386,8 +1301,7 @@ class SnappyScatteredWriter {
if (length <= 16 && available >= 16 + kMaximumTagLength &&
space_left >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
- UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
- UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+ UnalignedCopy128(ip, op);
op_ptr_ = op + length;
return true;
} else {
@@ -1396,16 +1310,13 @@ class SnappyScatteredWriter {
}
inline bool AppendFromSelf(size_t offset, size_t len) {
+ char* const op_end = op_ptr_ + len;
// See SnappyArrayWriter::AppendFromSelf for an explanation of
// the "offset - 1u" trick.
- if (offset - 1u < op_ptr_ - op_base_) {
- const size_t space_left = op_limit_ - op_ptr_;
- if (space_left >= len + kMaxIncrementCopyOverflow) {
- // Fast path: src and dst in current block.
- IncrementalCopyFastPath(op_ptr_ - offset, op_ptr_, len);
- op_ptr_ += len;
- return true;
- }
+ if (PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ && op_end <= op_limit_)) {
+ // Fast path: src and dst in current block.
+ op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
+ return true;
}
return SlowAppendFromSelf(offset, len);
}
@@ -1510,7 +1421,7 @@ class SnappySinkAllocator {
}
Sink* dest_;
- vector<Datablock> blocks_;
+ std::vector<Datablock> blocks_;
// Note: copying this object is allowed
};
diff --git a/thirdparty/snappy/snappy_unittest.cc b/thirdparty/snappy/snappy_unittest.cc
index 4a80f2ad..ca95e0d8 100644
--- a/thirdparty/snappy/snappy_unittest.cc
+++ b/thirdparty/snappy/snappy_unittest.cc
@@ -64,6 +64,9 @@ DEFINE_bool(write_compressed, false,
DEFINE_bool(write_uncompressed, false,
"Write uncompressed versions of each file to <file>.uncomp");
+DEFINE_bool(snappy_dump_decompression_table, false,
+ "If true, we print the decompression table during tests.");
+
namespace snappy {
@@ -390,10 +393,10 @@ static void Measure(const char* data,
{
// Chop the input into blocks
int num_blocks = (length + block_size - 1) / block_size;
- vector<const char*> input(num_blocks);
- vector<size_t> input_length(num_blocks);
- vector<string> compressed(num_blocks);
- vector<string> output(num_blocks);
+ std::vector<const char*> input(num_blocks);
+ std::vector<size_t> input_length(num_blocks);
+ std::vector<string> compressed(num_blocks);
+ std::vector<string> output(num_blocks);
for (int b = 0; b < num_blocks; b++) {
int input_start = b * block_size;
int input_limit = min<int>((b+1)*block_size, length);
@@ -1004,6 +1007,20 @@ TEST(SnappyCorruption, UnterminatedVarint) {
&uncompressed));
}
+TEST(SnappyCorruption, OverflowingVarint) {
+ string compressed, uncompressed;
+ size_t ulength;
+ compressed.push_back('\xfb');
+ compressed.push_back('\xff');
+ compressed.push_back('\xff');
+ compressed.push_back('\xff');
+ compressed.push_back('\x7f');
+ CHECK(!CheckUncompressedLength(compressed, &ulength));
+ CHECK(!snappy::IsValidCompressedBuffer(compressed.data(), compressed.size()));
+ CHECK(!snappy::Uncompress(compressed.data(), compressed.size(),
+ &uncompressed));
+}
+
TEST(Snappy, ReadPastEndOfBuffer) {
// Check that we do not read past end of input
@@ -1037,7 +1054,10 @@ TEST(Snappy, ZeroOffsetCopyValidation) {
namespace {
int TestFindMatchLength(const char* s1, const char *s2, unsigned length) {
- return snappy::internal::FindMatchLength(s1, s2, s2 + length);
+ std::pair<size_t, bool> p =
+ snappy::internal::FindMatchLength(s1, s2, s2 + length);
+ CHECK_EQ(p.first < 8, p.second);
+ return p.first;
}
} // namespace
@@ -1147,8 +1167,7 @@ TEST(Snappy, FindMatchLengthRandom) {
}
DataEndingAtUnreadablePage u(s);
DataEndingAtUnreadablePage v(t);
- int matched = snappy::internal::FindMatchLength(
- u.data(), v.data(), v.data() + t.size());
+ int matched = TestFindMatchLength(u.data(), v.data(), t.size());
if (matched == t.size()) {
EXPECT_EQ(s, t);
} else {
@@ -1160,6 +1179,100 @@ TEST(Snappy, FindMatchLengthRandom) {
}
}
+static uint16 MakeEntry(unsigned int extra,
+ unsigned int len,
+ unsigned int copy_offset) {
+ // Check that all of the fields fit within the allocated space
+ assert(extra == (extra & 0x7)); // At most 3 bits
+ assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits
+ assert(len == (len & 0x7f)); // At most 7 bits
+ return len | (copy_offset << 8) | (extra << 11);
+}
+
+// Check that the decompression table is correct, and optionally print out
+// the computed one.
+TEST(Snappy, VerifyCharTable) {
+ using snappy::internal::LITERAL;
+ using snappy::internal::COPY_1_BYTE_OFFSET;
+ using snappy::internal::COPY_2_BYTE_OFFSET;
+ using snappy::internal::COPY_4_BYTE_OFFSET;
+ using snappy::internal::char_table;
+ using snappy::internal::wordmask;
+
+ uint16 dst[256];
+
+ // Place invalid entries in all places to detect missing initialization
+ int assigned = 0;
+ for (int i = 0; i < 256; i++) {
+ dst[i] = 0xffff;
+ }
+
+ // Small LITERAL entries. We store (len-1) in the top 6 bits.
+ for (unsigned int len = 1; len <= 60; len++) {
+ dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
+ assigned++;
+ }
+
+ // Large LITERAL entries. We use 60..63 in the high 6 bits to
+ // encode the number of bytes of length info that follow the opcode.
+ for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
+ // We set the length field in the lookup table to 1 because extra
+ // bytes encode len-1.
+ dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
+ assigned++;
+ }
+
+ // COPY_1_BYTE_OFFSET.
+ //
+ // The tag byte in the compressed data stores len-4 in 3 bits, and
+ // offset/256 in 5 bits. offset%256 is stored in the next byte.
+ //
+ // This format is used for length in range [4..11] and offset in
+ // range [0..2047]
+ for (unsigned int len = 4; len < 12; len++) {
+ for (unsigned int offset = 0; offset < 2048; offset += 256) {
+ dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
+ MakeEntry(1, len, offset>>8);
+ assigned++;
+ }
+ }
+
+ // COPY_2_BYTE_OFFSET.
+ // Tag contains len-1 in top 6 bits, and offset in next two bytes.
+ for (unsigned int len = 1; len <= 64; len++) {
+ dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
+ assigned++;
+ }
+
+ // COPY_4_BYTE_OFFSET.
+ // Tag contents len-1 in top 6 bits, and offset in next four bytes.
+ for (unsigned int len = 1; len <= 64; len++) {
+ dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
+ assigned++;
+ }
+
+ // Check that each entry was initialized exactly once.
+ EXPECT_EQ(256, assigned) << "Assigned only " << assigned << " of 256";
+ for (int i = 0; i < 256; i++) {
+ EXPECT_NE(0xffff, dst[i]) << "Did not assign byte " << i;
+ }
+
+ if (FLAGS_snappy_dump_decompression_table) {
+ printf("static const uint16 char_table[256] = {\n ");
+ for (int i = 0; i < 256; i++) {
+ printf("0x%04x%s",
+ dst[i],
+ ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n " : ", ")));
+ }
+ printf("};\n");
+ }
+
+ // Check that computed table matched recorded table.
+ for (int i = 0; i < 256; i++) {
+ EXPECT_EQ(dst[i], char_table[i]) << "Mismatch in byte " << i;
+ }
+}
+
static void CompressFile(const char* fname) {
string fullinput;
CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults()));