author     sewardj <sewardj@a5019735-40e9-0310-863c-91ae7b9d1cf9>   2011-06-24 10:09:41 +0000
committer  sewardj <sewardj@a5019735-40e9-0310-863c-91ae7b9d1cf9>   2011-06-24 10:09:41 +0000
commit     ffce8159a95134f0a2bc1cea3c3e6e265f096d9f (patch)
tree       675b228c1330fcce4e21162fe7ff6b360eade64f /helgrind
parent     6e18a08c22b581aec02577096b470978722167a9 (diff)
Merge the contents of the HGDEV2 branch into trunk:
  * performance and scalability improvements
  * show locks held by both threads in a race
  * show all 4 locks involved in a lock order violation
  * better delimited error messages

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11824 a5019735-40e9-0310-863c-91ae7b9d1cf9
Diffstat (limited to 'helgrind')
-rw-r--r--   helgrind/hg_basics.c            3
-rw-r--r--   helgrind/hg_basics.h           16
-rw-r--r--   helgrind/hg_errors.c          377
-rw-r--r--   helgrind/hg_errors.h            6
-rw-r--r--   helgrind/hg_lock_n_thread.h    10
-rw-r--r--   helgrind/hg_main.c            223
-rw-r--r--   helgrind/libhb.h               20
-rw-r--r--   helgrind/libhb_core.c        1204
8 files changed, 1476 insertions, 383 deletions
diff --git a/helgrind/hg_basics.c b/helgrind/hg_basics.c
index 172b1dae..5bbe6080 100644
--- a/helgrind/hg_basics.c
+++ b/helgrind/hg_basics.c
@@ -82,6 +82,9 @@ Word HG_(clo_sanity_flags) = 0;
Bool HG_(clo_free_is_write) = False;
+UWord HG_(clo_vts_pruning) = 1;
+
+Bool HG_(clo_check_stack_refs) = True;
/*--------------------------------------------------------------------*/
/*--- end hg_basics.c ---*/
diff --git a/helgrind/hg_basics.h b/helgrind/hg_basics.h
index edf05e83..a454a904 100644
--- a/helgrind/hg_basics.h
+++ b/helgrind/hg_basics.h
@@ -110,6 +110,22 @@ extern Word HG_(clo_sanity_flags);
before the free. */
extern Bool HG_(clo_free_is_write);
+/* Controls the application of VTS pruning. There are three levels:
+
+ 0: "never": VTS pruning is never done
+
+ 1: "auto": (the default): VTS pruning is sometimes done after VTS
+ GCs, just frequently enough to keep space use under control, as
+ determined by heuristics in libhb_core.c.
+
+ 2: "always": VTS pruning is done after every VTS GC. This is
+ mostly a big time waster, but minimises space use. */
+extern UWord HG_(clo_vts_pruning);
+
+/* When False, race checking ignores memory references which are to
+ the stack, which speeds things up a bit. Default: True. */
+extern Bool HG_(clo_check_stack_refs);
+
#endif /* ! __HG_BASICS_H */
/*--------------------------------------------------------------------*/
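As a usage sketch (the program name is hypothetical), the two declarations above are wired up by the hg_main.c hunks further down as the command-line options --vts-pruning=never|auto|always and --check-stack-refs=no|yes, e.g.:

   valgrind --tool=helgrind --check-stack-refs=no --vts-pruning=always ./myprog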
diff --git a/helgrind/hg_errors.c b/helgrind/hg_errors.c
index 6076f475..41d99e26 100644
--- a/helgrind/hg_errors.c
+++ b/helgrind/hg_errors.c
@@ -110,11 +110,56 @@ static Word lock_unique_cmp ( UWord lk1W, UWord lk2W )
return 0;
}
-static Lock* mk_LockP_from_LockN ( Lock* lkn )
+/* Given a normal Lock (LockN), convert it to a persistent Lock
+ (LockP). In some cases the LockN could be invalid (if it's been
+ freed), so we enquire, in hg_main.c's admin_locks list, whether it
+ is in fact valid. If allowed_to_be_invalid is True, then it's OK
+ for the LockN to be invalid, in which case Lock_INVALID is
+ returned. In all other cases, we insist that the LockN is a valid
+ lock, and return its corresponding LockP.
+
+ Why can LockNs sometimes be invalid? Because they are harvested
+ from locksets that are attached to the OldRef info for conflicting
+ threads. By the time we detect a race, some of the elements of
+ the lockset may have been destroyed by the client, in which case
+ the corresponding Lock structures we maintain will have been freed.
+
+ So we check that each LockN is a member of the admin_locks double
+ linked list of all Lock structures. That stops us prodding around
+ in potentially freed-up Lock structures. However, it's not quite a
+ proper check: if a new Lock has been reallocated at the same
+ address as one which was previously freed, we'll wind up copying
+ the new one as the basis for the LockP, which is completely bogus
+ because it is unrelated to the previous Lock that lived there.
+ Let's hope that doesn't happen too often.
+*/
+static Lock* mk_LockP_from_LockN ( Lock* lkn,
+ Bool allowed_to_be_invalid )
{
Lock* lkp = NULL;
HG_(stats__LockN_to_P_queries)++;
+
+ /* First off, let's do some sanity checks. If
+ allowed_to_be_invalid is False, we _must_ be able to find 'lkn'
+ in admin_locks; else we must assert. If it is True, it's OK for
+ it not to be findable, but in that case we must return
+ Lock_INVALID right away. */
+ Lock* lock_list = HG_(get_admin_locks)();
+ while (lock_list) {
+ if (lock_list == lkn)
+ break;
+ lock_list = lock_list->admin_next;
+ }
+ if (lock_list == NULL) {
+ /* We didn't find it. That possibility has to be OK'd by the
+ caller. */
+ tl_assert(allowed_to_be_invalid);
+ return Lock_INVALID;
+ }
+
+ /* So we must be looking at a valid LockN. */
tl_assert( HG_(is_sane_LockN)(lkn) );
+
if (!map_LockN_to_P) {
map_LockN_to_P = VG_(newFM)( HG_(zalloc), "hg.mLPfLN.1",
HG_(free), lock_unique_cmp );
@@ -139,6 +184,75 @@ static Lock* mk_LockP_from_LockN ( Lock* lkn )
return lkp;
}
+/* Expand a WordSet of LockN*'s into a NULL-terminated vector of
+ LockP*'s. Any LockN's that can't be converted into a LockP
+ (because they have been freed, see comment on mk_LockP_from_LockN)
+ are converted instead into the value Lock_INVALID. Hence the
+ returned vector is a sequence: zero or more (valid LockP* or
+ Lock_INVALID), terminated by a NULL. */
+static
+Lock** enumerate_WordSet_into_LockP_vector( WordSetU* univ_lsets,
+ WordSetID lockset,
+ Bool allowed_to_be_invalid )
+{
+ tl_assert(univ_lsets);
+ tl_assert( HG_(plausibleWS)(univ_lsets, lockset) );
+ UWord nLocks = HG_(cardinalityWS)(univ_lsets, lockset);
+ Lock** lockPs = HG_(zalloc)( "hg.eWSiLPa",
+ (nLocks+1) * sizeof(Lock*) );
+ tl_assert(lockPs);
+ tl_assert(lockPs[nLocks] == NULL); /* pre-NULL terminated */
+ UWord* lockNs = NULL;
+ UWord nLockNs = 0;
+ if (nLocks > 0) {
+ /* HG_(getPayloadWS) doesn't assign non-NULL to &lockNs if the
+ lockset is empty; hence the guarding "if". Sigh. */
+ HG_(getPayloadWS)( &lockNs, &nLockNs, univ_lsets, lockset );
+ tl_assert(lockNs);
+ }
+ UWord i;
+ /* Convert to LockPs. */
+ for (i = 0; i < nLockNs; i++) {
+ lockPs[i] = mk_LockP_from_LockN( (Lock*)lockNs[i],
+ allowed_to_be_invalid );
+ }
+ return lockPs;
+}
+
+/* Get the number of useful elements in a vector created by
+ enumerate_WordSet_into_LockP_vector. Returns both the total number
+ of elements (not including the terminating NULL) and the number of
+ non-Lock_INVALID elements. */
+static void count_LockP_vector ( /*OUT*/UWord* nLocks,
+ /*OUT*/UWord* nLocksValid,
+ Lock** vec )
+{
+ tl_assert(vec);
+ *nLocks = *nLocksValid = 0;
+ UWord n = 0;
+ while (vec[n]) {
+ (*nLocks)++;
+ if (vec[n] != Lock_INVALID)
+ (*nLocksValid)++;
+ n++;
+ }
+}
+
+/* Find out whether 'lk' is in 'vec'. */
+static Bool elem_LockP_vector ( Lock** vec, Lock* lk )
+{
+ tl_assert(vec);
+ tl_assert(lk);
+ UWord n = 0;
+ while (vec[n]) {
+ if (vec[n] == lk)
+ return True;
+ n++;
+ }
+ return False;
+}
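As a small illustrative sketch (lkp1 and lkp2 stand for hypothetical valid LockP pointers), a vector produced by enumerate_WordSet_into_LockP_vector and consumed by the two helpers above has this shape:

   Lock* vec[] = { lkp1, Lock_INVALID, lkp2, NULL };
   UWord nLocks, nLocksValid;
   count_LockP_vector( &nLocks, &nLocksValid, vec );
   /* yields nLocks == 3 (terminating NULL excluded), nLocksValid == 2;
      elem_LockP_vector(vec, lkp2) returns True. */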
+
+
/* Errors:
race: program counter
@@ -179,6 +293,7 @@ typedef
Int szB;
Bool isWrite;
Thread* thr;
+ Lock** locksHeldW;
/* descr1/2 provide a description of stack/global locs */
XArray* descr1; /* XArray* of HChar */
XArray* descr2; /* XArray* of HChar */
@@ -195,6 +310,7 @@ typedef
ExeContext* h2_ct_accEC;
Int h2_ct_accSzB;
Bool h2_ct_accIsW;
+ Lock** h2_ct_locksHeldW;
} Race;
struct {
Thread* thr; /* doing the unlocking */
@@ -217,10 +333,19 @@ typedef
} PthAPIerror;
struct {
Thread* thr;
- Addr before_ga; /* always locked first in prog. history */
- Addr after_ga;
- ExeContext* before_ec;
- ExeContext* after_ec;
+ /* The first 4 fields describe the previously observed
+ (should-be) ordering. */
+ Addr shouldbe_earlier_ga;
+ Addr shouldbe_later_ga;
+ ExeContext* shouldbe_earlier_ec;
+ ExeContext* shouldbe_later_ec;
+ /* In principle we need to record two more stacks, from
+ this thread, when acquiring the locks in the "wrong"
+ order. In fact the wallclock-later acquisition by this
+ thread is recorded in the main stack for this error.
+ So we only need a stack for the earlier acquisition by
+ this thread. */
+ ExeContext* actual_earlier_ec;
} LockOrder;
struct {
Thread* thr;
@@ -264,6 +389,18 @@ UInt HG_(update_extra) ( Error* err )
if (xe->tag == XE_Race) {
+ /* Note the set of locks that the thread is (w-)holding.
+ Convert the WordSetID of LockN*'s into a NULL-terminated
+ vector of LockP*'s. We don't expect to encounter any invalid
+ LockNs in this conversion. */
+ tl_assert(xe->XE.Race.thr);
+ xe->XE.Race.locksHeldW
+ = enumerate_WordSet_into_LockP_vector(
+ HG_(get_univ_lsets)(),
+ xe->XE.Race.thr->locksetW,
+ False/*!allowed_to_be_invalid*/
+ );
+
/* See if we can come up with a source level description of the
raced-upon address. This is potentially expensive, which is
why it's only done at the update_extra point, not when the
@@ -325,18 +462,19 @@ UInt HG_(update_extra) ( Error* err )
can rustle up a plausible-looking conflicting memory access
to show. */
if (HG_(clo_history_level) >= 2) {
- Thr* thrp = NULL;
- ExeContext* wherep = NULL;
- Addr acc_addr = xe->XE.Race.data_addr;
- Int acc_szB = xe->XE.Race.szB;
- Thr* acc_thr = xe->XE.Race.thr->hbthr;
- Bool acc_isW = xe->XE.Race.isWrite;
- SizeT conf_szB = 0;
- Bool conf_isW = False;
+ Thr* thrp = NULL;
+ ExeContext* wherep = NULL;
+ Addr acc_addr = xe->XE.Race.data_addr;
+ Int acc_szB = xe->XE.Race.szB;
+ Thr* acc_thr = xe->XE.Race.thr->hbthr;
+ Bool acc_isW = xe->XE.Race.isWrite;
+ SizeT conf_szB = 0;
+ Bool conf_isW = False;
+ WordSetID conf_locksHeldW = 0;
tl_assert(!xe->XE.Race.h2_ct_accEC);
tl_assert(!xe->XE.Race.h2_ct);
if (libhb_event_map_lookup(
- &wherep, &thrp, &conf_szB, &conf_isW,
+ &wherep, &thrp, &conf_szB, &conf_isW, &conf_locksHeldW,
acc_thr, acc_addr, acc_szB, acc_isW )) {
Thread* threadp;
tl_assert(wherep);
@@ -347,6 +485,12 @@ UInt HG_(update_extra) ( Error* err )
xe->XE.Race.h2_ct = threadp;
xe->XE.Race.h2_ct_accSzB = (Int)conf_szB;
xe->XE.Race.h2_ct_accIsW = conf_isW;
+ xe->XE.Race.h2_ct_locksHeldW
+ = enumerate_WordSet_into_LockP_vector(
+ HG_(get_univ_lsets)(),
+ conf_locksHeldW,
+ True/*allowed_to_be_invalid*/
+ );
}
}
@@ -424,8 +568,10 @@ void HG_(record_error_UnlockUnlocked) ( Thread* thr, Lock* lk )
tl_assert( HG_(is_sane_LockN)(lk) );
init_XError(&xe);
xe.tag = XE_UnlockUnlocked;
- xe.XE.UnlockUnlocked.thr = thr;
- xe.XE.UnlockUnlocked.lock = mk_LockP_from_LockN(lk);
+ xe.XE.UnlockUnlocked.thr
+ = thr;
+ xe.XE.UnlockUnlocked.lock
+ = mk_LockP_from_LockN(lk, False/*!allowed_to_be_invalid*/);
// FIXME: tid vs thr
tl_assert( HG_(is_sane_ThreadId)(thr->coretid) );
tl_assert( thr->coretid != VG_INVALID_THREADID );
@@ -444,7 +590,8 @@ void HG_(record_error_UnlockForeign) ( Thread* thr,
xe.tag = XE_UnlockForeign;
xe.XE.UnlockForeign.thr = thr;
xe.XE.UnlockForeign.owner = owner;
- xe.XE.UnlockForeign.lock = mk_LockP_from_LockN(lk);
+ xe.XE.UnlockForeign.lock
+ = mk_LockP_from_LockN(lk, False/*!allowed_to_be_invalid*/);
// FIXME: tid vs thr
tl_assert( HG_(is_sane_ThreadId)(thr->coretid) );
tl_assert( thr->coretid != VG_INVALID_THREADID );
@@ -468,8 +615,12 @@ void HG_(record_error_UnlockBogus) ( Thread* thr, Addr lock_ga )
}
void HG_(record_error_LockOrder)(
- Thread* thr, Addr before_ga, Addr after_ga,
- ExeContext* before_ec, ExeContext* after_ec
+ Thread* thr,
+ Addr shouldbe_earlier_ga,
+ Addr shouldbe_later_ga,
+ ExeContext* shouldbe_earlier_ec,
+ ExeContext* shouldbe_later_ec,
+ ExeContext* actual_earlier_ec
)
{
XError xe;
@@ -478,10 +629,11 @@ void HG_(record_error_LockOrder)(
init_XError(&xe);
xe.tag = XE_LockOrder;
xe.XE.LockOrder.thr = thr;
- xe.XE.LockOrder.before_ga = before_ga;
- xe.XE.LockOrder.before_ec = before_ec;
- xe.XE.LockOrder.after_ga = after_ga;
- xe.XE.LockOrder.after_ec = after_ec;
+ xe.XE.LockOrder.shouldbe_earlier_ga = shouldbe_earlier_ga;
+ xe.XE.LockOrder.shouldbe_earlier_ec = shouldbe_earlier_ec;
+ xe.XE.LockOrder.shouldbe_later_ga = shouldbe_later_ga;
+ xe.XE.LockOrder.shouldbe_later_ec = shouldbe_later_ec;
+ xe.XE.LockOrder.actual_earlier_ec = actual_earlier_ec;
// FIXME: tid vs thr
tl_assert( HG_(is_sane_ThreadId)(thr->coretid) );
tl_assert( thr->coretid != VG_INVALID_THREADID );
@@ -639,6 +791,10 @@ static Bool announce_one_thread ( Thread* thr )
} else {
+ VG_(umsg)("---Thread-Announcement----------"
+ "--------------------------------" "\n");
+ VG_(umsg)("\n");
+
if (thr->errmsg_index == 1) {
tl_assert(thr->created_at == NULL);
VG_(message)(Vg_UserMsg,
@@ -659,6 +815,76 @@ static Bool announce_one_thread ( Thread* thr )
}
+/* Announce 'lk'. */
+static void announce_LockP ( Lock* lk )
+{
+ tl_assert(lk);
+ if (lk == Lock_INVALID)
+ return; /* Can't be announced -- we know nothing about it. */
+ tl_assert(lk->magic == LockP_MAGIC);
+ if (!lk->appeared_at)
+ return; /* There's nothing we can show */
+
+ if (VG_(clo_xml)) {
+ /* fixme: add announcement */
+ } else {
+ VG_(umsg)( "Lock at %p was first observed\n",
+ (void*)lk->guestaddr );
+ VG_(pp_ExeContext)( lk->appeared_at );
+ VG_(umsg)("\n");
+ }
+}
+
+/* Announce (that is, print point-of-first-observation) for the
+ locks in 'lockvec' and, if non-NULL, 'lockvec2'. */
+static void announce_combined_LockP_vecs ( Lock** lockvec,
+ Lock** lockvec2 )
+{
+ UWord i;
+ tl_assert(lockvec);
+ for (i = 0; lockvec[i]; i++) {
+ announce_LockP(lockvec[i]);
+ }
+ if (lockvec2) {
+ for (i = 0; lockvec2[i]; i++) {
+ Lock* lk = lockvec2[i];
+ if (!elem_LockP_vector(lockvec, lk))
+ announce_LockP(lk);
+ }
+ }
+}
+
+
+static void show_LockP_summary_textmode ( Lock** locks, HChar* pre )
+{
+ tl_assert(locks);
+ UWord i;
+ UWord nLocks = 0, nLocksValid = 0;
+ count_LockP_vector(&nLocks, &nLocksValid, locks);
+ tl_assert(nLocksValid <= nLocks);
+
+ if (nLocks == 0) {
+ VG_(umsg)( "%sLocks held: none", pre );
+ } else {
+ VG_(umsg)( "%sLocks held: %lu, at address%s ",
+ pre, nLocks, nLocksValid == 1 ? "" : "es" );
+ }
+
+ if (nLocks > 0) {
+ for (i = 0; i < nLocks; i++) {
+ if (locks[i] == Lock_INVALID)
+ continue;
+ VG_(umsg)( "%p", (void*)locks[i]->guestaddr);
+ if (locks[i+1] != NULL)
+ VG_(umsg)(" ");
+ }
+ if (nLocksValid < nLocks)
+ VG_(umsg)(" (and %lu that can't be shown)", nLocks - nLocksValid);
+ }
+ VG_(umsg)("\n");
+}
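For illustration (the addresses are hypothetical), the summary line this helper adds to the text-mode race report looks like:

   Locks held: 2, at addresses 0x4C3A040 0x4C3A0C8

or, when the accessing thread held nothing:

   Locks held: none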
+
+
/* This is the "this error is due to be printed shortly; so have a
look at it and print any preamble you want" function. We use it to
announce any previously un-announced threads in the upcoming error
@@ -707,6 +933,12 @@ void HG_(pp_Error) ( Error* err )
{
const Bool xml = VG_(clo_xml); /* a shorthand, that's all */
+ if (!xml) {
+ VG_(umsg)("--------------------------------"
+ "--------------------------------" "\n");
+ VG_(umsg)("\n");
+ }
+
XError *xe = (XError*)VG_(get_error_extra)(err);
tl_assert(xe);
@@ -758,38 +990,54 @@ void HG_(pp_Error) ( Error* err )
emit( " <text>Thread #%d: lock order \"%p before %p\" "
"violated</text>\n",
(Int)xe->XE.LockOrder.thr->errmsg_index,
- (void*)xe->XE.LockOrder.before_ga,
- (void*)xe->XE.LockOrder.after_ga );
+ (void*)xe->XE.LockOrder.shouldbe_earlier_ga,
+ (void*)xe->XE.LockOrder.shouldbe_later_ga );
emit( " <hthreadid>%d</hthreadid>\n",
(Int)xe->XE.LockOrder.thr->errmsg_index );
emit( " </xwhat>\n" );
VG_(pp_ExeContext)( VG_(get_error_where)(err) );
- if (xe->XE.LockOrder.before_ec && xe->XE.LockOrder.after_ec) {
+ if (xe->XE.LockOrder.shouldbe_earlier_ec
+ && xe->XE.LockOrder.shouldbe_later_ec) {
emit( " <auxwhat>Required order was established by "
"acquisition of lock at %p</auxwhat>\n",
- (void*)xe->XE.LockOrder.before_ga );
- VG_(pp_ExeContext)( xe->XE.LockOrder.before_ec );
+ (void*)xe->XE.LockOrder.shouldbe_earlier_ga );
+ VG_(pp_ExeContext)( xe->XE.LockOrder.shouldbe_earlier_ec );
emit( " <auxwhat>followed by a later acquisition "
"of lock at %p</auxwhat>\n",
- (void*)xe->XE.LockOrder.after_ga );
- VG_(pp_ExeContext)( xe->XE.LockOrder.after_ec );
+ (void*)xe->XE.LockOrder.shouldbe_later_ga );
+ VG_(pp_ExeContext)( xe->XE.LockOrder.shouldbe_later_ec );
}
} else {
emit( "Thread #%d: lock order \"%p before %p\" violated\n",
(Int)xe->XE.LockOrder.thr->errmsg_index,
- (void*)xe->XE.LockOrder.before_ga,
- (void*)xe->XE.LockOrder.after_ga );
+ (void*)xe->XE.LockOrder.shouldbe_earlier_ga,
+ (void*)xe->XE.LockOrder.shouldbe_later_ga );
+ emit( "\n" );
+ emit( "Observed (incorrect) order is: "
+ "acquisition of lock at %p\n",
+ (void*)xe->XE.LockOrder.shouldbe_later_ga);
+ if (xe->XE.LockOrder.actual_earlier_ec) {
+ VG_(pp_ExeContext)(xe->XE.LockOrder.actual_earlier_ec);
+ } else {
+ emit(" (stack unavailable)\n");
+ }
+ emit( "\n" );
+ emit(" followed by a later acquisition of lock at %p\n",
+ (void*)xe->XE.LockOrder.shouldbe_earlier_ga);
VG_(pp_ExeContext)( VG_(get_error_where)(err) );
- if (xe->XE.LockOrder.before_ec && xe->XE.LockOrder.after_ec) {
- emit( " Required order was established by "
+ if (xe->XE.LockOrder.shouldbe_earlier_ec
+ && xe->XE.LockOrder.shouldbe_later_ec) {
+ emit("\n");
+ emit( "Required order was established by "
"acquisition of lock at %p\n",
- (void*)xe->XE.LockOrder.before_ga );
- VG_(pp_ExeContext)( xe->XE.LockOrder.before_ec );
- emit( " followed by a later acquisition of lock at %p\n",
- (void*)xe->XE.LockOrder.after_ga );
- VG_(pp_ExeContext)( xe->XE.LockOrder.after_ec );
+ (void*)xe->XE.LockOrder.shouldbe_earlier_ga );
+ VG_(pp_ExeContext)( xe->XE.LockOrder.shouldbe_earlier_ec );
+ emit( "\n" );
+ emit( " followed by a later acquisition of lock at %p\n",
+ (void*)xe->XE.LockOrder.shouldbe_later_ga );
+ VG_(pp_ExeContext)( xe->XE.LockOrder.shouldbe_later_ec );
}
}
@@ -960,8 +1208,8 @@ void HG_(pp_Error) ( Error* err )
emit( " <kind>Race</kind>\n" );
emit( " <xwhat>\n" );
emit( " <text>Possible data race during %s of size %d "
- "at %#lx by thread #%d</text>\n",
- what, szB, err_ga, (Int)xe->XE.Race.thr->errmsg_index );
+ "at %p by thread #%d</text>\n",
+ what, szB, (void*)err_ga, (Int)xe->XE.Race.thr->errmsg_index );
emit( " <hthreadid>%d</hthreadid>\n",
(Int)xe->XE.Race.thr->errmsg_index );
emit( " </xwhat>\n" );
@@ -1005,18 +1253,27 @@ void HG_(pp_Error) ( Error* err )
} else {
/* ------ Text ------ */
+ announce_combined_LockP_vecs( xe->XE.Race.locksHeldW,
+ xe->XE.Race.h2_ct_locksHeldW );
+
emit( "Possible data race during %s of size %d "
- "at %#lx by thread #%d\n",
- what, szB, err_ga, (Int)xe->XE.Race.thr->errmsg_index );
+ "at %p by thread #%d\n",
+ what, szB, (void*)err_ga, (Int)xe->XE.Race.thr->errmsg_index );
+
+ tl_assert(xe->XE.Race.locksHeldW);
+ show_LockP_summary_textmode( xe->XE.Race.locksHeldW, "" );
VG_(pp_ExeContext)( VG_(get_error_where)(err) );
if (xe->XE.Race.h2_ct) {
tl_assert(xe->XE.Race.h2_ct_accEC); // assured by update_extra
- emit( " This conflicts with a previous %s of size %d "
+ tl_assert(xe->XE.Race.h2_ct_locksHeldW);
+ emit( "\n" );
+ emit( "This conflicts with a previous %s of size %d "
"by thread #%d\n",
xe->XE.Race.h2_ct_accIsW ? "write" : "read",
xe->XE.Race.h2_ct_accSzB,
xe->XE.Race.h2_ct->errmsg_index );
+ show_LockP_summary_textmode( xe->XE.Race.h2_ct_locksHeldW, "" );
VG_(pp_ExeContext)( xe->XE.Race.h2_ct_accEC );
}
@@ -1044,13 +1301,14 @@ void HG_(pp_Error) ( Error* err )
if (xe->XE.Race.hctxt) {
SizeT delta = err_ga - xe->XE.Race.haddr;
if (xml) {
- emit(" <auxwhat>Address %#lx is %ld bytes inside a block "
- "of size %ld alloc'd</auxwhat>\n", err_ga, delta,
+ emit(" <auxwhat>Address %p is %ld bytes inside a block "
+ "of size %ld alloc'd</auxwhat>\n", (void*)err_ga, delta,
xe->XE.Race.hszB);
VG_(pp_ExeContext)( xe->XE.Race.hctxt );
} else {
- emit(" Address %#lx is %ld bytes inside a block "
- "of size %ld alloc'd\n", err_ga, delta,
+ emit("\n");
+ emit("Address %p is %ld bytes inside a block "
+ "of size %ld alloc'd\n", (void*)err_ga, delta,
xe->XE.Race.hszB);
VG_(pp_ExeContext)( xe->XE.Race.hctxt );
}
@@ -1060,12 +1318,23 @@ void HG_(pp_Error) ( Error* err )
Note that in XML mode, it will already by nicely wrapped up
in tags, either <auxwhat> or <xauxwhat>, so we can just emit
it verbatim. */
- if (xe->XE.Race.descr1)
- emit( "%s%s\n", xml ? " " : " ",
- (HChar*)VG_(indexXA)( xe->XE.Race.descr1, 0 ) );
- if (xe->XE.Race.descr2)
- emit( "%s%s\n", xml ? " " : " ",
- (HChar*)VG_(indexXA)( xe->XE.Race.descr2, 0 ) );
+ if (xml) {
+ if (xe->XE.Race.descr1)
+ emit( " %s\n",
+ (HChar*)VG_(indexXA)( xe->XE.Race.descr1, 0 ) );
+ if (xe->XE.Race.descr2)
+ emit( " %s\n",
+ (HChar*)VG_(indexXA)( xe->XE.Race.descr2, 0 ) );
+ } else {
+ if (xe->XE.Race.descr1 || xe->XE.Race.descr2)
+ emit("\n");
+ if (xe->XE.Race.descr1)
+ emit( "%s\n",
+ (HChar*)VG_(indexXA)( xe->XE.Race.descr1, 0 ) );
+ if (xe->XE.Race.descr2)
+ emit( "%s\n",
+ (HChar*)VG_(indexXA)( xe->XE.Race.descr2, 0 ) );
+ }
break; /* case XE_Race */
} /* case XE_Race */
diff --git a/helgrind/hg_errors.h b/helgrind/hg_errors.h
index 2a19fe36..115ec923 100644
--- a/helgrind/hg_errors.h
+++ b/helgrind/hg_errors.h
@@ -57,8 +57,12 @@ void HG_(record_error_UnlockUnlocked) ( Thread*, Lock* );
void HG_(record_error_UnlockForeign) ( Thread*, Thread*, Lock* );
void HG_(record_error_UnlockBogus) ( Thread*, Addr );
void HG_(record_error_PthAPIerror) ( Thread*, HChar*, Word, HChar* );
+
+/* see the implementation for meaning of these params */
void HG_(record_error_LockOrder) ( Thread*, Addr, Addr,
- ExeContext*, ExeContext* );
+ ExeContext*, ExeContext*,
+ ExeContext* );
+
void HG_(record_error_Misc_w_aux) ( Thread*, HChar* errstr,
HChar* auxstr, ExeContext* auxctx );
void HG_(record_error_Misc) ( Thread* thr, HChar* errstr );
diff --git a/helgrind/hg_lock_n_thread.h b/helgrind/hg_lock_n_thread.h
index e109b551..1a6291af 100644
--- a/helgrind/hg_lock_n_thread.h
+++ b/helgrind/hg_lock_n_thread.h
@@ -45,9 +45,9 @@
#define LockP_MAGIC 0x755b5456 /* persistent (copied) locks */
-/* These are handles for Word sets. CONSTRAINTS: must be (very) small
- ints numbered from zero, since < 30-bit versions of them are used to
- encode thread-sets and lock-sets in 32-bit shadow words. */
+/* These are handles for Word sets. CONSTRAINTS: must be small ints
+ numbered from zero, since 32-bit versions of them are used to
+ encode lock-sets in libhb's history records (Thr_n_RCEC). */
typedef WordSet WordSetID;
@@ -97,6 +97,8 @@ typedef
}
Thread;
+/* Get hg's admin_threads value, so libhb can visit all of them. */
+Thread* get_admin_threads ( void );
/* Stores information about a lock's current state. These are
allocated and later freed (when the containing memory becomes
@@ -154,6 +156,8 @@ typedef
}
Lock;
+#define Lock_INVALID ((Lock*)1UL)
+
/*----------------------------------------------------------------*/
/*--- Sanity checking ---*/
/*----------------------------------------------------------------*/
diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c
index bbda2c03..e89ef308 100644
--- a/helgrind/hg_main.c
+++ b/helgrind/hg_main.c
@@ -120,6 +120,7 @@ static void all__sanity_check ( Char* who ); /* fwds */
/* Admin linked list of Threads */
static Thread* admin_threads = NULL;
+Thread* get_admin_threads ( void ) { return admin_threads; }
/* Admin double linked list of Locks */
/* We need a double linked list to properly and efficiently
@@ -136,6 +137,14 @@ static WordFM* map_locks = NULL; /* WordFM LockAddr Lock* */
static WordSetU* univ_lsets = NULL; /* sets of Lock* */
static WordSetU* univ_laog = NULL; /* sets of Lock*, for LAOG */
+/* Allow libhb to get at the universe of locksets stored
+ here. Sigh. */
+WordSetU* HG_(get_univ_lsets) ( void ) { return univ_lsets; }
+
+/* Allow libhb to get at the list of locks stored here. Ditto
+ sigh. */
+Lock* HG_(get_admin_locks) ( void ) { return admin_locks; }
+
/*----------------------------------------------------------------*/
/*--- Simple helpers for the data structures ---*/
@@ -537,6 +546,7 @@ static void pp_everything ( Int flags, Char* caller )
static void initialise_data_structures ( Thr* hbthr_root )
{
Thread* thr;
+ WordSetID wsid;
/* Get everything initialised and zeroed. */
tl_assert(admin_threads == NULL);
@@ -558,6 +568,11 @@ static void initialise_data_structures ( Thr* hbthr_root )
univ_lsets = HG_(newWordSetU)( HG_(zalloc), "hg.ids.4", HG_(free),
8/*cacheSize*/ );
tl_assert(univ_lsets != NULL);
+ /* Ensure that univ_lsets is non-empty, with lockset zero being the
+ empty lockset. hg_errors.c relies on the assumption that
+ lockset number zero in univ_lsets is always valid. */
+ wsid = HG_(emptyWS)(univ_lsets);
+ tl_assert(wsid == 0);
tl_assert(univ_laog == NULL);
if (HG_(clo_track_lockorders)) {
@@ -1653,10 +1668,21 @@ void evh__HG_PTHREAD_JOIN_POST ( ThreadId stay_tid, Thread* quit_thr )
/* Send last arg of _so_send as False, since the sending thread
doesn't actually exist any more, so we don't want _so_send to
try taking stack snapshots of it. */
- libhb_so_send(hbthr_q, so, True/*strong_send*/);
+ libhb_so_send(hbthr_q, so, True/*strong_send*//*?!? wrt comment above*/);
libhb_so_recv(hbthr_s, so, True/*strong_recv*/);
libhb_so_dealloc(so);
+ /* Tell libhb that the quitter has been reaped. Note that we might
+ have to be cleverer about this, to exclude 2nd and subsequent
+ notifications for the same hbthr_q, in the case where the app is
+ buggy (calls pthread_join twice or more on the same thread) AND
+ where libpthread is also buggy and doesn't return ESRCH on
+ subsequent calls. (If libpthread isn't thusly buggy, then the
+ wrapper for pthread_join in hg_intercepts.c will stop us getting
+ notified here multiple times for the same joinee.) See also
+ comments in helgrind/tests/jointwice.c. */
+ libhb_joinedwith_done(hbthr_q);
+
/* evh__pre_thread_ll_exit issues an error message if the exiting
thread holds any locks. No need to check here. */
@@ -2152,32 +2178,53 @@ static void evh__HG_PTHREAD_COND_SIGNAL_PRE ( ThreadId tid, void* cond )
// Hmm. POSIX doesn't actually say that it's an error to call
// pthread_cond_signal with the associated mutex being unlocked.
// Although it does say that it should be "if consistent scheduling
- // is desired."
+ // is desired." For that reason, print "dubious" if the lock isn't
+ // held by any thread. Skip the "dubious" if it is held by some
+ // other thread; that sounds straight-out wrong.
+ //
+ // Anybody who writes code that signals on a CV without holding
+ // the associated MX needs to be shipped off to a lunatic asylum
+ // ASAP, even though POSIX doesn't actually declare such behaviour
+ // illegal -- it makes code extremely difficult to understand/
+ // reason about. In particular it puts the signalling thread in
+ // a situation where it is racing against the released waiter
+ // as soon as the signalling is done, and so there needs to be
+ // some auxiliary synchronisation mechanism in the program that
+ // makes this safe -- or the race(s) need to be harmless, or
+ // probably nonexistent.
//
- // For the moment, disable these checks.
- //lk = map_locks_maybe_lookup(cvi->mx_ga);
- //if (lk == NULL || cvi->mx_ga == 0) {
- // HG_(record_error_Misc)( thr,
- // "pthread_cond_{signal,broadcast}: "
- // "no or invalid mutex associated with cond");
- //}
- ///* note: lk could be NULL. Be careful. */
- //if (lk) {
- // if (lk->kind == LK_rdwr) {
- // HG_(record_error_Misc)(thr,
- // "pthread_cond_{signal,broadcast}: associated lock is a rwlock");
- // }
- // if (lk->heldBy == NULL) {
- // HG_(record_error_Misc)(thr,
- // "pthread_cond_{signal,broadcast}: "
- // "associated lock is not held by any thread");
- // }
- // if (lk->heldBy != NULL && 0 == VG_(elemBag)(lk->heldBy, (Word)thr)) {
- // HG_(record_error_Misc)(thr,
- // "pthread_cond_{signal,broadcast}: "
- // "associated lock is not held by calling thread");
- // }
- //}
+ if (1) {
+ Lock* lk = NULL;
+ if (cvi->mx_ga != 0) {
+ lk = map_locks_maybe_lookup( (Addr)cvi->mx_ga );
+ }
+ /* note: lk could be NULL. Be careful. */
+ if (lk) {
+ if (lk->kind == LK_rdwr) {
+ HG_(record_error_Misc)(thr,
+ "pthread_cond_{signal,broadcast}: associated lock is a rwlock");
+ }
+ if (lk->heldBy == NULL) {
+ HG_(record_error_Misc)(thr,
+ "pthread_cond_{signal,broadcast}: dubious: "
+ "associated lock is not held by any thread");
+ }
+ if (lk->heldBy != NULL && 0 == VG_(elemBag)(lk->heldBy, (Word)thr)) {
+ HG_(record_error_Misc)(thr,
+ "pthread_cond_{signal,broadcast}: "
+ "associated lock is not held by calling thread");
+ }
+ } else {
+ /* Couldn't even find the damn thing. */
+ // But actually .. that's not necessarily an error. We don't
+ // know the (CV,MX) binding until a pthread_cond_wait or bcast
+ // shows us what it is, and that may not have happened yet.
+ // So just keep quiet in this circumstance.
+ //HG_(record_error_Misc)( thr,
+ // "pthread_cond_{signal,broadcast}: "
+ // "no or invalid mutex associated with cond");
+ }
+ }
libhb_so_send( thr->hbthr, cvi->so, True/*strong_send*/ );
}
@@ -2280,7 +2327,7 @@ static void evh__HG_PTHREAD_COND_WAIT_POST ( ThreadId tid,
it? If this happened it would surely be a bug in the threads
library. Or one of those fabled "spurious wakeups". */
HG_(record_error_Misc)( thr, "Bug in libpthread: pthread_cond_wait "
- "succeeded on"
+ "succeeded"
" without prior pthread_cond_post");
}
@@ -3533,12 +3580,12 @@ static void laog__pre_thread_acquires_lock (
tl_assert(found->dst_ec);
HG_(record_error_LockOrder)(
thr, lk->guestaddr, other->guestaddr,
- found->src_ec, found->dst_ec );
+ found->src_ec, found->dst_ec, other->acquired_at );
} else {
/* Hmm. This can't happen (can it?) */
HG_(record_error_LockOrder)(
thr, lk->guestaddr, other->guestaddr,
- NULL, NULL );
+ NULL, NULL, NULL );
}
}
@@ -3901,11 +3948,18 @@ Bool HG_(mm_find_containing_block)( /*OUT*/ExeContext** where,
/*--- Instrumentation ---*/
/*--------------------------------------------------------------*/
-static void instrument_mem_access ( IRSB* bbOut,
+#define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
+#define mkexpr(_tmp) IRExpr_RdTmp((_tmp))
+#define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
+#define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
+#define assign(_t, _e) IRStmt_WrTmp((_t), (_e))
+
+static void instrument_mem_access ( IRSB* sbOut,
IRExpr* addr,
Int szB,
Bool isStore,
- Int hWordTy_szB )
+ Int hWordTy_szB,
+ Int goff_sp )
{
IRType tyAddr = Ity_INVALID;
HChar* hName = NULL;
@@ -3914,10 +3968,15 @@ static void instrument_mem_access ( IRSB* bbOut,
IRExpr** argv = NULL;
IRDirty* di = NULL;
+ // THRESH is the size of the window above SP (well,
+ // mostly above) that we assume implies a stack reference.
+ const Int THRESH = 4096 * 4; // somewhat arbitrary
+ const Int rz_szB = VG_STACK_REDZONE_SZB;
+
tl_assert(isIRAtom(addr));
tl_assert(hWordTy_szB == 4 || hWordTy_szB == 8);
- tyAddr = typeOfIRExpr( bbOut->tyenv, addr );
+ tyAddr = typeOfIRExpr( sbOut->tyenv, addr );
tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
/* So the effective address is in 'addr' now. */
@@ -3984,14 +4043,71 @@ static void instrument_mem_access ( IRSB* bbOut,
}
}
- /* Add the helper. */
+ /* Create the helper. */
tl_assert(hName);
tl_assert(hAddr);
tl_assert(argv);
di = unsafeIRDirty_0_N( regparms,
hName, VG_(fnptr_to_fnentry)( hAddr ),
argv );
- addStmtToIRSB( bbOut, IRStmt_Dirty(di) );
+
+ if (! HG_(clo_check_stack_refs)) {
+ /* We're ignoring memory references which are (obviously) to the
+ stack. In fact just skip stack refs that are within 4 pages
+ of SP (SP - the redzone, really), as that's simple, easy, and
+ filters out most stack references. */
+ /* Generate the guard condition: "(addr - (SP - RZ)) >u N", for
+ some arbitrary N. If that is true then addr is outside the
+ range (SP - RZ .. SP + N - RZ). If N is smallish (a few
+ pages) then we can say addr is within a few pages of SP and
+ so can't possibly be a heap access, and so can be skipped.
+
+ Note that the condition simplifies to
+ (addr - SP + RZ) >u N
+ which generates better code in x86/amd64 backends, but it does
+ not unfortunately simplify to
+ (addr - SP) >u (N - RZ)
+ (would be beneficial because N - RZ is a constant) because
+ wraparound arithmetic messes up the comparison. eg.
+ 20 >u 10 == True,
+ but (20 - 15) >u (10 - 15) == 5 >u (MAXINT-5) == False.
+ */
+ IRTemp sp = newIRTemp(sbOut->tyenv, tyAddr);
+ addStmtToIRSB( sbOut, assign(sp, IRExpr_Get(goff_sp, tyAddr)));
+
+ /* "addr - SP" */
+ IRTemp addr_minus_sp = newIRTemp(sbOut->tyenv, tyAddr);
+ addStmtToIRSB(
+ sbOut,
+ assign(addr_minus_sp,
+ tyAddr == Ity_I32
+ ? binop(Iop_Sub32, addr, mkexpr(sp))
+ : binop(Iop_Sub64, addr, mkexpr(sp)))
+ );
+
+ /* "addr - SP + RZ" */
+ IRTemp diff = newIRTemp(sbOut->tyenv, tyAddr);
+ addStmtToIRSB(
+ sbOut,
+ assign(diff,
+ tyAddr == Ity_I32
+ ? binop(Iop_Add32, mkexpr(addr_minus_sp), mkU32(rz_szB))
+ : binop(Iop_Add64, mkexpr(addr_minus_sp), mkU64(rz_szB)))
+ );
+
+ IRTemp guard = newIRTemp(sbOut->tyenv, Ity_I1);
+ addStmtToIRSB(
+ sbOut,
+ assign(guard,
+ tyAddr == Ity_I32
+ ? binop(Iop_CmpLT32U, mkU32(THRESH), mkexpr(diff))
+ : binop(Iop_CmpLT64U, mkU64(THRESH), mkexpr(diff)))
+ );
+ di->guard = mkexpr(guard);
+ }
+
+ /* Add the helper. */
+ addStmtToIRSB( sbOut, IRStmt_Dirty(di) );
}
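The guard logic above is easier to follow as a plain C sketch. This is only an illustration (the function name and standalone framing are hypothetical; the real test is emitted as IR against the guest SP, exactly as in the code above):

   #include <stdint.h>

   /* True iff 'addr' lies in the window [sp - rz_szB, sp - rz_szB + thresh],
      i.e. it is (almost certainly) a stack reference and the race-check
      helper can be skipped.  The IR guard generated above is the negation
      of this condition. */
   static int probably_stack_ref ( uintptr_t addr, uintptr_t sp,
                                   uintptr_t rz_szB, uintptr_t thresh )
   {
      /* Unsigned arithmetic: if addr is below sp - rz_szB, the subtraction
         wraps to a huge value and the comparison fails -- the same
         wraparound that stops the test being rearranged to
         (addr - sp) >u (N - RZ), as noted in the comment above. */
      return (addr - sp + rz_szB) <= thresh;
   }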
@@ -4039,6 +4155,8 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
Bool inLDSO = False;
Addr64 inLDSOmask4K = 1; /* mismatches on first check */
+ const Int goff_sp = layout->offset_SP;
+
if (gWordTy != hWordTy) {
/* We don't currently support this case. */
VG_(tool_panic)("host/guest word size mismatch");
@@ -4130,7 +4248,7 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
(isDCAS ? 2 : 1)
* sizeofIRType(typeOfIRExpr(bbIn->tyenv, cas->dataLo)),
False/*!isStore*/,
- sizeofIRType(hWordTy)
+ sizeofIRType(hWordTy), goff_sp
);
}
break;
@@ -4150,7 +4268,7 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
st->Ist.LLSC.addr,
sizeofIRType(dataTy),
False/*!isStore*/,
- sizeofIRType(hWordTy)
+ sizeofIRType(hWordTy), goff_sp
);
}
} else {
@@ -4169,7 +4287,7 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
st->Ist.Store.addr,
sizeofIRType(typeOfIRExpr(bbIn->tyenv, st->Ist.Store.data)),
True/*isStore*/,
- sizeofIRType(hWordTy)
+ sizeofIRType(hWordTy), goff_sp
);
}
break;
@@ -4185,7 +4303,7 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
data->Iex.Load.addr,
sizeofIRType(data->Iex.Load.ty),
False/*!isStore*/,
- sizeofIRType(hWordTy)
+ sizeofIRType(hWordTy), goff_sp
);
}
}
@@ -4205,7 +4323,7 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
if (!inLDSO) {
instrument_mem_access(
bbOut, d->mAddr, dataSize, False/*!isStore*/,
- sizeofIRType(hWordTy)
+ sizeofIRType(hWordTy), goff_sp
);
}
}
@@ -4213,7 +4331,7 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
if (!inLDSO) {
instrument_mem_access(
bbOut, d->mAddr, dataSize, True/*isStore*/,
- sizeofIRType(hWordTy)
+ sizeofIRType(hWordTy), goff_sp
);
}
}
@@ -4237,6 +4355,12 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
return bbOut;
}
+#undef binop
+#undef mkexpr
+#undef mkU32
+#undef mkU64
+#undef assign
+
/*----------------------------------------------------------------*/
/*--- Client requests ---*/
@@ -4620,6 +4744,17 @@ static Bool hg_process_cmd_line_option ( Char* arg )
else if VG_BOOL_CLO(arg, "--free-is-write",
HG_(clo_free_is_write)) {}
+
+ else if VG_XACT_CLO(arg, "--vts-pruning=never",
+ HG_(clo_vts_pruning), 0);
+ else if VG_XACT_CLO(arg, "--vts-pruning=auto",
+ HG_(clo_vts_pruning), 1);
+ else if VG_XACT_CLO(arg, "--vts-pruning=always",
+ HG_(clo_vts_pruning), 2);
+
+ else if VG_BOOL_CLO(arg, "--check-stack-refs",
+ HG_(clo_check_stack_refs)) {}
+
else
return VG_(replacement_malloc_process_cmd_line_option)(arg);
@@ -4636,6 +4771,8 @@ static void hg_print_usage ( void )
" approx: full trace for one thread, approx for the other (faster)\n"
" none: only show trace for one thread in a race (fastest)\n"
" --conflict-cache-size=N size of 'full' history cache [1000000]\n"
+" --check-stack-refs=no|yes race-check reads and writes on the\n"
+" main stack and thread stacks? [yes]\n"
);
}
@@ -4653,6 +4790,12 @@ static void hg_print_debug_usage ( void )
"ranges >= %d bytes\n", SCE_BIGRANGE_T);
VG_(printf)(" 000010 at lock/unlock events\n");
VG_(printf)(" 000001 at thread create/join events\n");
+ VG_(printf)(
+" --vts-pruning=never|auto|always [auto]\n"
+" never: is never done (may cause big space leaks in Helgrind)\n"
+" auto: done just often enough to keep space usage under control\n"
+" always: done after every VTS GC (mostly just a big time waster)\n"
+ );
}
static void hg_fini ( Int exitcode )
diff --git a/helgrind/libhb.h b/helgrind/libhb.h
index 4cfc3ed2..10768d03 100644
--- a/helgrind/libhb.h
+++ b/helgrind/libhb.h
@@ -55,7 +55,8 @@ void libhb_shutdown ( Bool show_stats );
Thr* libhb_create ( Thr* parent );
/* Thread async exit */
-void libhb_async_exit ( Thr* exitter );
+void libhb_async_exit ( Thr* exitter );
+void libhb_joinedwith_done ( Thr* exitter );
/* Synchronisation objects (abstract to caller) */
@@ -145,11 +146,22 @@ void libhb_maybe_GC ( void );
/* Extract info from the conflicting-access machinery. */
Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
- /*OUT*/Thr** resThr,
- /*OUT*/SizeT* resSzB,
- /*OUT*/Bool* resIsW,
+ /*OUT*/Thr** resThr,
+ /*OUT*/SizeT* resSzB,
+ /*OUT*/Bool* resIsW,
+ /*OUT*/WordSetID* locksHeldW,
Thr* thr, Addr a, SizeT szB, Bool isW );
+/* ------ Exported from hg_main.c ------ */
+/* Yes, this is a horrible tangle. Sigh. */
+
+/* Get the univ_lset (universe for locksets) from hg_main.c. Sigh. */
+WordSetU* HG_(get_univ_lsets) ( void );
+
+/* Get the header pointer for the double linked list of locks
+ (admin_locks). */
+Lock* HG_(get_admin_locks) ( void );
+
#endif /* __LIBHB_H */
/*--------------------------------------------------------------------*/
diff --git a/helgrind/libhb_core.c b/helgrind/libhb_core.c
index cb4530d9..0e3f90c9 100644
--- a/helgrind/libhb_core.c
+++ b/helgrind/libhb_core.c
@@ -94,30 +94,27 @@
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
// //
-// Forward declarations //
+// data decls: VtsID //
// //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
-/* fwds for
- Globals needed by other parts of the library. These are set
- once at startup and then never changed. */
-static void (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
-static ExeContext* (*main_get_EC)( Thr* ) = NULL;
+/* VtsIDs: Unique small-integer IDs for VTSs. VtsIDs can't exceed 30
+ bits, since they have to be packed into the lowest 30 bits of an
+ SVal. */
+typedef UInt VtsID;
+#define VtsID_INVALID 0xFFFFFFFF
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
// //
-// SECTION BEGIN compressed shadow memory //
+// data decls: SVal //
// //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
-#ifndef __HB_ZSM_H
-#define __HB_ZSM_H
-
typedef ULong SVal;
/* This value has special significance to the implementation, and callers
@@ -129,6 +126,261 @@ typedef ULong SVal;
value. TODO: make this caller-defineable. */
#define SVal_NOACCESS (2ULL << 62)
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+// //
+// data decls: ScalarTS //
+// //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+/* Scalar Timestamp. We have to store a lot of these, so there is
+ some effort to make them as small as possible. Logically they are
+ a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
+ We pack it into 64 bits by representing the Thr* using a ThrID, a
+ small integer (18 bits), and a 46 bit integer for the timestamp
+ number. The 46/18 split is arbitrary, but has the effect that
+ Helgrind can only handle programs that create 2^18 or fewer threads
+ over their entire lifetime, and have no more than 2^46 timestamp
+ ticks (synchronisation operations on the same thread).
+
+ This doesn't seem like much of a limitation. 2^46 ticks is
+ 7.06e+13, and if each tick (optimistically) takes the machine 1000
+ cycles to process, then the minimum time to process that many ticks
+ at a clock rate of 5 GHz is 162.9 days. And that's doing nothing
+ but VTS ticks, which isn't realistic.
+
+ NB1: SCALARTS_N_THRBITS must be 29 or lower. The obvious limit is
+ 32 since a ThrID is a UInt. 29 comes from the fact that
+ 'Thr_n_RCEC', which records information about old accesses, packs
+ not only a ThrID but also 2+1 other bits (access size and
+ writeness) in a UInt, hence limiting size to 32-(2+1) == 29.
+
+ NB2: thrid values are issued upwards from 1024, and values less
+ than that aren't valid. This isn't per se necessary (any order
+ will do, so long as they are unique), but it does help ensure they
+ are less likely to get confused with the various other kinds of
+ small-integer thread ids drifting around (eg, TId). See also NB5.
+
+ NB3: this probably also relies on the fact that Thr's are never
+ deallocated -- they exist forever. Hence the 1-1 mapping from
+ Thr's to thrid values (set up in Thr__new) persists forever.
+
+ NB4: temp_max_sized_VTS is allocated at startup and never freed.
+ It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
+ ScalarTSs. So we can't make SCALARTS_N_THRBITS too large without
+ making the memory use for this go sky-high. With
+ SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
+ like an OK tradeoff. If more than 256k threads need to be
+ supported, we could change SCALARTS_N_THRBITS to 20, which would
+ facilitate supporting 1 million threads at the cost of 8MB storage
+ for temp_max_sized_VTS.
+
+ NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
+ ThrID == 0 to denote an empty Thr_n_RCEC record. So ThrID == 0
+ must never be a valid ThrID. Given NB2 that's OK.
+*/
+#define SCALARTS_N_THRBITS 18 /* valid range: 11 to 29 inclusive */
+
+#define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
+typedef
+ struct {
+ ThrID thrid : SCALARTS_N_THRBITS;
+ ULong tym : SCALARTS_N_TYMBITS;
+ }
+ ScalarTS;
+
+#define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
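As a sketch of the size claim made in the comment above (the compile-time checks are an assumption for illustration, not part of the diff; the packing relies on the GCC/Clang bit-field layout that Valgrind targets):

   #include <assert.h>

   typedef unsigned int       ThrID;   /* a UInt, as stated in NB1 */
   typedef unsigned long long ULong;
   #define SCALARTS_N_THRBITS 18
   #define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
   typedef struct {
      ThrID thrid : SCALARTS_N_THRBITS;   /* which thread             */
      ULong tym   : SCALARTS_N_TYMBITS;   /* that thread's clock tick */
   } ScalarTS;

   int main ( void )
   {
      /* 18 + 46 bits pack into one 64-bit word, halving the naive
         (Thr*, ULong) pair mentioned above. */
      assert(sizeof(ScalarTS) == 8);
      /* ThrID_MAX_VALID: at most 2^18 - 1 = 262143 threads ever. */
      assert(((1u << SCALARTS_N_THRBITS) - 1) == 262143);
      return 0;
   }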
+
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+// //
+// data decls: Filter //
+// //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+// baseline: 5, 9
+#define FI_LINE_SZB_LOG2 5
+#define FI_NUM_LINES_LOG2 10
+
+#define FI_LINE_SZB (1 << FI_LINE_SZB_LOG2)
+#define FI_NUM_LINES (1 << FI_NUM_LINES_LOG2)
+
+#define FI_TAG_MASK (~(Addr)(FI_LINE_SZB - 1))
+#define FI_GET_TAG(_a) ((_a) & FI_TAG_MASK)
+
+#define FI_GET_LINENO(_a) ( ((_a) >> FI_LINE_SZB_LOG2) \
+ & (Addr)(FI_NUM_LINES-1) )
+
+
+/* In the lines, each 8 bytes are treated individually, and are mapped
+ to a UShort. Regardless of endianness of the underlying machine,
+ bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
+ the highest address.
+
+ Of each bit pair, the higher numbered bit is set if a R has been
+ seen, so the actual layout is:
+
+ 15 14 ... 01 00
+
+ R W for addr+7 ... R W for addr+0
+
+ So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
+*/
+
+/* tags are separated from lines. tags are Addrs and are
+ the base address of the line. */
+typedef
+ struct {
+ UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
+ }
+ FiLine;
+
+typedef
+ struct {
+ Addr tags[FI_NUM_LINES];
+ FiLine lines[FI_NUM_LINES];
+ }
+ Filter;
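A small sketch (the helper name is hypothetical; it assumes the FiLine typedef just defined) of how the per-byte R/W bit pairs described above would be read back out of a line:

   static inline unsigned fi_rw_bits_for_byte ( const FiLine* line,
                                                unsigned byteOff /* 0 .. FI_LINE_SZB-1 */ )
   {
      /* u16s[k] covers bytes 8k .. 8k+7 of the line; within each UShort,
         byte j owns bits (2j+1, 2j): R in the odd bit, W in the even bit,
         matching the 0xAAAA / 0x5555 masks mentioned above. */
      UShort w = line->u16s[byteOff / 8];
      return (w >> (2 * (byteOff % 8))) & 3u;   /* bit 1 = R seen, bit 0 = W seen */
   }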
+
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+// //
+// data decls: Thr, ULong_n_EC //
+// //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+// Records stacks for H1 history mechanism (DRD-style)
+typedef
+ struct { ULong ull; ExeContext* ec; }
+ ULong_n_EC;
+
+
+/* How many of the above records to collect for each thread? Older
+ ones are dumped when we run out of space. 62.5k requires 1MB per
+ thread, since each ULong_n_EC record is 16 bytes long. When more
+ than N_KWs_N_STACKs_PER_THREAD are present, the older half are
+ deleted to make space. Hence in the worst case we will be able to
+ produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
+ Kw transitions (segments in this thread). For the current setting
+ that gives a guaranteed stack for at least the last 31.25k
+ segments. */
+#define N_KWs_N_STACKs_PER_THREAD 62500
+
+
+struct _Thr {
+ /* Current VTSs for this thread. They change as we go along. viR
+ is the VTS to be used for reads, viW for writes. Usually they
+ are the same, but can differ when we deal with reader-writer
+ locks. It is always the case that
+ VtsID__cmpLEQ(viW,viR) == True
+ that is, viW must be the same, or lagging behind, viR. */
+ VtsID viR;
+ VtsID viW;
+
+ /* Is initially False, and is set to True after the thread really
+ has done a low-level exit. When True, we expect to never see
+ any more memory references done by this thread. */
+ Bool llexit_done;
+
+ /* Is initially False, and is set to True after the thread has been
+ joined with (reaped by some other thread). After this point, we
+ do not expect to see any uses of .viR or .viW, so it is safe to
+ set them to VtsID_INVALID. */
+ Bool joinedwith_done;
+
+ /* A small integer giving a unique identity to this Thr. See
+ comments on the definition of ScalarTS for details. */
+ ThrID thrid : SCALARTS_N_THRBITS;
+
+ /* A filter that removes references for which we believe that
+ msmcread/msmcwrite will not change the state, nor report a
+ race. */
+ Filter* filter;
+
+ /* A pointer back to the top level Thread structure. There is a
+ 1-1 mapping between Thread and Thr structures -- each Thr points
+ at its corresponding Thread, and vice versa. Really, Thr and
+ Thread should be merged into a single structure. */
+ Thread* hgthread;
+
+ /* The ULongs (scalar Kws) in this accumulate in strictly
+ increasing order, without duplicates. This is important because
+ we need to be able to find a given scalar Kw in this array
+ later, by binary search. */
+ XArray* /* ULong_n_EC */ local_Kws_n_stacks;
+};
+
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+// //
+// data decls: SO //
+// //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+// (UInt) `echo "Synchronisation object" | md5sum`
+#define SO_MAGIC 0x56b3c5b0U
+
+struct _SO {
+ struct _SO* admin_prev;
+ struct _SO* admin_next;
+ VtsID viR; /* r-clock of sender */
+ VtsID viW; /* w-clock of sender */
+ UInt magic;
+};
+
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+// //
+// Forward declarations //
+// //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+/* fwds for
+ Globals needed by other parts of the library. These are set
+ once at startup and then never changed. */
+static void (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
+static ExeContext* (*main_get_EC)( Thr* ) = NULL;
+
+/* misc fn and data fwdses */
+static void VtsID__rcinc ( VtsID ii );
+static void VtsID__rcdec ( VtsID ii );
+
+static inline Bool SVal__isC ( SVal s );
+static inline VtsID SVal__unC_Rmin ( SVal s );
+static inline VtsID SVal__unC_Wmin ( SVal s );
+static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );
+
+/* A double linked list of all the SO's. */
+SO* admin_SO;
+
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+// //
+// SECTION BEGIN compressed shadow memory //
+// //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+#ifndef __HB_ZSM_H
+#define __HB_ZSM_H
+
/* Initialise the library. Once initialised, it will (or may) call
rcinc and rcdec in response to all the calls below, in order to
allow the user to do reference counting on the SVals stored herein.
@@ -1539,58 +1791,6 @@ static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
static ThrID Thr__to_ThrID ( Thr* thr ); /* fwds */
static Thr* Thr__from_ThrID ( ThrID thrid ); /* fwds */
-
-/* Scalar Timestamp. We have to store a lot of these, so there is
- some effort to make them as small as possible. Logically they are
- a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
- We pack it into 64 bits by representing the Thr* using a ThrID, a
- small integer (18 bits), and a 46 bit integer for the timestamp
- number. The 46/18 split is arbitary, but has the effect that
- Helgrind can only handle programs that create 2^18 or fewer threads
- over their entire lifetime, and have no more than 2^46 timestamp
- ticks (synchronisation operations on the same thread).
-
- This doesn't seem like much of a limitation. 2^46 ticks is
- 7.06e+13, and if each tick (optimistically) takes the machine 1000
- cycles to process, then the minimum time to process that many ticks
- at a clock rate of 5 GHz is 162.9 days. And that's doing nothing
- but VTS ticks, which isn't realistic.
-
- NB1: SCALARTS_N_THRBITS must be 32 or lower, so they fit in a ThrID
- (== a UInt).
-
- NB2: thrid values are issued upwards from 1024, and values less
- than that aren't valid. This isn't per se necessary (any order
- will do, so long as they are unique), but it does help ensure they
- are less likely to get confused with the various other kinds of
- small-integer thread ids drifting around (eg, TId).
-
- NB3: this probably also relies on the fact that Thr's are never
- deallocated -- they exist forever. Hence the 1-1 mapping from
- Thr's to thrid values (set up in Thr__new) persists forever.
-
- NB4: temp_max_sized_VTS is allocated at startup and never freed.
- It is a maximum sized VTS, so has (1 << SCALARTS_N_TYMBITS)
- ScalarTSs. So we can't make SCALARTS_N_THRBITS too large without
- making the memory use for this go sky-high. With
- SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
- like an OK tradeoff. If more than 256k threads need to be
- supported, we could change SCALARTS_N_THRBITS to 20, which would
- facilitate supporting 1 million threads at the cost of 8MB storage
- for temp_max_sized_VTS.
-*/
-#define SCALARTS_N_THRBITS 18 /* valid range: 11 to 32 inclusive */
-#define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
-typedef
- struct {
- ThrID thrid : SCALARTS_N_THRBITS;
- ULong tym : SCALARTS_N_TYMBITS;
- }
- ScalarTS;
-
-#define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
-
-
__attribute__((noreturn))
static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
{
@@ -1618,11 +1818,38 @@ static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
}
-/* VtsIDs: Unique small-integer IDs for VTSs. VtsIDs can't exceed 30
- bits, since they have to be packed into the lowest 30 bits of an
- SVal. */
-typedef UInt VtsID;
-#define VtsID_INVALID 0xFFFFFFFF
+/* The dead thread (ThrID, actually) table. A thread may only be
+ listed here if we have been notified thereof by libhb_async_exit.
+ New entries are added at the end. The order isn't important, but
+ the ThrID values must be unique. This table lists the identity of
+ all threads that have ever died -- none are ever removed. We keep
+ this table so as to be able to prune entries from VTSs. We don't
+ actually need to keep the set of threads that have ever died --
+ only the threads that have died since the previous round of
+ pruning. But it's useful for sanity check purposes to keep the
+ entire set, so we do. */
+static XArray* /* of ThrID */ verydead_thread_table = NULL;
+
+/* Arbitrary total ordering on ThrIDs. */
+static Int cmp__ThrID ( void* v1, void* v2 ) {
+ ThrID id1 = *(ThrID*)v1;
+ ThrID id2 = *(ThrID*)v2;
+ if (id1 < id2) return -1;
+ if (id1 > id2) return 1;
+ return 0;
+}
+
+static void verydead_thread_table_init ( void )
+{
+ tl_assert(!verydead_thread_table);
+ verydead_thread_table
+ = VG_(newXA)( HG_(zalloc),
+ "libhb.verydead_thread_table_init.1",
+ HG_(free), sizeof(ThrID) );
+ tl_assert(verydead_thread_table);
+ VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
+}
+
/* A VTS contains .ts, its vector clock, and also .id, a field to hold
a backlink for the caller's convenience. Since we have no idea
@@ -1640,10 +1867,16 @@ typedef
/* Allocate a VTS capable of storing 'sizeTS' entries. */
static VTS* VTS__new ( HChar* who, UInt sizeTS );
-/* Make a clone of 'vts', resizing the array to exactly match the
+/* Make a clone of 'vts', sizing the new array to exactly match the
number of ScalarTSs present. */
static VTS* VTS__clone ( HChar* who, VTS* vts );
+/* Make a clone of 'vts' with the thrids in 'thrids' removed. The new
+ array is sized exactly to hold the number of required elements.
+ 'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
+ must be in strictly increasing order. */
+static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel );
+
/* Delete this VTS in its entirety. */
static void VTS__delete ( VTS* vts );
@@ -1687,6 +1920,12 @@ static void VTS__show ( HChar* buf, Int nBuf, VTS* vts );
/* Debugging only. Return vts[index], so to speak. */
static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
+/* Notify the VTS machinery that a thread has been declared
+ comprehensively dead: that is, it has done an async exit AND it has
+ been joined with. This should ensure that its local clocks (.viR
+ and .viW) will never again change, and so all mentions of this
+ thread from all VTSs in the system may be removed. */
+static void VTS__declare_thread_very_dead ( Thr* idx );
/*--------------- to do with Vector Timestamps ---------------*/
@@ -1749,6 +1988,42 @@ static VTS* VTS__clone ( HChar* who, VTS* vts )
}
+/* Make a clone of a VTS with specified ThrIDs removed. 'thridsToDel'
+ must be in strictly increasing order. We could obviously do this
+ much more efficiently (in linear time) if necessary.
+*/
+static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel )
+{
+ UInt i, j;
+ tl_assert(vts);
+ tl_assert(thridsToDel);
+ tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
+ UInt nTS = vts->usedTS;
+ /* Figure out how many ScalarTSs will remain in the output. */
+ UInt nReq = nTS;
+ for (i = 0; i < nTS; i++) {
+ ThrID thrid = vts->ts[i].thrid;
+ if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
+ nReq--;
+ }
+ tl_assert(nReq <= nTS);
+ /* Copy the ones that will remain. */
+ VTS* res = VTS__new(who, nReq);
+ j = 0;
+ for (i = 0; i < nTS; i++) {
+ ThrID thrid = vts->ts[i].thrid;
+ if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
+ continue;
+ res->ts[j++] = vts->ts[i];
+ }
+ tl_assert(j == nReq);
+ tl_assert(j == res->sizeTS);
+ res->usedTS = j;
+ tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
+ return res;
+}
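As a concrete illustration (the thrid and tym values are hypothetical): subtracting thridsToDel = { 1025 } from a VTS whose ScalarTSs are { (thrid 1024, tym 5), (1025, 9), (1026, 2) } yields a freshly allocated VTS { (1024, 5), (1026, 2) }, with usedTS == sizeTS == 2. Surviving entries keep their original order, so the strictly-increasing-thrid invariant is preserved in the result.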
+
+
/* Delete this VTS in its entirety.
*/
static void VTS__delete ( VTS* vts )
@@ -2155,6 +2430,32 @@ ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
}
+/* See comment on prototype above.
+*/
+static void VTS__declare_thread_very_dead ( Thr* thr )
+{
+ if (0) VG_(printf)("VTQ: tae %p\n", thr);
+
+ tl_assert(thr->llexit_done);
+ tl_assert(thr->joinedwith_done);
+
+ ThrID nyu;
+ nyu = Thr__to_ThrID(thr);
+ VG_(addToXA)( verydead_thread_table, &nyu );
+
+ /* We can only get here if we're assured that we'll never again
+ need to look at this thread's ::viR or ::viW. Set them to
+ VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
+ mostly so that we don't wind up pruning them (as that would be
+ nonsensical: the only interesting ScalarTS entry for a dead
+ thread is its own index, and the pruning will remove that.). */
+ VtsID__rcdec(thr->viR);
+ VtsID__rcdec(thr->viW);
+ thr->viR = VtsID_INVALID;
+ thr->viW = VtsID_INVALID;
+}
+
+
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
// //
@@ -2180,7 +2481,7 @@ ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
// //
/////////////////////////////////////////////////////////
-static WordFM* /* VTS* void void */ vts_set = NULL;
+static WordFM* /* WordFM VTS* void */ vts_set = NULL;
static void vts_set_init ( void )
{
@@ -2232,18 +2533,19 @@ static void VtsID__invalidate_caches ( void ); /* fwds */
If .vts == NULL, then this entry is not in use, so:
- .rc == 0
- this entry is on the freelist (unfortunately, does not imply
- any constraints on value for .nextfree)
+ any constraints on value for .freelink)
If .vts != NULL, then this entry is in use:
- .vts is findable in vts_set
- .vts->id == this entry number
- no specific value for .rc (even 0 is OK)
- - this entry is not on freelist, so .nextfree == VtsID_INVALID
+ - this entry is not on freelist, so .freelink == VtsID_INVALID
*/
typedef
struct {
VTS* vts; /* vts, in vts_set */
UWord rc; /* reference count - enough for entire aspace */
VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
+ VtsID remap; /* used only during pruning */
}
VtsTE;
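[Editor's note: the free/in-use invariants described in the comment above can be
summarised as a small standalone checker. This is an illustrative model only: the
Entry type stands in for VtsTE, and ID_INVALID stands in for VtsID_INVALID.]

    #include <assert.h>
    #include <stddef.h>

    #define ID_INVALID 0xFFFFFFFFu

    typedef struct {
       void*    vts;       /* NULL <=> entry is free                  */
       unsigned rc;        /* reference count                         */
       unsigned freelink;  /* free-list chain; ID_INVALID when in use */
    } Entry;

    static void check_invariants ( const Entry* tab, size_t n )
    {
       for (size_t i = 0; i < n; i++) {
          if (tab[i].vts == NULL) {
             /* free: rc must be zero; freelink is unconstrained */
             assert(tab[i].rc == 0);
          } else {
             /* in use: any rc is fine, but must not be on the free list */
             assert(tab[i].freelink == ID_INVALID);
          }
       }
    }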
@@ -2309,6 +2611,7 @@ static VtsID get_new_VtsID ( void )
te.vts = NULL;
te.rc = 0;
te.freelink = VtsID_INVALID;
+ te.remap = VtsID_INVALID;
ii = (VtsID)VG_(addToXA)( vts_tab, &te );
return ii;
}
@@ -2394,12 +2697,53 @@ static void show_vts_stats ( HChar* caller )
VG_(printf)(" total rc %4llu\n", totrc);
}
+
+/* --- Helpers for VtsID pruning --- */
+
+static
+void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
+ /*MOD*/XArray* /* of VtsTE */ new_tab,
+ VtsID* ii )
+{
+ VtsTE *old_te, *new_te;
+ VtsID old_id, new_id;
+ /* We're relying here on VG_(indexXA)'s range checking to assert on
+ any stupid values, in particular *ii == VtsID_INVALID. */
+ old_id = *ii;
+ old_te = VG_(indexXA)( old_tab, old_id );
+ old_te->rc--;
+ new_id = old_te->remap;
+ new_te = VG_(indexXA)( new_tab, new_id );
+ new_te->rc++;
+ *ii = new_id;
+}
+
+static
+void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
+ /*MOD*/XArray* /* of VtsTE */ new_tab,
+ SVal* s )
+{
+ SVal old_sv, new_sv;
+ old_sv = *s;
+ if (SVal__isC(old_sv)) {
+ VtsID rMin, wMin;
+ rMin = SVal__unC_Rmin(old_sv);
+ wMin = SVal__unC_Wmin(old_sv);
+ remap_VtsID( old_tab, new_tab, &rMin );
+ remap_VtsID( old_tab, new_tab, &wMin );
+ new_sv = SVal__mkC( rMin, wMin );
+ *s = new_sv;
+ }
+}
+
+
/* NOT TO BE CALLED FROM WITHIN libzsm. */
__attribute__((noinline))
static void vts_tab__do_GC ( Bool show_stats )
{
UWord i, nTab, nLive, nFreed;
+ /* ---------- BEGIN VTS GC ---------- */
/* check this is actually necessary. */
tl_assert(vts_tab_freelist == VtsID_INVALID);
@@ -2418,13 +2762,13 @@ static void vts_tab__do_GC ( Bool show_stats )
show_vts_stats("before GC");
}
- /* Now we can inspect the entire vts_tab. Any entries
- with zero .rc fields are now no longer in use and can be
+ /* Now we can inspect the entire vts_tab. Any entries with zero
+ .rc fields are now no longer in use and can be put back on the
free list, removed from vts_set, and deleted. */
nFreed = 0;
for (i = 0; i < nTab; i++) {
Bool present;
- UWord oldK = 0, oldV = 0;
+ UWord oldK = 0, oldV = 12345;
VtsTE* te = VG_(indexXA)( vts_tab, i );
if (te->vts == NULL) {
tl_assert(te->rc == 0);
@@ -2466,12 +2810,332 @@ static void vts_tab__do_GC ( Bool show_stats )
}
if (VG_(clo_stats)) {
- static UInt ctr = 0;
+ static UInt ctr = 1;
tl_assert(nTab > 0);
VG_(message)(Vg_DebugMsg,
"libhb: VTS GC: #%u old size %lu live %lu (%2llu%%)\n",
ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
}
+ /* ---------- END VTS GC ---------- */
+
+ /* Decide whether to do VTS pruning. We have one of three
+ settings. */
+ static UInt pruning_auto_ctr = 0; /* do not make non-static */
+
+ Bool do_pruning = False;
+ switch (HG_(clo_vts_pruning)) {
+ case 0: /* never */
+ break;
+ case 1: /* auto */
+ do_pruning = (++pruning_auto_ctr % 5) == 0;
+ break;
+ case 2: /* always */
+ do_pruning = True;
+ break;
+ default:
+ tl_assert(0);
+ }
+
+ /* The rest of this routine only handles pruning, so we can
+ quit at this point if it is not to be done. */
+ if (!do_pruning)
+ return;
+
+ /* ---------- BEGIN VTS PRUNING ---------- */
+ /* We begin by sorting the table of very-dead threads (ThrID
+ values), so as to (1) check they are unique [else something has
+ gone wrong, since it means we must have seen some Thr* exiting
+ more than once, which can't happen], and (2) allow quick lookup
+ of the dead-thread entries as we work through the VTSs. */
+ VG_(sortXA)( verydead_thread_table );
+ /* Sanity check: the ThrID values must be unique and in order. */
+ UWord nBT = VG_(sizeXA)( verydead_thread_table );
+ if (nBT > 0) {
+ ThrID thrid1, thrid2;
+ thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, 0 );
+ for (i = 1; i < nBT; i++) {
+ thrid1 = thrid2;
+ thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, i );
+ tl_assert(thrid1 < thrid2);
+ }
+ }
+ /* Ok, so the dead thread table has unique and in-order keys. */
+
+ /* We will run through the old table, and create a new table and
+ set, at the same time setting the .remap entries in the old
+ table to point to the new entries. Then, visit every VtsID in
+ the system, and replace all of them with new ones, using the
+ .remap entries in the old table. Finally, we can delete the old
+ table and set. */
+
+ XArray* /* of VtsTE */ new_tab
+ = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
+ HG_(free), sizeof(VtsTE) );
+
+ /* WordFM VTS* void */
+ WordFM* new_set
+ = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
+ HG_(free),
+ (Word(*)(UWord,UWord))VTS__cmp_structural );
+
+ /* Visit each old VTS. For each one:
+
+ * make a pruned version
+
+ * search new_set for the pruned version, yielding either
+ Nothing (not present) or the new VtsID for it.
+
+ * if not present, allocate a new VtsID for it, insert (pruned
+ VTS, new VtsID) in the tree, and set
+ remap_table[old VtsID] = new VtsID.
+
+ * if present, set remap_table[old VtsID] = new VtsID, where
+ new VtsID was determined by the tree lookup. Then free up
+ the clone.
+ */
+
+ UWord nBeforePruning = 0, nAfterPruning = 0;
+ UWord nSTSsBefore = 0, nSTSsAfter = 0;
+ VtsID new_VtsID_ctr = 0;
+
+ for (i = 0; i < nTab; i++) {
+
+ /* For each old VTS .. */
+ VtsTE* old_te = VG_(indexXA)( vts_tab, i );
+ VTS* old_vts = old_te->vts;
+ tl_assert(old_te->remap == VtsID_INVALID);
+
+ /* Skip it if not in use */
+ if (old_te->rc == 0) {
+ tl_assert(old_vts == NULL);
+ continue;
+ }
+ tl_assert(old_vts != NULL);
+ tl_assert(old_vts->id == i);
+ tl_assert(old_vts->ts != NULL);
+
+ /* It is in use. Make a pruned version. */
+ nBeforePruning++;
+ nSTSsBefore += old_vts->usedTS;
+ VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
+ old_vts, verydead_thread_table);
+ tl_assert(new_vts->sizeTS == new_vts->usedTS);
+ tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
+ == 0x0ddC0ffeeBadF00dULL);
+
+ /* Get rid of the old VTS and the tree entry. It's a bit more
+ complex to incrementally delete the VTSs now than to nuke
+ them all after we're done, but the upside is that we don't
+ wind up temporarily storing potentially two complete copies
+ of each VTS and hence spiking memory use. */
+ UWord oldK = 0, oldV = 12345;
+ Bool present = VG_(delFromFM)( vts_set,
+ &oldK, &oldV, (UWord)old_vts );
+ tl_assert(present); /* else it isn't in vts_set ?! */
+ tl_assert(oldV == 0); /* no info stored in vts_set val fields */
+ tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
+ /* now free the VTS itself */
+ VTS__delete(old_vts);
+ old_te->vts = NULL;
+ old_vts = NULL;
+
+ /* NO MENTIONS of old_vts allowed beyond this point. */
+
+ /* Ok, we have the pruned copy in new_vts. See if a
+ structurally identical version is already present in new_set.
+ If so, delete the one we just made and move on; if not, add
+ it. */
+ VTS* identical_version = NULL;
+ UWord valW = 12345;
+ if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
+ (UWord)new_vts)) {
+ // already have it
+ tl_assert(valW == 0);
+ tl_assert(identical_version != NULL);
+ tl_assert(identical_version != new_vts);
+ VTS__delete(new_vts);
+ new_vts = identical_version;
+ tl_assert(new_vts->id != VtsID_INVALID);
+ } else {
+ tl_assert(valW == 12345);
+ tl_assert(identical_version == NULL);
+ new_vts->id = new_VtsID_ctr++;
+ Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
+ tl_assert(!b);
+ VtsTE new_te;
+ new_te.vts = new_vts;
+ new_te.rc = 0;
+ new_te.freelink = VtsID_INVALID;
+ new_te.remap = VtsID_INVALID;
+ Word j = VG_(addToXA)( new_tab, &new_te );
+ tl_assert(j <= i);
+ tl_assert(j == new_VtsID_ctr - 1);
+ // stats
+ nAfterPruning++;
+ nSTSsAfter += new_vts->usedTS;
+ }
+ old_te->remap = new_vts->id;
+
+ } /* for (i = 0; i < nTab; i++) */
+
+ /* At this point, we have:
+ * the old VTS table, with its .remap entries set,
+ and with all .vts == NULL.
+ * the old VTS tree should be empty, since it and the old VTSs
+ it contained have been incrementally deleted as we worked
+ through the old table.
+ * the new VTS table, with all .rc == 0, all .freelink and .remap
+ == VtsID_INVALID.
+ * the new VTS tree.
+ */
+ tl_assert( VG_(sizeFM)(vts_set) == 0 );
+
+ /* Now actually apply the mapping. */
+ /* Visit all the VtsIDs in the entire system. Where do we expect
+ to find them?
+ (a) in shadow memory -- the LineZs and LineFs
+ (b) in our collection of struct _Thrs.
+ (c) in our collection of struct _SOs.
+ Nowhere else, AFAICS. Not in the zsm cache, because that just
+ got invalidated.
+
+ Using the .remap fields in vts_tab, map each old VtsID to a new
+ VtsID. For each old VtsID, dec its rc; and for each new one,
+ inc it. This sets up the new refcounts, and it also gives a
+ cheap sanity check of the old ones: all old refcounts should be
+ zero after this operation.
+ */
+
+ /* Do the mappings for (a) above: iterate over the Primary shadow
+ mem map (WordFM Addr SecMap*). */
+ UWord secmapW = 0;
+ VG_(initIterFM)( map_shmem );
+ while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
+ UWord j;
+ SecMap* sm = (SecMap*)secmapW;
+ tl_assert(sm->magic == SecMap_MAGIC);
+ /* Deal with the LineZs */
+ for (i = 0; i < N_SECMAP_ZLINES; i++) {
+ LineZ* lineZ = &sm->linesZ[i];
+ if (lineZ->dict[0] == SVal_INVALID)
+ continue; /* not in use -- data is in F rep instead */
+ for (j = 0; j < 4; j++)
+ remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
+ }
+ /* Deal with the LineFs */
+ for (i = 0; i < sm->linesF_size; i++) {
+ LineF* lineF = &sm->linesF[i];
+ if (!lineF->inUse)
+ continue;
+ for (j = 0; j < N_LINE_ARANGE; j++)
+ remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
+ }
+ }
+ VG_(doneIterFM)( map_shmem );
+
+ /* Do the mappings for (b) above: visit our collection of struct
+ _Thrs. */
+ Thread* hgthread = get_admin_threads();
+ tl_assert(hgthread);
+ while (hgthread) {
+ Thr* hbthr = hgthread->hbthr;
+ tl_assert(hbthr);
+ /* Threads that are listed in the prunable set have their viR
+ and viW set to VtsID_INVALID, so we can't mess with them. */
+ if (hbthr->llexit_done && hbthr->joinedwith_done) {
+ tl_assert(hbthr->viR == VtsID_INVALID);
+ tl_assert(hbthr->viW == VtsID_INVALID);
+ hgthread = hgthread->admin;
+ continue;
+ }
+ remap_VtsID( vts_tab, new_tab, &hbthr->viR );
+ remap_VtsID( vts_tab, new_tab, &hbthr->viW );
+ hgthread = hgthread->admin;
+ }
+
+ /* Do the mappings for (c) above: visit the struct _SOs. */
+ SO* so = admin_SO;
+ while (so) {
+ if (so->viR != VtsID_INVALID)
+ remap_VtsID( vts_tab, new_tab, &so->viR );
+ if (so->viW != VtsID_INVALID)
+ remap_VtsID( vts_tab, new_tab, &so->viW );
+ so = so->admin_next;
+ }
+
+ /* So, we're nearly done (with this incredibly complex operation).
+ Check the refcounts for the old VtsIDs all fell to zero, as
+ expected. Any failure is serious. */
+ for (i = 0; i < nTab; i++) {
+ VtsTE* te = VG_(indexXA)( vts_tab, i );
+ tl_assert(te->vts == NULL);
+ /* This is the assert proper. Note we're also asserting
+ zeroness for old entries which are unmapped (hence have
+ .remap == VtsID_INVALID). That's OK. */
+ tl_assert(te->rc == 0);
+ }
+
+ /* Install the new table and set. */
+ VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
+ vts_set = new_set;
+ VG_(deleteXA)( vts_tab );
+ vts_tab = new_tab;
+
+ /* The freelist of vts_tab entries is empty now, because we've
+ compacted all of the live entries at the low end of the
+ table. */
+ vts_tab_freelist = VtsID_INVALID;
+
+ /* Sanity check vts_set and vts_tab. */
+
+ /* Because all the live entries got slid down to the bottom of vts_tab: */
+ tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));
+
+ /* Assert that the vts_tab and vts_set entries point at each other
+ in the required way */
+ UWord wordK = 0, wordV = 0;
+ VG_(initIterFM)( vts_set );
+ while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
+ tl_assert(wordK != 0);
+ tl_assert(wordV == 0);
+ VTS* vts = (VTS*)wordK;
+ tl_assert(vts->id != VtsID_INVALID);
+ VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
+ tl_assert(te->vts == vts);
+ }
+ VG_(doneIterFM)( vts_set );
+
+ /* Also iterate over the table, and check each entry is
+ plausible. */
+ nTab = VG_(sizeXA)( vts_tab );
+ for (i = 0; i < nTab; i++) {
+ VtsTE* te = VG_(indexXA)( vts_tab, i );
+ tl_assert(te->vts);
+ tl_assert(te->vts->id == i);
+ tl_assert(te->rc > 0); /* 'cos we just GC'd */
+ tl_assert(te->freelink == VtsID_INVALID); /* in use */
+ tl_assert(te->remap == VtsID_INVALID); /* not relevant */
+ }
+
+ /* And we're done. Bwahahaha. Ha. Ha. Ha. */
+ if (VG_(clo_stats)) {
+ static UInt ctr = 1;
+ tl_assert(nTab > 0);
+ VG_(message)(
+ Vg_DebugMsg,
+ "libhb: VTS PR: #%u before %lu (avg sz %lu) "
+ "after %lu (avg sz %lu)\n",
+ ctr++,
+ nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
+ nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
+ );
+ }
+ if (0)
+ VG_(printf)("VTQ: before pruning %lu (avg sz %lu), "
+ "after pruning %lu (avg sz %lu)\n",
+ nBeforePruning, nSTSsBefore / nBeforePruning,
+ nAfterPruning, nSTSsAfter / nAfterPruning);
+ /* ---------- END VTS PRUNING ---------- */
}
@@ -2661,50 +3325,6 @@ static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
// //
/////////////////////////////////////////////////////////
-// baseline: 5, 9
-#define FI_LINE_SZB_LOG2 5
-#define FI_NUM_LINES_LOG2 10
-
-#define FI_LINE_SZB (1 << FI_LINE_SZB_LOG2)
-#define FI_NUM_LINES (1 << FI_NUM_LINES_LOG2)
-
-#define FI_TAG_MASK (~(Addr)(FI_LINE_SZB - 1))
-#define FI_GET_TAG(_a) ((_a) & FI_TAG_MASK)
-
-#define FI_GET_LINENO(_a) ( ((_a) >> FI_LINE_SZB_LOG2) \
- & (Addr)(FI_NUM_LINES-1) )
-
-
-/* In the lines, each 8 bytes are treated individually, and are mapped
- to a UShort. Regardless of endianness of the underlying machine,
- bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
- the highest address.
-
- Of each bit pair, the higher numbered bit is set if a R has been
- seen, so the actual layout is:
-
- 15 14 ... 01 00
-
- R W for addr+7 ... R W for addr+0
-
- So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
-*/
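[Editor's note: for clarity, a standalone sketch of the per-byte bit-pair scheme
described in the comment above (illustrative only; the function names are invented).
Byte k of an 8-byte group owns bits 2k+1 (R) and 2k (W), which is where the
0xAAAA / 0x5555 masks come from.]

    #include <stdbool.h>

    static bool seen_read ( unsigned short u16, int k )   /* k in 0..7 */
    {
       return ((u16 >> (2*k + 1)) & 1) != 0;  /* higher bit of the pair = R */
    }

    static bool seen_write ( unsigned short u16, int k )
    {
       return ((u16 >> (2*k)) & 1) != 0;      /* lower bit of the pair = W */
    }

    /* Whole-group views: (u16 & 0xAAAA) selects all the R bits,
       (u16 & 0x5555) selects all the W bits. */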
-
-/* tags are separated from lines. tags are Addrs and are
- the base address of the line. */
-typedef
- struct {
- UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
- }
- FiLine;
-
-typedef
- struct {
- Addr tags[FI_NUM_LINES];
- FiLine lines[FI_NUM_LINES];
- }
- Filter;
-
/* Forget everything we know -- clear the filter and let everything
through. This needs to be as fast as possible, since it is called
every time the running thread changes, and every time a thread's
@@ -3022,58 +3642,6 @@ static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
// //
/////////////////////////////////////////////////////////
-// QQQ move this somewhere else
-typedef struct { ULong ull; ExeContext* ec; } ULong_n_EC;
-
-/* How many of the above records to collect for each thread? Older
- ones are dumped when we run out of space. 62.5k requires 1MB per
- thread, since each ULong_n_EC record is 16 bytes long. When more
- than N_KWs_N_STACKs_PER_THREAD are present, the older half are
- deleted to make space. Hence in the worst case we will be able to
- produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
- Kw transitions (segments in this thread). For the current setting
- that gives a guaranteed stack for at least the last 31.25k
- segments. */
-#define N_KWs_N_STACKs_PER_THREAD 62500
-
-
-struct _Thr {
- /* Current VTSs for this thread. They change as we go along. viR
- is the VTS to be used for reads, viW for writes. Usually they
- are the same, but can differ when we deal with reader-writer
- locks. It is always the case that
- VtsID__cmpLEQ(viW,viR) == True
- that is, viW must be the same, or lagging behind, viR. */
- VtsID viR;
- VtsID viW;
-
- /* Is initially False, and is set to true after the thread really
- has done a low-level exit. */
- Bool still_alive;
-
- /* A small integer giving a unique identity to this Thr. See
- comments on the definition of ScalarTS for details. */
- ThrID thrid : SCALARTS_N_THRBITS;
-
- /* A filter that removes references for which we believe that
- msmcread/msmcwrite will not change the state, nor report a
- race. */
- Filter* filter;
-
- /* A pointer back to the top level Thread structure. There is a
- 1-1 mapping between Thread and Thr structures -- each Thr points
- at its corresponding Thread, and vice versa. Really, Thr and
- Thread should be merged into a single structure. */
- Thread* hgthread;
-
- /* The ULongs (scalar Kws) in this accumulate in strictly
- increasing order, without duplicates. This is important because
- we need to be able to find a given scalar Kw in this array
- later, by binary search. */
- XArray* /* ULong_n_EC */ local_Kws_n_stacks;
-};
-
-
/* Maps ThrID values to their Thr*s (which contain ThrID values that
should point back to the relevant slot in the array. Lowest
numbered slot (0) is for thrid = 1024, (1) is for 1025, etc. */
@@ -3097,7 +3665,8 @@ static Thr* Thr__new ( void )
Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
thr->viR = VtsID_INVALID;
thr->viW = VtsID_INVALID;
- thr->still_alive = True;
+ thr->llexit_done = False;
+ thr->joinedwith_done = False;
thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
/* We only really need this at history level 1, but unfortunately
this routine is called before the command line processing is
@@ -3607,13 +4176,19 @@ static RCEC* get_RCEC ( Thr* thr )
// (UInt) `echo "Old Reference Information" | md5sum`
#define OldRef_MAGIC 0x30b1f075UL
-/* Records an access: a thread and a context. The size
- (1,2,4,8) and read-or-writeness are also encoded as
- follows: bottom bit of .thr is 1 if write, 0 if read
- bottom 2 bits of .rcec are encode size:
- 00 = 1, 01 = 2, 10 = 4, 11 = 8
+/* Records an access: a thread (by ThrID), a context (RCEC), the
+ access size and writeness, and the set of locks held in W mode at
+ the time.  The size (1,2,4,8) is stored log2-encoded in .szLg2B:
+ 00 = 1, 01 = 2, 10 = 4, 11 = 8.
*/
-typedef struct { Thr* thr; RCEC* rcec; } Thr_n_RCEC;
+typedef
+ struct {
+ RCEC* rcec;
+ WordSetID locksHeldW;
+ UInt thrid : SCALARTS_N_THRBITS;
+ UInt szLg2B : 2;
+ UInt isW : 1;
+ }
+ Thr_n_RCEC;
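[Editor's note: the 2-bit size encoding mentioned above is simply log2 of the access
size; a minimal standalone encode/decode pair might look like this (illustrative
names, not libhb functions).]

    #include <assert.h>

    static unsigned size_to_szLg2B ( unsigned szB )
    {
       switch (szB) {
          case 1: return 0;   /* 00 */
          case 2: return 1;   /* 01 */
          case 4: return 2;   /* 10 */
          case 8: return 3;   /* 11 */
          default: assert(0); return 0;
       }
    }

    static unsigned szLg2B_to_size ( unsigned szLg2B )
    {
       assert(szLg2B <= 3);
       return 1u << szLg2B;   /* 1, 2, 4 or 8 */
    }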
#define N_OLDREF_ACCS 5
@@ -3622,7 +4197,7 @@ typedef
UWord magic; /* sanity check only */
UWord gen; /* when most recently accessed */
/* or free list when not in use */
- /* unused slots in this array have .thr == NULL */
+ /* unused slots in this array have .thrid == 0, which is invalid */
Thr_n_RCEC accs[N_OLDREF_ACCS];
}
OldRef;
@@ -3647,13 +4222,6 @@ static UWord oldrefGen = 0; /* current LRU generation # */
static UWord oldrefTreeN = 0; /* # elems in oldrefTree */
static UWord oldrefGenIncAt = 0; /* inc gen # when size hits this */
-inline static void* ptr_or_UWord ( void* p, UWord w ) {
- return (void*)( ((UWord)p) | ((UWord)w) );
-}
-inline static void* ptr_and_UWord ( void* p, UWord w ) {
- return (void*)( ((UWord)p) & ((UWord)w) );
-}
-
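[Editor's note: the two deleted helpers above supported the pointer-tagging trick
this patch removes: stashing a 2-bit code (the access size, or writeness) in the low
bits of an aligned pointer. A standalone illustration of that trick, with invented
names, for readers unfamiliar with it:]

    #include <assert.h>
    #include <stdint.h>

    static void* tag_ptr ( void* p, unsigned bits2 )
    {
       assert(((uintptr_t)p & 3) == 0);   /* requires >= 4-byte alignment */
       return (void*)((uintptr_t)p | (uintptr_t)(bits2 & 3));
    }

    static void* untag_ptr ( void* p )
    {
       return (void*)((uintptr_t)p & ~(uintptr_t)3);
    }

    static unsigned tag_of ( void* p )
    {
       return (unsigned)((uintptr_t)p & 3);
    }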
inline static UInt min_UInt ( UInt a, UInt b ) {
return a < b ? a : b;
}
@@ -3685,50 +4253,51 @@ static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
UWord keyW, valW;
Bool b;
+ tl_assert(thr);
+ ThrID thrid = thr->thrid;
+ tl_assert(thrid != 0); /* zero is used to denote an empty slot. */
+
+ WordSetID locksHeldW = thr->hgthread->locksetW;
+
rcec = get_RCEC( thr );
ctxt__rcinc(rcec);
- /* encode the size and writeness of the transaction in the bottom
- two bits of thr and rcec. */
- thr = ptr_or_UWord(thr, isW ? 1 : 0);
+ UInt szLg2B = 0;
switch (szB) {
/* This doesn't look particularly branch-predictor friendly. */
- case 1: rcec = ptr_or_UWord(rcec, 0); break;
- case 2: rcec = ptr_or_UWord(rcec, 1); break;
- case 4: rcec = ptr_or_UWord(rcec, 2); break;
- case 8: rcec = ptr_or_UWord(rcec, 3); break;
+ case 1: szLg2B = 0; break;
+ case 2: szLg2B = 1; break;
+ case 4: szLg2B = 2; break;
+ case 8: szLg2B = 3; break;
default: tl_assert(0);
}
- /* Look in the map to see if we already have this. */
+ /* Look in the map to see if we already have a record for this
+ address. */
b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, a );
if (b) {
/* We already have a record for this address. We now need to
- see if we have a stack trace pertaining to this (thread, R/W,
+ see if we have a stack trace pertaining to this (thrid, R/W,
size) triple. */
tl_assert(keyW == a);
ref = (OldRef*)valW;
tl_assert(ref->magic == OldRef_MAGIC);
- tl_assert(thr);
for (i = 0; i < N_OLDREF_ACCS; i++) {
- if (ref->accs[i].thr != thr)
+ if (ref->accs[i].thrid != thrid)
+ continue;
+ if (ref->accs[i].szLg2B != szLg2B)
continue;
- /* since .thr encodes both the accessing thread and the
- read/writeness, we know now that at least those features
- of the access match this entry. So we just need to check
- the size indication. Do this by inspecting the lowest 2 bits of
- .rcec, which contain the encoded size info. */
- if (ptr_and_UWord(ref->accs[i].rcec,3) != ptr_and_UWord(rcec,3))
+ if (ref->accs[i].isW != (UInt)(isW & 1))
continue;
/* else we have a match, so stop looking. */
break;
}
if (i < N_OLDREF_ACCS) {
- /* thread 'thr' has an entry at index 'i'. Update it. */
+ /* thread 'thr' has an entry at index 'i'. Update its RCEC. */
if (i > 0) {
Thr_n_RCEC tmp = ref->accs[i-1];
ref->accs[i-1] = ref->accs[i];
@@ -3737,31 +4306,36 @@ static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
}
if (rcec == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++;
stats__ctxt_rcdec1++;
- ctxt__rcdec( ptr_and_UWord(ref->accs[i].rcec, ~3) );
- ref->accs[i].rcec = rcec;
- tl_assert(ref->accs[i].thr == thr);
+ ctxt__rcdec( ref->accs[i].rcec );
+ tl_assert(ref->accs[i].thrid == thrid);
+ /* Update the RCEC and the W-held lockset. */
+ ref->accs[i].rcec = rcec;
+ ref->accs[i].locksHeldW = locksHeldW;
} else {
- /* No entry for this (thread, R/W, size) triple. Shuffle all
- of them down one slot, and put the new entry at the start
- of the array. */
- if (ref->accs[N_OLDREF_ACCS-1].thr) {
+ /* No entry for this (thread, R/W, size, nWHeld) quad.
+ Shuffle all of them down one slot, and put the new entry
+ at the start of the array. */
+ if (ref->accs[N_OLDREF_ACCS-1].thrid != 0) {
/* the last slot is in use. We must dec the rc on the
associated rcec. */
tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec);
stats__ctxt_rcdec2++;
if (0 && 0 == (stats__ctxt_rcdec2 & 0xFFF))
VG_(printf)("QQQQ %lu overflows\n",stats__ctxt_rcdec2);
- ctxt__rcdec( ptr_and_UWord(ref->accs[N_OLDREF_ACCS-1].rcec, ~3) );
+ ctxt__rcdec( ref->accs[N_OLDREF_ACCS-1].rcec );
} else {
tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec);
}
for (j = N_OLDREF_ACCS-1; j >= 1; j--)
ref->accs[j] = ref->accs[j-1];
- ref->accs[0].thr = thr;
- ref->accs[0].rcec = rcec;
- /* thr==NULL is used to signify an empty slot, so we can't
- add a NULL thr. */
- tl_assert(ptr_and_UWord(thr, ~3) != 0);
+ ref->accs[0].thrid = thrid;
+ ref->accs[0].szLg2B = szLg2B;
+ ref->accs[0].isW = (UInt)(isW & 1);
+ ref->accs[0].locksHeldW = locksHeldW;
+ ref->accs[0].rcec = rcec;
+ /* thrid==0 is used to signify an empty slot, so we can't
+ add zero thrid (such a ThrID is invalid anyway). */
+ /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
}
ref->gen = oldrefGen;
@@ -3778,15 +4352,24 @@ static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
ref = alloc_OldRef();
ref->magic = OldRef_MAGIC;
- ref->gen = oldrefGen;
- ref->accs[0].rcec = rcec;
- ref->accs[0].thr = thr;
- /* thr==NULL is used to signify an empty slot, so we can't add a
- NULL thr. */
- tl_assert(ptr_and_UWord(thr, ~3) != 0);
+ ref->gen = oldrefGen;
+ ref->accs[0].thrid = thrid;
+ ref->accs[0].szLg2B = szLg2B;
+ ref->accs[0].isW = (UInt)(isW & 1);
+ ref->accs[0].locksHeldW = locksHeldW;
+ ref->accs[0].rcec = rcec;
+
+ /* thrid==0 is used to signify an empty slot, so we can't
+ add zero thrid (such a ThrID is invalid anyway). */
+ /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
+
+ /* Clear out the rest of the entries */
for (j = 1; j < N_OLDREF_ACCS; j++) {
- ref->accs[j].thr = NULL;
- ref->accs[j].rcec = NULL;
+ ref->accs[j].rcec = NULL;
+ ref->accs[j].thrid = 0;
+ ref->accs[j].szLg2B = 0;
+ ref->accs[j].isW = 0;
+ ref->accs[j].locksHeldW = 0;
}
VG_(addToSWA)( oldrefTree, a, (UWord)ref );
oldrefTreeN++;
@@ -3795,10 +4378,12 @@ static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
}
+/* Extract info from the conflicting-access machinery. */
Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
- /*OUT*/Thr** resThr,
- /*OUT*/SizeT* resSzB,
- /*OUT*/Bool* resIsW,
+ /*OUT*/Thr** resThr,
+ /*OUT*/SizeT* resSzB,
+ /*OUT*/Bool* resIsW,
+ /*OUT*/WordSetID* locksHeldW,
Thr* thr, Addr a, SizeT szB, Bool isW )
{
Word i, j;
@@ -3806,11 +4391,12 @@ Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
UWord keyW, valW;
Bool b;
- Thr* cand_thr;
- RCEC* cand_rcec;
- Bool cand_isW;
- SizeT cand_szB;
- Addr cand_a;
+ ThrID cand_thrid;
+ RCEC* cand_rcec;
+ Bool cand_isW;
+ SizeT cand_szB;
+ WordSetID cand_locksHeldW;
+ Addr cand_a;
Addr toCheck[15];
Int nToCheck = 0;
@@ -3818,6 +4404,8 @@ Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
tl_assert(thr);
tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
+ ThrID thrid = thr->thrid;
+
toCheck[nToCheck++] = a;
for (i = -7; i < (Word)szB; i++) {
if (i != 0)
@@ -3839,33 +4427,27 @@ Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
ref = (OldRef*)valW;
tl_assert(keyW == cand_a);
tl_assert(ref->magic == OldRef_MAGIC);
- tl_assert(ref->accs[0].thr); /* first slot must always be used */
+ tl_assert(ref->accs[0].thrid != 0); /* first slot must always be used */
- cand_thr = NULL;
- cand_rcec = NULL;
- cand_isW = False;
- cand_szB = 0;
+ cand_thrid = 0; /* invalid; see comments in event_map_bind */
+ cand_rcec = NULL;
+ cand_isW = False;
+ cand_szB = 0;
+ cand_locksHeldW = 0; /* always valid; see initialise_data_structures() */
for (i = 0; i < N_OLDREF_ACCS; i++) {
Thr_n_RCEC* cand = &ref->accs[i];
- cand_thr = ptr_and_UWord(cand->thr, ~3);
- cand_rcec = ptr_and_UWord(cand->rcec, ~3);
- /* Decode the writeness from the bottom bit of .thr. */
- cand_isW = 1 == (UWord)ptr_and_UWord(cand->thr, 1);
- /* Decode the size from the bottom two bits of .rcec. */
- switch ((UWord)ptr_and_UWord(cand->rcec, 3)) {
- case 0: cand_szB = 1; break;
- case 1: cand_szB = 2; break;
- case 2: cand_szB = 4; break;
- case 3: cand_szB = 8; break;
- default: tl_assert(0);
- }
+ cand_rcec = cand->rcec;
+ cand_thrid = cand->thrid;
+ cand_isW = (Bool)cand->isW;
+ cand_szB = 1 << cand->szLg2B;
+ cand_locksHeldW = cand->locksHeldW;
- if (cand_thr == NULL)
+ if (cand_thrid == 0)
/* This slot isn't in use. Ignore it. */
continue;
- if (cand_thr == thr)
+ if (cand_thrid == thrid)
/* This is an access by the same thread, but we're only
interested in accesses from other threads. Ignore. */
continue;
@@ -3888,7 +4470,7 @@ Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
if (i < N_OLDREF_ACCS) {
Int n, maxNFrames;
/* return with success */
- tl_assert(cand_thr);
+ tl_assert(cand_thrid);
tl_assert(cand_rcec);
tl_assert(cand_rcec->magic == RCEC_MAGIC);
tl_assert(cand_szB >= 1);
@@ -3897,10 +4479,12 @@ Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
for (n = 0; n < maxNFrames; n++) {
if (0 == cand_rcec->frames[n]) break;
}
- *resEC = VG_(make_ExeContext_from_StackTrace)(cand_rcec->frames, n);
- *resThr = cand_thr;
- *resSzB = cand_szB;
- *resIsW = cand_isW;
+ *resEC = VG_(make_ExeContext_from_StackTrace)
+ (cand_rcec->frames, n);
+ *resThr = Thr__from_ThrID(cand_thrid);
+ *resSzB = cand_szB;
+ *resIsW = cand_isW;
+ *locksHeldW = cand_locksHeldW;
return True;
}
@@ -3987,9 +4571,9 @@ static void event_map__check_reference_counts ( Bool before )
oldref = (OldRef*)valW;
tl_assert(oldref->magic == OldRef_MAGIC);
for (i = 0; i < N_OLDREF_ACCS; i++) {
- Thr* aThr = ptr_and_UWord(oldref->accs[i].thr, ~3);
- RCEC* aRef = ptr_and_UWord(oldref->accs[i].rcec, ~3);
- if (aThr) {
+ ThrID aThrID = oldref->accs[i].thrid;
+ RCEC* aRef = oldref->accs[i].rcec;
+ if (aThrID != 0) {
tl_assert(aRef);
tl_assert(aRef->magic == RCEC_MAGIC);
aRef->rcX++;
@@ -4230,14 +4814,14 @@ static void event_map_maybe_GC ( void )
tl_assert(keyW == ga2del);
oldref = (OldRef*)valW;
for (j = 0; j < N_OLDREF_ACCS; j++) {
- Thr* aThr = ptr_and_UWord(oldref->accs[j].thr, ~3);
- RCEC* aRef = ptr_and_UWord(oldref->accs[j].rcec, ~3);
+ ThrID aThrID = oldref->accs[j].thrid;
+ RCEC* aRef = oldref->accs[j].rcec;
if (aRef) {
- tl_assert(aThr);
+ tl_assert(aThrID != 0);
stats__ctxt_rcdec3++;
ctxt__rcdec( aRef );
} else {
- tl_assert(!aThr);
+ tl_assert(aThrID == 0);
}
}
@@ -4441,7 +5025,7 @@ static void record_race_info ( Thr* acc_thr,
we should search a bit further along the array than
lastIx+1 if hist1_seg_end is NULL. */
} else {
- if (confThr->still_alive)
+ if (!confThr->llexit_done)
hist1_seg_end = main_get_EC( confThr );
}
// seg_start could be NULL iff this is the first stack in the thread
@@ -5489,7 +6073,7 @@ void libhb_Thr_resumes ( Thr* thr )
{
if (0) VG_(printf)("resume %p\n", thr);
tl_assert(thr);
- tl_assert(thr->still_alive);
+ tl_assert(!thr->llexit_done);
Filter__clear(thr->filter, "libhb_Thr_resumes");
/* A kludge, but .. if this thread doesn't have any marker stacks
at all, get one right now. This is easier than figuring out
@@ -5509,23 +6093,31 @@ void libhb_Thr_resumes ( Thr* thr )
// //
/////////////////////////////////////////////////////////
-// (UInt) `echo "Synchronisation object" | md5sum`
-#define SO_MAGIC 0x56b3c5b0U
-
-struct _SO {
- VtsID viR; /* r-clock of sender */
- VtsID viW; /* w-clock of sender */
- UInt magic;
-};
+/* A double linked list of all the SO's. */
+SO* admin_SO = NULL;
-static SO* SO__Alloc ( void ) {
+static SO* SO__Alloc ( void )
+{
SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
so->viR = VtsID_INVALID;
so->viW = VtsID_INVALID;
so->magic = SO_MAGIC;
+ /* Add to double linked list */
+ if (admin_SO) {
+ tl_assert(admin_SO->admin_prev == NULL);
+ admin_SO->admin_prev = so;
+ so->admin_next = admin_SO;
+ } else {
+ so->admin_next = NULL;
+ }
+ so->admin_prev = NULL;
+ admin_SO = so;
+ /* */
return so;
}
-static void SO__Dealloc ( SO* so ) {
+
+static void SO__Dealloc ( SO* so )
+{
tl_assert(so);
tl_assert(so->magic == SO_MAGIC);
if (so->viR == VtsID_INVALID) {
@@ -5536,6 +6128,14 @@ static void SO__Dealloc ( SO* so ) {
VtsID__rcdec(so->viW);
}
so->magic = 0;
+ /* Del from double linked list */
+ if (so->admin_prev)
+ so->admin_prev->admin_next = so->admin_next;
+ if (so->admin_next)
+ so->admin_next->admin_prev = so->admin_prev;
+ if (so == admin_SO)
+ admin_SO = so->admin_next;
+ /* */
HG_(free)( so );
}
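[Editor's note: the admin_SO bookkeeping above is a plain doubly linked list with
push-front on allocation and unlink on deallocation. A minimal standalone model of
the same pattern (invented Node type, illustrative only):]

    #include <stddef.h>

    typedef struct Node { struct Node *prev, *next; } Node;
    static Node* list_head = NULL;

    static void push_front ( Node* n )     /* mirrors SO__Alloc */
    {
       if (list_head) list_head->prev = n;
       n->next   = list_head;
       n->prev   = NULL;
       list_head = n;
    }

    static void unlink_node ( Node* n )    /* mirrors SO__Dealloc */
    {
       if (n->prev) n->prev->next = n->next;
       if (n->next) n->next->prev = n->prev;
       if (n == list_head) list_head = n->next;
    }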
@@ -5574,8 +6174,26 @@ Thr* libhb_init (
// We will have to have to store a large number of these,
// so make sure they're the size we expect them to be.
tl_assert(sizeof(ScalarTS) == 8);
- tl_assert(SCALARTS_N_THRBITS >= 11); /* because first 1024 unusable */
- tl_assert(SCALARTS_N_THRBITS <= 32); /* so as to fit in a UInt */
+
+ /* because first 1024 unusable */
+ tl_assert(SCALARTS_N_THRBITS >= 11);
+ /* so as to fit in a UInt w/ 3 bits to spare (see defn of
+ Thr_n_RCEC). */
+ tl_assert(SCALARTS_N_THRBITS <= 29);
+
+ /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
+ (32-bit). It's not correctness-critical, but there are a lot of
+ them, so it's important from a space viewpoint. Unfortunately
+ we simply can't pack it into 2 words on a 32-bit target. */
+ if (sizeof(UWord) == 8) {
+ tl_assert(sizeof(Thr_n_RCEC) == 16);
+ } else {
+ tl_assert(sizeof(Thr_n_RCEC) == 12);
+ }
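+
+ /* [Editor's note: the sizes asserted above follow from the field
+    layout, assuming the compiler packs the three bitfields (at most
+    29 + 2 + 1 = 32 bits) into a single UInt: a pointer (8 or 4
+    bytes) plus a 4-byte WordSetID plus 4 bytes of bitfields gives
+    16 bytes on 64-bit targets and 12 on 32-bit ones.  A standalone
+    stand-in struct can demonstrate the same packing; illustrative
+    only, and the real thrid field is SCALARTS_N_THRBITS wide:

    #include <stdint.h>

    typedef struct {
       void*    rcec;          /* stands in for RCEC*     */
       uint32_t locksHeldW;    /* stands in for WordSetID */
       uint32_t thrid  : 29;
       uint32_t szLg2B : 2;
       uint32_t isW    : 1;
    } ModelThrNRcec;

    static int model_size_as_expected ( void )
    {
       /* 16 bytes on LP64, 12 on ILP32, provided the bitfields pack. */
       return sizeof(void*) == 8 ? sizeof(ModelThrNRcec) == 16
                                 : sizeof(ModelThrNRcec) == 12;
    }
+ ]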
+
+ /* Word sets really are 32 bits. Even on a 64 bit target. */
+ tl_assert(sizeof(WordSetID) == 4);
+ tl_assert(sizeof(WordSet) == sizeof(WordSetID));
tl_assert(get_stacktrace);
tl_assert(get_EC);
@@ -5589,6 +6207,7 @@ Thr* libhb_init (
VTS singleton, tick and join operations. */
temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
temp_max_sized_VTS->id = VtsID_INVALID;
+ verydead_thread_table_init();
vts_set_init();
vts_tab_init();
event_map_init();
@@ -5780,11 +6399,14 @@ void libhb_shutdown ( Bool show_stats )
}
}
+/* Receive notification that a thread has low level exited. The
+ significance here is that we do not expect to see any more memory
+ references from it. */
void libhb_async_exit ( Thr* thr )
{
tl_assert(thr);
- tl_assert(thr->still_alive);
- thr->still_alive = False;
+ tl_assert(!thr->llexit_done);
+ thr->llexit_done = True;
/* free up Filter and local_Kws_n_stacks (well, actually not the
latter ..) */
@@ -5792,6 +6414,12 @@ void libhb_async_exit ( Thr* thr )
HG_(free)(thr->filter);
thr->filter = NULL;
+ /* Tell the VTS mechanism this thread has exited, so it can
+ participate in VTS pruning. Note this can only happen once the
+ thread has both ll_exited and been joined with. */
+ if (thr->joinedwith_done)
+ VTS__declare_thread_very_dead(thr);
+
/* Another space-accuracy tradeoff. Do we want to be able to show
H1 history for conflicts in threads which have since exited? If
yes, then we better not free up thr->local_Kws_n_stacks. The
@@ -5803,6 +6431,20 @@ void libhb_async_exit ( Thr* thr )
// thr->local_Kws_n_stacks = NULL;
}
+/* Receive notification that a thread has been joined with. The
+ significance here is that we do not expect to see any further
+ references to its vector clocks (Thr::viR and Thr::viW). */
+void libhb_joinedwith_done ( Thr* thr )
+{
+ tl_assert(thr);
+ /* Caller must ensure that this is only ever called once per Thr. */
+ tl_assert(!thr->joinedwith_done);
+ thr->joinedwith_done = True;
+ if (thr->llexit_done)
+ VTS__declare_thread_very_dead(thr);
+}
+
+
/* Both Segs and SOs point to VTSs. However, there is no sharing, so
a Seg that points at a VTS is its one-and-only owner, and ditto for
a SO that points at a VTS. */
@@ -5861,7 +6503,7 @@ void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
VtsID__rcdec(thr->viW);
thr->viR = VtsID__tick( thr->viR, thr );
thr->viW = VtsID__tick( thr->viW, thr );
- if (thr->still_alive) {
+ if (!thr->llexit_done) {
Filter__clear(thr->filter, "libhb_so_send");
note_local_Kw_n_stack_for(thr);
}