/*--------------------------------------------------------------------*/
/*--- Cache simulation                                    cg_sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Cachegrind, a Valgrind tool for cache
   profiling programs.

   Copyright (C) 2002-2010 Nicholas Nethercote
      njn@valgrind.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/

typedef struct {
   Int          size;                   /* bytes */
   Int          assoc;
   Int          line_size;              /* bytes */
   Int          sets;
   Int          sets_min_1;
   Int          line_size_bits;
   Int          tag_shift;
   Char         desc_line[128];
   UWord*       tags;
} cache_t2;

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   Int i;

   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped", 
                                 c->size, c->line_size);
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative",
                                 c->size, c->line_size, c->assoc);
   }

   c->tags = VG_(malloc)("cg.sim.ci.1",
                         sizeof(UWord) * c->sets * c->assoc);

   for (i = 0; i < c->sets * c->assoc; i++)
      c->tags[i] = 0;
}

/* This is done as a macro rather than by passing in the cache_t2 as an 
 * arg because it slows things down by a small amount (3-5%) due to all 
 * that extra indirection. */

#define CACHESIM(L, MISS_TREATMENT)                                         \
/* The cache and associated bits and pieces. */                             \
static cache_t2 L;                                                          \
                                                                            \
static void cachesim_##L##_initcache(cache_t config)                        \
{                                                                           \
    cachesim_initcache(config, &L);                                         \
}                                                                           \
                                                                            \
/* This attribute forces GCC to inline this function, even though it's */   \
/* bigger than its usual limit.  Inlining gains around 5--10% speedup. */   \
__attribute__((always_inline))                                              \
static __inline__                                                           \
void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *mL)         \
{                                                                           \
   UInt  set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);          \
   UInt  set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);          \
   UWord tag  = a >> L.tag_shift;                                           \
   UWord tag2;                                                              \
   Int i, j;                                                                \
   Bool is_miss = False;                                                    \
   UWord* set;                                                              \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      set = &(L.tags[set1 * L.assoc]);                                      \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == set[0]) {                                                  \
         return;                                                            \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot  */\
      /* and shuffle the rest down.                                       */\
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == set[i]) {                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tag;                                                   \
            return;                                                         \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss;  install this tag as MRU, shuffle rest down. */            \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag;                                                         \
      MISS_TREATMENT;                                                       \
      return;                                                               \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets_min_1)) == set2) {                      \
      set = &(L.tags[set1 * L.assoc]);                                      \
      if (tag == set[0]) {                                                  \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == set[i]) {                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tag;                                                   \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag;                                                         \
      is_miss = True;                                                       \
block2:                                                                     \
      set = &(L.tags[set2 * L.assoc]);                                      \
      tag2 = (a+size-1) >> L.tag_shift;                                     \
      if (tag2 == set[0]) {                                                 \
         goto miss_treatment;                                               \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag2 == set[i]) {                                              \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tag2;                                                  \
            goto miss_treatment;                                            \
         }                                                                  \
      }                                                                     \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2;                                                        \
      is_miss = True;                                                       \
miss_treatment:                                                             \
      if (is_miss) { MISS_TREATMENT; }                                      \
                                                                            \
   } else {                                                                 \
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);\
       VG_(tool_panic)("item straddles more than two cache sets");          \
   }                                                                        \
   return;                                                                  \
}

CACHESIM(LL, (*mL)++ );
CACHESIM(I1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
CACHESIM(D1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );

/*--------------------------------------------------------------------*/
/*--- end                                                 cg_sim.c ---*/
/*--------------------------------------------------------------------*/