summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrea Canciani <ranma42@gmail.com>2013-04-02 16:47:14 +0200
committerAndrea Canciani <ranma42@gmail.com>2013-04-03 11:38:33 +0200
commit4fa4e5e0bdf7d05945c6139b845e45fb4492a56e (patch)
treefed411cef6e538a5d08b5ffe3b6de5d06c46bc7e
parentd2f8ae6c25517eab769970669e0a220e2969d0cc (diff)
Add hashtable
Add implementation of open-addressing hashtable (collisions are resolved with linear probing by means of double hashing).
-rw-r--r--hashtable.h370
-rwxr-xr-xprimes.py32
2 files changed, 402 insertions, 0 deletions
diff --git a/hashtable.h b/hashtable.h
new file mode 100644
index 0000000..70bba2c
--- /dev/null
+++ b/hashtable.h
@@ -0,0 +1,370 @@
+/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
+/*
+ * Copyright 2013 Andrea Canciani
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Author(s): Andrea Canciani <ranma42@gmail.com>
+ */
+
+#ifndef SIMPLEDATA_HASHTABLE_H
+#define SIMPLEDATA_HASHTABLE_H
+
+#include <stdlib.h>
+#include "simpleops/compiler.h"
+
+static const size_t _simpledata_hashtable_sizes[] = {
+ 7,
+ 13,
+ 31,
+ 61,
+ 109,
+ 241,
+ 463,
+ 1021,
+ 2029,
+ 4093,
+ 8089,
+ 16363,
+ 32719,
+ 65521,
+ 131011,
+ 262111,
+ 524221,
+ 1048573,
+ 2097133,
+ 4193803,
+ 8388451,
+ 16777141,
+ 33554011,
+ 67108669,
+ 134217439,
+ 268435009,
+ 536870839,
+ 1073741719,
+ 2147482951,
+ 4294965841
+};
+
+/*
+ * Find the smallest size which would result in a load of at most
+ * 2^(-shift).
+ */
+static simpleops_inline size_t
+_simpledata_hashtable_size (size_t count, int shift)
+{
+ int i, n;
+
+ n = SIMPLEOPS_ARRAY_LENGTH (_simpledata_hashtable_sizes);
+ for (i = 0; i < n; i++)
+ if (count < _simpledata_hashtable_sizes[i] >> shift)
+ break;
+
+ return _simpledata_hashtable_sizes[i];
+}
+
+
+#define SIMPLEDATA_HASHTABLE_STRUCTS(prefix, value_t, key_t) \
+ typedef struct { \
+ size_t hash; \
+ value_t data; \
+ } prefix##_slot_t; \
+ \
+ typedef struct { \
+ prefix##_slot_t *slots; \
+ size_t num_slots; \
+ size_t free_slots; \
+ size_t count; \
+ } prefix##_t;
+
+
+#define SIMPLEDATA_HASHTABLE_DECLARATIONS(visibility, \
+ prefix, \
+ value_t, \
+ key_t) \
+ visibility int prefix##_init (prefix##_t *hashtable, \
+ size_t num_slots); \
+ visibility void prefix##_fini (prefix##_t *hashtable); \
+ visibility void prefix##_clear (prefix##_t *hashtable); \
+ visibility value_t prefix##_get (prefix##_t *hashtable, key_t key); \
+ visibility void \
+ prefix##_foreach (prefix##_t *hashtable, \
+ void (*f) (const prefix##_t *hashtable, \
+ value_t data, void *arg), \
+ void *arg); \
+ visibility int prefix##_add (prefix##_t *hashtable, value_t data); \
+ visibility int prefix##_remove (prefix##_t *hashtable, key_t key);
+
+
+#define SIMPLEDATA_HASHTABLE_DEFINITIONS(visibility, \
+ prefix, \
+ value_t, \
+ key_t, \
+ empty_data, \
+ removed_data, \
+ hash_fn, \
+ key_fn, \
+ equals_fn) \
+ \
+ static simpleops_inline int \
+ _##prefix##_data_is_valid (value_t data) \
+ { \
+ return data != empty_data && data != removed_data; \
+ } \
+ \
+ static simpleops_inline prefix##_slot_t * \
+ _##prefix##_lookup (prefix##_t *hashtable, key_t key) \
+ { \
+ prefix##_slot_t *slot; \
+ size_t h, i, j, first_i; \
+ \
+ h = hash_fn (hashtable, key); \
+ first_i = i = h % hashtable->num_slots; \
+ j = 1 + h % (hashtable->num_slots - 2); \
+ \
+ do { \
+ slot = &hashtable->slots[i]; \
+ \
+ if (slot->data == empty_data) \
+ return NULL; \
+ \
+ if (slot->hash == h && \
+ equals_fn (hashtable, key, \
+ key_fn (hashtable, slot->data))) \
+ { \
+ return slot; \
+ } \
+ \
+ i = (i + j) % hashtable->num_slots; \
+ } while (i != first_i); \
+ \
+ return NULL; \
+ } \
+ \
+ static simpleops_inline void \
+ _##prefix##_rehash (prefix##_t *hashtable) \
+ { \
+ prefix##_slot_t *slots, *src, *dest; \
+ size_t h, i, j, k, num_slots; \
+ \
+ /* Aim to a load of 25% or less */ \
+ num_slots = _simpledata_hashtable_size (hashtable->count, 2); \
+ \
+ slots = calloc (num_slots, sizeof (*slots)); \
+ if (SIMPLEOPS_UNLIKELY (slots == NULL)) \
+ return; \
+ \
+ for (k = 0; k < num_slots; k++) \
+ slots[k].data = empty_data; \
+ \
+ for (k = 0; k < hashtable->num_slots; k++) { \
+ src = &hashtable->slots[k]; \
+ if (_##prefix##_data_is_valid (src->data)) { \
+ h = src->hash; \
+ i = h % num_slots; \
+ j = 1 + h % (num_slots - 2); \
+ \
+ dest = &slots[i]; \
+ while (dest->data != empty_data) { \
+ i = (i + j) % num_slots; \
+ dest = &slots[i]; \
+ } \
+ \
+ *dest = *src; \
+ } \
+ } \
+ \
+ free (hashtable->slots); \
+ \
+ hashtable->slots = slots; \
+ hashtable->num_slots = num_slots; \
+ hashtable->free_slots = num_slots - hashtable->count; \
+ } \
+ \
+ static simpleops_inline int \
+ _##prefix##_add_noresize (prefix##_t *hashtable, value_t data) \
+ { \
+ prefix##_slot_t *slot, *dest; \
+ key_t key; \
+ size_t h, i, j, first_i; \
+ \
+ if (SIMPLEOPS_UNLIKELY (! _##prefix##_data_is_valid (data))) \
+ return -1; /* Illegal value */ \
+ \
+ dest = NULL; \
+ key = key_fn (hashtable, data); \
+ h = hash_fn (hashtable, key); \
+ first_i = i = h % hashtable->num_slots; \
+ j = 1 + h % (hashtable->num_slots - 2); \
+ \
+ do { \
+ slot = &hashtable->slots[i]; \
+ \
+ if (_##prefix##_data_is_valid (slot->data)) { \
+ if (slot->hash == h && \
+ equals_fn (hashtable, key, \
+ key_fn (hashtable, slot->data))) \
+ { \
+ return -1; /* Key already in the set */ \
+ } \
+ } else { \
+ if (dest == NULL) \
+ dest = slot; \
+ if (slot->data == empty_data) \
+ break; \
+ } \
+ \
+ i = (i + j) % hashtable->num_slots; \
+ } while (i != first_i); \
+ \
+ if (SIMPLEOPS_UNLIKELY (dest == NULL)) \
+ return -1; /* No free slot available */ \
+ \
+ hashtable->count++; \
+ if (slot->data == empty_data) \
+ hashtable->free_slots--; \
+ \
+ slot->hash = h; \
+ slot->data = data; \
+ \
+ return 0; \
+ } \
+ \
+ static simpleops_inline int \
+ _##prefix##_remove_noresize (prefix##_t *hashtable, key_t key) \
+ { \
+ prefix##_slot_t *slot; \
+ \
+ slot = _##prefix##_lookup (hashtable, key); \
+ \
+ if (slot == NULL) \
+ return -1; \
+ \
+ slot->hash = ~slot->hash; /* Try to avoid collisions */ \
+ slot->data = removed_data; \
+ \
+ hashtable->count--; \
+ \
+ return 0; \
+ } \
+ \
+ visibility void \
+ prefix##_clear (prefix##_t *hashtable) \
+ { \
+ size_t i; \
+ \
+ hashtable->free_slots = hashtable->num_slots; \
+ hashtable->count = 0; \
+ \
+ for (i = 0; i < hashtable->num_slots; i++) \
+ hashtable->slots[i].data = empty_data; \
+ } \
+ \
+ visibility int \
+ prefix##_init (prefix##_t *hashtable, size_t num_elements) \
+ { \
+ size_t i, num_slots; \
+ prefix##_slot_t *slots; \
+ \
+ /* Aim to a load of 50% or less */ \
+ num_slots = _simpledata_hashtable_size (num_elements, 1); \
+ \
+ slots = calloc (num_slots, sizeof (*slots)); \
+ \
+ hashtable->slots = slots; \
+ hashtable->num_slots = num_slots; \
+ hashtable->free_slots = num_slots; \
+ hashtable->count = 0; \
+ \
+ if (SIMPLEOPS_UNLIKELY (slots == NULL)) \
+ return -1; \
+ \
+ prefix##_clear (hashtable); \
+ \
+ return 0; \
+ } \
+ \
+ visibility void \
+ prefix##_fini (prefix##_t *hashtable) \
+ { \
+ free (hashtable->slots); \
+ hashtable->slots = NULL; \
+ } \
+ \
+ visibility value_t \
+ prefix##_get (prefix##_t *hashtable, key_t key) \
+ { \
+ prefix##_slot_t *slot; \
+ \
+ slot = _##prefix##_lookup (hashtable, key); \
+ if (slot == NULL) \
+ return NULL; \
+ \
+ return slot->data; \
+ } \
+ \
+ visibility void \
+ prefix##_foreach (prefix##_t *hashtable, \
+ void (*f) (const prefix##_t *hashtable, \
+ value_t data, void *arg), \
+ void *arg) \
+ { \
+ value_t data; \
+ size_t i; \
+ \
+ for (i = 0; i < hashtable->num_slots; i++) { \
+ data = hashtable->slots[i].data; \
+ \
+ if (_##prefix##_data_is_valid (data)) \
+ f (hashtable, data, arg); \
+ } \
+ } \
+ \
+ visibility int \
+ prefix##_add (prefix##_t *hashtable, value_t data) \
+ { \
+ size_t threshold; \
+ \
+ /* Guarantee at least 50% free slots */ \
+ threshold = hashtable->num_slots / 8; \
+ if (SIMPLEOPS_UNLIKELY (hashtable->free_slots < threshold)) \
+ _##prefix##_rehash (hashtable); \
+ \
+ return _##prefix##_add_noresize (hashtable, data); \
+ } \
+ \
+ visibility int \
+ prefix##_remove (prefix##_t *hashtable, key_t key) \
+ { \
+ size_t threshold; \
+ int ret; \
+ \
+ ret = _##prefix##_remove_noresize (hashtable, key); \
+ \
+ /* At least 1/8th of the elements should be valid */ \
+ threshold = hashtable->num_slots / 8; \
+ if (SIMPLEOPS_UNLIKELY (hashtable->count < threshold)) \
+ _##prefix##_rehash (hashtable); \
+ \
+ return ret; \
+ }
+
+#endif
diff --git a/primes.py b/primes.py
new file mode 100755
index 0000000..690168a
--- /dev/null
+++ b/primes.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+#
+# Simple program to compute the biggest twin prime pair less than each
+# power of 2.
+#
+# This is used to choose "good" sizes for the hashtable.
+#
+# Using a prime for "num_slot" is useful as that guarantees that the
+# orbit touches every slot (which is useful to have a good
+# implementation of open addressing, because this means that as long
+# as you have free slots, you can add elements, no matter what their
+# hash is).
+#
+# Using a prime for the linear offset is not actually needed, but
+# avoiding composite numbers is usually a good idea, because they tend
+# to show bad behavior depending on the patterns found in the input
+# data.
+#
+
+def isprime(x):
+ d = 2
+ while d * d <= x:
+ if x % d == 0:
+ return False
+ d = d + 1
+ return True
+
+for i in range(3, 33):
+ x = 2 ** i
+ while not (isprime(x) and isprime(x-2)):
+ x = x - 1
+ print " %d," % x