From 4fa4e5e0bdf7d05945c6139b845e45fb4492a56e Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Tue, 2 Apr 2013 16:47:14 +0200 Subject: Add hashtable Add implementation of open-addressing hashtable (collisions are resolved with linear probing by means of double hashing). --- hashtable.h | 370 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ primes.py | 32 ++++++ 2 files changed, 402 insertions(+) create mode 100644 hashtable.h create mode 100755 primes.py diff --git a/hashtable.h b/hashtable.h new file mode 100644 index 0000000..70bba2c --- /dev/null +++ b/hashtable.h @@ -0,0 +1,370 @@ +/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */ +/* + * Copyright 2013 Andrea Canciani + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Author(s): Andrea Canciani + */ + +#ifndef SIMPLEDATA_HASHTABLE_H +#define SIMPLEDATA_HASHTABLE_H + +#include +#include "simpleops/compiler.h" + +static const size_t _simpledata_hashtable_sizes[] = { + 7, + 13, + 31, + 61, + 109, + 241, + 463, + 1021, + 2029, + 4093, + 8089, + 16363, + 32719, + 65521, + 131011, + 262111, + 524221, + 1048573, + 2097133, + 4193803, + 8388451, + 16777141, + 33554011, + 67108669, + 134217439, + 268435009, + 536870839, + 1073741719, + 2147482951, + 4294965841 +}; + +/* + * Find the smallest size which would result in a load of at most + * 2^(-shift). + */ +static simpleops_inline size_t +_simpledata_hashtable_size (size_t count, int shift) +{ + int i, n; + + n = SIMPLEOPS_ARRAY_LENGTH (_simpledata_hashtable_sizes); + for (i = 0; i < n; i++) + if (count < _simpledata_hashtable_sizes[i] >> shift) + break; + + return _simpledata_hashtable_sizes[i]; +} + + +#define SIMPLEDATA_HASHTABLE_STRUCTS(prefix, value_t, key_t) \ + typedef struct { \ + size_t hash; \ + value_t data; \ + } prefix##_slot_t; \ + \ + typedef struct { \ + prefix##_slot_t *slots; \ + size_t num_slots; \ + size_t free_slots; \ + size_t count; \ + } prefix##_t; + + +#define SIMPLEDATA_HASHTABLE_DECLARATIONS(visibility, \ + prefix, \ + value_t, \ + key_t) \ + visibility int prefix##_init (prefix##_t *hashtable, \ + size_t num_slots); \ + visibility void prefix##_fini (prefix##_t *hashtable); \ + visibility void prefix##_clear (prefix##_t *hashtable); \ + visibility value_t prefix##_get (prefix##_t *hashtable, key_t key); \ + visibility void \ + prefix##_foreach (prefix##_t *hashtable, \ + void (*f) (const prefix##_t *hashtable, \ + value_t data, void *arg), \ + void *arg); \ + visibility int prefix##_add (prefix##_t *hashtable, value_t data); \ + visibility int prefix##_remove (prefix##_t *hashtable, key_t key); + + +#define SIMPLEDATA_HASHTABLE_DEFINITIONS(visibility, \ + prefix, \ + value_t, \ + key_t, \ + empty_data, \ + removed_data, \ + hash_fn, \ + key_fn, \ + equals_fn) \ + \ + static simpleops_inline int \ + _##prefix##_data_is_valid (value_t data) \ + { \ + return data != empty_data && data != removed_data; \ + } \ + \ + static simpleops_inline prefix##_slot_t * \ + _##prefix##_lookup (prefix##_t *hashtable, key_t key) \ + { \ + prefix##_slot_t *slot; \ + size_t h, i, j, first_i; \ + \ + h = hash_fn (hashtable, key); \ + first_i = i = h % hashtable->num_slots; \ + j = 1 + h % (hashtable->num_slots - 2); \ + \ + do { \ + slot = &hashtable->slots[i]; \ + \ + if (slot->data == empty_data) \ + return NULL; \ + \ + if (slot->hash == h && \ + equals_fn (hashtable, key, \ + key_fn (hashtable, slot->data))) \ + { \ + return slot; \ + } \ + \ + i = (i + j) % hashtable->num_slots; \ + } while (i != first_i); \ + \ + return NULL; \ + } \ + \ + static simpleops_inline void \ + _##prefix##_rehash (prefix##_t *hashtable) \ + { \ + prefix##_slot_t *slots, *src, *dest; \ + size_t h, i, j, k, num_slots; \ + \ + /* Aim to a load of 25% or less */ \ + num_slots = _simpledata_hashtable_size (hashtable->count, 2); \ + \ + slots = calloc (num_slots, sizeof (*slots)); \ + if (SIMPLEOPS_UNLIKELY (slots == NULL)) \ + return; \ + \ + for (k = 0; k < num_slots; k++) \ + slots[k].data = empty_data; \ + \ + for (k = 0; k < hashtable->num_slots; k++) { \ + src = &hashtable->slots[k]; \ + if (_##prefix##_data_is_valid (src->data)) { \ + h = src->hash; \ + i = h % num_slots; \ + j = 1 + h % (num_slots - 2); \ + \ + dest = &slots[i]; \ + while (dest->data != empty_data) { \ + i = (i + j) % num_slots; \ + dest = &slots[i]; \ + } \ + \ + *dest = *src; \ + } \ + } \ + \ + free (hashtable->slots); \ + \ + hashtable->slots = slots; \ + hashtable->num_slots = num_slots; \ + hashtable->free_slots = num_slots - hashtable->count; \ + } \ + \ + static simpleops_inline int \ + _##prefix##_add_noresize (prefix##_t *hashtable, value_t data) \ + { \ + prefix##_slot_t *slot, *dest; \ + key_t key; \ + size_t h, i, j, first_i; \ + \ + if (SIMPLEOPS_UNLIKELY (! _##prefix##_data_is_valid (data))) \ + return -1; /* Illegal value */ \ + \ + dest = NULL; \ + key = key_fn (hashtable, data); \ + h = hash_fn (hashtable, key); \ + first_i = i = h % hashtable->num_slots; \ + j = 1 + h % (hashtable->num_slots - 2); \ + \ + do { \ + slot = &hashtable->slots[i]; \ + \ + if (_##prefix##_data_is_valid (slot->data)) { \ + if (slot->hash == h && \ + equals_fn (hashtable, key, \ + key_fn (hashtable, slot->data))) \ + { \ + return -1; /* Key already in the set */ \ + } \ + } else { \ + if (dest == NULL) \ + dest = slot; \ + if (slot->data == empty_data) \ + break; \ + } \ + \ + i = (i + j) % hashtable->num_slots; \ + } while (i != first_i); \ + \ + if (SIMPLEOPS_UNLIKELY (dest == NULL)) \ + return -1; /* No free slot available */ \ + \ + hashtable->count++; \ + if (slot->data == empty_data) \ + hashtable->free_slots--; \ + \ + slot->hash = h; \ + slot->data = data; \ + \ + return 0; \ + } \ + \ + static simpleops_inline int \ + _##prefix##_remove_noresize (prefix##_t *hashtable, key_t key) \ + { \ + prefix##_slot_t *slot; \ + \ + slot = _##prefix##_lookup (hashtable, key); \ + \ + if (slot == NULL) \ + return -1; \ + \ + slot->hash = ~slot->hash; /* Try to avoid collisions */ \ + slot->data = removed_data; \ + \ + hashtable->count--; \ + \ + return 0; \ + } \ + \ + visibility void \ + prefix##_clear (prefix##_t *hashtable) \ + { \ + size_t i; \ + \ + hashtable->free_slots = hashtable->num_slots; \ + hashtable->count = 0; \ + \ + for (i = 0; i < hashtable->num_slots; i++) \ + hashtable->slots[i].data = empty_data; \ + } \ + \ + visibility int \ + prefix##_init (prefix##_t *hashtable, size_t num_elements) \ + { \ + size_t i, num_slots; \ + prefix##_slot_t *slots; \ + \ + /* Aim to a load of 50% or less */ \ + num_slots = _simpledata_hashtable_size (num_elements, 1); \ + \ + slots = calloc (num_slots, sizeof (*slots)); \ + \ + hashtable->slots = slots; \ + hashtable->num_slots = num_slots; \ + hashtable->free_slots = num_slots; \ + hashtable->count = 0; \ + \ + if (SIMPLEOPS_UNLIKELY (slots == NULL)) \ + return -1; \ + \ + prefix##_clear (hashtable); \ + \ + return 0; \ + } \ + \ + visibility void \ + prefix##_fini (prefix##_t *hashtable) \ + { \ + free (hashtable->slots); \ + hashtable->slots = NULL; \ + } \ + \ + visibility value_t \ + prefix##_get (prefix##_t *hashtable, key_t key) \ + { \ + prefix##_slot_t *slot; \ + \ + slot = _##prefix##_lookup (hashtable, key); \ + if (slot == NULL) \ + return NULL; \ + \ + return slot->data; \ + } \ + \ + visibility void \ + prefix##_foreach (prefix##_t *hashtable, \ + void (*f) (const prefix##_t *hashtable, \ + value_t data, void *arg), \ + void *arg) \ + { \ + value_t data; \ + size_t i; \ + \ + for (i = 0; i < hashtable->num_slots; i++) { \ + data = hashtable->slots[i].data; \ + \ + if (_##prefix##_data_is_valid (data)) \ + f (hashtable, data, arg); \ + } \ + } \ + \ + visibility int \ + prefix##_add (prefix##_t *hashtable, value_t data) \ + { \ + size_t threshold; \ + \ + /* Guarantee at least 50% free slots */ \ + threshold = hashtable->num_slots / 8; \ + if (SIMPLEOPS_UNLIKELY (hashtable->free_slots < threshold)) \ + _##prefix##_rehash (hashtable); \ + \ + return _##prefix##_add_noresize (hashtable, data); \ + } \ + \ + visibility int \ + prefix##_remove (prefix##_t *hashtable, key_t key) \ + { \ + size_t threshold; \ + int ret; \ + \ + ret = _##prefix##_remove_noresize (hashtable, key); \ + \ + /* At least 1/8th of the elements should be valid */ \ + threshold = hashtable->num_slots / 8; \ + if (SIMPLEOPS_UNLIKELY (hashtable->count < threshold)) \ + _##prefix##_rehash (hashtable); \ + \ + return ret; \ + } + +#endif diff --git a/primes.py b/primes.py new file mode 100755 index 0000000..690168a --- /dev/null +++ b/primes.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# +# Simple program to compute the biggest twin prime pair less than each +# power of 2. +# +# This is used to choose "good" sizes for the hashtable. +# +# Using a prime for "num_slot" is useful as that guarantees that the +# orbit touches every slot (which is useful to have a good +# implementation of open addressing, because this means that as long +# as you have free slots, you can add elements, no matter what their +# hash is). +# +# Using a prime for the linear offset is not actually needed, but +# avoiding composite numbers is usually a good idea, because they tend +# to show bad behavior depending on the patterns found in the input +# data. +# + +def isprime(x): + d = 2 + while d * d <= x: + if x % d == 0: + return False + d = d + 1 + return True + +for i in range(3, 33): + x = 2 ** i + while not (isprime(x) and isprime(x-2)): + x = x - 1 + print " %d," % x -- cgit v1.2.3