patterncMinor
Hash table using open-addressing with linear probing
Viewed 0 times
probinglinearaddressingwithopenhashusingtable
Problem
I've written a hash-table implementation. This is the first time I've written such code. The hash-table uses open-addressing with linear probing. The hash function is still subject to change as I found out some properties of it that make it particulary bad for my application.
Could you please review the code and tell me if there are any problems? You can find the code at GitHub.
Code
The code consists of a
Makefile
ht.c
```
/ hash table /
#include
#include
#include
#include
#include "ht.h"
struct ht {
int bits;
uint32_t randoms[256];
ht_value_t table[];
};
/ state for xorshift rng /
static uint32_t xstate = 0;
#define BITMASK(bits) ((bits) == 32 ? 0xffffffff : (1> 17;
xstate ^= xstate randoms[ukey.bytes[i]];
return accum;
}
/ tweakable based on actual way to retrieve key /
static ht_key_t get_key(ht_value_t val) {
return *val;
}
ht_t *ht_new(int bits) {
ht_t *ht;
int i,bitmask;
if (bits = 32) return NULL;
ht = malloc(sizeof*ht + (1table);
ht->bits = bits;
init_xorshift();
/ initiate hash function /
bitmask = BITMASK(bits);
for (i=255;i>=0;i--) ht->randoms[i] = xorshift()&bitmask;
memset(ht->table,0,1table[index] != NULL && get_key(ht->table[index]) != key) {
tmp = ht->table[index];
ht->table[index] = value;
index = MODINC(ht->bits,index);
value = tmp;
iters++;
}
ht->table[index] = value;
return iters;
}
int ht_del(ht_t *ht,ht_key_t key) {
uint32_t hkey = hash(ht,key), index, tindex;
ht_key_t ckey;
int iters = 0;
for (index = hkey;;index = MODINC(ht->bits,index)) {
if (ht-
Could you please review the code and tell me if there are any problems? You can find the code at GitHub.
Code
The code consists of a
Makefile, a source file file ht.c and a header file ht.h. The file main.c is a simple test program that reports the amount of entries moved on average to fill in entries into a hashtable at least twice the size of the entries.Makefile
OUT=main
OBJ=ht.o main.o
$(OUT): $(OBJ)
.PHONY: clean
clean:
$(RM) $(OBJ) $(OUT)ht.c
```
/ hash table /
#include
#include
#include
#include
#include "ht.h"
struct ht {
int bits;
uint32_t randoms[256];
ht_value_t table[];
};
/ state for xorshift rng /
static uint32_t xstate = 0;
#define BITMASK(bits) ((bits) == 32 ? 0xffffffff : (1> 17;
xstate ^= xstate randoms[ukey.bytes[i]];
return accum;
}
/ tweakable based on actual way to retrieve key /
static ht_key_t get_key(ht_value_t val) {
return *val;
}
ht_t *ht_new(int bits) {
ht_t *ht;
int i,bitmask;
if (bits = 32) return NULL;
ht = malloc(sizeof*ht + (1table);
ht->bits = bits;
init_xorshift();
/ initiate hash function /
bitmask = BITMASK(bits);
for (i=255;i>=0;i--) ht->randoms[i] = xorshift()&bitmask;
memset(ht->table,0,1table[index] != NULL && get_key(ht->table[index]) != key) {
tmp = ht->table[index];
ht->table[index] = value;
index = MODINC(ht->bits,index);
value = tmp;
iters++;
}
ht->table[index] = value;
return iters;
}
int ht_del(ht_t *ht,ht_key_t key) {
uint32_t hkey = hash(ht,key), index, tindex;
ht_key_t ckey;
int iters = 0;
for (index = hkey;;index = MODINC(ht->bits,index)) {
if (ht-
Solution
- Bugs
-
The algorithm in
ht_get is incorrect:while (ht->table[index] != NULL
&& hash(ht,ckey = get_key(ht->table[index])) == hkey) {
if (ckey == key) return ht->table[index];
index = MODINC(ht->bits,index);
}You are assuming here that if you find a key with a different hash in your probe sequence, then this means that the key you are looking for is not there. But that's not right. Try the following test program:
#include
#include
#include "ht.h"
#define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0]))
int main(int argc, char *argv[]) {
uint64_t k[] = {0x12345671, 0x11223344, 0x22334411, 0x33441122};
ht_t *ht = ht_new(2);
for (size_t i = 0; i < ARRAY_LENGTH(k); ++i) {
ht_put(ht, k[i], &k[i]);
}
printf("%p\n", ht_get(ht, k[0]));
return 0;
}You'll need to run it a few times (because of the randomization in your hash function), but I find that about half the time this prints
0x0 because k[0] failed to be found in the hash table (even though it must be there since it was the first key that was added).-
The algorithm in
ht_del is also incorrect. As Knuth writes, "Many computer programmers have great faith in algorithms, and they are surprised to find that the obvious way to delete records from a hash table doesn't work." (The Art of Computer Programming Vol. III, p. 533.)First, here:
for (index = hkey;;index = MODINC(ht->bits,index)) {
if (ht->table[index] == NULL) return -1;
ckey = get_key(ht->table[index]);
if (hash(ht,ckey) != hkey) return -1; /* <-- UH-OH */
if (ckey == key) break;
}On the indicated line you conclude that
key can't be in the table because you've found a key with a different hash in your probe sequence. But this isn't right: there might very well be keys with different hashes interleaved in the same probe sequence (this is particularly likely in your case because you use the same probe sequence for every key).Second, after deleting the key you continue with the probe sequence, shifting the keys along so that there are no gaps. But this is wrong: if any of those keys have different hashes to
key, then moving them can cause them not to be findable any more.If you want to get this right, Knuth gives an algorithm (pp. 533–4) for deletion in a open hash table with linear probing, but since linear probing is itself not a particularly good idea (see 2.3 below), it's better to do what most open hash table implementations do, which is to mark a key as deleted by putting a placeholder (some constant
KEY_DELETED) in its place. (Combined with automatic growing and rehashing; see 2.4 below.)-
If the table gets full, then
ht_put will go into an infinite loop.Also, if the table is full, and if all the keys have the same hash, then
ht_get and ht_del will go into infinite loops.It's best to avoid all these problems by automatically growing and rehashing before the table gets full. See 2.4 below.
- Other important issues
-
It's not easy to choose a good general-purpose hash function, so it's a bad sign that you are trying to invent your own. It would be much better to choose a well-known and well-tested function, for example from Wikipedia's list of hash functions. For general data, Bernstein's hash is simple and fast; and for integers there's Knuth's multiplicative hash.
In particular, the hash function you have chosen has a couple of undesirable properties:
-
A byte makes the same contribution to the hash wherever it appears in the key, so that all permutations of a sequence of bytes have the same hash. For example,
0x11223344 has the same hash as 0x44332211 and 0x22441133.-
Pairs of bytes in the key with the same value do not contribute to the hash (because their hashes cancel). So
0x1212 has the same hash as 0x3434 and 0x5656, not to mention 0x0000 and many other keys.-
You base the hash function on a pseudo-random sequence seeded by the current time. Varying the hash function like this has an important downside: it makes the code harder to test because the behaviour changes from run to run. So why are you doing this?
I can only guess that you are doing this because you want your hash table to be robust against the collision attack. But if so, your remedy won't be effective, because:
-
Your choice of hash function makes it trivial to construct large numbers of colliding keys (see above) regardless of the seed;
-
time only has resolution in seconds, so it is likely that an attacker will be able to guess or reconstruct your seed value;-
32 bits of randomness are well within the scope of a brute force search in any case.
If you really need to be robust against the collision attack you need a robust hash function, a secure source of randomness, and substantially more than 32 bits of randomness. See section 5 of Crosby & Wallach.
-
Linear probing is well known to be bad. Knuth writes, "Experience with linear probing shows that the algo
Code Snippets
while (ht->table[index] != NULL
&& hash(ht,ckey = get_key(ht->table[index])) == hkey) {
if (ckey == key) return ht->table[index];
index = MODINC(ht->bits,index);
}#include <stdint.h>
#include <stdio.h>
#include "ht.h"
#define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0]))
int main(int argc, char *argv[]) {
uint64_t k[] = {0x12345671, 0x11223344, 0x22334411, 0x33441122};
ht_t *ht = ht_new(2);
for (size_t i = 0; i < ARRAY_LENGTH(k); ++i) {
ht_put(ht, k[i], &k[i]);
}
printf("%p\n", ht_get(ht, k[0]));
return 0;
}for (index = hkey;;index = MODINC(ht->bits,index)) {
if (ht->table[index] == NULL) return -1;
ckey = get_key(ht->table[index]);
if (hash(ht,ckey) != hkey) return -1; /* <-- UH-OH */
if (ckey == key) break;
}uint32_t randoms[256];union { ht_key_t key; uint8_t bytes[8]; } ukey;Context
StackExchange Code Review Q#18883, answer score: 5
Revisions (0)
No revisions yet.