🐐 Hash tables

This commit is contained in:
2026-04-25 20:49:31 +02:00
parent c9fd245bb3
commit ff1c392dd6
3 changed files with 146 additions and 29 deletions

View File

@@ -1172,44 +1172,123 @@ static int _cent_key_eq(CentValue a, CentValue b) {
return 0; return 0;
} }
/* Hash an integer key with the splitmix64 finalizer, which spreads
   sequential ints well. The 64-bit mix is truncated to its low 32 bits. */
static uint32_t _cent_hash_int(long v) {
    uint64_t mixed = (uint64_t)v;

    mixed ^= mixed >> 30;
    mixed *= 0xbf58476d1ce4e5b9ULL;
    mixed ^= mixed >> 27;
    mixed *= 0x94d049bb133111ebULL;
    mixed ^= mixed >> 31;

    return (uint32_t)mixed;
}
/* 32-bit FNV-1a over the bytes of a NUL-terminated string: for each byte,
   XOR it into the hash, then multiply by the FNV prime. The terminator
   itself is not hashed. */
static uint32_t _cent_hash_str(const char *s) {
    const uint32_t fnv_offset_basis = 2166136261u;
    const uint32_t fnv_prime = 16777619u;

    uint32_t hash = fnv_offset_basis;
    const char *cursor = s;
    while (*cursor != '\0') {
        hash = (hash ^ (uint8_t)*cursor) * fnv_prime;
        cursor++;
    }
    return hash;
}
/* Hash a dict key. CENT_INT keys go through _cent_hash_int and CENT_STR
   keys through _cent_hash_str; any other type is reported through
   cent_type_error. If that call returns, the result is 0. */
static uint32_t _cent_hash_key(CentValue k) {
    uint32_t hash = 0;

    if (k.type == CENT_INT) {
        hash = _cent_hash_int(k.ival);
    } else if (k.type == CENT_STR) {
        hash = _cent_hash_str(k.sval);
    } else {
        cent_type_error("dict key must be a numeral or string");
    }
    return hash;
}
/* Return the smallest power of two that is >= n (1 for n <= 1).
 *
 * The result saturates at the largest power of two representable in int
 * (2^30 with 32-bit int): for larger n the previous `p <<= 1` on a signed
 * int overflowed, which is undefined behavior and in practice never
 * terminated. Callers asking for more than that get the saturated value.
 */
static int _next_pow2(int n) {
    /* Second-highest bit of unsigned == highest power of two that fits in int. */
    const unsigned max_pow2 = (~0u >> 2) + 1u;
    unsigned p = 1u;

    /* p never exceeds max_pow2, so the cast to int is always in range. */
    while (p < max_pow2 && (int)p < n) {
        p <<= 1;
    }
    return (int)p;
}
/* Linear-probe the bucket array for `key`, starting at h masked to the
   table size and wrapping around. Returns a bucket slot: either one whose
   stored index refers to an equal key (hit), or the first empty slot
   (stored index < 0) reached, which is where the key would be inserted.
   nbuckets must be a power of 2 so that `nbuckets - 1` works as a mask.
   The loop only ends on a hit or an empty slot, so the table must never
   be completely full. */
static int _cent_dict_probe(const CentDict *d, CentValue key, uint32_t h) {
    const uint32_t wrap = (uint32_t)d->nbuckets - 1;

    for (uint32_t pos = h & wrap;; pos = (pos + 1) & wrap) {
        const int entry = d->buckets[pos];
        if (entry < 0 || _cent_key_eq(d->keys[entry], key))
            return (int)pos;
    }
}
/* Replace d's bucket array with a fresh one of new_nbuckets slots and
   re-insert every entry index 0..len-1 by linear probing. The keys/vals
   arrays are not touched, so entry indices (and insertion order) stay the
   same. new_nbuckets is used as a bit mask, so it is expected to be a
   power of 2 larger than d->len. */
static void _cent_dict_rehash(CentDict *d, int new_nbuckets) {
    const uint32_t wrap = (uint32_t)new_nbuckets - 1;
    int *table = cent_arena_alloc(cent_arena, new_nbuckets * sizeof(int));

    /* Mark every slot empty before inserting. */
    int b = 0;
    while (b < new_nbuckets) {
        table[b++] = -1;
    }

    for (int entry = 0; entry < d->len; entry++) {
        uint32_t pos = _cent_hash_key(d->keys[entry]) & wrap;
        /* Keys already stored are distinct, so just find a free slot. */
        while (table[pos] >= 0) {
            pos = (pos + 1) & wrap;
        }
        table[pos] = entry;
    }

    d->nbuckets = new_nbuckets;
    d->buckets = table;
}
CentValue cent_dict_new(int cap) { CentValue cent_dict_new(int cap) {
if (cap < 4) cap = 4; if (cap < 4) cap = 4;
int nbuckets = _next_pow2(cap * 2);
CentValue *keys = cent_arena_alloc(cent_arena, cap * sizeof(CentValue)); CentValue *keys = cent_arena_alloc(cent_arena, cap * sizeof(CentValue));
CentValue *vals = cent_arena_alloc(cent_arena, cap * sizeof(CentValue)); CentValue *vals = cent_arena_alloc(cent_arena, cap * sizeof(CentValue));
return cent_dict_val(keys, vals, 0, cap); int *buckets = cent_arena_alloc(cent_arena, nbuckets * sizeof(int));
for (int i = 0; i < nbuckets; i++) buckets[i] = -1;
return cent_dict_val(keys, vals, buckets, 0, cap, nbuckets);
} }
/* Insert or overwrite `key` in `*dict`.
 *
 * If an equal key is already present, only its value is replaced, so its
 * position in insertion order is unchanged. Otherwise the pair is appended
 * to the keys/vals arrays (doubling their capacity when full) and its
 * index is recorded in the bucket table.
 *
 * A non-dict `*dict` is reported through cent_type_error; an unsupported
 * key type is reported by _cent_hash_key.
 */
void cent_dict_set(CentValue *dict, CentValue key, CentValue val) {
    if (dict->type != CENT_DICT)
        cent_type_error("dict-set requires a dict");
    CentDict *d = &dict->dval;

    uint32_t h = _cent_hash_key(key);
    int slot = _cent_dict_probe(d, key, h);
    int idx = d->buckets[slot];
    if (idx >= 0) {
        /* Existing key: overwrite in place. */
        d->vals[idx] = val;
        return;
    }

    /* Grow the keys/vals arrays first so the new entry has a stable index. */
    if (d->len >= d->cap) {
        int new_cap = d->cap * 2;
        CentValue *new_keys = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue));
        CentValue *new_vals = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue));
        memcpy(new_keys, d->keys, d->len * sizeof(CentValue));
        memcpy(new_vals, d->vals, d->len * sizeof(CentValue));
        d->keys = new_keys;
        d->vals = new_vals;
        d->cap = new_cap;
    }

    int new_idx = d->len;
    d->keys[new_idx] = key;
    d->vals[new_idx] = val;
    d->len++;

    /* If the load factor reaches 0.75, rehash — this re-inserts every
       entry including the one we just appended, so we're done. Otherwise
       the slot picked by the earlier probe is still valid, because growing
       keys/vals does not touch the bucket array. */
    if (d->len * 4 >= d->nbuckets * 3) {
        _cent_dict_rehash(d, d->nbuckets * 2);
    } else {
        d->buckets[slot] = new_idx;
    }
}
/* Look up `key` in `dict` and return the stored value.
 *
 * A non-dict argument is reported through cent_type_error. A missing key
 * is reported through cent_runtime_error; if that call returns, the
 * result is cent_null().
 */
CentValue cent_dict_get(CentValue dict, CentValue key) {
    if (dict.type != CENT_DICT)
        cent_type_error("dict-get requires a dict");

    uint32_t h = _cent_hash_key(key);
    int slot = _cent_dict_probe(&dict.dval, key, h);
    int idx = dict.dval.buckets[slot];
    if (idx >= 0) return dict.dval.vals[idx];

    cent_runtime_error("Key not found in dict");
    return cent_null();
}

View File

@@ -47,10 +47,13 @@ struct CentList {
}; };
/* Insertion-ordered dictionary: entries live in the parallel keys/vals
   arrays in the order they were added, and `buckets` is an open-addressed
   hash index over them. */
struct CentDict {
    CentValue *keys;    /* insertion-order array, len entries */
    CentValue *vals;    /* parallel to keys */
    int *buckets;       /* hash table; values are indices into */
                        /* keys/vals, or -1 for empty */
    int len;            /* number of entries */
    int cap;            /* capacity of keys/vals */
    int nbuckets;       /* size of buckets, power of 2 */
};
struct CentValue { struct CentValue {
@@ -135,13 +138,17 @@ static inline CentValue cent_func_val(CentFuncPtr fn, const char **param_names,
r.fnval.param_count = param_count; r.fnval.param_count = param_count;
return r; return r;
} }
/* Wrap already-allocated dict storage in a CENT_DICT value.
 *
 * The pointers and counts are stored as given; nothing is copied or
 * validated here. `buckets` must hold `nbuckets` ints, and the probing
 * code that reads it uses `nbuckets - 1` as a mask, so `nbuckets` is
 * expected to be a power of 2.
 */
static inline CentValue cent_dict_val(CentValue *keys, CentValue *vals,
                                      int *buckets, int len, int cap,
                                      int nbuckets) {
    CentValue r;
    r.type = CENT_DICT;
    r.dval.keys = keys;
    r.dval.vals = vals;
    r.dval.buckets = buckets;
    r.dval.len = len;
    r.dval.cap = cap;
    r.dval.nbuckets = nbuckets;
    return r;
}

View File

@@ -189,3 +189,34 @@ class TestDictDisplay(unittest.TestCase):
@parameterized.expand(dict_display_tests) @parameterized.expand(dict_display_tests)
def test_dict_display(self, source, nodes, value, output): def test_dict_display(self, source, nodes, value, output):
run_test(self, source, nodes, value, output) run_test(self, source, nodes, value, output)
class TestDictGrowth(unittest.TestCase):
    def test_dict_growth_preserves_order_and_lookup(self):
        # Fills a dict with XX (20) entries from a PER loop. Starting from
        # the initial cap=4, that pushes the compiled dict through several
        # rehashes; afterwards lookup, length and insertion-order iteration
        # must all still be correct.
        program_lines = [
            "DESIGNA d VT TABVLA {}",
            "PER i IN [I VSQVE XX] FAC {",
            "DESIGNA d[i] VT i * II",
            "}",
            "DIC(d[X])",
            "DIC(LONGITVDO(d))",
            "DIC(CLAVES(d))",
        ]
        source = "\n".join(program_lines)

        def dic(argument):
            # One `DIC(<argument>)` expression statement.
            return ExpressionStatement(BuiltIn("DIC", [argument]))

        doubled = BinOp(ID("i"), Numeral("II"), "SYMBOL_TIMES")
        fill_loop = PerStatement(
            DataRangeArray(Numeral("I"), Numeral("XX")),
            ID("i"),
            [DesignaIndex(ID("d"), [ID("i")], doubled)],
        )
        nodes = Program([], [
            Designa(ID("d"), DataDict([])),
            fill_loop,
            dic(ArrayIndex(ID("d"), Numeral("X"))),
            dic(BuiltIn("LONGITVDO", [ID("d")])),
            dic(BuiltIn("CLAVES", [ID("d")])),
        ])

        # Keys must be listed in insertion order: I through XX.
        numerals = [int_to_num(i, False) for i in range(1, 21)]
        keys_str = "[{}]".format(" ".join(numerals))
        output = "XX\nXX\n" + keys_str + "\n"
        run_test(self, source, nodes, ValStr(keys_str), output)