🐐 Hash tables
This commit is contained in:
@@ -1172,44 +1172,123 @@ static int _cent_key_eq(CentValue a, CentValue b) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Hash a numeral key with the splitmix64 finalizer, which gives a good
   distribution even for sequential ints. The value is widened to 64 bits,
   mixed with two xor-shift/multiply rounds and a final xor-shift, and the
   low 32 bits of the result are returned. */
static uint32_t _cent_hash_int(long v) {
    uint64_t x = (uint64_t)v;
    x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
    x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
    x = x ^ (x >> 31);
    return (uint32_t)x;
}
|
||||||
|
|
||||||
|
/* Hash a NUL-terminated string with 32-bit FNV-1a: start from the offset
   basis 2166136261, then for every byte XOR it into the hash and multiply
   by the FNV prime 16777619. The empty string hashes to the offset basis.
   `s` must not be NULL. */
static uint32_t _cent_hash_str(const char *s) {
    uint32_t h = 2166136261u;
    for (; *s; s++) {
        /* Go through uint8_t so bytes >= 0x80 are not sign-extended. */
        h ^= (uint8_t)*s;
        h *= 16777619u;
    }
    return h;
}
|
||||||
|
|
||||||
|
/* Hash a dict key by dispatching on its type tag: CENT_INT keys go through
   _cent_hash_int on ival, CENT_STR keys through _cent_hash_str on sval.
   Any other type is reported with cent_type_error; the trailing `return 0`
   only matters if that call returns. */
static uint32_t _cent_hash_key(CentValue k) {
    if (k.type == CENT_INT) return _cent_hash_int(k.ival);
    if (k.type == CENT_STR) return _cent_hash_str(k.sval);
    cent_type_error("dict key must be a numeral or string");
    return 0;
}
|
||||||
|
|
||||||
|
/* Return the smallest power of two that is >= n (1 for any n <= 1).
   The result is capped at the largest power of two representable in an
   int: for larger n the cap itself is returned (and is therefore < n)
   instead of left-shifting into signed overflow, which is undefined
   behaviour and in practice never terminates. */
static int _next_pow2(int n) {
    /* UINT_MAX / 4 + 1 is the top power of two that fits in int on
       platforms where INT_MAX == UINT_MAX / 2 (2^30 for 32-bit int). */
    const int max_pow2 = (int)(~0u / 4u + 1u);
    int p = 1;
    while (p < n && p < max_pow2) p <<= 1;
    return p;
}
|
||||||
|
|
||||||
|
/* Probe for `key` in the bucket array. Returns the bucket slot — either
|
||||||
|
one whose stored index points to a matching key (hit), or an empty
|
||||||
|
slot (-1) where the key would be inserted. nbuckets is a power of 2. */
|
||||||
|
static int _cent_dict_probe(const CentDict *d, CentValue key, uint32_t h) {
|
||||||
|
uint32_t mask = (uint32_t)d->nbuckets - 1;
|
||||||
|
uint32_t i = h & mask;
|
||||||
|
while (1) {
|
||||||
|
int idx = d->buckets[i];
|
||||||
|
if (idx < 0) return (int)i;
|
||||||
|
if (_cent_key_eq(d->keys[idx], key)) return (int)i;
|
||||||
|
i = (i + 1) & mask;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Rebuild d's bucket array with new_nbuckets slots (must be a power of 2).
   A fresh array is taken from cent_arena, filled with -1 (empty), and every
   entry index 0..len-1 is re-inserted by hashing its key and linear
   probing. keys/vals are not touched, so entry indices stay the same. The
   previous bucket array is not released here. */
static void _cent_dict_rehash(CentDict *d, int new_nbuckets) {
    int *new_buckets = cent_arena_alloc(cent_arena, new_nbuckets * sizeof *new_buckets);
    for (int i = 0; i < new_nbuckets; i++) new_buckets[i] = -1;

    uint32_t mask = (uint32_t)new_nbuckets - 1;
    for (int idx = 0; idx < d->len; idx++) {
        uint32_t h = _cent_hash_key(d->keys[idx]);
        uint32_t i = h & mask;
        /* No equality check: each stored entry gets its own slot, so only
           the next free one is needed. */
        while (new_buckets[i] >= 0) i = (i + 1) & mask;
        new_buckets[i] = idx;
    }

    d->buckets = new_buckets;
    d->nbuckets = new_nbuckets;
}
|
||||||
|
|
||||||
CentValue cent_dict_new(int cap) {
|
CentValue cent_dict_new(int cap) {
|
||||||
if (cap < 4) cap = 4;
|
if (cap < 4) cap = 4;
|
||||||
|
int nbuckets = _next_pow2(cap * 2);
|
||||||
CentValue *keys = cent_arena_alloc(cent_arena, cap * sizeof(CentValue));
|
CentValue *keys = cent_arena_alloc(cent_arena, cap * sizeof(CentValue));
|
||||||
CentValue *vals = cent_arena_alloc(cent_arena, cap * sizeof(CentValue));
|
CentValue *vals = cent_arena_alloc(cent_arena, cap * sizeof(CentValue));
|
||||||
return cent_dict_val(keys, vals, 0, cap);
|
int *buckets = cent_arena_alloc(cent_arena, nbuckets * sizeof(int));
|
||||||
|
for (int i = 0; i < nbuckets; i++) buckets[i] = -1;
|
||||||
|
return cent_dict_val(keys, vals, buckets, 0, cap, nbuckets);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cent_dict_set(CentValue *dict, CentValue key, CentValue val) {
|
void cent_dict_set(CentValue *dict, CentValue key, CentValue val) {
|
||||||
if (dict->type != CENT_DICT)
|
if (dict->type != CENT_DICT)
|
||||||
cent_type_error("dict-set requires a dict");
|
cent_type_error("dict-set requires a dict");
|
||||||
for (int i = 0; i < dict->dval.len; i++) {
|
CentDict *d = &dict->dval;
|
||||||
if (_cent_key_eq(dict->dval.keys[i], key)) {
|
|
||||||
dict->dval.vals[i] = val;
|
uint32_t h = _cent_hash_key(key);
|
||||||
return;
|
int slot = _cent_dict_probe(d, key, h);
|
||||||
}
|
int idx = d->buckets[slot];
|
||||||
|
if (idx >= 0) {
|
||||||
|
d->vals[idx] = val;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
if (dict->dval.len >= dict->dval.cap) {
|
|
||||||
int new_cap = dict->dval.cap * 2;
|
/* Grow the keys/vals arrays first so the new entry has a stable index. */
|
||||||
|
if (d->len >= d->cap) {
|
||||||
|
int new_cap = d->cap * 2;
|
||||||
CentValue *new_keys = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue));
|
CentValue *new_keys = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue));
|
||||||
CentValue *new_vals = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue));
|
CentValue *new_vals = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue));
|
||||||
memcpy(new_keys, dict->dval.keys, dict->dval.len * sizeof(CentValue));
|
memcpy(new_keys, d->keys, d->len * sizeof(CentValue));
|
||||||
memcpy(new_vals, dict->dval.vals, dict->dval.len * sizeof(CentValue));
|
memcpy(new_vals, d->vals, d->len * sizeof(CentValue));
|
||||||
dict->dval.keys = new_keys;
|
d->keys = new_keys;
|
||||||
dict->dval.vals = new_vals;
|
d->vals = new_vals;
|
||||||
dict->dval.cap = new_cap;
|
d->cap = new_cap;
|
||||||
|
}
|
||||||
|
|
||||||
|
int new_idx = d->len;
|
||||||
|
d->keys[new_idx] = key;
|
||||||
|
d->vals[new_idx] = val;
|
||||||
|
d->len++;
|
||||||
|
|
||||||
|
/* If load factor would exceed 0.75, rehash — this re-inserts every
|
||||||
|
entry including the one we just appended, so we're done. Otherwise
|
||||||
|
the slot picked by the earlier probe is still valid. */
|
||||||
|
if (d->len * 4 >= d->nbuckets * 3) {
|
||||||
|
_cent_dict_rehash(d, d->nbuckets * 2);
|
||||||
|
} else {
|
||||||
|
d->buckets[slot] = new_idx;
|
||||||
}
|
}
|
||||||
dict->dval.keys[dict->dval.len] = key;
|
|
||||||
dict->dval.vals[dict->dval.len] = val;
|
|
||||||
dict->dval.len++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
CentValue cent_dict_get(CentValue dict, CentValue key) {
|
CentValue cent_dict_get(CentValue dict, CentValue key) {
|
||||||
if (dict.type != CENT_DICT)
|
if (dict.type != CENT_DICT)
|
||||||
cent_type_error("dict-get requires a dict");
|
cent_type_error("dict-get requires a dict");
|
||||||
for (int i = 0; i < dict.dval.len; i++) {
|
uint32_t h = _cent_hash_key(key);
|
||||||
if (_cent_key_eq(dict.dval.keys[i], key))
|
int slot = _cent_dict_probe(&dict.dval, key, h);
|
||||||
return dict.dval.vals[i];
|
int idx = dict.dval.buckets[slot];
|
||||||
}
|
if (idx >= 0) return dict.dval.vals[idx];
|
||||||
cent_runtime_error("Key not found in dict");
|
cent_runtime_error("Key not found in dict");
|
||||||
return cent_null();
|
return cent_null();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -47,10 +47,13 @@ struct CentList {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct CentDict {
|
struct CentDict {
|
||||||
CentValue *keys;
|
CentValue *keys; /* insertion-order array, len entries */
|
||||||
CentValue *vals;
|
CentValue *vals; /* parallel to keys */
|
||||||
int len;
|
int *buckets; /* hash table; values are indices into */
|
||||||
int cap;
|
/* keys/vals, or -1 for empty */
|
||||||
|
int len; /* number of entries */
|
||||||
|
int cap; /* capacity of keys/vals */
|
||||||
|
int nbuckets; /* size of buckets, power of 2 */
|
||||||
};
|
};
|
||||||
|
|
||||||
struct CentValue {
|
struct CentValue {
|
||||||
@@ -135,13 +138,17 @@ static inline CentValue cent_func_val(CentFuncPtr fn, const char **param_names,
|
|||||||
r.fnval.param_count = param_count;
|
r.fnval.param_count = param_count;
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
static inline CentValue cent_dict_val(CentValue *keys, CentValue *vals, int len, int cap) {
|
static inline CentValue cent_dict_val(CentValue *keys, CentValue *vals,
|
||||||
|
int *buckets, int len, int cap,
|
||||||
|
int nbuckets) {
|
||||||
CentValue r;
|
CentValue r;
|
||||||
r.type = CENT_DICT;
|
r.type = CENT_DICT;
|
||||||
r.dval.keys = keys;
|
r.dval.keys = keys;
|
||||||
r.dval.vals = vals;
|
r.dval.vals = vals;
|
||||||
r.dval.len = len;
|
r.dval.buckets = buckets;
|
||||||
r.dval.cap = cap;
|
r.dval.len = len;
|
||||||
|
r.dval.cap = cap;
|
||||||
|
r.dval.nbuckets = nbuckets;
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -189,3 +189,34 @@ class TestDictDisplay(unittest.TestCase):
|
|||||||
@parameterized.expand(dict_display_tests)
|
@parameterized.expand(dict_display_tests)
|
||||||
def test_dict_display(self, source, nodes, value, output):
|
def test_dict_display(self, source, nodes, value, output):
|
||||||
run_test(self, source, nodes, value, output)
|
run_test(self, source, nodes, value, output)
|
||||||
|
|
||||||
|
|
||||||
|
class TestDictGrowth(unittest.TestCase):
    def test_dict_growth_preserves_order_and_lookup(self):
        # Inserts XX (20) entries via PER; pushes the compiled dict through
        # multiple rehashes (initial cap=4) and verifies that lookup, length,
        # and insertion-order iteration all still hold afterwards.
        source = (
            "DESIGNA d VT TABVLA {}\n"
            "PER i IN [I VSQVE XX] FAC {\n"
            "DESIGNA d[i] VT i * II\n"
            "}\n"
            "DIC(d[X])\n"
            "DIC(LONGITVDO(d))\n"
            "DIC(CLAVES(d))"
        )
        nodes = Program([], [
            Designa(ID("d"), DataDict([])),
            PerStatement(
                DataRangeArray(Numeral("I"), Numeral("XX")),
                ID("i"),
                [DesignaIndex(ID("d"), [ID("i")],
                              BinOp(ID("i"), Numeral("II"), "SYMBOL_TIMES"))],
            ),
            ExpressionStatement(BuiltIn("DIC", [ArrayIndex(ID("d"), Numeral("X"))])),
            ExpressionStatement(BuiltIn("DIC", [BuiltIn("LONGITVDO", [ID("d")])])),
            ExpressionStatement(BuiltIn("DIC", [BuiltIn("CLAVES", [ID("d")])])),
        ])
        # Keys are expected back in insertion order, I through XX.
        keys_str = "[" + " ".join(int_to_num(i, False) for i in range(1, 21)) + "]"
        # d[X] is X * II = XX, and the dict holds XX entries.
        output = f"XX\nXX\n{keys_str}\n"
        run_test(self, source, nodes, ValStr(keys_str), output)
|
||||||
|
|||||||
Reference in New Issue
Block a user