diff --git a/centvrion/compiler/runtime/cent_runtime.c b/centvrion/compiler/runtime/cent_runtime.c index 53e0c10..f5089dd 100644 --- a/centvrion/compiler/runtime/cent_runtime.c +++ b/centvrion/compiler/runtime/cent_runtime.c @@ -1172,44 +1172,123 @@ static int _cent_key_eq(CentValue a, CentValue b) { return 0; } +/* Hash a long with the splitmix64 finalizer (xor-shift/multiply rounds), truncated to 32 bits — good distribution for sequential ints */ +static uint32_t _cent_hash_int(long v) { + uint64_t x = (uint64_t)v; + x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; + x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; + x = x ^ (x >> 31); + return (uint32_t)x; +} + +/* 32-bit FNV-1a over the bytes of a NUL-terminated string: xor in each byte, then multiply by the FNV prime */ +static uint32_t _cent_hash_str(const char *s) { + uint32_t h = 2166136261u; + for (; *s; s++) { + h ^= (uint8_t)*s; + h *= 16777619u; + } + return h; +} + +static uint32_t _cent_hash_key(CentValue k) { /* hash a dict key; only CENT_INT and CENT_STR are accepted, anything else is reported via cent_type_error */ + if (k.type == CENT_INT) return _cent_hash_int(k.ival); + if (k.type == CENT_STR) return _cent_hash_str(k.sval); + cent_type_error("dict key must be a numeral or string"); + return 0; /* NOTE(review): presumably unreachable, assuming cent_type_error does not return; confirm */ +} + +static int _next_pow2(int n) { /* smallest power of 2 that is >= n (1 when n <= 1) */ + int p = 1; + while (p < n) p <<= 1; /* NOTE(review): p overflows int once n exceeds 2^30; confirm callers bound cap */ + return p; +} + +/* Probe for `key` in the bucket array. Returns the bucket slot — either + one whose stored index points to a matching key (hit), or an empty + slot (-1) where the key would be inserted. nbuckets is a power of 2 (the mask relies on it). Uses linear probing and only returns on an empty bucket or a key match, so the table must never be completely full. 
*/ +static int _cent_dict_probe(const CentDict *d, CentValue key, uint32_t h) { + uint32_t mask = (uint32_t)d->nbuckets - 1; + uint32_t i = h & mask; + while (1) { + int idx = d->buckets[i]; + if (idx < 0) return (int)i; + if (_cent_key_eq(d->keys[idx], key)) return (int)i; + i = (i + 1) & mask; + } +} + +static void _cent_dict_rehash(CentDict *d, int new_nbuckets) { /* rebuild the bucket array with new_nbuckets slots (a power of 2, since it is used as a mask) from the first len keys; keys/vals are left untouched */ + int *new_buckets = cent_arena_alloc(cent_arena, new_nbuckets * sizeof(int)); + for (int i = 0; i < new_nbuckets; i++) new_buckets[i] = -1; + uint32_t mask = (uint32_t)new_nbuckets - 1; + for (int idx = 0; idx < d->len; idx++) { + uint32_t h = _cent_hash_key(d->keys[idx]); + uint32_t i = h & mask; + while (new_buckets[i] >= 0) i = (i + 1) & mask; + new_buckets[i] = idx; + } + d->buckets = new_buckets; + d->nbuckets = new_nbuckets; +} + CentValue cent_dict_new(int cap) { if (cap < 4) cap = 4; + int nbuckets = _next_pow2(cap * 2); /* at least twice cap, so a full keys array fills at most half the buckets */ CentValue *keys = cent_arena_alloc(cent_arena, cap * sizeof(CentValue)); CentValue *vals = cent_arena_alloc(cent_arena, cap * sizeof(CentValue)); - return cent_dict_val(keys, vals, 0, cap); + int *buckets = cent_arena_alloc(cent_arena, nbuckets * sizeof(int)); + for (int i = 0; i < nbuckets; i++) buckets[i] = -1; + return cent_dict_val(keys, vals, buckets, 0, cap, nbuckets); } void cent_dict_set(CentValue *dict, CentValue key, CentValue val) { if (dict->type != CENT_DICT) cent_type_error("dict-set requires a dict"); - for (int i = 0; i < dict->dval.len; i++) { - if (_cent_key_eq(dict->dval.keys[i], key)) { - dict->dval.vals[i] = val; - return; - } + CentDict *d = &dict->dval; + + uint32_t h = _cent_hash_key(key); + int slot = _cent_dict_probe(d, key, h); + int idx = d->buckets[slot]; + if (idx >= 0) { + d->vals[idx] = val; + return; } - if (dict->dval.len >= dict->dval.cap) { - int new_cap = dict->dval.cap * 2; + + /* Grow the keys/vals arrays first (doubling cap) so the new entry has a stable index. 
*/ + if (d->len >= d->cap) { + int new_cap = d->cap * 2; CentValue *new_keys = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue)); CentValue *new_vals = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue)); - memcpy(new_keys, dict->dval.keys, dict->dval.len * sizeof(CentValue)); - memcpy(new_vals, dict->dval.vals, dict->dval.len * sizeof(CentValue)); - dict->dval.keys = new_keys; - dict->dval.vals = new_vals; - dict->dval.cap = new_cap; + memcpy(new_keys, d->keys, d->len * sizeof(CentValue)); + memcpy(new_vals, d->vals, d->len * sizeof(CentValue)); + d->keys = new_keys; + d->vals = new_vals; + d->cap = new_cap; + } + + int new_idx = d->len; + d->keys[new_idx] = key; + d->vals[new_idx] = val; + d->len++; + + /* Once the load factor reaches 0.75 (len * 4 >= nbuckets * 3), double the bucket array and rehash — this re-inserts every + entry including the one we just appended, so we're done. Otherwise + the slot picked by the earlier probe is still valid. */ + if (d->len * 4 >= d->nbuckets * 3) { + _cent_dict_rehash(d, d->nbuckets * 2); + } else { + d->buckets[slot] = new_idx; } - dict->dval.keys[dict->dval.len] = key; - dict->dval.vals[dict->dval.len] = val; - dict->dval.len++; } CentValue cent_dict_get(CentValue dict, CentValue key) { if (dict.type != CENT_DICT) cent_type_error("dict-get requires a dict"); - for (int i = 0; i < dict.dval.len; i++) { - if (_cent_key_eq(dict.dval.keys[i], key)) - return dict.dval.vals[i]; - } + uint32_t h = _cent_hash_key(key); + int slot = _cent_dict_probe(&dict.dval, key, h); + int idx = dict.dval.buckets[slot]; /* -1 when the probe stopped on an empty bucket, i.e. the key is absent */ + if (idx >= 0) return dict.dval.vals[idx]; cent_runtime_error("Key not found in dict"); return cent_null(); } diff --git a/centvrion/compiler/runtime/cent_runtime.h b/centvrion/compiler/runtime/cent_runtime.h index ce0dc52..25ade07 100644 --- a/centvrion/compiler/runtime/cent_runtime.h +++ b/centvrion/compiler/runtime/cent_runtime.h @@ -47,10 +47,13 @@ struct CentList { }; struct CentDict { - CentValue *keys; - CentValue *vals; - int len; - int cap; + CentValue *keys; 
/* insertion-order array, len entries */ + CentValue *vals; /* parallel to keys */ + int *buckets; /* hash table; values are indices into */ + /* keys/vals, or -1 for empty */ + int len; /* number of entries */ + int cap; /* capacity of keys/vals */ + int nbuckets; /* number of slots in buckets; must be a power of 2 (probing masks with nbuckets-1) */ }; struct CentValue { @@ -135,13 +138,17 @@ static inline CentValue cent_func_val(CentFuncPtr fn, const char **param_names, r.fnval.param_count = param_count; return r; } -static inline CentValue cent_dict_val(CentValue *keys, CentValue *vals, int len, int cap) { +static inline CentValue cent_dict_val(CentValue *keys, CentValue *vals, + int *buckets, int len, int cap, + int nbuckets) { /* wraps the given arrays and sizes in a CENT_DICT value as-is; nothing is copied or validated here */ CentValue r; r.type = CENT_DICT; - r.dval.keys = keys; - r.dval.vals = vals; - r.dval.len = len; - r.dval.cap = cap; + r.dval.keys = keys; + r.dval.vals = vals; + r.dval.buckets = buckets; + r.dval.len = len; + r.dval.cap = cap; + r.dval.nbuckets = nbuckets; return r; } diff --git a/tests/08_test_tabulas_.py b/tests/08_test_tabulas_.py index 146ad49..d2815be 100644 --- a/tests/08_test_tabulas_.py +++ b/tests/08_test_tabulas_.py @@ -189,3 +189,34 @@ class TestDictDisplay(unittest.TestCase): @parameterized.expand(dict_display_tests) def test_dict_display(self, source, nodes, value, output): run_test(self, source, nodes, value, output) + + +class TestDictGrowth(unittest.TestCase): + def test_dict_growth_preserves_order_and_lookup(self): + # Inserts XX entries via PER; pushes the compiled dict through + # multiple rehashes (initial cap=4) and verifies that lookup, length, + # and insertion-order iteration all still hold afterwards. 
+ source = ( + "DESIGNA d VT TABVLA {}\n" + "PER i IN [I VSQVE XX] FAC {\n" + "DESIGNA d[i] VT i * II\n" + "}\n" + "DIC(d[X])\n" + "DIC(LONGITVDO(d))\n" + "DIC(CLAVES(d))" + ) + nodes = Program([], [ + Designa(ID("d"), DataDict([])), + PerStatement( + DataRangeArray(Numeral("I"), Numeral("XX")), + ID("i"), + [DesignaIndex(ID("d"), [ID("i")], + BinOp(ID("i"), Numeral("II"), "SYMBOL_TIMES"))], + ), + ExpressionStatement(BuiltIn("DIC", [ArrayIndex(ID("d"), Numeral("X"))])), + ExpressionStatement(BuiltIn("DIC", [BuiltIn("LONGITVDO", [ID("d")])])), + ExpressionStatement(BuiltIn("DIC", [BuiltIn("CLAVES", [ID("d")])])), + ]) + keys_str = "[" + " ".join(int_to_num(i, False) for i in range(1, 21)) + "]" + output = f"XX\nXX\n{keys_str}\n" + run_test(self, source, nodes, ValStr(keys_str), output)