🐐 Hash tables

2026-04-25 20:49:31 +02:00
parent c9fd245bb3
commit ff1c392dd6
3 changed files with 146 additions and 29 deletions
--- a/centvrion/compiler/runtime/cent_runtime.c
+++ b/centvrion/compiler/runtime/cent_runtime.c
@@ -1172,44 +1172,123 @@ static int _cent_key_eq(CentValue a, CentValue b) {
  return 0;
 }
 /* Hashes an integer key with the splitmix64 finalizer: three xor-shift
    steps (30, 27, 31 bits) separated by two 64-bit multiplies. The low
    32 bits of the mixed value are returned. Picked because it spreads
    sequential integers well. */
 static uint32_t _cent_hash_int(long v) {
  const uint64_t mul_first  = 0xbf58476d1ce4e5b9ULL;
  const uint64_t mul_second = 0x94d049bb133111ebULL;
  uint64_t acc = (uint64_t)v;

  acc ^= acc >> 30;
  acc *= mul_first;
  acc ^= acc >> 27;
  acc *= mul_second;
  acc ^= acc >> 31;
  return (uint32_t)acc;
 }
 /* 32-bit FNV-1a over a NUL-terminated string: start from the offset
    basis, then for every byte xor it in and multiply by the FNV prime.
    The terminating NUL is not hashed. */
 static uint32_t _cent_hash_str(const char *s) {
  const uint32_t fnv_prime = 16777619u;
  const unsigned char *cursor = (const unsigned char *)s;
  uint32_t hash = 2166136261u;

  while (*cursor != 0) {
    hash = (hash ^ *cursor) * fnv_prime;
    cursor++;
  }
  return hash;
 }
 /* Hashes a dict key according to its type tag: CENT_INT keys go through
    _cent_hash_int, CENT_STR keys through _cent_hash_str. Any other tag is
    reported with cent_type_error; 0 is returned if that call comes back. */
 static uint32_t _cent_hash_key(CentValue k) {
  const int is_int = (k.type == CENT_INT);

  if (!is_int && k.type != CENT_STR) {
    cent_type_error("dict key must be a numeral or string");
    return 0;
  }
  if (is_int) {
    return _cent_hash_int(k.ival);
  }
  return _cent_hash_str(k.sval);
 }
 /* Returns the smallest power of two that is >= n. Any n <= 1 yields 1.
    The result saturates at 2^30, the largest power of two that fits in a
    32-bit int: without the clamp, n > 2^30 would shift `p` past INT_MAX,
    which is signed-overflow undefined behaviour and in practice a loop
    that never ends. The bucket masks elsewhere are uint32_t, so a 32-bit
    int is assumed here. */
 static int _next_pow2(int n) {
  const int max_pow2 = 1 << 30;
  int p = 1;

  if (n >= max_pow2) {
    return max_pow2;
  }
  /* n < 2^30 here, so p never needs to exceed 2^30 and cannot overflow. */
  while (p < n) {
    p <<= 1;
  }
  return p;
 }
 /* Linear-probe search for `key`, starting at bucket (h & mask) and
    stepping by one with wrap-around. Returns the first bucket that is
    either empty (stored index < 0) or whose stored index refers to a key
    equal to `key` per _cent_key_eq. The caller tells the two cases apart
    by reading d->buckets[result]. nbuckets must be a power of two so that
    (nbuckets - 1) works as a mask. The loop has no other exit, so it
    relies on the table containing an empty bucket whenever `key` is
    absent. */
 static int _cent_dict_probe(const CentDict *d, CentValue key, uint32_t h) {
  const uint32_t wrap = (uint32_t)d->nbuckets - 1;

  for (uint32_t slot = h & wrap;; slot = (slot + 1) & wrap) {
    const int entry = d->buckets[slot];
    if (entry < 0 || _cent_key_eq(d->keys[entry], key)) {
      return (int)slot;
    }
  }
 }
 /* Builds a fresh bucket table of `new_nbuckets` slots from the arena and
    re-inserts the indices 0..len-1 of the existing entries by linear
    probing. Slot positions are reduced with (new_nbuckets - 1) as a mask,
    so the size is expected to be a power of two. keys/vals are not
    touched, and no key comparison is done (entries are assumed to be
    distinct already). The previous bucket array is simply replaced. */
 static void _cent_dict_rehash(CentDict *d, int new_nbuckets) {
  int *table = cent_arena_alloc(cent_arena, new_nbuckets * sizeof(int));
  const uint32_t wrap = (uint32_t)new_nbuckets - 1;
  int b;
  int entry;

  /* -1 marks an empty bucket. */
  for (b = 0; b < new_nbuckets; b++) {
    table[b] = -1;
  }

  for (entry = 0; entry < d->len; entry++) {
    uint32_t slot = _cent_hash_key(d->keys[entry]) & wrap;
    while (table[slot] >= 0) {
      slot = (slot + 1) & wrap;
    }
    table[slot] = entry;
  }

  d->nbuckets = new_nbuckets;
  d->buckets  = table;
 }
 /* Creates an empty dict value. `cap` is raised to at least 4 and sizes
    the arena-allocated keys/vals arrays. The added lines also allocate a
    bucket table of _next_pow2(cap * 2) slots, fill it with -1 (empty) and
    pass it, together with nbuckets, to cent_dict_val. The `-` line is the
    previous return without a bucket table. */
 CentValue cent_dict_new(int cap) {
  if (cap < 4) cap = 4;
  int nbuckets = _next_pow2(cap * 2);
  CentValue *keys = cent_arena_alloc(cent_arena, cap * sizeof(CentValue));
  CentValue *vals = cent_arena_alloc(cent_arena, cap * sizeof(CentValue));
-  return cent_dict_val(keys, vals, 0, cap);
+  int       *buckets = cent_arena_alloc(cent_arena, nbuckets * sizeof(int));
  for (int i = 0; i < nbuckets; i++) buckets[i] = -1;
  return cent_dict_val(keys, vals, buckets, 0, cap, nbuckets);
 }
 /* Inserts or overwrites `key` in the dict held by `*dict`; a non-dict
    value is reported through cent_type_error. This span is a flattened
    diff: `-` lines are the old linear-scan code, `+` lines and the
    unmarked lines that follow them are the hashed replacement. The new
    code probes the bucket table, overwrites the value in place on a hit,
    and otherwise appends to keys/vals (doubling both arrays when full).
    The new index is then stored either in the probed slot or, once
    len * 4 >= nbuckets * 3, by rehashing into a table twice the size. */
 void cent_dict_set(CentValue *dict, CentValue key, CentValue val) {
  if (dict->type != CENT_DICT)
    cent_type_error("dict-set requires a dict");
-  for (int i = 0; i < dict->dval.len; i++) {
+  CentDict *d = &dict->dval;
-    if (_cent_key_eq(dict->dval.keys[i], key)) {
+
-      dict->dval.vals[i] = val;
+  uint32_t h = _cent_hash_key(key);
-      return;
+  int slot = _cent_dict_probe(d, key, h);
-    }
+  int idx  = d->buckets[slot];
  if (idx >= 0) {
    d->vals[idx] = val;
    return;
  }
-  if (dict->dval.len >= dict->dval.cap) {
+
-    int new_cap = dict->dval.cap * 2;
+  /* Grow the keys/vals arrays first so the new entry has a stable index. */
  if (d->len >= d->cap) {
    int new_cap = d->cap * 2;
    CentValue *new_keys = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue));
    CentValue *new_vals = cent_arena_alloc(cent_arena, new_cap * sizeof(CentValue));
-    memcpy(new_keys, dict->dval.keys, dict->dval.len * sizeof(CentValue));
+    memcpy(new_keys, d->keys, d->len * sizeof(CentValue));
-    memcpy(new_vals, dict->dval.vals, dict->dval.len * sizeof(CentValue));
+    memcpy(new_vals, d->vals, d->len * sizeof(CentValue));
-    dict->dval.keys = new_keys;
+    d->keys = new_keys;
-    dict->dval.vals = new_vals;
+    d->vals = new_vals;
-    dict->dval.cap  = new_cap;
+    d->cap  = new_cap;
  }
  int new_idx = d->len;
  d->keys[new_idx] = key;
  d->vals[new_idx] = val;
  d->len++;
  /* If load factor would exceed 0.75, rehash — this re-inserts every
     entry including the one we just appended, so we're done. Otherwise
     the slot picked by the earlier probe is still valid. */
  if (d->len * 4 >= d->nbuckets * 3) {
    _cent_dict_rehash(d, d->nbuckets * 2);
  } else {
    d->buckets[slot] = new_idx;
  }
  /* NOTE(review): the three unmarked statements below match the removed
     pre-change append. If they are still present once the patch is
     applied, the entry is stored and counted a second time (and the
     write can land past `cap`). Confirm they are deletions whose `-`
     markers were lost in this rendering. */
  dict->dval.keys[dict->dval.len] = key;
  dict->dval.vals[dict->dval.len] = val;
  dict->dval.len++;
 }
 /* Returns the value stored under `key` in `dict`; a non-dict argument is
    reported through cent_type_error. The `-` lines are the old linear
    scan over keys. The `+` lines hash the key, probe the bucket table and
    return vals[idx] when the probed bucket holds an index (>= 0). On a
    miss, cent_runtime_error("Key not found in dict") is raised and
    cent_null() is returned if that call comes back. */
 CentValue cent_dict_get(CentValue dict, CentValue key) {
  if (dict.type != CENT_DICT)
    cent_type_error("dict-get requires a dict");
-  for (int i = 0; i < dict.dval.len; i++) {
+  uint32_t h = _cent_hash_key(key);
-    if (_cent_key_eq(dict.dval.keys[i], key))
+  int slot = _cent_dict_probe(&dict.dval, key, h);
-      return dict.dval.vals[i];
+  int idx  = dict.dval.buckets[slot];
-  }
+  if (idx >= 0) return dict.dval.vals[idx];
  cent_runtime_error("Key not found in dict");
  return cent_null();
 }
--- a/centvrion/compiler/runtime/cent_runtime.h
+++ b/centvrion/compiler/runtime/cent_runtime.h
@@ -47,10 +47,13 @@ struct CentList {
 };
 /* Dictionary storage. Entries live in the parallel keys/vals arrays in
    insertion order. This change adds `buckets`/`nbuckets`: a hash table
    whose slots hold indices into keys/vals, with -1 meaning empty. The
    `-` lines are the previous four-field layout. */
 struct CentDict {
-  CentValue *keys;
+  CentValue *keys;     /* insertion-order array, len entries     */
-  CentValue *vals;
+  CentValue *vals;     /* parallel to keys                       */
-  int        len;
+  int       *buckets;  /* hash table; values are indices into    */
-  int        cap;
+                       /* keys/vals, or -1 for empty             */
  int        len;      /* number of entries                      */
  int        cap;      /* capacity of keys/vals                  */
  int        nbuckets; /* size of buckets, power of 2            */
 };
 struct CentValue {
@@ -135,13 +138,17 @@ static inline CentValue cent_func_val(CentFuncPtr fn, const char **param_names,
  r.fnval.param_count = param_count;
  return r;
 }
-static inline CentValue cent_dict_val(CentValue *keys, CentValue *vals, int len, int cap) {
+static inline CentValue cent_dict_val(CentValue *keys, CentValue *vals,
                                      int *buckets, int len, int cap,
                                      int nbuckets) {
  /* Packs the given arrays and sizes into a CentValue tagged CENT_DICT.
     Pointers are stored as passed; nothing is copied. The change adds
     the `buckets` and `nbuckets` parameters and the matching fields. */
  CentValue r;
  r.type = CENT_DICT;
-  r.dval.keys = keys;
+  r.dval.keys     = keys;
-  r.dval.vals = vals;
+  r.dval.vals     = vals;
-  r.dval.len  = len;
+  r.dval.buckets  = buckets;
-  r.dval.cap  = cap;
+  r.dval.len      = len;
  r.dval.cap      = cap;
  r.dval.nbuckets = nbuckets;
  return r;
 }
--- a/tests/08_test_tabulas_.py
+++ b/tests/08_test_tabulas_.py
@@ -189,3 +189,34 @@ class TestDictDisplay(unittest.TestCase):
  @parameterized.expand(dict_display_tests)
  def test_dict_display(self, source, nodes, value, output):
    run_test(self, source, nodes, value, output)
 class TestDictGrowth(unittest.TestCase):
  def test_dict_growth_preserves_order_and_lookup(self):
    # Fills an empty TABVLA with keys I..XX from a PER loop (value = key * II)
    # so the compiled dict, which starts at cap=4, has to grow and rehash
    # several times. Then checks a lookup (d[X]), the length, and that CLAVES
    # still lists the keys in insertion order.
    program_lines = [
      "DESIGNA d VT TABVLA {}",
      "PER i IN [I VSQVE XX] FAC {",
      "DESIGNA d[i] VT i * II",
      "}",
      "DIC(d[X])",
      "DIC(LONGITVDO(d))",
      "DIC(CLAVES(d))",
    ]
    source = "\n".join(program_lines)

    def dic(argument):
      # One `DIC(<argument>)` statement node.
      return ExpressionStatement(BuiltIn("DIC", [argument]))

    store_doubled = DesignaIndex(
      ID("d"),
      [ID("i")],
      BinOp(ID("i"), Numeral("II"), "SYMBOL_TIMES"),
    )
    fill_loop = PerStatement(
      DataRangeArray(Numeral("I"), Numeral("XX")),
      ID("i"),
      [store_doubled],
    )
    statements = [
      Designa(ID("d"), DataDict([])),
      fill_loop,
      dic(ArrayIndex(ID("d"), Numeral("X"))),
      dic(BuiltIn("LONGITVDO", [ID("d")])),
      dic(BuiltIn("CLAVES", [ID("d")])),
    ]
    nodes = Program([], statements)

    numerals = [int_to_num(n, False) for n in range(1, 21)]
    expected_keys = "[" + " ".join(numerals) + "]"
    expected_output = "XX\nXX\n" + expected_keys + "\n"
    run_test(self, source, nodes, ValStr(expected_keys), expected_output)