diff --git a/README.md b/README.md index fa5fab7..1415d9a 100644 --- a/README.md +++ b/README.md @@ -357,12 +357,12 @@ Sleeps for `n` seconds, where `n` is an integer, fraction, or NVLLVS (treated as ### QVAERE `QVAERE(pattern, string)` -Returns an array of all non-overlapping matches of the regex `pattern` in `string`. Both arguments must be strings. Patterns use extended regular expression syntax. Returns an empty array if there are no matches. Raises an error if the pattern is invalid. +Returns an array of all non-overlapping matches of the regex `pattern` in `string`. Both arguments must be strings. Patterns use extended regular expression syntax with Roman numeral quantifiers (`{III}` for exactly 3, `{II,V}` for 2–5, `{III,}` for 3 or more). Returns an empty array if there are no matches. Raises an error if the pattern is invalid. ### SVBSTITVE `SVBSTITVE(pattern, replacement, string)` -Replaces all non-overlapping matches of the regex `pattern` in `string` with `replacement`. All three arguments must be strings. The replacement string supports backreferences (`\1`, `\2`, etc.) to captured groups. Returns the resulting string. Raises an error if the pattern is invalid. +Replaces all non-overlapping matches of the regex `pattern` in `string` with `replacement`. All three arguments must be strings. The replacement string supports backreferences (`\I`, `\II`, etc.) to captured groups. Returns the resulting string. Raises an error if the pattern is invalid. ### SCINDE `SCINDE(string, delimiter)` diff --git a/centvrion/ast_nodes.py b/centvrion/ast_nodes.py index 7539044..571389a 100644 --- a/centvrion/ast_nodes.py +++ b/centvrion/ast_nodes.py @@ -175,6 +175,89 @@ def make_string(val, magnvm=False, svbnvlla=False) -> str: else: raise CentvrionError(f"Cannot display {val!r}") +def _roman_backref(m): + try: + n = num_to_int(m.group(1), False) + except CentvrionError: + return m.group(0) + return f"\\{n}" + +def _check_arabic_backref(s): + for i in range(len(s) - 1): + if s[i] == '\\' and s[i+1].isdigit(): + raise CentvrionError(f"Invalid escape sequence '\\{s[i+1]}' — use Roman numerals for backreferences") + +def _romanize_replacement(s): + _check_arabic_backref(s) + return re.sub(r'\\([IVXLCDM]+)', _roman_backref, s) + +def _convert_quantifier(inner): + parts = inner.split(',') + converted = [] + for p in parts: + p = p.strip() + if p == '': + converted.append('') + else: + try: + converted.append(str(num_to_int(p, False))) + except CentvrionError: + return None + return '{' + ','.join(converted) + '}' + +def _romanize_pattern(s): + result = [] + i = 0 + while i < len(s): + if s[i] == '\\' and i + 1 < len(s) and s[i+1] in 'IVXLCDM': + # backref: collect Roman numeral chars and convert + j = i + 1 + while j < len(s) and s[j] in 'IVXLCDM': + j += 1 + try: + n = num_to_int(s[i+1:j], False) + result.append(f'\\{n}') + except CentvrionError: + result.append(s[i:j]) + i = j + elif s[i] == '\\' and i + 1 < len(s) and s[i+1].isdigit(): + raise CentvrionError(f"Invalid escape sequence '\\{s[i+1]}' — use Roman numerals for backreferences") + elif s[i] == '\\' and i + 1 < len(s): + result.append(s[i:i+2]) + i += 2 + elif s[i] == '[': + # skip character class + j = i + 1 + if j < len(s) and s[j] == '^': + j += 1 + if j < len(s) and s[j] == ']': + j += 1 + while j < len(s) and s[j] != ']': + if s[j] == '\\' and j + 1 < len(s): + j += 1 + j += 1 + result.append(s[i:j+1]) + i = j + 1 + elif s[i] == '{': + j = s.find('}', i) + if j == -1: + result.append(s[i]) + i += 1 + else: + inner = s[i+1:j] + if re.match(r'^[\d,\s]+$', inner) and re.search(r'\d', inner): + raise CentvrionError(f"Invalid quantifier '{{{inner}}}' — use Roman numerals") + converted = _convert_quantifier(inner) + if converted is not None: + result.append(converted) + else: + result.append(s[i:j+1]) + i = j + 1 + else: + result.append(s[i]) + i += 1 + return ''.join(result) + FRAC_SYMBOLS = [("S", 6), (":", 2), (".", 1)] def frac_to_fraction(s, magnvm=False, svbnvlla=False): @@ -1328,7 +1411,7 @@ class BuiltIn(Node): try: matches = [ ValStr(m.group(0)) - for m in re.finditer(pattern.value(), text.value()) + for m in re.finditer(_romanize_pattern(pattern.value()), text.value()) ] except re.error as e: raise CentvrionError(f"Invalid regex: {e}") @@ -1340,7 +1423,11 @@ class BuiltIn(Node): if not isinstance(pattern, ValStr) or not isinstance(replacement, ValStr) or not isinstance(text, ValStr): raise CentvrionError("SVBSTITVE requires three strings") try: - result = re.sub(pattern.value(), replacement.value(), text.value()) + result = re.sub( + _romanize_pattern(pattern.value()), + _romanize_replacement(replacement.value()), + text.value() + ) except re.error as e: raise CentvrionError(f"Invalid regex: {e}") return vtable, ValStr(result) diff --git a/centvrion/compiler/runtime/cent_runtime.c b/centvrion/compiler/runtime/cent_runtime.c index 936bfe5..50ea227 100644 --- a/centvrion/compiler/runtime/cent_runtime.c +++ b/centvrion/compiler/runtime/cent_runtime.c @@ -511,6 +511,14 @@ CentValue cent_eq(CentValue a, CentValue b) { case CENT_BOOL: return cent_bool(a.bval == b.bval); case CENT_FUNC: return cent_bool(a.fnval.fn == b.fnval.fn); case CENT_NULL: return cent_bool(1); + case CENT_LIST: { + if (a.lval.len != b.lval.len) return cent_bool(0); + for (int i = 0; i < a.lval.len; i++) { + CentValue r = cent_eq(a.lval.items[i], b.lval.items[i]); + if (!r.bval) return cent_bool(0); + } + return cent_bool(1); + } default: cent_type_error("'EST' not supported for this type"); return cent_null(); @@ -920,11 +928,160 @@ CentValue cent_dict_keys(CentValue dict) { /* Regex */ /* ------------------------------------------------------------------ */ +static int _is_roman_char(char c) { + return c == 'I' || c == 'V' || c == 'X' || c == 'L' + || c == 'C' || c == 'D' || c == 'M'; +} + +static void _ensure_cap(char **out, size_t *opos, size_t *ocap, size_t need) { + while (*opos + need + 1 > *ocap) { + *ocap *= 2; + char *newbuf = cent_arena_alloc(cent_arena, *ocap); + memcpy(newbuf, *out, *opos); + *out = newbuf; + } +} + +/* Convert Roman numeral quantifiers in pattern: {III} → {3}, {II,V} → {2,5} */ +static char *_romanize_pattern(const char *s) { + size_t slen = strlen(s); + size_t cap = slen * 2 + 1; + char *result = cent_arena_alloc(cent_arena, cap); + size_t rpos = 0; + for (size_t i = 0; i < slen; ) { + if (s[i] == '\\' && i + 1 < slen && _is_roman_char(s[i + 1])) { + /* backref: collect Roman numeral chars and convert */ + size_t j = i + 1; + while (j < slen && _is_roman_char(s[j])) j++; + char buf[64]; + size_t len = j - i - 1; + if (len >= sizeof(buf)) len = sizeof(buf) - 1; + memcpy(buf, s + i + 1, len); + buf[len] = '\0'; + long val = cent_roman_to_int(buf); + char numbuf[32]; + snprintf(numbuf, sizeof(numbuf), "\\%ld", val); + size_t nlen = strlen(numbuf); + while (rpos + nlen >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } + memcpy(result + rpos, numbuf, nlen); + rpos += nlen; + i = j; + } else if (s[i] == '\\' && i + 1 < slen && s[i + 1] >= '0' && s[i + 1] <= '9') { + char msg[128]; + snprintf(msg, sizeof(msg), + "Invalid escape sequence '\\%c' — use Roman numerals for backreferences", s[i + 1]); + cent_runtime_error(msg); + } else if (s[i] == '\\' && i + 1 < slen) { + if (rpos + 2 >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } + result[rpos++] = s[i++]; + result[rpos++] = s[i++]; + } else if (s[i] == '[') { + /* copy character class verbatim */ + if (rpos + 1 >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } + result[rpos++] = s[i++]; + if (i < slen && s[i] == '^') { result[rpos++] = s[i++]; } + if (i < slen && s[i] == ']') { result[rpos++] = s[i++]; } + while (i < slen && s[i] != ']') { + if (s[i] == '\\' && i + 1 < slen) { + if (rpos + 2 >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } + result[rpos++] = s[i++]; + } + if (rpos + 1 >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } + result[rpos++] = s[i++]; + } + if (i < slen) { if (rpos + 1 >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } result[rpos++] = s[i++]; } + } else if (s[i] == '{') { + /* find closing brace */ + size_t j = i + 1; + while (j < slen && s[j] != '}') j++; + if (j >= slen) { + if (rpos + 1 >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } + result[rpos++] = s[i++]; + } else { + /* extract inner content and try to convert */ + size_t inner_len = j - i - 1; + char inner[128]; + if (inner_len >= sizeof(inner)) inner_len = sizeof(inner) - 1; + memcpy(inner, s + i + 1, inner_len); + inner[inner_len] = '\0'; + /* reject Arabic digit quantifiers */ + int has_digit = 0, all_digit_comma_space = 1; + for (size_t k = 0; k < inner_len; k++) { + if (inner[k] >= '0' && inner[k] <= '9') has_digit = 1; + else if (inner[k] != ',' && inner[k] != ' ') all_digit_comma_space = 0; + } + if (has_digit && all_digit_comma_space) { + char msg[192]; + snprintf(msg, sizeof(msg), "Invalid quantifier '{%s}' — use Roman numerals", inner); + cent_runtime_error(msg); + } + /* convert comma-separated Roman parts */ + char converted[128]; + size_t cpos = 0; + converted[0] = '\0'; + int ok = 1; + char *part = inner; + while (ok) { + char *comma = strchr(part, ','); + if (comma) *comma = '\0'; + /* trim spaces */ + while (*part == ' ') part++; + char *pend = part + strlen(part) - 1; + while (pend > part && *pend == ' ') *pend-- = '\0'; + if (*part == '\0') { + /* empty part (e.g. {,V}) */ + } else { + /* check all chars are Roman */ + int all_roman = 1; + for (char *c = part; *c; c++) { if (!_is_roman_char(*c)) { all_roman = 0; break; } } + if (!all_roman) { ok = 0; break; } + long val = cent_roman_to_int(part); + char numbuf[32]; + snprintf(numbuf, sizeof(numbuf), "%ld", val); + size_t nlen = strlen(numbuf); + if (cpos + nlen >= sizeof(converted)) { ok = 0; break; } + memcpy(converted + cpos, numbuf, nlen); + cpos += nlen; + } + if (comma) { + if (cpos + 1 >= sizeof(converted)) { ok = 0; break; } + converted[cpos++] = ','; + part = comma + 1; + } else { + break; + } + } + converted[cpos] = '\0'; + if (ok) { + size_t need = cpos + 2; + while (rpos + need >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } + result[rpos++] = '{'; + memcpy(result + rpos, converted, cpos); + rpos += cpos; + result[rpos++] = '}'; + } else { + /* not valid Roman — copy verbatim */ + size_t chunk = j - i + 1; + while (rpos + chunk >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } + memcpy(result + rpos, s + i, chunk); + rpos += chunk; + } + i = j + 1; + } + } else { + if (rpos + 1 >= cap) { cap *= 2; char *nb = cent_arena_alloc(cent_arena, cap); memcpy(nb, result, rpos); result = nb; } + result[rpos++] = s[i++]; + } + } + result[rpos] = '\0'; + return result; +} + CentValue cent_qvaere(CentValue pattern, CentValue text) { if (pattern.type != CENT_STR || text.type != CENT_STR) cent_type_error("'QVAERE' requires two strings"); regex_t re; - int rc = regcomp(&re, pattern.sval, REG_EXTENDED); + int rc = regcomp(&re, _romanize_pattern(pattern.sval), REG_EXTENDED); if (rc != 0) { char errbuf[256]; regerror(rc, &re, errbuf, sizeof(errbuf)); @@ -947,42 +1104,39 @@ CentValue cent_qvaere(CentValue pattern, CentValue text) { return result; } -/* Expand replacement string, substituting \1..\9 with captured groups */ +/* Expand replacement string, substituting \I..\IX with captured groups */ static void _expand_replacement(const char *repl, const char *subject, regmatch_t *matches, int ngroups, char **out, size_t *opos, size_t *ocap) { for (const char *r = repl; *r; r++) { - if (*r == '\\' && r[1] >= '1' && r[1] <= '9') { - int g = r[1] - '0'; - r++; + if (*r == '\\' && _is_roman_char(r[1])) { + const char *start = r + 1; + const char *end = start; + while (_is_roman_char(*end)) end++; + char buf[64]; + size_t len = (size_t)(end - start); + if (len >= sizeof(buf)) len = sizeof(buf) - 1; + memcpy(buf, start, len); + buf[len] = '\0'; + int g = (int)cent_roman_to_int(buf); + r = end - 1; if (g < ngroups && matches[g].rm_so != -1) { size_t glen = matches[g].rm_eo - matches[g].rm_so; - while (*opos + glen + 1 > *ocap) { - *ocap *= 2; - char *newbuf = cent_arena_alloc(cent_arena, *ocap); - memcpy(newbuf, *out, *opos); - *out = newbuf; - } + _ensure_cap(out, opos, ocap, glen); memcpy(*out + *opos, subject + matches[g].rm_so, glen); *opos += glen; } + } else if (*r == '\\' && r[1] >= '0' && r[1] <= '9') { + char msg[128]; + snprintf(msg, sizeof(msg), + "Invalid escape sequence '\\%c' — use Roman numerals for backreferences", r[1]); + cent_runtime_error(msg); } else if (*r == '\\' && r[1] == '\\') { - /* escaped backslash → literal \ */ - if (*opos + 2 > *ocap) { - *ocap *= 2; - char *newbuf = cent_arena_alloc(cent_arena, *ocap); - memcpy(newbuf, *out, *opos); - *out = newbuf; - } + _ensure_cap(out, opos, ocap, 1); (*out)[(*opos)++] = '\\'; r++; } else { - if (*opos + 2 > *ocap) { - *ocap *= 2; - char *newbuf = cent_arena_alloc(cent_arena, *ocap); - memcpy(newbuf, *out, *opos); - *out = newbuf; - } + _ensure_cap(out, opos, ocap, 1); (*out)[(*opos)++] = *r; } } @@ -992,7 +1146,7 @@ CentValue cent_svbstitve(CentValue pattern, CentValue replacement, CentValue tex if (pattern.type != CENT_STR || replacement.type != CENT_STR || text.type != CENT_STR) cent_type_error("'SVBSTITVE' requires three strings"); regex_t re; - int rc = regcomp(&re, pattern.sval, REG_EXTENDED); + int rc = regcomp(&re, _romanize_pattern(pattern.sval), REG_EXTENDED); if (rc != 0) { char errbuf[256]; regerror(rc, &re, errbuf, sizeof(errbuf)); diff --git a/tests.py b/tests.py index 60c1ef3..b7a7fa1 100644 --- a/tests.py +++ b/tests.py @@ -634,10 +634,22 @@ builtin_tests = [ ('SVBSTITVE("a", "b", "")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("a"), String("b"), String("")]))]), ValStr("")), # SVBSTITVE: dot matches any character ('SVBSTITVE(".", "x", "ab")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("."), String("x"), String("ab")]))]), ValStr("xx")), - # SVBSTITVE: backreference swaps two groups - ('SVBSTITVE("(a)(b)", "\\2\\1", "ab")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("(a)(b)"), String("\\2\\1"), String("ab")]))]), ValStr("ba")), + # SVBSTITVE: backreference swaps two groups (Roman numerals) + ('SVBSTITVE("(a)(b)", "\\II\\I", "ab")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("(a)(b)"), String("\\II\\I"), String("ab")]))]), ValStr("ba")), # SVBSTITVE: backreference with unmatched group (ignored) - ('SVBSTITVE("(a)(b)?", "\\1\\2", "a")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("(a)(b)?"), String("\\1\\2"), String("a")]))]), ValStr("a")), + ('SVBSTITVE("(a)(b)?", "\\I\\II", "a")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("(a)(b)?"), String("\\I\\II"), String("a")]))]), ValStr("a")), + # SVBSTITVE: Roman numeral quantifier in pattern + ("SVBSTITVE('a{III}', 'x', 'aaa')", Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("a{III}"), String("x"), String("aaa")]))]), ValStr("x")), + # QVAERE: Roman numeral quantifier — exact repetition + ("QVAERE('a{III}', 'aaaa')", Program([], [ExpressionStatement(BuiltIn("QVAERE", [String("a{III}"), String("aaaa")]))]), ValList([ValStr("aaa")])), + # QVAERE: Roman numeral quantifier — range + ("QVAERE('a{II,III}', 'aaaaaa')", Program([], [ExpressionStatement(BuiltIn("QVAERE", [String("a{II,III}"), String("aaaaaa")]))]), ValList([ValStr("aaa"), ValStr("aaa")])), + # QVAERE: Roman numeral quantifier — at-least + ("QVAERE('a{II,}', 'a aa aaa')", Program([], [ExpressionStatement(BuiltIn("QVAERE", [String("a{II,}"), String("a aa aaa")]))]), ValList([ValStr("aa"), ValStr("aaa")])), + # QVAERE: pattern backreference — repeated character + ("QVAERE('(.)\\I', 'aabcdd')", Program([], [ExpressionStatement(BuiltIn("QVAERE", [String("(.)\\I"), String("aabcdd")]))]), ValList([ValStr("aa"), ValStr("dd")])), + # QVAERE: pattern backreference — repeated group + ("QVAERE('(..)\\I', 'ababcc')", Program([], [ExpressionStatement(BuiltIn("QVAERE", [String("(..)\\I"), String("ababcc")]))]), ValList([ValStr("abab")])), # NVMERVS: basic conversion ('NVMERVS("XIV")', Program([], [ExpressionStatement(BuiltIn("NVMERVS", [String("XIV")]))]), ValInt(14)), # NVMERVS: simple single numeral @@ -751,6 +763,9 @@ error_tests = [ ('SVBSTITVE("a", I, "c")', CentvrionError), # SVBSTITVE requires strings, not int replacement ('SVBSTITVE("a", "b", I)', CentvrionError), # SVBSTITVE requires strings, not int text ('SVBSTITVE("[", "b", "c")', CentvrionError), # SVBSTITVE invalid regex + ("SVBSTITVE('(a)', '\\1', 'a')", CentvrionError), # Arabic backref in replacement + ("QVAERE('(.)\\1', 'aa')", CentvrionError), # Arabic backref in pattern + ("QVAERE('a{3}', 'aaa')", CentvrionError), # Arabic quantifier in pattern ('SCINDE(I, ",")', CentvrionError), # SCINDE requires strings, not int ('SCINDE("a", I)', CentvrionError), # SCINDE requires strings, not int delimiter ('PETE("http://example.com")', CentvrionError), # RETE required for PETE @@ -1222,6 +1237,13 @@ comparison_tests = [ # non-zero integer does not equal NVLLVS ("I EST NVLLVS", Program([], [ExpressionStatement(BinOp(Numeral("I"), Nullus(), "KEYWORD_EST"))]), ValBool(False)), ("NVLLVS DISPAR I", Program([], [ExpressionStatement(BinOp(Nullus(), Numeral("I"), "KEYWORD_DISPAR"))]), ValBool(True)), + # EST / DISPAR on arrays + ("[I, II] EST [I, II]", Program([], [ExpressionStatement(BinOp(DataArray([Numeral("I"), Numeral("II")]), DataArray([Numeral("I"), Numeral("II")]), "KEYWORD_EST"))]), ValBool(True)), + ("[I, II] EST [I, III]", Program([], [ExpressionStatement(BinOp(DataArray([Numeral("I"), Numeral("II")]), DataArray([Numeral("I"), Numeral("III")]), "KEYWORD_EST"))]), ValBool(False)), + ("[I, II] EST [I, II, III]", Program([], [ExpressionStatement(BinOp(DataArray([Numeral("I"), Numeral("II")]), DataArray([Numeral("I"), Numeral("II"), Numeral("III")]), "KEYWORD_EST"))]), ValBool(False)), + ("[] EST []", Program([], [ExpressionStatement(BinOp(DataArray([]), DataArray([]), "KEYWORD_EST"))]), ValBool(True)), + ("[I, II] DISPAR [I, III]", Program([], [ExpressionStatement(BinOp(DataArray([Numeral("I"), Numeral("II")]), DataArray([Numeral("I"), Numeral("III")]), "KEYWORD_DISPAR"))]), ValBool(True)), + ("[I, II] DISPAR [I, II]", Program([], [ExpressionStatement(BinOp(DataArray([Numeral("I"), Numeral("II")]), DataArray([Numeral("I"), Numeral("II")]), "KEYWORD_DISPAR"))]), ValBool(False)), ] class TestComparisons(unittest.TestCase):