diff --git a/README.md b/README.md index f1247f3..bf63326 100644 --- a/README.md +++ b/README.md @@ -354,6 +354,11 @@ Sleeps for `n` seconds, where `n` is an integer, fraction, or NVLLVS (treated as Returns an array of all non-overlapping matches of the regex `pattern` in `string`. Both arguments must be strings. Patterns use extended regular expression syntax. Returns an empty array if there are no matches. Raises an error if the pattern is invalid. +### SVBSTITVE +`SVBSTITVE(pattern, replacement, string)` + +Replaces all non-overlapping matches of the regex `pattern` in `string` with `replacement`. All three arguments must be strings. The replacement string supports backreferences (`\1`, `\2`, etc.) to captured groups. Returns the resulting string. Raises an error if the pattern is invalid. + ## Modules Modules are additions to the base `CENTVRION` syntax. They add or change certain features. Modules are included in your code by having diff --git a/centvrion/ast_nodes.py b/centvrion/ast_nodes.py index 2f58de1..e2070aa 100644 --- a/centvrion/ast_nodes.py +++ b/centvrion/ast_nodes.py @@ -1290,6 +1290,17 @@ class BuiltIn(Node): except re.error as e: raise CentvrionError(f"Invalid regex: {e}") return vtable, ValList(matches) + case "SVBSTITVE": + pattern = params[0] + replacement = params[1] + text = params[2] + if not isinstance(pattern, ValStr) or not isinstance(replacement, ValStr) or not isinstance(text, ValStr): + raise CentvrionError("SVBSTITVE requires three strings") + try: + result = re.sub(pattern.value(), replacement.value(), text.value()) + except re.error as e: + raise CentvrionError(f"Invalid regex: {e}") + return vtable, ValStr(result) case _: raise NotImplementedError(self.builtin) diff --git a/centvrion/compiler/emit_expr.py b/centvrion/compiler/emit_expr.py index c3a5cb0..4000619 100644 --- a/centvrion/compiler/emit_expr.py +++ b/centvrion/compiler/emit_expr.py @@ -300,6 +300,9 @@ def _emit_builtin(node, ctx): case "QVAERE": lines.append(f"CentValue {tmp} = cent_qvaere({param_vars[0]}, {param_vars[1]});") + case "SVBSTITVE": + lines.append(f"CentValue {tmp} = cent_svbstitve({param_vars[0]}, {param_vars[1]}, {param_vars[2]});") + case _: raise NotImplementedError(node.builtin) diff --git a/centvrion/compiler/runtime/cent_runtime.c b/centvrion/compiler/runtime/cent_runtime.c index 449ba0f..e91e9dc 100644 --- a/centvrion/compiler/runtime/cent_runtime.c +++ b/centvrion/compiler/runtime/cent_runtime.c @@ -902,6 +902,99 @@ CentValue cent_qvaere(CentValue pattern, CentValue text) { return result; } +/* Expand replacement string, substituting \1..\9 with captured groups */ +static void _expand_replacement(const char *repl, const char *subject, + regmatch_t *matches, int ngroups, + char **out, size_t *opos, size_t *ocap) { + for (const char *r = repl; *r; r++) { + if (*r == '\\' && r[1] >= '1' && r[1] <= '9') { + int g = r[1] - '0'; + r++; + if (g < ngroups && matches[g].rm_so != -1) { + size_t glen = matches[g].rm_eo - matches[g].rm_so; + while (*opos + glen + 1 > *ocap) { + *ocap *= 2; + char *newbuf = cent_arena_alloc(cent_arena, *ocap); + memcpy(newbuf, *out, *opos); + *out = newbuf; + } + memcpy(*out + *opos, subject + matches[g].rm_so, glen); + *opos += glen; + } + } else if (*r == '\\' && r[1] == '\\') { + /* escaped backslash → literal \ */ + if (*opos + 2 > *ocap) { + *ocap *= 2; + char *newbuf = cent_arena_alloc(cent_arena, *ocap); + memcpy(newbuf, *out, *opos); + *out = newbuf; + } + (*out)[(*opos)++] = '\\'; + r++; + } else { + if (*opos + 2 > *ocap) { + *ocap *= 2; + char *newbuf = cent_arena_alloc(cent_arena, *ocap); + memcpy(newbuf, *out, *opos); + *out = newbuf; + } + (*out)[(*opos)++] = *r; + } + } +} + +CentValue cent_svbstitve(CentValue pattern, CentValue replacement, CentValue text) { + if (pattern.type != CENT_STR || replacement.type != CENT_STR || text.type != CENT_STR) + cent_type_error("'SVBSTITVE' requires three strings"); + regex_t re; + int rc = regcomp(&re, pattern.sval, REG_EXTENDED); + if (rc != 0) { + char errbuf[256]; + regerror(rc, &re, errbuf, sizeof(errbuf)); + regfree(&re); + cent_runtime_error(errbuf); + } + size_t text_len = strlen(text.sval); + size_t repl_len = strlen(replacement.sval); + size_t cap = text_len + repl_len * 4 + 1; + char *result = cent_arena_alloc(cent_arena, cap); + size_t rpos = 0; + const char *cursor = text.sval; + int ngroups = (int)re.re_nsub + 1; + if (ngroups > 10) ngroups = 10; + regmatch_t matches[10]; + while (*cursor && regexec(&re, cursor, ngroups, matches, 0) == 0) { + /* copy text before match */ + size_t prefix_len = matches[0].rm_so; + while (rpos + prefix_len + 1 > cap) { + cap *= 2; + char *newbuf = cent_arena_alloc(cent_arena, cap); + memcpy(newbuf, result, rpos); + result = newbuf; + } + memcpy(result + rpos, cursor, prefix_len); + rpos += prefix_len; + /* expand replacement with backreferences */ + _expand_replacement(replacement.sval, cursor, matches, ngroups, + &result, &rpos, &cap); + cursor += matches[0].rm_eo; + if (matches[0].rm_eo == 0) cursor++; + } + /* copy remaining text */ + size_t tail_len = strlen(cursor); + while (rpos + tail_len + 1 > cap) { + cap *= 2; + char *newbuf = cent_arena_alloc(cent_arena, cap); + memcpy(newbuf, result, rpos); + result = newbuf; + } + memcpy(result + rpos, cursor, tail_len); + rpos += tail_len; + result[rpos] = '\0'; + regfree(&re); + return cent_str(result); +} + /* ------------------------------------------------------------------ */ /* Initialisation */ /* ------------------------------------------------------------------ */ diff --git a/centvrion/compiler/runtime/cent_runtime.h b/centvrion/compiler/runtime/cent_runtime.h index 4565c35..94e93b8 100644 --- a/centvrion/compiler/runtime/cent_runtime.h +++ b/centvrion/compiler/runtime/cent_runtime.h @@ -233,6 +233,7 @@ CentValue cent_lege(CentValue path); /* LEGE */ void cent_scribe(CentValue path, CentValue content); /* SCRIBE */ void cent_adivnge(CentValue path, CentValue content); /* ADIVNGE */ CentValue cent_qvaere(CentValue pattern, CentValue text); /* QVAERE */ +CentValue cent_svbstitve(CentValue pattern, CentValue replacement, CentValue text); /* SVBSTITVE */ /* ------------------------------------------------------------------ */ /* Array helpers */ diff --git a/centvrion/lexer.py b/centvrion/lexer.py index a2c0a9b..5e65a23 100644 --- a/centvrion/lexer.py +++ b/centvrion/lexer.py @@ -58,7 +58,8 @@ builtin_tokens = [("BUILTIN", i) for i in [ "LEGE", "SCRIBE", "ADIVNGE", - "QVAERE" + "QVAERE", + "SVBSTITVE" ]] data_tokens = [ diff --git a/snippets/syntaxes/centvrion.sublime-syntax b/snippets/syntaxes/centvrion.sublime-syntax index de4a352..94d3a6f 100644 --- a/snippets/syntaxes/centvrion.sublime-syntax +++ b/snippets/syntaxes/centvrion.sublime-syntax @@ -70,7 +70,7 @@ contexts: scope: constant.language.centvrion builtins: - - match: '\b(ADIVNGE|AVDI_NVMERVS|AVDI|CLAVES|DECIMATIO|DIC|EVERRE|FORTVITVS_NVMERVS|FORTVITA_ELECTIO|LEGE|LONGITVDO|ORDINA|SCRIBE|SEMEN|SENATVS)\b' + - match: '\b(ADIVNGE|AVDI_NVMERVS|AVDI|CLAVES|DECIMATIO|DIC|EVERRE|FORTVITVS_NVMERVS|FORTVITA_ELECTIO|LEGE|LONGITVDO|ORDINA|QVAERE|SCRIBE|SEMEN|SENATVS|SVBSTITVE)\b' scope: support.function.builtin.centvrion modules: diff --git a/tests.py b/tests.py index 8c564f6..5e2cd3c 100644 --- a/tests.py +++ b/tests.py @@ -620,6 +620,22 @@ builtin_tests = [ ('QVAERE("", "ab")', Program([], [ExpressionStatement(BuiltIn("QVAERE", [String(""), String("ab")]))]), ValList([ValStr(""), ValStr(""), ValStr("")])), # QVAERE: dot matches any character ('QVAERE(".", "ab")', Program([], [ExpressionStatement(BuiltIn("QVAERE", [String("."), String("ab")]))]), ValList([ValStr("a"), ValStr("b")])), + # SVBSTITVE: basic literal replacement + ('SVBSTITVE("a", "b", "aaa")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("a"), String("b"), String("aaa")]))]), ValStr("bbb")), + # SVBSTITVE: regex character class + ('SVBSTITVE("[0-9]+", "N", "abc123def456")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("[0-9]+"), String("N"), String("abc123def456")]))]), ValStr("abcNdefN")), + # SVBSTITVE: no match → string unchanged + ('SVBSTITVE("x", "y", "abc")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("x"), String("y"), String("abc")]))]), ValStr("abc")), + # SVBSTITVE: empty replacement (deletion) + ('SVBSTITVE("a", "", "banana")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("a"), String(""), String("banana")]))]), ValStr("bnn")), + # SVBSTITVE: empty text → empty string + ('SVBSTITVE("a", "b", "")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("a"), String("b"), String("")]))]), ValStr("")), + # SVBSTITVE: dot matches any character + ('SVBSTITVE(".", "x", "ab")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("."), String("x"), String("ab")]))]), ValStr("xx")), + # SVBSTITVE: backreference swaps two groups + ('SVBSTITVE("(a)(b)", "\\2\\1", "ab")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("(a)(b)"), String("\\2\\1"), String("ab")]))]), ValStr("ba")), + # SVBSTITVE: backreference with unmatched group (ignored) + ('SVBSTITVE("(a)(b)?", "\\1\\2", "a")', Program([], [ExpressionStatement(BuiltIn("SVBSTITVE", [String("(a)(b)?"), String("\\1\\2"), String("a")]))]), ValStr("a")), ] class TestBuiltins(unittest.TestCase): @@ -704,6 +720,10 @@ error_tests = [ ('QVAERE(I, "abc")', CentvrionError), # QVAERE requires strings, not int ('QVAERE("abc", I)', CentvrionError), # QVAERE requires strings, not int ('QVAERE("[", "abc")', CentvrionError), # QVAERE invalid regex + ('SVBSTITVE(I, "b", "c")', CentvrionError), # SVBSTITVE requires strings, not int pattern + ('SVBSTITVE("a", I, "c")', CentvrionError), # SVBSTITVE requires strings, not int replacement + ('SVBSTITVE("a", "b", I)', CentvrionError), # SVBSTITVE requires strings, not int text + ('SVBSTITVE("[", "b", "c")', CentvrionError), # SVBSTITVE invalid regex ] class TestErrors(unittest.TestCase):