from rply import LexerGenerator

# Alphabet permitted in identifiers: the classical Latin letters (no 'j',
# 'u', or 'w' -- 'v' stands in for 'u') plus underscore, joined into a
# regex alternation such as "a|b|c|...".
valid_characters = '|'.join("abcdefghiklmnopqrstvxyz_")

# Reserved words of the language, one token per keyword.
# NOTE: rply tries rules in declaration order and takes the first match,
# so any word that is a prefix of another must come AFTER the longer one
# (e.g. "INVOCA" before "IN").
keyword_tokens = [("KEYWORD_" + word, word) for word in [
    "AETERNVM", "ALIVD", "AVGE", "CAPE", "AVT", "DEFINI", "DESIGNA",
    "DISPAR", "DONICVM", "DVM", "CONTINVA", "ERVMPE", "EST", "ET", "FAC",
    "FALSITAS", "FVNCTIO", "INVOCA", "IN", "MINVE", "MINVS", "NON",
    "NVLLVS", "PER", "PLVS", "REDI", "RELIQVVM", "SI", "TVNC", "TABVLA",
    "TEMPTA", "VSQVE", "VT", "VERITAS", "CVM",
]]

# Built-in function names, all emitted under the single token type BUILTIN.
# Longest-match-first ordering applies here too: "AVDI_NVMERVS" precedes
# "AVDI", and "PETITVR" must precede "PETE" (fix: they were previously
# listed the other way round, which made "PETITVR" unreachable -- input
# "PETITVR" lexed as BUILTIN "PETE" plus a stray "ITVR").
builtin_tokens = [("BUILTIN", name) for name in [
    "AVDI_NVMERVS", "AVDI", "CLAVES", "DECIMATIO", "DIC", "DORMI",
    "EVERRE", "FORTVITVS_NVMERVS", "FORTVITA_ELECTIO", "LONGITVDO",
    "ORDINA", "SEMEN", "SENATVS", "TYPVS", "LEGE", "SCRIBE", "ADIVNGE",
    "QVAERE", "SVBSTITVE", "PETITVR", "PETE", "AVSCVLTA",
]]

# Literal data tokens.  DATA_FRACTION is declared before DATA_NUMERAL so
# that a Roman-numeral prefix followed by fraction symbols (S : . |) is
# consumed as one fraction rather than a bare numeral.
data_tokens = [
    ("DATA_STRING", r"(\".*?\"|'.*?')"),
    ("DATA_FRACTION", r"([IVXLCDM][IVXLCDM_]*)?([S][S:.|]*|:[S:.|]+|\.[S:.|]*)"),
    ("DATA_NUMERAL", r"[IVXLCDM][IVXLCDM_]*"),
]

# Importable standard-module names, all under the single token type MODULE.
module_tokens = [("MODULE", name) for name in [
    "FORS", "FRACTIO", "MAGNVM", "SCRIPTA", "SVBNVLLA", "RETE",
]]

# Punctuation and operators.
symbol_tokens = [
    ("SYMBOL_LPARENS", r"\("),
    ("SYMBOL_RPARENS", r"\)"),
    ("SYMBOL_LBRACKET", r"\["),
    ("SYMBOL_RBRACKET", r"\]"),
    ("SYMBOL_LCURL", r"\{"),
    ("SYMBOL_RCURL", r"\}"),
    ("SYMBOL_PLUS", r"\+"),
    ("SYMBOL_MINUS", r"\-"),
    ("SYMBOL_TIMES", r"\*"),
    ("SYMBOL_DIVIDE", r"\/"),
    ("SYMBOL_AMPERSAND", r"&"),
    ("SYMBOL_COMMA", r","),
]

# Newlines are significant (statement separators), so they are a token
# rather than ignored whitespace.
whitespace_tokens = [
    ("NEWLINE", r"\n+"),
]

# Master rule list in priority order: keywords and builtins shadow data
# literals (so e.g. "DVM" is a keyword, not the numeral D+V+M), and the
# lowercase-only ID rule comes last as the catch-all.
all_tokens = (
    keyword_tokens
    + builtin_tokens
    + module_tokens
    + data_tokens
    + symbol_tokens
    + whitespace_tokens
    + [("ID", f"({valid_characters})+")]
)


class Lexer():
    """Thin wrapper around rply's LexerGenerator for this language.

    Registers every rule from ``all_tokens`` (in priority order) and
    ignores spaces, ``//`` line comments, and ``/* ... */`` block
    comments.  Call :meth:`get_lexer` to obtain the built rply lexer.
    """

    def __init__(self):
        self.lexer = LexerGenerator()

    def _add_tokens(self):
        # Register recognition rules in priority order, then the
        # skip-patterns (plain spaces and both comment forms).
        for token in all_tokens:
            self.lexer.add(*token)
        self.lexer.ignore(r" +")
        self.lexer.ignore(r'//[^\n]*')
        self.lexer.ignore(r'/\*[\s\S]*?\*/')

    def get_lexer(self):
        """Register all rules and return the built rply lexer object."""
        self._add_tokens()
        return self.lexer.build()