from rply import LexerGenerator

# Characters permitted in identifiers, joined into a regex alternation.
# NOTE: j, u and w are deliberately absent -- the classical Latin alphabet
# writes i for j and v for u/w (consistent with the V-spellings of the
# keywords and the Roman-numeral DATA_NUMERAL pattern below).
valid_characters = '|'.join(list("abcdefghiklmnopqrstvxyz_"))

# Reserved words. Each token value is the literal keyword used as its regex.
# Ordering matters: a longer keyword sharing a prefix with a shorter one must
# come first (INVOCA before IN, DONICVM before DVM) so the lexer matches the
# longest keyword rather than stopping at the prefix.
keyword_tokens = [("KEYWORD_" + i, i) for i in [
    "ALVID", "DEFINI", "DESIGNA", "DONICVM", "DVM", "ERVMPE", "EST",
    "FACE", "FALSITAS", "INVOCA", "IN", "MINVS", "NVLLVS", "PER",
    "PLVS", "REDI", "SI", "TVNC", "VSQVE", "VT", "VERITAS", "VOCA",
]]

# Built-in procedure names. AVDI_NVMERVS precedes AVDI for the same
# longest-prefix reason as the keywords above.
builtin_tokens = [("BUILTIN", i) for i in [
    "AVDI_NVMERVS", "AVDI", "DICE",
    "FORTIS_NVMERVS", "FORTIS_ELECTIONIS", "LONGITVDO",
]]

# Literals: double-quoted strings (non-greedy, single line) and Roman numerals.
data_tokens = [
    ("DATA_STRING", r"\".*?\""),
    ("DATA_NUMERAL", r"[IVXLCDM]+"),
]

# Importable module names.
module_tokens = [("MODULE", i) for i in [
    "FORS", "FRACTIO", "MAGNVM", "SVBNVLLA",
]]

# Punctuation and arithmetic operators (regex-escaped).
symbol_tokens = [
    ("SYMBOL_LPARENS", r"\("),
    ("SYMBOL_RPARENS", r"\)"),
    ("SYMBOL_LBRACKET", r"\["),
    ("SYMBOL_RBRACKET", r"\]"),
    ("SYMBOL_LCURL", r"\{"),
    ("SYMBOL_RCURL", r"\}"),
    ("SYMBOL_PLUS", r"\+"),
    ("SYMBOL_MINUS", r"\-"),
    ("SYMBOL_TIMES", r"\*"),
    ("SYMBOL_DIVIDE", r"\/"),
]

# Newlines are significant (statement separators), so they are tokens rather
# than ignored whitespace.
whitespace_tokens = [
    ("NEWLINE", r"\n+"),
]

# Full rule list in priority order; the catch-all ID rule must come last so
# keywords/builtins/modules win over a bare identifier match.
all_tokens = (
    keyword_tokens
    + builtin_tokens
    + module_tokens
    + symbol_tokens
    + data_tokens
    + whitespace_tokens
    + [("ID", f"({valid_characters})+")]
)


class Lexer:
    """Builds an rply lexer from the token tables defined above."""

    def __init__(self):
        self.lexer = LexerGenerator()
        # Guards _add_tokens: previously, calling get_lexer() more than once
        # registered every rule again on the same generator, duplicating
        # rules in the built lexer.
        self._tokens_added = False

    def _add_tokens(self):
        """Register all token rules exactly once, in priority order."""
        if self._tokens_added:
            return
        for name, pattern in all_tokens:
            self.lexer.add(name, pattern)
        # Runs of spaces carry no meaning; note that tabs are NOT ignored --
        # presumably the language forbids them. TODO confirm with callers.
        self.lexer.ignore(r" +")
        self._tokens_added = True

    def get_lexer(self):
        """Return a built rply lexer with all rules registered.

        Safe to call repeatedly: rules are only registered on the first call.
        """
        self._add_tokens()
        return self.lexer.build()