from rply import LexerGenerator

# Characters allowed in identifiers, as a regex alternation. This is the
# lower-case classical Latin alphabet plus underscore.
# NOTE(review): 'j', 'q', 'u' and 'w' are absent — j/u/w match the classical
# Latin alphabet, but the missing 'q' is worth confirming against the
# language spec.
valid_characters = '|'.join("abcdefghiklmnopqrstvxyz_")

# Reserved words of the language. Each keyword is its own token type
# ("KEYWORD_<word>") and its pattern is the literal word itself.
keyword_tokens = [("KEYWORD_" + i, i) for i in [
    "ALUID", "DEFINI", "DESIGNA", "DONICUM", "DUM", "ERUMPE", "EST",
    "FACE", "FALSITAS", "INVOCA", "MINUS", "NULLUS", "PER", "PLUS",
    "REDI", "SI", "TUNC", "USQUE", "UT", "VERITAS", "VOCA"
]]

# Built-in function names. "AUDI_NUMERUS" must precede "AUDI" (and the
# "FORTIS_*" names each other) so the longer pattern wins the match.
builtin_tokens = [("BUILTIN", i) for i in [
    "AUDI_NUMERUS", "AUDI", "DICE", "FORTIS_NUMERUS",
    "FORTIS_ELECTIONIS", "LONGITUDO"
]]

# Literals: double-quoted strings (non-greedy, no escape handling) and
# Roman numerals.
data_tokens = [
    ("DATA_STRING", r"\".*?\""),
    ("DATA_NUMERAL", r"[IVXLCDM]+")
]

# Importable module names.
module_tokens = [("MODULE", i) for i in [
    "FORS", "FRACTIO", "MAGNUM", "SUBNULLA"
]]

# Punctuation and arithmetic operators.
symbol_tokens = [
    ("SYMBOL_LPARENS", r"\("),
    ("SYMBOL_RPARENS", r"\)"),
    ("SYMBOL_LBRACKET", r"\["),
    ("SYMBOL_RBRACKET", r"\]"),
    ("SYMBOL_LCURL", r"\{"),
    ("SYMBOL_RCURL", r"\}"),
    ("SYMBOL_PLUS", r"\+"),
    ("SYMBOL_MINUS", r"\-"),
    ("SYMBOL_TIMES", r"\*"),
    ("SYMBOL_DIVIDE", r"\/")
]

# Newlines are significant (statement separators) and are emitted as a
# token; runs of newlines collapse into one NEWLINE.
whitespace_tokens = [
    ("NEWLINE", r"\n+")
]

# Full rule table, in priority order: keywords before DATA_NUMERAL so that
# e.g. "MINUS" lexes as a keyword rather than Roman-numeral fragments, and
# the catch-all lower-case ID rule last.
all_tokens = (
    keyword_tokens
    + builtin_tokens
    + module_tokens
    + symbol_tokens
    + data_tokens
    + whitespace_tokens
    + [("ID", f"({valid_characters})+")]
)


class Lexer():
    """Builds an rply lexer pre-loaded with the language's token rules."""

    def __init__(self):
        self.lexer = LexerGenerator()
        # Guard so the rule table is registered at most once; without it,
        # calling get_lexer() twice appended a duplicate copy of every rule
        # to the same generator.
        self._tokens_added = False

    def _add_tokens(self):
        """Register all token rules with the generator (idempotent)."""
        if self._tokens_added:
            return
        for name, pattern in all_tokens:
            self.lexer.add(name, pattern)
        # Spaces only separate tokens; newlines are NOT ignored (see
        # whitespace_tokens above).
        self.lexer.ignore(r" +")
        self._tokens_added = True

    def get_lexer(self):
        """Return a built rply lexer. Safe to call more than once."""
        self._add_tokens()
        return self.lexer.build()