"""Rule classes and grammar tables for a small line-oriented lexer.

Every rule works against a "lexer" object that provides:

  lexer.lines      -- an indexable sequence of strings (one per line)
  lexer.y          -- index of the current line
  lexer.x          -- offset of the cursor within that line
  lexer.add_token  -- a callable that receives each Token produced

A rule's match() either consumes text at the cursor (emitting tokens and
moving the cursor) and returns True, or leaves the lexer alone and returns
False.
"""
import re

# Rule names must look like identifiers.
valid_name_re = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$')

# RegionRule appends these suffixes to its own name for the tokens it emits
# itself, so no rule may use them as a name.
reserved_names = ['start', 'null', 'end']


def _check_rule_name(name):
    """Assert that *name* is identifier-like and not a reserved suffix."""
    assert valid_name_re.match(name), 'invalid name %r' % name
    assert name not in reserved_names, "reserved rule name: %r" % name


class Token(object):
    """A named piece of lexed text.

    name   -- the token's name (RegionRule builds dotted names such as
              'string1.start')
    y, x   -- position recorded by the creator; the rules in this module
              pass the lexer's line index and offset at creation time
    string -- the text collected so far
    vargs  -- any extra keyword arguments, stored as a dict
    """

    def __init__(self, name, y, x, s, **vargs):
        self.name = name
        self.y = y
        self.x = x
        self.string = s
        self.vargs = vargs

    def add_to_string(self, s):
        """Append *s* to the token's text."""
        self.string += s

    def __repr__(self):
        # Only the first ten characters are shown so the repr stays short.
        if len(self.string) < 10:
            s = self.string
        else:
            s = self.string[:10] + '...'
        # The format string needs one conversion per value; the empty string
        # found here before raised TypeError on every call.
        return "<Token %r: (%d, %d) %r>" % (self.name, self.y, self.x, s)

    def render(self):
        """Return a 1-tuple containing this token."""
        return (self,)


class Rule:
    """Base class for rules; subclasses override match()."""

    name = 'abstract'

    def match(self, lexer, context=None):
        """Always raises Exception: the base rule cannot match anything."""
        # Call form of raise: valid on both Python 2 and Python 3.
        raise Exception("%s rule cannot match!" % self.name)

    def make_token(self, lexer, s, name, **vargs):
        """Build a Token called *name* holding *s* at the lexer's cursor."""
        return Token(name, lexer.y, lexer.x, s, **vargs)


class ConstantRule(Rule):
    """Match one fixed string at the cursor."""

    def __init__(self, name, constant):
        _check_rule_name(name)
        self.name = name
        self.constant = constant

    def match(self, lexer, context=None):
        """Emit a token and advance if the constant starts at the cursor.

        Returns True on a match, False otherwise.  *context* is accepted
        for interface compatibility and is not used.
        """
        # startswith(prefix, start) tests at the cursor without slicing.
        if lexer.lines[lexer.y].startswith(self.constant, lexer.x):
            lexer.add_token(self.make_token(lexer, self.constant, self.name))
            lexer.x += len(self.constant)
            return True
        return False


class PatternRule(Rule):
    """Match a regular expression at the cursor."""

    def __init__(self, name, pattern):
        _check_rule_name(name)
        self.name = name
        self.pattern = pattern
        self.re = re.compile(pattern)

    def match(self, lexer, context=None):
        """Emit a token and advance if the pattern matches at the cursor.

        Returns True on a match, False otherwise.  *context* is accepted
        for interface compatibility and is not used.
        """
        m = self.re.match(lexer.lines[lexer.y], lexer.x)
        if m:
            lexer.add_token(self.make_token(lexer, m.group(0), self.name))
            lexer.x += len(m.group(0))
            return True
        return False


class RegionRule(Rule):
    """Match a delimited region, possibly spanning several lines.

    start   -- regex that opens the region
    grammar -- object whose .rules are tried inside the region
    end     -- regex template that closes the region; it is %-formatted
               with the named groups of the start match before being
               compiled.  An empty value means the region never closes and
               runs to the last line.
    """

    def __init__(self, name, start, grammar, end):
        _check_rule_name(name)
        self.name = name
        self.start = start
        self.grammar = grammar
        self.end = end
        self.start_re = re.compile(start)

    def _add_from_regex(self, context, name, lexer, m):
        """Emit the text of match *m* as '<context>.<self.name>.<name>'."""
        t_name = '.'.join(context + [self.name, name])
        t = self.make_token(lexer, m.group(0), t_name)
        lexer.add_token(t)
        lexer.x += len(m.group(0))

    def match(self, lexer, context=None):
        """Lex a whole region starting at the cursor.

        Emits a '.start' token, then tokens from the inner grammar; text no
        inner rule accepts is gathered into '.null' tokens, and the closing
        delimiter becomes an '.end' token.  Returns False, touching nothing,
        when the start pattern does not match.

        On return the cursor sits just past the end delimiter (or past the
        last line when the region never closed).  That may be on a later
        line than the one where matching began, so a driver has to re-read
        lexer.y afterwards.
        """
        if context is None:
            context = []

        m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
        if not m:
            return False
        self._add_from_regex(context, 'start', lexer, m)

        null_t_name = '.'.join(context + [self.name, 'null'])
        null_t = None
        inner_context = context + [self.name]

        # The end pattern may refer to named groups of the start match,
        # e.g. a heredoc terminator.
        end_re = None
        if self.end:
            end_re = re.compile(self.end % m.groupdict())

        done = False
        while not done and lexer.y < len(lexer.lines):
            y = lexer.y
            line = lexer.lines[y]
            if len(line) == 0:
                # An empty line inside the region still gets a null token.
                null_t = Token(null_t_name, lexer.y, lexer.x, '')
                lexer.add_token(null_t)

            # 'lexer.y == y': an inner rule may itself move to another line,
            # after which 'line' is stale and has to be fetched again.
            while not done and lexer.y == y and lexer.x < len(line):
                if end_re is not None:
                    m = end_re.match(line, lexer.x)
                    if m:
                        self._add_from_regex(context, 'end', lexer, m)
                        done = True
                        continue

                for rule in self.grammar.rules:
                    if rule.match(lexer, inner_context):
                        # A real token interrupts the current null run.
                        null_t = None
                        break
                else:
                    # No rule matched: add one character to the null token.
                    if null_t is None:
                        null_t = Token(null_t_name, lexer.y, lexer.x, '')
                        lexer.add_token(null_t)
                    null_t.add_to_string(line[lexer.x])
                    lexer.x += 1

            # Null tokens never continue across a line boundary.
            null_t = None
            # Move to the next line only while the region is still open.
            # Advancing after the end delimiter would throw away whatever
            # follows it on the same line.
            if not done and lexer.y == y:
                lexer.y += 1
                lexer.x = 0
        return True


class Grammar:
    """A list of rules, tried in order."""
    rules = []


class NullGrammar(Grammar):
    """Grammar with no rules."""
    pass


class PodGrammar(Grammar):
    """Grammar used inside the 'pod' region; it has no rules."""
    pass


class StringGrammar(Grammar):
    """Grammar used inside quoted regions."""
    rules = [
        # A backslash followed by any one character.  The pattern used to
        # be '\\.', which as a regex is just a literal dot.
        PatternRule('escaped', r'\\.'),
    ]


class TestGrammar(Grammar):
    """Small grammar: heredocs, two kinds of string, and words."""
    rules = [
        RegionRule(
            name='heredoc',
            # The group has to be called 'heredoc' because the end template
            # below substitutes %(heredoc)s.
            start="<< *(?P<heredoc>[a-zA-Z0-9_]+) *;",
            grammar=StringGrammar(),
            end='^%(heredoc)s$',
        ),
        RegionRule(
            name='string1',
            start='"',
            grammar=StringGrammar(),
            end='"',
        ),
        RegionRule(
            name='string2',
            start="'",
            grammar=StringGrammar(),
            end="'",
        ),
        PatternRule(
            name='word',
            pattern='[^ \t\n]+',
        ),
    ]


class PerlGrammar(Grammar):
    """Rules for lexing Perl source.

    NOTE(review): the text this class was recovered from breaks off part
    way through the rule list, so the list below is incomplete.
    """
    rules = [
        RegionRule(
            name='heredoc',
            # Group name restored to match %(heredoc)s in the end template.
            start="<< *(?P<heredoc>[a-zA-Z0-9_]+) *;",
            grammar=StringGrammar(),
            end='^%(heredoc)s$',
        ),
        RegionRule(
            name='endblock',
            start="^__END__|__DATA__ *$",
            grammar=NullGrammar(),
            # Empty end: the region runs to the last line.
            end='',
        ),
        RegionRule(
            name='pod',
            start='^=[a-zA-Z0-9_]+',
            grammar=PodGrammar(),
            end='^=cut',
        ),
        PatternRule(
            name='comment',
            pattern='#.*$',
        ),
        RegionRule(
            name='string1',
            start='"',
            grammar=StringGrammar(),
            end='"',
        ),
        RegionRule(
            name='string2',
            start="'",
            grammar=StringGrammar(),
            end="'",
        ),
        RegionRule(
            name='evalstring',
            start="`",
            grammar=StringGrammar(),
            end="`",
        ),
        PatternRule(
            name='number',
            pattern=r'0?\.[0-9]+|[0-9]+(?:\.[0-9]+)?',
        ),
        PatternRule(
            name='keyword',
            # NOTE(review): this pattern began with "(?)", which is not a
            # valid regex.  It is presumably the remains of a lookbehind
            # that stops keywords matching after "->"; confirm against the
            # upstream file.
            pattern="(?<!->)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z0-9_])",
        ),
        PatternRule(
            name='hash_bareword_index',
            pattern='(?<={) *[A-Za-z0-9_]+(?=})',
        ),
        PatternRule(
            name='literal_hash_bareword_index',
            pattern='[A-Za-z0-9_]+(?= *=>)',
        ),
        PatternRule(
            name='length_scalar',
            pattern=r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*",
        ),
        # NOTE(review): the source text ends inside the next rule:
        #     PatternRule( name='system_scalar', pattern=r"\$[][>
        # That rule and anything after it could not be recovered and must
        # be restored from the upstream file.
    ]