diff --git a/lex2.py b/lex2.py index 225b20f..d43e199 100755 --- a/lex2.py +++ b/lex2.py @@ -23,7 +23,7 @@ class Token(object): class Rule: name = 'abstract' - def match(self, lexer, context=[]): + def match(self, lexer, context=[], d={}): raise Exception, "%s rule cannot match!" % self.name def make_token(self, lexer, s, name, **vargs): return Token(name, lexer.y, lexer.x, s, **vargs) @@ -32,9 +32,9 @@ class ConstantRule(Rule): def __init__(self, name, constant): assert valid_name_re.match(name), 'invalid name %r' % name assert name not in reserved_names, "reserved rule name: %r" % name - self.name = name + self.name = name self.constant = constant - def match(self, lexer, context=[]): + def match(self, lexer, context=[], d={}): if lexer.lines[lexer.y][lexer.x:].startswith(self.constant): name = '.'.join(context + [self.name]) lexer.add_token(self.make_token(lexer, self.constant, name)) @@ -50,7 +50,7 @@ class PatternRule(Rule): self.name = name self.pattern = pattern self.re = re.compile(pattern) - def match(self, lexer, context=[]): + def match(self, lexer, context=[], d={}): m = self.re.match(lexer.lines[lexer.y], lexer.x) if m: name = '.'.join(context + [self.name]) @@ -60,6 +60,28 @@ class PatternRule(Rule): else: return False +class ContextPatternRule(Rule): + def __init__(self, name, pattern, fallback): + assert valid_name_re.match(name), 'invalid name %r' % name + assert name not in reserved_names, "reserved rule name: %r" % name + self.name = name + self.pattern = pattern + self.fallback = fallback + self.fallback_re = re.compile(fallback) + def match(self, lexer, context=[], d={}): + try: + r = re.compile(self.pattern % d) + except KeyError: + r = self.fallback_re + m = r.match(lexer.lines[lexer.y], lexer.x) + if m: + name = '.'.join(context + [self.name]) + lexer.add_token(self.make_token(lexer, m.group(0), name)) + lexer.x += len(m.group(0)) + return True + else: + return False + class RegionRule(Rule): def __init__(self, name, start, grammar, end): assert valid_name_re.match(name), 'invalid name %r' % name @@ -74,11 +96,12 @@ class RegionRule(Rule): t = self.make_token(lexer, m.group(0), t_name) lexer.add_token(t) lexer.x += len(m.group(0)) - def match(self, lexer, context=[]): + def match(self, lexer, context=[], d={}): m = self.start_re.match(lexer.lines[lexer.y], lexer.x) # see if we can match out start token if m: # ok, so create our start token, and get ready to start reading data + d = m.groupdict() self._add_from_regex(context, 'start', lexer, m) null_t_name = '.'.join(context + [self.name, 'null']) null_t = None @@ -87,7 +110,7 @@ class RegionRule(Rule): # reference named groups from the start token. if we have no end, # well, then, we're never getting out of here alive! if self.end: - end_re = re.compile(self.end % m.groupdict()) + end_re = re.compile(self.end % d) # ok, so as long as we aren't done (we haven't found an end token), # keep reading input @@ -117,7 +140,7 @@ class RegionRule(Rule): # find a token, note that we found one and exit the loop found = False for rule in self.grammar.rules: - if rule.match(lexer, context + [self.name]): + if rule.match(lexer, context + [self.name], d): found = True null_t = None break @@ -166,7 +189,7 @@ class DualRegionRule(Rule): t = self.make_token(lexer, m.group(0), t_name) lexer.add_token(t) lexer.x += len(m.group(0)) - def match(self, lexer, context=[]): + def match(self, lexer, context=[], d={}): m1 = self.start_re.match(lexer.lines[lexer.y], lexer.x) # see if we can match out start token if m1: @@ -208,7 +231,7 @@ class DualRegionRule(Rule): # find a token, note that we found one and exit the loop found = False for rule in self.grammar1.rules: - if rule.match(lexer, context + [self.name]): + if rule.match(lexer, context + [self.name], d1): found = True null_t = None break @@ -267,7 +290,7 @@ class DualRegionRule(Rule): # find a token, note that we found one and exit the loop found = False for rule in self.grammar2.rules: - if rule.match(lexer, context + [self.name]): + if rule.match(lexer, context + [self.name], d3): found = True null_t = None break diff --git a/lex2_perl.py b/lex2_perl.py index b618957..f144d08 100755 --- a/lex2_perl.py +++ b/lex2_perl.py @@ -1,4 +1,4 @@ -from lex2 import Grammar, ConstantRule, PatternRule, RegionRule, DualRegionRule +from lex2 import Grammar, ConstantRule, PatternRule, ContextPatternRule, RegionRule, DualRegionRule class PodGrammar(Grammar): rules = [ @@ -34,17 +34,14 @@ class StringGrammar(Grammar): name=r'hash_deref', pattern=r"\$\$*[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*(?:->{(?:[a-zA-Z_][a-zA-Z_0-9]*|'(?:\\.|[^'\\])*'|\"(\\.|[^\\\"])*\")})+", ), - #PatternRule( - # name=r'hash_bareword_index', - # pattern=r'(?<={) *[A-Za-z0-9_]+(?=})', - #), PatternRule( name=r'length_scalar', pattern=r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*", ), - PatternRule( + ContextPatternRule( name=r'system_scalar', - pattern=r"\$[][>/)', grammar=StringGrammar(), - end=r'/', + end=r'/[a-z]*', ), RegionRule( - name=r'explicit_match_regex1', + name=r'match_regex2', start=r'm *(?P[^ #a-zA-Z0-9_])', grammar=StringGrammar(), - end=r'%(delim)s', + end=r'%(delim)s[a-z]*', ), RegionRule( - name=r'explicit_match_regex1', - start=r'm#', + name=r'match_regex3', + start=r'm(?P#)', grammar=StringGrammar(), - end=r'#', + end=r'#[a-z]*', + ), + + # replace regexes + DualRegionRule( + name=r'replace_regex1', + start=r's *(?P[^ a-zA-Z0-9_])', + grammar1=StringGrammar(), + middle=r'%(delim)s', + grammar2=StringGrammar(), + end=r'%(delim)s[a-z]*', ), DualRegionRule( - name=r'replace_regex', - start=r's */', + name=r'replace_regex2', + start=r's#', grammar1=StringGrammar(), - middle=r' */ *', + middle=r'#', grammar2=StringGrammar(), - end=r'/ *[a-z]*', + end=r'#[a-z]*', ), PatternRule( @@ -219,7 +228,8 @@ class PerlGrammar(Grammar): ), PatternRule( name=r'sub', - pattern=r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?= *{)", + #pattern=r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?= *{)", + pattern=r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*", ), PatternRule( name=r'use', diff --git a/test2.py b/test2.py index baa6ac9..394cbee 100644 --- a/test2.py +++ b/test2.py @@ -15,4 +15,4 @@ for path in paths: lexer.lex(lines) print path for token in lexer: - print '%-28s| %s' % (token.name, token.string) + print '%-28s| %r' % (token.name, token.string) diff --git a/test3.py b/test3.py index 2915662..4b5f86f 100644 --- a/test3.py +++ b/test3.py @@ -17,12 +17,17 @@ for i in range(0, len(color_list)): color_dict[color_names[i]] = color_list[i] token_colors = { + 'escaped': 'lpurple', 'null': 'white', 'delimiter': 'white', 'pod.start': 'lred', 'pod.null': 'lred', 'pod.end': 'lred', 'pod.header': 'lpurple', + 'pod.indent_level': 'lpurple', + 'pod.item_entry': 'lpurple', + 'pod.format': 'lpurple', + 'pod.encoding_type': 'lpurple', 'sub': 'lcyan', 'number': 'white', 'operator': 'white', @@ -54,12 +59,39 @@ token_colors = { 'array': 'yellow', 'hash': 'yellow', 'bareword_hash_index': 'lgreen', - 'quoted_region': 'lcyan', - 'match_regex': 'lcyan', - 'replace_regex.start': 'lcyan', - 'replace_regex.middle': 'lcyan', - 'replace_regex.end': 'lcyan', - 'replace_regex.null': 'lcyan', + + # quoted region + 'quoted_region1': 'lcyan', + 'quoted_region1.start': 'lcyan', + 'quoted_region1.null': 'lcyan', + 'quoted_region1.end': 'lcyan', + 'quoted_region2': 'lcyan', + 'quoted_region2.start': 'lcyan', + 'quoted_region2.null': 'lcyan', + 'quoted_region2.end': 'lcyan', + + # match regex + 'match_regex1.start': 'lcyan', + 'match_regex1.end': 'lcyan', + 'match_regex1.null': 'lcyan', + 'match_regex2.start': 'lcyan', + 'match_regex2.end': 'lcyan', + 'match_regex2.null': 'lcyan', + 'match_regex3.start': 'lcyan', + 'match_regex3.end': 'lcyan', + 'match_regex3.null': 'lcyan', + + # replace regex + 'replace_regex1.start': 'lcyan', + 'replace_regex1.middle': 'lcyan', + 'replace_regex1.end': 'lcyan', + 'replace_regex1.null': 'lcyan', + 'replace_regex2.start': 'lcyan', + 'replace_regex2.middle': 'lcyan', + 'replace_regex2.end': 'lcyan', + 'replace_regex2.null': 'lcyan', + + # 'bareword_hash_key': 'lgreen', 'interpolated_scalar': 'yellow', 'interpolated_system_scalar': 'yellow', @@ -76,7 +108,7 @@ token_colors = { 'static_method': 'lcyan', 'builtin_method': 'lpurple', 'bareword_method': 'lcyan', - #'bareword': 'yellow', + #'bareword': 'yellow', 'bizzaro': 'lpurple', }