diff --git a/lex2.py b/lex2.py
index 4b56f88..714f7fd 100755
--- a/lex2.py
+++ b/lex2.py
@@ -1,12 +1,15 @@
 import re
 
-class Token:
-    def __init__(self, rule, y, x, s, role='single'):
-        self.rule = rule
+valid_name_re = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$')
+reserved_names = ['start', 'null', 'end']
+
+class Token(object):
+    def __init__(self, name, y, x, s, **vargs):
+        self.name = name
         self.y = y
         self.x = x
         self.string = s
-        self.role = role
+        self.vargs = vargs
     def add_to_string(self, s):
         self.string += s
     def __repr__(self):
@@ -14,203 +17,394 @@ class Token:
             s = self.string
         else:
             s = self.string[:10] + '...'
-        return "<Token(%r, %d, %d, %r)>" % (self.rule, self.y, self.x, s)
+        return "<Token(%r, %d, %d, %r)>" % (self.name, self.y, self.x, s)
+    def render(self):
+        return (self,)
 
 class Rule:
-    def __init__(self):
-        self.name = 'null'
-    def add_token(self, lexer, s, role='single'):
-        t = Token(self, lexer.y, lexer.x, s, role)
-        lexer.curr_tokens.append(t)
-        lexer.x += len(s)
-    def add_to_last_token(self, lexer, s):
-        assert lexer.curr_tokens
-        lexer.curr_tokens[-1].add_to_string(s)
-        lexer.x += len(s)
-    def match(self):
-        raise Exception, "not implemented"
-
-class NullRule(Rule):
-    def __init__(self):
-        self.name = 'null'
-    def match(self):
-        raise Exception, "null rule does not match!"
-
-class NewlineRule(Rule):
-    def __init__(self):
-        self.name = 'newline'
-    def match(self):
-        raise Exception, "newline rule does not match!"
+    name = 'abstract'
+    def match(self, lexer, context=[]):
+        raise Exception, "%s rule cannot match!" % self.name
+    def make_token(self, lexer, s, name, **vargs):
+        return Token(name, lexer.y, lexer.x, s, **vargs)
 
 class ConstantRule(Rule):
-    def __init__(self, name="unnamed_constant", const="foo"):
-        self.name = name
-        self.const = const
-    def match(self, lexer):
-        if lexer.lines[lexer.y][lexer.x:].startswith(self.const):
-            self.add_token(lexer, self.const)
+    def __init__(self, name, constant):
+        assert valid_name_re.match(name), 'invalid name %r' % name
+        assert name not in reserved_names, "reserved rule name: %r" % name
+        self.name = name
+        self.constant = constant
+    def match(self, lexer, context=[]):
+        if lexer.lines[lexer.y][lexer.x:].startswith(self.constant):
+            lexer.add_token(self.make_token(lexer, self.constant, self.name))
+            lexer.x += len(self.constant)
             return True
         else:
             return False
 
-class RegexRule(Rule):
-    def __init__(self, name="unnamed_regex", expr="[^ ]+"):
-        self.name = name
-        self.expr = expr
-        self.re = re.compile(expr)
-    def match(self, lexer):
+class PatternRule(Rule):
+    def __init__(self, name, pattern):
+        assert valid_name_re.match(name), 'invalid name %r' % name
+        assert name not in reserved_names, "reserved rule name: %r" % name
+        self.name = name
+        self.pattern = pattern
+        self.re = re.compile(pattern)
+    def match(self, lexer, context=[]):
         m = self.re.match(lexer.lines[lexer.y], lexer.x)
         if m:
-            self.add_token(lexer, m.group(0))
+            lexer.add_token(self.make_token(lexer, m.group(0), self.name))
+            lexer.x += len(m.group(0))
             return True
         else:
             return False
 
 class RegionRule(Rule):
-    def __init__(self, name, start, mid, end):
-        self.name = name
+    def __init__(self, name, start, grammar, end):
+        assert valid_name_re.match(name), 'invalid name %r' % name
+        assert name not in reserved_names, "reserved rule name: %r" % name
+        self.name = name
+        self.start = start
+        self.grammar = grammar
+        self.end = end
         self.start_re = re.compile(start)
-        self.mid_re = re.compile(mid)
-        self.end_re = re.compile(end)
-    def match(self, lexer):
-        lt = lexer.last_token
-        l = lexer.lines[lexer.y]
-        if lt is not None and lt.rule.name == self.name and lt.role != 'end':
-            saw_mid = False
-            while lexer.x < len(l):
-                m_end = self.end_re.match(l, lexer.x)
-                if m_end:
-                    self.add_token(lexer, m_end.group(0), 'end')
-                    return True
-                m_mid = self.mid_re.match(l, lexer.x)
-                if m_mid:
-                    s = m_mid.group(0)
-                else:
-                    s = l[lexer.x]
-                if saw_mid:
-                    self.add_to_last_token(lexer, s)
-                else:
-                    self.add_token(lexer, s, 'mid')
-                    saw_mid = True
-            return True
-        else:
-            m = self.start_re.match(l, lexer.x)
-            if m:
-                self.add_token(lexer, m.group(0), 'start')
-                return True
-            else:
-                return False
+    def _add_from_regex(self, context, name, lexer, m):
+        t_name = '.'.join(context + [self.name, name])
+        t = self.make_token(lexer, m.group(0), t_name)
+        lexer.add_token(t)
+        lexer.x += len(m.group(0))
+    def match(self, lexer, context=[]):
+        m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
+        if m:
+            self._add_from_regex(context, 'start', lexer, m)
 
-class DynamicRegionRule(Rule):
-    def __init__(self, name, start, mid, end_fmt):
-        self.name = name
-        self.start_re = re.compile(start)
-        self.mid_re = re.compile(mid)
-        self.end_fmt = end_fmt
-    def add_token(self, lexer, s, role, end_re):
-        t = Token(self, lexer.y, lexer.x, s, role)
-        t.end_re = end_re
-        lexer.curr_tokens.append(t)
-        lexer.x += len(s)
-    def match(self, lexer):
-        lt = lexer.last_token
-        l = lexer.lines[lexer.y]
-        if lt is not None and lt.rule.name == self.name and lt.role != 'end':
-            saw_mid = False
-            while lexer.x < len(l):
-                m_end = self.end_re.match(l, lexer.x)
-                if m_end:
-                    self.add_token(lexer, m_end.group(0), 'end', None)
-                    return True
-                m_mid = self.mid_re.match(l, lexer.x)
-                if m_mid:
-                    s = m_mid.group(0)
-                else:
-                    s = l[lexer.x]
-                if saw_mid:
-                    self.add_to_last_token(lexer, s)
-                else:
-                    self.add_token(lexer, s, 'mid', lt.end_re)
-                    saw_mid = True
+            null_t_name = '.'.join(context + [self.name, 'null'])
+            null_t = None
+
+            if self.end:
+                end_re = re.compile(self.end % m.groupdict())
+
+            done = False
+            while not done and lexer.y < len(lexer.lines):
+                line = lexer.lines[lexer.y]
+                if len(line) == 0:
+                    null_t = Token(null_t_name, lexer.y, lexer.x, '')
+                    lexer.add_token(null_t)
+                while not done and lexer.x < len(line):
+                    if self.end:
+                        m = end_re.match(line, lexer.x)
+                        if m:
+                            self._add_from_regex(context, 'end', lexer, m)
+                            done = True
+                            continue
+
+                    found = False
+                    for rule in self.grammar.rules:
+                        if rule.match(lexer, context + [self.name]):
+                            found = True
+                            null_t = None
+                            break
+                    if not found:
+                        if null_t is None:
+                            null_t = Token(null_t_name, lexer.y, lexer.x, '')
+                            lexer.add_token(null_t)
+                        null_t.add_to_string(line[lexer.x])
+                        lexer.x += 1
+
+                null_t = None
+                lexer.y += 1
+                lexer.x = 0
             return True
         else:
-            m = self.start_re.match(l, lexer.x)
-            if m:
-                end_re = re.compile(self.end_fmt % m.groups())
-                self.add_token(lexer, m.group(0), 'start', end_re)
-                return True
-            else:
-                return False
+            return False
+
+class Grammar:
+    rules = []
+
+class NullGrammar(Grammar):
+    pass
+
+class PodGrammar(Grammar):
+    pass
+
+class StringGrammar(Grammar):
+    rules = [
+        PatternRule('escaped', '\\.'),
+    ]
+
+class TestGrammar(Grammar):
+    rules = [
+        RegionRule(
+            name='heredoc',
+            start="<< *(?P<heredoc>[a-zA-Z0-9_]+) *;",
+            grammar=StringGrammar(),
+            end='^%(heredoc)s$',
+        ),
+        RegionRule(
+            name='string1',
+            start='"',
+            grammar=StringGrammar(),
+            end='"',
+        ),
+        RegionRule(
+            name='string2',
+            start="'",
+            grammar=StringGrammar(),
+            end="'",
+        ),
+        PatternRule(
+            name='word',
+            pattern='[^ \t\n]+',
+        ),
+    ]
+
+class PerlGrammar(Grammar):
+    rules = [
+        RegionRule(
+            name='heredoc',
+            start="<< *(?P<heredoc>[a-zA-Z0-9_]+) *;",
+            grammar=StringGrammar(),
+            end='^%(heredoc)s$',
+        ),
+        RegionRule(
+            name='endblock',
+            start="^__END__|__DATA__ *$",
+            grammar=NullGrammar(),
+            end='',
+        ),
+        RegionRule(
+            name='pod',
+            start='^=[a-zA-Z0-9_]+',
+            grammar=PodGrammar(),
+            end='^=cut',
+        ),
+        PatternRule(
+            name='comment',
+            pattern='#.*$',
+        ),
+        RegionRule(
+            name='string1',
+            start='"',
+            grammar=StringGrammar(),
+            end='"',
+        ),
+        RegionRule(
+            name='string2',
+            start="'",
+            grammar=StringGrammar(),
+            end="'",
+        ),
+        RegionRule(
+            name='evalstring',
+            start="`",
+            grammar=StringGrammar(),
+            end="`",
+        ),
+        PatternRule(
+            name='number',
+            pattern='0?\.[0-9]+|[0-9]+(?:\.[0-9]+)?',
+        ),
+        PatternRule(
+            name='keyword',
+            pattern="(?<!->)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z0-9_])",
+        ),
+        PatternRule(
+            name='hash_bareword_index',
+            pattern='(?<={) *[A-Za-z0-9_]+(?=})',
+        ),
+        PatternRule(
+            name='literal_hash_bareword_index',
+            pattern='[A-Za-z0-9_]+(?= *=>)',
+        ),
+        PatternRule(
+            name='length_scalar',
+            pattern=r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*",
+        ),
+        PatternRule(
+            name='system_scalar',
+            pattern=r"\$[][>