import re

# Rule names must be plain identifiers because they are joined with '.' to
# build token names (e.g. "string.start").
valid_name_re = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$')

# Suffixes the region rules append to their own name; user rules may not
# use them.
reserved_names = ['start', 'null', 'end']


def _check_rule_name(name):
    """Assert that *name* is a legal, non-reserved rule name."""
    assert valid_name_re.match(name), 'invalid name %r' % name
    assert name not in reserved_names, "reserved rule name: %r" % name


def _as_context(context):
    """Return *context* as a fresh list, treating None as "no context"."""
    if context is None:
        return []
    return list(context)


class Token(object):
    """A piece of lexed text.

    Attributes:
        name:   dotted token name, e.g. "string.start".
        y, x:   line index and column at which the token starts.
        string: the token's text.
        vargs:  any extra keyword arguments given at construction.
    """

    def __init__(self, name, y, x, s, **vargs):
        self.name = name
        self.y = y
        self.x = x
        self.string = s
        self.vargs = vargs

    def add_to_string(self, s):
        """Append *s* to the token's text."""
        self.string += s

    def __repr__(self):
        # Show at most the first 10 characters of the text.
        if len(self.string) < 10:
            s = self.string
        else:
            s = self.string[:10] + '...'
        # The original format string was empty, so every repr() call raised
        # TypeError; give it the four placeholders the arguments require.
        return "<Token(%r, %d, %d, %r)>" % (self.name, self.y, self.x, s)

    def render(self):
        """Return the token wrapped in a 1-tuple."""
        return (self,)


class Rule:
    """Base class for lexing rules; subclasses override match()."""

    name = 'abstract'

    def match(self, lexer, context=None):
        """Try to match at the lexer's current position.

        Implementations append their token(s) to the lexer, advance
        lexer.x (and possibly lexer.y), and return True on a match or
        False otherwise.  The base implementation always raises.
        """
        raise Exception("%s rule cannot match!" % self.name)

    def make_token(self, lexer, s, name, **vargs):
        """Build a Token called *name* with text *s* at the lexer position."""
        return Token(name, lexer.y, lexer.x, s, **vargs)


class ConstantRule(Rule):
    """Matches one fixed string."""

    def __init__(self, name, constant):
        _check_rule_name(name)
        self.name = name
        self.constant = constant

    def match(self, lexer, context=None):
        context = _as_context(context)
        if lexer.lines[lexer.y].startswith(self.constant, lexer.x):
            name = '.'.join(context + [self.name])
            lexer.add_token(self.make_token(lexer, self.constant, name))
            lexer.x += len(self.constant)
            return True
        return False


class PatternRule(Rule):
    """Matches a regular expression anchored at the current column."""

    def __init__(self, name, pattern):
        _check_rule_name(name)
        self.name = name
        self.pattern = pattern
        self.re = re.compile(pattern)

    def match(self, lexer, context=None):
        context = _as_context(context)
        m = self.re.match(lexer.lines[lexer.y], lexer.x)
        if m:
            name = '.'.join(context + [self.name])
            lexer.add_token(self.make_token(lexer, m.group(0), name))
            lexer.x += len(m.group(0))
            return True
        return False


class _RegionBase(Rule):
    """Shared machinery for rules that lex a delimited, possibly
    multi-line region with a sub-grammar."""

    def _add_from_regex(self, context, name, lexer, m):
        """Emit a "<context>.<rule>.<name>" token for match *m* and step
        the lexer past the matched text."""
        t_name = '.'.join(context + [self.name, name])
        t = self.make_token(lexer, m.group(0), t_name)
        lexer.add_token(t)
        lexer.x += len(m.group(0))

    def _consume(self, lexer, context, grammar, stop_re, stop_name):
        """Lex with *grammar* until *stop_re* matches or the input ends.

        Characters no sub-rule accepts are gathered into
        "<context>.<rule>.null" tokens, which never span lines; an empty
        line yields one empty null token.

        Returns the match object of the terminator (whose token has been
        emitted as *stop_name*), leaving the lexer just after it on the
        same line, or None if the lines ran out first.  *stop_re* may be
        None, in which case the region always runs to the end of input.
        """
        null_t_name = '.'.join(context + [self.name, 'null'])
        sub_context = context + [self.name]

        while lexer.y < len(lexer.lines):
            y = lexer.y
            line = lexer.lines[y]
            null_t = None
            if len(line) == 0:
                lexer.add_token(Token(null_t_name, lexer.y, lexer.x, ''))

            # A nested multi-line rule may move lexer.y; when that happens
            # leave this loop so `line` is re-read instead of indexing the
            # old line with the new column.
            while lexer.y == y and lexer.x < len(line):
                if stop_re is not None:
                    m = stop_re.match(line, lexer.x)
                    if m:
                        self._add_from_regex(context, stop_name, lexer, m)
                        # Stay on this line: whatever follows the
                        # terminator still has to be lexed by the caller.
                        return m

                for rule in grammar.rules:
                    if rule.match(lexer, sub_context):
                        # A real token interrupts any null run.
                        null_t = None
                        break
                else:
                    if null_t is None:
                        null_t = Token(null_t_name, lexer.y, lexer.x, '')
                        lexer.add_token(null_t)
                    null_t.add_to_string(line[lexer.x])
                    lexer.x += 1

            # Only step to the next line if this one was exhausted here
            # (not if a nested rule already moved us elsewhere).
            if lexer.y == y:
                lexer.y += 1
                lexer.x = 0
        return None


class RegionRule(_RegionBase):
    """Matches `start`, lexes the interior with `grammar`, stops at `end`.

    `end` is a %-format template filled from the named groups of the
    start match before being compiled.  A false `end` means the region
    extends to the end of the input.
    """

    def __init__(self, name, start, grammar, end):
        _check_rule_name(name)
        self.name = name
        self.start = start
        self.grammar = grammar
        self.end = end
        self.start_re = re.compile(start)

    def match(self, lexer, context=None):
        context = _as_context(context)
        m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
        if not m:
            return False
        self._add_from_regex(context, 'start', lexer, m)

        end_re = None
        if self.end:
            end_re = re.compile(self.end % m.groupdict())
        self._consume(lexer, context, self.grammar, end_re, 'end')
        return True


class DualRegionRule(_RegionBase):
    """Matches `start`, lexes with `grammar1` up to `middle`, then lexes
    with `grammar2` up to `end`.

    `middle` is a %-format template filled from the start match's named
    groups; `end` is filled from the named groups of both the start and
    middle matches (middle wins on a clash).
    """

    def __init__(self, name, start, grammar1, middle, grammar2, end):
        _check_rule_name(name)
        self.name = name
        self.start = start
        self.grammar1 = grammar1
        self.middle = middle
        self.grammar2 = grammar2
        self.end = end
        self.start_re = re.compile(start)

    def match(self, lexer, context=None):
        context = _as_context(context)
        m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
        if not m:
            return False
        self._add_from_regex(context, 'start', lexer, m)

        d1 = m.groupdict()
        middle_re = re.compile(self.middle % d1)
        middle_m = self._consume(lexer, context, self.grammar1,
                                 middle_re, 'middle')
        if middle_m is None:
            # Input ended before the middle delimiter: there is nothing
            # left for the second half, and the end template may refer to
            # groups that were never captured.
            return True

        end_re = None
        if self.end:
            d3 = dict(d1)
            d3.update(middle_m.groupdict())
            end_re = re.compile(self.end % d3)
        self._consume(lexer, context, self.grammar2, end_re, 'end')
        return True


class Grammar:
    """Holds the ordered list of rules to try; subclasses override `rules`."""
    rules = []


class Lexer:
    """Iterator producing Tokens from a list of lines using a Grammar.

    Call lex(lines) first, then iterate.  Rules are tried in order at each
    position; characters no rule accepts are collected into tokens named
    'null'.
    """

    def __init__(self, name, grammar):
        self.name = name
        self.grammar = grammar
        self.y = 0
        self.x = 0
        self.lines = None
        self.tokens = []

    def add_token(self, t):
        """Queue token *t* for delivery."""
        self.tokens.append(t)

    def lex(self, lines, y=0, x=0):
        """Point the lexer at *lines*, starting at line *y*, column *x*,
        and discard any queued tokens."""
        self.y = y
        self.x = x
        self.lines = lines
        self.tokens = []

    def __iter__(self):
        if self.lines is None:
            raise Exception("no lines to lex")
        return self

    def next(self):
        """Return the next token, raising StopIteration at end of input.

        Lexing proceeds until some rule matches (or the input ends); the
        oldest queued token is then returned, so tokens always come out in
        source order.
        """
        null_t_name = 'null'
        while self.y < len(self.lines):
            line = self.lines[self.y]
            # Start a fresh null run on every line so a null token never
            # holds characters from more than one line.
            null_t = None
            while self.x < len(line):
                for rule in self.grammar.rules:
                    if rule.match(self):
                        assert self.tokens, "AAAAA %s" % repr(self.tokens)
                        return self.tokens.pop(0)
                if null_t is None:
                    null_t = Token(null_t_name, self.y, self.x, '')
                    self.add_token(null_t)
                null_t.add_to_string(line[self.x])
                self.x += 1
            self.y += 1
            self.x = 0

        if self.tokens:
            return self.tokens.pop(0)
        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next