import re

# Rule names must be identifier-like so that dotted token names
# ('region.start', 'region.null', ...) can be split unambiguously.
valid_name_re = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$')

# Names used internally for the sub-tokens of region rules; user-defined
# rules may not use them.
reserved_names = ['start', 'middle', 'end', 'null']


def _check_rule_name(name):
    """Assert that *name* is a legal, non-reserved rule name.

    Raises AssertionError (same messages as the historical inline asserts)
    when the name is malformed or collides with a reserved name.
    """
    assert valid_name_re.match(name), 'invalid name %r' % name
    assert name not in reserved_names, "reserved rule name: %r" % name


class Token(object):
    """A lexed piece of text.

    Attributes:
        name:   dotted token name, e.g. 'string.start'.
        y, x:   line index and column at which the token begins.
        string: the text covered by the token.
        vargs:  any extra keyword arguments given at construction.
    """

    def __init__(self, name, y, x, s, **vargs):
        self.name = name
        self.y = y
        self.x = x
        self.string = s
        self.vargs = vargs

    def add_to_string(self, s):
        """Append *s* to the token's text (used to grow null tokens)."""
        self.string += s

    def __repr__(self):
        # Keep the repr short: show at most the first 10 characters.
        if len(self.string) < 10:
            s = self.string
        else:
            s = self.string[:10] + '...'
        # The format string used to be empty, which made every repr() call
        # raise TypeError ("not all arguments converted").
        return "<Token(%r, %d, %d, %r)>" % (self.name, self.y, self.x, s)

    def render(self):
        """Return the token wrapped in a 1-tuple."""
        return (self,)


class Rule:
    """Abstract base for lexer rules.

    Subclasses implement match(lexer, context), which must either consume
    input at (lexer.y, lexer.x), add at least one token to the lexer and
    return True, or leave the lexer untouched and return False.
    """

    name = 'abstract'

    def match(self, lexer, context=None):
        """Always raises: the abstract rule cannot match anything."""
        raise Exception("%s rule cannot match!" % self.name)

    def make_token(self, lexer, s, name, **vargs):
        """Build a Token for text *s* at the lexer's current position."""
        return Token(name, lexer.y, lexer.x, s, **vargs)


class ConstantRule(Rule):
    """Rule matching one fixed string at the current position."""

    def __init__(self, name, constant):
        _check_rule_name(name)
        self.name = name
        self.constant = constant

    def match(self, lexer, context=None):
        """Emit a token and advance if the line continues with the constant.

        Returns True on a match, False otherwise.
        """
        if context is None:
            context = []
        if not lexer.lines[lexer.y][lexer.x:].startswith(self.constant):
            return False
        name = '.'.join(context + [self.name])
        lexer.add_token(self.make_token(lexer, self.constant, name))
        lexer.x += len(self.constant)
        return True


class PatternRule(Rule):
    """Rule matching a regular expression at the current position."""

    def __init__(self, name, pattern):
        _check_rule_name(name)
        self.name = name
        self.pattern = pattern
        self.re = re.compile(pattern)

    def match(self, lexer, context=None):
        """Emit a token and advance if the pattern matches at lexer.x.

        Returns True on a match, False otherwise.
        """
        if context is None:
            context = []
        m = self.re.match(lexer.lines[lexer.y], lexer.x)
        if not m:
            return False
        name = '.'.join(context + [self.name])
        lexer.add_token(self.make_token(lexer, m.group(0), name))
        lexer.x += len(m.group(0))
        return True


class _RegionBase(Rule):
    """Shared machinery for rules that lex a delimited, multi-line region."""

    def _add_from_regex(self, context, name, lexer, m):
        """Emit '<context>.<rule>.<name>' for match *m* and skip past it."""
        t_name = '.'.join(context + [self.name, name])
        t = self.make_token(lexer, m.group(0), t_name)
        lexer.add_token(t)
        lexer.x += len(m.group(0))

    def _lex_until(self, lexer, context, grammar, stop_re, stop_name):
        """Lex with *grammar* until *stop_re* matches or input runs out.

        Text that no rule of *grammar* matches is collected into
        '<context>.<rule>.null' tokens, which never span lines. An empty
        line yields one empty null token. When *stop_re* matches, a
        *stop_name* token is emitted for it.

        *stop_re* may be None, in which case the region only ends at the
        end of the input. *grammar* is only dereferenced when a sub-rule
        actually needs to be tried.

        Returns the match object of the stop token, or None if the input
        was exhausted first.
        """
        null_t_name = '.'.join(context + [self.name, 'null'])
        sub_context = context + [self.name]
        null_t = None
        stop_match = None

        while stop_match is None and lexer.y < len(lexer.lines):
            old_y = lexer.y

            # An empty line gets an empty null token so that every line of
            # the region is represented by at least one token.
            if len(lexer.lines[lexer.y]) == 0:
                lexer.add_token(Token(null_t_name, lexer.y, lexer.x, ''))

            # Sub-rules may themselves move lexer.y (nested regions), so the
            # line check must come first to avoid indexing past the input.
            while lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]):
                # The stop token takes priority over the region's grammar.
                if stop_re is not None:
                    m = stop_re.match(lexer.lines[lexer.y], lexer.x)
                    if m:
                        self._add_from_regex(context, stop_name, lexer, m)
                        stop_match = m
                        break

                # Try every rule in order; the first to match wins.
                for rule in grammar.rules:
                    if rule.match(lexer, sub_context):
                        # A real token interrupts the running null token.
                        null_t = None
                        break
                else:
                    # Nothing matched: grow (or start) the null token by
                    # exactly one character.
                    if null_t is None:
                        null_t = Token(null_t_name, lexer.y, lexer.x, '')
                        lexer.add_token(null_t)
                    null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                    lexer.x += 1

            # Null tokens never continue onto another line.
            null_t = None

            # Still on the same line and not finished: the line is used up.
            if stop_match is None and old_y == lexer.y:
                lexer.y += 1
                lexer.x = 0

        return stop_match


class RegionRule(_RegionBase):
    """Rule for a region delimited by a start and an (optional) end regex.

    The end pattern is %-formatted with the named groups of the start
    match, so it can refer back to what the start token captured. If *end*
    is false, the region extends to the end of the input.
    """

    def __init__(self, name, start, grammar, end):
        _check_rule_name(name)
        self.name = name
        self.start = start
        self.grammar = grammar
        self.end = end
        self.start_re = re.compile(start)

    def match(self, lexer, context=None):
        """Lex a whole region if its start pattern matches at lexer.x.

        Emits '.start', any inner / '.null' tokens and (if found) '.end'.
        Returns True if the region was started, False otherwise.
        """
        if context is None:
            context = []
        m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
        if not m:
            # Region was not matched; we never started.
            return False

        self._add_from_regex(context, 'start', lexer, m)

        # The end pattern may reference named groups of the start token.
        end_re = None
        if self.end:
            end_re = re.compile(self.end % m.groupdict())

        self._lex_until(lexer, context, self.grammar, end_re, 'end')
        return True


class DualRegionRule(_RegionBase):
    """Rule for two consecutive regions: start .. middle .. end.

    grammar1 applies between start and middle, grammar2 between middle and
    end. The middle pattern is %-formatted with the start match's named
    groups; the end pattern with those of both start and middle (middle
    wins on name clashes).
    """

    def __init__(self, name, start, grammar1, middle, grammar2, end):
        _check_rule_name(name)
        self.name = name
        self.start = start
        self.grammar1 = grammar1
        self.middle = middle
        self.grammar2 = grammar2
        self.end = end
        self.start_re = re.compile(start)

    def match(self, lexer, context=None):
        """Lex both stages if the start pattern matches at lexer.x.

        Returns True if the region was started, False otherwise.
        """
        if context is None:
            context = []
        m1 = self.start_re.match(lexer.lines[lexer.y], lexer.x)
        if not m1:
            # Dual region was not matched; we never started.
            return False

        self._add_from_regex(context, 'start', lexer, m1)

        # Stage 1: read until the middle token.
        d1 = m1.groupdict()
        middle_re = re.compile(self.middle % d1)
        m2 = self._lex_until(lexer, context, self.grammar1, middle_re,
                             'middle')

        # Stage 2: like stage 1, but looking for the end token. Built with
        # copy + update because dict views cannot be added on Python 3.
        d3 = dict(d1)
        if m2 is not None:
            d3.update(m2.groupdict())
        end_re = re.compile(self.end % d3)
        self._lex_until(lexer, context, self.grammar2, end_re, 'end')
        return True


class Grammar:
    """A list of rules, tried in order.

    Any rule whose 'grammar' attribute is None is pointed back at this
    grammar on construction, which lets a region reuse the grammar that
    contains it.
    """

    rules = []

    def __init__(self):
        for rule in self.rules:
            if hasattr(rule, 'grammar') and rule.grammar is None:
                rule.grammar = self


class Lexer:
    """Iterator producing Tokens for a list of lines using a Grammar.

    Call lex(lines) first, then iterate over the lexer.
    """

    def __init__(self, name, grammar):
        self.name = name
        self.grammar = grammar
        self.y = 0
        self.x = 0
        self.lines = None
        self.tokens = []

    def add_token(self, t):
        """Queue token *t* for delivery by next()."""
        self.tokens.append(t)

    def lex(self, lines, y=0, x=0):
        """Reset the lexer to start reading *lines* at line y, column x."""
        self.y = y
        self.x = x
        self.lines = lines
        self.tokens = []

    def __iter__(self):
        if self.lines is None:
            raise Exception("no lines to lex")
        return self

    def next(self):
        """Return the next token; raise StopIteration when input is done.

        Input is consumed until some rule matches (or the input ends), and
        the oldest queued token is returned. Text matched by no rule is
        collected into 'null' tokens, one per run of unmatched characters
        on a line.
        """
        null_t_name = 'null'
        null_t = None

        while self.y < len(self.lines):
            line = self.lines[self.y]
            while self.x < len(line):
                for rule in self.grammar.rules:
                    if rule.match(self):
                        assert self.tokens, "AAAAA %s" % repr(self.tokens)
                        return self.tokens.pop(0)
                if null_t is None:
                    null_t = Token(null_t_name, self.y, self.x, '')
                    self.add_token(null_t)
                null_t.add_to_string(line[self.x])
                self.x += 1
            # A null token must not run on into the next line: its (y, x)
            # would no longer describe its text. The region rules reset
            # their null token the same way.
            null_t = None
            self.y += 1
            self.x = 0

        if self.tokens:
            return self.tokens.pop(0)
        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next