#!/bin/env python
"""
lex - a lexer generator in python.

A Grammar is an ordered list of rules; a Lexer walks its input and, at each
position, lets the first rule that matches consume text and run its action.
Actions are callables taking (rule, match, offset); whatever they return is
what the lexer yields for that hit (None for ignored text).
"""

__author__ = "Dan Williams (dan@osheim.org, dww4s@virginia.edu)"
__copyright__ = "2005"

# std imports
import os.path, re, sys, copy

# 2.3 imports
from optparse import OptionParser


# callbacks
def silent(rule, m, offset):
    '''ignore a hit; return None'''
    pass


def make_token(rule, m, offset):
    '''return a token from a hit

    The token's start/end are the match's start/end shifted by offset.'''
    return Token(rule.name, m.start() + offset, m.end() + offset, m.group(0))


def _no_action(rule, m, offset=0):
    '''default rule action: do nothing and return None

    Accepts an optional offset so it works both when called as
    action(rule, m) and as action(rule, m, offset).'''
    return None


class Token:
    '''Used to store an instance of a lexical token'''

    def __init__(self, name, start, end, s=None):
        self.name = name
        self.start = start
        self.end = end
        self.string = s
        self.debug = False

    def __repr__(self):
        s = self.string
        # A token may be created without text (s=None); only abbreviate
        # real strings so repr() never fails.
        if s is not None and len(s) >= 10:
            s = s[:10] + "..."
        return "<Token(%r, %d, %d, %r)>" % (self.name, self.start, self.end, s)


class Rule(object):
    """Defines a rule used by a lexer."""

    def __init__(self, name="Unnamed", expr=r"(.|\n)", action=_no_action):
        """name: rule name (used by make_token as the token name)
        expr: regular expression source, compiled into self.re
        action: callable invoked as action(rule, match, offset)"""
        self.name = name
        self.expr = expr
        self.re = re.compile(self.expr)
        self.action = action

    def match(self, *args, **kw):
        """Determine if this rule is matched

        Arguments are passed straight to the compiled pattern's match(),
        normally (data, index). Returns a match object or None."""
        return self.re.match(*args, **kw)

    def act(self, lexer, m, offset=0):
        """Act on this rule; returns whatever the action returns"""
        return self.action(self, m, offset)


class SubRule(Rule):
    """Defines a rule which parses a region according to its own grammar,
    i.e. a sub-grammar with its own rules. This rule may return multiple
    tokens and span multiple calls to the next() method of Lexer."""

    def __init__(self, name="Unnamed", expr=r"(.|\n)", grammar=None):
        """name: rule name
        expr: regular expression delimiting the region to sub-lex
        grammar: Grammar used inside the region (a default Grammar if None)"""
        self.name = name
        self.expr = expr
        self.re = re.compile(self.expr)

        if grammar is None:
            self.grammar = Grammar()
        else:
            self.grammar = grammar
        self.lexer = Lexer(self.grammar)

        # Set by match(): the input truncated at the end of the region, and
        # the position where the region starts.
        self.data = None
        self.index = None

    def match(self, *args, **kw):
        """Determine if this rule is matched

        On a hit, remembers the data up to the end of the match and the
        start index so that act() can hand them to the sub-lexer."""
        m = self.re.match(*args, **kw)
        if m is not None:
            self.data = args[0][:m.end()]
            # The position is optional, as it is for pattern.match().
            if len(args) > 1:
                self.index = args[1]
            else:
                self.index = 0
        return m

    def act(self, lexer, m, offset=0):
        """Act on this match

        Starts the sub-lexer on the matched region and returns its first
        result. While the sub-lexer has more to give it is installed as
        lexer.sub_lexer so that the outer lexer keeps draining it.
        offset is accepted because Lexer.next() passes it to every rule;
        it is forwarded to the sub-lexer."""
        self.lexer.lex(self.data, self.index, offset)
        try:
            v = self.lexer.next()
            lexer.sub_lexer = self.lexer
            return v
        except StopIteration:
            lexer.sub_lexer = None
            return None


class BalancedExprMatch:
    """Minimal stand-in for a regex match object, produced by
    BalancedExprRule. The whole matched text is both group 0 and group 1."""

    def __init__(self, start, end, data):
        self.s = start
        self.e = end
        self.d = data

    def start(self):
        return self.s

    def end(self):
        return self.e

    def group(self, i=0):
        if i == 0 or i == 1:
            return self.d
        else:
            raise IndexError("no such group")

    def groupdict(self):
        return {}

    def groups(self):
        return ()

    def span(self):
        return (self.s, self.e)


class BalancedExprRule(Rule):
    """
    Defines a rule that need to take into account opening and closing
    expressions, i.e. parenthesis, #if and #endif, etc.
    """

    def __init__(self, name="Unnamed", start_expr=r"(#if +0)",
                 enter="#if", leave="#endif", action=_no_action):
        """name: rule name
        start_expr: regular expression that must match where the region begins
        enter: literal string that opens a nesting level
        leave: literal string that closes a nesting level
        action: callable invoked as action(rule, match, offset)"""
        self.name = name

        self.start_expr = start_expr
        self.start_re = re.compile(self.start_expr)
        self.enter = enter
        self.leave = leave
        self.action = action

    def match(self, *args, **kw):
        """Match a balanced region starting at (data, index).

        Returns a BalancedExprMatch spanning from index to just after the
        'leave' that closes the first 'enter' (or to the end of the data if
        the region is never closed). Returns None when start_expr does not
        match, or when nothing would be consumed."""
        if not self.start_re.match(*args):
            return None

        data = args[0]
        # The position is optional, as it is for pattern.match().
        if len(args) > 1:
            index = args[1]
        else:
            index = 0
        start = index

        if data.startswith(self.enter, index):
            # Only 'enter' markers are ever pushed, so a depth counter is
            # all the bookkeeping that is needed.
            depth = 1
            index += len(self.enter)
            while depth > 0 and index < len(data):
                # startswith(prefix, pos) tests in place instead of copying
                # the remainder of the data on every step.
                if data.startswith(self.enter, index):
                    depth += 1
                    index += len(self.enter)
                elif data.startswith(self.leave, index):
                    depth -= 1
                    index += len(self.leave)
                else:
                    index += 1

        if index == start:
            # start_expr matched but 'enter' is not at this position. An
            # empty match would leave Lexer.next() stuck at the same index
            # forever, so report no match and let other rules try.
            return None
        return BalancedExprMatch(start, index, data[start:index])

    def act(self, lexer, m, offset=0):
        """Act on this rule; returns whatever the action returns"""
        return self.action(self, m, offset)


class Grammar(list):
    """
    Defines rules for lexing according to a given grammar.
    The order of rules in the grammar is their precedence in matching.
    """
    GRAMMAR_LIST = [ {'name': 'default'} ]

    def __init__(self, *args, **kw):
        """useful values to pass in:
        rules -> list of rules (ordered!)

        if rules are not supplied, self._default_rules() is used"""
        list.__init__(self)
        if "rules" in kw:
            self.extend(kw["rules"])
        else:
            self._default_rules()
        self._post_init(*args, **kw)

    def _default_rules(self):
        """subclasses can override this to define defaults for a grammar"""
        for rdir in self.GRAMMAR_LIST:
            self.add_rule(**rdir)

    def _post_init(self, *args, **kw):
        """subclasses can override this to enable other behavior"""
        pass

    def add_rule(self, *args, **kw):
        """Append a Rule built from the given arguments"""
        self.append(Rule(*args, **kw))

    def clear_rules(self):
        """Remove every rule, in place"""
        del self[:]


class Lexer(object):
    """Defines a lexer, a generator of lexical tokens, etc."""

    def __init__(self, grammar=None, rules=None, data=None, index=0):
        """
        If the grammar keyword is provided, then that grammar will be used.
        Else, if the rules keyword is provided, that list of rules will be used
        Else, the default (boring) grammar will be used.

        Normally, lex(data) is used to (re-)initialize the lexer with data to
        lex. If the data keyword is provided, then the lexer is ready to go
        on instantiation.
        """
        if grammar is not None:
            self.grammar = grammar
        elif rules is not None:
            self.grammar = Grammar(rules=rules)
        else:
            self.grammar = Grammar()

        self.data = data
        self.index = index
        self.offset = 0

        self.sub_lexer = None

    def lex(self, data=None, index=0, offset=0):
        """
        (re-)initialize the lexer with data to lex, and optionally, an offset
        to start at
        """
        self.data = data
        self.index = index
        self.offset = offset
        # A sub-lexer left over from earlier input must not leak its
        # remaining tokens into the new run.
        self.sub_lexer = None

    def __iter__(self):
        if self.data is None:
            raise Exception("No data to be lexed")
        return self

    #def append(self, newdata, offset=0):
    #    self.data += newdata
    #    self.index += offset

    def next(self):
        """Return the result of the next hit.

        Raises StopIteration at the end of the data, and Exception if no
        rule matches at the current position."""
        # used for multiple levels of lexing
        if self.sub_lexer is not None:
            try:
                return self.sub_lexer.next()
            except StopIteration:
                self.sub_lexer = None

        if self.index >= len(self.data):
            raise StopIteration

        # Rules are tried in grammar order; the first hit wins.
        for rule in self.grammar:
            m = rule.match(self.data, self.index)
            if m:
                self.index = m.end()
                return rule.act(self, m, self.offset)
        raise Exception("Failed to consume last %d characters of input: %r" % \
                        (len(self.data) - self.index, self.data[self.index:]))

    # Python 3 spells the iterator protocol method __next__.
    __next__ = next