#!/usr/bin/env python

"""
lex - a lexer generator in python.
"""

__author__ = "Dan Williams (dan@osheim.org, dww4s@virginia.edu)"
__copyright__ = "2005"

# std imports
import os.path, re, sys, copy

# 2.3 imports
from optparse import OptionParser

# callbacks

def silent(rule, m, offset):
    '''ignore a hit; return None'''
    pass

def make_token(rule, m, offset):
    '''return a token from a hit'''
    return Token(rule.name, m.start() + offset, m.end() + offset, m.group(0))

class Token:
    '''Used to store an instance of a lexical token'''
    def __init__(self, name, start, end, s=None):
        self.name = name
        self.start = start
        self.end = end
        self.string = s
        self.debug = False

    def __repr__(self):
        # guard against tokens created without a string (s=None)
        if self.string is None or len(self.string) < 10:
            s = self.string
        else:
            s = self.string[:10] + "..."
        return "<Token(%r, %d, %d, %r)>" % (self.name, self.start, self.end, s)

class Rule(object):
    """Defines a rule used by a lexer."""
    # the default action takes (rule, match, offset), matching the call
    # in act(), and ignores the hit
    def __init__(self, name="Unnamed", expr=r"(.|\n)",
                 action=lambda r, m, offset: None):
        self.name = name
        self.expr = expr
        self.re = re.compile(self.expr)
        self.action = action

    def match(self, *args, **kw):
        """Determine if this rule is matched"""
        return self.re.match(*args, **kw)

    def act(self, lexer, m, offset=0):
        """Act on this rule"""
        return self.action(self, m, offset)

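# A minimal usage sketch (illustrative only: the rule name "WORD" and the
# sample text below are assumptions, not part of the original module):
def _example_rule():
    word = Rule(name="WORD", expr=r"[a-zA-Z]+", action=make_token)
    m = word.match("hello world", 0)
    if m is not None:
        # with make_token as the action, act() returns a Token
        return word.act(None, m)  # <Token('WORD', 0, 5, 'hello')>
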
class SubRule(Rule):
    """Defines a rule which parses a region according to its own grammar,
    i.e. a sub-grammar with its own rules. This rule may return multiple
    tokens and span multiple calls to the next() method of Lexer."""
    def __init__(self, name="Unnamed", expr=r"(.|\n)", grammar=None):
        self.name = name
        self.expr = expr
        self.re = re.compile(self.expr)

        if grammar is None:
            self.grammar = Grammar()
        else:
            self.grammar = grammar
        self.lexer = Lexer(self.grammar)

        self.data = None
        self.index = None

    def match(self, *args, **kw):
        """Determine if this rule is matched"""
        m = self.re.match(*args, **kw)
        if m is not None:
            # remember the matched region so act() can hand it to the
            # sub-lexer
            self.data = args[0][:m.end()]
            self.index = args[1]
        return m

    def act(self, lexer, m, offset=0):
        """Act on this match"""
        # accept the offset argument that Lexer.next() passes to every
        # rule, and forward it to the sub-lexer
        self.lexer.lex(self.data, self.index, offset)
        try:
            v = self.lexer.next()
            lexer.sub_lexer = self.lexer
            return v
        except StopIteration:
            lexer.sub_lexer = None
            return None

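# Sketch of a sub-grammar (illustrative: the rule names and patterns are
# assumptions). The STRING rule matches a quoted region, then re-lexes it
# with its own two-rule grammar, one token per call to Lexer.next():
def _example_subrule():
    inner = Grammar(rules=[
        Rule(name="ESCAPE", expr=r"\\.", action=make_token),
        Rule(name="CHAR", expr=r'[^"\\]|"', action=make_token),
    ])
    return SubRule(name="STRING", expr=r'"([^"\\]|\\.)*"', grammar=inner)
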
class BalancedExprMatch:
    """Mimics the parts of the re match object API that the callbacks use."""
    def __init__(self, start, end, data):
        self.s = start
        self.e = end
        self.d = data
    def start(self):
        return self.s
    def end(self):
        return self.e
    def group(self, i=0):
        if i == 0 or i == 1:
            return self.d
        else:
            raise IndexError("no such group")
    def groupdict(self):
        return {}
    def groups(self):
        return ()
    def span(self):
        return (self.s, self.e)

class BalancedExprRule(Rule):
    """
    Defines a rule that needs to take into account opening and closing
    expressions, i.e. parentheses, #if and #endif, etc.
    """
    def __init__(self, name="Unnamed", start_expr=r"(#if +0)",
                 enter="#if", leave="#endif", action=lambda x, y: None):
        self.name = name

        self.start_expr = start_expr
        self.start_re = re.compile(self.start_expr)

        self.enter = enter
        self.leave = leave
        self.action = action

    def match(self, *args, **kw):
        if not self.start_re.match(*args):
            return None
        # scan forward, keeping a stack of enter markers, until the
        # opening marker has been balanced by leave markers
        stack = []
        data = args[0]
        index = args[1]
        start = index
        if data[index:].startswith(self.enter):
            stack.append(self.enter)
            index += len(self.enter)
        while len(stack) > 0 and index < len(data):
            if data[index:].startswith(self.enter):
                stack.append(self.enter)
                index += len(self.enter)
            elif data[index:].startswith(self.leave):
                stack.pop(-1)
                index += len(self.leave)
            else:
                index += 1
        m = BalancedExprMatch(start, index, data[start:index])
        return m

    def act(self, lexer, m, offset=0):
        """Act on this rule"""
        # accept the offset argument that Lexer.next() passes to every rule
        return self.action(self, m)

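# Sketch of a balanced match (illustrative: the rule name and sample text
# are assumptions). Nested #if/#endif pairs are consumed by the stack in
# match(), so the region runs through the *outer* #endif:
def _example_balanced():
    rule = BalancedExprRule(name="IF0", enter="#if", leave="#endif")
    data = "#if 0\n#if FOO\n#endif\n#endif\nrest"
    m = rule.match(data, 0)
    if m is not None:
        return m.span()  # (0, 27)
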
class Grammar(list):
    """
    Defines rules for lexing according to a given grammar.
    The order of rules in the grammar is their precedence in matching.
    """
    GRAMMAR_LIST = [{'name': 'default'}]

    def __init__(self, *args, **kw):
        """useful values to pass in:
            rules -> list of rules (ordered!)
        if rules are not supplied, self._default_rules() is used"""
        list.__init__(self)
        if "rules" in kw:
            for r in kw["rules"]:
                self.append(r)
        else:
            self._default_rules()
        self._post_init(*args, **kw)

    def _default_rules(self):
        """subclasses can override this to define defaults for a grammar"""
        for rdir in self.GRAMMAR_LIST:
            self.add_rule(**rdir)

    def _post_init(self, *args, **kw):
        """subclasses can override this to enable other behavior"""
        pass

    def add_rule(self, *args, **kw):
        self.append(Rule(*args, **kw))

    def clear_rules(self):
        while len(self) > 0:
            del self[0]

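# Sketch of a subclassed grammar (illustrative: the names and patterns are
# assumptions). Overriding GRAMMAR_LIST is how _default_rules() picks up a
# subclass's rules; earlier entries take precedence when matching:
class _ExampleGrammar(Grammar):
    GRAMMAR_LIST = [
        {'name': 'number', 'expr': r'[0-9]+',   'action': make_token},
        {'name': 'word',   'expr': r'[a-z]+',   'action': make_token},
        {'name': 'space',  'expr': r'[ \t\n]+', 'action': silent},
    ]
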
class Lexer(object):
    """Defines a lexer, a generator of lexical tokens, etc."""
    def __init__(self, grammar=None, rules=None, data=None, index=0):
        """
        If the grammar keyword is provided, then that grammar will be used.
        Else, if the rules keyword is provided, that list of rules will be
        used. Else, the default (boring) grammar will be used.

        Normally, lex(data) is used to (re-)initialize the lexer with data
        to lex. If the data keyword is provided, then the lexer is ready to
        go on instantiation.
        """
        if grammar is not None:
            self.grammar = grammar
        elif rules is not None:
            self.grammar = Grammar(rules=rules)
        else:
            self.grammar = Grammar()

        self.data = data
        self.index = index
        self.offset = 0

        self.sub_lexer = None

    def lex(self, data=None, index=0, offset=0):
        """
        (re-)initialize the lexer with data to lex, and optionally, an
        offset to start at
        """
        self.data = data
        self.index = index
        self.offset = offset

    def __iter__(self):
        if self.data is None:
            raise Exception("No data to be lexed")
        return self

    #def append(self, newdata, offset=0):
    #    self.data += newdata
    #    self.index += offset

    def next(self):
        # used for multiple levels of lexing
        if self.sub_lexer is not None:
            try:
                return self.sub_lexer.next()
            except StopIteration:
                self.sub_lexer = None

        if self.index >= len(self.data):
            raise StopIteration
        # rules are tried in order; the first match wins
        for rule in self.grammar:
            m = rule.match(self.data, self.index)
            if m:
                self.index = m.end()
                return rule.act(self, m, self.offset)
        raise Exception("Failed to consume last %d characters of input: %r" %
                        (len(self.data) - self.index, self.data[self.index:]))
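
# A minimal end-to-end sketch (illustrative: this demo grammar is an
# assumption, not part of the original module); run the file directly
# to see the three tokens printed:
if __name__ == "__main__":
    demo = Grammar(rules=[
        Rule(name="number", expr=r"[0-9]+", action=make_token),
        Rule(name="word", expr=r"[a-zA-Z]+", action=make_token),
        Rule(name="space", expr=r"[ \t\n]+", action=silent),
    ])
    lexer = Lexer(demo)
    lexer.lex("lex 123 tokens")
    for token in lexer:
        # silent() yields None for skipped hits, so filter those out
        if token is not None:
            print token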