pmacs3/lex.py

#!/usr/bin/env python
"""
lex - a lexer generator in python.
"""
__author__ = "Dan Williams (dan@osheim.org, dww4s@virginia.edu)"
__copyright__ = "2005"
# std imports
import os.path, re, sys, copy
# 2.3 imports
from optparse import OptionParser
# callbacks
def silent(rule, m, offset):
'''ignore a hit; return None'''
pass
def make_token(rule, m, offset):
'''return a token from a hit'''
return(Token(rule.name, m.start() + offset, m.end() + offset, m.group(0)))
class Token:
'''Used to store an instance of a lexical token'''
def __init__(self, name, start, end, s=None):
self.name = name
self.start = start
self.end = end
self.string = s
self.debug = False
def __repr__(self):
        if self.string is None or len(self.string) < 10:
s = self.string
else:
s = self.string[:10] + "..."
return "<Token(%r, %d, %d, %r)>" % (self.name, self.start, self.end, s)
class Rule(object):
"""Defines a rule used by a lexer."""
    def __init__(self, name="Unnamed", expr=r"(.|\n)", action=silent):
self.name = name
self.expr = expr
self.re = re.compile(self.expr)
self.action = action
def match(self, *args, **kw):
"""Determine if this rule is matched"""
return self.re.match(*args, **kw)
def act(self, lexer, m, offset=0):
"""Act on this rule"""
return self.action(self, m, offset)
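# Illustrative sketch, not part of the original module: a Rule wired to the
# make_token callback and matched by hand. The rule name "number" and the
# sample string are invented for demonstration.
def _example_rule():
    number = Rule(name="number", expr=r"[0-9]+", action=make_token)
    m = number.match("42 apples", 0)
    if m:
        # act() hands the regex match to the action, which builds a Token;
        # here it evaluates to <Token('number', 0, 2, '42')>
        return number.act(None, m, 0)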
class SubRule(Rule):
"""Defines a rule which parses a region according to its own grammar,
i.e. a sub-grammar with its own rules. This rule may return multiple
tokens and span multiple calls to the next() method of Lexer."""
def __init__(self, name="Unnamed", expr=r"(.|\n)", grammar=None):
self.name = name
self.expr = expr
self.re = re.compile(self.expr)
if grammar is None:
self.grammar = Grammar()
else:
self.grammar = grammar
self.lexer = Lexer(self.grammar)
self.data = None
self.index = None
def match(self, *args, **kw):
"""Determine if this rule is matched"""
m = self.re.match(*args, **kw)
if m is not None:
self.data = args[0][:m.end()]
self.index = args[1]
return m
    def act(self, lexer, m, offset=0):
        """Act on this match"""
        self.lexer.lex(self.data, self.index, offset)
try:
v = self.lexer.next()
lexer.sub_lexer = self.lexer
return v
except StopIteration:
lexer.sub_lexer = None
return None
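# Illustrative sketch, not part of the original module: an outer grammar whose
# SubRule re-lexes a parenthesized region with its own sub-grammar. It relies
# on Grammar and Lexer, which are defined further below; the names, patterns,
# and sample input are invented for demonstration.
def _example_subrule():
    inner = Grammar(rules=[Rule(name="inner_char", expr=r"(.|\n)", action=make_token)])
    outer = Grammar(rules=[
        SubRule(name="parens", expr=r"\([^)]*\)", grammar=inner),
        Rule(name="other", expr=r"(.|\n)", action=make_token),
    ])
    lexer = Lexer(grammar=outer)
    lexer.lex("(ab)c", 0)
    # yields one inner_char token per character of "(ab)", then "c" as other
    return [t for t in lexer if t is not None]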
class BalancedExprMatch:
def __init__(self, start, end, data):
self.s = start
self.e = end
self.d = data
def start(self):
return self.s
def end(self):
return self.e
def group(self, i):
if i == 0 or i == 1:
return self.d
else:
raise IndexError, "no such group"
def groupdict(self):
return {}
def groups(self):
return ()
def span(self):
return (self.s, self.e)
class BalancedExprRule(Rule):
"""
Defines a rule that need to take into account opening and closing
expressions, i.e. parenthesis, #if and #endif, etc.
"""
def __init__(self, name="Unnamed", start_expr=r"(#if +0)",
enter="#if", leave="#endif", action=lambda x,y: None):
self.name = name
self.start_expr = start_expr
self.start_re = re.compile(self.start_expr)
self.enter = enter
self.leave = leave
self.action = action
def match(self, *args, **kw):
if not self.start_re.match(*args):
return None
stack = []
data = args[0]
index = args[1]
start = index
if data[index:].startswith(self.enter):
stack.append(self.enter)
index += len(self.enter)
while len(stack) > 0 and index < len(data):
if data[index:].startswith(self.enter):
stack.append(self.enter)
index += len(self.enter)
elif data[index:].startswith(self.leave):
stack.pop(-1)
index += len(self.leave)
else:
index += 1
m = BalancedExprMatch(start, index, data[start:index])
return m
    def act(self, lexer, m, offset=0):
        """Act on this rule"""
        return self.action(self, m, offset)
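# Illustrative sketch, not part of the original module: a BalancedExprRule that
# swallows a nested "#if 0" ... "#endif" region as a single token through
# make_token. The sample source text is invented for demonstration.
def _example_balanced_rule():
    skip = BalancedExprRule(name="if0", start_expr=r"#if +0",
                            enter="#if", leave="#endif", action=make_token)
    src = "#if 0\n#if FOO\nint x;\n#endif\n#endif\nint y;\n"
    m = skip.match(src, 0)
    if m:
        # the match spans both nested levels, up to the outermost #endif
        return skip.act(None, m)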
class Grammar(list):
"""
Defines rules for lexing according to a given grammar.
The order of rules in the grammar is their precedence in matching.
"""
GRAMMAR_LIST = [ {'name': 'default'} ]
def __init__(self, *args, **kw):
"""useful values to pass in:
rules -> list of rules (ordered!)
if rules are not supplied, self._default_rules() is used"""
list.__init__(self)
if "rules" in kw:
for r in kw["rules"]:
self.append(r)
else:
self._default_rules()
self._post_init(*args, **kw)
def _default_rules(self):
"""subclasses can override this to define defaults for a grammar"""
for rdir in self.GRAMMAR_LIST:
self.add_rule(**rdir)
def _post_init(self, *args, **kw):
"""subclasses can override this to enable other behavior"""
pass
def add_rule(self, *args, **kw):
self.append(Rule(*args, **kw))
def clear_rules(self):
while len(self) > 0:
del self[0]
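# Illustrative sketch, not part of the original module: a Grammar subclass that
# supplies its default rules through GRAMMAR_LIST. The names and patterns are
# invented; rules listed first take precedence, so keywords are tried before
# identifiers.
class _ExampleGrammar(Grammar):
    GRAMMAR_LIST = [
        {'name': 'keyword',    'expr': r"(?:if|else|while)\b",    'action': make_token},
        {'name': 'identifier', 'expr': r"[a-zA-Z_][a-zA-Z0-9_]*", 'action': make_token},
        {'name': 'whitespace', 'expr': r"[ \t\n]+",               'action': silent},
        {'name': 'other',      'expr': r"(.|\n)",                 'action': make_token},
    ]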
class Lexer(object):
"""Defines a lexer, a generator of lexical tokens, etc."""
def __init__(self, grammar=None, rules=None, data=None, index=0):
"""
If the grammar keyword is provided, then that grammar will be used.
        Else, if the rules keyword is provided, that list of rules will be used.
        Else, the default (boring) grammar will be used.
        Normally, lex(data) is used to (re-)initialize the lexer with data to
lex. If the data keyword is provided, then the lexer is ready to go
on instantiation.
"""
if grammar is not None:
self.grammar = grammar
elif rules is not None:
self.grammar = Grammar(rules=rules)
else:
self.grammar = Grammar()
self.data = data
self.index = index
self.offset = 0
self.sub_lexer = None
def lex(self, data=None, index=0, offset=0):
"""
(re-)initialize the lexer with data to lex, and optionally, an offset
to start at
"""
self.data = data
self.index = index
self.offset = offset
def __iter__(self):
if self.data is None:
raise Exception, "No data to be lexed"
return self
#def append(self, newdata, offset=0):
# self.data += newdata
# self.index += offset
def next(self):
# used for multiple levels of lexing
if self.sub_lexer is not None:
try:
return self.sub_lexer.next()
except StopIteration:
self.sub_lexer = None
if self.index >= len(self.data):
raise StopIteration
for rule in self.grammar:
m = rule.match(self.data, self.index)
if m:
self.index = m.end()
return rule.act(self, m, self.offset)
raise Exception, "Failed to consume last %d characters of input: %r" % \
(len(self.data) - self.index, self.data[self.index:])
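# Illustrative sketch, not part of the original module: driving a Lexer over a
# small ad-hoc grammar and collecting the tokens it produces. The rule names
# and the input string are invented for demonstration.
def _example_lexer():
    g = Grammar(rules=[
        Rule(name="word",  expr=r"[a-zA-Z]+", action=make_token),
        Rule(name="space", expr=r"[ \t\n]+",  action=silent),
        Rule(name="other", expr=r"(.|\n)",    action=make_token),
    ])
    lexer = Lexer(grammar=g)
    lexer.lex("hello lexer", 0)
    # silent() rules yield None, so filter those out; prints the two word tokens
    for token in lexer:
        if token is not None:
            print(token)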