pmacs3/lex2.py

import re
class Token:
    def __init__(self, rule, y, x, s, role='single'):
        self.rule = rule
        self.y = y
        self.x = x
        self.string = s
        self.role = role
    def add_to_string(self, s):
        self.string += s
    def __repr__(self):
        if len(self.string) < 10:
            s = self.string
        else:
            s = self.string[:10] + '...'
        return "<Token(%r, %d, %d, %r)>" % (self.rule, self.y, self.x, s)
class Rule:
    def __init__(self):
        self.name = 'null'
    def add_token(self, lexer, s, role='single'):
        t = Token(self, lexer.y, lexer.x, s, role)
        lexer.curr_tokens.append(t)
        lexer.x += len(s)
    def add_to_last_token(self, lexer, s):
        assert lexer.curr_tokens
        lexer.curr_tokens[-1].add_to_string(s)
        lexer.x += len(s)
    def match(self, lexer):
        raise Exception("not implemented")
class NullRule(Rule):
    def __init__(self):
        self.name = 'null'
    def match(self, lexer):
        raise Exception("null rule does not match!")
class NewlineRule(Rule):
    def __init__(self):
        self.name = 'newline'
    def match(self, lexer):
        raise Exception("newline rule does not match!")
class ConstantRule(Rule):
    def __init__(self, name="unnamed_constant", const="foo"):
        self.name = name
        self.const = const
    def match(self, lexer):
        if lexer.lines[lexer.y][lexer.x:].startswith(self.const):
            self.add_token(lexer, self.const)
            return True
        else:
            return False
class RegexRule(Rule):
    def __init__(self, name="unnamed_regex", expr="[^ ]+"):
        self.name = name
        self.expr = expr
        self.re = re.compile(expr)
    def match(self, lexer):
        m = self.re.match(lexer.lines[lexer.y], lexer.x)
        if m:
            self.add_token(lexer, m.group(0))
            return True
        else:
            return False
class RegionRule(Rule):
    def __init__(self, name, start, mid, end):
        self.name = name
        self.start_re = re.compile(start)
        self.mid_re = re.compile(mid)
        self.end_re = re.compile(end)
    def match(self, lexer):
        lt = lexer.last_token
        l = lexer.lines[lexer.y]
        if lt is not None and lt.rule.name == self.name and lt.role != 'end':
            # Inside an open region: consume until the end pattern matches
            # or the line runs out, folding everything into a single 'mid'
            # token per line.
            saw_mid = False
            while lexer.x < len(l):
                m_end = self.end_re.match(l, lexer.x)
                if m_end:
                    self.add_token(lexer, m_end.group(0), 'end')
                    return True
                m_mid = self.mid_re.match(l, lexer.x)
                if m_mid:
                    s = m_mid.group(0)
                else:
                    s = l[lexer.x]
                if saw_mid:
                    self.add_to_last_token(lexer, s)
                else:
                    self.add_token(lexer, s, 'mid')
                    saw_mid = True
            return True
        else:
            m = self.start_re.match(l, lexer.x)
            if m:
                self.add_token(lexer, m.group(0), 'start')
                return True
            else:
                return False
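# For example, with the string1 rule defined below, lexing the text "ab"
# yields a start token ('"'), a mid token ('ab') and an end token ('"');
# an unterminated string ends the line on a mid token and the region is
# resumed on the next line via lexer.last_token.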
class DynamicRegionRule(Rule):
    def __init__(self, name, start, mid, end_fmt):
        self.name = name
        self.start_re = re.compile(start)
        self.mid_re = re.compile(mid)
        self.end_fmt = end_fmt
    def add_token(self, lexer, s, role, end_re):
        # Unlike the static rules, each token carries the end regex built
        # from the start match, so the region can be closed later.
        t = Token(self, lexer.y, lexer.x, s, role)
        t.end_re = end_re
        lexer.curr_tokens.append(t)
        lexer.x += len(s)
    def match(self, lexer):
        lt = lexer.last_token
        l = lexer.lines[lexer.y]
        if lt is not None and lt.rule.name == self.name and lt.role != 'end':
            saw_mid = False
            while lexer.x < len(l):
                # The end regex lives on the previous token, not on the
                # rule; this rule has no self.end_re attribute.
                m_end = lt.end_re.match(l, lexer.x)
                if m_end:
                    self.add_token(lexer, m_end.group(0), 'end', None)
                    return True
                m_mid = self.mid_re.match(l, lexer.x)
                if m_mid:
                    s = m_mid.group(0)
                else:
                    s = l[lexer.x]
                if saw_mid:
                    self.add_to_last_token(lexer, s)
                else:
                    self.add_token(lexer, s, 'mid', lt.end_re)
                    saw_mid = True
            return True
        else:
            m = self.start_re.match(l, lexer.x)
            if m:
                end_re = re.compile(self.end_fmt % m.groups())
                self.add_token(lexer, m.group(0), 'start', end_re)
                return True
            else:
                return False
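# For example, a start match on '<<EOF;' captures 'EOF', so an end_fmt of
# '^%s$' compiles to the per-region end regex '^EOF$'.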
class Lexer:
    rules = [
        # The heredoc terminator depends on the delimiter captured by the
        # start pattern, so it must be a DynamicRegionRule; a plain
        # RegionRule would compile the literal '^%s$' as its end regex.
        DynamicRegionRule('heredoc', "<< *([a-zA-Z0-9_]+) *;", '.', '^%s$'),
        # The mid pattern is a raw string so the regex is \\.|. (a backslash
        # escape, or any single character); otherwise an escaped quote such
        # as \" would end the string early.
        RegionRule('string1', '"', r'\\.|.', '"'),
        RegexRule('word'),
    ]
    null = NullRule()
    newline = NewlineRule()
    def __init__(self):
        self.lines = None
        self.y = 0
        self.x = 0
        self.last_token = None
        self.curr_tokens = []
    def lex(self, lines, y=0, x=0, last_token=None, next_token=None):
        self.lines = lines
        self.y = y
        self.x = x
        # honor the caller-supplied token so lexing can resume mid-region
        self.last_token = last_token
        self.curr_tokens = []
    def __iter__(self):
        if self.lines is None:
            raise Exception("no lines to lex")
        return self
    def match(self):
        for rule in self.rules:
            match = rule.match(self)
            if match:
                assert self.curr_tokens
                return True
        return False
    def add_to_null_token(self):
        c = self.lines[self.y][self.x]
        if self.curr_tokens:
            assert self.curr_tokens[0].rule.name == 'null', self.curr_tokens[0].rule.name
            self.curr_tokens[0].add_to_string(c)
        else:
            self.curr_tokens.append(self.make_null_token(c))
        self.x += 1
    def make_null_token(self, c):
        return Token(self.null, self.y, self.x, c)
    def make_newline_token(self):
        return Token(self.newline, self.y, self.x, '\n')
    def pop_curr_token(self):
        t = self.curr_tokens.pop(0)
        self.last_token = t
        return t
    def next(self):
        if self.curr_tokens:
            return self.pop_curr_token()
        while self.y < len(self.lines):
            while self.x < len(self.lines[self.y]):
                t = self.match()
                if t:
                    return self.pop_curr_token()
                else:
                    self.add_to_null_token()
            self.y += 1
            self.x = 0
            #self.curr_tokens.append(self.make_newline_token())
            if self.curr_tokens:
                return self.pop_curr_token()
        raise StopIteration
    __next__ = next  # so iteration also works under the Python 3 protocol
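
if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module; the sample
    # inputs are made up). Tokens stream out lazily as the lexer is iterated.
    lexer = Lexer()
    lexer.lex(['print "hello, world" ok'])
    for token in lexer:
        print(repr(token))
    # A heredoc spans lines: a 'start' token on the first line, one 'mid'
    # token per interior line, and an 'end' token on the delimiter line.
    lexer.lex(['x = <<EOF;', 'some text', 'EOF'])
    for token in lexer:
        print(repr(token))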