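#!/usr/bin/env python
"""A small rule-based lexer.

Rules (constants, regexes, and multi-line regions such as strings and
heredocs) break lines of text into Token objects; Lexer drives the
matching and yields tokens through the iterator protocol.
"""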
import re


class Token:
    """A lexed substring: the rule that produced it, its (y, x) position,
    and its role within a region ('start', 'mid', 'end', or 'single')."""
    def __init__(self, rule, y, x, s, role='single'):
        self.rule = rule
        self.y = y
        self.x = x
        self.string = s
        self.role = role

    def add_to_string(self, s):
        self.string += s

    def __repr__(self):
        # Truncate long token strings so reprs stay readable.
        if len(self.string) < 10:
            s = self.string
        else:
            s = self.string[:10] + '...'
        return "<Token(%r, %d, %d, %r)>" % (self.rule, self.y, self.x, s)


class Rule:
    """Base class for lexer rules; subclasses implement match(lexer)."""
    def __init__(self):
        self.name = 'null'

    def add_token(self, lexer, s, role='single'):
        t = Token(self, lexer.y, lexer.x, s, role)
        lexer.curr_tokens.append(t)
        lexer.x += len(s)

    def add_to_last_token(self, lexer, s):
        assert lexer.curr_tokens
        lexer.curr_tokens[-1].add_to_string(s)
        lexer.x += len(s)

    def match(self, lexer):
        raise NotImplementedError("match() must be implemented by subclasses")


class NullRule(Rule):
    """Placeholder rule assigned to unmatched text; never matched directly."""
    def __init__(self):
        self.name = 'null'

    def match(self, lexer):
        raise Exception("null rule does not match!")


class NewlineRule(Rule):
    """Placeholder rule for newline tokens; never matched directly."""
    def __init__(self):
        self.name = 'newline'

    def match(self, lexer):
        raise Exception("newline rule does not match!")


class ConstantRule(Rule):
    """Matches a fixed string at the current position."""
    def __init__(self, name="unnamed_constant", const="foo"):
        self.name = name
        self.const = const

    def match(self, lexer):
        if lexer.lines[lexer.y][lexer.x:].startswith(self.const):
            self.add_token(lexer, self.const)
            return True
        else:
            return False


class RegexRule(Rule):
    """Matches a regular expression at the current position."""
    def __init__(self, name="unnamed_regex", expr="[^ ]+"):
        self.name = name
        self.expr = expr
        self.re = re.compile(expr)

    def match(self, lexer):
        m = self.re.match(lexer.lines[lexer.y], lexer.x)
        if m:
            self.add_token(lexer, m.group(0))
            return True
        else:
            return False


class RegionRule(Rule):
    """Matches a (possibly multi-line) region delimited by start and end
    patterns, emitting 'start', 'mid', and 'end' tokens."""
    def __init__(self, name, start, mid, end):
        self.name = name
        self.start_re = re.compile(start)
        self.mid_re = re.compile(mid)
        self.end_re = re.compile(end)

    def match(self, lexer):
        lt = lexer.last_token
        l = lexer.lines[lexer.y]
        if lt is not None and lt.rule.name == self.name and lt.role != 'end':
            # We are inside an open region: consume until the end pattern
            # matches or the line is exhausted.
            saw_mid = False
            while lexer.x < len(l):
                m_end = self.end_re.match(l, lexer.x)
                if m_end:
                    self.add_token(lexer, m_end.group(0), 'end')
                    return True
                m_mid = self.mid_re.match(l, lexer.x)
                if m_mid:
                    s = m_mid.group(0)
                else:
                    s = l[lexer.x]
                if saw_mid:
                    # Grow the current 'mid' token instead of emitting one
                    # token per character.
                    self.add_to_last_token(lexer, s)
                else:
                    self.add_token(lexer, s, 'mid')
                    saw_mid = True
            return True
        else:
            m = self.start_re.match(l, lexer.x)
            if m:
                self.add_token(lexer, m.group(0), 'start')
                return True
            else:
                return False
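

# Illustration (a sketch, not part of the original module): with the string1
# rule from Lexer.rules below, the line
#     say "ab\"c" done
# lexes into a 'word' token for say, a 'start' token for the opening quote,
# a single 'mid' token holding ab\"c (the r'\\.|.' mid pattern consumes the
# escaped quote as a unit), an 'end' token for the closing quote, and a
# 'word' token for done, with 'null' tokens covering the spaces.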


class DynamicRegionRule(Rule):
    """Like RegionRule, but the end pattern is built from the groups
    captured by the start pattern (e.g. a heredoc delimiter). The compiled
    end regex travels along on each token via its end_re attribute."""
    def __init__(self, name, start, mid, end_fmt):
        self.name = name
        self.start_re = re.compile(start)
        self.mid_re = re.compile(mid)
        self.end_fmt = end_fmt

    def add_token(self, lexer, s, role, end_re):
        t = Token(self, lexer.y, lexer.x, s, role)
        t.end_re = end_re
        lexer.curr_tokens.append(t)
        lexer.x += len(s)

    def match(self, lexer):
        lt = lexer.last_token
        l = lexer.lines[lexer.y]
        if lt is not None and lt.rule.name == self.name and lt.role != 'end':
            saw_mid = False
            while lexer.x < len(l):
                # The end regex lives on the previous token, since it was
                # built dynamically from the start match; this class has no
                # static end_re of its own.
                m_end = lt.end_re.match(l, lexer.x)
                if m_end:
                    self.add_token(lexer, m_end.group(0), 'end', None)
                    return True
                m_mid = self.mid_re.match(l, lexer.x)
                if m_mid:
                    s = m_mid.group(0)
                else:
                    s = l[lexer.x]
                if saw_mid:
                    self.add_to_last_token(lexer, s)
                else:
                    # Propagate the dynamic end regex onto the new token.
                    self.add_token(lexer, s, 'mid', lt.end_re)
                    saw_mid = True
            return True
        else:
            m = self.start_re.match(l, lexer.x)
            if m:
                end_re = re.compile(self.end_fmt % m.groups())
                self.add_token(lexer, m.group(0), 'start', end_re)
                return True
            else:
                return False
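

# For example (a sketch of the rule below): matching the heredoc start
# pattern "<< *([a-zA-Z0-9_]+) *;" against "<<EOF;" captures ('EOF',), so
# the end format '^%s$' compiles to the regex '^EOF$', which then only
# matches a line consisting solely of the delimiter.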


class Lexer:
    rules = [
        # The heredoc region needs DynamicRegionRule: the delimiter captured
        # by the start pattern is substituted into the end format to build
        # the real end regex.
        DynamicRegionRule('heredoc', "<< *([a-zA-Z0-9_]+) *;", '.', '^%s$'),
        # The mid pattern is a raw string so that \\. matches a backslash
        # escape (e.g. \") rather than a literal dot.
        RegionRule('string1', '"', r'\\.|.', '"'),
        RegexRule('word'),
    ]
    null = NullRule()
    newline = NewlineRule()

    def __init__(self):
        self.lines = None
        self.y = 0
        self.x = 0
        self.last_token = None
        self.curr_tokens = []

    def lex(self, lines, y=0, x=0, last_token=None, next_token=None):
        """Reset the lexer to lex lines, optionally resuming at (y, x)
        after last_token; next_token is accepted but currently unused."""
        self.lines = lines
        self.y = y
        self.x = x
        self.last_token = last_token
        self.curr_tokens = []

    def __iter__(self):
        if self.lines is None:
            raise Exception("no lines to lex")
        return self

    def match(self):
        # Try each rule in order; the first one to match wins.
        for rule in self.rules:
            if rule.match(self):
                assert self.curr_tokens
                return True
        return False

    def add_to_null_token(self):
        # Characters no rule matched accumulate into a single 'null' token.
        c = self.lines[self.y][self.x]
        if self.curr_tokens:
            assert self.curr_tokens[0].rule.name == 'null', self.curr_tokens[0].rule.name
            self.curr_tokens[0].add_to_string(c)
        else:
            self.curr_tokens.append(self.make_null_token(c))
        self.x += 1

    def make_null_token(self, c):
        return Token(self.null, self.y, self.x, c)

    def make_newline_token(self):
        return Token(self.newline, self.y, self.x, '\n')

    def pop_curr_token(self):
        t = self.curr_tokens.pop(0)
        self.last_token = t
        return t

    def __next__(self):
        # Serve any tokens already queued by an earlier match.
        if self.curr_tokens:
            return self.pop_curr_token()

        while self.y < len(self.lines):
            while self.x < len(self.lines[self.y]):
                if self.match():
                    return self.pop_curr_token()
                else:
                    self.add_to_null_token()
            self.y += 1
            self.x = 0
            #self.curr_tokens.append(self.make_newline_token())
            if self.curr_tokens:
                return self.pop_curr_token()

        raise StopIteration

    next = __next__  # keep the Python 2 name for the iterator protocol
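

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module); the sample
    # line is illustrative and exercises the word, null, and string rules.
    lexer = Lexer()
    lexer.lex(['say "hello world" twice'])
    for token in lexer:
        print(token)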