From 6749e4c1c8a9e86fccb1892fdab763cba0ee508d Mon Sep 17 00:00:00 2001
From: moculus
Date: Sat, 14 Jul 2007 20:59:38 +0000
Subject: [PATCH] new lexing maybe

--HG--
branch : pmacs2
---
 lex3.py | 514 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 514 insertions(+)
 create mode 100755 lex3.py

diff --git a/lex3.py b/lex3.py
new file mode 100755
index 0000000..907ea1b
--- /dev/null
+++ b/lex3.py
@@ -0,0 +1,514 @@
+import re
+import util
+
+valid_name_re = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$')
+full_name_re = re.compile('^([a-zA-Z_]+)([0-9]*)$')
+reserved_names = ['start', 'middle', 'end', 'null']
+
+class Token(object):
+    def __init__(self, name, rule=None, y=0, x=0, s="", parent=None, matchd={}):
+        self.name = name
+        self.rule = rule
+        self.y = y
+        self.x = x
+        self.string = s
+        self.parent = parent
+        self.matchd = matchd
+    def parents(self):
+        if self.parent is not None:
+            parents = self.parent.parents()
+            parents.append(self.parent)
+            return parents
+        else:
+            return []
+    def domain(self):
+        if self.parent is not None:
+            names = self.parent.domain()
+        else:
+            names = []
+        if self.name != 'middle':
+            names.append(self.rule.name)
+        return names
+    def fqlist(self):
+        if self.parent is not None:
+            names = self.parent.domain()
+        else:
+            names = []
+        if self.name == 'start':
+            names.append(self.rule.name)
+        names.append(self.name)
+        return names
+    def fqname(self):
+        names = self.fqlist()
+        return '.'.join(names)
+    def copy(self):
+        return Token(self.name, self.rule, self.y, self.x, self.string,
+                     self.parent, self.matchd)
+    def add_to_string(self, s):
+        self.string += s
+    def end_x(self):
+        return self.x + len(self.string)
+    def __eq__(self, other):
+        return (self.y == other.y and self.x == other.x
+                and self.name == other.name and self.parent is other.parent and
+                self.string == other.string)
+    def __repr__(self):
+        if len(self.string) < 10:
+            s = self.string
+        else:
+            s = self.string[:10] + '...'
+        fields = (self.fqname(), self.rule, self.y, self.x, s)
+        return "<Token(%r, %r, %d, %d, %r)>" % fields
+
+class Rule:
+    reflags = 0
+    def __init__(self, name, group=None):
+        assert valid_name_re.match(name), 'invalid name %r' % name
+        assert name not in reserved_names, "reserved rule name: %r" % name
+        self.name = name
+        if group is None:
+            self.group = name
+        else:
+            self.group = group
+    def match(self, lexer, parent):
+        raise Exception, "not implemented"
+    def lex(self, lexer, parent, match):
+        raise Exception, "not implemented"
+    def make_token(self, lexer, s, name, parent=None, matchd={}):
+        # builds the token and advances lexer.x past the consumed string
+        t = Token(name, self, lexer.y, lexer.x, s, parent, matchd)
+        lexer.x += len(s)
+        return t
+    def get_line(self, lexer):
+        return lexer.lines[lexer.y] + '\n'
+
+class PatternRule(Rule):
+    def __init__(self, name, pattern, group=None):
+        Rule.__init__(self, name, group)
+        self.pattern = pattern
+        self.re = re.compile(self.pattern, self.reflags)
+    def match(self, lexer, parent):
+        return self.re.match(self.get_line(lexer), lexer.x)
+    def lex(self, lexer, parent, m):
+        if m:
+            yield self.make_token(lexer, m.group(0), self.name, parent)
+        raise StopIteration
+
+class NocasePatternRule(PatternRule):
+    reflags = re.IGNORECASE
+
+class ContextPatternRule(PatternRule):
+    def __init__(self, name, pattern, fallback, group=None):
+        Rule.__init__(self, name, group)
+        self.pattern = pattern
+        self.fallback_re = re.compile(fallback, self.reflags)
+    def match(self, lexer, parent):
+        try:
+            r = re.compile(self.pattern % parent.matchd, self.reflags)
+        except KeyError:
+            r = self.fallback_re
+        return r.match(self.get_line(lexer), lexer.x)
+
+class NocaseContextPatternRule(ContextPatternRule):
+    reflags = re.IGNORECASE
+
+class RegionRule(Rule):
+    def __init__(self, name, start, grammar, end, group=None):
+        Rule.__init__(self, name, group)
+        self.grammar = grammar
+        self.end = end
+        self.start_re = re.compile(start, self.reflags)
+
+    def resume(self, lexer, toresume):
+        assert toresume, "can't resume without tokens to resume!"
+        # _lex() is a generator; return it so the caller can consume the
+        # resumed tokens
+        return self._lex(lexer, None, None, toresume)
+    def match(self, lexer, parent):
+        return self.start_re.match(self.get_line(lexer), lexer.x)
+    def lex(self, lexer, parent, m):
+        return self._lex(lexer, parent, m, [])
+
+    def _add_from_regex(self, name, lexer, parent, m, matchd={}):
+        s = m.group(0)
+        # make_token() has already advanced lexer.x past the matched string
+        token = self.make_token(lexer, s, name, parent, matchd)
+        lexer.add_token(token)
+        return token
+
+    def _lex(self, lexer, parent, m, toresume=[]):
+        # we either need a match object, or a token to resume
+        assert m or len(toresume) > 0
+
+        if m:
+            # if we had a match, then it becomes the parent, and we save its
+            # subgroup dict
+            d = m.groupdict()
+            parent = self.make_token(lexer, m.group(0), 'start', parent, d)
+            yield parent
+        else:
+            # otherwise, we should be resuming the start token, so let's pull
+            # the relevant info out of the token
+            parent = toresume[0]
+            d = parent.matchd
+            assert parent.name == 'start'
+        null_t = None
+
+        # this determines whether we are still reentering. if len(toresume) == 1
+        # then it means that we have been reentering but will not continue, so
+        # reenter will be false.
+        reenter = len(toresume) > 1
+
+        # if we have an end regex, then build it here. notice that it can
+        # reference named groups from the start token. if we have no end,
+        # well, then, we're never getting out of here alive!
+        if self.end:
+            end_re = re.compile(self.end % d, self.reflags)
+
+        # ok, so as long as we aren't done (we haven't found an end token),
+        # keep reading input
+        done = False
+        while not done and lexer.y < len(lexer.lines):
+            old_y = lexer.y
+
+            # ok, as long as we haven't found the end token, and have more
+            # data on the current line to read, we will process tokens
+            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
+                # if we are reentering mid-parse, then that takes precedence
+                if reenter:
+                    reenter = False
+                    for t in toresume[1].rule.resume(lexer, toresume[1:]):
+                        yield t
+                    if lexer.y >= len(lexer.lines):
+                        raise StopIteration
+                    elif lexer.x >= len(lexer.lines[lexer.y]) + 1:
+                        lexer.y += 1
+                        lexer.x = 0
+
+                line = self.get_line(lexer)
+
+                # if we are looking for an end token, then see if we've
+                # found it. if so, then we are done!
+                if self.end:
+                    m = end_re.match(line, lexer.x)
+                    if m:
+                        if null_t:
+                            yield null_t
+                            null_t = None
+                        yield self.make_token(lexer, m.group(0), 'end', parent, {})
+                        done = True
+                        break
+
+                # ok, we need to check all our rules now, in order. if we
+                # find a token, note that we found one and exit the loop
+                found = False
+                for rule in self.grammar.rules:
+                    m = rule.match(lexer, parent)
+                    if m:
+                        if null_t:
+                            yield null_t
+                            null_t = None
+                        for t in rule.lex(lexer, parent, m):
+                            yield t
+                        found = True
+                        break
+
+                # if we never found a token, then we need to add another
+                # character to the current null token (which we should
+                # create if it isn't set).
+                if not found:
+                    if not null_t:
+                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
+                    if lexer.x < len(line):
+                        null_t.add_to_string(line[lexer.x])
+                    lexer.x += 1
+
+            # ok, since we're soon going to be on a different line (or
+            # already are), we want a new null token. so forget about the
+            # current one (i.e. stop adding to it).
+            if null_t:
+                yield null_t
+                null_t = None
+
+            # if we're still on the same line at this point (and not done)
+            # then that means we're finished with the line and should move
+            # on to the next one here
+            if not done and old_y == lexer.y:
+                lexer.y += 1
+                lexer.x = 0
+
+        raise StopIteration
+
+class NocaseRegionRule(RegionRule):
+    reflags = re.IGNORECASE
+
+class DualRegionRule(Rule):
+    def __init__(self, name, start, grammar1, middle, grammar2, end, group=None):
+        # Rule.__init__ takes care of the group-or-name fallback
+        Rule.__init__(self, name, group)
+        self.start = start
+        self.grammar1 = grammar1
+        self.middle = middle
+        self.grammar2 = grammar2
+        self.end = end
+        self.start_re = self._compile_start()
+
+    def _compile_start(self):
+        return re.compile(self.start)
+    def _compile_middle(self, d):
+        return re.compile(self.middle % d)
+    def _compile_end(self, d):
+        return re.compile(self.end % d)
+
+    def _add_from_regex(self, name, lexer, parent, m, matchd={}):
+        s = m.group(0)
+        # make_token() has already advanced lexer.x past the matched string
+        token = self.make_token(lexer, s, name, parent, matchd)
+        lexer.add_token(token)
+        return token
+    def resume(self, lexer, toresume):
+        assert toresume, "can't resume without tokens to resume!"
+        token = toresume[0]
+        if token.name == 'start':
+            t2 = self._match_first(lexer, token, toresume)
+            if t2 is not None:
+                t3 = self._match_second(lexer, t2, [])
+            return True
+        elif token.name == 'middle':
+            t3 = self._match_second(lexer, token, toresume)
+        else:
+            raise Exception, "invalid resume token %r" % token.name
+        return True
+    def match(self, lexer, parent):
+        # see if we can match our start token
+        line = self.get_line(lexer)
+        m = self.start_re.match(line, lexer.x)
+        if m:
+            t1 = self._add_from_regex('start', lexer, parent, m, m.groupdict())
+            t2 = self._match_first(lexer, t1, [])
+            if t2 is not None:
+                t3 = self._match_second(lexer, t2, [])
+            return True
+        else:
+            # the region was not matched; we never started, so return false
+            return False
+
+    def _match_first(self, lexer, parent, toresume=[]):
+        reenter = len(toresume) > 1
+        if reenter:
+            assert parent is toresume[0]
+        assert parent.name == 'start'
+        d1 = parent.matchd
+        null_t = None
+        middle_re = self._compile_middle(d1)
+        d2 = {}
+
+        # ok, so as long as we aren't done (we haven't found a middle token),
+        # keep reading input
+        t2 = None
+        done = False
+        while not done and lexer.y < len(lexer.lines):
+            old_y = lexer.y
+
+            # ok, as long as we haven't found the middle token, and have more
+            # data on the current line to read, we will process tokens
+            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
+                # if we are reentering mid-parse, then that takes precedence
+                if reenter:
+                    reenter = False
+                    rule2 = toresume[1].rule
+                    rule2.resume(lexer, toresume[1:])
+                    null_t = None
+
+                line = self.get_line(lexer)
+
+                # see if we have found the middle token. if so, we can then
+                # proceed to "stage 2"
+                m2 = middle_re.match(line, lexer.x)
+                if m2:
+                    d2 = dict(d1.items() + m2.groupdict().items())
+                    t2 = self._add_from_regex('middle', lexer, parent, m2, d2)
+                    done = True
+                    break
+
+                # ok, we need to check all our rules now, in order. if we
+                # find a token, note that we found one and exit the loop
+                found = False
+                for rule in self.grammar1.rules:
+                    m = rule.match(lexer, parent)
+                    if m:
+                        # drain the rule's tokens so lexer.x actually advances
+                        for t in rule.lex(lexer, parent, m):
+                            lexer.add_token(t)
+                        found = True
+                        null_t = None
+                        break
+
+                # if we never found a token, then we need to add another
+                # character to the current null token (which we should
+                # create if it isn't set).
+                if not found:
+                    if null_t is None:
+                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
+                        lexer.add_token(null_t)
+                    null_t.add_to_string(line[lexer.x])
+                    lexer.x += 1
+
+            # ok, since we're soon going to be on a different line (or
+            # already are), we want a new null token. so forget about the
+            # current one.
+            null_t = None
+
+            # if we're still on the same line at this point (and not done)
+            # then that means we're finished with the line and should move
+            # on to the next one here
+            if not done and old_y == lexer.y:
+                lexer.y += 1
+                lexer.x = 0
+        return t2
+
+    def _match_second(self, lexer, parent, toresume=[]):
+        reenter = len(toresume) > 1
+        if reenter:
+            assert parent is toresume[0]
+        assert parent.name == 'middle'
+        d3 = parent.matchd
+        null_t = None
+        end_re = self._compile_end(d3)
+
+        # ok, so as long as we aren't done (we haven't found an end token),
+        # keep reading input
+        t3 = None
+        done = False
+        while not done and lexer.y < len(lexer.lines):
+            old_y = lexer.y
+
+            # if we are reentering mid-parse, then that takes precedence
+            if reenter:
+                reenter = False
+                rule2 = toresume[1].rule
+                rule2.resume(lexer, toresume[1:])
+                null_t = None
+
+            # ok, as long as we haven't found the end token, and have more
+            # data on the current line to read, we will process tokens
+            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
+                # see if we have found the end token. if so, we are done
+                line = self.get_line(lexer)
+                m3 = end_re.match(line, lexer.x)
+                if m3:
+                    t3 = self._add_from_regex('end', lexer, parent, m3, {})
+                    done = True
+                    break
+
+                # ok, we need to check all our rules now, in order. if we
+                # find a token, note that we found one and exit the loop
+                found = False
+                for rule in self.grammar2.rules:
+                    m = rule.match(lexer, parent)
+                    if m:
+                        # drain the rule's tokens so lexer.x actually advances
+                        for t in rule.lex(lexer, parent, m):
+                            lexer.add_token(t)
+                        found = True
+                        null_t = None
+                        break
+
+                # if we never found a token, then we need to add another
+                # character to the current null token (which we should
+                # create if it isn't set).
+                if not found:
+                    if null_t is None:
+                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
+                        lexer.add_token(null_t)
+                    null_t.add_to_string(line[lexer.x])
+                    lexer.x += 1
+
+            # ok, since we're soon going to be on a different line (or
+            # already are), we want a new null token. so forget about the
+            # current one.
+            null_t = None
+
+            # if we're still on the same line at this point (and not done)
+            # then that means we're finished with the line and should move
+            # on to the next one here
+            if not done and old_y == lexer.y:
+                lexer.y += 1
+                lexer.x = 0
+
+        # alright, we're finally done processing; return the end token (or
+        # None if we never found one)
+        return t3
+
+class Grammar:
+    rules = []
+    def __init__(self):
+        # rules constructed with a None (sub)grammar recurse into the grammar
+        # that owns them, so patch those references up now
+        for rule in self.rules:
+            if hasattr(rule, 'grammar') and rule.grammar is None:
+                rule.grammar = self
+            if hasattr(rule, 'grammar1') and rule.grammar1 is None:
+                rule.grammar1 = self
+            if hasattr(rule, 'grammar2') and rule.grammar2 is None:
+                rule.grammar2 = self
+grammar = Grammar()
+
+class Lexer:
+    def __init__(self, name, grammar):
+        self.name = name
+        self.grammar = grammar
+        self.y = 0
+        self.x = 0
+        self.lines = None
+        self.tokens = []
+
+    def add_token(self, t):
+        self.tokens.append(t)
+
+    def lex(self, lines, y=0, x=0):
+        self.y = y
+        self.x = x
+        self.lines = lines
+        self.tokens = []
+
+    def resume(self, lines, y, x, token):
+        self.y = y
+        self.x = x
+        self.lines = lines
+        self.tokens = []
+        toresume = token.parents()
+
+        # this is a special case for the "middle" token of a dual region
+        # rule: the middle token replaces the start token in the resume chain
+        i = 0
+        while i < len(toresume):
+            if i > 0 and toresume[i].name == 'middle' and toresume[i-1].name == 'start':
+                del toresume[i-1]
+            else:
+                i += 1
+
+        if toresume:
+            toresume[0].rule.resume(self, toresume)
+
+    def __iter__(self):
+        if self.lines is None:
+            raise Exception, "no lines to lex"
+        return self
+
+    def next(self):
+        null_t = None
+        if self.tokens:
+            return self.tokens.pop(0)
+        while self.y < len(self.lines):
+            line = self.lines[self.y] + '\n'
+            while self.x < len(line):
+                curr_t = None
+                for rule in self.grammar.rules:
+                    # dual region rules add their tokens to self.tokens from
+                    # inside match(); pattern/region rules produce them
+                    # lazily from lex(), so drain that generator here
+                    before = len(self.tokens)
+                    m = rule.match(self, None)
+                    if m:
+                        if len(self.tokens) == before:
+                            for t in rule.lex(self, None, m):
+                                self.add_token(t)
+                        assert self.tokens, "match rendered no tokens?"
+                        return self.tokens.pop(0)
+                if null_t is None:
+                    null_t = Token('null', None, self.y, self.x, '')
+                    self.add_token(null_t)
+                null_t.add_to_string(line[self.x])
+                self.x += 1
+            null_t = None
+            self.y += 1
+            self.x = 0
+        if self.tokens:
+            return self.tokens.pop(0)
+        else:
+            raise StopIteration
+
+class NocaseDualRegionRule(DualRegionRule):
+    def _compile_start(self):
+        return re.compile(self.start, re.IGNORECASE)
+    def _compile_middle(self, d):
+        return re.compile(self.middle % d, re.IGNORECASE)
+    def _compile_end(self, d):
+        return re.compile(self.end % d, re.IGNORECASE)
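
For reference, a minimal sketch of how the new module might be driven once the patch is applied in the pmacs2 tree. The WordGrammar class, its rule names, and the sample input are made up for illustration and are not part of the patch; only Grammar, PatternRule, RegionRule, Lexer and Token come from lex3.py itself.

import lex3

class WordGrammar(lex3.Grammar):
    # two illustrative rules: a double-quoted string region, then bare words;
    # the empty inner Grammar() means everything inside the quotes lexes as
    # null tokens
    rules = [
        lex3.RegionRule('string', r'"', lex3.Grammar(), r'"'),
        lex3.PatternRule('word', r'[a-zA-Z_]+'),
    ]

lexer = lex3.Lexer('example', WordGrammar())
lexer.lex(['say "hi" twice'])
for token in lexer:
    # e.g. word 'say', string.start '"', string.null 'hi', string.end '"', ...
    print token.fqname(), repr(token.string)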