From b33772cb8f83137ba0361655fb5f25d3b25264e5 Mon Sep 17 00:00:00 2001 From: moculus Date: Sun, 15 Jul 2007 13:24:25 +0000 Subject: [PATCH] maybe lex3 works now? --HG-- branch : pmacs2 --- IDEAS | 5 + lex3.py | 388 ++++++++++++++++---------------------------------------- 2 files changed, 111 insertions(+), 282 deletions(-) diff --git a/IDEAS b/IDEAS index ccf005f..97c4cdc 100644 --- a/IDEAS +++ b/IDEAS @@ -1,3 +1,8 @@ +2007/07/15: + +Rename "lexing" to "parsing" since really we have moved way beyond a simple +lexing/tokenization strategy. + 2007/07/14: The rules are currently confusingly implemented, and have poor performance when diff --git a/lex3.py b/lex3.py index 5d7c7bd..4b5966c 100755 --- a/lex3.py +++ b/lex3.py @@ -116,63 +116,39 @@ class RegionRule(Rule): self.grammar = grammar self.end = end self.start_re = re.compile(start, self.reflags) - def resume(self, lexer, toresume): - if not toresume: - raise Exception, "can't resume without tokens to resume!" - for t in self._lex(lexer, None, None, toresume): - yield t - raise StopIteration def match(self, lexer, parent): return self.start_re.match(self.get_line(lexer), lexer.x) def lex(self, lexer, parent, m): - for t in self._lex(lexer, parent, m, []): - yield t + t1 = self.make_token(lexer, 'start', None, m, m.groupdict()) + yield t1 + if self.end: + endre = re.compile(self.end % t1.matchd, self.reflags) + else: + endre = None + for t2 in self._lex(lexer, [t1], 'start', 'end'): + yield t2 + raise StopIteration + def resume(self, lexer, toresume): + assert toresume + t1 = toresume[0] + if self.end: + endre = re.compile(self.end % t1.matchd, self.reflags) + else: + endre = None + for t2 in self._lex(lexer, t1, 'end', endre): + yield t2 raise StopIteration - def _lex(self, lexer, parent, m, toresume=[]): - # this determines whether we are still reentering. if len(toresume) == 1 - # then it means that we have been reentering but will not continue, so - # reenter will be false. + def _lex(self, lexer, toresume, stopname, stopre): + parent = toresume[0] reenter = len(toresume) > 1 + null_t = None - # we either need a match object, or a token to resume - assert m or reenter, "we need a current match, or a previous match" - - if m: - # if we had a match, then it becomes the parent, and we save its - # subgroup dict - d = m.groupdict() - yield self.make_token(lexer, 'start', parent, m, d) - else: - # otherwise, we should be resuming the start token, so let's pull - # the relevant info out of the token - parent = toresume[0] - d = parent.matchd - assert parent.name == 'start' - - # this token, when set, will store unmatched characters which will be - # combined into a single "null" token when the end of the document, or - # a named-token, is reached. - null_t = None - - # if we have an end regex, then build it here. notice that it can - # reference named groups from the start token. if we have no end, - # well, then, we're never getting out of here alive! - if self.end: - end_re = re.compile(self.end % d, self.reflags) - - # ok, so as long as we aren't done (we haven't found an end token), - # keep reading input done = False while not done and lexer.y < len(lexer.lines): old_y = lexer.y line = self.get_line(lexer) - - # ok, as long as we haven't found the end token, and have more - # data on the current line to read, we will process tokens while not done and lexer.y == old_y and lexer.x < len(line): - # if we are reentering mid-parse, then that takes precedence. 
- # afterwards, we need to clean-up and get our new state in order if reenter: reenter = False for t in toresume[1].rule.resume(lexer, toresume[1:]): @@ -184,21 +160,17 @@ class RegionRule(Rule): lexer.x = 0 line = self.get_line(lexer) - # if we are looking for an end token, then see if we've - # found it. if so, then we are done! - if self.end: - m = end_re.match(line, lexer.x) + if stopre: + m = stopre.match(line, lexer.x) if m: if null_t: yield null_t null_t = None - yield self.make_token(lexer, 'end', parent, m, {}) + yield self.make_token(lexer, stopname, parent, m, {}) done = True break - # ok, we need to check all our rules now, in order. if we find a - # token, note that we found one and exit the loop - found = False + m = None for rule in self.grammar.rules: m = rule.match(lexer, parent) if m: @@ -207,233 +179,87 @@ class RegionRule(Rule): null_t = None for t in rule.lex(lexer, parent, m): yield t - found = True break - # if we never found a token, then we need to add another - # character to the current null token (which we should - # create if it isn't set). - if not found: + if not m: if not null_t: null_t = Token('null', None, lexer.y, lexer.x, '', parent) if lexer.x < len(line): null_t.add_to_string(line[lexer.x]) lexer.x += 1 - - # ok, since we're soon going to be on a different line (or - # already are), we want a new null token. so forget about the - # current one (i.e. stop adding to it). if null_t: yield null_t null_t = None - - # if we're still on the same line at this point (and not done) - # then that means we're finished with the line and should move - # on to the next one here if not done and old_y == lexer.y: lexer.y += 1 lexer.x = 0 - raise StopIteration class NocaseRegionRule(RegionRule): reflags = re.IGNORECASE -class DualRegionRule(Rule): +class DualRegionRule(RegionRule): def __init__(self, name, start, grammar1, middle, grammar2, end, group=None): Rule.__init__(self, name) - self.start = start + self.start_re = re.compile(start, self.reflags) self.grammar1 = grammar1 self.middle = middle self.grammar2 = grammar2 self.end = end - self.start_re = self._compile_start() - self._set_group(group) + def match(self, lexer, parent): + return self.start_re.match(self.get_line(lexer), lexer.x) + def lex(self, lexer, parent, m): + assert m + t1 = self.make_token(lexer, 'start', parent, m, m.groupdict()) + yield t1 - def _compile_start(self): - return re.compile(self.start) - def _compile_middle(self, d): - return re.compile(self.middle % d) - def _compile_end(self, d): - return re.compile(self.end % d) + t2 = None + if self.middle: + stopre = re.compile(self.middle % t1.groupdict(), self.reflags) + else: + stopre = None + for t2 in self._lex(lexer, [t1], 'middle', stopre): + yield t2 - def _add_from_regex(self, name, lexer, parent, m, matchd={}): - s = m.group(0) - token = self.make_token(lexer, s, name, parent, matchd) - lexer.add_token(token) - lexer.x += len(s) - return token + if t2 is not None and t2.name == 'middle': + if self.end: + stopre = re.compile(self.end % t2.groupdict(), self.reflags) + else: + stopre = None + for t3 in self._lex(lexer, [t2], 'end', stopre): + yield t3 + + raise StopIteration def resume(self, lexer, toresume): assert toresume, "can't resume without tokens to resume!" 
- token = toresume[0] - if token.name == 'start': - t2 = self._match_first(lexer, token, toresume) - if t2 is not None: - t3 = self._match_second(lexer, t2, []) - return True - elif token.name == 'middle': - t3 = self._match_second(lexer, token, toresume) + t1 = t2 = None + if toresume[0].name == 'start': + t1 = toresume[0] + elif toresume[0].name == 'middle': + t2 = toresume[0] else: - raise Exception, "invalid flag %r" % flag - return True - def match(self, lexer, parent): - # see if we can match our start token - line = self.get_line(lexer) - m = self.start_re.match(line, lexer.x) - if m: - t1 = self._add_from_regex('start', lexer, parent, m, m.groupdict()) - t2 = self._match_first(lexer, t1, []) - if t2 is not None: - t3 = self._match_second(lexer, t2, []) - return True - else: - # region was not matched; we never started. so return false - return False + raise Exception, "invalid name %r" % toresume[0].name - def _match_first(self, lexer, parent, toresume=[]): - reenter = len(toresume) > 1 - if reenter: - assert parent is toresume[0] - d1 = parent.matchd - assert parent.name == 'start' - null_t = None - middle_re = self._compile_middle(d1) - d2 = {} - - # ok, so as long as we aren't done (we haven't found an end token), - # keep reading input - t2 = None - done = False - while not done and lexer.y < len(lexer.lines): - old_y = lexer.y - - # ok, as long as we haven't found the end token, and have more - # data on the current line to read, we will process tokens - while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1: - # if we are reentering mid-parse, then that takes precedence - if reenter: - reenter = False - rule2 = toresume[1].rule - rule2.resume(lexer, toresume[1:]) - null_t = None - - line = self.get_line(lexer) - - # see if we have found the middle token. if so, we can then - # proceed to "stage 2" - m2 = middle_re.match(line, lexer.x) - if m2: - d2 = dict(d1.items() + m2.groupdict().items()) - t2 = self._add_from_regex('middle', lexer, parent, m2, d2) - done = True - break - - # ok, we need to check all our rules now, in order. if we - # find a token, note that we found one and exit the loop - found = False - for rule in self.grammar1.rules: - if rule.match(lexer, parent): - found = True - null_t = None - break - - # if we never found a token, then we need to add another - # character to the current null token (which we should - # create if it isn't set). - if not found: - if null_t is None: - null_t = Token('null', None, lexer.y, lexer.x, '', parent) - lexer.add_token(null_t) - null_t.add_to_string(line[lexer.x]) - lexer.x += 1 - - # ok, since we're soon going to be on a different line (or - # already are), we want a new null token. so forget about the - # current one. 
- null_t = None - - # if we're still on the same line at this point (and not done) - # then that means we're finished with the line and should move - # on to the next one here - if not done and old_y == lexer.y: - lexer.y += 1 - lexer.x = 0 - return t2 - - def _match_second(self, lexer, parent, toresume=[]): - reenter = len(toresume) > 1 - if reenter: - assert parent is toresume[0] - assert parent.name == 'middle' - d3 = parent.matchd - null_t = None - end_re = self._compile_end(d3) - - # ok, so as long as we aren't done (we haven't found an end token), - # keep reading input - t3 = None - done = False - while not done and lexer.y < len(lexer.lines): - old_y = lexer.y - - # if we are reentering mid-parse, then that takes precedence - if reenter: - reenter = False - rule2 = toresume[1].rule - rule2.resume(lexer, toresume[1:]) - null_t = None - - # ok, as long as we haven't found the end token, and have more - # data on the current line to read, we will process tokens - while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1: - # see if we have found the middle token. if so, we can then - # proceed to "stage 2" - line = self.get_line(lexer) - m3 = end_re.match(line, lexer.x) - if m3: - t3 = self._add_from_regex('end', lexer, parent, m3, {}) - done = True - break - - # ok, we need to check all our rules now, in order. if we - # find a token, note that we found one and exit the loop - found = False - for rule in self.grammar2.rules: - if rule.match(lexer, parent): - found = True - null_t = None - break - - # if we never found a token, then we need to add another - # character to the current null token (which we should - # create if it isn't set). - if not found: - if null_t is None: - null_t = Token('null', None, lexer.y, lexer.x, '', parent) - lexer.add_token(null_t) - null_t.add_to_string(line[lexer.x]) - lexer.x += 1 - - # ok, since we're soon going to be on a different line (or - # already are), we want a new null token. so forget about the - # current one. 
- null_t = None - - # if we're still on the same line at this point (and not done) - # then that means we're finished with the line and should move - # on to the next one here - if not done and old_y == lexer.y: - lexer.y += 1 - lexer.x = 0 - - # alright, we're finally done processing; return true - return t3 + if t1 is not None: + #assert t1.name == 'start' + if self.middle: + stopre = re.compile(self.middle, self.reflags) + else: + stopre = None + for t2 in self._lex_first(lexer, toresume, 'middle', stopre): + yield t2 + toresume = [t2] + if t2 is not None: + assert t2.name == 'middle' + if self.end: + stopre = re.compile(self.end, self.reflags) + else: + stopre = None + for t3 in self._lex_second(lexer, toresume, 'end', stopre): + yield t3 + #toresume = [t3] + raise StopIteration class NocaseDualRegionRule(DualRegionRule): - def _compile_start(self): - return re.compile(self.start, re.IGNORECASE) - def _compile_middle(self, d): - return re.compile(self.middle % d, re.IGNORECASE) - def _compile_end(self, d): - return re.compile(self.end % d, re.IGNORECASE) + reflags = re.IGNORECASE class Grammar: rules = [] @@ -455,17 +281,16 @@ class Lexer: self.y = 0 self.x = 0 self.lines = None - self.tokens = [] - - def add_token(self, t): - self.tokens.append(t) - + def get_line(self): + return self.lines[lexer.y] + '\n' def lex(self, lines, y=0, x=0): self.y = y self.x = x self.lines = lines self.tokens = [] - + for t in self._lex(): + yield t + raise StopIteration def resume(self, lines, y, x, token): self.y = y self.x = x @@ -474,9 +299,9 @@ class Lexer: toresume = token.parents() # this is a special case for the "middle" rule of a dual region rule - i = 0 + i = 1 while i < len(toresume): - if i > 0 and toresume[i].name == 'middle' and toresume[i-1].name == 'start': + if toresume[i].name == 'middle' and toresume[i-1].name == 'start': del toresume[i-1] else: i += 1 @@ -488,32 +313,31 @@ class Lexer: yield t raise StopIteration - def __iter__(self): - if self.lines is None: - raise Exception, "no lines to lex" - return self - - def next(self): + def _lex(self): null_t = None - if self.tokens: - return self.tokens.pop(0) while self.y < len(self.lines): - line = self.lines[self.y] + '\n' - while self.x < len(line): - curr_t = None + line = self.get_line() + while not done and self.x < len(line): + m = None for rule in self.grammar.rules: - if rule.match(self, None): - assert self.tokens, "match rendered no tokens?" - return self.tokens.pop(0) - if null_t is None: - null_t = Token('null', None, self.y, self.x, '') - self.add_token(null_t) - null_t.add_to_string(line[self.x]) - self.x += 1 - null_t = None + m = rule.match(self, parent) + if m: + if null_t: + yield null_t + null_t = None + for t in rule.lex(self, parent, m): + yield t + break + + if not m: + if not null_t: + null_t = Token('null', None, self.y, self.x, '', parent) + if self.x < len(line): + null_t.add_to_string(line[self.x]) + self.x += 1 + if null_t: + yield null_t + null_t = None self.y += 1 self.x = 0 - if self.tokens: - return self.tokens.pop(0) - else: - raise StopIteration + raise StopIteration
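
For reference, a minimal driver sketch of the reworked generator-based API introduced above (not part of the patch). It assumes a Lexer(grammar) constructor, a RegionRule(name, start_regex, sub_grammar, end_regex) signature, and name/y/x attributes on Token; none of these are shown in full in this diff, so treat the names below as illustrative assumptions rather than the module's confirmed interface.

    from lex3 import Grammar, RegionRule, Lexer

    class StringGrammar(Grammar):
        # no sub-rules: everything between the quotes comes back as 'null' tokens
        rules = []

    class ToyGrammar(Grammar):
        # assumed RegionRule signature: (name, start_regex, sub_grammar, end_regex)
        rules = [RegionRule('string', r'"', StringGrammar(), r'"')]

    lexer = Lexer(ToyGrammar())                  # assumed constructor
    lines = ['x = "hello world"', 'y = 2']

    # lex() is now a generator: tokens stream out as they are parsed, instead of
    # being buffered in lexer.tokens and popped one at a time by next()
    for t in lexer.lex(lines):
        print t.name, (t.y, t.x)

    # after an edit, resume(lines, y, x, token) re-enters the saved token's rule
    # stack via token.parents() rather than re-parsing the whole buffer

The point of the sketch is the calling convention: both lex() and resume() are consumed by iteration, and region rules re-enter nested state through the resumed token's parent chain instead of a separate match/add_token pass.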