diff --git a/lex3.py b/lex3.py index 907ea1b..5d7c7bd 100755 --- a/lex3.py +++ b/lex3.py @@ -116,26 +116,27 @@ class RegionRule(Rule): self.grammar = grammar self.end = end self.start_re = re.compile(start, self.reflags) - def resume(self, lexer, toresume): - assert toresume, "can't resume without tokens to resume!" - self._lex(lexer, None, None, toresume) - return True + if not toresume: + raise Exception, "can't resume without tokens to resume!" + for t in self._lex(lexer, None, None, toresume): + yield t + raise StopIteration def match(self, lexer, parent): return self.start_re.match(self.get_line(lexer), lexer.x) def lex(self, lexer, parent, m): - self._lex(lexer, parent, m, []) - - def _add_from_regex(self, name, lexer, parent, m, matchd={}): - s = m.group(0) - token = self.make_token(lexer, s, name, parent, matchd) - lexer.add_token(token) - lexer.x += len(s) - return token + for t in self._lex(lexer, parent, m, []): + yield t + raise StopIteration def _lex(self, lexer, parent, m, toresume=[]): + # this determines whether we are still reentering. if len(toresume) == 1 + # then it means that we have been reentering but will not continue, so + # reenter will be false. + reenter = len(toresume) > 1 + # we either need a match object, or a token to resume - assert m or len(toresume) > 0 + assert m or reenter, "we need a current match, or a previous match" if m: # if we had a match, then it becomes the parent, and we save its @@ -148,12 +149,11 @@ class RegionRule(Rule): parent = toresume[0] d = parent.matchd assert parent.name == 'start' - null_t = None - # this determines whether we are still reentering. if len(toresume) == 1 - # then it means that we have been reentering but will not continue, so - # reenter will be false. - reenter = len(toresume) > 1 + # this token, when set, will store unmatched characters which will be + # combined into a single "null" token when the end of the document, or + # a named-token, is reached. + null_t = None # if we have an end regex, then build it here. notice that it can # reference named groups from the start token. if we have no end, @@ -166,22 +166,24 @@ class RegionRule(Rule): done = False while not done and lexer.y < len(lexer.lines): old_y = lexer.y + line = self.get_line(lexer) # ok, as long as we haven't found the end token, and have more # data on the current line to read, we will process tokens - while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1: - # if we are reentering mid-parse, then that takes precedence + while not done and lexer.y == old_y and lexer.x < len(line): + # if we are reentering mid-parse, then that takes precedence. + # afterwards, we need to clean-up and get our new state in order if reenter: reenter = False for t in toresume[1].rule.resume(lexer, toresume[1:]): yield t if lexer.y >= len(lexer.lines): raise StopIteration - elif lexer.x >= len(lexer.lines[lexer.y]) + 1: + elif lexer.x >= len(line): lexer.y += 1 lexer.x = 0 - line = self.get_line(lexer) + # if we are looking for an end token, then see if we've # found it. if so, then we are done! if self.end: @@ -194,8 +196,8 @@ class RegionRule(Rule): done = True break - # ok, we need to check all our rules now, in order. if we - # find a token, note that we found one and exit the loop + # ok, we need to check all our rules now, in order. if we find a + # token, note that we found one and exit the loop found = False for rule in self.grammar.rules: m = rule.match(lexer, parent) @@ -205,7 +207,7 @@ class RegionRule(Rule): null_t = None for t in rule.lex(lexer, parent, m): yield t - found = True + found = True break # if we never found a token, then we need to add another @@ -233,7 +235,6 @@ class RegionRule(Rule): lexer.x = 0 raise StopIteration - class NocaseRegionRule(RegionRule): reflags = re.IGNORECASE @@ -426,10 +427,18 @@ class DualRegionRule(Rule): # alright, we're finally done processing; return true return t3 +class NocaseDualRegionRule(DualRegionRule): + def _compile_start(self): + return re.compile(self.start, re.IGNORECASE) + def _compile_middle(self, d): + return re.compile(self.middle % d, re.IGNORECASE) + def _compile_end(self, d): + return re.compile(self.end % d, re.IGNORECASE) class Grammar: rules = [] def __init__(self): + # XYZ maybe this is unnecessary for rule in self.rules: if hasattr(rule, 'grammar') and rule.grammar is None: rule.grammar = self @@ -473,7 +482,11 @@ class Lexer: i += 1 if toresume: - toresume[0].rule.resume(self, toresume) + for t in toresume[0].rule.resume(self, toresume): + yield t + for t in self._lex(): + yield t + raise StopIteration def __iter__(self): if self.lines is None: @@ -504,11 +517,3 @@ class Lexer: return self.tokens.pop(0) else: raise StopIteration - -class NocaseDualRegionRule(DualRegionRule): - def _compile_start(self): - return re.compile(self.start, re.IGNORECASE) - def _compile_middle(self, d): - return re.compile(self.middle % d, re.IGNORECASE) - def _compile_end(self, d): - return re.compile(self.end % d, re.IGNORECASE)