parent a780d9d61a
commit b33772cb8f
IDEAS | 5

@@ -1,3 +1,8 @@
+2007/07/15:
+
+Rename "lexing" to "parsing" since really we have moved way beyond a simple
+lexing/tokenization strategy.
+
 2007/07/14:

 The rules are currently confusingly implemented, and have poor performance when
lex3.py | 388

@@ -116,63 +116,39 @@ class RegionRule(Rule):
         self.grammar = grammar
         self.end = end
         self.start_re = re.compile(start, self.reflags)
-    def resume(self, lexer, toresume):
-        if not toresume:
-            raise Exception, "can't resume without tokens to resume!"
-        for t in self._lex(lexer, None, None, toresume):
-            yield t
-        raise StopIteration
     def match(self, lexer, parent):
         return self.start_re.match(self.get_line(lexer), lexer.x)
     def lex(self, lexer, parent, m):
-        for t in self._lex(lexer, parent, m, []):
-            yield t
+        t1 = self.make_token(lexer, 'start', None, m, m.groupdict())
+        yield t1
+        if self.end:
+            endre = re.compile(self.end % t1.matchd, self.reflags)
+        else:
+            endre = None
+        for t2 in self._lex(lexer, [t1], 'start', 'end'):
+            yield t2
+        raise StopIteration
+    def resume(self, lexer, toresume):
+        assert toresume
+        t1 = toresume[0]
+        if self.end:
+            endre = re.compile(self.end % t1.matchd, self.reflags)
+        else:
+            endre = None
+        for t2 in self._lex(lexer, t1, 'end', endre):
+            yield t2
         raise StopIteration

-    def _lex(self, lexer, parent, m, toresume=[]):
-        # this determines whether we are still reentering. if len(toresume) == 1
-        # then it means that we have been reentering but will not continue, so
-        # reenter will be false.
+    def _lex(self, lexer, toresume, stopname, stopre):
+        parent = toresume[0]
         reenter = len(toresume) > 1
-        # we either need a match object, or a token to resume
-        assert m or reenter, "we need a current match, or a previous match"
-
-        if m:
-            # if we had a match, then it becomes the parent, and we save its
-            # subgroup dict
-            d = m.groupdict()
-            yield self.make_token(lexer, 'start', parent, m, d)
-        else:
-            # otherwise, we should be resuming the start token, so let's pull
-            # the relevant info out of the token
-            parent = toresume[0]
-            d = parent.matchd
-            assert parent.name == 'start'
-
-        # this token, when set, will store unmatched characters which will be
-        # combined into a single "null" token when the end of the document, or
-        # a named-token, is reached.
-        null_t = None
-
-        # if we have an end regex, then build it here. notice that it can
-        # reference named groups from the start token. if we have no end,
-        # well, then, we're never getting out of here alive!
-        if self.end:
-            end_re = re.compile(self.end % d, self.reflags)
-
-        # ok, so as long as we aren't done (we haven't found an end token),
-        # keep reading input
+        null_t = None
         done = False
         while not done and lexer.y < len(lexer.lines):
             old_y = lexer.y
             line = self.get_line(lexer)

-            # ok, as long as we haven't found the end token, and have more
-            # data on the current line to read, we will process tokens
             while not done and lexer.y == old_y and lexer.x < len(line):
-                # if we are reentering mid-parse, then that takes precedence.
-                # afterwards, we need to clean-up and get our new state in order
                 if reenter:
                     reenter = False
                     for t in toresume[1].rule.resume(lexer, toresume[1:]):
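
The refactored lex/resume pair above both lean on the same trick: the end pattern is treated as a template whose %(name)s placeholders are filled in from the named groups captured by the start match (t1.matchd). A minimal, self-contained sketch of that interpolation technique, using an illustrative heredoc-style pattern that is not taken from the patch:

    import re

    start_pat = r'<<(?P<tag>[A-Z]+)'   # start regex with a named group
    end_pat   = r'^%(tag)s$'           # end template referencing that group

    m = re.compile(start_pat).match('<<EOF')
    d = m.groupdict()                  # {'tag': 'EOF'}
    end_re = re.compile(end_pat % d)   # compiles to ^EOF$
    assert end_re.match('EOF')
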
@@ -184,21 +160,17 @@ class RegionRule(Rule):
                 lexer.x = 0
                 line = self.get_line(lexer)

-                # if we are looking for an end token, then see if we've
-                # found it. if so, then we are done!
-                if self.end:
-                    m = end_re.match(line, lexer.x)
+                if stopre:
+                    m = stopre.match(line, lexer.x)
                     if m:
                         if null_t:
                             yield null_t
                             null_t = None
-                        yield self.make_token(lexer, 'end', parent, m, {})
+                        yield self.make_token(lexer, stopname, parent, m, {})
                         done = True
                         break

-                # ok, we need to check all our rules now, in order. if we find a
-                # token, note that we found one and exit the loop
-                found = False
+                m = None
                 for rule in self.grammar.rules:
                     m = rule.match(lexer, parent)
                     if m:
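
This hunk generalizes the inner loop so the shared _lex helper can stop on any named token (stopname/stopre), while still buffering characters that no rule matched into a single 'null' token and flushing it just before a real token is emitted. A simplified, self-contained illustration of that accumulate-and-flush pattern (the word rule and names here are hypothetical, not part of lex3.py):

    import re

    def chunks(line, word_re=re.compile(r'\w+')):
        x, buf = 0, ''
        while x < len(line):
            m = word_re.match(line, x)
            if m:
                if buf:
                    yield ('null', buf)   # flush buffered junk first
                    buf = ''
                yield ('word', m.group(0))
                x = m.end()
            else:
                buf += line[x]            # no rule matched: buffer one character
                x += 1
        if buf:
            yield ('null', buf)

    print(list(chunks('foo, bar!')))
    # [('word', 'foo'), ('null', ', '), ('word', 'bar'), ('null', '!')]
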
@@ -207,233 +179,87 @@ class RegionRule(Rule):
                         null_t = None
                         for t in rule.lex(lexer, parent, m):
                             yield t
-                        found = True
                         break

-                # if we never found a token, then we need to add another
-                # character to the current null token (which we should
-                # create if it isn't set).
-                if not found:
+                if not m:
                     if not null_t:
                         null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                     if lexer.x < len(line):
                         null_t.add_to_string(line[lexer.x])
                         lexer.x += 1

-            # ok, since we're soon going to be on a different line (or
-            # already are), we want a new null token. so forget about the
-            # current one (i.e. stop adding to it).
             if null_t:
                 yield null_t
                 null_t = None

-            # if we're still on the same line at this point (and not done)
-            # then that means we're finished with the line and should move
-            # on to the next one here
             if not done and old_y == lexer.y:
                 lexer.y += 1
                 lexer.x = 0

         raise StopIteration
 class NocaseRegionRule(RegionRule):
     reflags = re.IGNORECASE

-class DualRegionRule(Rule):
+class DualRegionRule(RegionRule):
     def __init__(self, name, start, grammar1, middle, grammar2, end, group=None):
         Rule.__init__(self, name)
-        self.start = start
+        self.start_re = re.compile(start, self.reflags)
         self.grammar1 = grammar1
         self.middle = middle
         self.grammar2 = grammar2
         self.end = end
-        self.start_re = self._compile_start()
-        self._set_group(group)
-
-    def _compile_start(self):
-        return re.compile(self.start)
-    def _compile_middle(self, d):
-        return re.compile(self.middle % d)
-    def _compile_end(self, d):
-        return re.compile(self.end % d)
-
-    def _add_from_regex(self, name, lexer, parent, m, matchd={}):
-        s = m.group(0)
-        token = self.make_token(lexer, s, name, parent, matchd)
-        lexer.add_token(token)
-        lexer.x += len(s)
-        return token
+    def match(self, lexer, parent):
+        return self.start_re.match(self.get_line(lexer), lexer.x)
+    def lex(self, lexer, parent, m):
+        assert m
+        t1 = self.make_token(lexer, 'start', parent, m, m.groupdict())
+        yield t1
+
+        t2 = None
+        if self.middle:
+            stopre = re.compile(self.middle % t1.groupdict(), self.reflags)
+        else:
+            stopre = None
+        for t2 in self._lex(lexer, [t1], 'middle', stopre):
+            yield t2
+
+        if t2 is not None and t2.name == 'middle':
+            if self.end:
+                stopre = re.compile(self.end % t2.groupdict(), self.reflags)
+            else:
+                stopre = None
+            for t3 in self._lex(lexer, [t2], 'end', stopre):
+                yield t3
+
+        raise StopIteration
     def resume(self, lexer, toresume):
         assert toresume, "can't resume without tokens to resume!"
-        token = toresume[0]
-        if token.name == 'start':
-            t2 = self._match_first(lexer, token, toresume)
-            if t2 is not None:
-                t3 = self._match_second(lexer, t2, [])
-            return True
-        elif token.name == 'middle':
-            t3 = self._match_second(lexer, token, toresume)
+        t1 = t2 = None
+        if toresume[0].name == 'start':
+            t1 = toresume[0]
+        elif toresume[0].name == 'middle':
+            t2 = toresume[0]
         else:
-            raise Exception, "invalid flag %r" % flag
-        return True
-
-    def match(self, lexer, parent):
-        # see if we can match our start token
-        line = self.get_line(lexer)
-        m = self.start_re.match(line, lexer.x)
-        if m:
-            t1 = self._add_from_regex('start', lexer, parent, m, m.groupdict())
-            t2 = self._match_first(lexer, t1, [])
-            if t2 is not None:
-                t3 = self._match_second(lexer, t2, [])
-            return True
-        else:
-            # region was not matched; we never started. so return false
-            return False
-
-    def _match_first(self, lexer, parent, toresume=[]):
-        reenter = len(toresume) > 1
-        if reenter:
-            assert parent is toresume[0]
-        d1 = parent.matchd
-        assert parent.name == 'start'
-        null_t = None
-        middle_re = self._compile_middle(d1)
-        d2 = {}
-
-        # ok, so as long as we aren't done (we haven't found an end token),
-        # keep reading input
-        t2 = None
-        done = False
-        while not done and lexer.y < len(lexer.lines):
-            old_y = lexer.y
-
-            # ok, as long as we haven't found the end token, and have more
-            # data on the current line to read, we will process tokens
-            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
-                # if we are reentering mid-parse, then that takes precedence
-                if reenter:
-                    reenter = False
-                    rule2 = toresume[1].rule
-                    rule2.resume(lexer, toresume[1:])
-                    null_t = None
-
-                line = self.get_line(lexer)
-
-                # see if we have found the middle token. if so, we can then
-                # proceed to "stage 2"
-                m2 = middle_re.match(line, lexer.x)
-                if m2:
-                    d2 = dict(d1.items() + m2.groupdict().items())
-                    t2 = self._add_from_regex('middle', lexer, parent, m2, d2)
-                    done = True
-                    break
-
-                # ok, we need to check all our rules now, in order. if we
-                # find a token, note that we found one and exit the loop
-                found = False
-                for rule in self.grammar1.rules:
-                    if rule.match(lexer, parent):
-                        found = True
-                        null_t = None
-                        break
-
-                # if we never found a token, then we need to add another
-                # character to the current null token (which we should
-                # create if it isn't set).
-                if not found:
-                    if null_t is None:
-                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
-                        lexer.add_token(null_t)
-                    null_t.add_to_string(line[lexer.x])
-                    lexer.x += 1
-
-            # ok, since we're soon going to be on a different line (or
-            # already are), we want a new null token. so forget about the
-            # current one.
-            null_t = None
-
-            # if we're still on the same line at this point (and not done)
-            # then that means we're finished with the line and should move
-            # on to the next one here
-            if not done and old_y == lexer.y:
-                lexer.y += 1
-                lexer.x = 0
-        return t2
-
-    def _match_second(self, lexer, parent, toresume=[]):
-        reenter = len(toresume) > 1
-        if reenter:
-            assert parent is toresume[0]
-        assert parent.name == 'middle'
-        d3 = parent.matchd
-        null_t = None
-        end_re = self._compile_end(d3)
-
-        # ok, so as long as we aren't done (we haven't found an end token),
-        # keep reading input
-        t3 = None
-        done = False
-        while not done and lexer.y < len(lexer.lines):
-            old_y = lexer.y
-
-            # if we are reentering mid-parse, then that takes precedence
-            if reenter:
-                reenter = False
-                rule2 = toresume[1].rule
-                rule2.resume(lexer, toresume[1:])
-                null_t = None
-
-            # ok, as long as we haven't found the end token, and have more
-            # data on the current line to read, we will process tokens
-            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
-                # see if we have found the middle token. if so, we can then
-                # proceed to "stage 2"
-                line = self.get_line(lexer)
-                m3 = end_re.match(line, lexer.x)
-                if m3:
-                    t3 = self._add_from_regex('end', lexer, parent, m3, {})
-                    done = True
-                    break
-
-                # ok, we need to check all our rules now, in order. if we
-                # find a token, note that we found one and exit the loop
-                found = False
-                for rule in self.grammar2.rules:
-                    if rule.match(lexer, parent):
-                        found = True
-                        null_t = None
-                        break
-
-                # if we never found a token, then we need to add another
-                # character to the current null token (which we should
-                # create if it isn't set).
-                if not found:
-                    if null_t is None:
-                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
-                        lexer.add_token(null_t)
-                    null_t.add_to_string(line[lexer.x])
-                    lexer.x += 1
-
-            # ok, since we're soon going to be on a different line (or
-            # already are), we want a new null token. so forget about the
-            # current one.
-            null_t = None
-
-            # if we're still on the same line at this point (and not done)
-            # then that means we're finished with the line and should move
-            # on to the next one here
-            if not done and old_y == lexer.y:
-                lexer.y += 1
-                lexer.x = 0
-
-        # alright, we're finally done processing; return true
-        return t3
+            raise Exception, "invalid name %r" % toresume[0].name
+
+        if t1 is not None:
+            #assert t1.name == 'start'
+            if self.middle:
+                stopre = re.compile(self.middle, self.reflags)
+            else:
+                stopre = None
+            for t2 in self._lex_first(lexer, toresume, 'middle', stopre):
+                yield t2
+            toresume = [t2]
+        if t2 is not None:
+            assert t2.name == 'middle'
+            if self.end:
+                stopre = re.compile(self.end, self.reflags)
+            else:
+                stopre = None
+            for t3 in self._lex_second(lexer, toresume, 'end', stopre):
+                yield t3
+                #toresume = [t3]
+        raise StopIteration
 class NocaseDualRegionRule(DualRegionRule):
-    def _compile_start(self):
-        return re.compile(self.start, re.IGNORECASE)
-    def _compile_middle(self, d):
-        return re.compile(self.middle % d, re.IGNORECASE)
-    def _compile_end(self, d):
-        return re.compile(self.end % d, re.IGNORECASE)
+    reflags = re.IGNORECASE

 class Grammar:
     rules = []
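
The DualRegionRule rewrite above replaces the hand-rolled _match_first/_match_second scanners with two passes through the shared lexing helper: grammar1 applies from the start token until the middle pattern matches, then grammar2 applies until the end pattern matches, with the middle and end patterns %-interpolated from earlier matches. A toy, self-contained sketch of that start/middle/end splitting (it only slices text and ignores the sub-grammars, so it illustrates the shape of the algorithm, not the rule's actual implementation):

    import re

    def dual_region(text, start, middle, end):
        """Split "s/foo/bar/"-style text into start, body1, middle, body2, end."""
        m1 = re.compile(start).match(text)
        mid_re = re.compile(middle % m1.groupdict())   # middle pattern built from start groups
        end_re = re.compile(end % m1.groupdict())      # end pattern built the same way
        m2 = mid_re.search(text, m1.end())
        m3 = end_re.search(text, m2.end())
        return (m1.group(0), text[m1.end():m2.start()], m2.group(0),
                text[m2.end():m3.start()], m3.group(0))

    print(dual_region('s/foo/bar/', r's(?P<delim>/)', r'%(delim)s', r'%(delim)s'))
    # ('s/', 'foo', '/', 'bar', '/')
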
@@ -455,17 +281,16 @@ class Lexer:
         self.y = 0
         self.x = 0
         self.lines = None
-        self.tokens = []
-    def add_token(self, t):
-        self.tokens.append(t)
+    def get_line(self):
+        return self.lines[lexer.y] + '\n'

     def lex(self, lines, y=0, x=0):
         self.y = y
         self.x = x
         self.lines = lines
         self.tokens = []
+        for t in self._lex():
+            yield t
+        raise StopIteration
     def resume(self, lines, y, x, token):
         self.y = y
         self.x = x
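
With this hunk, Lexer.lex stops pushing tokens into a self.tokens buffer and instead becomes a generator that resets the lexer's position and delegates to _lex. A self-contained toy showing just that delegation pattern (the class and fields below are illustrative, not the real Lexer):

    class Toy:
        def lex(self, lines):
            # public entry point: reset state, then delegate to the generator
            self.lines, self.y, self.x = lines, 0, 0
            for t in self._lex():
                yield t

        def _lex(self):
            while self.y < len(self.lines):
                yield (self.y, self.lines[self.y])
                self.y += 1

    print(list(Toy().lex(['a', 'b'])))   # [(0, 'a'), (1, 'b')]
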
@@ -474,9 +299,9 @@ class Lexer:
         toresume = token.parents()

         # this is a special case for the "middle" rule of a dual region rule
-        i = 0
+        i = 1
         while i < len(toresume):
-            if i > 0 and toresume[i].name == 'middle' and toresume[i-1].name == 'start':
+            if toresume[i].name == 'middle' and toresume[i-1].name == 'start':
                 del toresume[i-1]
             else:
                 i += 1
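
The index change above (starting at i = 1 and dropping the i > 0 guard) keeps the same behaviour: whenever a 'middle' token directly follows its 'start' token in the resume chain, the start token is dropped so only the middle token is re-entered. A self-contained illustration of that collapsing loop, run on plain name strings instead of Token objects:

    def collapse(names):
        i = 1
        while i < len(names):
            if names[i] == 'middle' and names[i-1] == 'start':
                del names[i-1]    # the middle token supersedes its start token
            else:
                i += 1
        return names

    print(collapse(['start', 'middle', 'start', 'word']))
    # ['middle', 'start', 'word']
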
@@ -488,32 +313,31 @@ class Lexer:
             yield t
         raise StopIteration

-    def __iter__(self):
-        if self.lines is None:
-            raise Exception, "no lines to lex"
-        return self
-
-    def next(self):
+    def _lex(self):
         null_t = None
-        if self.tokens:
-            return self.tokens.pop(0)
         while self.y < len(self.lines):
-            line = self.lines[self.y] + '\n'
-            while self.x < len(line):
-                curr_t = None
+            line = self.get_line()
+            while not done and self.x < len(line):
+                m = None
                 for rule in self.grammar.rules:
-                    if rule.match(self, None):
-                        assert self.tokens, "match rendered no tokens?"
-                        return self.tokens.pop(0)
-                if null_t is None:
-                    null_t = Token('null', None, self.y, self.x, '')
-                    self.add_token(null_t)
-                null_t.add_to_string(line[self.x])
-                self.x += 1
-            null_t = None
+                    m = rule.match(self, parent)
+                    if m:
+                        if null_t:
+                            yield null_t
+                            null_t = None
+                        for t in rule.lex(self, parent, m):
+                            yield t
+                        break
+                if not m:
+                    if not null_t:
+                        null_t = Token('null', None, self.y, self.x, '', parent)
+                    if self.x < len(line):
+                        null_t.add_to_string(line[self.x])
+                        self.x += 1
+            if null_t:
+                yield null_t
+                null_t = None
             self.y += 1
             self.x = 0
-        if self.tokens:
-            return self.tokens.pop(0)
-        else:
-            raise StopIteration
+        raise StopIteration
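
One note on the `raise StopIteration` statements that end the generators throughout this patch: under the Python 2 semantics this code targets, raising StopIteration inside a generator simply ends iteration, exactly like falling off the end of the function (on Python 3.7+, PEP 479 turns it into a RuntimeError instead). A two-line Python 2 check:

    def g():
        yield 1
        raise StopIteration
        yield 2            # never reached

    print list(g())        # prints [1]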