parent a780d9d61a
commit b33772cb8f

IDEAS: 5 changed lines
@@ -1,3 +1,8 @@
2007/07/15:

Rename "lexing" to "parsing" since really we have moved way beyond a simple
lexing/tokenization strategy.

2007/07/14:

The rules are currently confusingly implemented, and have poor performance when
lex3.py: 388 changed lines
@@ -116,63 +116,39 @@ class RegionRule(Rule):
        self.grammar = grammar
        self.end = end
        self.start_re = re.compile(start, self.reflags)
    def resume(self, lexer, toresume):
        if not toresume:
            raise Exception, "can't resume without tokens to resume!"
        for t in self._lex(lexer, None, None, toresume):
            yield t
        raise StopIteration
    def match(self, lexer, parent):
        return self.start_re.match(self.get_line(lexer), lexer.x)
    def lex(self, lexer, parent, m):
        for t in self._lex(lexer, parent, m, []):
            yield t
        t1 = self.make_token(lexer, 'start', None, m, m.groupdict())
        yield t1
        if self.end:
            endre = re.compile(self.end % t1.matchd, self.reflags)
        else:
            endre = None
        for t2 in self._lex(lexer, [t1], 'end', endre):
            yield t2
        raise StopIteration
    def resume(self, lexer, toresume):
        assert toresume
        t1 = toresume[0]
        if self.end:
            endre = re.compile(self.end % t1.matchd, self.reflags)
        else:
            endre = None
        for t2 in self._lex(lexer, toresume, 'end', endre):
            yield t2
        raise StopIteration

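The end pattern here is a template: re.compile(self.end % t1.matchd) interpolates named groups captured by the start regex, which is how regions like heredocs can demand a matching terminator. A minimal stand-alone sketch of the trick (hypothetical patterns, not from this commit):

    import re

    # the start regex captures a named group; the end pattern refers to
    # it via Python %-interpolation, as in "self.end % t1.matchd" above
    start_re = re.compile(r'<<(?P<tag>[A-Z]+)')    # matches "<<EOF"
    end_tmpl = r'(?P<end>%(tag)s)'                 # %(tag)s filled in later

    m = start_re.match('<<EOF')
    end_re = re.compile(end_tmpl % m.groupdict())  # compiles to r'(?P<end>EOF)'
    assert end_re.match('EOF')
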
    def _lex(self, lexer, parent, m, toresume=[]):
        # this determines whether we are still reentering. if len(toresume) == 1
        # then it means that we have been reentering but will not continue, so
        # reenter will be false.
    def _lex(self, lexer, toresume, stopname, stopre):
        parent = toresume[0]
        reenter = len(toresume) > 1
        null_t = None

        # we either need a match object, or a token to resume
        assert m or reenter, "we need a current match, or a previous match"

        if m:
            # if we had a match, then it becomes the parent, and we save its
            # subgroup dict
            d = m.groupdict()
            yield self.make_token(lexer, 'start', parent, m, d)
        else:
            # otherwise, we should be resuming the start token, so let's pull
            # the relevant info out of the token
            parent = toresume[0]
            d = parent.matchd
            assert parent.name == 'start'

        # this token, when set, will store unmatched characters which will be
        # combined into a single "null" token when the end of the document, or
        # a named-token, is reached.
        null_t = None

        # if we have an end regex, then build it here. notice that it can
        # reference named groups from the start token. if we have no end,
        # well, then, we're never getting out of here alive!
        if self.end:
            end_re = re.compile(self.end % d, self.reflags)

        # ok, so as long as we aren't done (we haven't found an end token),
        # keep reading input
        done = False
        while not done and lexer.y < len(lexer.lines):
            old_y = lexer.y
            line = self.get_line(lexer)

            # ok, as long as we haven't found the end token, and have more
            # data on the current line to read, we will process tokens
            while not done and lexer.y == old_y and lexer.x < len(line):
                # if we are reentering mid-parse, then that takes precedence.
                # afterwards, we need to clean-up and get our new state in order
                if reenter:
                    reenter = False
                    for t in toresume[1].rule.resume(lexer, toresume[1:]):
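The null_t bookkeeping in this loop buffers characters that no rule claims and flushes them as a single 'null' token when a real token (or the end of input) arrives. A stand-alone sketch of just that behavior, with hypothetical (name, regex) rule pairs:

    import re

    def lex_line(line, rules):
        # rules is a list of (name, compiled_regex) pairs; unmatched
        # characters coalesce into single ('null', text) tokens
        x, pending = 0, []
        while x < len(line):
            for name, regex in rules:
                m = regex.match(line, x)
                if m and m.end() > x:       # guard against zero-width matches
                    if pending:
                        yield ('null', ''.join(pending))
                        pending = []
                    yield (name, m.group(0))
                    x = m.end()
                    break
            else:
                pending.append(line[x])
                x += 1
        if pending:
            yield ('null', ''.join(pending))

    # list(lex_line('a = 12', [('int', re.compile(r'[0-9]+'))]))
    #   => [('null', 'a = '), ('int', '12')]
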
@@ -184,21 +160,17 @@ class RegionRule(Rule):
                        lexer.x = 0
                    line = self.get_line(lexer)

                # if we are looking for an end token, then see if we've
                # found it. if so, then we are done!
                if self.end:
                    m = end_re.match(line, lexer.x)
                if stopre:
                    m = stopre.match(line, lexer.x)
                    if m:
                        if null_t:
                            yield null_t
                            null_t = None
                        yield self.make_token(lexer, 'end', parent, m, {})
                        yield self.make_token(lexer, stopname, parent, m, {})
                        done = True
                        break

                # ok, we need to check all our rules now, in order. if we find a
                # token, note that we found one and exit the loop
                found = False
                m = None
                for rule in self.grammar.rules:
                    m = rule.match(lexer, parent)
                    if m:
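Rule dispatch above is first-match-wins: the grammar's rules are tried in order at the current position, so more specific rules must precede general ones. A hypothetical illustration:

    import re

    rules = [('keyword', re.compile(r'if|else')),
             ('word',    re.compile(r'[a-z]+'))]

    def first_match(line, x):
        # mirrors the loop above: return the first rule matching at x
        for name, regex in rules:
            m = regex.match(line, x)
            if m:
                return (name, m.group(0))
        return None

    # first_match('ifx', 0) => ('keyword', 'if'); swapping the two rules
    # would yield ('word', 'ifx') instead.
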
@@ -207,233 +179,87 @@ class RegionRule(Rule):
                        null_t = None
                        for t in rule.lex(lexer, parent, m):
                            yield t
                        found = True
                        break

                # if we never found a token, then we need to add another
                # character to the current null token (which we should
                # create if it isn't set).
                if not found:
                if not m:
                    if not null_t:
                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                    if lexer.x < len(line):
                        null_t.add_to_string(line[lexer.x])
                    lexer.x += 1

            # ok, since we're soon going to be on a different line (or
            # already are), we want a new null token. so forget about the
            # current one (i.e. stop adding to it).
            if null_t:
                yield null_t
                null_t = None

            # if we're still on the same line at this point (and not done)
            # then that means we're finished with the line and should move
            # on to the next one here
            if not done and old_y == lexer.y:
                lexer.y += 1
                lexer.x = 0

        raise StopIteration

class NocaseRegionRule(RegionRule):
    reflags = re.IGNORECASE

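NocaseRegionRule gets case-insensitivity by overriding only the reflags class attribute that every re.compile call in the base class consults. The same pattern in miniature (hypothetical classes):

    import re

    class Matcher(object):
        reflags = 0
        def __init__(self, pattern):
            # subclasses change behavior purely via the class attribute
            self.regex = re.compile(pattern, self.reflags)

    class NocaseMatcher(Matcher):
        reflags = re.IGNORECASE

    assert Matcher('abc').regex.match('ABC') is None
    assert NocaseMatcher('abc').regex.match('ABC') is not None
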
class DualRegionRule(Rule):
class DualRegionRule(RegionRule):
    def __init__(self, name, start, grammar1, middle, grammar2, end, group=None):
        Rule.__init__(self, name)
        self.start = start
        self.start_re = re.compile(start, self.reflags)
        self.grammar1 = grammar1
        self.middle = middle
        self.grammar2 = grammar2
        self.end = end
        self.start_re = self._compile_start()
        self._set_group(group)
    def match(self, lexer, parent):
        return self.start_re.match(self.get_line(lexer), lexer.x)
    def lex(self, lexer, parent, m):
        assert m
        t1 = self.make_token(lexer, 'start', parent, m, m.groupdict())
        yield t1

    def _compile_start(self):
        return re.compile(self.start)
    def _compile_middle(self, d):
        return re.compile(self.middle % d)
    def _compile_end(self, d):
        return re.compile(self.end % d)
        t2 = None
        if self.middle:
            stopre = re.compile(self.middle % t1.groupdict(), self.reflags)
        else:
            stopre = None
        for t2 in self._lex(lexer, [t1], 'middle', stopre):
            yield t2

    def _add_from_regex(self, name, lexer, parent, m, matchd={}):
        s = m.group(0)
        token = self.make_token(lexer, s, name, parent, matchd)
        lexer.add_token(token)
        lexer.x += len(s)
        return token
        if t2 is not None and t2.name == 'middle':
            if self.end:
                stopre = re.compile(self.end % t2.groupdict(), self.reflags)
            else:
                stopre = None
            for t3 in self._lex(lexer, [t2], 'end', stopre):
                yield t3

        raise StopIteration
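A dual region covers constructs with three delimiters and two differently-lexed interiors, such as a sed/perl-style s/pattern/replacement/. A hypothetical declaration against the constructor above, assuming two placeholder sub-grammar instances:

    pattern_grammar = Grammar()        # placeholder sub-grammars; a real
    replacement_grammar = Grammar()    # mode would give each its own rules

    replace_rule = DualRegionRule(
        'replace',            # name
        r's/',                # start delimiter
        pattern_grammar,      # grammar1: between start and middle
        r'/',                 # middle delimiter
        replacement_grammar,  # grammar2: between middle and end
        r'/',                 # end delimiter
    )
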
    def resume(self, lexer, toresume):
        assert toresume, "can't resume without tokens to resume!"
        token = toresume[0]
        if token.name == 'start':
            t2 = self._match_first(lexer, token, toresume)
            if t2 is not None:
                t3 = self._match_second(lexer, t2, [])
            return True
        elif token.name == 'middle':
            t3 = self._match_second(lexer, token, toresume)
        t1 = t2 = None
        if toresume[0].name == 'start':
            t1 = toresume[0]
        elif toresume[0].name == 'middle':
            t2 = toresume[0]
        else:
            raise Exception, "invalid flag %r" % flag
            return True
    def match(self, lexer, parent):
        # see if we can match our start token
        line = self.get_line(lexer)
        m = self.start_re.match(line, lexer.x)
        if m:
            t1 = self._add_from_regex('start', lexer, parent, m, m.groupdict())
            t2 = self._match_first(lexer, t1, [])
            if t2 is not None:
                t3 = self._match_second(lexer, t2, [])
            return True
        else:
            # region was not matched; we never started. so return false
            return False
        raise Exception, "invalid name %r" % toresume[0].name

    def _match_first(self, lexer, parent, toresume=[]):
        reenter = len(toresume) > 1
        if reenter:
            assert parent is toresume[0]
        d1 = parent.matchd
        assert parent.name == 'start'
        null_t = None
        middle_re = self._compile_middle(d1)
        d2 = {}

        # ok, so as long as we aren't done (we haven't found an end token),
        # keep reading input
        t2 = None
        done = False
        while not done and lexer.y < len(lexer.lines):
            old_y = lexer.y

            # ok, as long as we haven't found the end token, and have more
            # data on the current line to read, we will process tokens
            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
                # if we are reentering mid-parse, then that takes precedence
                if reenter:
                    reenter = False
                    rule2 = toresume[1].rule
                    rule2.resume(lexer, toresume[1:])
                    null_t = None

                line = self.get_line(lexer)

                # see if we have found the middle token. if so, we can then
                # proceed to "stage 2"
                m2 = middle_re.match(line, lexer.x)
                if m2:
                    d2 = dict(d1.items() + m2.groupdict().items())
                    t2 = self._add_from_regex('middle', lexer, parent, m2, d2)
                    done = True
                    break

                # ok, we need to check all our rules now, in order. if we
                # find a token, note that we found one and exit the loop
                found = False
                for rule in self.grammar1.rules:
                    if rule.match(lexer, parent):
                        found = True
                        null_t = None
                        break

                # if we never found a token, then we need to add another
                # character to the current null token (which we should
                # create if it isn't set).
                if not found:
                    if null_t is None:
                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                        lexer.add_token(null_t)
                    null_t.add_to_string(line[lexer.x])
                    lexer.x += 1

            # ok, since we're soon going to be on a different line (or
            # already are), we want a new null token. so forget about the
            # current one.
            null_t = None

            # if we're still on the same line at this point (and not done)
            # then that means we're finished with the line and should move
            # on to the next one here
            if not done and old_y == lexer.y:
                lexer.y += 1
                lexer.x = 0
        return t2

    def _match_second(self, lexer, parent, toresume=[]):
        reenter = len(toresume) > 1
        if reenter:
            assert parent is toresume[0]
        assert parent.name == 'middle'
        d3 = parent.matchd
        null_t = None
        end_re = self._compile_end(d3)

        # ok, so as long as we aren't done (we haven't found an end token),
        # keep reading input
        t3 = None
        done = False
        while not done and lexer.y < len(lexer.lines):
            old_y = lexer.y

            # if we are reentering mid-parse, then that takes precedence
            if reenter:
                reenter = False
                rule2 = toresume[1].rule
                rule2.resume(lexer, toresume[1:])
                null_t = None

            # ok, as long as we haven't found the end token, and have more
            # data on the current line to read, we will process tokens
            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
                # see if we have found the middle token. if so, we can then
                # proceed to "stage 2"
                line = self.get_line(lexer)
                m3 = end_re.match(line, lexer.x)
                if m3:
                    t3 = self._add_from_regex('end', lexer, parent, m3, {})
                    done = True
                    break

                # ok, we need to check all our rules now, in order. if we
                # find a token, note that we found one and exit the loop
                found = False
                for rule in self.grammar2.rules:
                    if rule.match(lexer, parent):
                        found = True
                        null_t = None
                        break

                # if we never found a token, then we need to add another
                # character to the current null token (which we should
                # create if it isn't set).
                if not found:
                    if null_t is None:
                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                        lexer.add_token(null_t)
                    null_t.add_to_string(line[lexer.x])
                    lexer.x += 1

            # ok, since we're soon going to be on a different line (or
            # already are), we want a new null token. so forget about the
            # current one.
            null_t = None

            # if we're still on the same line at this point (and not done)
            # then that means we're finished with the line and should move
            # on to the next one here
            if not done and old_y == lexer.y:
                lexer.y += 1
                lexer.x = 0

        # alright, we're finally done processing; return true
        return t3
        if t1 is not None:
            #assert t1.name == 'start'
            if self.middle:
                stopre = re.compile(self.middle, self.reflags)
            else:
                stopre = None
            for t2 in self._lex_first(lexer, toresume, 'middle', stopre):
                yield t2
                toresume = [t2]
        if t2 is not None:
            assert t2.name == 'middle'
            if self.end:
                stopre = re.compile(self.end, self.reflags)
            else:
                stopre = None
            for t3 in self._lex_second(lexer, toresume, 'end', stopre):
                yield t3
                #toresume = [t3]
        raise StopIteration

class NocaseDualRegionRule(DualRegionRule):
    def _compile_start(self):
        return re.compile(self.start, re.IGNORECASE)
    def _compile_middle(self, d):
        return re.compile(self.middle % d, re.IGNORECASE)
    def _compile_end(self, d):
        return re.compile(self.end % d, re.IGNORECASE)
    reflags = re.IGNORECASE

class Grammar:
    rules = []

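A grammar is just an ordered list of rule instances. Assuming RegionRule's constructor takes (name, start, grammar, end), and assuming the module also provides a simple single-regex rule class (called PatternRule here purely for illustration), wiring a region plus a fallback might look like:

    class StringGrammar(Grammar):
        rules = []   # no sub-highlighting inside strings in this sketch

    class TinyGrammar(Grammar):
        rules = [
            RegionRule('string', r'"', StringGrammar(), r'"'),
            PatternRule('word', r'[a-zA-Z_]+'),   # hypothetical rule class
        ]
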
@@ -455,17 +281,16 @@ class Lexer:
        self.y = 0
        self.x = 0
        self.lines = None
        self.tokens = []

    def add_token(self, t):
        self.tokens.append(t)

    def get_line(self):
        return self.lines[self.y] + '\n'
    def lex(self, lines, y=0, x=0):
        self.y = y
        self.x = x
        self.lines = lines
        self.tokens = []

        for t in self._lex():
            yield t
        raise StopIteration
    def resume(self, lines, y, x, token):
        self.y = y
        self.x = x

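Note that get_line appends '\n' to the stored line so end-of-line regexes have a real character to consume; self.lines itself holds bare lines. Illustration (hypothetical pattern):

    import re

    comment = re.compile(r'#[^\n]*\n')     # wants to eat the newline
    line = '# trailing comment'            # lines are stored without '\n'
    assert comment.match(line) is None
    assert comment.match(line + '\n') is not None
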
@@ -474,9 +299,9 @@ class Lexer:
        toresume = token.parents()

        # this is a special case for the "middle" rule of a dual region rule
        i = 0
        i = 1
        while i < len(toresume):
            if i > 0 and toresume[i].name == 'middle' and toresume[i-1].name == 'start':
            if toresume[i].name == 'middle' and toresume[i-1].name == 'start':
                del toresume[i-1]
            else:
                i += 1

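The loop above collapses a dual region's 'start' parent when a 'middle' token directly follows it, since resuming from 'middle' already implies the start. Traced on plain name strings:

    names = ['start', 'middle', 'null']
    i = 1
    while i < len(names):
        if names[i] == 'middle' and names[i-1] == 'start':
            del names[i-1]       # drop the start; keep resuming at middle
        else:
            i += 1
    # names => ['middle', 'null']
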
@@ -488,32 +313,31 @@ class Lexer:
            yield t
        raise StopIteration

    def __iter__(self):
        if self.lines is None:
            raise Exception, "no lines to lex"
        return self

    def next(self):
    def _lex(self):
        null_t = None
        if self.tokens:
            return self.tokens.pop(0)
        while self.y < len(self.lines):
            line = self.lines[self.y] + '\n'
            while self.x < len(line):
                curr_t = None
            line = self.get_line()
            while not done and self.x < len(line):
                m = None
                for rule in self.grammar.rules:
                    if rule.match(self, None):
                        assert self.tokens, "match rendered no tokens?"
                        return self.tokens.pop(0)
                    if null_t is None:
                        null_t = Token('null', None, self.y, self.x, '')
                        self.add_token(null_t)
                    null_t.add_to_string(line[self.x])
                    self.x += 1
                    null_t = None
                    m = rule.match(self, parent)
                    if m:
                        if null_t:
                            yield null_t
                            null_t = None
                        for t in rule.lex(self, parent, m):
                            yield t
                        break

                if not m:
                    if not null_t:
                        null_t = Token('null', None, self.y, self.x, '', parent)
                    if self.x < len(line):
                        null_t.add_to_string(line[self.x])
                    self.x += 1
            if null_t:
                yield null_t
                null_t = None
            self.y += 1
            self.x = 0
            if self.tokens:
                return self.tokens.pop(0)
            else:
                raise StopIteration
        raise StopIteration

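The larger change in this hunk is that Lexer stops being a hand-rolled iterator (__iter__/next popping a token buffer) and becomes a generator. A minimal contrast of the two shapes (hypothetical, Python 2 style to match the code):

    class OldStyle:
        def __init__(self, items):
            self.items = list(items)
        def __iter__(self):
            return self
        def next(self):
            # buffer-and-pop: state is explicit instance data
            if self.items:
                return self.items.pop(0)
            raise StopIteration

    def new_style(items):
        # generator: the loop's local state replaces the buffer
        for item in items:
            yield item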