experimental lexing stuff!

--HG-- branch : pmacs2
2007-07-11 16:20:33 +00:00 · 2007-07-11 16:20:33 +00:00 · eb37a919b7
parent 685a318e5e
commit eb37a919b7
3 changed files with 57 additions and 120 deletions
--- a/lex2.py
+++ b/lex2.py
@ -63,6 +63,8 @@ class Rule:
        raise Exception, "%s rule cannot match!" % self.name
    def make_token(self, lexer, s, name, parent=None, matchd=None):
        """Build a Token for string `s` at the lexer's current position.

        The token is created with this rule as its rule, `lexer.y` / `lexer.x`
        as its coordinates, and the given `name`, `parent` and `matchd`.
        When `matchd` is omitted, a new empty dict is used.
        """
        # Use a fresh dict per call: a literal `{}` default is evaluated once
        # at definition time and would be shared by every token created
        # without an explicit matchd.
        if matchd is None:
            matchd = {}
        return Token(name, self, lexer.y, lexer.x, s, parent, matchd)
    def _get_line(self, lexer):
        return lexer.lines[lexer.y] + '\n'
    def _set_group(self, group):
        if group is None:
            self.group = self.name
@ -78,7 +80,8 @@ class ConstantRule(Rule):
        self.length   = len(self.constant)
        self._set_group(group)
    def match(self, lexer, parent):
-        if lexer.lines[lexer.y][lexer.x:].startswith(self.constant):
+        line = self._get_line(lexer)
        if line[lexer.x:].startswith(self.constant):
            token = self.make_token(lexer, self.constant, self.name, parent)
            lexer.add_token(token)
            lexer.x += self.length
@ -102,7 +105,8 @@ class PatternRule(Rule):
        lexer.add_token(token)
        lexer.x += len(s)
    def match(self, lexer, parent):
-        m = self.re.match(lexer.lines[lexer.y], lexer.x)
+        line = self._get_line(lexer)
        m = self.re.match(line, lexer.x)
        if m:
            self._match(lexer, parent, m)
            return True
@ -127,7 +131,8 @@ class ContextPatternRule(PatternRule):
            r = re.compile(self.pattern % parent.matchd)
        except KeyError:
            r = self.fallback_re
-        m = r.match(lexer.lines[lexer.y], lexer.x)
+        line = self._get_line(lexer)
        m = r.match(line, lexer.x)
        if m:
            self._match(lexer, parent, m)
            return True
@ -157,7 +162,8 @@ class RegionRule(Rule):
        return True
    def match(self, lexer, parent):
-        m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
+        line = self._get_line(lexer)
        m = self.start_re.match(line, lexer.x)
        if m:
            self._match(lexer, parent, m, [])
            return True
@ -208,15 +214,16 @@ class RegionRule(Rule):
            # if this line is empty, then we skip it, but here we insert
            # an empty null token just so we have something
-            if not reenter and len(lexer.lines[lexer.y]) == 0:
+            #if not reenter and len(lexer.lines[lexer.y]) == 0:
-                null_t = Token('null', None, lexer.y, lexer.x, '', parent)
+            #    null_t = Token('null', None, lexer.y, lexer.x, '', parent)
-                lexer.add_token(null_t)
+            #    lexer.add_token(null_t)
-                null_t = None
+            #    null_t = None
            # ok, as long as we haven't found the end token, and have more
            # data on the current line to read, we will process tokens
-            while (not done and lexer.y == old_y and
+            #while (not done and lexer.y == old_y and
-                   lexer.x < len(lexer.lines[lexer.y])):
+            #       lexer.x < len(lexer.lines[lexer.y])):
            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
                # if we are reentering mid-parse, then that takes precedence
                if reenter:
                    reenter = False
@ -225,14 +232,17 @@ class RegionRule(Rule):
                    null_t = None
                if lexer.y >= len(lexer.lines):
                    return True
-                elif lexer.x >= len(lexer.lines[lexer.y]):
+                #elif lexer.x >= len(lexer.lines[lexer.y]):
                elif lexer.x >= len(lexer.lines[lexer.y]) + 1:
                    lexer.y += 1
                    lexer.x = 0
                line = self._get_line(lexer)
                # if we are looking for an end token, then see if we've
                # found it. if so, then we are done!
                if self.end:
-                    m = end_re.match(lexer.lines[lexer.y], lexer.x)
+                    m = end_re.match(line, lexer.x)
                    if m:
                        self._add_from_regex('end', lexer, parent, m, {})
                        done = True
@ -247,6 +257,7 @@ class RegionRule(Rule):
                        null_t = None
                        break
                # if we never found a token, then we need to add another
                # character to the current null token (which we should
                # create if it isn't set).
@ -254,8 +265,10 @@ class RegionRule(Rule):
                    if null_t is None:
                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                        lexer.add_token(null_t)
-                    if len(lexer.lines[lexer.y]) > lexer.x:
+                    #if len(lexer.lines[lexer.y]) > lexer.x:
-                        null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
+                    if lexer.x < len(line):
                        #null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                        null_t.add_to_string(line[lexer.x])
                        lexer.x += 1
            # ok, since we're soon going to be on a different line (or
@ -311,7 +324,8 @@ class DualRegionRule(Rule):
        return True
    def match(self, lexer, parent):
        # see if we can match our start token
-        m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
+        line = self._get_line(lexer)
        m = self.start_re.match(line, lexer.x)
        if m:
            t1 = self._add_from_regex('start', lexer, parent, m, m.groupdict())
            t2 = self._match_first(lexer, t1, [])
@ -341,14 +355,15 @@ class DualRegionRule(Rule):
            # if this line is empty, then we will skip it, but here we insert
            # an empty null token just so we have something
-            if len(lexer.lines[lexer.y]) == 0:
+            #if len(lexer.lines[lexer.y]) == 0:
-                null_t = Token('null', None, lexer.y, lexer.x, '', parent)
+            #    null_t = Token('null', None, lexer.y, lexer.x, '', parent)
-                lexer.add_token(null_t)
+            #    lexer.add_token(null_t)
-                null_t = None
+            #    null_t = None
            # ok, as long as we haven't found the end token, and have more
            # data on the current line to read, we will process tokens
-            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]):
+            #while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]):
            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
                # if we are reentering mid-parse, then that takes precedence
                if reenter:
                    raise Exception, "aw damn1"
@ -361,9 +376,11 @@ class DualRegionRule(Rule):
                    #null_t = None
                    #break
                line = self._get_line(lexer)
                # see if we have found the middle token. if so, we can then
                # proceed to "stage 2"
-                m2 = middle_re.match(lexer.lines[lexer.y], lexer.x)
+                m2 = middle_re.match(line, lexer.x)
                if m2:
                    d2 = dict(d1.items() + m2.groupdict().items())
                    t2 = self._add_from_regex('middle', lexer, parent, m2, d2)
@ -386,7 +403,8 @@ class DualRegionRule(Rule):
                    if null_t is None:
                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                        lexer.add_token(null_t)
-                    null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
+                    #null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                    null_t.add_to_string(line[lexer.x])
                    lexer.x += 1
            # ok, since we're soon going to be on a different line (or
@ -433,17 +451,18 @@ class DualRegionRule(Rule):
            # if this line is empty, then we will skip it, but here we insert
            # an empty null token just so we have something
-            if len(lexer.lines[lexer.y]) == 0:
+            #if len(lexer.lines[lexer.y]) == 0:
-                null_t = Token('null', None, lexer.y, lexer.x, '', parent)
+            #    null_t = Token('null', None, lexer.y, lexer.x, '', parent)
-                lexer.add_token(null_t)
+            #    lexer.add_token(null_t)
-                null_t = None
+            #    null_t = None
            # ok, as long as we haven't found the end token, and have more
            # data on the current line to read, we will process tokens
-            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]):
+            while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
                # see if we have found the end token. if so, then we are
                # done!
-                m3 = end_re.match(lexer.lines[lexer.y], lexer.x)
+                line = self._get_line(lexer)
                m3 = end_re.match(line, lexer.x)
                if m3:
                    t3 = self._add_from_regex('end', lexer, parent, m3, {})
                    done = True
@ -465,7 +484,8 @@ class DualRegionRule(Rule):
                    if null_t is None:
                        null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                        lexer.add_token(null_t)
-                    null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
+                    #null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                    null_t.add_to_string(line[lexer.x])
                    lexer.x += 1
            # ok, since we're soon going to be on a different line (or
@ -556,7 +576,8 @@ class Lexer:
            return self.tokens.pop(0)
        while self.y < len(self.lines):
-            line = self.lines[self.y]
+            #line = self.lines[self.y] 
            line = self.lines[self.y] + '\n'
            while self.x < len(line):
                curr_t = None
                for rule in self.grammar.rules:
@ -566,6 +587,8 @@ class Lexer:
                if null_t is None:
                    null_t = Token('null', None, self.y, self.x, '')
                    self.add_token(null_t)
                #assert line[self.x] != '\n', "DAMN"
                #assert line[self.x] != '$', "DAMN"
                null_t.add_to_string(line[self.x])
                self.x += 1
            null_t = None
--- a/mode_python.py
+++ b/mode_python.py
@ -2,7 +2,7 @@ import commands, os.path, sets, string
 import color, completer, default, mode2, lex2, method, regex, tab2
 import ctag_python
 from point2 import Point
-from lex2 import Grammar, PatternRule, RegionRule
+from lex2 import Grammar, PatternRule, RegionRule, ConstantRule
 class StringGrammar(Grammar):
    rules = [
@ -33,7 +33,8 @@ class PythonGrammar(Grammar):
        RegionRule(r'string', r'"', StringGrammar, r'"'),
        RegionRule(r'string', r"'", StringGrammar, r"'"),
        PatternRule(r'comment', r'#.*$'),
-        PatternRule(r'continuation', r'\\$'),
+        PatternRule(r'continuation', r'\\\n$'),
        PatternRule(r'eol', r'\n$'),
    ]
 class PythonTabber(tab2.StackTabber):
@ -125,7 +126,7 @@ class PythonTabber(tab2.StackTabber):
            # since we're done with the string, resume our indentation level
            self._opt_pop('string')
        elif fqname == 'delimiter':
-            # we only reall care about a colon as part of a one-line statement,
+            # we only really care about a colon as part of a one-line statement,
            # i.e.   "while ok: foo()" or "if True: print 3"
            if token.string == ':':
                if self.markers and self.markers[-1].name in ('[', '{'):
@ -183,7 +184,6 @@ class Python(mode2.Fundamental):
        # highlighting
        self.colors = {
            'keyword':           color.build('cyan', 'default'),
            #'reserved':          color.build('cyan', 'default'),
            'reserved':          color.build('magenta', 'default'),
            'builtin':           color.build('cyan', 'default'),
            'functionname':      color.build('blue', 'default'),
--- a/tab_c.py
+++ b/tab_c.py
@ -1,86 +0,0 @@
 import tab, point
 class CTabber(tab.TokenStackTabber):
    close_tags = {')': '(',
                  ']': '[',
                  '}': '{'}
    def stack_append_const(self, c):
        """Push the entry (c, depth) onto the stack, where depth is 4 more
        than the second field of the current top entry."""
        top_depth = self.tab_stack[-1][1]
        entry = (c, top_depth + 4)
        self.stack_append(entry)
    def stack_append_unique_const(self, c):
        """Like stack_append_const, but do nothing when the top stack entry's
        first field already equals c."""
        if self.tab_stack[-1][0] == c:
            # c is already on top of the stack; don't stack it twice
            return
        deeper = self.tab_stack[-1][1] + 4
        self.stack_append((c, deeper))
    def stack_pop_const(self, *c_args):
        """Pop the top stack entry once, but only if its first field is one
        of the values given in c_args."""
        top_name = self.tab_stack[-1][0]
        if top_name not in c_args:
            return
        self.stack_pop()
    def stack_pop_all_const(self, *c_args):
        """Keep popping the stack for as long as the first field of the top
        entry is one of the values given in c_args."""
        while True:
            if self.tab_stack[-1][0] not in c_args:
                break
            self.stack_pop()
    def handle_token(self, prev_token, token, next_token, y=None):
        """Adjust the tab stack and the current line's depth for one token.

        The branches below dispatch on token.name ("c comment", "macro",
        "operator", "label", "keyword", "delimiter") and, for keywords and
        delimiters, on token.string.  Stack entries are (marker, depth)
        tuples; the markers pushed here are "c comment", "block", "cont" and
        the opening delimiters themselves.

        prev_token and next_token are only compared against None in this
        method; presumably None means there is no neighbouring token on the
        current line -- confirm against the caller.  y is accepted but not
        used here.
        """
        buffer = self.mode.window.buffer
        name = token.name
        s = token.string
        if name == "c comment":
            # entering a comment keeps the current depth; a comment that was
            # already open bumps this line's depth by one instead
            if self.tab_stack[-1][0] != "c comment":
                self.stack_append(("c comment", self.tab_stack[-1][1]))
            else:
                self.line_depth += 1
            # offset of the end of the current line
            p = point.Point(len(buffer.lines[self.y]), self.y)
            offset = buffer.get_point_offset(p)
            # the comment is closed if it ends on this line or is followed by
            # another token
            if token.end <= offset or next_token is not None:
                self.stack_pop()
        elif name == "macro":
            self.line_depth -= 4
        elif name == "operator" and next_token is None:
            # an operator with no following token: mark a continuation
            # (at most one "cont" on top of the stack)
            self.stack_append_unique_const("cont")
        elif name == "label":
            self.line_depth -= 4
            #self.line_depth = 0
        elif name == "keyword":
            if s in ("do", "else", "for", "if", "while"):
                self.stack_append_const("block")
            elif s == "case":
                if prev_token is None:
                    self.line_depth -= 4
        elif name == "delimiter":
            if s in ("{", "(", "["):
                if s == "{":
                    # a brace that opens a pending "block" replaces it, and
                    # is not indented further when nothing precedes it
                    if prev_token is None and self.tab_stack[-1][0] == "block":
                        self.line_depth -= 4
                    self.stack_pop_const("block")
                    #self.stack_pop_const("block", "cont")
                else:
                    self.stack_pop_const("cont")
                if next_token is None:
                    self.stack_append((s, self.tab_stack[-1][1] + 4))
                else:
                    # align with the column of the token that follows the
                    # opening delimiter
                    p = buffer.get_offset_point(next_token.start)
                    self.stack_append((s, p.x))
            elif s in ("}", ")", "]"):
                if s == "}":
                    self.stack_pop_all_const("block", "cont")
                else:
                    self.stack_pop_all_const("cont")
                if self.tab_stack[-1][0] == self.close_tags[s]:
                    self.stack_pop()
                    if prev_token is None:
                        self.line_depth = self.tab_stack[-1][1]
                elif self.errors is False:
                    # report only the first mismatch
                    err = "tag mismatch, line %d: expected %r, got %r" % \
                          (self.y, self.tab_stack[-1][0], s)
                    self.mode.window.application.set_error(err)
                    self.errors = True
                if s == "}":
                    self.stack_pop_all_const("block", "cont")
            elif (s == "=" or s == "?") and next_token is None:
                # an assignment or ternary with no following token: mark a
                # continuation
                self.stack_append_unique_const("cont")
            elif s == ',':
                self.stack_pop_all_const("cont")
            elif s == ';':
                self.stack_pop_all_const("block", "cont")