diff --git a/IDEAS b/IDEAS
index 9aec508..ccf005f 100644
--- a/IDEAS
+++ b/IDEAS
@@ -1,10 +1,20 @@
-2007/07/11:
+2007/07/14:
 
-We need a way to have a rule that matches the end of the line. Many languages
-have regions whose "end" token is merely the end of the line. In those cases,
-our grammars employ hacks (with varying levels of success) to get around the
-fact that rules must match 1-or-more characters from the buffer.
+The rules are currently confusingly implemented, and have poor performance when
+used in deeply nested grammars.
 
-One solution would be to artificially include a newline character at the end of
-the line, which could be matched in regexes. Another would be to create a new
-type of rule and write some special-case code in the region rules.
+We need to refactor lex2 so that rules have two methods:
+
+1. match():
+This method should return whether or not the rule can match the current input
+that the lexer is lexing. If its result is true, the result will be passed
+(along with the lexer, etc.) to the rule's lex() method. Otherwise, the next
+rule will be tried.
+
+2. lex():
+This method is a generator, which is expected to return one or more tokens. In
+addition to the arguments given to match(), it will be passed the result of
+the call to match() (which is guaranteed to be true, and will most often be a
+re.Match object). As with all generators, this method will raise StopIteration
+when there are no more tokens to return, and will raise LexError if there are
+other problems.
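To make the proposal concrete, here is a minimal sketch of how a pattern rule
might implement the match()/lex() protocol. This is illustrative only: the
driver loop, the LexError class, and the rule body are assumptions
extrapolated from the note above, reusing the Rule.__init__ and make_token
signatures that appear in lex2.py below.

    class LexError(Exception):
        pass

    class PatternRule(Rule):
        def __init__(self, name, pattern):
            Rule.__init__(self, name)
            self.re = re.compile(pattern)
        def match(self, lexer, parent):
            # returns an re.Match object (true) when the rule applies at the
            # lexer's current position, or None so the next rule is tried
            return self.re.match(lexer.lines[lexer.y], lexer.x)
        def lex(self, lexer, parent, m):
            # generator: receives the guaranteed-true result of match() and
            # yields one or more tokens before raising StopIteration
            lexer.x = m.end()
            yield self.make_token(lexer, m.group(0), self.name, parent)

    # the lexer's inner loop would then look roughly like:
    #     for rule in grammar.rules:
    #         m = rule.match(lexer, parent)
    #         if m:
    #             for t in rule.lex(lexer, parent, m):
    #                 lexer.add_token(t)
    #             break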
diff --git a/code_examples/Reporting2.pm b/code_examples/Reporting2.pm
index 7cab633..ecaf425 100644
--- a/code_examples/Reporting2.pm
+++ b/code_examples/Reporting2.pm
@@ -1,5 +1,7 @@
 package TBB::Reporting2;
 
+my $bar =~ s/foob/blag/g;
+
 my $foo = {
     'foo',
     'bar',
diff --git a/highlight2.py b/highlight2.py
index 32776f2..d48bb1c 100644
--- a/highlight2.py
+++ b/highlight2.py
@@ -1,5 +1,5 @@
 import sys
-import lex2
+from lex2 import Token
 
 color_list = []
 color_list.extend(['\033[3%dm' % x for x in range(0, 8)])
@@ -274,10 +274,10 @@ class Highlighter:
             post_change_list.append(t2)
 
         # add in the new data
-        newtokens[y1].append(lex2.Token('new', '', y1, x1, newlines[0]))
+        newtokens[y1].append(Token('new', '', y1, x1, newlines[0]))
         for i in range(1, len(newlines)):
             yi = y1 + i
-            newtokens[yi].append(lex2.Token('new', '', yi, 0, newlines[i]))
+            newtokens[yi].append(Token('new', '', yi, 0, newlines[i]))
 
         # add the post-change tokens back
         for t in post_change_list:
diff --git a/lex2.py b/lex2.py
index a3cfd5f..bbc9a7c 100755
--- a/lex2.py
+++ b/lex2.py
@@ -1,6 +1,8 @@
 import re
+import util
 
 valid_name_re = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$')
+full_name_re = re.compile('^([a-zA-Z_]+)([0-9]*)$')
 reserved_names = ['start', 'middle', 'end', 'null']
 
 class Token(object):
@@ -59,7 +61,10 @@ class Token(object):
         return "" % fields
 
 class Rule:
-    name = 'abstract'
+    def __init__(self, name):
+        assert valid_name_re.match(name), 'invalid name %r' % name
+        assert name not in reserved_names, "reserved rule name: %r" % name
+        self.name = name
     def match(self, lexer, parent):
         raise Exception, "%s rule cannot match!" % self.name
     def make_token(self, lexer, s, name, parent=None, matchd={}):
@@ -74,9 +79,7 @@ class Rule:
 
 class ConstantRule(Rule):
     def __init__(self, name, constant, group=None):
-        assert valid_name_re.match(name), 'invalid name %r' % name
-        assert name not in reserved_names, "reserved rule name: %r" % name
-        self.name = name
+        Rule.__init__(self, name)
        self.constant = constant
         self.length = len(self.constant)
         self._set_group(group)
@@ -92,9 +95,7 @@
 
 class PatternRule(Rule):
     def __init__(self, name, pattern, group=None):
-        assert valid_name_re.match(name), 'invalid name %r' % name
-        assert name not in reserved_names, "reserved rule name: %r" % name
-        self.name = name
+        Rule.__init__(self, name)
         self.pattern = pattern
         self._compile()
         self._set_group(group)
@@ -120,9 +121,7 @@ class NocasePatternRule(PatternRule):
 
 class ContextPatternRule(PatternRule):
     def __init__(self, name, pattern, fallback, group=None):
-        assert valid_name_re.match(name), 'invalid name %r' % name
-        assert name not in reserved_names, "reserved rule name: %r" % name
-        self.name = name
+        Rule.__init__(self, name)
         self.pattern = pattern
         self.fallback = fallback
         self.fallback_re = re.compile(fallback)
@@ -142,9 +141,7 @@
 
 class RegionRule(Rule):
     def __init__(self, name, start, grammar, end, group=None):
-        assert valid_name_re.match(name), 'invalid name %r' % name
-        assert name not in reserved_names, "reserved rule name: %r" % name
-        self.name = name
+        Rule.__init__(self, name)
         self.start = start
         self.grammar = grammar
         self.end = end
@@ -157,7 +154,6 @@
         return re.compile(self.end % d)
 
     def resume(self, lexer, toresume):
-        #raise Exception, "%r %r" % (lexer, toresume) #XYZ
         assert toresume, "can't resume without tokens to resume!"
         self._match(lexer, None, None, toresume)
         return True
@@ -204,7 +200,6 @@
         # reference named groups from the start token. if we have no end,
         # well, then, we're never getting out of here alive!
         if self.end:
-            #end_re = re.compile(self.end % d)
             end_re = self._compile_end(d)
 
         # ok, so as long as we aren't done (we haven't found an end token),
@@ -213,17 +208,8 @@
         while not done and lexer.y < len(lexer.lines):
             old_y = lexer.y
 
-            # if this line is empty, then we skip it, but here we insert
-            # an empty null token just so we have something
-            #if not reenter and len(lexer.lines[lexer.y]) == 0:
-            #    null_t = Token('null', None, lexer.y, lexer.x, '', parent)
-            #    lexer.add_token(null_t)
-            #    null_t = None
-
             # ok, as long as we haven't found the end token, and have more
             # data on the current line to read, we will process tokens
-            #while (not done and lexer.y == old_y and
-            #       lexer.x < len(lexer.lines[lexer.y])):
             while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
                 # if we are reentering mid-parse, then that takes precedence
                 if reenter:
@@ -233,7 +219,6 @@
                     null_t = None
                     if lexer.y >= len(lexer.lines):
                         return True
-                    #elif lexer.x >= len(lexer.lines[lexer.y]):
                     elif lexer.x >= len(lexer.lines[lexer.y]) + 1:
                         lexer.y += 1
                         lexer.x = 0
@@ -266,9 +251,7 @@
                     if null_t is None:
                         null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                         lexer.add_token(null_t)
-                    #if len(lexer.lines[lexer.y]) > lexer.x:
                     if lexer.x < len(line):
-                        #null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                         null_t.add_to_string(line[lexer.x])
                     lexer.x += 1
@@ -294,15 +277,12 @@ class NocaseRegionRule(RegionRule):
 
 class DualRegionRule(Rule):
     def __init__(self, name, start, grammar1, middle, grammar2, end, group=None):
-        assert valid_name_re.match(name), 'invalid name %r' % name
-        assert name not in reserved_names, "reserved rule name: %r" % name
-        self.name = name
+        Rule.__init__(self, name)
         self.start = start
         self.grammar1 = grammar1
         self.middle = middle
         self.grammar2 = grammar2
         self.end = end
-        #self.start_re = re.compile(start)
         self.start_re = self._compile_start()
         self._set_group(group)
@@ -353,7 +333,6 @@
         d1 = parent.matchd
         assert parent.name == 'start'
         null_t = None
-        #middle_re = re.compile(self.middle % d1)
         middle_re = self._compile_middle(d1)
 
         d2 = {}
@@ -364,28 +343,15 @@
         while not done and lexer.y < len(lexer.lines):
             old_y = lexer.y
 
-            # if this line is empty, then we will skip it, but here we insert
-            # an empty null token just so we have something
-            #if len(lexer.lines[lexer.y]) == 0:
-            #    null_t = Token('null', None, lexer.y, lexer.x, '', parent)
-            #    lexer.add_token(null_t)
-            #    null_t = None
-
             # ok, as long as we haven't found the end token, and have more
             # data on the current line to read, we will process tokens
-            #while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]):
             while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]) + 1:
                 # if we are reentering mid-parse, then that takes precedence
                 if reenter:
-                    raise Exception, "aw damn1"
-                    #reenter = False
-                    #xrule = rulecontext[0].rule
-                    #xd = rulecontext[0].matchd
-                    #assert rule2.resume(lexer, xcontext, xd, rulecontext[1:]), \
-                    #       "%r %r %r %r" % (lexer, xcontext, xd, rulecontext[1:])
-                    #found = True
-                    #null_t = None
-                    #break
+                    reenter = False
+                    rule2 = toresume[1].rule
+                    rule2.resume(lexer, toresume[1:])
+                    null_t = None
 
                 line = self._get_line(lexer)
@@ -414,7 +380,6 @@
                     if null_t is None:
                         null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                         lexer.add_token(null_t)
-                    #null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                     null_t.add_to_string(line[lexer.x])
                     lexer.x += 1
@@ -436,10 +401,8 @@
         if reenter:
             assert parent is toresume[0]
         assert parent.name == 'middle'
-        #assert parent.name == 'middle'
         d3 = parent.matchd
         null_t = None
-        #end_re = re.compile(self.end % d3)
         end_re = self._compile_end(d3)
 
         # ok, so as long as we aren't done (we haven't found an end token),
@@ -451,22 +414,10 @@
                 # if we are reentering mid-parse, then that takes precedence
                 if reenter:
-                    raise Exception, "aw damn2"
-                    #reenter = False
-                    #xrule = rulecontext[0].rule
-                    #xd = rulecontext[0].matchd
-                    #assert rule2.resume(lexer, xcontext, xd, rulecontext[1:]), \
-                    #       "%r %r %r %r" % (lexer, xcontext, xd, rulecontext[1:])
-                    #found = True
-                    #null_t = None
-                    #break
-
-                # if this line is empty, then we will skip it, but here we insert
-                # an empty null token just so we have something
-                #if len(lexer.lines[lexer.y]) == 0:
-                #    null_t = Token('null', None, lexer.y, lexer.x, '', parent)
-                #    lexer.add_token(null_t)
-                #    null_t = None
+                    reenter = False
+                    rule2 = toresume[1].rule
+                    rule2.resume(lexer, toresume[1:])
+                    null_t = None
 
                 # ok, as long as we haven't found the end token, and have more
                 # data on the current line to read, we will process tokens
@@ -496,7 +447,6 @@
                     if null_t is None:
                         null_t = Token('null', None, lexer.y, lexer.x, '', parent)
                         lexer.add_token(null_t)
-                    #null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                     null_t.add_to_string(line[lexer.x])
                     lexer.x += 1
@@ -525,30 +475,7 @@ class Grammar:
                 rule.grammar = self
             if hasattr(rule, 'grammar2') and rule.grammar is None:
                 rule.grammar = self
-
-grammars = {}
-grammars['null'] = Grammar()
-crash = False
-
-def add(name, grammar):
-    global crash, grammars
-    if crash and name in grammars:
-        raise Exception, "oh no! already have a grammar for %r" %name
-    else:
-        grammars[name] = grammar
-
-
-def get(name):
-    global crash, grammars
-    try:
-        return grammars[name]
-    except KeyError:
-        if crash:
-            raise
-        elif name == 'null':
-            return Grammar()
-        else:
-            return get('null')
+grammar = Grammar()
 
 class Lexer:
     def __init__(self, name, grammar):
@@ -569,16 +496,22 @@
         self.tokens = []
 
     def resume(self, lines, y, x, token):
-        #raise Exception, "%r %r" % (self, token) #XYZ
         self.y = y
         self.x = x
         self.lines = lines
         self.tokens = []
         toresume = token.parents()
+
+        # this is a special case for the "middle" rule of a dual region rule
+        i = 0
+        while i < len(toresume):
+            if i > 0 and toresume[i].name == 'middle' and toresume[i-1].name == 'start':
+                del toresume[i-1]
+            else:
+                i += 1
+
         if toresume:
             toresume[0].rule.resume(self, toresume)
-        #else:
-        #    raise Exception, "dammmmit"
 
     def __iter__(self):
         if self.lines is None:
@@ -586,13 +519,10 @@
         return self
 
     def next(self):
-        null_t = None
-
+        null_t = None
         if self.tokens:
             return self.tokens.pop(0)
-
         while self.y < len(self.lines):
-            #line = self.lines[self.y]
             line = self.lines[self.y] + '\n'
             while self.x < len(line):
                 curr_t = None
@@ -603,14 +533,11 @@
                     if null_t is None:
                         null_t = Token('null', None, self.y, self.x, '')
                         self.add_token(null_t)
-                    #assert line[self.x] != '\n', "DAMN"
-                    #assert line[self.x] != '$', "DAMN"
                     null_t.add_to_string(line[self.x])
                     self.x += 1
             null_t = None
             self.y += 1
             self.x = 0
-
         if self.tokens:
             return self.tokens.pop(0)
         else:
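Two behavioral notes on the lex2.py changes above. First, Lexer.next() now
scans self.lines[self.y] + '\n' rather than the bare line, so each line ends
in a real, matchable newline character; this is the "artificially include a
newline" idea from the old IDEAS entry, and it lets grammars match
end-of-line without the former hacks. A hypothetical rule, not part of this
commit, could now be written as:

    # the '\n' sentinel is part of the scanned line, so a pattern can
    # consume up to and including the end of the line
    PatternRule(r'comment', r'#.*\n')

Second, Lexer.resume() now deletes a 'start' token that immediately precedes
a 'middle' token in the resume stack; the dual-region code asserts
parent.name == 'middle' when resuming its second region, so it must resume
from the 'middle' token directly rather than from its 'start'.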
diff --git a/mode2.py b/mode2.py
index f16eb9c..bfe636c 100644
--- a/mode2.py
+++ b/mode2.py
@@ -1,5 +1,6 @@
 import os, sets, string
-import color, lex2, method
+import color, method
+from lex2 import Lexer
 
 DEBUG = False
@@ -156,7 +157,7 @@ class Fundamental(Handler):
         # lexing for highlighting, etc.
         if self.grammar:
-            self.lexer = lex2.Lexer(self.name(), self.grammar)
+            self.lexer = Lexer(self.name(), self.grammar)
 
         # tab handling
         if self.tabbercls:
diff --git a/mode_bds.py b/mode_bds.py
index bce8242..5f40a2a 100644
--- a/mode_bds.py
+++ b/mode_bds.py
@@ -1,4 +1,4 @@
-import color, lex2, mode2
+import color, mode2
 from lex2 import Grammar, PatternRule, RegionRule, Grammar
 from mode_perl import PerlGrammar
 from mode_xml import OpenTagGrammar
diff --git a/mode_console.py b/mode_console.py
index 3bb2339..49b2a4f 100644
--- a/mode_console.py
+++ b/mode_console.py
@@ -1,12 +1,17 @@
 import color, mode2
-from lex2 import Grammar, PatternRule
+from lex2 import Grammar, PatternRule, RegionRule
+from mode_python import StringGrammar
 
 class ConsoleGrammar(Grammar):
     rules = [
-        PatternRule(name=r'mesg', pattern=r'^[A-Za-z].*$'),
-        PatternRule(name=r'input', pattern=r'^>>>.*$'),
-        PatternRule(name=r'input', pattern=r'^-->.*$'),
-        PatternRule(name=r'output', pattern=r'^ .*$'),
+        PatternRule(r'mesg', r'^[A-Za-z].*$'),
+        PatternRule(r'input', r'^>>>.*$'),
+        PatternRule(r'input', r'^-->.*$'),
+        #PatternRule(r'output', r'^ .*$'),
+
+        RegionRule(r'string', r'"', StringGrammar, r'"'),
+        RegionRule(r'string', r"'", StringGrammar, r"'"),
+        PatternRule(r'bareword', r'[a-zA-Z_][a-zA-Z0-9_]*'),
     ]
 class Console(mode2.Fundamental):
     grammar = ConsoleGrammar()
@@ -16,6 +21,12 @@ class Console(mode2.Fundamental):
         'mesg':   color.build('blue', 'default'),
         'input':  color.build('cyan', 'default'),
         'output': color.build('default', 'default'),
+
+        'string.start':   color.build('green', 'default'),
+        'string.octal':   color.build('magenta', 'default'),
+        'string.escaped': color.build('magenta', 'default'),
+        'string.null':    color.build('green', 'default'),
+        'string.end':     color.build('green', 'default'),
     }
     def name(self):
         return "Console"
diff --git a/mode_life.py b/mode_life.py
index 43b8152..e869663 100644
--- a/mode_life.py
+++ b/mode_life.py
@@ -1,5 +1,5 @@
 import re, sets, string, sys
-import color, commands, default, lex2, method, mode2, regex, tab2
+import color, commands, default, method, mode2, regex, tab2
 from point2 import Point
 from lex2 import Grammar, ConstantRule, PatternRule, ContextPatternRule, \
      RegionRule, DualRegionRule
diff --git a/mode_perl.py b/mode_perl.py
index f652cb8..f6a8e3c 100644
--- a/mode_perl.py
+++ b/mode_perl.py
@@ -1,5 +1,5 @@
 import re, sets, string, sys
-import color, commands, default, lex2, method, mode2, regex, tab2
+import color, commands, default, method, mode2, regex, tab2
 from point2 import Point
 from lex2 import Grammar, PatternRule, ContextPatternRule, RegionRule, DualRegionRule
 from method import Argument, Method
@@ -11,11 +11,6 @@ class PodGrammar(Grammar):
         RegionRule(r'entry', r'(?<=^=item) +.*$', Grammar, '^\n$'),
         RegionRule(r'entry', r'(?:(?<=^=begin)|(?<=^=end)) +.*$', Grammar, '^\n$'),
         RegionRule(r'entry', r'(?<=^=encoding) +.*$', Grammar, '^\n$'),
-        #PatternRule(r'entry', r'(?<=^=head[1-4]) +.*$'),
-        #PatternRule(r'entry', r'(?<=^=over) +.*$'),
-        #PatternRule(r'entry', r'(?<=^=item) +.*$'),
-        #PatternRule(r'entry', r'(?:(?<=^=begin)|(?<=^=end)) +.*$'),
-        #PatternRule(r'entry', r'(?<=^=encoding) +.*$'),
     ]
 
 class StringGrammar(Grammar):
@@ -168,12 +163,12 @@ class PerlTabber(tab2.StackTabber):
         return currlvl
 
 class Perl(mode2.Fundamental):
-    tabbercls = PerlTabber
-    grammar = PerlGrammar
-    opentokens = ('delimiter',)
-    opentags = {'(': ')', '[': ']', '{': '}'}
-    closetoken = ('delimiter',)
-    closetags = {')': '(', ']': '[', '}': '{'}
+    tabbercls   = PerlTabber
+    grammar     = PerlGrammar
+    opentokens  = ('delimiter',)
+    opentags    = {'(': ')', '[': ']', '{': '}'}
+    closetokens = ('delimiter',)
+    closetags   = {')': '(', ']': '[', '}': '{'}
 
     def __init__(self, w):
         mode2.Fundamental.__init__(self, w)
@@ -183,7 +178,7 @@ class Perl(mode2.Fundamental):
         #self.add_action_and_bindings(PerlHashCleanup2(), ('C-c h',))
         self.add_action_and_bindings(PerlViewModulePerldoc(), ('C-c v',))
         self.add_action_and_bindings(PerlViewWordPerldoc(), ('C-c p',))
-        #self.add_action_and_bindings(PerlWrapLine(), ('M-q',))
+        self.add_action_and_bindings(PerlWrapLine(), ('M-q',))
         self.add_action_and_bindings(PerlGotoFunction(), ('C-c M-g',))
         self.add_action_and_bindings(PerlWhichFunction(), ('C-c w',))
         self.add_action_and_bindings(PerlListFunctions(), ('C-c W',))
@@ -497,6 +492,46 @@ class PerlHashCleanup(Method):
         window.kill(start_p, end_p)
         window.insert_string(start_p, data)
 
+class PerlWrapLine(Method):
+    '''Wrap Comments and POD'''
+    margin = 80
+    comment_re = re.compile('(#+)( *)(.*)')
+    def _is_newline(self, t):
+        return t.name == 'eol'
+    def _is_space(self, t):
+        return t.name == 'null' and regex.space.match(t.string)
+
+    def _detect_line_type(self, w, y):
+        c = w.logical_cursor()
+        highlighter = w.buffer.highlights[w.mode.name()]
+        ltype = None
+        for t in highlighter.tokens[c.y]:
+            if self._is_space(t):
+                pass
+            elif t.name == 'comment':
+                if ltype:
+                    return None
+                else:
+                    ltype = 'comment'
+            elif t.name == 'eol':
+                return ltype
+            else:
+                return None
+
+    def _execute(self, w, **vargs):
+        c = w.logical_cursor()
+        ltype = self._detect_line_type(w, c.y)
+        if ltype == 'comment':
+            return self._fix_comments(c, w)
+        elif ltype == 'pod':
+            return self._fix_pod(c, w)
+        else:
+            w.set_error("did not detect comment or pod lines")
+            return
+    def _fix_comments(self, c, w):
+        w.set_error("comment!")
+    def _fix_pod(self, c, w):
+        pass
+
 #class PerlWrapLine(Method):
 #    '''Wrap lines, comments, POD'''
 #    margin = 80
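A note on PerlWrapLine above, which M-q now invokes: _detect_line_type()
returns 'comment' only when every token on the cursor's line is whitespace
('null') or 'comment', ending at an 'eol' token (the y argument is accepted
but the method re-derives the cursor's line itself). As written it can never
return 'pod', so the _fix_pod() branch of _execute() is unreachable, and
_fix_pod() is still a stub in any case. A hypothetical extension of the loop,
assuming POD text lexes as tokens whose names begin with 'pod' (that token
naming is an assumption, not confirmed by this patch):

    elif t.name.startswith('pod'):
        if ltype:
            return None
        else:
            ltype = 'pod'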
diff --git a/mode_python.py b/mode_python.py
index 3446ef1..3632614 100644
--- a/mode_python.py
+++ b/mode_python.py
@@ -1,5 +1,5 @@
 import commands, os.path, sets, string
-import color, completer, default, mode2, lex2, method, regex, tab2
+import color, completer, default, mode2, method, regex, tab2
 import ctag_python
 from point2 import Point
 from lex2 import Grammar, PatternRule, RegionRule, ConstantRule
diff --git a/mode_search.py b/mode_search.py
index 3ebc075..1646c8c 100644
--- a/mode_search.py
+++ b/mode_search.py
@@ -136,4 +136,3 @@ def _end(w):
     w.application.last_search = w.buffer.make_string()
     w.buffer.method.old_cursor = None
     w.buffer.method.old_window = None
-    w.buffer.method.is_literal = None
diff --git a/mode_xml.py b/mode_xml.py
index 8d0db82..c623e06 100644
--- a/mode_xml.py
+++ b/mode_xml.py
@@ -1,4 +1,4 @@
-import color, lex2, mode2
+import color, mode2
 from lex2 import Grammar, PatternRule, RegionRule
 
 class OpenTagGrammar(Grammar):
diff --git a/util.py b/util.py
index 4630f5f..e268faf 100644
--- a/util.py
+++ b/util.py
@@ -50,3 +50,9 @@ def count_leading_whitespace(s):
     m = regex.leading_whitespace.match(s)
     assert m, "count leading whitespace failed somehow"
     return m.end() - m.start()
+
+def dump(x):
+    d = {}
+    for name in dir(x):
+        d[name] = getattr(x, name)
+    return '%s: %r' % (x, d)
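The new util.dump() helper, together with the matching "import util" added
to lex2.py, snapshots every attribute of an object into a dict for ad-hoc
debugging. Hypothetical usage from inside a rule (the printed output shown
is illustrative, not actual program output):

    import util
    print util.dump(token)
    # prints something like: <Token ...>: {'name': 'null', 'string': '', ...}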