From 90aab85f258c98fb6cdb23136c39ff53c695292d Mon Sep 17 00:00:00 2001 From: moculus Date: Fri, 8 Jun 2007 02:36:02 +0000 Subject: [PATCH] --HG-- branch : pmacs2 --- highlight2.py | 6 +- lex2.py | 669 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 459 insertions(+), 216 deletions(-) diff --git a/highlight2.py b/highlight2.py index cd088f8..37d5181 100644 --- a/highlight2.py +++ b/highlight2.py @@ -61,7 +61,9 @@ class Highlighter: # ====================== def relex(self, lines, y1, x1, y2, x2): # start the relexing process - self.lexer.lex(lines, y1, 0) + #self.lexer.lex(lines, y1, 0) + rulecontexts = self.line_contexts[y1] + self.lexer.resume(lines, y1, 0, rulecontexts) # these keep track of the current y coordinate, the current token index # on line[y], and the current "new token", respectively. @@ -108,7 +110,7 @@ class Highlighter: self.tokens[y].insert(i, new_token) i += 1 getnext = True - elif old_token == new_token: + elif '.' not in old_token.name and old_token == new_token: # if they match, then leave the old one alone i += 1 getnext = True diff --git a/lex2.py b/lex2.py index 72f1b9c..2ad5f1b 100755 --- a/lex2.py +++ b/lex2.py @@ -14,6 +14,7 @@ class RuleContext: self.y = y self.x = x self.rule = rule + self.flag = flag self.context = context self.matchd = matchd @@ -121,90 +122,115 @@ class RegionRule(Rule): t = self.make_token(lexer, m.group(0), t_name, grammar=grammar) lexer.add_token(t) lexer.x += len(m.group(0)) - def restart(self, lexer, rulecontext): - pass + + def resume(self, lexer, context, flag, d, rulecontexts): + assert rulecontexts, "can't resume without rulecontexts!" + self._match(lexer, context, d, None, rulecontexts) + return True + def match(self, lexer, context=[], d={}): - m = self.start_re.match(lexer.lines[lexer.y], lexer.x) # see if we can match our start token + m = self.start_re.match(lexer.lines[lexer.y], lexer.x) if m: - - # ok, so create our start token, and get ready to start reading data - d = m.groupdict() - lexer.context.append(RuleContext(lexer.y, lexer.x, self, 'start', - list(context), dict(d))) - self._add_from_regex(context, 'start', lexer, m, lexer.grammar) - null_t_name = '.'.join(context + [self.name, 'null']) - null_t = None - - # if we have an end token, then build it here. notice that it can - # reference named groups from the start token. if we have no end, - # well, then, we're never getting out of here alive! - if self.end: - end_re = re.compile(self.end % d) - - # ok, so as long as we aren't done (we haven't found an end token), - # keep reading input - done = False - while not done and lexer.y < len(lexer.lines): - old_y = lexer.y - # if this line is empty, then we skip it, but here we insert - # an empty null token just so we have something - if len(lexer.lines[lexer.y]) == 0: - null_t = Token(null_t_name, None, lexer.y, lexer.x, '') - lexer.add_token(null_t) - null_t = None - - # ok, as long as we haven't found the end token, and have more - # data on the current line to read, we will process tokens - while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]): - # if we are looking for an end token, then see if we've - # found it. if so, then we are done! - if self.end: - m = end_re.match(lexer.lines[lexer.y], lexer.x) - if m: - self._add_from_regex(context, 'end', lexer, m, None) - done = True - break - - # ok, we need to check all our rules now, in order. 
if we - # find a token, note that we found one and exit the loop - found = False - for rule in self.grammar.rules: - if rule.match(lexer, context + [self.name], d): - found = True - null_t = None - break - - # if we never found a token, then we need to add another - # character to the current null token (which we should - # create if it isn't set). - if not found: - if null_t is None: - null_t = Token(null_t_name, None, lexer.y, lexer.x, '') - lexer.add_token(null_t) - null_t.add_to_string(lexer.lines[lexer.y][lexer.x]) - lexer.x += 1 - - # ok, since we're soon going to be on a different line (or - # already are), we want a new null token. so forget about the - # current one (i.e. stop adding to it). - null_t = None - - # if we're still on the same line at this point (and not done) - # then that means we're finished with the line and should move - # on to the next one here - if not done and old_y == lexer.y: - lexer.save_context() - lexer.y += 1 - lexer.x = 0 - - # alright, we're finally done procesing the region, so return true - lexer.context.pop(-1) - return True + # region was match, so let's do this + return self._match(lexer, context, m.groupdict(), m, []) else: # region was not matched; we never started. so return false return False + def _match(self, lexer, context, d, m, rulecontext=[]): + # if we have been given rulecontext, then we are going to "resume" a + # parse that can already be assumed to have started + reenter = len(rulecontext) > 0 + assert m or reenter + + # first let's do some bookkeeping + lexer.context.append(RuleContext(lexer.y, lexer.x, self, 'start', + list(context), dict(d))) + if m is not None: + self._add_from_regex(context, 'start', lexer, m, lexer.grammar) + null_t_name = '.'.join(context + [self.name, 'null']) + null_t = None + + # if we have an end token, then build it here. notice that it can + # reference named groups from the start token. if we have no end, + # well, then, we're never getting out of here alive! + if self.end: + end_re = re.compile(self.end % d) + + # ok, so as long as we aren't done (we haven't found an end token), + # keep reading input + done = False + while not done and lexer.y < len(lexer.lines): + old_y = lexer.y + + # if this line is empty, then we skip it, but here we insert + # an empty null token just so we have something + if not reenter and len(lexer.lines[lexer.y]) == 0: + null_t = Token(null_t_name, None, lexer.y, lexer.x, '') + lexer.add_token(null_t) + null_t = None + + # ok, as long as we haven't found the end token, and have more + # data on the current line to read, we will process tokens + while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]): + # if we are reentering mid-parse, then that takes precedence + if reenter: + reenter = False + rule2 = rulecontext[0].rule + context2 = rulecontext[0].context + d2 = rulecontext[0].matchd + assert rule2.resume(lexer, context2, d2, rulecontext[1:]), \ + "%r %r %r %r" % (lexer, context2, d2, rulecontext[1:]) + found = True + null_t = None + break + + # if we are looking for an end token, then see if we've + # found it. if so, then we are done! + if self.end: + m = end_re.match(lexer.lines[lexer.y], lexer.x) + if m: + self._add_from_regex(context, 'end', lexer, m, None) + done = True + break + + # ok, we need to check all our rules now, in order. 
if we + # find a token, note that we found one and exit the loop + found = False + for rule in self.grammar.rules: + if rule.match(lexer, context + [self.name], d): + found = True + null_t = None + break + + # if we never found a token, then we need to add another + # character to the current null token (which we should + # create if it isn't set). + if not found: + if null_t is None: + null_t = Token(null_t_name, None, lexer.y, lexer.x, '') + lexer.add_token(null_t) + null_t.add_to_string(lexer.lines[lexer.y][lexer.x]) + lexer.x += 1 + + # ok, since we're soon going to be on a different line (or + # already are), we want a new null token. so forget about the + # current one (i.e. stop adding to it). + null_t = None + + # if we're still on the same line at this point (and not done) + # then that means we're finished with the line and should move + # on to the next one here + if not done and old_y == lexer.y: + lexer.save_context() + lexer.y += 1 + lexer.x = 0 + + # alright, we're finally done procesing the region, so return true + lexer.context.pop(-1) + return True + class DualRegionRule(Rule): def __init__(self, name, start, grammar1, middle, grammar2, end): assert valid_name_re.match(name), 'invalid name %r' % name @@ -221,148 +247,349 @@ class DualRegionRule(Rule): t = self.make_token(lexer, m.group(0), t_name, grammar=grammar) lexer.add_token(t) lexer.x += len(m.group(0)) - def match(self, lexer, context=[], d={}): - m1 = self.start_re.match(lexer.lines[lexer.y], lexer.x) - # see if we can match out start token - if m1: - # ok, so create our start token, and get ready to start reading data - self._add_from_regex(context, 'start', lexer, m1, lexer.grammar) - null_t_name = '.'.join(context + [self.name, 'null']) - null_t = None - d1 = m1.groupdict() - lexer.context.append(RuleContext(lexer.y, lexer.x, self, 'start', - list(context), dict(d1))) - d2 = {} - - middle_re = re.compile(self.middle % d1) - - # ok, so as long as we aren't done (we haven't found an end token), - # keep reading input - done = False - while not done and lexer.y < len(lexer.lines): - old_y = lexer.y - # if this line is empty, then we will skip it, but here weinsert - # an empty null token just so we have something - if len(lexer.lines[lexer.y]) == 0: - null_t = Token(null_t_name, None, lexer.y, lexer.x, '') - lexer.add_token(null_t) - null_t = None - - # ok, as long as we haven't found the end token, and have more - # data on the current line to read, we will process tokens - while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]): - # see if we have found the middle token. if so, we can then - # proceed to "stage 2" - m2 = middle_re.match(lexer.lines[lexer.y], lexer.x) - if m2: - d2 = m2.groupdict() - self._add_from_regex(context, 'middle', lexer, m2, None) - done = True - break - - # ok, we need to check all our rules now, in order. if we - # find a token, note that we found one and exit the loop - found = False - for rule in self.grammar1.rules: - if rule.match(lexer, context + [self.name], d1): - found = True - null_t = None - break - - # if we never found a token, then we need to add another - # character to the current null token (which we should - # create if it isn't set). - if not found: - if null_t is None: - null_t = Token(null_t_name, None, lexer.y, lexer.x, '') - lexer.add_token(null_t) - null_t.add_to_string(lexer.lines[lexer.y][lexer.x]) - lexer.x += 1 - - # ok, since we're soon going to be on a different line (or - # already are), we want a new null token. 
so forget about the - # current one. - null_t = None - - # if we're still on the same line at this point (and not done) - # then that means we're finished with the line and should move - # on to the next one here - if not done and old_y == lexer.y: - lexer.save_context() - lexer.y += 1 - lexer.x = 0 - - # ok stage 2 is like stage 1, only we are looking for end tokens - # instead of middle tokens - d3 = dict(d1.items() + d2.items()) - end_re = re.compile(self.end % d3) - lexer.context.pop(-1) - lexer.context.append(RuleContext(lexer.y, lexer.x, self, 'middle', - list(context), dict(d3))) - - # ok, so as long as we aren't done (we haven't found an end token), - # keep reading input - done = False - while not done and lexer.y < len(lexer.lines): - old_y = lexer.y - # if this line is empty, then we will skip it, but here weinsert - # an empty null token just so we have something - if len(lexer.lines[lexer.y]) == 0: - null_t = Token(null_t_name, None, lexer.y, lexer.x, '') - lexer.add_token(null_t) - null_t = None - - # ok, as long as we haven't found the end token, and have more - # data on the current line to read, we will process tokens - while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]): - # see if we have found the middle token. if so, we can then - # proceed to "stage 2" - m3 = end_re.match(lexer.lines[lexer.y], lexer.x) - if m3: - self._add_from_regex(context, 'end', lexer, m3, None) - done = True - break - - # ok, we need to check all our rules now, in order. if we - # find a token, note that we found one and exit the loop - found = False - for rule in self.grammar2.rules: - if rule.match(lexer, context + [self.name], d3): - found = True - null_t = None - break - - # if we never found a token, then we need to add another - # character to the current null token (which we should - # create if it isn't set). - if not found: - if null_t is None: - null_t = Token(null_t_name, None, lexer.y, lexer.x, '') - lexer.add_token(null_t) - null_t.add_to_string(lexer.lines[lexer.y][lexer.x]) - lexer.x += 1 - - # ok, since we're soon going to be on a different line (or - # already are), we want a new null token. so forget about the - # current one. - null_t = None - - # if we're still on the same line at this point (and not done) - # then that means we're finished with the line and should move - # on to the next one here - if not done and old_y == lexer.y: - lexer.save_context() - lexer.y += 1 - lexer.x = 0 - - # alright, we're finally done processing; return true - lexer.context.pop(-1) + def resume(self, lexer, context, flag, d, rulecontexts): + if flag == 'start': + d2 = self._match_first(lexer, context, d, None, rulecontexts) + d3 = dict(d.items() + d2.items()) + self._match_second(lexer, context, d3, None, rulecontexts) + return True + elif flag == 'middle': + self._match_second(lexer, context, flag, d, None, rulecontexts) return True else: - # dual region was not matched; we never started. so return false + raise Exception, "invalid flag %r" % flag + + def match(self, lexer, context=[], d={}): + # see if we can match our start token + m = self.start_re.match(lexer.lines[lexer.y], lexer.x) + if m: + # region was match, so let's do this + d1 = m.groupdict() + d2 = self._match_first(lexer, context, d1, m, []) + d3 = dict(d1.items() + d2.items()) + self._match_second(lexer, context, d3, None, []) + return True + else: + # region was not matched; we never started. 
so return false return False + def _match_first(self, lexer, context, d1, m1, rulecontext=[]): + # if we have been given rulecontext, then we are going to "resume" a + # parse that can already be assumed to have started + reenter = len(rulecontext) > 0 + assert m1 or reenter + + # first let's do some bookkeeping + lexer.context.append(RuleContext(lexer.y, lexer.x, self, 'start', + list(context), dict(d1))) + + # ok, so create our start token, and get ready to start reading data + if m1 is not None: + self._add_from_regex(context, 'start', lexer, m1, lexer.grammar) + null_t_name = '.'.join(context + [self.name, 'null']) + null_t = None + + middle_re = re.compile(self.middle % d1) + d2 = {} + + # ok, so as long as we aren't done (we haven't found an end token), + # keep reading input + done = False + while not done and lexer.y < len(lexer.lines): + old_y = lexer.y + + # if this line is empty, then we will skip it, but here weinsert + # an empty null token just so we have something + if len(lexer.lines[lexer.y]) == 0: + null_t = Token(null_t_name, None, lexer.y, lexer.x, '') + lexer.add_token(null_t) + null_t = None + + # ok, as long as we haven't found the end token, and have more + # data on the current line to read, we will process tokens + while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]): + # if we are reentering mid-parse, then that takes precedence + if reenter: + reenter = False + xrule = rulecontext[0].rule + xcontext = rulecontext[0].context + xd = rulecontext[0].matchd + assert rule2.resume(lexer, xcontext, xd, rulecontext[1:]), \ + "%r %r %r %r" % (lexer, xcontext, xd, rulecontext[1:]) + found = True + null_t = None + break + + # see if we have found the middle token. if so, we can then + # proceed to "stage 2" + m2 = middle_re.match(lexer.lines[lexer.y], lexer.x) + if m2: + d2 = m2.groupdict() + self._add_from_regex(context, 'middle', lexer, m2, None) + done = True + break + + # ok, we need to check all our rules now, in order. if we + # find a token, note that we found one and exit the loop + found = False + for rule in self.grammar1.rules: + if rule.match(lexer, context + [self.name], d1): + found = True + null_t = None + break + + # if we never found a token, then we need to add another + # character to the current null token (which we should + # create if it isn't set). + if not found: + if null_t is None: + null_t = Token(null_t_name, None, lexer.y, lexer.x, '') + lexer.add_token(null_t) + null_t.add_to_string(lexer.lines[lexer.y][lexer.x]) + lexer.x += 1 + + # ok, since we're soon going to be on a different line (or + # already are), we want a new null token. so forget about the + # current one. 
+ null_t = None + + # if we're still on the same line at this point (and not done) + # then that means we're finished with the line and should move + # on to the next one here + if not done and old_y == lexer.y: + lexer.save_context() + lexer.y += 1 + lexer.x = 0 + lexer.context.pop(-1) + return d2 + + def _match_second(self, lexer, context, d3, m, rulecontext=[]): + # if we have been given rulecontext, then we are going to "resume" a + # parse that can already be assumed to have started + reenter = len(rulecontext) > 0 + + # ok stage 2 is like stage 1, only we are looking for end tokens + # instead of middle tokens + null_t_name = '.'.join(context + [self.name, 'null']) + null_t = None + end_re = re.compile(self.end % d3) + lexer.context.append(RuleContext(lexer.y, lexer.x, self, 'middle', + list(context), dict(d3))) + + # ok, so as long as we aren't done (we haven't found an end token), + # keep reading input + done = False + while not done and lexer.y < len(lexer.lines): + old_y = lexer.y + + # if we are reentering mid-parse, then that takes precedence + if reenter: + reenter = False + xrule = rulecontext[0].rule + xcontext = rulecontext[0].context + xd = rulecontext[0].matchd + assert rule2.resume(lexer, xcontext, xd, rulecontext[1:]), \ + "%r %r %r %r" % (lexer, xcontext, xd, rulecontext[1:]) + found = True + null_t = None + break + + # if this line is empty, then we will skip it, but here weinsert + # an empty null token just so we have something + if len(lexer.lines[lexer.y]) == 0: + null_t = Token(null_t_name, None, lexer.y, lexer.x, '') + lexer.add_token(null_t) + null_t = None + + # ok, as long as we haven't found the end token, and have more + # data on the current line to read, we will process tokens + while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]): + # see if we have found the middle token. if so, we can then + # proceed to "stage 2" + m3 = end_re.match(lexer.lines[lexer.y], lexer.x) + if m3: + self._add_from_regex(context, 'end', lexer, m3, None) + done = True + break + + # ok, we need to check all our rules now, in order. if we + # find a token, note that we found one and exit the loop + found = False + for rule in self.grammar2.rules: + if rule.match(lexer, context + [self.name], d3): + found = True + null_t = None + break + + # if we never found a token, then we need to add another + # character to the current null token (which we should + # create if it isn't set). + if not found: + if null_t is None: + null_t = Token(null_t_name, None, lexer.y, lexer.x, '') + lexer.add_token(null_t) + null_t.add_to_string(lexer.lines[lexer.y][lexer.x]) + lexer.x += 1 + + # ok, since we're soon going to be on a different line (or + # already are), we want a new null token. so forget about the + # current one. 
+ null_t = None + + # if we're still on the same line at this point (and not done) + # then that means we're finished with the line and should move + # on to the next one here + if not done and old_y == lexer.y: + lexer.save_context() + lexer.y += 1 + lexer.x = 0 + + # alright, we're finally done processing; return true + lexer.context.pop(-1) + return True + +# def matchOLD(self, lexer, context=[], d={}): +# m1 = self.start_re.match(lexer.lines[lexer.y], lexer.x) +# # see if we can match out start token +# if m1: +# # ok, so create our start token, and get ready to start reading data +# self._add_from_regex(context, 'start', lexer, m1, lexer.grammar) +# null_t_name = '.'.join(context + [self.name, 'null']) +# null_t = None +# +# d1 = m1.groupdict() +# lexer.context.append(RuleContext(lexer.y, lexer.x, self, 'start', +# list(context), dict(d1))) +# d2 = {} +# middle_re = re.compile(self.middle % d1) +# +# # ok, so as long as we aren't done (we haven't found an end token), +# # keep reading input +# done = False +# while not done and lexer.y < len(lexer.lines): +# old_y = lexer.y +# # if this line is empty, then we will skip it, but here weinsert +# # an empty null token just so we have something +# if len(lexer.lines[lexer.y]) == 0: +# null_t = Token(null_t_name, None, lexer.y, lexer.x, '') +# lexer.add_token(null_t) +# null_t = None +# +# # ok, as long as we haven't found the end token, and have more +# # data on the current line to read, we will process tokens +# while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]): +# # see if we have found the middle token. if so, we can then +# # proceed to "stage 2" +# m2 = middle_re.match(lexer.lines[lexer.y], lexer.x) +# if m2: +# d2 = m2.groupdict() +# self._add_from_regex(context, 'middle', lexer, m2, None) +# done = True +# break +# +# # ok, we need to check all our rules now, in order. if we +# # find a token, note that we found one and exit the loop +# found = False +# for rule in self.grammar1.rules: +# if rule.match(lexer, context + [self.name], d1): +# found = True +# null_t = None +# break +# +# # if we never found a token, then we need to add another +# # character to the current null token (which we should +# # create if it isn't set). +# if not found: +# if null_t is None: +# null_t = Token(null_t_name, None, lexer.y, lexer.x, '') +# lexer.add_token(null_t) +# null_t.add_to_string(lexer.lines[lexer.y][lexer.x]) +# lexer.x += 1 +# +# # ok, since we're soon going to be on a different line (or +# # already are), we want a new null token. so forget about the +# # current one. 
+# null_t = None +# +# # if we're still on the same line at this point (and not done) +# # then that means we're finished with the line and should move +# # on to the next one here +# if not done and old_y == lexer.y: +# lexer.save_context() +# lexer.y += 1 +# lexer.x = 0 +# +# # ok stage 2 is like stage 1, only we are looking for end tokens +# # instead of middle tokens +# d3 = dict(d1.items() + d2.items()) +# end_re = re.compile(self.end % d3) +# lexer.context.pop(-1) +# lexer.context.append(RuleContext(lexer.y, lexer.x, self, 'middle', +# list(context), dict(d3))) +# +# # ok, so as long as we aren't done (we haven't found an end token), +# # keep reading input +# done = False +# while not done and lexer.y < len(lexer.lines): +# old_y = lexer.y +# # if this line is empty, then we will skip it, but here weinsert +# # an empty null token just so we have something +# if len(lexer.lines[lexer.y]) == 0: +# null_t = Token(null_t_name, None, lexer.y, lexer.x, '') +# lexer.add_token(null_t) +# null_t = None +# +# # ok, as long as we haven't found the end token, and have more +# # data on the current line to read, we will process tokens +# while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]): +# # see if we have found the middle token. if so, we can then +# # proceed to "stage 2" +# m3 = end_re.match(lexer.lines[lexer.y], lexer.x) +# if m3: +# self._add_from_regex(context, 'end', lexer, m3, None) +# done = True +# break +# +# # ok, we need to check all our rules now, in order. if we +# # find a token, note that we found one and exit the loop +# found = False +# for rule in self.grammar2.rules: +# if rule.match(lexer, context + [self.name], d3): +# found = True +# null_t = None +# break +# +# # if we never found a token, then we need to add another +# # character to the current null token (which we should +# # create if it isn't set). +# if not found: +# if null_t is None: +# null_t = Token(null_t_name, None, lexer.y, lexer.x, '') +# lexer.add_token(null_t) +# null_t.add_to_string(lexer.lines[lexer.y][lexer.x]) +# lexer.x += 1 +# +# # ok, since we're soon going to be on a different line (or +# # already are), we want a new null token. so forget about the +# # current one. +# null_t = None +# +# # if we're still on the same line at this point (and not done) +# # then that means we're finished with the line and should move +# # on to the next one here +# if not done and old_y == lexer.y: +# lexer.save_context() +# lexer.y += 1 +# lexer.x = 0 +# +# # alright, we're finally done processing; return true +# lexer.context.pop(-1) +# return True +# else: +# # dual region was not matched; we never started. so return false +# return False + class Grammar: rules = [] def __init__(self): @@ -394,6 +621,17 @@ class Lexer: self.context = [] self.line_contexts = {} + def resume(self, lines, y=0, x=0, rulecontexts=[]): + if len(rulecontexts) == 0: + self.lex(lines, y, x) + else: + self.y = y + self.x = x + self.lines = lines + self.tokens = [] + rc = rulecontexts[0] + rc.rule.resume(self, rc.context, rc.flag, rc.matchd, rulecontexts[1:]) + def __iter__(self): if self.lines is None: raise Exception, "no lines to lex" @@ -406,6 +644,9 @@ class Lexer: null_t_name = 'null' null_t = None + if self.tokens: + return self.tokens.pop(0) + while self.y < len(self.lines): line = self.lines[self.y] while self.x < len(line):
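
Notes on the approach (illustrative sketches only, not part of the patch itself):

The patch replaces the old full relex call (self.lexer.lex(lines, y1, 0)) with self.lexer.resume(lines, y1, 0, rulecontexts): the highlighter keeps, per line, the stack of RuleContexts that was open when that line was last lexed, and a later edit restarts lexing at that line instead of at the top of the buffer. The snippet below shows the same save-state-per-line and resume idea in miniature; TinyLexer, line_state and relex are made-up stand-ins, not the patch's API, and the real code resumes whole rule-context stacks rather than a single boolean.

    import re

    class TinyLexer:
        """Recognizes /* ... */ regions and bare words, one line at a time."""

        def __init__(self):
            self.line_state = {}    # y -> True if line y begins inside a comment region

        def lex(self, lines, y=0, in_comment=False):
            tokens = []
            for i in range(y, len(lines)):
                self.line_state[i] = in_comment
                line, x = lines[i], 0
                while x < len(line):
                    if in_comment:
                        end = line.find('*/', x)
                        if end == -1:
                            tokens.append(('comment', i, line[x:]))
                            x = len(line)
                        else:
                            tokens.append(('comment', i, line[x:end + 2]))
                            x, in_comment = end + 2, False
                    elif line.startswith('/*', x):
                        tokens.append(('comment', i, '/*'))
                        x, in_comment = x + 2, True
                    else:
                        m = re.match(r'\s+|\w+|.', line[x:])
                        if m.group().strip():
                            tokens.append(('word', i, m.group()))
                        x += len(m.group())
            return tokens

        def relex(self, lines, y):
            # resume at line y using the region state remembered from the last pass
            return self.lex(lines, y, self.line_state.get(y, False))

    lexer = TinyLexer()
    lines = ['a = 1  /* start of a', 'multi-line comment', 'end */ b = 2']
    lexer.lex(lines)
    lines[2] = 'end */ c = 3'        # only line 2 changed
    print(lexer.relex(lines, 2))     # line 2 is re-lexed knowing it starts inside the comment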
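
RegionRule.match now delegates to _match, which also accepts a list of saved RuleContexts; when that list is non-empty the rule is "reentering" a region that was already open, so before any normal matching it resumes the innermost saved rule (the reenter branch) and only then continues scanning for its own end token. A rough, self-contained sketch of that hand-off follows; SavedContext and FakeRegion are stand-in names, and the real reenter branch also restores the lexer position and saved groupdict.

    class SavedContext:
        """Stand-in for the patch's RuleContext: which rule was open, plus its data."""
        def __init__(self, rule, matchd):
            self.rule, self.matchd = rule, matchd

    class FakeRegion:
        def __init__(self, name):
            self.name = name

        def resume(self, contexts):
            print('reentering %s' % self.name)
            if contexts:
                # the next saved context is a region nested inside this one; let it
                # pick up first, exactly once, as the reenter branch in _match does
                assert contexts[0].rule.resume(contexts[1:])
            # ...normal matching of this region would continue from here...
            return True

    # a heredoc containing an interpolated expression was open when lexing stopped
    stack = [SavedContext(FakeRegion('heredoc'), {}),
             SavedContext(FakeRegion('interpolation'), {})]
    stack[0].rule.resume(stack[1:])
    # prints: reentering heredoc
    #         reentering interpolation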
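
DualRegionRule gains the same ability, but because it has three delimiters (start, middle, end) its saved RuleContext also carries a flag recording which stage was interrupted: resuming from 'start' runs _match_first and then _match_second, while resuming from 'middle' skips straight to _match_second, whose end regex is compiled from the merged groupdicts. The loose, self-contained analogy below uses a Perl-style s/pattern/replacement/ region; SubstRegion and its regexes are illustrative assumptions, not the patch's classes.

    import re

    class SubstRegion:
        middle_re = re.compile(r'(?<!\\)/')   # unescaped '/' ends the pattern half
        end_re = re.compile(r'(?<!\\)/')      # unescaped '/' ends the replacement half

        def resume(self, text, pos, flag):
            if flag == 'start':               # interrupted in the pattern half
                pos = self.middle_re.search(text, pos).end()
                flag = 'middle'
            if flag == 'middle':              # interrupted in the replacement half
                return self.end_re.search(text, pos).end()
            raise ValueError('invalid flag %r' % flag)

    r = SubstRegion()
    print(r.resume('s/foo/bar/', 2, 'start'))    # 10: found the middle, then the end
    print(r.resume('s/foo/bar/', 6, 'middle'))   # 10: went straight to the end

Recording the stage in the saved context is what lets a resumed parse pick up in the correct half of the region, much as DualRegionRule.resume dispatches on the 'start' or 'middle' flag in the patch.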