diff --git a/lex2.py b/lex2.py
index f4463a6..225b20f 100755
--- a/lex2.py
+++ b/lex2.py
@@ -1,7 +1,7 @@
 import re
 
 valid_name_re = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$')
-reserved_names = ['start', 'null', 'end']
+reserved_names = ['start', 'middle', 'end', 'null']
 
 class Token(object):
     def __init__(self, name, y, x, s, **vargs):
@@ -76,38 +76,55 @@ class RegionRule(Rule):
         lexer.x += len(m.group(0))
     def match(self, lexer, context=[]):
         m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
+        # see if we can match our start token
         if m:
+            # ok, so create our start token, and get ready to start reading data
            self._add_from_regex(context, 'start', lexer, m)
             null_t_name = '.'.join(context + [self.name, 'null'])
             null_t = None
 
+            # if we have an end token, then build it here. notice that it can
+            # reference named groups from the start token. if we have no end,
+            # well, then, we're never getting out of here alive!
             if self.end:
                 end_re = re.compile(self.end % m.groupdict())
 
+            # ok, so as long as we aren't done (we haven't found an end token),
+            # keep reading input
             done = False
-            # NOTE: need to better handle matches that might consume more than
-            # one line of input. #### also, seems like some "region" matching isn't
-            # working, and finally, like the end token(s) might not be handled correctly
            while not done and lexer.y < len(lexer.lines):
                 old_y = lexer.y
+                # if this line is empty, then we will skip it, but here we insert
+                # an empty null token just so we have something
                 if len(lexer.lines[lexer.y]) == 0:
                     null_t = Token(null_t_name, lexer.y, lexer.x, '')
                     lexer.add_token(null_t)
                     null_t = None
+
+                # ok, as long as we haven't found the end token, and have more
+                # data on the current line to read, we will process tokens
                 while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]):
+                    # if we are looking for an end token, then see if we've
+                    # found it. if so, then we are done!
                     if self.end:
                         m = end_re.match(lexer.lines[lexer.y], lexer.x)
                         if m:
                             self._add_from_regex(context, 'end', lexer, m)
                             done = True
-                            continue
+                            break
 
+                    # ok, we need to check all our rules now, in order. if we
+                    # find a token, note that we found one and exit the loop
                     found = False
                     for rule in self.grammar.rules:
                         if rule.match(lexer, context + [self.name]):
                             found = True
                             null_t = None
                             break
+
+                    # if we never found a token, then we need to add another
+                    # character to the current null token (which we should
+                    # create if it isn't set).
                     if not found:
                         if null_t is None:
                             null_t = Token(null_t_name, lexer.y, lexer.x, '')
@@ -115,16 +132,24 @@ class RegionRule(Rule):
                             lexer.add_token(null_t)
                         null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                         lexer.x += 1
+
+                # ok, since we're soon going to be on a different line (or
+                # already are), we want a new null token. so forget about the
+                # current one.
                 null_t = None
+
+                # if we're still on the same line at this point (and not done)
+                # then that means we're finished with the line and should move
+                # on to the next one here
                 if not done and old_y == lexer.y:
                     lexer.y += 1
                     lexer.x = 0
+
+            # alright, we're finally done processing the region, so return true
             return True
         else:
+            # region was not matched; we never started. so return false
             return False
 
-# NOTE: this needs to get synced up with RegionRule's changes...
-# right now, it has at least 2-3 different bugs. suck!
 class DualRegionRule(Rule):
     def __init__(self, name, start, grammar1, middle, grammar2, end):
         assert valid_name_re.match(name), 'invalid name %r' % name
@@ -142,86 +167,137 @@ class DualRegionRule(Rule):
         lexer.add_token(t)
         lexer.x += len(m.group(0))
     def match(self, lexer, context=[]):
-        m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
-        if m:
-            self._add_from_regex(context, 'start', lexer, m)
-
+        m1 = self.start_re.match(lexer.lines[lexer.y], lexer.x)
+        # see if we can match our start token
+        if m1:
+            # ok, so create our start token, and get ready to start reading data
+            self._add_from_regex(context, 'start', lexer, m1)
             null_t_name = '.'.join(context + [self.name, 'null'])
             null_t = None
-            d1 = m.groupdict()
+            d1 = m1.groupdict()
             d2 = {}
+            middle_re = re.compile(self.middle % d1)
+            # ok, so as long as we aren't done (we haven't found an end token),
+            # keep reading input
             done = False
             while not done and lexer.y < len(lexer.lines):
-                line = lexer.lines[lexer.y]
-                if len(line) == 0:
+                old_y = lexer.y
+                # if this line is empty, then we will skip it, but here we insert
+                # an empty null token just so we have something
+                if len(lexer.lines[lexer.y]) == 0:
                     null_t = Token(null_t_name, lexer.y, lexer.x, '')
                     lexer.add_token(null_t)
-                while not done and lexer.x < len(line):
-                    m = middle_re.match(line, lexer.x)
-                    if m:
-                        d2 = m.groupdict()
-                        self._add_from_regex(context, 'middle', lexer, m)
-                        done = True
-                        continue
+                    null_t = None
+                # ok, as long as we haven't found the end token, and have more
+                # data on the current line to read, we will process tokens
+                while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]):
+                    # see if we have found the middle token. if so, we can then
+                    # proceed to "stage 2"
+                    m2 = middle_re.match(lexer.lines[lexer.y], lexer.x)
+                    if m2:
+                        d2 = m2.groupdict()
+                        self._add_from_regex(context, 'middle', lexer, m2)
+                        done = True
+                        break
+
+                    # ok, we need to check all our rules now, in order. if we
+                    # find a token, note that we found one and exit the loop
                     found = False
                     for rule in self.grammar1.rules:
                         if rule.match(lexer, context + [self.name]):
                             found = True
                             null_t = None
                             break
+
+                    # if we never found a token, then we need to add another
+                    # character to the current null token (which we should
+                    # create if it isn't set).
                     if not found:
                         if null_t is None:
                             null_t = Token(null_t_name, lexer.y, lexer.x, '')
                             lexer.add_token(null_t)
-                        null_t.add_to_string(line[lexer.x])
+                        null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                         lexer.x += 1
+
+                # ok, since we're soon going to be on a different line (or
+                # already are), we want a new null token. so forget about the
+                # current one.
                 null_t = None
-                if not done:
+
+                # if we're still on the same line at this point (and not done)
+                # then that means we're finished with the line and should move
+                # on to the next one here
+                if not done and old_y == lexer.y:
                     lexer.y += 1
                     lexer.x = 0
 
-            if self.end:
-                d3 = dict(d1.items() + d2.items())
-                end_re = re.compile(self.end % d3)
+            # ok, stage 2 is like stage 1, only we are looking for end tokens
+            # instead of middle tokens
+            d3 = dict(d1.items() + d2.items())
+            end_re = re.compile(self.end % d3)
 
-            null_t = None
+            # ok, so as long as we aren't done (we haven't found an end token),
+            # keep reading input
             done = False
             while not done and lexer.y < len(lexer.lines):
-                line = lexer.lines[lexer.y]
-                if len(line) == 0:
+                old_y = lexer.y
+                # if this line is empty, then we will skip it, but here we insert
+                # an empty null token just so we have something
+                if len(lexer.lines[lexer.y]) == 0:
                     null_t = Token(null_t_name, lexer.y, lexer.x, '')
                     lexer.add_token(null_t)
-                while not done and lexer.x < len(line):
-                    if self.end:
-                        m = end_re.match(line, lexer.x)
-                        if m:
-                            self._add_from_regex(context, 'end', lexer, m)
-                            done = True
-                            continue
+                    null_t = None
+                # ok, as long as we haven't found the end token, and have more
+                # data on the current line to read, we will process tokens
+                while not done and lexer.y == old_y and lexer.x < len(lexer.lines[lexer.y]):
+                    # see if we have found the end token. if so, then we
+                    # are done!
+                    m3 = end_re.match(lexer.lines[lexer.y], lexer.x)
+                    if m3:
+                        self._add_from_regex(context, 'end', lexer, m3)
+                        done = True
+                        break
+
+                    # ok, we need to check all our rules now, in order. if we
+                    # find a token, note that we found one and exit the loop
                     found = False
                     for rule in self.grammar2.rules:
                         if rule.match(lexer, context + [self.name]):
                             found = True
                             null_t = None
                             break
+
+                    # if we never found a token, then we need to add another
+                    # character to the current null token (which we should
+                    # create if it isn't set).
                     if not found:
                         if null_t is None:
                             null_t = Token(null_t_name, lexer.y, lexer.x, '')
                             lexer.add_token(null_t)
-                        null_t.add_to_string(line[lexer.x])
+                        null_t.add_to_string(lexer.lines[lexer.y][lexer.x])
                         lexer.x += 1
-
+
+                # ok, since we're soon going to be on a different line (or
+                # already are), we want a new null token. so forget about the
+                # current one.
                 null_t = None
-                if not done:
+
+                # if we're still on the same line at this point (and not done)
+                # then that means we're finished with the line and should move
+                # on to the next one here
+                if not done and old_y == lexer.y:
                     lexer.y += 1
                     lexer.x = 0
+
+            # alright, we're finally done processing the dual region; return true
             return True
         else:
+            # dual region was not matched; we never started. so return false
             return False
 
 class Grammar:
@@ -262,11 +338,9 @@ class Lexer:
             line = self.lines[self.y]
             while self.x < len(line):
                 curr_t = None
-                #print 'Checking(%d) %r' % (self.x, line[self.x:])
                 for rule in self.grammar.rules:
                     if rule.match(self):
                         assert self.tokens, "AAAAA %s" % repr(self.tokens)
-                        #print 'Returning(%d)' % self.x
                         return self.tokens.pop(0)
                 if null_t is None:
                     null_t = Token(null_t_name, self.y, self.x, '')
diff --git a/lex2_perl.py b/lex2_perl.py
index 9650842..b618957 100755
--- a/lex2_perl.py
+++ b/lex2_perl.py
@@ -213,13 +213,6 @@ class PerlGrammar(Grammar):
             end=r'/ *[a-z]*',
         ),
 
-        # we officially don't support the bullshit s{a}{b} thing perl has going.
-        # those guys are on crack. we only support things like s#a#b# or s/a/b/.
-        # same comments as above apply
-        #{'name': 'replace regex',
-        # 'expr': r"""(?:y|tr|s)([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1(?:\\.|[^\\])*?\1[a-z]*""",
-        # 'action': lex.make_token},
-
         PatternRule(
             name=r'package',
             pattern=r"(?<=package )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*",
diff --git a/test3.py b/test3.py
index b78d2b8..2915662 100644
--- a/test3.py
+++ b/test3.py
@@ -30,16 +30,16 @@ token_colors = {
     'endblock': 'lred',
     'pod': 'lred',
     'comment': 'lred',
-    #'string1': 'lgreen',
+    #'string1': 'lgreen',
     'string1.start': 'lgreen',
     'string1.null': 'lgreen',
     'string1.escaped': 'lpurple',
-    #'string1.scalar': 'yellow',
-    #'string1.system_scalar': 'yellow',
+    #'string1.scalar': 'yellow',
+    #'string1.system_scalar': 'yellow',
     'string1.hash_deref': 'yellow',
-    #'string1.hash_bareword_index': 'lgreen',
+    #'string1.hash_bareword_index': 'lgreen',
     'string1.end': 'lgreen',
-    #'string2': 'lgreen',
+    #'string2': 'lgreen',
     'string2.start': 'lgreen',
     'string2.null': 'lgreen',
     'string2.end': 'lgreen',
@@ -56,7 +56,10 @@ token_colors = {
     'bareword_hash_index': 'lgreen',
     'quoted_region': 'lcyan',
     'match_regex': 'lcyan',
-    'replace_regex': 'lcyan',
+    'replace_regex.start': 'lcyan',
+    'replace_regex.middle': 'lcyan',
+    'replace_regex.end': 'lcyan',
+    'replace_regex.null': 'lcyan',
     'bareword_hash_key': 'lgreen',
     'interpolated_scalar': 'yellow',
     'interpolated_system_scalar': 'yellow',
@@ -73,7 +76,7 @@ token_colors = {
     'static_method': 'lcyan',
     'builtin_method': 'lpurple',
     'bareword_method': 'lcyan',
-    'bareword': 'yellow',
+    #'bareword': 'yellow',
     'bizzaro': 'lpurple',
 }
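
Not part of the patch, but for reference: the mechanism RegionRule and DualRegionRule lean on above is plain %-interpolation. The start regex captures named groups, and those groups are substituted into the middle/end patterns before they are compiled (end_re = re.compile(self.end % m.groupdict()), middle_re = re.compile(self.middle % d1)). Below is a minimal standalone sketch using made-up patterns in the spirit of Perl's s/PATTERN/REPLACEMENT/flags; the real replace_regex patterns in lex2_perl.py are not shown in this diff.

    import re

    # hypothetical patterns, not taken from lex2_perl.py
    start  = r's(?P<delim>[^a-zA-Z0-9_ ])'   # captures the delimiter as a named group
    middle = r'%(delim)s'                    # refers back to it via %-interpolation
    end    = r'%(delim)s[a-z]*'              # same, plus trailing flags

    line = 's/foo/bar/g'
    m1 = re.compile(start).match(line)
    d1 = m1.groupdict()                      # {'delim': '/'}

    # the same trick the patch uses: re.compile(self.end % m.groupdict())
    middle_re = re.compile(middle % d1)      # compiled as '/'
    end_re    = re.compile(end % d1)         # compiled as '/[a-z]*'

    assert middle_re.match(line, 5)                  # the '/' after 'foo'
    assert end_re.match(line, 9).group(0) == '/g'    # the final '/' plus the flag

Note that the captured delimiter is interpolated into the later patterns verbatim; a delimiter that happened to be a regex metacharacter would need re.escape() first.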