From 210f8f1a7c2577e3feeba80498bc7a53be31c849 Mon Sep 17 00:00:00 2001 From: moculus Date: Wed, 28 Mar 2007 05:09:04 +0000 Subject: [PATCH] hola --HG-- branch : pmacs2 --- lex2.py | 346 ++++++++++++-------------------------- lex2_perl.py | 457 ++++++++++++++++++++++++++++++--------------------- test2.py | 6 +- 3 files changed, 372 insertions(+), 437 deletions(-) diff --git a/lex2.py b/lex2.py index 714f7fd..593c742 100755 --- a/lex2.py +++ b/lex2.py @@ -36,7 +36,8 @@ class ConstantRule(Rule): self.constant = constant def match(self, lexer, context=[]): if lexer.lines[lexer.y][lexer.x:].startswith(self.constant): - lexer.add_token(self.make_token(lexer, self.constant, self.name)) + name = '.'.join(context + [self.name]) + lexer.add_token(self.make_token(lexer, self.constant, name)) lexer.x += len(self.constant) return True else: @@ -52,7 +53,8 @@ class PatternRule(Rule): def match(self, lexer, context=[]): m = self.re.match(lexer.lines[lexer.y], lexer.x) if m: - lexer.add_token(self.make_token(lexer, m.group(0), self.name)) + name = '.'.join(context + [self.name]) + lexer.add_token(self.make_token(lexer, m.group(0), name)) lexer.x += len(m.group(0)) return True else: @@ -117,251 +119,107 @@ class RegionRule(Rule): else: return False +class DualRegionRule(Rule): + def __init__(self, name, start, grammar1, middle, grammar2, end): + assert valid_name_re.match(name), 'invalid name %r' % name + assert name not in reserved_names, "reserved rule name: %r" % name + self.name = name + self.start = start + self.grammar1 = grammar1 + self.middle = middle + self.grammar2 = grammar2 + self.end = end + self.start_re = re.compile(start) + def _add_from_regex(self, context, name, lexer, m): + t_name = '.'.join(context + [self.name, name]) + t = self.make_token(lexer, m.group(0), t_name) + lexer.add_token(t) + lexer.x += len(m.group(0)) + def match(self, lexer, context=[]): + m = self.start_re.match(lexer.lines[lexer.y], lexer.x) + if m: + self._add_from_regex(context, 'start', lexer, m) + + null_t_name = '.'.join(context + [self.name, 'null']) + null_t = None + + d1 = m.groupdict() + d2 = {} + middle_re = re.compile(self.middle % d1) + + done = False + while not done and lexer.y < len(lexer.lines): + line = lexer.lines[lexer.y] + if len(line) == 0: + null_t = Token(null_t_name, lexer.y, lexer.x, '') + lexer.add_token(null_t) + while not done and lexer.x < len(line): + m = middle_re.match(line, lexer.x) + if m: + d2 = m.groupdict() + self._add_from_regex(context, 'middle', lexer, m) + done = True + continue + + found = False + for rule in self.grammar1.rules: + if rule.match(lexer, context + [self.name]): + found = True + null_t = None + break + if not found: + if null_t is None: + null_t = Token(null_t_name, lexer.y, lexer.x, '') + lexer.add_token(null_t) + null_t.add_to_string(line[lexer.x]) + lexer.x += 1 + null_t = None + if not done: + lexer.y += 1 + lexer.x = 0 + + if self.end: + d3 = dict(d1.items() + d2.items()) + end_re = re.compile(self.end % d3) + + null_t = None + done = False + while not done and lexer.y < len(lexer.lines): + line = lexer.lines[lexer.y] + if len(line) == 0: + null_t = Token(null_t_name, lexer.y, lexer.x, '') + lexer.add_token(null_t) + while not done and lexer.x < len(line): + if self.end: + m = end_re.match(line, lexer.x) + if m: + self._add_from_regex(context, 'end', lexer, m) + done = True + continue + + found = False + for rule in self.grammar2.rules: + if rule.match(lexer, context + [self.name]): + found = True + null_t = None + break + if not found: + if null_t is None: + null_t = Token(null_t_name, lexer.y, lexer.x, '') + lexer.add_token(null_t) + null_t.add_to_string(line[lexer.x]) + lexer.x += 1 + + null_t = None + lexer.y += 1 + lexer.x = 0 + return True + else: + return False + class Grammar: rules = [] -class NullGrammar(Grammar): - pass - -class PodGrammar(Grammar): - pass - -class StringGrammar(Grammar): - rules = [ - PatternRule('escaped', '\\.'), - ] - -class TestGrammar(Grammar): - rules = [ - RegionRule( - name='heredoc', - start="<< *(?P[a-zA-Z0-9_]+) *;", - grammar=StringGrammar(), - end='^%(heredoc)s$', - ), - RegionRule( - name='string1', - start='"', - grammar=StringGrammar(), - end='"', - ), - RegionRule( - name='string2', - start="'", - grammar=StringGrammar(), - end="'", - ), - PatternRule( - name='word', - pattern='[^ \t\n]+', - ), - ] - -class PerlGrammar(Grammar): - rules = [ - RegionRule( - name='heredoc', - start="<< *(?P[a-zA-Z0-9_]+) *;", - grammar=StringGrammar(), - end='^%(heredoc)s$', - ), - RegionRule( - name='endblock', - start="^__END__|__DATA__ *$", - grammar=NullGrammar(), - end='', - ), - RegionRule( - name='pod', - start='^=[a-zA-Z0-9_]+', - grammar=PodGrammar(), - end='^=cut', - ), - PatternRule( - name='comment', - pattern='#.*$', - ), - RegionRule( - name='string1', - start='"', - grammar=StringGrammar(), - end='"', - ), - RegionRule( - name='string2', - start="'", - grammar=StringGrammar(), - end="'", - ), - RegionRule( - name='evalstring', - start="`", - grammar=StringGrammar(), - end="`", - ), - PatternRule( - name='number', - pattern='0?\.[0-9]+|[0-9]+(?:\.[0-9]+)?', - ), - PatternRule( - name='keyword', - pattern="(?)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z0-9_])", - ), - PatternRule( - name='hash_bareword_index', - pattern='(?<={) *[A-Za-z0-9_]+(?=})', - ), - PatternRule( - name='literal_hash_bareword_index', - pattern='[A-Za-z0-9_]+(?= *=>)', - ), - PatternRule( - name='length_scalar', - pattern=r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*", - ), - PatternRule( - name='system_scalar', - pattern=r"\$[][>)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z_])""", - 'action': lex.make_token}, - - {'name': 'hash bareword index', - 'expr': r"(?<={)[A-Za-z0-9_]+(?=})", - 'action': lex.make_token}, - - {'name': 'literal hash bareword index', - 'expr': r"[A-Za-z0-9_]+(?= *=>)", - 'action': lex.make_token}, - - {'name': 'length scalar', - 'expr': r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*", - 'action': lex.make_token}, - - {'name': 'system scalar', - 'expr': r"\$[][>[a-z]*|m\[(?:\\.|[^\\])*?\][a-z]*""", - 'action': lex.make_token}, +class PerlGrammar(Grammar): + rules = [ + RegionRule( + name=r'heredoc', + start=r"<< *(?P[a-zA-Z0-9_]+) *;", + grammar=StringGrammar(), + end=r'^%(heredoc)s$', + ), + RegionRule( + name=r'endblock', + start=r"^__END__|__DATA__ *$", + grammar=Grammar(), + end=r'', + ), + RegionRule( + name=r'pod', + start=r'^=[a-zA-Z0-9_]+', + grammar=PodGrammar(), + end=r'^=cut', + ), + PatternRule( + name=r'comment', + pattern=r'#.*$', + ), + RegionRule( + name=r'string1', + start=r'"', + grammar=StringGrammar(), + end=r'"', + ), + RegionRule( + name=r'string2', + start=r"'", + grammar=StringGrammar(), + end=r"'", + ), + RegionRule( + name=r'evalstring', + start=r"`", + grammar=StringGrammar(), + end=r"`", + ), + PatternRule( + name=r'number', + pattern=r'0?\.[0-9]+|[0-9]+(?:\.[0-9]+)?', + ), + PatternRule( + name=r'keyword', + pattern=r"(?)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z0-9_])", + ), + PatternRule( + name=r'hash_bareword_index', + pattern=r'(?<={) *[A-Za-z0-9_]+(?=})', + ), + PatternRule( + name=r'literal_hash_bareword_index', + pattern=r'[A-Za-z0-9_]+(?= *=>)', + ), + PatternRule( + name=r'length_scalar', + pattern=r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*", + ), + PatternRule( + name=r'system_scalar', + pattern=r"\$[][>[^ #])', + grammar=Grammar(), + end=r'%(delim)s', + ), + RegionRule( + name=r'quoted_region2', + start=r'q[rqwx]?#', + grammar=Grammar(), + end=r'#', + ), + RegionRule( + name=r'bracket_quoted_region1', + start=r'q[rqwx]? *\(', + grammar=Grammar(), + end=r'\)', + ), + RegionRule( + name=r'bracket_quoted_region2', + start=r'q[rqwx]? *{', + grammar=Grammar(), + end=r'}', + ), + RegionRule( + name=r'bracket_quoted_region3', + start=r'q[rqwx]? *<', + grammar=Grammar(), + end=r'>', + ), + RegionRule( + name=r'bracket_quoted_region4', + start=r'q[rqwx]? *\[', + grammar=Grammar(), + end=r'\]', + ), + RegionRule( + name=r'implicit_match_regex', + start=r'(?:(?<==~)|(?<=!~)|(?<=\()) */', + grammar=StringGrammar(), + end=r'/', + ), + RegionRule( + name=r'explicit_match_regex1', + start=r'm *(?P[^ #])', + grammar=StringGrammar(), + end=r'%(delim)s', + ), + RegionRule( + name=r'explicit_match_regex1', + start=r'm#', + grammar=StringGrammar(), + end=r'#', + ), + DualRegionRule( + name=r'replace_regex', + start=r's */', + grammar1=StringGrammar(), + middle=r' */ *', + grammar2=StringGrammar(), + end=r'/ *[a-z]*', + ), # we officially don't support the bullshit s{a}{b} thing perl has going. # those guys are on crack. we only support things like s#a#b# or s/a/b/. # same comments as above apply - {'name': 'replace regex', - 'expr': r"""(?:y|tr|s)([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1(?:\\.|[^\\])*?\1[a-z]*""", - 'action': lex.make_token}, + #{'name': 'replace regex', + # 'expr': r"""(?:y|tr|s)([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1(?:\\.|[^\\])*?\1[a-z]*""", + # 'action': lex.make_token}, - {'name': 'package', - 'expr': r"""(?<=package )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'use', - 'expr': r"""(?<=use )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'instance method', - 'expr': r"""(?<=->)[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'static method', - 'expr': r"""&?(?:[a-zA-Z_][a-zA-Z_0-9]*::)+[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'method declaration', - 'expr': r"""(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*{)""", - 'action': lex.make_token}, - - {'name': 'built-in method', - 'expr': r"""(?)&?(?:write|warn|wantarray|waitpid|wait|vec|values|utime|use|untie|unshift|unpack|unlink|undef|umask|ucfirst|uc|truncate|times|time|tied|tie|telldir|tell|syswrite|system|sysseek|sysread|sysopen|syscall|symlink|substr|sub|study|stat|srand|sqrt|sprintf|split|splice|sort|socketpair|socket|sleep|sin|shutdown|shmwrite|shmread|shmget|shmctl|shift|setsockopt|setservent|setpwent|setprotoent|setpriority|setpgrp|setnetent|sethostent|setgrent|send|semop|semget|semctl|select|seekdir|seek|scalar|rmdir|rindex|rewinddir|reverse|return|reset|require|rename|ref|redo|recv|readpipe|readlink|readline|readdir|read|rand|quotemeta|push|prototype|printf|print|pos|pop|pipe|package|pack|our|ord|opendir|open|oct|no|next|my|msgsnd|msgrcv|msgget|msgctl|mkdir|map|lstat|log|lock|localtime|local|listen|link|length|lcfirst|lc|last|kill|keys|join|ioctl|int|index|import|hex|grep|goto|gmtime|glob|getsockopt|getsockname|getservent|getservbyport|getservbyname|getpwuid|getpwnam|getpwent|getprotoent|getprotobynumber|getprotobyname|getpriority|getppid|getpgrp|getpeername|getnetent|getnetbyname|getnetbyaddr|getlogin|gethostent|gethostbyname|gethostbyaddr|getgrnam|getgrgid|getgrent|getc|formline|format|fork|flock|fileno|fcntl|exp|exit|exists|exec|eval|eof|endservent|endpwent|endprotoent|endnetent|endhostent|endgrent|each|dump|do|die|delete|defined|dbmopen|dbmclose|crypt|cos|continue|connect|closedir|close|chroot|chr|chown|chop|chomp|chmod|chdir|caller|bless|binmode|bind|atan2|alarm|accept|abs)(?![a-zA-Z0-9_])""", - #'expr':r"""(?)&?(?:abs|accept|alarm|atan2|bind|binmode|bless|caller|chdir|chmod|chomp|chop|chown|chroot|chr|closedir|close|connect|cos|crypt|dbmclose|dbmopen|defined|delete|die|dump|each|eof|exec|exists|exit|exp|fcntl|fileno|flock|fork|format|formline|getc|getlogin|getpeername|grep|int|join|keys|lc|map|open|pop|print|push|rand|readdir|ref|scalar|select|shift|sort|split|srand|time|uc|unshift|values|wantarray|warn)(?![a-zA-Z0-9_])""", - 'action': lex.make_token}, - - {'name': 'method', - 'expr': r"""&(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'methodref', - 'expr': r"""&\$(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'bareword method', - 'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*(?:\(|->))""", - 'action': lex.make_token}, - - {'name': "delimiter", - 'expr': r"""\(|\)|\[|\]|{|}|,|;|=>|=|\?|(?>=|<<=|\*\*=""", - 'action': lex.make_token}, - - {'name': "operator", - 'expr': r"""\+|<=>|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|=~|!~|!=|%|!|\.""", - 'action': lex.make_token}, - - {'name': 'bareword', - 'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'default', - 'expr': r""".|\n""", - 'action': lex.silent} - ] - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - sub_exprs = {} - string_rules = [] - - for rdir in PerlGrammar.GRAMMAR_LIST: - self.add_rule(**rdir) - - if INTERPOLATION_HIGHLIGHTING: - if rdir['name'] in ('scalar', 'system scalar', 'array', 'hash', - 'system array'): - rdir2 = rdir.copy() - rdir2['name'] = 'interpolated ' + rdir['name'] - string_rules.append(lex.Rule(**rdir2)) - elif rdir['name'] in ('heredoc', 'string1', 'string2'): - sub_exprs[rdir['name']] = rdir['expr'] - - if INTERPOLATION_HIGHLIGHTING: - string_rules.append(lex.Rule(name="default string", - expr=r"""(?:\\.|[^\\\$]|\n)+|\$""", - action=lex.make_token)) - string_grammar = lex.Grammar(rules=string_rules) - - self.insert(0, lex.SubRule(name='heredoc', - expr=sub_exprs['heredoc'], - grammar=string_grammar)) - - self.insert(4, lex.SubRule(name="string1", - expr=sub_exprs['string1'], - grammar=string_grammar)) - - self.insert(5, lex.SubRule(name="string2", - expr=sub_exprs['string2'], - grammar=string_grammar)) + PatternRule( + name=r'package', + pattern=r"(?<=package )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*", + ), + PatternRule( + name=r'sub', + pattern=r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*{)", + ), + PatternRule( + name=r'use', + pattern=r"(?<=use )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*", + ), + PatternRule( + name=r'label', + pattern=r'[a-zA-Z_][a-zA-Z0-9_]*:', + ), + PatternRule( + name=r'instance_method', + pattern=r"(?<=->)[a-zA-Z_][a-zA-Z_0-9]*", + ), + PatternRule( + name=r'static_method', + pattern=r"&?(?:[a-zA-Z_][a-zA-Z_0-9]*::)+[a-zA-Z_][a-zA-Z_0-9]*", + ), + PatternRule( + name=r'builtin_method', + pattern=r"(?)&?(?:write|warn|wantarray|waitpid|wait|vec|values|utime|use|untie|unshift|unpack|unlink|undef|umask|ucfirst|uc|truncate|times|time|tied|tie|telldir|tell|syswrite|system|sysseek|sysread|sysopen|syscall|symlink|substr|sub|study|stat|srand|sqrt|sprintf|split|splice|sort|socketpair|socket|sleep|sin|shutdown|shmwrite|shmread|shmget|shmctl|shift|setsockopt|setservent|setpwent|setprotoent|setpriority|setpgrp|setnetent|sethostent|setgrent|send|semop|semget|semctl|select|seekdir|seek|scalar|rmdir|rindex|rewinddir|reverse|return|reset|require|rename|ref|redo|recv|readpipe|readlink|readline|readdir|read|rand|quotemeta|push|prototype|printf|print|pos|pop|pipe|package|pack|our|ord|opendir|open|oct|no|next|my|msgsnd|msgrcv|msgget|msgctl|mkdir|map|lstat|log|lock|localtime|local|listen|link|length|lcfirst|lc|last|kill|keys|join|ioctl|int|index|import|hex|grep|goto|gmtime|glob|getsockopt|getsockname|getservent|getservbyport|getservbyname|getpwuid|getpwnam|getpwent|getprotoent|getprotobynumber|getprotobyname|getpriority|getppid|getpgrp|getpeername|getnetent|getnetbyname|getnetbyaddr|getlogin|gethostent|gethostbyname|gethostbyaddr|getgrnam|getgrgid|getgrent|getc|formline|format|fork|flock|fileno|fcntl|exp|exit|exists|exec|eval|eof|endservent|endpwent|endprotoent|endnetent|endhostent|endgrent|each|dump|do|die|delete|defined|dbmopen|dbmclose|crypt|cos|continue|connect|closedir|close|chroot|chr|chown|chop|chomp|chmod|chdir|caller|bless|binmode|bind|atan2|alarm|accept|abs)(?![a-zA-Z0-9_])", + ), + PatternRule( + name=r'method', + pattern=r"&(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*", + ), + PatternRule( + name=r'ref_method', + pattern=r"&\$(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*", + ), + PatternRule( + name=r'bareword_method', + pattern=r"(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]* *\(", + ), + PatternRule( + name=r'delimiter', + pattern=r"\(|\)|\[|\]|{|}|,|;|->|=>|=|\?|(?>=|<<=|\*\*=", + ), + PatternRule( + name=r'operator', + pattern=r"\+|<=>|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|=~|!~|!=|%|!|\.", + ), + PatternRule( + name=r'bareword', + pattern=r'(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*', + ), + ] diff --git a/test2.py b/test2.py index 7bc404b..baa6ac9 100644 --- a/test2.py +++ b/test2.py @@ -1,5 +1,5 @@ import sys -import lex2 +import lex2, lex2_perl paths = sys.argv[1:] for path in paths: @@ -9,10 +9,10 @@ for path in paths: lines = data.split('\n') - grammar = lex2.PerlGrammar() + grammar = lex2_perl.PerlGrammar() lexer = lex2.Lexer('lexer', grammar) lexer.lex(lines) print path for token in lexer: - print '%-20s| %s' % (token.name, token.string) + print '%-28s| %s' % (token.name, token.string)