#!/bin/env python
"""Lexer grammar for Perl source, built on the project's ``lex`` module.

The grammar is an ordered list of named regular-expression rules
(``PerlGrammar.GRAMMAR_LIST``).  Rule order is significant and must not be
changed casually: ``_default_rules`` registers the rules in list order and,
when interpolation highlighting is enabled, inserts extra rules at fixed
numeric positions.
"""

# 2.3 imports
from optparse import OptionParser

# our imports
import lex

# When true, string-like tokens are additionally lexed with a sub-grammar so
# that variables interpolated inside them are highlighted.  This can be
# slower and is possibly buggier, so it is off by default.
INTERPOLATION_HIGHLIGHTING = False
#INTERPOLATION_HIGHLIGHTING = True


class PerlGrammar(lex.Grammar):
    """Grammar describing the tokens of Perl source code.

    Each entry of ``GRAMMAR_LIST`` is a dict of keyword arguments
    (``name``, ``expr``, ``action``) that ``_default_rules`` passes to
    ``self.add_rule``.
    """

    GRAMMAR_LIST = [
        {'name': 'heredoc',
         'expr': r"""<< *([a-zA-Z0-9_]+) *;(?:.*?\n)*?(?:\1|$)""",
         'action': lex.make_token},

        {'name': 'endblock',
         'expr': r"""(?:^|\n)(?:__END__|__DATA__)(?:.|\n)*$""",
         'action': lex.make_token},

        {'name': 'pod',
         'expr': r"""(?:^|(?<=\n))=[a-zA-Z0-9_]+.*(?:\n(?!=cut).*)*(?:\n=cut|$)""",
         'action': lex.make_token},

        {'name': "comment",
         'expr': r'[#].*(?:\n|$)',
         'action': lex.make_token},

        {'name': "string1",
         'expr': r'''"(?:\\(?:.|\n)|[^\\"]|[ \n])*(?:"|.?$)''',
         'action': lex.make_token},

        {'name': "string2",
         'expr': r"""'(?:\\(?:.|\n)|[^\\'])*(?:'|.?$)""",
         'action': lex.make_token},

        {'name': "evalstring",
         'expr': r"""`(?:\\(?:.|\n)|[^\\`])*(?:`|.?$)""",
         'action': lex.make_token},

        {'name': 'number',
         'expr': r"""0?\.[0-9]+|[0-9]+(?:\.[0-9]+)?""",
         'action': lex.make_token},

        {'name': 'label',
         'expr': r"""[a-zA-Z_][a-zA-Z0-9_]*:(?= |\n)""",
         'action': lex.make_token},

        {'name': 'keyword',
         'expr': r"""(?<!->)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|no|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z_])""",
         'action': lex.make_token},

        {'name': 'hash bareword index',
         'expr': r"(?<={)[A-Za-z0-9_]+(?=})",
         'action': lex.make_token},

        {'name': 'literal hash bareword index',
         'expr': r"[A-Za-z0-9_]+(?= *=>)",
         'action': lex.make_token},

        {'name': 'length scalar',
         'expr': r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*",
         'action': lex.make_token},

        {'name': 'system scalar',
         'expr': r"\$[][><ab/'\"_@\?#\$!%^|&*()](?![A-Za-z0-9_])",
         'action': lex.make_token},

        {'name': 'system array',
         'expr': r"@_",
         'action': lex.make_token},

        {'name': 'scalar',
         'expr': r"""\$\$*[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*""",
         'action': lex.make_token},

        {'name': 'array',
         'expr': r"""@\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*""",
         'action': lex.make_token},

        {'name': 'hash',
         'expr': r"""%\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*""",
         'action': lex.make_token},

        {'name': 'dereference',
         'expr': r"""[@%\$&\*](?={)""",
         'action': lex.make_token},

        # This isn't totally right, but it is meant to handle q//, q{} and
        # q(), which are the commonest forms.
        # NOTE(review): every alternative starts with "q." (q plus exactly
        # one arbitrary character), so qq(...)/qw(...) match but a bare
        # q(...) does not appear to -- confirm whether "q.?" was intended.
        {'name': 'quoted region',
         'expr': r"""q.\((?:\\.|[^\\\)])*\)|q./(?:\\.|[^\\/])*/|q.\{(?:\\.|[^\\\}])*\}""",
         'action': lex.make_token},

        # Match regexes are awkward: the basic idea is easy, but Perl has
        # many variants.  If the m is not present, / must be the delimiter.
        # Otherwise any non-alphanumeric, non-whitespace character can be
        # used.  With <, (, [ or { the closing delimiter is the opposite
        # bracket, so those four are special-cased below.
        #
        # basic gist: /(\\.|[^\\])*?/[a-z]*
        {'name': 'match regex',
         'expr': r"""(?:(?<==~)|(?<=!~)|(?<=\()) */(?:\\.|[^\\/])*/[a-z]*|m([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1[a-z]*|m\((?:\\.|[^\\])*?\)[a-z]*|m{(?:\\.|[^\\])*?}[a-z]*|m<(?:\\.|[^\\])*?>[a-z]*|m\[(?:\\.|[^\\])*?\][a-z]*""",
         'action': lex.make_token},

        # The bracketed s{a}{b} form is deliberately not supported; only
        # same-delimiter forms such as s#a#b# or s/a/b/ are recognised.
        # The comments on 'match regex' above apply here too.
        {'name': 'replace regex',
         'expr': r"""(?:y|tr|s)([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1(?:\\.|[^\\])*?\1[a-z]*""",
         'action': lex.make_token},

        {'name': 'package',
         'expr': r"""(?<=package )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'use',
         'expr': r"""(?<=use )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'instance method',
         'expr': r"""(?<=->)[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'static method',
         'expr': r"""&?(?:[a-zA-Z_][a-zA-Z_0-9]*::)+[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'method declaration',
         'expr': r"""(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*{)""",
         'action': lex.make_token},

        # The names are listed in reverse alphabetical order, which puts a
        # longer name ahead of any shorter name that is its prefix (e.g.
        # "waitpid" before "wait"), so the alternation tries the longer
        # name first.  The commented-out 'expr' below is a shorter
        # alternative list kept from the original source.
        {'name': 'built-in method',
         'expr': r"""(?<!->)&?(?:write|warn|wantarray|waitpid|wait|vec|values|utime|use|untie|unshift|unpack|unlink|undef|umask|ucfirst|uc|truncate|times|time|tied|tie|telldir|tell|syswrite|system|sysseek|sysread|sysopen|syscall|symlink|substr|sub|study|stat|srand|sqrt|sprintf|split|splice|sort|socketpair|socket|sleep|sin|shutdown|shmwrite|shmread|shmget|shmctl|shift|setsockopt|setservent|setpwent|setprotoent|setpriority|setpgrp|setnetent|sethostent|setgrent|send|semop|semget|semctl|select|seekdir|seek|scalar|rmdir|rindex|rewinddir|reverse|return|reset|require|rename|ref|redo|recv|readpipe|readlink|readline|readdir|read|rand|quotemeta|push|prototype|printf|print|pos|pop|pipe|package|pack|our|ord|opendir|open|oct|no|next|my|msgsnd|msgrcv|msgget|msgctl|mkdir|map|lstat|log|lock|localtime|local|listen|link|length|lcfirst|lc|last|kill|keys|join|ioctl|int|index|import|hex|grep|goto|gmtime|glob|getsockopt|getsockname|getservent|getservbyport|getservbyname|getpwuid|getpwnam|getpwent|getprotoent|getprotobynumber|getprotobyname|getpriority|getppid|getpgrp|getpeername|getnetent|getnetbyname|getnetbyaddr|getlogin|gethostent|gethostbyname|gethostbyaddr|getgrnam|getgrgid|getgrent|getc|formline|format|fork|flock|fileno|fcntl|exp|exit|exists|exec|eval|eof|endservent|endpwent|endprotoent|endnetent|endhostent|endgrent|each|dump|do|die|delete|defined|dbmopen|dbmclose|crypt|cos|continue|connect|closedir|close|chroot|chr|chown|chop|chomp|chmod|chdir|caller|bless|binmode|bind|atan2|alarm|accept|abs)(?![a-zA-Z0-9_])""",
         #'expr':r"""(?<!->)&?(?:abs|accept|alarm|atan2|bind|binmode|bless|caller|chdir|chmod|chomp|chop|chown|chroot|chr|closedir|close|connect|cos|crypt|dbmclose|dbmopen|defined|delete|die|dump|each|eof|exec|exists|exit|exp|fcntl|fileno|flock|fork|format|formline|getc|getlogin|getpeername|grep|int|join|keys|lc|map|open|pop|print|push|rand|readdir|ref|scalar|select|shift|sort|split|srand|time|uc|unshift|values|wantarray|warn)(?![a-zA-Z0-9_])""",
         'action': lex.make_token},

        {'name': 'method',
         'expr': r"""&(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'methodref',
         'expr': r"""&\$(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'bareword method',
         'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*(?:\(|->))""",
         'action': lex.make_token},

        # NOTE(review): the trailing "(?!=:)" is a negative lookahead for
        # the two characters "=:"; it looks like "(?!:)" may have been
        # intended (a colon not followed by another colon) -- confirm
        # before changing.
        {'name': "delimiter",
         'expr': r"""\(|\)|\[|\]|{|}|,|;|=>|=|\?|(?<!:):(?!=:)""",
         'action': lex.make_token},

        # Despite the rule name, these are Perl's compound assignment
        # operators.  The name is kept because it is a runtime identifier.
        # "&=", "|=" and "^=" are three separate alternatives; previously
        # the bar between "&=" and "\|=" was missing, so that part of the
        # pattern only matched the literal text "&=|^=".
        {'name': "unary operator",
         'expr': r"""\+=|-=|\*=|/=|//=|%=|&=|\|=|\^=|>>=|<<=|\*\*=""",
         'action': lex.make_token},

        {'name': "operator",
         'expr': r"""\+|<=>|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|=~|!~|!=|%|!|\.""",
         'action': lex.make_token},

        {'name': 'bareword',
         'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        # Catch-all: any single character (or newline) no rule above matched.
        {'name': 'default',
         'expr': r""".|\n""",
         'action': lex.silent}
    ]

    def _default_rules(self):
        """Install the default rule set for this grammar.

        Every entry of ``PerlGrammar.GRAMMAR_LIST`` is registered, in
        order, through ``self.add_rule``.

        When the module-level ``INTERPOLATION_HIGHLIGHTING`` flag is true,
        a second grammar is built from copies of the variable rules
        (renamed with an ``'interpolated '`` prefix) plus a
        ``'default string'`` rule.  ``lex.SubRule`` objects using that
        grammar are then inserted for ``heredoc``, ``string1`` and
        ``string2``, reusing those rules' expressions.

        Subclasses can override this to define different defaults.
        """
        variable_rule_names = ('scalar', 'system scalar', 'array', 'hash',
                               'system array')
        string_rule_names = ('heredoc', 'string1', 'string2')

        sub_exprs = {}
        string_rules = []

        for rdir in PerlGrammar.GRAMMAR_LIST:
            self.add_rule(**rdir)

            if not INTERPOLATION_HIGHLIGHTING:
                continue

            rule_name = rdir['name']
            if rule_name in variable_rule_names:
                # Copy first so the shared class-level dict is not mutated.
                rdir2 = rdir.copy()
                rdir2['name'] = 'interpolated ' + rule_name
                string_rules.append(lex.Rule(**rdir2))
            elif rule_name in string_rule_names:
                # Remember the pattern so the SubRule below can reuse it.
                sub_exprs[rule_name] = rdir['expr']

        if INTERPOLATION_HIGHLIGHTING:
            # Fallback inside a string: a run of ordinary/escaped text, or
            # a lone "$" that did not start a recognised variable.
            string_rules.append(lex.Rule(name="default string",
                                         expr=r"""(?:\\.|[^\\\$]|\n)+|\$""",
                                         action=lex.make_token))
            string_grammar = lex.Grammar(rules=string_rules)

            # The hard-coded positions depend on the order of GRAMMAR_LIST.
            # Presumably add_rule appends and insert() behaves like
            # list.insert, so each SubRule lands ahead of the plain rule of
            # the same name -- TODO confirm against lex.Grammar.
            # NOTE(review): Perl single-quoted strings ('string2') do not
            # interpolate variables; confirm that giving them the
            # interpolation grammar is intended.
            self.insert(0, lex.SubRule(name='heredoc',
                                       expr=sub_exprs['heredoc'],
                                       grammar=string_grammar))
            self.insert(4, lex.SubRule(name="string1",
                                       expr=sub_exprs['string1'],
                                       grammar=string_grammar))
            self.insert(5, lex.SubRule(name="string2",
                                       expr=sub_exprs['string2'],
                                       grammar=string_grammar))