#!/usr/bin/env python
# pmacs3/lex_perl.py
# python 2.3 stdlib imports
from optparse import OptionParser
# our imports
import lex

# when True, perl's string interpolation is also highlighted inside strings;
# this can be slower and possibly buggier
INTERPOLATION_HIGHLIGHTING = False
#INTERPOLATION_HIGHLIGHTING = True

class PerlGrammar(lex.Grammar):
    GRAMMAR_LIST = [
        {'name': 'heredoc',
         'expr': r"""<< *([a-zA-Z0-9_]+) *;(?:.*?\n)*?(?:\1|$)""",
         'action': lex.make_token},
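        # e.g. for a heredoc such as:
        #     print <<EOF;
        #     some text
        #     EOF
        # the match starts at `<<EOF;' and the backreference \1 requires the
        # terminator to repeat the opening tag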
        {'name': 'endblock',
         'expr': r"""(?:^|\n)(?:__END__|__DATA__)(?:.|\n)*$""",
         'action': lex.make_token},
        {'name': 'pod',
         'expr': r"""(?:^|(?<=\n))=[a-zA-Z0-9_]+.*(?:\n(?!=cut).*)*(?:\n=cut|$)""",
         'action': lex.make_token},
        {'name': "comment",
         'expr': r'[#].*(?:\n|$)',
         'action': lex.make_token},
        {'name': "string1",
         'expr': r'''"(?:\\(?:.|\n)|[^\\"]|[ \n])*(?:"|.?$)''',
         'action': lex.make_token},
        {'name': "string2",
         'expr': r"""'(?:\\(?:.|\n)|[^\\'])*(?:'|.?$)""",
         'action': lex.make_token},
        {'name': "evalstring",
         'expr': r"""`(?:\\(?:.|\n)|[^\\`])*(?:`|.?$)""",
         'action': lex.make_token},
        {'name': 'number',
         'expr': r"""0?\.[0-9]+|[0-9]+(?:\.[0-9]+)?""",
         'action': lex.make_token},
        {'name': 'label',
         'expr': r"""[a-zA-Z_][a-zA-Z0-9_]*:(?= |\n)""",
         'action': lex.make_token},
        {'name': 'keyword',
         'expr': r"""(?<!->)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z_])""",
         'action': lex.make_token},
        {'name': 'hash bareword index',
         'expr': r"(?<={)[A-Za-z0-9_]+(?=})",
         'action': lex.make_token},
        {'name': 'literal hash bareword index',
         'expr': r"[A-Za-z0-9_]+(?= *=>)",
         'action': lex.make_token},
        {'name': 'length scalar',
         'expr': r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*",
         'action': lex.make_token},
        {'name': 'system scalar',
         'expr': r"\$[][><ab/'\"_@\?#\$!%^|&*()](?![A-Za-z0-9_])",
         'action': lex.make_token},
        {'name': 'system array',
         'expr': r"@_",
         'action': lex.make_token},
        {'name': 'scalar',
         'expr': r"""\$\$*[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*""",
         'action': lex.make_token},
        {'name': 'array',
         'expr': r"""@\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*""",
         'action': lex.make_token},
        {'name': 'hash',
         'expr': r"""%\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*""",
         'action': lex.make_token},
        {'name': 'dereference',
         'expr': r"""[@%\$&\*](?={)""",
         'action': lex.make_token},
        # this isn't totally right, but it handles q//, q{} and q() (plus the
        # qq/qw/qr/qx variants), which are the most common
        {'name': 'quoted region',
         'expr': r"""q[qwrx]?\((?:\\.|[^\\\)])*\)|q[qwrx]?/(?:\\.|[^\\/])*/|q[qwrx]?\{(?:\\.|[^\\\}])*\}""",
         'action': lex.make_token},
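        # e.g. this should match q(foo), qw(one two) and qq{with $vars};
        # other delimiters are not handled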
        # match regexes are annoying: the basic gist is easy, but all the perl
        # crap sucks. if the m is not present, you have to use / as the
        # delimiter. otherwise, you can use any non-alphanumeric-or-whitespace
        # character. if you use <, (, [, or {, you close with the opposite
        # kind of thing. we have to special-case those last 4. ugh.
        #
        # basic gist: /(\\.|[^\\])*?/[a-z]*
        {'name': 'match regex',
         'expr': r"""(?:(?<==~)|(?<=!~)|(?<=\()) */(?:\\.|[^\\/])*/[a-z]*|m([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1[a-z]*|m\((?:\\.|[^\\])*?\)[a-z]*|m{(?:\\.|[^\\])*?}[a-z]*|m<(?:\\.|[^\\])*?>[a-z]*|m\[(?:\\.|[^\\])*?\][a-z]*""",
         'action': lex.make_token},
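        # e.g. this should match /foo/i (directly after =~, !~ or an open
        # paren), m/foo/, m#foo#, m{foo}g, m<foo> and m[foo]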
        # we officially don't support the bullshit s{a}{b} thing perl has
        # going. those guys are on crack. we only support things like s#a#b#
        # or s/a/b/. same comments as above apply
        {'name': 'replace regex',
         'expr': r"""(?:y|tr|s)([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1(?:\\.|[^\\])*?\1[a-z]*""",
         'action': lex.make_token},
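        # e.g. this should match s/foo/bar/gi, s#foo#bar#, tr/a-z/A-Z/
        # and y|a|b|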
        {'name': 'package',
         'expr': r"""(?<=package )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},
        {'name': 'use',
         'expr': r"""(?<=use )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},
        {'name': 'instance method',
         'expr': r"""(?<=->)[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},
        {'name': 'static method',
         'expr': r"""&?(?:[a-zA-Z_][a-zA-Z_0-9]*::)+[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},
        {'name': 'method declaration',
         'expr': r"""(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*{)""",
         'action': lex.make_token},
        {'name': 'built-in method',
         'expr': r"""(?<!->)&?(?:write|warn|wantarray|waitpid|wait|vec|values|utime|use|untie|unshift|unpack|unlink|undef|umask|ucfirst|uc|truncate|times|time|tied|tie|telldir|tell|syswrite|system|sysseek|sysread|sysopen|syscall|symlink|substr|sub|study|stat|srand|sqrt|sprintf|split|splice|sort|socketpair|socket|sleep|sin|shutdown|shmwrite|shmread|shmget|shmctl|shift|setsockopt|setservent|setpwent|setprotoent|setpriority|setpgrp|setnetent|sethostent|setgrent|send|semop|semget|semctl|select|seekdir|seek|scalar|rmdir|rindex|rewinddir|reverse|return|reset|require|rename|ref|redo|recv|readpipe|readlink|readline|readdir|read|rand|quotemeta|push|prototype|printf|print|pos|pop|pipe|package|pack|our|ord|opendir|open|oct|no|next|my|msgsnd|msgrcv|msgget|msgctl|mkdir|map|lstat|log|lock|localtime|local|listen|link|length|lcfirst|lc|last|kill|keys|join|ioctl|int|index|import|hex|grep|goto|gmtime|glob|getsockopt|getsockname|getservent|getservbyport|getservbyname|getpwuid|getpwnam|getpwent|getprotoent|getprotobynumber|getprotobyname|getpriority|getppid|getpgrp|getpeername|getnetent|getnetbyname|getnetbyaddr|getlogin|gethostent|gethostbyname|gethostbyaddr|getgrnam|getgrgid|getgrent|getc|formline|format|fork|flock|fileno|fcntl|exp|exit|exists|exec|eval|eof|endservent|endpwent|endprotoent|endnetent|endhostent|endgrent|each|dump|do|die|delete|defined|dbmopen|dbmclose|crypt|cos|continue|connect|closedir|close|chroot|chr|chown|chop|chomp|chmod|chdir|caller|bless|binmode|bind|atan2|alarm|accept|abs)(?![a-zA-Z0-9_])""",
         #'expr':r"""(?<!->)&?(?:abs|accept|alarm|atan2|bind|binmode|bless|caller|chdir|chmod|chomp|chop|chown|chroot|chr|closedir|close|connect|cos|crypt|dbmclose|dbmopen|defined|delete|die|dump|each|eof|exec|exists|exit|exp|fcntl|fileno|flock|fork|format|formline|getc|getlogin|getpeername|grep|int|join|keys|lc|map|open|pop|print|push|rand|readdir|ref|scalar|select|shift|sort|split|srand|time|uc|unshift|values|wantarray|warn)(?![a-zA-Z0-9_])""",
         'action': lex.make_token},
        {'name': 'method',
         'expr': r"""&(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},
        {'name': 'methodref',
         'expr': r"""&\$(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},
        {'name': 'bareword method',
         'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*(?:\(|->))""",
         'action': lex.make_token},
{'name': "delimiter",
'expr': r"""\(|\)|\[|\]|{|}|,|;|=>|=|\?|(?<!:):(?!=:)""",
'action': lex.make_token},
{'name': "unary operator",
'expr': r"""\+=|-=|\*=|/=|//=|%=|&=\|\^=|>>=|<<=|\*\*=""",
'action': lex.make_token},
{'name': "operator",
'expr': r"""\+|<=>|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|=~|!~|!=|%|!|\.""",
'action': lex.make_token},
        {'name': 'bareword',
         'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},
        {'name': 'default',
         'expr': r""".|\n""",
         'action': lex.silent},
    ]

    def _default_rules(self):
        """subclasses can override this to define defaults for a grammar"""
        sub_exprs = {}
        string_rules = []
        for rdir in PerlGrammar.GRAMMAR_LIST:
            self.add_rule(**rdir)
            if INTERPOLATION_HIGHLIGHTING:
                if rdir['name'] in ('scalar', 'system scalar', 'array',
                                    'hash', 'system array'):
                    rdir2 = rdir.copy()
                    rdir2['name'] = 'interpolated ' + rdir['name']
                    string_rules.append(lex.Rule(**rdir2))
                elif rdir['name'] in ('heredoc', 'string1', 'string2'):
                    sub_exprs[rdir['name']] = rdir['expr']
        if INTERPOLATION_HIGHLIGHTING:
            string_rules.append(lex.Rule(name="default string",
                                         expr=r"""(?:\\.|[^\\\$]|\n)+|\$""",
                                         action=lex.make_token))
            string_grammar = lex.Grammar(rules=string_rules)
            # indices 0, 4 and 5 are the positions of the heredoc, string1
            # and string2 rules in GRAMMAR_LIST above
            self.insert(0, lex.SubRule(name='heredoc',
                                       expr=sub_exprs['heredoc'],
                                       grammar=string_grammar))
            self.insert(4, lex.SubRule(name="string1",
                                       expr=sub_exprs['string1'],
                                       grammar=string_grammar))
            self.insert(5, lex.SubRule(name="string2",
                                       expr=sub_exprs['string2'],
                                       grammar=string_grammar))
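
# a minimal self-test sketch, assuming the expressions above behave as
# ordinary python regexes; the real tokenizer lives in the lex module and is
# not exercised here. run this file directly to spot-check a few patterns.
if __name__ == '__main__':
    import re
    exprs = dict([(rule['name'], rule['expr'])
                  for rule in PerlGrammar.GRAMMAR_LIST])
    samples = [('heredoc', "print <<EOF;\nhello\nEOF"),
               ('match regex', "m{foo.*bar}i"),
               ('replace regex', "s/foo/bar/g"),
               ('scalar', "$foo::bar")]
    for name, text in samples:
        # search rather than match: a rule can fire mid-line
        m = re.search(exprs[name], text)
        print('%s: %r' % (name, m and m.group(0)))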