#!/usr/bin/env python

# python 2.3 imports
from optparse import OptionParser

# our imports
import lex

# this enables highlighting of perl's string interpolation, but it can be
# slower and possibly buggier
INTERPOLATION_HIGHLIGHTING = False
#INTERPOLATION_HIGHLIGHTING = True
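# for example, with interpolation highlighting enabled, the variables inside
# a double-quoted string such as
#     "hello $name, you have @items"
# are lexed as 'interpolated scalar' / 'interpolated array' tokens instead of
# plain string text (see _default_rules below)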

class PerlGrammar(lex.Grammar):
    GRAMMAR_LIST = [
        {'name': 'heredoc',
         'expr': r"""<< *([a-zA-Z0-9_]+) *;(?:.*?\n)*?(?:\1|$)""",
         'action': lex.make_token},

        {'name': 'endblock',
         'expr': r"""(?:^|\n)(?:__END__|__DATA__)(?:.|\n)*$""",
         'action': lex.make_token},

        {'name': 'pod',
         'expr': r"""(?:^|(?<=\n))=[a-zA-Z0-9_]+.*(?:\n(?!=cut).*)*(?:\n=cut|$)""",
         'action': lex.make_token},

        {'name': "comment",
         'expr': r'[#].*(?:\n|$)',
         'action': lex.make_token},

        {'name': "string1",
         'expr': r'''"(?:\\(?:.|\n)|[^\\"]|[ \n])*(?:"|.?$)''',
         'action': lex.make_token},

        {'name': "string2",
         'expr': r"""'(?:\\(?:.|\n)|[^\\'])*(?:'|.?$)""",
         'action': lex.make_token},

        {'name': "evalstring",
         'expr': r"""`(?:\\(?:.|\n)|[^\\`])*(?:`|.?$)""",
         'action': lex.make_token},
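
        # note: the three string rules above end with alternates like
        # (?:"|.?$), apparently so that an unterminated string is still
        # tokenized through end-of-input instead of breaking the lex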

        {'name': 'number',
         'expr': r"""0?\.[0-9]+|[0-9]+(?:\.[0-9]+)?""",
         'action': lex.make_token},

        {'name': 'label',
         'expr': r"""[a-zA-Z_][a-zA-Z0-9_]*:(?= |\n)""",
         'action': lex.make_token},

        {'name': 'keyword',
         'expr': r"""(?<!->)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z_])""",
         'action': lex.make_token},

        {'name': 'hash bareword index',
         'expr': r"(?<={)[A-Za-z0-9_]+(?=})",
         'action': lex.make_token},

        {'name': 'literal hash bareword index',
         'expr': r"[A-Za-z0-9_]+(?= *=>)",
         'action': lex.make_token},

        {'name': 'length scalar',
         'expr': r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*",
         'action': lex.make_token},

        {'name': 'system scalar',
         'expr': r"\$[][><ab/'\"_@\?#\$!%^|&*()](?![A-Za-z0-9_])",
         'action': lex.make_token},

        {'name': 'system array',
         'expr': r"@_",
         'action': lex.make_token},

        {'name': 'scalar',
         'expr': r"""\$\$*[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*""",
         'action': lex.make_token},

        {'name': 'array',
         'expr': r"""@\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*""",
         'action': lex.make_token},

        {'name': 'hash',
         'expr': r"""%\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*""",
         'action': lex.make_token},

        {'name': 'dereference',
         'expr': r"""[@%\$&\*](?={)""",
         'action': lex.make_token},

        # this isn't totally right, but it handles q//, q{} and q(), which
        # are the most common forms
        {'name': 'quoted region',
         'expr': r"""q.\((?:\\.|[^\\\)])*\)|q./(?:\\.|[^\\/])*/|q.\{(?:\\.|[^\\\}])*\}""",
         'action': lex.make_token},
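
        # for example, the rule above matches qq(hello), qw(a b c) and
        # q (spaced); the single "." after the q consumes one character, so
        # a bare q(...) with no intervening character does not match -- hence
        # "isn't totally right"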

        # match regexes are annoying: the basic pattern is easy, but perl's
        # extras complicate it. if the m is not present, the delimiter must
        # be / (and a bare /.../ is only recognized after =~, !~ or an open
        # paren, so division doesn't false-positive). otherwise any
        # non-alphanumeric, non-whitespace character works, except that <,
        # (, [ and { close with the matching bracket, so those four need
        # special cases.
        #
        # basic pattern: /(\\.|[^\\])*?/[a-z]*
        {'name': 'match regex',
         'expr': r"""(?:(?<==~)|(?<=!~)|(?<=\()) */(?:\\.|[^\\/])*/[a-z]*|m([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1[a-z]*|m\((?:\\.|[^\\])*?\)[a-z]*|m{(?:\\.|[^\\])*?}[a-z]*|m<(?:\\.|[^\\])*?>[a-z]*|m\[(?:\\.|[^\\])*?\][a-z]*""",
         'action': lex.make_token},
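
        # examples the rule above matches: $x =~ /foo/i, m#foo#g, m{foo}s,
        # m<foo> and m[foo]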

        # we deliberately don't support perl's two-bracket s{a}{b} form; we
        # only support forms like s#a#b# or s/a/b/. the same delimiter
        # comments as above apply.
        {'name': 'replace regex',
         'expr': r"""(?:y|tr|s)([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1(?:\\.|[^\\])*?\1[a-z]*""",
         'action': lex.make_token},
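
        # examples the rule above matches: s/foo/bar/g, tr/a-z/A-Z/ and y,a,b,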

        {'name': 'package',
         'expr': r"""(?<=package )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'use',
         'expr': r"""(?<=use )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'instance method',
         'expr': r"""(?<=->)[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'static method',
         'expr': r"""&?(?:[a-zA-Z_][a-zA-Z_0-9]*::)+[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'method declaration',
         'expr': r"""(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*{)""",
         'action': lex.make_token},

        {'name': 'built-in method',
         'expr': r"""(?<!->)&?(?:write|warn|wantarray|waitpid|wait|vec|values|utime|use|untie|unshift|unpack|unlink|undef|umask|ucfirst|uc|truncate|times|time|tied|tie|telldir|tell|syswrite|system|sysseek|sysread|sysopen|syscall|symlink|substr|sub|study|stat|srand|sqrt|sprintf|split|splice|sort|socketpair|socket|sleep|sin|shutdown|shmwrite|shmread|shmget|shmctl|shift|setsockopt|setservent|setpwent|setprotoent|setpriority|setpgrp|setnetent|sethostent|setgrent|send|semop|semget|semctl|select|seekdir|seek|scalar|rmdir|rindex|rewinddir|reverse|return|reset|require|rename|ref|redo|recv|readpipe|readlink|readline|readdir|read|rand|quotemeta|push|prototype|printf|print|pos|pop|pipe|package|pack|our|ord|opendir|open|oct|no|next|my|msgsnd|msgrcv|msgget|msgctl|mkdir|map|lstat|log|lock|localtime|local|listen|link|length|lcfirst|lc|last|kill|keys|join|ioctl|int|index|import|hex|grep|goto|gmtime|glob|getsockopt|getsockname|getservent|getservbyport|getservbyname|getpwuid|getpwnam|getpwent|getprotoent|getprotobynumber|getprotobyname|getpriority|getppid|getpgrp|getpeername|getnetent|getnetbyname|getnetbyaddr|getlogin|gethostent|gethostbyname|gethostbyaddr|getgrnam|getgrgid|getgrent|getc|formline|format|fork|flock|fileno|fcntl|exp|exit|exists|exec|eval|eof|endservent|endpwent|endprotoent|endnetent|endhostent|endgrent|each|dump|do|die|delete|defined|dbmopen|dbmclose|crypt|cos|continue|connect|closedir|close|chroot|chr|chown|chop|chomp|chmod|chdir|caller|bless|binmode|bind|atan2|alarm|accept|abs)(?![a-zA-Z0-9_])""",
        #'expr':r"""(?<!->)&?(?:abs|accept|alarm|atan2|bind|binmode|bless|caller|chdir|chmod|chomp|chop|chown|chroot|chr|closedir|close|connect|cos|crypt|dbmclose|dbmopen|defined|delete|die|dump|each|eof|exec|exists|exit|exp|fcntl|fileno|flock|fork|format|formline|getc|getlogin|getpeername|grep|int|join|keys|lc|map|open|pop|print|push|rand|readdir|ref|scalar|select|shift|sort|split|srand|time|uc|unshift|values|wantarray|warn)(?![a-zA-Z0-9_])""",
         'action': lex.make_token},

        {'name': 'method',
         'expr': r"""&(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'methodref',
         'expr': r"""&\$(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'bareword method',
         'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*(?:\(|->))""",
         'action': lex.make_token},

        {'name': "delimiter",
         'expr': r"""\(|\)|\[|\]|{|}|,|;|=>|=|\?|(?<!:):(?!:)""",
         'action': lex.make_token},

        # note: despite the name, these are the compound-assignment operators
        {'name': "unary operator",
         'expr': r"""\+=|-=|\*=|//=|/=|%=|&=|\^=|>>=|<<=|\*\*=""",
         'action': lex.make_token},

        {'name': "operator",
         'expr': r"""\+|<=>|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||//|/|\^|==|~|=~|!~|!=|%|!|\.""",
         'action': lex.make_token},

        {'name': 'bareword',
         'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""",
         'action': lex.make_token},

        {'name': 'default',
         'expr': r""".|\n""",
         'action': lex.silent}
    ]
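
    # the silent catch-all 'default' rule above must stay last: the rules
    # appear to be tried in list order (cf. the insert() calls below), so
    # earlier entries take precedence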

    def _default_rules(self):
        """subclasses can override this to define defaults for a grammar"""
        sub_exprs = {}
        string_rules = []

        for rdir in PerlGrammar.GRAMMAR_LIST:
            self.add_rule(**rdir)

            if INTERPOLATION_HIGHLIGHTING:
                # collect copies of the variable rules under 'interpolated'
                # names; together they form the sub-grammar used to re-lex
                # string contents
                if rdir['name'] in ('scalar', 'system scalar', 'array',
                                    'hash', 'system array'):
                    rdir2 = rdir.copy()
                    rdir2['name'] = 'interpolated ' + rdir['name']
                    string_rules.append(lex.Rule(**rdir2))
                # remember the string/heredoc expressions so the SubRules
                # below can reuse them
                elif rdir['name'] in ('heredoc', 'string1', 'string2'):
                    sub_exprs[rdir['name']] = rdir['expr']

        if INTERPOLATION_HIGHLIGHTING:
            # anything in a string that isn't an interpolated variable is
            # plain string text
            string_rules.append(lex.Rule(name="default string",
                                         expr=r"""(?:\\.|[^\\\$]|\n)+|\$""",
                                         action=lex.make_token))
            string_grammar = lex.Grammar(rules=string_rules)

            # insert SubRule versions ahead of the plain heredoc/string
            # rules so they take precedence and re-lex their contents with
            # string_grammar
            self.insert(0, lex.SubRule(name='heredoc',
                                       expr=sub_exprs['heredoc'],
                                       grammar=string_grammar))

            self.insert(4, lex.SubRule(name="string1",
                                       expr=sub_exprs['string1'],
                                       grammar=string_grammar))

            self.insert(5, lex.SubRule(name="string2",
                                       expr=sub_exprs['string2'],
                                       grammar=string_grammar))
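
# a minimal smoke-test sketch, assuming nothing beyond the stdlib re module:
# it exercises the 'scalar' rule's expression directly, without going through
# lex.Grammar (whose driver API is not shown here)
if __name__ == '__main__':
    import re
    scalar_re = re.compile(r"""\$\$*[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*""")
    sample = "my $count = 0; print $Foo::Bar, $$ref;\n"
    # expected output: ['$count', '$Foo::Bar', '$$ref']
    print scalar_re.findall(sample)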