pmacs3/lex_c.py

#!/usr/bin/env python
# python 2.3 stdlib imports
from optparse import OptionParser
# our imports
import lex

class CGrammar(lex.Grammar):
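    """A lex.Grammar subclass describing C tokens.

    Each entry in GRAMMAR_LIST is a rule: 'name' identifies the token
    type (used by the editor for highlighting), 'expr' is a Python
    regular expression, and 'action' says what to do with a match.
    Rules seem to be tried in order, so earlier entries take priority;
    the insert(0, ...) in _default_rules below relies on this.
    """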
    GRAMMAR_LIST = [
        # this might not be complete...
        # see http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_3.html#SEC44
        # the macro1 rule absorbs the rest of the line; otherwise the
        # leftover text gets mis-lexed as ordinary code
        {'name': 'macro2',
         'expr': r"#(?:define|import|include|undef)(?= )",
         'action': lex.make_token},
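        # macro1 also follows backslash-newline continuations:
        # (?:[^\n]*\\\n)* consumes each continued line, and the final
        # [^\n]*? stops just short of the terminating newline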
        {'name': 'macro1',
         'expr': r"#(?:assert|cpu|elif|else|endif|error|ident|ifdef|ifndef|if|include_next|line|machine|pragma|pragma_once|system|unassert|warning)(?:[^\n]*\\\n)*[^\n]*?(?=\n)",
         'action': lex.make_token},
        {'name': 'header',
         # allow subdirectory headers like <sys/types.h>
         'expr': r'''(?<=#include) +(?:<[A-Za-z0-9_./-]+>|"[A-Za-z0-9_./-]+")''',
         'action': lex.make_token},
        {'name': 'constant',
         'expr': r'''(?<=#define) +[A-Za-z0-9_]+(?= |\(|\n|$)''',
         'action': lex.make_token},
        {'name': 'label',
         # labels may contain digits after the first character, e.g. err2:
         'expr': r"""[a-zA-Z_][a-zA-Z0-9_]*(?=:)""",
         'action': lex.make_token},
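        # the two comment rules must come before the operator rules so
        # // and /* are not lexed as division and multiplication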
{'name': "c++ comment",
'expr': r'//.*(?:\n|$)',
'action': lex.make_token},
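        # (?:.|\n)*? is a non-greedy match across newlines; the (?:\*/|$)
        # alternative tolerates a comment left unterminated at end of file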
{'name': "c comment",
'expr': r"/\*(?:.|\n)*?(?:\*/|$)",
'action' : lex.make_token},
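        # 'control' repeats some of the keywords below, presumably so
        # flow-control words can be highlighted differently; it must come
        # first to win, and the trailing lookahead keeps identifiers
        # like "format" or "if2" from matching "for" or "if"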
        {'name': 'control',
         'expr': r"(?:break|case|continue|default|do|else|for|goto|if|return|switch|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},
        {'name': 'keyword',
         'expr': r"(?:auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto|if|int|long|register|return|short|signed|sizeof|static|struct|switch|typedef|union|unsigned|void|volatile|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},
        {'name': 'builtin',
         'expr': r"(?:NULL|TRUE|FALSE)(?![a-zA-Z0-9_])",
         'action': lex.make_token},
{'name': "identifier",
'expr': r"[a-zA-Z_][a-zA-Z0-9_]*",
'action': lex.make_token},
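        # the compound-assignment rule precedes the plain operator rule,
        # and longer alternatives are listed before their prefixes
        # (e.g. << before <), so operators like <<= are never split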
{'name': "unary operator",
'expr': r"""\+=|-=|\*=|/=|//=|%=|&=\|\^=|>>=|<<=|\*\*=""",
'action': lex.make_token},
{'name': "operator",
'expr': r"""\+|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|!=|%""",
'action': lex.make_token},
        # this is a rough catch-all for the remaining C punctuation
        {'name': "delimiter",
         'expr': r"""->|\.|\(|\)|\[|\]|{|}|,|:|;|=|\?""",
         'action': lex.make_token},
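        # the float rule must come before the integer rule; earlier rules
        # seem to win, and otherwise "1.5" would lex as integer,
        # delimiter, integer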
{'name': "integer",
'expr': r"(?:0(?![x0-9])|[1-9][0-9]*|0[0-7]+|0[xX][0-9a-fA-F]+)[lL]?",
'action': lex.make_token},
{'name': "float",
'expr': r"""[0-9]+\.[0-9]*|\.[0-9]+|(?:[0-9]|[0-9]+\.[0-9]*|\.[0-9]+)[eE][\+-]?[0-9]+""",
'action': lex.make_token},
{'name': "string1",
'expr': r'"(?:\\.|[^"])*(?:"|.?$)',
'action': lex.make_token},
# Doesn't handle octal . . (yeah it does..heh...ughhh)
{'name': "char",
'expr': r"'(?:\\[^']+|[^'])(?:'|.?$)",
'action': lex.make_token},
{'name': "default",
'expr': r'\\.|.|\n',
'action': lex.silent}
]

    def _default_rules(self):
        """subclasses can override this to define defaults for a grammar"""
        lex.Grammar._default_rules(self)
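        # BalancedExprRule appears to track nested #if/#endif pairs, so
        # an entire "#if 0 ... #endif" block (a common way to comment
        # out code) is treated as one comment-like token; inserting at
        # index 0 gives it priority over every rule above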
        self.insert(0, lex.BalancedExprRule(name='macro comment',
                                            start_expr=r"#if +0",
                                            enter="#if",
                                            leave="#endif",
                                            action=lex.make_token))

if __name__ == "__main__":
    usage = "%prog <file> [<file> ...]\n\n" \
            "Lex one or more files according to the C grammar"
    parser = OptionParser(usage=usage)
    (options, args) = parser.parse_args()

    g = CGrammar()
    l = lex.Lexer(grammar=g)

    for path in args:
        f = open(path, 'r')
        data = f.read()
        f.close()
        print "Lexing %s:" % (path)
        l.lex(data)
        for t in l:
            if t is not None:
                print t
                #print "%-12s %-40s %d %d" % (t.rule.name, t.string, t.start, t.end)