#!/usr/bin/env python

# 2.3 imports
from optparse import OptionParser

# our imports
import lex

class CGrammar(lex.Grammar):
    GRAMMAR_LIST = [
        # this might not be complete...
        # see http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_3.html#SEC44

        # these directives are followed by something the rules below want to
        # match on their own (a header name, a constant name), so stop at the
        # directive itself
        {'name': 'macro2',
         'expr': r"#(?:define|import|include|undef)(?= )",
         'action': lex.make_token},

        # for every other directive we need to absorb the rest of the line
        # (including backslash continuations), otherwise the macro body gets
        # mis-lexed as ordinary code
        {'name': 'macro1',
         'expr': r"#(?:assert|cpu|elif|else|endif|error|ident|ifdef|ifndef|if|include_next|line|machine|pragma|pragma_once|system|unassert|warning)(?:[^\n]*\\\n)*[^\n]*?(?=\n)",
         'action': lex.make_token},

        {'name': 'header',
         'expr': r'''(?<=#include) +(?:<[A-Za-z0-9_]+\.h?>|"[A-Za-z0-9_]+\.h")''',
         'action': lex.make_token},

        {'name': 'constant',
         'expr': r'''(?<=#define) +[A-Za-z0-9_]+(?= |\(|\n|$)''',
         'action': lex.make_token},

        {'name': 'label',
         'expr': r"[a-zA-Z_]+(?=:)",
         'action': lex.make_token},

        {'name': 'c++ comment',
         'expr': r'//.*(?:\n|$)',
         'action': lex.make_token},

        {'name': 'c comment',
         'expr': r"/\*(?:.|\n)*?(?:\*/|$)",
         'action': lex.make_token},

        {'name': 'control',
         'expr': r"(?:break|case|continue|default|do|else|for|goto|if|return|switch|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},

        {'name': 'keyword',
         'expr': r"(?:auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto|if|int|long|register|return|short|signed|sizeof|static|struct|switch|typedef|union|unsigned|void|volatile|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},

        {'name': 'builtin',
         'expr': r"(?:NULL|TRUE|FALSE)(?![a-zA-Z0-9_])",
         'action': lex.make_token},

        {'name': 'identifier',
         'expr': r"[a-zA-Z_][a-zA-Z0-9_]*",
         'action': lex.make_token},

        # compound assignment operators; //= and **= look inherited from a
        # python grammar and have no meaning in C
        {'name': 'unary operator',
         'expr': r"\+=|-=|\*=|/=|//=|%=|&=|\|=|\^=|>>=|<<=|\*\*=",
         'action': lex.make_token},

        # - must not swallow the - of ->, which the delimiter rule handles
        {'name': 'operator',
         'expr': r"\+|<>|<<|<=|<|-(?!>)|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|!=|!|%",
         'action': lex.make_token},

        # this is sketchy as hell
        {'name': 'delimiter',
         'expr': r"->|\.|\(|\)|\[|\]|{|}|@|,|:|`|;|=|\?",
         'action': lex.make_token},

        # floats must be tried before integers, or 1.5 lexes as 1 . 5;
        # likewise the exponent forms come first so 1.5e3 isn't cut at 1.5
        {'name': 'float',
         'expr': r"(?:[0-9]+\.[0-9]*|\.[0-9]+|[0-9]+)[eE][\+-]?[0-9]+|[0-9]+\.[0-9]*|\.[0-9]+",
         'action': lex.make_token},

        {'name': 'integer',
         'expr': r"(?:0(?![xX0-9])|[1-9][0-9]*|0[0-7]+|0[xX][0-9a-fA-F]+)[lL]?",
         'action': lex.make_token},

        # an unterminated string runs to the end of its line
        {'name': 'string1',
         'expr': r'"(?:\\.|[^"\\\n])*(?:"|.?$)',
         'action': lex.make_token},

        # octal and other escape sequences are covered by the \\[^']+ branch
        {'name': 'char',
         'expr': r"'(?:\\[^']+|[^'])(?:'|.?$)",
         'action': lex.make_token},

        {'name': 'default',
         'expr': r'\\.|.|\n',
         'action': lex.silent},
    ]

    def _default_rules(self):
        """subclasses can override this to define defaults for a grammar"""
        lex.Grammar._default_rules(self)
        self.insert(0, lex.BalancedExprRule(name='macro comment',
                                            start_expr=r"#if +0",
                                            enter="#if",
                                            leave="#endif",
                                            action=lex.make_token))

if __name__ == "__main__":
    usage = "%prog [file ...]\n\n" \
            "Lex one or more files according to the C grammar"
    parser = OptionParser(usage=usage)
    (options, args) = parser.parse_args()
    g = CGrammar()
    l = lex.Lexer(grammar=g)
    for path in args:
        f = open(path, 'r')
        data = f.read()
        f.close()
        print "Lexing %s:" % path
        l.lex(data)
        for t in l:
            if t is not None:
                print t
                #print "%-12s %-40s %d %d" % (t.rule.name, t.string, t.start, t.end)
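
# A minimal sketch of driving the lexer from other code rather than via the
# command line. It assumes the lex.Lexer API used above and the Token
# attributes referenced in the commented-out print (rule, string); treat any
# name not defined in this file as an assumption.
#
#   g = CGrammar()
#   l = lex.Lexer(grammar=g)
#   l.lex('int x = 0x1f; /* hex */\n')
#   for t in l:
#       if t is not None:
#           print "%-12s %r" % (t.rule.name, t.string)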