123 lines
4.1 KiB
Python
123 lines
4.1 KiB
Python
|
#!/bin/env python
|
||
|
|
||
|
# 2.3 imports
|
||
|
from optparse import OptionParser
|
||
|
|
||
|
# our imports
|
||
|
import lex
|
||
|
|
||
|
class CGrammar(lex.Grammar):
    """Lexer grammar for C source code.

    ``GRAMMAR_LIST`` holds one dict per rule: a ``name``, a regular
    expression (``expr``) and the ``action`` to run when it matches.

    NOTE(review): the ordering below assumes the lexer tries rules in list
    order and takes the first match (``_default_rules`` uses ``insert(0, ...)``
    to give a rule priority) -- confirm against ``lex.Lexer``.
    """

    GRAMMAR_LIST = [
        # Preprocessor directives.  This list might not be complete; see
        # http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_3.html#SEC44

        # Directives whose argument gets its own token ('header' and
        # 'constant' below), so only the directive word is consumed here.
        {'name': 'macro2',
         'expr': r"#(?:define|import|include|undef)(?= )",
         'action': lex.make_token},

        # Every other directive absorbs the rest of its (possibly
        # backslash-continued) line so the argument text is not lexed as
        # ordinary C.  The trailing lookahead also accepts end-of-input so
        # a directive on the last line without a newline still matches.
        {'name': 'macro1',
         'expr': r"#(?:assert|cpu|elif|else|error|endif|error|ident|ifdef|ifndef"
                 r"|if|include_next|line|machine|pragma|pragma_once|system"
                 r"|unassert|warning)(?:[^\n]*\\\n)*[^\n]*?(?=\n|$)",
         'action': lex.make_token},

        # <name>, <name.h> or "name.h" following #include.  The ".h" suffix
        # is optional as a unit inside angle brackets.
        {'name': 'header',
         'expr': r'''(?<=#include) +(?:<[A-Za-z0-9_]+(?:\.h)?>|"[A-Za-z0-9_]+\.h")''',
         'action': lex.make_token},

        # The macro name following #define.
        {'name': 'constant',
         'expr': r'''(?<=#define) +[A-Za-z0-9_]+(?= |\(|\n|$)''',
         'action': lex.make_token},

        # An identifier immediately followed by a colon; digits are allowed
        # after the first character, as in any C identifier.
        {'name': 'label',
         'expr': r"""[a-zA-Z_][a-zA-Z0-9_]*(?=:)""",
         'action': lex.make_token},

        {'name': "c++ comment",
         'expr': r'//.*(?:\n|$)',
         'action': lex.make_token},

        # An unterminated comment runs to the end of the input.
        {'name': "c comment",
         'expr': r"/\*(?:.|\n)*?(?:\*/|$)",
         'action': lex.make_token},

        # The negative lookahead stops reserved words from matching as a
        # prefix of a longer identifier (e.g. "int32_t", "iffy", "do2").
        {'name': 'control',
         'expr': r"(?:break|case|continue|default|do|else|for|goto|if|return|switch|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},

        {'name': 'keyword',
         'expr': r"(?:auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto|if|int|long|register|return|short|signed|sizeof|static|struct|switch|typedef|union|unsigned|void|volatile|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},

        {'name': 'builtin',
         'expr': r"(?:NULL|TRUE|FALSE)(?![a-zA-Z0-9_])",
         'action': lex.make_token},

        {'name': "identifier",
         'expr': r"[a-zA-Z_][a-zA-Z0-9_]*",
         'action': lex.make_token},

        # Compound assignment operators (the rule name is historical).
        {'name': "unary operator",
         'expr': r"""\+=|-=|\*=|/=|//=|%=|&=|\|=|\^=|>>=|<<=|\*\*=""",
         'action': lex.make_token},

        {'name': "operator",
         'expr': r"""\+|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|!=|%""",
         'action': lex.make_token},

        # Must come before 'delimiter' (which matches ".") and 'integer'
        # (which matches the digits before the point), otherwise "1.5" and
        # ".5" are split into separate tokens.  The optional exponent is
        # attached to the mantissa so "1.5e3" and "12e3" are single tokens.
        {'name': "float",
         'expr': r"""(?:[0-9]+\.[0-9]*|\.[0-9]+)(?:[eE][\+-]?[0-9]+)?|[0-9]+[eE][\+-]?[0-9]+""",
         'action': lex.make_token},

        # This set is rough: it includes characters that are not C
        # punctuation (such as "@" and "`").
        {'name': "delimiter",
         'expr': r"""->|\.|\(|\)|\[|\]|{|}|@|,|:|`|;|=|\?""",
         'action': lex.make_token},

        # A lone zero, decimal, octal or hexadecimal, with an optional long
        # suffix.  The lookahead after "0" excludes both "x" and "X" so a
        # hexadecimal literal is never cut short after its leading zero.
        {'name': "integer",
         'expr': r"(?:0(?![xX0-9])|[1-9][0-9]*|0[0-7]+|0[xX][0-9a-fA-F]+)[lL]?",
         'action': lex.make_token},

        # An unterminated string runs to the end of the input.
        {'name': "string1",
         'expr': r'"(?:\\.|[^"])*(?:"|.?$)',
         'action': lex.make_token},

        # A backslash always takes the next character with it, so the
        # escaped quote '\'' is one token; any further non-quote characters
        # cover octal and hexadecimal escapes such as '\012' and '\x1f'.
        {'name': "char",
         'expr': r"'(?:\\.[^']*|[^'])(?:'|.?$)",
         'action': lex.make_token},

        # Catch-all: anything no other rule matched is handed to lex.silent.
        {'name': "default",
         'expr': r'\\.|.|\n',
         'action': lex.silent}
        ]

    def _default_rules(self):
        """Install the default rules, then add the ``#if 0`` rule first.

        Calls the base-class implementation and then inserts a
        ``lex.BalancedExprRule`` named 'macro comment' at index 0.

        NOTE(review): presumably the balanced rule tracks nested
        ``#if`` ... ``#endif`` pairs so a whole ``#if 0`` region becomes a
        single token -- confirm against ``lex.BalancedExprRule``.
        """
        lex.Grammar._default_rules(self)
        self.insert(0, lex.BalancedExprRule(name='macro comment',
                                            start_expr=r"#if +0",
                                            enter="#if",
                                            leave="#endif",
                                            action=lex.make_token))
|
||
|
|
||
|
if __name__ == "__main__":
    # Command-line driver: lex every file named on the command line with
    # CGrammar and print each token that the lexer yields.

    # optparse expands "%prog" to the script name.  This string is never
    # %-formatted, so the percent sign must not be doubled.
    usage = "%prog <file> [<file> ...]\n\n" \
            "Lex one or more files according to the C grammar"
    parser = OptionParser(usage=usage)
    (options, args) = parser.parse_args()

    grammar = CGrammar()
    lexer = lex.Lexer(grammar=grammar)

    for path in args:
        # try/finally (rather than "with") keeps this runnable on the old
        # interpreters the file targets while still closing the handle if
        # read() raises.
        f = open(path, 'r')
        try:
            data = f.read()
        finally:
            f.close()

        # Single-argument parenthesised print behaves the same on
        # Python 2 and Python 3.
        print("Lexing %s:" % (path,))
        lexer.lex(data)

        for token in lexer:
            if token is not None:
                print(token)
                # Alternative columnar output for debugging:
                # print("%-12s %-40s %d %d" % (token.rule.name, token.string, token.start, token.end))
|
||
|
|