pmacs3/lex_c.py

#!/usr/bin/env python
# python 2.3 stdlib imports
from optparse import OptionParser
# our imports
import lex

class CGrammar(lex.Grammar):
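    """A lex.Grammar subclass describing C tokens.

    Each entry in GRAMMAR_LIST is a rule: 'name' identifies the token
    type (used by the editor for highlighting), 'expr' is a Python
    regular expression, and 'action' says what to do with a match.
    Rules seem to be tried in order, so earlier entries take priority;
    the insert(0, ...) in _default_rules below relies on this.
    """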
    GRAMMAR_LIST = [
        # this might not be complete...
        # see http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_3.html#SEC44
        # the macro1 rule absorbs the rest of the line; otherwise the
        # leftover text gets mis-lexed as ordinary code
        {'name': 'macro2',
         'expr': r"#(?:define|import|include|undef)(?= )",
         'action': lex.make_token},
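        # macro1 also follows backslash-newline continuations:
        # (?:[^\n]*\\\n)* consumes each continued line, and the final
        # [^\n]*? stops just short of the terminating newline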
        {'name': 'macro1',
         'expr': r"#(?:assert|cpu|elif|else|endif|error|ident|ifdef|ifndef|if|include_next|line|machine|pragma|pragma_once|system|unassert|warning)(?:[^\n]*\\\n)*[^\n]*?(?=\n)",
         'action': lex.make_token},
        {'name': 'header',
         # allow subdirectory headers like <sys/types.h>
         'expr': r'''(?<=#include) +(?:<[A-Za-z0-9_./-]+>|"[A-Za-z0-9_./-]+")''',
         'action': lex.make_token},
        {'name': 'constant',
         'expr': r'''(?<=#define) +[A-Za-z0-9_]+(?= |\(|\n|$)''',
         'action': lex.make_token},
        {'name': 'label',
         # labels may contain digits after the first character, e.g. err2:
         'expr': r"""[a-zA-Z_][a-zA-Z0-9_]*(?=:)""",
         'action': lex.make_token},
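        # the two comment rules must come before the operator rules so
        # // and /* are not lexed as division and multiplication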
{'name': "c++ comment",
'expr': r'//.*(?:\n|$)',
'action': lex.make_token},
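        # (?:.|\n)*? is a non-greedy match across newlines; the (?:\*/|$)
        # alternative tolerates a comment left unterminated at end of file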
{'name': "c comment",
'expr': r"/\*(?:.|\n)*?(?:\*/|$)",
'action' : lex.make_token},
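        # 'control' repeats some of the keywords below, presumably so
        # flow-control words can be highlighted differently; it must come
        # first to win, and the trailing lookahead keeps identifiers
        # like "format" or "if2" from matching "for" or "if"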
        {'name': 'control',
         'expr': r"(?:break|case|continue|default|do|else|for|goto|if|return|switch|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},
        {'name': 'keyword',
         'expr': r"(?:auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto|if|int|long|register|return|short|signed|sizeof|static|struct|switch|typedef|union|unsigned|void|volatile|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},
        {'name': 'builtin',
         'expr': r"(?:NULL|TRUE|FALSE)(?![a-zA-Z0-9_])",
         'action': lex.make_token},
{'name': "identifier",
'expr': r"[a-zA-Z_][a-zA-Z0-9_]*",
'action': lex.make_token},
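        # the compound-assignment rule precedes the plain operator rule,
        # and longer alternatives are listed before their prefixes
        # (e.g. << before <), so operators like <<= are never split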
{'name': "unary operator",
'expr': r"""\+=|-=|\*=|/=|//=|%=|&=\|\^=|>>=|<<=|\*\*=""",
'action': lex.make_token},
{'name': "operator",
'expr': r"""\+|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|!=|%""",
'action': lex.make_token},
        # this is a rough catch-all for the remaining C punctuation
        {'name': "delimiter",
         'expr': r"""->|\.|\(|\)|\[|\]|{|}|,|:|;|=|\?""",
         'action': lex.make_token},
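        # the float rule must come before the integer rule; earlier rules
        # seem to win, and otherwise "1.5" would lex as integer,
        # delimiter, integer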
{'name': "integer",
'expr': r"(?:0(?![x0-9])|[1-9][0-9]*|0[0-7]+|0[xX][0-9a-fA-F]+)[lL]?",
'action': lex.make_token},
{'name': "float",
'expr': r"""[0-9]+\.[0-9]*|\.[0-9]+|(?:[0-9]|[0-9]+\.[0-9]*|\.[0-9]+)[eE][\+-]?[0-9]+""",
'action': lex.make_token},
{'name': "string1",
'expr': r'"(?:\\.|[^"])*(?:"|.?$)',
'action': lex.make_token},
# Doesn't handle octal . . (yeah it does..heh...ughhh)
{'name': "char",
'expr': r"'(?:\\[^']+|[^'])(?:'|.?$)",
'action': lex.make_token},
{'name': "default",
'expr': r'\\.|.|\n',
'action': lex.silent}
]

    def _default_rules(self):
        """subclasses can override this to define defaults for a grammar"""
        lex.Grammar._default_rules(self)
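        # BalancedExprRule appears to track nested #if/#endif pairs, so
        # an entire "#if 0 ... #endif" block (a common way to comment
        # out code) is treated as one comment-like token; inserting at
        # index 0 gives it priority over every rule above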
        self.insert(0, lex.BalancedExprRule(name='macro comment',
                                            start_expr=r"#if +0",
                                            enter="#if",
                                            leave="#endif",
                                            action=lex.make_token))

if __name__ == "__main__":
    usage = "%prog <file> [<file> ...]\n\n" \
            "Lex one or more files according to the C grammar"
    parser = OptionParser(usage=usage)
    (options, args) = parser.parse_args()

    g = CGrammar()
    l = lex.Lexer(grammar=g)

    for path in args:
        f = open(path, 'r')
        data = f.read()
        f.close()
        print "Lexing %s:" % (path)
        l.lex(data)
        for t in l:
            if t is not None:
                print t
                #print "%-12s %-40s %d %d" % (t.rule.name, t.string, t.start, t.end)