pmacs3/lex_c.py

#!/bin/env python

# 2.3 imports
from optparse import OptionParser

# our imports
import lex

class CGrammar(lex.Grammar):
    """Regex rule table used to lex C source.

    GRAMMAR_LIST is a list of rule descriptions; each entry gives the rule's
    'name', the regular expression ('expr') it matches and the 'action' to
    run on a match.  Every rule uses lex.make_token except the catch-all
    "default" rule, which uses lex.silent.

    NOTE(review): the list looks order-sensitive (e.g. 'control' is listed
    before 'keyword', and both before 'identifier'), so rules are presumably
    tried in list order -- confirm against lex.Grammar before reordering.
    """

    GRAMMAR_LIST = [
        # Preprocessor directives.  This list might not be complete; see
        # http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_3.html#SEC44
        #
        # 'macro2' covers the directives whose argument gets a rule of its
        # own below ('header' and 'constant'), so only the directive word
        # itself is consumed here.
        {'name': 'macro2',
         'expr': r"#(?:define|import|include|undef)(?= )",
         'action':lex.make_token},

        # 'macro1' absorbs the rest of the (possibly backslash-continued)
        # line so the directive's arguments are not lexed as ordinary C.
        # The final lookahead also accepts end-of-input so a directive on
        # the last line of a file without a trailing newline still matches.
        {'name': 'macro1',
         'expr': r"#(?:assert|cpu|elif|else|error|endif|ident|ifdef|ifndef|if|include_next|line|machine|pragma|pragma_once|system|unassert|warning)(?:[^\n]*\\\n)*[^\n]*?(?=\n|$)",
         'action':lex.make_token},

        # The file name following '#include'.
        {'name': 'header',
         'expr': r'''(?<=#include) +(?:<[A-Za-z0-9_]+\.h?>|"[A-Za-z0-9_]+\.h")''',
         'action': lex.make_token},

        # The macro name following '#define'.
        {'name': 'constant',
         'expr': r'''(?<=#define) +[A-Za-z0-9_]+(?= |\(|\n|$)''',
         'action': lex.make_token},

        # A run of letters/underscores immediately followed by ':'.
        {'name': 'label',
         'expr': r"""[a-zA-Z_]+(?=:)""",
         'action': lex.make_token},

        {'name': "c++ comment",
         'expr': r'//.*(?:\n|$)',
         'action': lex.make_token},

        # An unterminated comment runs to the end of the input.
        {'name': "c comment",
         'expr': r"/\*(?:.|\n)*?(?:\*/|$)",
         'action' : lex.make_token},

        # The negative lookahead stops a reserved word from matching as the
        # prefix of a longer identifier; digits are included so that names
        # such as 'if0' or 'int2' are not split.
        {'name': 'control',
         'expr': r"(?:break|case|continue|default|do|else|for|goto|if|return|switch|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},

        {'name': 'keyword',
         'expr': r"(?:auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto|if|int|long|register|return|short|signed|sizeof|static|struct|switch|typedef|union|unsigned|void|volatile|while)(?![a-zA-Z0-9_])",
         'action': lex.make_token},

        {'name': 'builtin',
         'expr': r"(?:NULL|TRUE|FALSE)(?![a-zA-Z0-9_])",
         'action': lex.make_token},

        {'name': "identifier",
         'expr': r"[a-zA-Z_][a-zA-Z0-9_]*",
         'action': lex.make_token},

        # Compound assignment operators.  The rule name is kept as-is since
        # code outside this file may look rules up by name.
        {'name': "unary operator",
         'expr': r"""\+=|-=|\*=|/=|//=|%=|&=|\|=|\^=|>>=|<<=|\*\*=""",
         'action': lex.make_token},

        {'name': "operator",
         'expr': r"""\+|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|!=|%""",
         'action': lex.make_token},

        # this one is sketchy
        {'name': "delimiter",
         'expr': r"""->|\.|\(|\)|\[|\]|{|}|@|,|:|`|;|=|\?""",
         'action': lex.make_token},

        # Alternatives: a lone zero, decimal, octal, hex; optional l/L
        # suffix.  The lone-zero lookahead rejects both 'x' and 'X' so that
        # '0X1F' reaches the hex alternative.
        {'name': "integer",
         'expr': r"(?:0(?![xX0-9])|[1-9][0-9]*|0[0-7]+|0[xX][0-9a-fA-F]+)[lL]?",
         'action': lex.make_token},

        # The exponent form is tried first; otherwise the plain
        # 'digits.digits' alternative would win and leave the exponent
        # behind (regex alternation is ordered, not longest-match).
        {'name': "float",
         'expr': r"""(?:[0-9]+\.[0-9]*|\.[0-9]+|[0-9]+)[eE][\+-]?[0-9]+|[0-9]+\.[0-9]*|\.[0-9]+""",
         'action': lex.make_token},

        {'name': "string1",
         'expr': r'"(?:\\.|[^"])*(?:"|.?$)',
         'action': lex.make_token},

        # Octal escapes are covered by the '\\[^']+' alternative.
        {'name': "char",
         'expr': r"'(?:\\[^']+|[^'])(?:'|.?$)",
         'action': lex.make_token},

        # Catch-all: any single character (or backslash pair) not matched
        # above.
        {'name': "default",
         'expr': r'\\.|.|\n',
         'action': lex.silent}
    ]

    def _default_rules(self):
        """Install the base class defaults, then prepend 'macro comment'.

        After lex.Grammar._default_rules has run, a lex.BalancedExprRule
        named 'macro comment' is inserted at index 0.  It starts on
        '#if' followed by spaces and '0', with '#if' as its enter
        expression and '#endif' as its leave expression.

        NOTE(review): presumably this makes a whole '#if 0' ... '#endif'
        region (with nested '#if's balanced) a single token -- confirm
        against lex.BalancedExprRule.
        """
        lex.Grammar._default_rules(self)
        self.insert(0, lex.BalancedExprRule(name='macro comment',
                                            start_expr=r"#if +0",
                                            enter="#if",
                                            leave="#endif",
                                            action=lex.make_token))

if __name__ == "__main__":
    # Command-line driver: lex every file named on the command line with the
    # C grammar and print each token the lexer yields.
    #
    # optparse replaces the literal "%prog" with the program name.  No
    # %-formatting is applied to this string, so the marker must not be
    # written as "%%prog" (that would render as "%<program name>").
    usage = "%prog <file> [<file> ...]\n\n" \
            "Lex one or more files according to the C grammar"
    parser = OptionParser(usage=usage)
    (options, args) = parser.parse_args()
    grammar = CGrammar()
    lexer = lex.Lexer(grammar=grammar)

    for path in args:
        handle = open(path, 'r')
        try:
            data = handle.read()
        finally:
            # close the file even if the read fails
            handle.close()

        # Single-argument parenthesised print behaves the same as the print
        # statement on Python 2 and is also valid on Python 3.
        print("Lexing %s:" % (path))
        lexer.lex(data)

        for token in lexer:
            if token is not None:
                print(token)
                #print "%-12s %-40s %d %d" % (t.rule.name, t.string, t.start, t.end)
try this again --HG-- branch : pmacs2 2007-03-06 10:05:38 -05:00			`#!/bin/env python`

			`# 2.3 imports`
			`from optparse import OptionParser`

			`# our imports`
			`import lex`

			`class CGrammar(lex.Grammar):`
			`GRAMMAR_LIST = [`
			`# this might not be complete...`
			`# see http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_3.html#SEC44`
			`# we need to absorb the rest of the line cause otherwise shit happens`
			`{'name': 'macro2',`
			`'expr': r"#(?:define\|import\|include\|undef)(?= )",`
			`'action':lex.make_token},`

			`{'name': 'macro1',`
			`'expr': r"#(?:assert\|cpu\|elif\|else\|error\|endif\|error\|ident\|ifdef\|ifndef\|if\|include_next\|line\|machine\|pragma\|pragma_once\|system\|unassert\|warning)(?:[^\n]\\\n)[^\n]*?(?=\n)",`
			`'action':lex.make_token},`

			`{'name': 'header',`
			`'expr': r'''(?<=#include) +(?:<[A-Za-z0-9_]+\.h?>\|"[A-Za-z0-9_]+\.h")''',`
			`'action': lex.make_token},`

			`{'name': 'constant',`
			`'expr': r'''(?<=#define) +[A-Za-z0-9_]+(?= \|\(\|\n\|$)''',`
			`'action': lex.make_token},`

			`{'name': 'label',`
			`'expr': r"""[a-zA-Z_]+(?=:)""",`
			`'action': lex.make_token},`

			`{'name': "c++ comment",`
			`'expr': r'//.*(?:\n\|$)',`
			`'action': lex.make_token},`

			`{'name': "c comment",`
			`'expr': r"/\(?:.\|\n)?(?:\*/\|$)",`
			`'action' : lex.make_token},`

			`{'name': 'control',`
			`'expr': r"(?:break\|case\|continue\|default\|do\|else\|for\|goto\|if\|return\|switch\|while)(?![a-zA-Z_])",`
			`'action': lex.make_token},`

			`{'name': 'keyword',`
			`'expr': r"(?:auto\|break\|case\|char\|const\|continue\|default\|do\|double\|else\|enum\|extern\|float\|for\|goto\|if\|int\|long\|register\|return\|short\|signed\|sizeof\|static\|struct\|switch\|typedef\|union\|unsigned\|void\|volatile\|while)(?![a-zA-z_])",`
			`'action': lex.make_token},`

			`{'name': 'builtin',`
			`'expr': r"(?:NULL\|TRUE\|FALSE)",`
			`'action': lex.make_token},`

			`{'name': "identifier",`
			`'expr': r"[a-zA-Z_][a-zA-Z0-9_]*",`
			`'action': lex.make_token},`

			`{'name': "unary operator",`
			`'expr': r"""\+=\|-=\|\=\|/=\|//=\|%=\|&=\\|\^=\|>>=\|<<=\|\\*=""",`
			`'action': lex.make_token},`

			`{'name': "operator",`
			`'expr': r"""\+\|<>\|<<\|<=\|<\|-\|>>\|>=\|>\|\\\|&\|\*\|\\|\|/\|\^\|==\|//\|~\|!=\|%""",`
			`'action': lex.make_token},`

			`# this is sketchy as hell`
			`{'name': "delimiter",`
			'expr': r"""->\|\.\|\(\|\)\|\[\|\]\|{\|}\|@\|,\|:\|`\|;\|=\|\?""",
			`'action': lex.make_token},`

			`{'name': "integer",`
			`'expr': r"(?:0(?![x0-9])\|[1-9][0-9]*\|0[0-7]+\|0[xX][0-9a-fA-F]+)[lL]?",`
			`'action': lex.make_token},`

			`{'name': "float",`
			`'expr': r"""[0-9]+\.[0-9]\|\.[0-9]+\|(?:[0-9]\|[0-9]+\.[0-9]\|\.[0-9]+)[eE][\+-]?[0-9]+""",`
			`'action': lex.make_token},`

			`{'name': "string1",`
			`'expr': r'"(?:\\.\|[^"])*(?:"\|.?$)',`
			`'action': lex.make_token},`

			`# Doesn't handle octal . . (yeah it does..heh...ughhh)`
			`{'name': "char",`
			`'expr': r"'(?:\\[^']+\|[^'])(?:'\|.?$)",`
			`'action': lex.make_token},`

			`{'name': "default",`
			`'expr': r'\\.\|.\|\n',`
			`'action': lex.silent}`
			`]`

			`def _default_rules(self):`
			`"""subclasses can override this to define defaults for a grammar"""`
			`lex.Grammar._default_rules(self)`
			`self.insert(0, lex.BalancedExprRule(name='macro comment',`
			`start_expr=r"#if +0",`
			`enter="#if",`
			`leave="#endif",`
			`action=lex.make_token))`

			`if __name__ == "__main__":`
			`usage = "%%prog <file> [<file> ...]\n\n" \`
			`"Lex one or more files according to the python grammar"`
			`parser = OptionParser(usage=usage)`
			`(options, args) = parser.parse_args()`
			`g = CGrammar()`
			`l = lex.Lexer(grammar=g)`

			`for path in args:`
			`f = open(path, 'r')`
			`data = f.read()`
			`f.close()`

			`print "Lexing %s:" % (path)`
			`l.lex(data)`

			`for t in l:`
			`if t is not None:`
			`print t`
			`#print "%-12s %-40s %d %d" % (t.rule.name, t.string, t.start, t.end)`