diff --git a/lex.py b/lex.py deleted file mode 100755 index e1c6521..0000000 --- a/lex.py +++ /dev/null @@ -1,249 +0,0 @@ -#!/bin/env python - -""" -lex - a lexer generator in python. -""" - -__author__ = "Dan Williams (dan@osheim.org, dww4s@virginia.edu)" -__copyright__ = "2005" - -# std imports -import os.path, re, sys, copy - -# 2.3 imports -from optparse import OptionParser - -# callbacks - -def silent(rule, m, offset): - '''ignore a hit; return None''' - pass - -def make_token(rule, m, offset): - '''return a token from a hit''' - return(Token(rule.name, m.start() + offset, m.end() + offset, m.group(0))) - -class Token: - '''Used to store an instance of a lexical token''' - def __init__(self, name, start, end, s=None): - self.name = name - self.start = start - self.end = end - self.string = s - self.debug = False - - def __repr__(self): - if len(self.string) < 10: - s = self.string - else: - s = self.string[:10] + "..." - return "<Token(%r, %d, %d, %r)>" % (self.name, self.start, self.end, s) - -class Rule(object): - """Defines a rule used by a lexer.""" - def __init__(self, name="Unnamed", expr=r"(.|\n)", action=lambda x,y: None): - self.name = name - self.expr = expr - self.re = re.compile(self.expr) - self.action = action - def match(self, *args, **kw): - """Determine if this rule is matched""" - return self.re.match(*args, **kw) - def act(self, lexer, m, offset=0): - """Act on this rule""" - return self.action(self, m, offset) - -class SubRule(Rule): - """Defines a rule which parses a region according to its own grammar, - i.e. a sub-grammar with its own rules. This rule may return multiple - tokens and span multiple calls to the next() method of Lexer.""" - def __init__(self, name="Unnamed", expr=r"(.|\n)", grammar=None): - self.name = name - self.expr = expr - self.re = re.compile(self.expr) - - if grammar is None: - self.grammar = Grammar() - else: - self.grammar = grammar - self.lexer = Lexer(self.grammar) - - self.data = None - self.index = None - - def match(self, *args, **kw): - """Determine if this rule is matched""" - m = self.re.match(*args, **kw) - if m is not None: - self.data = args[0][:m.end()] - self.index = args[1] - return m - - def act(self, lexer, m): - """Act on this match""" - self.lexer.lex(self.data, self.index) - try: - v = self.lexer.next() - lexer.sub_lexer = self.lexer - return v - except StopIteration: - lexer.sub_lexer = None - return None - -class BalancedExprMatch: - def __init__(self, start, end, data): - self.s = start - self.e = end - self.d = data - def start(self): - return self.s - def end(self): - return self.e - def group(self, i): - if i == 0 or i == 1: - return self.d - else: - raise IndexError, "no such group" - def groupdict(self): - return {} - def groups(self): - return () - def span(self): - return (self.s, self.e) - -class BalancedExprRule(Rule): - """ - Defines a rule that need to take into account opening and closing - expressions, i.e. parenthesis, #if and #endif, etc.
- """ - def __init__(self, name="Unnamed", start_expr=r"(#if +0)", - enter="#if", leave="#endif", action=lambda x,y: None): - self.name = name - - self.start_expr = start_expr - self.start_re = re.compile(self.start_expr) - - self.enter = enter - self.leave = leave - self.action = action - - def match(self, *args, **kw): - if not self.start_re.match(*args): - return None - stack = [] - data = args[0] - index = args[1] - start = index - if data[index:].startswith(self.enter): - stack.append(self.enter) - index += len(self.enter) - while len(stack) > 0 and index < len(data): - if data[index:].startswith(self.enter): - stack.append(self.enter) - index += len(self.enter) - elif data[index:].startswith(self.leave): - stack.pop(-1) - index += len(self.leave) - else: - index += 1 - m = BalancedExprMatch(start, index, data[start:index]) - return m - - def act(self, lexer, m): - """Act on this rule""" - return self.action(self, m) - -class Grammar(list): - """ - Defines rules for lexing according to a given grammar. - The order of rules in the grammar is their precedence in matching. - """ - GRAMMAR_LIST = [ {'name': 'default'} ] - def __init__(self, *args, **kw): - """useful values to pass in: - rules -> list of rules (ordered!) - if rules are not supplied, self._default_rules() is used""" - list.__init__(self) - if "rules" in kw: - for r in kw["rules"]: - self.append(r) - else: - self._default_rules() - self._post_init(*args, **kw) - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - for rdir in self.GRAMMAR_LIST: - self.add_rule(**rdir) - - def _post_init(self, *args, **kw): - """subclasses can override this to enable other behavior""" - pass - - def add_rule(self, *args, **kw): - self.append(Rule(*args, **kw)) - - def clear_rules(self): - while len(self) > 0: - del self[0] - -class Lexer(object): - """Defines a lexer, a generator of lexical tokens, etc.""" - def __init__(self, grammar=None, rules=None, data=None, index=0): - """ - If the grammar keyword is provided, then that grammar will be used. - Else, if the rules keyword is provided, that list of rules will be used - Else, the default (boring) grammar will be used. - - Normally, lex(data) is used to (re-)intialize the lexer with data to - lex. If the data keyword is provided, then the lexer is ready to go - on instantiation. 
- """ - if grammar is not None: - self.grammar = grammar - elif rules is not None: - self.grammar = Grammar(rules=rules) - else: - self.grammar = Grammar() - - self.data = data - self.index = index - self.offset = 0 - - self.sub_lexer = None - - def lex(self, data=None, index=0, offset=0): - """ - (re-)initialize the lexer with data to lex, and optionally, an offset - to start at - """ - self.data = data - self.index = index - self.offset = offset - - def __iter__(self): - if self.data is None: - raise Exception, "No data to be lexed" - return self - - #def append(self, newdata, offset=0): - # self.data += newdata - # self.index += offset - - def next(self): - # used for multiple levels of lexing - if self.sub_lexer is not None: - try: - return self.sub_lexer.next() - except StopIteration: - self.sub_lexer = None - - if self.index >= len(self.data): - raise StopIteration - for rule in self.grammar: - m = rule.match(self.data, self.index) - if m: - self.index = m.end() - return rule.act(self, m, self.offset) - raise Exception, "Failed to consume last %d characters of input: %r" % \ - (len(self.data) - self.index, self.data[self.index:]) diff --git a/lex2_perl.py b/lex2_perl.py deleted file mode 100755 index 5dd98db..0000000 --- a/lex2_perl.py +++ /dev/null @@ -1,353 +0,0 @@ -from lex2 import Grammar, ConstantRule, PatternRule, ContextPatternRule, RegionRule, DualRegionRule - -class PodGrammar(Grammar): - rules = [ - PatternRule( - name=r'entry', - pattern=r'(?<=^=head[1-4]) +.*$', - ), - PatternRule( - name=r'entry', - pattern=r'(?<=^=over) +.*$', - ), - PatternRule( - name=r'entry', - pattern=r'(?<=^=item) +.*$', - ), - PatternRule( - name=r'entry', - pattern=r'(?:(?<=^=begin)|(?<=^=end)) +.*$', - ), - PatternRule( - name=r'entry', - pattern=r'(?<=^=encoding) +.*$', - ), - ] - -class StringGrammar(Grammar): - rules = [ - PatternRule( - name=r'escaped', - pattern=r'\\.', - ), - PatternRule( - name=r'deref', - pattern=r"\$\$*[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*(?:->{\$?(?:[a-zA-Z_][a-zA-Z_0-9]*|'(?:\\.|[^'\\])*'|\"(\\.|[^\\\"])*\")}|->\[\$?[0-9a-zA-Z_]+\])+", - ), - PatternRule( - name=r'length', - pattern=r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*", - ), - ContextPatternRule( - name=r'scalar', - pattern=r"\$[^A-Za-z0-9 %(delim)s](?![A-Za-z0-9_])", - fallback=r"\$[^A-Za-z0-9 ](?![A-Za-z0-9_])", - ), - #PatternRule( - # name=r'array', - # pattern=r"@_", - #), - PatternRule( - name=r'scalar', - pattern=r"\$\$*[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*", - ), - PatternRule( - name=r'cast', - pattern=r"[\$\@\%\&]{.*?}", - ), - PatternRule( - name=r'array', - pattern=r"@\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*", - ), - PatternRule( - name=r'hash', - pattern=r"%\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*", - ), - ] - -class PerlGrammar(Grammar): - rules = [ - # heredocs - RegionRule( - name=r'heredoc1', - start=r"<<(?P<heredoc>[a-zA-Z0-9_]+) *;", - grammar=StringGrammar(), - end=r'^%(heredoc)s$', - ), - RegionRule( - name=r'heredoc1', - start=r'<< *"(?P<heredoc>[a-zA-Z0-9_]+)" *;', - grammar=StringGrammar(), - end=r'^%(heredoc)s$', - ), - RegionRule( - name=r'heredoc2', - start=r"<< *'(?P<heredoc>[a-zA-Z0-9_]+)' *;", - grammar=Grammar(), - end=r'^%(heredoc)s$', - ), - RegionRule( - name=r'eval_heredoc', - start=r"<< *`(?P<heredoc>[a-zA-Z0-9_]+)` *;", - grammar=StringGrammar(), - end=r'^%(heredoc)s$', - ), - - # end block - RegionRule( - name=r'endblock', - start=r"^__END__|__DATA__ *$", - grammar=Grammar(), - end=r'', - ), - RegionRule( - name=r'pod', - start=r'^=[a-zA-Z0-9_]+', - grammar=PodGrammar(), - end=r'^=cut', - ), - PatternRule( - name=r'comment', -
pattern=r'#.*$', - ), - RegionRule( - name=r'string1', - start=r'"', - grammar=StringGrammar(), - end=r'"', - ), - RegionRule( - name=r'string2', - start=r"'", - grammar=Grammar(), - end=r"'", - ), - RegionRule( - name=r'evalstring', - start=r"`", - grammar=StringGrammar(), - end=r"`", - ), - PatternRule( - name=r'number', - pattern=r'0?\.[0-9]+|[0-9]+(?:\.[0-9]+)?', - ), - PatternRule( - name=r'keyword', - pattern=r"(?<!->)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z0-9_])", - ), - PatternRule( - name=r'hash_key', - pattern=r'(?<={)[A-Za-z0-9_]+(?=})', - ), - PatternRule( - name=r'hash_key', - pattern=r'[A-Za-z0-9_]+(?= *=>)', - ), - PatternRule( - name=r'length', - pattern=r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*", - ), - # XYZ - PatternRule( - name=r'cast', - pattern=r'[\$\@\%\^\&](?= *{)', - ), - PatternRule( - name=r'scalar', - pattern=r"\$[][> *\()", - ), - PatternRule( - name=r'scalar', - pattern=r"\$\$*[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*", - ), - PatternRule( - name=r'array', - pattern=r"@\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*", - ), - PatternRule( - name=r'hash', - pattern=r"%\$*[A-Za-z_](?:[A-Za-z0-9_]|::)*", - ), - PatternRule( - name=r'deref', - pattern=r"[@%\$&\*](?={)", - ), - RegionRule( - name=r'quoted', - start=r'q[rqwx]? *\(', - grammar=Grammar(), - end=r'\)', - ), - RegionRule( - name=r'quoted', - start=r'q[rqwx]? *{', - grammar=Grammar(), - end=r'}', - ), - RegionRule( - name=r'quoted', - start=r'q[rqwx]? *<', - grammar=Grammar(), - end=r'>', - ), - RegionRule( - name=r'quoted', - start=r'q[rqwx]? *\[', - grammar=Grammar(), - end=r'\]', - ), - RegionRule( - name=r'quoted', - start=r'q[rqwx]? *(?P<delim>[^ #])', - grammar=Grammar(), - end=r'%(delim)s', - ), - RegionRule( - name=r'quoted', - start=r'q[rqwx]?#', - grammar=Grammar(), - end=r'#', - ), - - # match regexes - RegionRule( - name=r'match', - start=r'(?:(?<==~)|(?<=!~)|(?<=\()|(?<=split)) *(?P<delim>/)', - grammar=StringGrammar(), - end=r'/[a-z]*', - ), - RegionRule( - name=r'match', - start=r'm *(?P<delim>[^ #a-zA-Z0-9_])', - grammar=StringGrammar(), - end=r'%(delim)s[a-z]*', - ), - RegionRule( - name=r'match', - start=r'm(?P<delim>#)', - grammar=StringGrammar(), - end=r'#[a-z]*', - ), - - # replace regexes - DualRegionRule( - name=r'replace', - start=r's *(?P<delim>[^ a-zA-Z0-9_])', - grammar1=StringGrammar(), - middle=r'%(delim)s', - grammar2=StringGrammar(), - end=r'%(delim)s[a-z]*', - ), - DualRegionRule( - name=r'replace', - start=r's(?P<delim>#)', - grammar1=StringGrammar(), - middle=r'#', - grammar2=StringGrammar(), - end=r'#[a-z]*', - ), - - # translate operator - DualRegionRule( - name=r'translate', - start=r'(?:y|tr) *(?P<delim>[^ a-zA-Z0-9_])', - grammar1=Grammar(), - middle=r'%(delim)s', - grammar2=Grammar(), - end=r'%(delim)s[a-z]*', - ), - DualRegionRule( - name=r'translate', - start=r'(?:y|tr)#', - grammar1=Grammar(), - middle=r'#', - grammar2=Grammar(), - end=r'#[a-z]*', - ), - - # some more basic stuff - PatternRule( - name=r'package', - pattern=r"(?<=package )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*", - ), - PatternRule( - name=r'sub', - pattern=r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*", - ), - PatternRule( - name=r'use', - pattern=r"(?<=use )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*", - ), - PatternRule( - name=r'label', - pattern=r'[a-zA-Z_][a-zA-Z0-9_]*:(?!:)', - ), - PatternRule( - name=r'method', - pattern=r"(?<=->)[a-zA-Z_][a-zA-Z_0-9]*", - ), - PatternRule( - name=r'function', -
pattern=r"&\$*(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*", - ), - PatternRule( - name=r'function', - pattern=r"(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*(?= *\()", - ), - PatternRule( - name=r'builtin', - pattern=r"(?)&?(?:write|warn|wantarray|waitpid|wait|vec|values|utime|use|untie|unshift|unpack|unlink|undef|umask|ucfirst|uc|truncate|times|time|tied|tie|telldir|tell|syswrite|system|sysseek|sysread|sysopen|syscall|symlink|substr|sub|study|stat|srand|sqrt|sprintf|split|splice|sort|socketpair|socket|sleep|sin|shutdown|shmwrite|shmread|shmget|shmctl|shift|setsockopt|setservent|setpwent|setprotoent|setpriority|setpgrp|setnetent|sethostent|setgrent|send|semop|semget|semctl|select|seekdir|seek|scalar|rmdir|rindex|rewinddir|reverse|return|reset|require|rename|ref|redo|recv|readpipe|readlink|readline|readdir|read|rand|quotemeta|push|prototype|printf|print|pos|pop|pipe|package|pack|our|ord|opendir|open|oct|no|next|my|msgsnd|msgrcv|msgget|msgctl|mkdir|map|lstat|log|lock|localtime|local|listen|link|length|lcfirst|lc|last|kill|keys|join|ioctl|int|index|import|hex|grep|goto|gmtime|glob|getsockopt|getsockname|getservent|getservbyport|getservbyname|getpwuid|getpwnam|getpwent|getprotoent|getprotobynumber|getprotobyname|getpriority|getppid|getpgrp|getpeername|getnetent|getnetbyname|getnetbyaddr|getlogin|gethostent|gethostbyname|gethostbyaddr|getgrnam|getgrgid|getgrent|getc|formline|format|fork|flock|fileno|fcntl|exp|exit|exists|exec|eval|eof|endservent|endpwent|endprotoent|endnetent|endhostent|endgrent|each|dump|do|die|delete|defined|dbmopen|dbmclose|crypt|cos|continue|connect|closedir|close|chroot|chr|chown|chop|chomp|chmod|chdir|caller|bless|binmode|bind|atan2|alarm|accept|abs)(?![a-zA-Z0-9_])", - ), - PatternRule( - name=r'class', - pattern=r"(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*(?=->)", - ), - -# # nested regions -# RegionRule( -# name=r'paren', -# start=r'\(', -# grammar=None, -# end=r'\)', -# ), -# RegionRule( -# name=r'brace', -# start=r'{', -# grammar=None, -# end=r'}', -# ), -# RegionRule( -# name=r'bracket', -# start=r'\[', -# grammar=None, -# end=r'\]', -# ), - - # some basic stuff - PatternRule( - name=r'delimiter', - #pattern=r",|;|->|=>|=|\?|(?|=>|=|\?|\(|\)|{|}|\[|\](?>=|<<=|\*\*=", - ), - PatternRule( - name=r'operator', - pattern=r"\+|<=>|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|=~|!~|!=|%|!|\.", - ), - PatternRule( - name=r'bareword', - pattern=r'(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*', - ), - ] diff --git a/lex2_python.py b/lex2_python.py deleted file mode 100755 index 48bd203..0000000 --- a/lex2_python.py +++ /dev/null @@ -1,115 +0,0 @@ -from lex2 import Grammar, ConstantRule, PatternRule, ContextPatternRule, RegionRule, DualRegionRule - -class StringGrammar(Grammar): - rules = [ - PatternRule( - name=r'octal', - pattern=r'\\[0-7]{3}', - ), - PatternRule( - name=r'escaped', - pattern=r'\\.', - ), - #PatternRule( - # name=r'format', - # pattern=r'%(?:\([a-zA-Z_]+\))?[-# +]*(?:[0-9]+|\*)?\.?(?:[0-9]+|\*)?[hlL]?[a-zA-Z%]', - #), - ] - -class PythonGrammar(Grammar): - rules = [ - PatternRule( - name=r'functiondef', - pattern=r'(?<=def )[a-zA-Z_][a-zA-Z0-9_]*', - ), - PatternRule( - name=r'classdef', - pattern=r'(?<=class )[a-zA-Z_][a-zA-Z0-9_]*', - ), - PatternRule( - name=r'reserved', - pattern=r'(?:True|None|False|Exception|self)(?![a-zA-Z0-9_])', - ), - PatternRule( - name=r'keyword', - 
pattern=r'(?:yield|while|try|return|raise|print|pass|or|not|lambda|is|in|import|if|global|from|for|finally|exec|except|else|elif|del|def|continue|class|break|assert|as|and)(?![a-zA-Z0-9_])', - ), - PatternRule( - name=r"builtin", - pattern=r'(?>=|<<=|\*\*=', - ), - PatternRule( - name=r"operator", - pattern=r"\+|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|!=|%", - ), - - PatternRule( - name=r"integer", - pattern=r"(?"""|\'\'\')', - grammar=Grammar(), - end=r'%(tag)s', - ), - RegionRule( - name=r'tq_string', - start=r'(?P<tag>"""|\'\'\')', - grammar=Grammar(), - end=r'%(tag)s', - ), - RegionRule( - name=r'string', - start=r'(?P<tag>"|\')', - grammar=StringGrammar(), - end=r'%(tag)s', - ), - - PatternRule( - name=r'comment', - pattern=r'#.*$', - ), - PatternRule( - name=r'continuation', - pattern=r'\\$', - ), - ] diff --git a/lex_blame.py b/lex_blame.py deleted file mode 100755 index 4998fcc..0000000 --- a/lex_blame.py +++ /dev/null @@ -1,16 +0,0 @@ -import lex - -class BlameGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': 'metadata', - 'expr': "(?:^|(?<=\n))[0-9.]+ +[a-zA-Z0-9_]+ +[-0-9A-Za-z]+", - 'action': lex.make_token}, - - {'name': 'data', - 'expr': ".+(?:$|\n)", - 'action': lex.make_token}, - - {'name': "default", - 'expr': r'\\.|.|\n', - 'action': lex.silent}, - ] diff --git a/lex_c.py b/lex_c.py deleted file mode 100644 index eefa240..0000000 --- a/lex_c.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/bin/env python - -# 2.3 imports -from optparse import OptionParser - -# our imports -import lex - -class CGrammar(lex.Grammar): - GRAMMAR_LIST = [ - # this might not be complete... - # see http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_3.html#SEC44 - # we need to absorb the rest of the line cause otherwise shit happens - {'name': 'macro2', - 'expr': r"#(?:define|import|include|undef)(?= )", - 'action':lex.make_token}, - - {'name': 'macro1', - 'expr': r"#(?:assert|cpu|elif|else|error|endif|error|ident|ifdef|ifndef|if|include_next|line|machine|pragma|pragma_once|system|unassert|warning)(?:[^\n]*\\\n)*[^\n]*?(?=\n)", - 'action':lex.make_token}, - - {'name': 'header', - 'expr': r'''(?<=#include) +(?:<[A-Za-z/0-9_]+\.h?>|"[A-Za-z/0-9_]+\.h")''', - 'action': lex.make_token}, - - {'name': 'constant', - 'expr': r'''(?<=#define) +[A-Za-z0-9_]+(?= |\(|\n|$)''', - 'action': lex.make_token}, - - {'name': 'label', - 'expr': r"""[a-zA-Z_]+(?=:)""", - 'action': lex.make_token}, - - {'name': "c++ comment", - 'expr': r'//.*(?:\n|$)', - 'action': lex.make_token}, - - {'name': "c comment", - 'expr': r"/\*(?:.|\n)*?(?:\*/|$)", - 'action' : lex.make_token}, - - {'name': 'control', - 'expr': r"(?:break|case|continue|default|do|else|for|goto|if|return|switch|while)(?![a-zA-Z_])", - 'action': lex.make_token}, - - {'name': 'keyword', - 'expr': r"(?:auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto|if|int|long|register|return|short|signed|sizeof|static|struct|switch|typedef|union|unsigned|void|volatile|while)(?![a-zA-z_])", - 'action': lex.make_token}, - - {'name': 'builtin', - 'expr': r"(?:NULL|TRUE|FALSE)", - 'action': lex.make_token}, - - {'name': "identifier", - 'expr': r"[a-zA-Z_][a-zA-Z0-9_]*", - 'action': lex.make_token}, - - {'name': "unary operator", - 'expr': r"""\+=|-=|\*=|/=|//=|%=|&=\|\^=|>>=|<<=|\*\*=""", - 'action': lex.make_token}, - - {'name': "operator", - 'expr': r"""\+|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|!=|%""", - 'action': lex.make_token}, - - # this is sketchy as hell - {'name': "delimiter", - 'expr': r"""->|\.|\(|\)|\[|\]|{|}|@|,|:|`|;|=|\?""", -
'action': lex.make_token}, - - {'name': "integer", - 'expr': r"(?:0(?![x0-9])|[1-9][0-9]*|0[0-7]+|0[xX][0-9a-fA-F]+)[lL]?", - 'action': lex.make_token}, - - {'name': "float", - 'expr': r"""[0-9]+\.[0-9]*|\.[0-9]+|(?:[0-9]|[0-9]+\.[0-9]*|\.[0-9]+)[eE][\+-]?[0-9]+""", - 'action': lex.make_token}, - - {'name': "string1", - 'expr': r'"(?:\\.|[^"])*(?:"|.?$)', - 'action': lex.make_token}, - - # Doesn't handle octal . . (yeah it does..heh...ughhh) - {'name': "char", - 'expr': r"'(?:\\[^']+|[^'])(?:'|.?$)", - 'action': lex.make_token}, - - {'name': "default", - 'expr': r'\\.|.|\n', - 'action': lex.silent} - ] - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - lex.Grammar._default_rules(self) - self.insert(0, lex.BalancedExprRule(name='macro comment', - start_expr=r"#if +0", - enter="#if", - leave="#endif", - action=lex.make_token)) - -if __name__ == "__main__": - usage = "%%prog [ ...]\n\n" \ - "Lex one or more files according to the python grammar" - parser = OptionParser(usage=usage) - (options, args) = parser.parse_args() - g = CGrammar() - l = lex.Lexer(grammar=g) - - for path in args: - f = open(path, 'r') - data = f.read() - f.close() - - print "Lexing %s:" % (path) - l.lex(data) - - for t in l: - if t is not None: - print t - #print "%-12s %-40s %d %d" % (t.rule.name, t.string, t.start, t.end) - diff --git a/lex_diff.py b/lex_diff.py deleted file mode 100755 index 2525e6b..0000000 --- a/lex_diff.py +++ /dev/null @@ -1,41 +0,0 @@ -import lex - -class DiffGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': "left", - 'expr': "(?:^|(?<=\n))\-.*(?:$|\n)", - 'action': lex.make_token}, - - {'name': "right", - 'expr': "(?:^|(?<=\n))\+.*(?:$|\n)", - 'action': lex.make_token}, - -#RCS file: /usr/local/cvsroot/TBB_v2/main_application/lib/TBB/EfileServer.pm,v -#retrieving revision 1.57 -#diff -u -r1.57 EfileServer.pm - - {'name': "cvs metadata", - 'expr': "(?:^|(?<=\n))Index: .*\n={67}\nRCS file: .*,v\nretrieving revision [0-9.]+\ndiff -u .*(?:$|\n)", - 'action': lex.make_token}, - - {'name': "svn metadata", - 'expr': "(?:^|(?<=\n))Index: .*\n={67}(?:$|\n)", - 'action': lex.make_token}, - - {'name': "location", - 'expr': "(?:^|(?<=\n))@@ [-+0-9a-z, ]* @@(?:$|\n)", - 'action': lex.make_token}, - - {'name': "common", - 'expr': "(?:^|(?<=\n)).*(?:$|\n)", - 'action': lex.make_token}, - - {'name': "default", - 'expr': r'\\.|.|\n', - 'action': lex.silent} - ] - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - for rdir in DiffGrammar.GRAMMAR_LIST: - self.add_rule(**rdir) diff --git a/lex_javascript.py b/lex_javascript.py deleted file mode 100755 index 9999bf5..0000000 --- a/lex_javascript.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/env python - -# 2.3 imports -from optparse import OptionParser - -# our imports -import lex - -class JavascriptGrammar(lex.Grammar): - GRAMMAR_LIST = [ -## {'name': "import statement", -## 'expr': r"""(?:^|(?<= ))import [ .]*(?=\n)""", -## 'action': lex.make_token}, - - {'name': "comment", - 'expr': r'//.*(?=\n|$)', - 'action': lex.make_token}, - - {'name': "function declaration", - 'expr': r"(?<=function ) *[a-zA-Z0-9_]* *(?=\()", - 'action': lex.make_token}, - - {'name': "class declaration", - 'expr': r"(?<=class )[a-zA-Z_][a-zA-Z0-9_]*", - 'action': lex.make_token}, - - {'name': 'keyword', - 'expr': 
r"""(?:and|break|class|continue|def|del|elif|else|except|exec|finally|for|from|function|global|if|import|in|is|lambda|new|not|or|pass|print|raise|return|try|var|while|yield)(?![a-zA-Z0-9_])""", - 'action': lex.make_token}, - - {'name': "pseudo-keyword", - 'expr': r"""(?:as|self|True|False|None|Exception)(?![a-zA-Z0-9_])""", - 'action': lex.make_token}, - -## {'name': "built-in method", -## 'expr': r"""(?>=|<<=|\*\*=""", - 'action': lex.make_token}, - - {'name': "operator", - 'expr': r"""\+|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|!=|%""", - 'action': lex.make_token}, - - {'name': "integer", - 'expr': r"(?:0|[1-9][0-9]*|0[0-7]+|0[xX][0-9a-fA-F]+)[lL]?", - 'action': lex.make_token}, - - {'name': "float", - 'expr': r"""[0-9]+\.[0-9]*|\.[0-9]+|(?:[0-9]|[0-9]+\.[0-9]*|\.[0-9]+)[eE][\+-]?[0-9]+""", - 'action': lex.make_token}, - - {'name': "imaginary", - 'expr': r"""[0-9]+|(?:[0-9]+\.[0-9]*|\.[0-9]+|(?:[0-9]|[0-9]+\.[0-9]*|\.[0-9]+)[eE][\+-]?[0-9]+)[jJ]""", - 'action': lex.make_token}, - - {'name': "string1", - 'expr': r'"(?:\\.|[^\\"])*(?:"|.?$)', - 'action': lex.make_token}, - - {'name': "string2", - 'expr': r"'(?:\\.|[^\\'])*(?:'|.?$)", - 'action': lex.make_token}, - - {'name': "continuation", - 'expr': r'\\(?=(?:\n|$))', - 'action': lex.make_token}, - - {'name': "default", - 'expr': r'\\.|.|\n', - 'action': lex.silent} - ] diff --git a/lex_mutt.py b/lex_mutt.py deleted file mode 100755 index 67ea22d..0000000 --- a/lex_mutt.py +++ /dev/null @@ -1,59 +0,0 @@ -import lex, lex_text - -class MuttGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': 'header', - 'expr': r'(?:^|(?<=\n))(?:From|To|Cc|Bcc|Subject|Reply-To|In-Reply-To|Delivered-To|Date):', - 'action': lex.make_token, - }, - - {'name': 'quote1', - 'expr': r'(?:^|(?<=\n))(?:(?: *>){3})*(?: *>){1} *(?:[^ >\n][^\n]*)?(?:$|\n)', - 'action': lex.make_token, - }, - - {'name': 'quote2', - 'expr': r'(?:^|(?<=\n))(?:(?: *>){3})*(?: *>){2} *(?:[^ >\n][^\n]*)?(?:$|\n)', - 'action': lex.make_token, - }, - - {'name': 'quote3', - 'expr': r'(?:^|(?<=\n))(?:(?: *>){3})*(?: *>){3} *(?:[^ >\n][^\n]*)?(?:$|\n)', - 'action': lex.make_token, - }, - - {'name': 'email', - 'expr': r'(?:^|(?<=[ :\n]))@\n ]+@(?:[^<>@\.\n ]+\.)*[^<>@\.\n ]+>?', - 'action': lex.make_token, - }, - - {'name': 'url', - 'expr': r'(?:^|(?<=[ \n]))(?:http|https|ftp|sftp|file|smtp|smtps|torrent|news|jabber|irc|telnet)://(?:[^\.\n ]+\.)*[^\.\n ]+', - 'action': lex.make_token, - }, - - {'name': 'continued word', - 'expr': r"""([a-zA-Z][a-zA-Z-']*[a-zA-Z])-\n *([a-zA-Z][a-zA-Z-]*[a-zA-Z])""", - 'action': lex_text.make_token_spell, - }, - - {'name': 'word', - 'expr': r"""(?:[a-zA-Z][-']?)*[a-zA-Z]""", - 'action': lex_text.make_token_spell, - }, - - {'name': 'stuff', - 'expr': r"""[^ \n]+""", - 'action': lex.make_token, - }, - - {'name': "default", - 'expr': r'.| |\n', - 'action': lex.silent, - }, - ] - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - for rdir in self.GRAMMAR_LIST: - self.add_rule(**rdir) diff --git a/lex_nasm.py b/lex_nasm.py deleted file mode 100644 index 0f97357..0000000 --- a/lex_nasm.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/env python - -# 2.3 imports -from optparse import OptionParser - -# our imports -import lex - - -class NasmGrammar(lex.Grammar): - - GRAMMAR_LIST = [ - {'name': 'keyword', - 'expr': \ - r"""(?:section|global|extern)(?![a-zA-Z_])""", - 'action': lex.make_token}, - - {'name': "nasm macros", - 'expr': 
r"%(?:define|undef|assign|strlen|macro|endmacro|if|elif|else|endif|ifdef|ifndef|include|push|pop|stacksize)(?![a-zA-Z_])", - 'action': lex.make_token - }, - - {'name': "instructions", - 'expr': \ - r"""(?:jeq|jne|ja|jmp|push|pushad|pushfd|call|ret|sub|add|pop|popa|popad|popfd|call|and|cwd|cdq|cmp|cmpxchg|cpuid|div|divpd|enter|leave|fadd|fld|fmul|fsqrt|fsub|hlt|imul|inc|int|int3|lea|mov|movd|mul|neg|not|nop|or|sal|sar|shl|shr|shld|shrd|syscall|sysenter|sysexit|test|xchg|xadd|xor)(?![a-zA-Z_])""", - 'action': lex.make_token}, - - {'name': "registers", - 'expr': \ - r"""(?:eax|ax|ah|al|ebx|bx|bh|bl|ecx|cx|ch|cl|esi|edi|esp|ebp)""", - 'action': lex.make_token}, - - {'name': "prefix", - 'expr': r"(?:dword|word|lock)", - 'action': lex.make_token - }, - - {'name': "label", - 'expr': r"[a-zA-Z_.][a-zA-Z0-9_.]*:", - 'action': lex.make_token}, - - {'name': "identifier", - 'expr': r"[a-zA-Z_][a-zA-Z0-9_]*", - 'action': lex.make_token}, - - {'name': "integer", - 'expr': r"(0|[1-9][0-9]*|0[0-7]+|0[xX][0-9a-fA-F]+)[lL]?", - 'action': lex.make_token}, - - {'name': "float", - 'expr': \ - r"""[0-9]+\.[0-9]*|\.[0-9]+|([0-9]| - [0-9]+\.[0-9]*|\.[0-9]+)[eE][\+-]?[0-9]+""", - 'action': lex.make_token}, - - {'name': "string3", - 'expr': r'"""[.|\n]*?(?:"""|$)', - 'action': lex.make_token}, - - {'name': "string1", - 'expr': r'"(?:\\.|[^\\"])*(?:"|$)', - 'action': lex.make_token}, - - {'name': "string2", - 'expr': r"'(?:\\.|[^\\'])*(?:'|$)", - 'action': lex.make_token}, - - {'name': "comment", - 'expr': r'[;].*(?:\n|$)', - 'action': lex.make_token}, - - {'name': "default", - 'expr': r'\\.|.|\n', - 'action': lex.silent} - ] - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - for rdir in NasmGrammar.GRAMMAR_LIST: - self.add_rule(**rdir) - -if __name__ == "__main__": - usage = "%%prog [ ...]\n\n" \ - "Lex one or more files according to the python grammar" - parser = OptionParser(usage=usage) - (options, args) = parser.parse_args() - g = NasmGrammar() - l = lex.Lexer(grammar=g) - - for path in args: - f = open(path, 'r') - data = f.read() - f.close() - - print "Lexing %s:" % (path) - l.lex(data) - - for x in l: - if x is not None: - print x diff --git a/lex_perl.py b/lex_perl.py deleted file mode 100755 index e026b0f..0000000 --- a/lex_perl.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/bin/env python - -# 2.3 imports -from optparse import OptionParser - -# our imports -import lex - -# this will support perl's string interpolation; but, it can be slower and also -# possibly buggier -INTERPOLATION_HIGHLIGHTING = False -#INTERPOLATION_HIGHLIGHTING = True - -class PerlGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': 'heredoc', - 'expr': r"""<< *([a-zA-Z0-9_]+) *;(?:.*?\n)*?(?:\1|$)""", - 'action': lex.make_token}, - - {'name': 'endblock', - 'expr': r"""(?:^|\n)(?:__END__|__DATA__)(?:.|\n)*$""", - 'action': lex.make_token}, - - {'name': 'pod', - 'expr': r"""(?:^|(?<=\n))=[a-zA-Z0-9_]+.*(?:\n(?!=cut).*)*(?:\n=cut|$)""", - 'action': lex.make_token}, - - {'name': "comment", - 'expr': r'[#].*(?:\n|$)', - 'action': lex.make_token}, - - {'name': "string1", - 'expr': r'''"(?:\\(?:.|\n)|[^\\"]|[ \n])*(?:"|.?$)''', - 'action': lex.make_token}, - - {'name': "string2", - 'expr': r"""'(?:\\(?:.|\n)|[^\\'])*(?:'|.?$)""", - 'action': lex.make_token}, - - {'name': "evalstring", - 'expr': r"""`(?:\\(?:.|\n)|[^\\`])*(?:`|.?$)""", - 'action': lex.make_token}, - - {'name': 'number', - 'expr': r"""0?\.[0-9]+|[0-9]+(?:\.[0-9]+)?""", - 'action': lex.make_token}, - - {'name': 'label', - 'expr': 
r"""[a-zA-Z_][a-zA-Z0-9_]*:(?= |\n)""", - 'action': lex.make_token}, - - {'name': 'keyword', - 'expr': r"""(?)(?:STDIN|STDERR|STDOUT|and|cmp|continue|do|else|elsif|eq|eval|foreach|for|if|last|my|next|ne|not|no|or|our|package|require|return|sub|undef|unless|until|use|while)(?![a-zA-Z_])""", - 'action': lex.make_token}, - - {'name': 'hash bareword index', - 'expr': r"(?<={)[A-Za-z0-9_]+(?=})", - 'action': lex.make_token}, - - {'name': 'literal hash bareword index', - 'expr': r"[A-Za-z0-9_]+(?= *=>)", - 'action': lex.make_token}, - - {'name': 'length scalar', - 'expr': r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*", - 'action': lex.make_token}, - - {'name': 'system scalar', - 'expr': r"\$[][>[a-z]*|m\[(?:\\.|[^\\])*?\][a-z]*""", - 'action': lex.make_token}, - - # we officially don't support the bullshit s{a}{b} thing perl has going. - # those guys are on crack. we only support things like s#a#b# or s/a/b/. - # same comments as above apply - {'name': 'replace regex', - 'expr': r"""(?:y|tr|s)([^<[{(A-Za-z0-9 \t\n])(?:\\.|[^\\])*?\1(?:\\.|[^\\])*?\1[a-z]*""", - 'action': lex.make_token}, - - {'name': 'package', - 'expr': r"""(?<=package )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'use', - 'expr': r"""(?<=use )(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'instance method', - 'expr': r"""(?<=->)[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'static method', - 'expr': r"""&?(?:[a-zA-Z_][a-zA-Z_0-9]*::)+[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'method declaration', - 'expr': r"""(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*{)""", - 'action': lex.make_token}, - - {'name': 'built-in method', - 'expr': r"""(?)&?(?:write|warn|wantarray|waitpid|wait|vec|values|utime|use|untie|unshift|unpack|unlink|undef|umask|ucfirst|uc|truncate|times|time|tied|tie|telldir|tell|syswrite|system|sysseek|sysread|sysopen|syscall|symlink|substr|sub|study|stat|srand|sqrt|sprintf|split|splice|sort|socketpair|socket|sleep|sin|shutdown|shmwrite|shmread|shmget|shmctl|shift|setsockopt|setservent|setpwent|setprotoent|setpriority|setpgrp|setnetent|sethostent|setgrent|send|semop|semget|semctl|select|seekdir|seek|scalar|rmdir|rindex|rewinddir|reverse|return|reset|require|rename|ref|redo|recv|readpipe|readlink|readline|readdir|read|rand|quotemeta|push|prototype|printf|print|pos|pop|pipe|package|pack|our|ord|opendir|open|oct|no|next|my|msgsnd|msgrcv|msgget|msgctl|mkdir|map|lstat|log|lock|localtime|local|listen|link|length|lcfirst|lc|last|kill|keys|join|ioctl|int|index|import|hex|grep|goto|gmtime|glob|getsockopt|getsockname|getservent|getservbyport|getservbyname|getpwuid|getpwnam|getpwent|getprotoent|getprotobynumber|getprotobyname|getpriority|getppid|getpgrp|getpeername|getnetent|getnetbyname|getnetbyaddr|getlogin|gethostent|gethostbyname|gethostbyaddr|getgrnam|getgrgid|getgrent|getc|formline|format|fork|flock|fileno|fcntl|exp|exit|exists|exec|eval|eof|endservent|endpwent|endprotoent|endnetent|endhostent|endgrent|each|dump|do|die|delete|defined|dbmopen|dbmclose|crypt|cos|continue|connect|closedir|close|chroot|chr|chown|chop|chomp|chmod|chdir|caller|bless|binmode|bind|atan2|alarm|accept|abs)(?![a-zA-Z0-9_])""", - 
#'expr':r"""(?)&?(?:abs|accept|alarm|atan2|bind|binmode|bless|caller|chdir|chmod|chomp|chop|chown|chroot|chr|closedir|close|connect|cos|crypt|dbmclose|dbmopen|defined|delete|die|dump|each|eof|exec|exists|exit|exp|fcntl|fileno|flock|fork|format|formline|getc|getlogin|getpeername|grep|int|join|keys|lc|map|open|pop|print|push|rand|readdir|ref|scalar|select|shift|sort|split|srand|time|uc|unshift|values|wantarray|warn)(?![a-zA-Z0-9_])""", - 'action': lex.make_token}, - - {'name': 'method', - 'expr': r"""&(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'methodref', - 'expr': r"""&\$(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'bareword method', - 'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*(?=[ \n]*(?:\(|->))""", - 'action': lex.make_token}, - - {'name': "delimiter", - 'expr': r"""\(|\)|\[|\]|{|}|,|;|=>|=|\?|(?>=|<<=|\*\*=""", - 'action': lex.make_token}, - - {'name': "operator", - 'expr': r"""\+|<=>|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|=~|!~|!=|%|!|\.""", - 'action': lex.make_token}, - - {'name': 'bareword', - 'expr': r"""(?:[a-zA-Z_][a-zA-Z_0-9]*::)*[a-zA-Z_][a-zA-Z_0-9]*""", - 'action': lex.make_token}, - - {'name': 'default', - 'expr': r""".|\n""", - 'action': lex.silent} - ] - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - sub_exprs = {} - string_rules = [] - - for rdir in PerlGrammar.GRAMMAR_LIST: - self.add_rule(**rdir) - - if INTERPOLATION_HIGHLIGHTING: - if rdir['name'] in ('scalar', 'system scalar', 'array', 'hash', - 'system array'): - rdir2 = rdir.copy() - rdir2['name'] = 'interpolated ' + rdir['name'] - string_rules.append(lex.Rule(**rdir2)) - elif rdir['name'] in ('heredoc', 'string1', 'string2'): - sub_exprs[rdir['name']] = rdir['expr'] - - if INTERPOLATION_HIGHLIGHTING: - string_rules.append(lex.Rule(name="default string", - expr=r"""(?:\\.|[^\\\$]|\n)+|\$""", - action=lex.make_token)) - string_grammar = lex.Grammar(rules=string_rules) - - self.insert(0, lex.SubRule(name='heredoc', - expr=sub_exprs['heredoc'], - grammar=string_grammar)) - - self.insert(4, lex.SubRule(name="string1", - expr=sub_exprs['string1'], - grammar=string_grammar)) - - self.insert(5, lex.SubRule(name="string2", - expr=sub_exprs['string2'], - grammar=string_grammar)) diff --git a/lex_python.py b/lex_python.py deleted file mode 100755 index 2096a08..0000000 --- a/lex_python.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/env python - -# 2.3 imports -from optparse import OptionParser - -# our imports -import lex - -class PythonGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': "import statement", - 'expr': r"""(?:^|(?<= ))import [ .]*(?=\n)""", - 'action': lex.make_token}, - - {'name': "method declaration", - 'expr': r"(?<=def )[a-zA-Z_][a-zA-Z0-9_]*", - 'action': lex.make_token}, - - {'name': "class declaration", - 'expr': r"(?<=class )[a-zA-Z_][a-zA-Z0-9_]*", - 'action': lex.make_token}, - - {'name': 'keyword', - 'expr': r"""(?:and|assert|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|not|or|pass|print|raise|return|try|while|yield)(?![a-zA-Z0-9_])""", - 'action': lex.make_token}, - - {'name': "pseudo-keyword", - 'expr': r"""(?:as|self|True|False|None|Exception)(?![a-zA-Z0-9_])""", - 'action': lex.make_token}, - - {'name': "built-in method", - 'expr': r"""(?>=|<<=|\*\*=""", - 'action': lex.make_token}, - - {'name': "operator", - 'expr': 
r"""\+|<>|<<|<=|<|-|>>|>=|>|\*\*|&|\*|\||/|\^|==|//|~|!=|%""", - 'action': lex.make_token}, - - {'name': "integer", - 'expr': r"(?:0|[1-9][0-9]*|0[0-7]+|0[xX][0-9a-fA-F]+)[lL]?", - 'action': lex.make_token}, - - {'name': "float", - 'expr': r"""[0-9]+\.[0-9]*|\.[0-9]+|(?:[0-9]|[0-9]+\.[0-9]*|\.[0-9]+)[eE][\+-]?[0-9]+""", - 'action': lex.make_token}, - - {'name': "imaginary", - 'expr': r"""[0-9]+|(?:[0-9]+\.[0-9]*|\.[0-9]+|(?:[0-9]|[0-9]+\.[0-9]*|\.[0-9]+)[eE][\+-]?[0-9]+)[jJ]""", - 'action': lex.make_token}, - - {'name': "string4", - 'expr': r'"""(?:.|\n)*?(?:"""|$)', - 'action': lex.make_token}, - - {'name': "string3", - 'expr': r"'''(?:.|\n)*?(?:'''|$)", - 'action': lex.make_token}, - - {'name': "string1", - 'expr': r'"(?:\\.|[^\\"])*(?:"|.?$)', - 'action': lex.make_token}, - - {'name': "string2", - 'expr': r"'(?:\\.|[^\\'])*(?:'|.?$)", - 'action': lex.make_token}, - - {'name': "comment", - 'expr': r'[#].*(?=\n|$)', - 'action': lex.make_token}, - - {'name': "continuation", - 'expr': r'\\(?=(?:\n|$))', - 'action': lex.make_token}, - - {'name': "default", - 'expr': r'\\.|.|\n', - 'action': lex.silent} - ] diff --git a/lex_sh.py b/lex_sh.py deleted file mode 100755 index 3012ce5..0000000 --- a/lex_sh.py +++ /dev/null @@ -1,85 +0,0 @@ -# 2.3 imports -from optparse import OptionParser - -# our imports -import lex - -class ShGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': "method", - 'expr': r"""[a-zA-Z_][a-zA-Z0-9_]*(?=\(\))""", - 'action': lex.make_token}, - - {'name': 'reserved', - 'expr': r"""(?:case|done|do|elif|else|esac|fi|for|function|if|in|select|then|until|while|time)(?![a-zA-Z0-9_=])""", - 'action': lex.make_token}, - - {'name': 'builtin', - 'expr': r"""(?:source|alias|bg|bind|break|builtin|cd|command|compgen|complete|declare|dirs|disown|echo|enable|eval|exec|exit|export|fc|fg|getops|hash|help|history|jobs|kill|let|local|logout|popd|printf|pushd|pwd|readonly|read|return|set|shift|shopt|suspend|test|times|trap|type|ulimit|umask|unalias|unset|wait)(?![a-zA-Z0-9_=/])""", - 'action': lex.make_token}, - - {'name': 'operator', - 'expr': r"""(?:-eq|-ne|-gt|-lt|-ge|-le| = | != )""", - 'action': lex.make_token}, - -## {'name': 'redirection', -## 'expr': r"(?:[1-6] *)?> *(?:&[1-6]|(?:\\.|[^\\\"';| ])+)", -## 'action': lex.make_token}, - - {'name': 'delimiter', - 'expr': """[][\(\);\{\}|&><]""", - 'action': lex.make_token}, - -## {'name': 'variable0', -## 'expr': r"""(?:(?<=\n)|^) *[a-zA-Z_][a-zA-Z0-9_]*(?=\=)""", -## 'action': lex.make_token}, - {'name': 'variable0', - 'expr': r"""(?:(?<=\n) *|^ *| +)[a-zA-Z_][a-zA-Z0-9_]*(?=\=)""", - 'action': lex.make_token}, - - {'name': "variable1", - 'expr': r"\${(?:[a-zA-Z0-9_]+|\?\$)}", - 'action': lex.make_token}, - - {'name': "variable2", - 'expr': r"\$[^({][a-zA-Z0-9_]*", - 'action': lex.make_token}, - - {'name': "variable3", - 'expr': r"\$(?=\()", - 'action': lex.make_token}, - - {'name': "eval", - 'expr': r'`(?:\\.|[^\\`])*(?:`|.?$)', - 'action': lex.make_token}, - - {'name': "string1", - 'expr': r'"(?:\\.|[^\\"])*(?:"|.?$)', - 'action': lex.make_token}, - - {'name': "string2", - 'expr': r"'(?:\\.|[^\\'])*(?:'|.?$)", - 'action': lex.make_token}, - - {'name': 'continuation', - 'expr': r"""\\(?= *(\n|$))""", - 'action': lex.make_token}, - - {'name': "comment", - 'expr': r'[#].*(?:\n|$)', - 'action': lex.make_token}, - - {'name': 'bareword', - 'expr': r"""[a-zA-Z0-9_-]+""", - 'action': lex.make_token}, - - {'name': "default", - 'expr': r'\\.|.|\n', - 'action': lex.silent} - ] - - def _default_rules(self): - """subclasses can override this to 
define defaults for a grammar""" - for rdir in ShGrammar.GRAMMAR_LIST: - self.add_rule(**rdir) - diff --git a/lex_sql.py b/lex_sql.py deleted file mode 100755 index 3bc0ae3..0000000 --- a/lex_sql.py +++ /dev/null @@ -1,70 +0,0 @@ -import lex - -class SqlGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': "sql comment", - 'expr': r'--[^\n]*', - 'action': lex.make_token}, - - {'name': "c comment", - 'expr': r'/\*(?:.| |\n)*?(?:\*/|$)', - 'action': lex.make_token}, - - {'name': 'delimiter', - 'expr': r'[][();,\.:$]', - 'action': lex.make_token}, - - {'name': 'attribute1', - 'expr': r'''(?:CHECK|EXISTS|UNIQUE|NOT NULL|DEFAULT|PRIMARY KEY|MINVALUE|FOREIGN KEY|REFERENCES)(?![A-Za-z0-9_])''', - 'action': lex.make_token}, - {'name': 'attribute2', - 'expr': r'''(?:check|exists|unique|not null|default|primary key|minvalue|foreign key|references)(?![A-Za-z0-9_])''', - 'action': lex.make_token}, - - {'name': 'operator1', - 'expr': r'''(?:CASE|WHEN|THEN|ELSE|END|NOT|AND|OR|IS NOT|IS|IN|NOT IN)(?![A-Za-z0-9_])''', - 'action': lex.make_token}, - {'name': 'operator2', - 'expr': r'''(?:case|when|then|else|end|not|and|or|is not|is|in|not in)(?![A-Za-z0-9_])''', - 'action': lex.make_token}, - - {'name': 'keyword1', - 'expr': r'''(?:CREATE DATABASE|CREATE INDEX|CREATE SEQUENCE|CREATE TABLE|CREATE TRIGGER|CREATE VIEW|SELECT|INSERT|UPDATE|DELETE|DROP DATABASE|DROP INDEX|DROP SEQUENCE|DROP TABLE|DROP TRIGGER|DROP VIEW|CREATE USER|ALTER USER|DROP USER|DROP FUNCTION|GRANT|REVOKE|CREATE FUNCTION|CREATE OR REPLACE FUNCTION|CREATE OR REPLACE VIEW|CREATE LANGUAGE|CREATE OPERATOR|CREATE TYPE)(?![A-Za-z0-9_])''', - 'action': lex.make_token}, - {'name': 'keyword2', - 'expr': r'''(?:create database|create index|create sequence|create table|create trigger|create view|select|insert|update|delete|drop database|drop index|drop sequence|drop table|drop trigger|drop view|create user|alter user|drop user|drop function|grant|revoke|create function|create or replace function|create or replace view|create language|create operator|create type)(?![A-Za-z0-9_])''', - 'action': lex.make_token}, - - {'name': 'pseudo-keyword1', - 'expr': r'''(?:RETURNS|LANGUAGE|RIGHT JOIN|LEFT JOIN|INNER JOIN|OUTER JOIN|JOIN|WHERE|NULL|TRUE|FALSE|INTO|VALUES|AS|FROM|ORDER BY|ASC|DESC|LIMIT|DISTINCT|CASCADE|USING|ON)(?![A-Za-z0-9_])''', - 'action': lex.make_token}, - {'name': 'pseudo-keyword1', - 'expr': r'''(?:returns|language|right join|left join|inner join|outer join|join|where|null|true|false|into|values|as|from|order by|asc|desc|limit|distinct|cascade|using|on)(?![A-Za-z0-9_])''', - 'action': lex.make_token}, - - {'name': 'type1', - 'expr': '(?:VOID|ROW|SERIAL|VARCHAR|FLOAT|INTEGER|INT|TEXT|TIMESTAMPTZ|TIMESTAMP|DATETZ|DATE|TIMETZ|TIME|BOOLEAN|BOOL)(?![A-Za-z0-9_])', - 'action': lex.make_token}, - {'name': 'type2', - 'expr': '(?:void|row|serial|varchar|float|integer|int|text|timestamptz|timestamp|datetz|date|timetz|time|boolean|bool)(?![A-Za-z0-9_])', - 'action': lex.make_token}, - - {'name': 'function', - 'expr': r'''(?:nextval|current_timestamp|current_time|current_date)(?![A-Za-z0-9_])''', - 'action': lex.make_token}, - - {'name': 'string', - 'expr': r"""'(?:\\.|[^\\'])*(?:'|$)""", - 'action': lex.make_token}, - {'name': 'quoted', - 'expr': r'''"(?:\\.|[^\\"])*(?:"|$)''', - 'action': lex.make_token}, - - {'name': 'bareword', - 'expr': r'''[A-Za-z0-9_]+''', - 'action': lex.make_token}, - - {'name': "default", - 'expr': r'\\.|.|\n', - 'action': lex.silent} - ] diff --git a/lex_text.py b/lex_text.py deleted file mode 100755 index 6968611..0000000 --- 
a/lex_text.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -import ispell, lex - -def make_token_spell(rule, m, offset): - '''return a token from a hit''' - # first let's figure out the actual word we need to check - if rule.name == 'continued word': - word = '%s%s' % (m.group(1), m.group(2)) - else: - word = m.group(0) - # okay, now we check the spelling; we don't spell-check all caps words - if ispell.can_spell() and \ - not ispell.get_speller().check(word, caps=False, title=False): - name = "misspelled %s" % rule.name - else: - name = rule.name - return(lex.Token(name, m.start() + offset, m.end() + offset, word)) - -class TextGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': 'continued word', - 'expr': r"""([a-zA-Z][a-zA-Z-']*[a-zA-Z])-\n *([a-zA-Z][a-zA-Z-]*[a-zA-Z])""", - 'action': make_token_spell}, - - {'name': 'word', - 'expr': r"""(?:[a-zA-Z][-']?)*[a-zA-Z]""", - 'action': make_token_spell, - }, - - {'name': 'stuff', - 'expr': r"""[^ \n]+""", - 'action': lex.make_token, - }, - - {'name': "default", - 'expr': r'.| |\n', - 'action': lex.silent} - ] - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - for rdir in TextGrammar.GRAMMAR_LIST: - self.add_rule(**rdir) diff --git a/lex_tt.py b/lex_tt.py deleted file mode 100755 index 2b4d505..0000000 --- a/lex_tt.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/env python - -# 2.3 imports -from optparse import OptionParser - -# our imports -import lex - -class TTGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': 'comment', - 'expr': r'''<!--(?:.|\n)*?(?:-->|$)''', - 'action': lex.make_token}, - - {'name': 'template', - 'expr': r'''\[%(?:.| |\n)*?%\]''', - #'expr': r'''\[%%\]''', - 'action': lex.make_token}, - - {'name': 'ltb', - 'expr': r'<![^>]*>', - 'action': lex.make_token}, - - {'name': 'ltq', - 'expr': r'<\?', - 'action': lex.make_token}, - - {'name': 'gtq', - 'expr': r'\?>', - 'action': lex.make_token}, - - {'name': 'ltc', - 'expr': r'</', - 'action': lex.make_token}, - - {'name': 'lt', - 'expr': r'<', - 'action': lex.make_token}, - - {'name': 'gt', - 'expr': r'>', - 'action': lex.make_token}, - - {'name': 'nodevalue', - 'expr': r'''(?:(?<=>)|(?<=%\]))(?:[^<\[]|\n|\[(?!%.*?%\]))+?(?=(?:<|\[%))''', - 'action': lex.make_token}, - - {'name': 'whitespace', - 'expr': r'''(?: |\n)+''', - 'action': lex.silent}, - - {'name': 'namespace', - 'expr': r'[a-zA-Z_]+:', - 'action': lex.make_token}, - - #{'name': 'xi', - # 'expr': r'xi:', - # 'action': lex.make_token}, - - {'name': 'opentag', - 'expr': r'(?:(?<=<)|(?<=xi:))[^ >\n/]+', - 'action': lex.make_token}, - - {'name': 'attrvalue', - 'expr': r'''(?<==)"(?:\\.|[^"\\])*(?:"|\\?$)|(?<==)'(?:\\.|[^'\\])*(?:'|\\?$)''', - 'action': lex.make_token}, - - {'name': 'attrname', - 'expr': r'[^ \n=>]+(?:(?==)|$)', - 'action': lex.make_token}, - - {'name': 'closetag', - 'expr': r'[^ =\n<>/]+', - 'action': lex.make_token}, - - {'name': 'default', - 'expr': r""".|\n""", - 'action': lex.silent} - ] - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - for rdir in TTGrammar.GRAMMAR_LIST: - self.add_rule(**rdir) diff --git a/lex_xml.py b/lex_xml.py deleted file mode 100755 index b403b2f..0000000 --- a/lex_xml.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/env python - -# 2.3 imports -from optparse import OptionParser - -# our imports -import lex - -class XMLGrammar(lex.Grammar): - GRAMMAR_LIST = [ - {'name': 'comment', - 'expr': r'''<!--(?:.|\n)*?(?:-->|$)''', - 'action': lex.make_token}, - - {'name': 'ltb', - 'expr': r'<![^>]*>', - 'action': lex.make_token}, - - {'name': 'ltq',
- 'expr': r'<\?', - 'action': lex.make_token}, - - {'name': 'gtq', - 'expr': r'\?>', - 'action': lex.make_token}, - - {'name': 'ltc', - 'expr': r'</', - 'action': lex.make_token}, - - {'name': 'lt', - 'expr': r'<', - 'action': lex.make_token}, - - {'name': 'gt', - 'expr': r'>', - 'action': lex.make_token}, - - {'name': 'nodevalue', - 'expr': r'''(?<=>)(?:[^<]|\n)+?(?=<)''', - 'action': lex.make_token}, - - {'name': 'whitespace', - 'expr': r'''(?: |\n)+''', - 'action': lex.silent}, - - {'name': 'namespace', - 'expr': r'[a-zA-Z_]+:', - 'action': lex.make_token}, - - #{'name': 'xi', - # 'expr': r'xi:', - # 'action': lex.make_token}, - - {'name': 'opentag', - 'expr': r'(?:(?<=<)|(?<=xi:))[^ >\n/]+', - 'action': lex.make_token}, - - {'name': 'attrvalue', - 'expr': r'''(?<==)"(?:\\.|[^"\\])*(?:"|\\?$)|(?<==)'(?:\\.|[^'\\])*(?:'|\\?$)''', - 'action': lex.make_token}, - - {'name': 'attrname', - 'expr': r'[^ \n=>]+(?:(?==)|$)', - 'action': lex.make_token}, - - {'name': 'closetag', - 'expr': r'[^ =\n<>/]+', - 'action': lex.make_token}, - - {'name': 'default', - 'expr': r""".|\n""", - 'action': lex.silent} - ] - - def _default_rules(self): - """subclasses can override this to define defaults for a grammar""" - for rdir in XMLGrammar.GRAMMAR_LIST: - self.add_rule(**rdir)