parent 3b055e897a
commit 83ef208a0f
lex2.py (43 changed lines)
@@ -23,7 +23,7 @@ class Token(object):
 class Rule:
     name = 'abstract'
-    def match(self, lexer, context=[]):
+    def match(self, lexer, context=[], d={}):
         raise Exception, "%s rule cannot match!" % self.name
     def make_token(self, lexer, s, name, **vargs):
         return Token(name, lexer.y, lexer.x, s, **vargs)
@@ -32,9 +32,9 @@ class ConstantRule(Rule):
     def __init__(self, name, constant):
         assert valid_name_re.match(name), 'invalid name %r' % name
         assert name not in reserved_names, "reserved rule name: %r" % name
         self.name = name
         self.constant = constant
-    def match(self, lexer, context=[]):
+    def match(self, lexer, context=[], d={}):
         if lexer.lines[lexer.y][lexer.x:].startswith(self.constant):
             name = '.'.join(context + [self.name])
             lexer.add_token(self.make_token(lexer, self.constant, name))
@@ -50,7 +50,7 @@ class PatternRule(Rule):
         self.name = name
         self.pattern = pattern
         self.re = re.compile(pattern)
-    def match(self, lexer, context=[]):
+    def match(self, lexer, context=[], d={}):
         m = self.re.match(lexer.lines[lexer.y], lexer.x)
         if m:
             name = '.'.join(context + [self.name])
@@ -60,6 +60,28 @@ class PatternRule(Rule):
         else:
             return False
 
+class ContextPatternRule(Rule):
+    def __init__(self, name, pattern, fallback):
+        assert valid_name_re.match(name), 'invalid name %r' % name
+        assert name not in reserved_names, "reserved rule name: %r" % name
+        self.name = name
+        self.pattern = pattern
+        self.fallback = fallback
+        self.fallback_re = re.compile(fallback)
+    def match(self, lexer, context=[], d={}):
+        try:
+            r = re.compile(self.pattern % d)
+        except KeyError:
+            r = self.fallback_re
+        m = r.match(lexer.lines[lexer.y], lexer.x)
+        if m:
+            name = '.'.join(context + [self.name])
+            lexer.add_token(self.make_token(lexer, m.group(0), name))
+            lexer.x += len(m.group(0))
+            return True
+        else:
+            return False
+
 class RegionRule(Rule):
     def __init__(self, name, start, grammar, end):
         assert valid_name_re.match(name), 'invalid name %r' % name
@@ -74,11 +96,12 @@ class RegionRule(Rule):
         t = self.make_token(lexer, m.group(0), t_name)
         lexer.add_token(t)
         lexer.x += len(m.group(0))
-    def match(self, lexer, context=[]):
+    def match(self, lexer, context=[], d={}):
         m = self.start_re.match(lexer.lines[lexer.y], lexer.x)
         # see if we can match out start token
         if m:
             # ok, so create our start token, and get ready to start reading data
+            d = m.groupdict()
             self._add_from_regex(context, 'start', lexer, m)
             null_t_name = '.'.join(context + [self.name, 'null'])
             null_t = None
@@ -87,7 +110,7 @@ class RegionRule(Rule):
             # reference named groups from the start token. if we have no end,
             # well, then, we're never getting out of here alive!
             if self.end:
-                end_re = re.compile(self.end % m.groupdict())
+                end_re = re.compile(self.end % d)
 
             # ok, so as long as we aren't done (we haven't found an end token),
             # keep reading input
@@ -117,7 +140,7 @@ class RegionRule(Rule):
                     # find a token, note that we found one and exit the loop
                     found = False
                     for rule in self.grammar.rules:
-                        if rule.match(lexer, context + [self.name]):
+                        if rule.match(lexer, context + [self.name], d):
                             found = True
                             null_t = None
                             break
@@ -166,7 +189,7 @@ class DualRegionRule(Rule):
         t = self.make_token(lexer, m.group(0), t_name)
         lexer.add_token(t)
         lexer.x += len(m.group(0))
-    def match(self, lexer, context=[]):
+    def match(self, lexer, context=[], d={}):
         m1 = self.start_re.match(lexer.lines[lexer.y], lexer.x)
         # see if we can match out start token
         if m1:
@@ -208,7 +231,7 @@ class DualRegionRule(Rule):
                     # find a token, note that we found one and exit the loop
                     found = False
                     for rule in self.grammar1.rules:
-                        if rule.match(lexer, context + [self.name]):
+                        if rule.match(lexer, context + [self.name], d1):
                             found = True
                             null_t = None
                             break
@@ -267,7 +290,7 @@ class DualRegionRule(Rule):
                     # find a token, note that we found one and exit the loop
                     found = False
                     for rule in self.grammar2.rules:
-                        if rule.match(lexer, context + [self.name]):
+                        if rule.match(lexer, context + [self.name], d3):
                             found = True
                             null_t = None
                             break
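In short: Rule.match() gains an optional dict `d` of named groups captured by the enclosing region's start match. RegionRule.match() grabs it once via m.groupdict(), builds the end regex from it, and forwards it to every nested rule; the new ContextPatternRule interpolates it into its pattern and falls back to a static regex when the group is missing. A minimal standalone sketch of that fallback logic (my own illustration; only the two patterns are taken from this diff):

```python
import re

# Patterns copied from the system_scalar rule in lex2_perl.py below; 'delim'
# is the named group that the regex-region start patterns capture.
pattern  = r"\$[^A-Za-z0-9 %(delim)s](?![A-Za-z0-9_])"
fallback = r"\$[^A-Za-z0-9 ](?![A-Za-z0-9_])"

def pick_regex(d):
    # Same shape as ContextPatternRule.match(): interpolate the context dict,
    # fall back to the static pattern when 'delim' is not in scope.
    try:
        return re.compile(pattern % d)
    except KeyError:
        return re.compile(fallback)

pick_regex({'delim': '/'})   # '/' joins the excluded character class
pick_regex({})               # KeyError, so the fallback regex is used
```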
lex2_perl.py (50 changed lines)
@@ -1,4 +1,4 @@
-from lex2 import Grammar, ConstantRule, PatternRule, RegionRule, DualRegionRule
+from lex2 import Grammar, ConstantRule, PatternRule, ContextPatternRule, RegionRule, DualRegionRule
 
 class PodGrammar(Grammar):
     rules = [
@@ -34,17 +34,14 @@ class StringGrammar(Grammar):
             name=r'hash_deref',
             pattern=r"\$\$*[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*(?:->{(?:[a-zA-Z_][a-zA-Z_0-9]*|'(?:\\.|[^'\\])*'|\"(\\.|[^\\\"])*\")})+",
         ),
-        #PatternRule(
-        #    name=r'hash_bareword_index',
-        #    pattern=r'(?<={) *[A-Za-z0-9_]+(?=})',
-        #),
         PatternRule(
             name=r'length_scalar',
             pattern=r"\$#[A-Za-z0-9_](?:[A-Za-z0-9_]|::)*",
         ),
-        PatternRule(
+        ContextPatternRule(
             name=r'system_scalar',
-            pattern=r"\$[][><ab/'\"_@\?#\$!%^|&*()](?![A-Za-z0-9_])",
+            pattern=r"\$[^A-Za-z0-9 %(delim)s](?![A-Za-z0-9_])",
+            fallback=r"\$[^A-Za-z0-9 ](?![A-Za-z0-9_])",
         ),
         PatternRule(
             name=r'system_array',
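The effect on system_scalar (my reading, with an assumed delimiter of '/'): inside a /.../ region the delimiter is excluded from the character class, so a token like $/ can no longer swallow the region's terminator, while the fallback still matches $/ in ordinary code. A quick check with plain re:

```python
import re

d = {'delim': '/'}   # hypothetical groups from a m/.../ start match
contextual = re.compile(r"\$[^A-Za-z0-9 %(delim)s](?![A-Za-z0-9_])" % d)
fallback   = re.compile(r"\$[^A-Za-z0-9 ](?![A-Za-z0-9_])")

print(contextual.match('$/'))           # None: '/' is excluded inside the region
print(fallback.match('$/').group(0))    # '$/': still matches outside a region
print(contextual.match('$!').group(0))  # '$!': other system scalars are unaffected
```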
@@ -186,31 +183,43 @@ class PerlGrammar(Grammar):
             grammar=Grammar(),
             end=r'\]',
         ),
+
+        # match regexes
         RegionRule(
-            name=r'implicit_match_regex',
-            start=r'(?:(?<==~)|(?<=!~)|(?<=\()) */',
+            name=r'match_regex1',
+            start=r'(?:(?<==~)|(?<=!~)|(?<=\()) *(?P<delim>/)',
             grammar=StringGrammar(),
-            end=r'/',
+            end=r'/[a-z]*',
         ),
         RegionRule(
-            name=r'explicit_match_regex1',
+            name=r'match_regex2',
             start=r'm *(?P<delim>[^ #a-zA-Z0-9_])',
             grammar=StringGrammar(),
-            end=r'%(delim)s',
+            end=r'%(delim)s[a-z]*',
         ),
         RegionRule(
-            name=r'explicit_match_regex1',
-            start=r'm#',
+            name=r'match_regex3',
+            start=r'm(?P<delim>#)',
             grammar=StringGrammar(),
-            end=r'#',
+            end=r'#[a-z]*',
+        ),
+
+        # replace regexes
+        DualRegionRule(
+            name=r'replace_regex1',
+            start=r's *(?P<delim>[^ a-zA-Z0-9_])',
+            grammar1=StringGrammar(),
+            middle=r'%(delim)s',
+            grammar2=StringGrammar(),
+            end=r'%(delim)s[a-z]*',
         ),
         DualRegionRule(
-            name=r'replace_regex',
-            start=r's */',
+            name=r'replace_regex2',
+            start=r's#',
             grammar1=StringGrammar(),
-            middle=r' */ *',
+            middle=r'#',
             grammar2=StringGrammar(),
-            end=r'/ *[a-z]*',
+            end=r'#[a-z]*',
         ),
 
         PatternRule(
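A short walk-through of what the renamed match rules now do (the example input is my own; the start and end patterns are the ones above): the start regex captures the delimiter into the 'delim' group, and the interpolated end regex also consumes trailing modifiers such as i or g.

```python
import re

start    = re.compile(r'm *(?P<delim>[^ #a-zA-Z0-9_])')  # match_regex2 start
end_tmpl = r'%(delim)s[a-z]*'                             # match_regex2 end

line = 'm!foo bar!i'                           # hypothetical Perl source line
m = start.match(line)
d = m.groupdict()                              # {'delim': '!'}
end_re = re.compile(end_tmpl % d)              # r'![a-z]*'
print(end_re.search(line, m.end()).group(0))   # '!i': closing delimiter plus modifier
```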
@@ -219,7 +228,8 @@ class PerlGrammar(Grammar):
         ),
         PatternRule(
             name=r'sub',
-            pattern=r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?= *{)",
+            #pattern=r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?= *{)",
+            pattern=r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*",
         ),
         PatternRule(
             name=r'use',
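The relaxed sub pattern drops the (?= *{) lookahead, presumably so that a subroutine whose opening brace sits on a later line still gets its name tokenized (the lexer only ever sees one line at a time). A small check of the two regexes:

```python
import re

old = re.compile(r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*(?= *{)")
new = re.compile(r"(?<=sub )[a-zA-Z_][a-zA-Z_0-9]*")

line = 'sub frobnicate'            # hypothetical: the brace is on the next line
print(old.search(line))            # None: the lookahead wants '{' on this line
print(new.search(line).group(0))   # 'frobnicate'
```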
test2.py (2 changed lines)
@@ -15,4 +15,4 @@ for path in paths:
     lexer.lex(lines)
     print path
     for token in lexer:
-        print '%-28s| %s' % (token.name, token.string)
+        print '%-28s| %r' % (token.name, token.string)
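Switching the format from %s to %r presumably makes whitespace and escapes inside token strings visible in the test output, e.g. (hypothetical token values):

```python
name, string = 'string.null', 'foo\n'
print('%-28s| %s' % (name, string))   # the trailing newline is invisible
print('%-28s| %r' % (name, string))   # prints: string.null                 | 'foo\n'
```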
test3.py (46 changed lines)
@@ -17,12 +17,17 @@ for i in range(0, len(color_list)):
     color_dict[color_names[i]] = color_list[i]
 
 token_colors = {
+    'escaped': 'lpurple',
     'null': 'white',
     'delimiter': 'white',
     'pod.start': 'lred',
     'pod.null': 'lred',
     'pod.end': 'lred',
     'pod.header': 'lpurple',
+    'pod.indent_level': 'lpurple',
+    'pod.item_entry': 'lpurple',
+    'pod.format': 'lpurple',
+    'pod.encoding_type': 'lpurple',
     'sub': 'lcyan',
     'number': 'white',
     'operator': 'white',
@@ -54,12 +59,39 @@ token_colors = {
     'array': 'yellow',
     'hash': 'yellow',
     'bareword_hash_index': 'lgreen',
-    'quoted_region': 'lcyan',
-    'match_regex': 'lcyan',
-    'replace_regex.start': 'lcyan',
-    'replace_regex.middle': 'lcyan',
-    'replace_regex.end': 'lcyan',
-    'replace_regex.null': 'lcyan',
+
+    # quoted region
+    'quoted_region1': 'lcyan',
+    'quoted_region1.start': 'lcyan',
+    'quoted_region1.null': 'lcyan',
+    'quoted_region1.end': 'lcyan',
+    'quoted_region2': 'lcyan',
+    'quoted_region2.start': 'lcyan',
+    'quoted_region2.null': 'lcyan',
+    'quoted_region2.end': 'lcyan',
+
+    # match regex
+    'match_regex1.start': 'lcyan',
+    'match_regex1.end': 'lcyan',
+    'match_regex1.null': 'lcyan',
+    'match_regex2.start': 'lcyan',
+    'match_regex2.end': 'lcyan',
+    'match_regex2.null': 'lcyan',
+    'match_regex3.start': 'lcyan',
+    'match_regex3.end': 'lcyan',
+    'match_regex3.null': 'lcyan',
+
+    # replace regex
+    'replace_regex1.start': 'lcyan',
+    'replace_regex1.middle': 'lcyan',
+    'replace_regex1.end': 'lcyan',
+    'replace_regex1.null': 'lcyan',
+    'replace_regex2.start': 'lcyan',
+    'replace_regex2.middle': 'lcyan',
+    'replace_regex2.end': 'lcyan',
+    'replace_regex2.null': 'lcyan',
+
+    #
     'bareword_hash_key': 'lgreen',
     'interpolated_scalar': 'yellow',
     'interpolated_system_scalar': 'yellow',
@@ -76,7 +108,7 @@ token_colors = {
     'static_method': 'lcyan',
     'builtin_method': 'lpurple',
     'bareword_method': 'lcyan',
     #'bareword': 'yellow',
     'bizzaro': 'lpurple',
 }