regex test
This commit is contained in:
parent
d21a8b56f1
commit
8173f99b41
308
regex.tal
308
regex.tal
|
@ -36,41 +36,81 @@
|
||||||
%print { debug newline }
|
%print { debug newline }
|
||||||
%quit! { #01 #0f DEO BRK }
|
%quit! { #01 #0f DEO BRK }
|
||||||
|
|
||||||
( test -> )
|
( TESTING )
|
||||||
|
|
||||||
|
(
|
||||||
|0100
|
|0100
|
||||||
;expr1 ;compile JSR2
|
;expr1 ;compile JSR2 print
|
||||||
print
|
;emit-stack JSR2 newline
|
||||||
;emit-stack JSR2
|
;emit-arena JSR2 newline
|
||||||
;emit-arena JSR2
|
|
||||||
newline
|
|
||||||
|
|
||||||
;test1 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
;test1 OVR2k ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test2 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
;test2 OVR2k ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test3 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
;test3 OVR2k ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test4 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
;test4 OVR2k ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test5 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
;test5 OVR2k ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test6 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
;test6 OVR2k ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test7 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
;test7 OVR2k ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test8 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
;test8 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
||||||
newline
|
|
||||||
|
|
||||||
;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 space
|
||||||
;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
||||||
quit!
|
quit!
|
||||||
|
)
|
||||||
|
( TEST DATA )
|
||||||
|
|
||||||
|
(
|
||||||
|
( corresponds to regex: a(b|c)d* )
|
||||||
|
@expr1 "a(b|c)d* 00
|
||||||
|
|
||||||
|
( corresponds to regex: a(b|c)d* )
|
||||||
|
( accepts "ab" or "ac" followd by any number of d's )
|
||||||
|
@graph1
|
||||||
|
03 'a :x1
|
||||||
|
@x1 04 :x2 :x3
|
||||||
|
@x2 03 'b :x4
|
||||||
|
@x3 03 'c :x4
|
||||||
|
@x4 05 :x5 0000
|
||||||
|
@x5 03 'd :x4
|
||||||
|
|
||||||
|
( test case strings to try matching )
|
||||||
|
@test1 "ab 00 ( yes )
|
||||||
|
@test2 "acdd 00 ( yes )
|
||||||
|
@test3 "add 00 ( no )
|
||||||
|
@test4 "abd 00 ( yes )
|
||||||
|
@test5 "acddddddddddd 00 ( yes )
|
||||||
|
@test6 "bd 00 ( no )
|
||||||
|
@test7 "z 00 ( no )
|
||||||
|
@test8 00 ( no )
|
||||||
|
)
|
||||||
|
|
||||||
|
( PRINTING DATA )
|
||||||
|
|
||||||
|
@emit-short ( byte -- )
|
||||||
|
SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r
|
||||||
|
|
||||||
|
@emit-byte ( byte -- )
|
||||||
|
DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP
|
||||||
|
&hex #30 ADD DUP #39 GTH #27 MUL ADD emit
|
||||||
|
JMP2r
|
||||||
|
|
||||||
|
( ERROR HANDLING )
|
||||||
|
|
||||||
|
( using error! will print the given message before causing )
|
||||||
|
( the interpreter to halt. )
|
||||||
@error! ( msg* -> )
|
@error! ( msg* -> )
|
||||||
LIT '! emit space
|
LIT '! emit space
|
||||||
&loop LDAk ,&continue JCN ,&done JMP
|
&loop LDAk ,&continue JCN ,&done JMP
|
||||||
&continue LDAk emit INC2 ,&loop JMP
|
&continue LDAk emit INC2 ,&loop JMP
|
||||||
&done POP2 newline quit!
|
&done POP2 newline quit!
|
||||||
|
|
||||||
|
( error messages )
|
||||||
@unknown-node-type "unknown 20 "node 20 "type 00
|
@unknown-node-type "unknown 20 "node 20 "type 00
|
||||||
@mismatched-parens "mismatched 20 "parenthesis 00
|
@mismatched-parens "mismatched 20 "parenthesis 00
|
||||||
@stack-is-full "stack 20 "is 20 "full 00
|
@stack-is-full "stack 20 "is 20 "full 00
|
||||||
|
@ -78,16 +118,27 @@
|
||||||
@arena-is-full "arena 20 "is 20 "full 00
|
@arena-is-full "arena 20 "is 20 "full 00
|
||||||
@star-invariant "star 20 "invariant 20 "failed 00
|
@star-invariant "star 20 "invariant 20 "failed 00
|
||||||
|
|
||||||
|
( REGEX MATCHING )
|
||||||
|
|
||||||
|
( use stored regex to match against a stored string. )
|
||||||
|
( )
|
||||||
|
( regex* should be the address of a compiled regex )
|
||||||
|
( such as that returned from ;compile. )
|
||||||
|
( )
|
||||||
|
( str* should be a null-terminated string. )
|
||||||
|
( )
|
||||||
|
( returns true if the string, and false otherwise. )
|
||||||
@match ( str* regex* -> bool^ )
|
@match ( str* regex* -> bool^ )
|
||||||
;reset-stack JSR2
|
;reset-stack JSR2
|
||||||
;loop JMP2
|
;loop JMP2
|
||||||
|
|
||||||
|
( loop used during matching )
|
||||||
|
( )
|
||||||
( we don't use the return stack here since that )
|
( we don't use the return stack here since that )
|
||||||
( complicates the back-tracking we need to do. )
|
( complicates the back-tracking we need to do. )
|
||||||
( ultimately this code will issue a JMP2r to )
|
( ultimately this code will issue a JMP2r to )
|
||||||
( return a boolean, which is where the stack )
|
( return a boolean, which is where the stack )
|
||||||
( effects signature comes from. )
|
( effects signature comes from. )
|
||||||
|
|
||||||
@loop ( s* r* -> bool^ )
|
@loop ( s* r* -> bool^ )
|
||||||
LDAk #01 EQU ;do-empty JCN2
|
LDAk #01 EQU ;do-empty JCN2
|
||||||
LDAk #02 EQU ;do-dot JCN2
|
LDAk #02 EQU ;do-dot JCN2
|
||||||
|
@ -96,13 +147,15 @@
|
||||||
LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
|
LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
|
||||||
;unknown-node-type ;error! JSR2
|
;unknown-node-type ;error! JSR2
|
||||||
|
|
||||||
|
( used when we hit a dead-end during matching. )
|
||||||
|
( )
|
||||||
|
( if stack is non-empty we have a point we can resume from. )
|
||||||
@goto-backtrack ( -> bool^ )
|
@goto-backtrack ( -> bool^ )
|
||||||
;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
|
;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
|
||||||
#00 JMP2r ( no, return false )
|
#00 JMP2r ( no, return false )
|
||||||
&has-stack
|
&has-stack ;pop4 JSR2 ;goto-next JMP2 ( yes, resume from the top )
|
||||||
;pop4 JSR2
|
|
||||||
;goto-next JMP2 ( yes, resume from the top )
|
|
||||||
|
|
||||||
|
( follow the given address (next*) to continue matching )
|
||||||
@goto-next ( str* next* -> bool^ )
|
@goto-next ( str* next* -> bool^ )
|
||||||
DUP2 #0000 GTH2 ,&has-next JCN
|
DUP2 #0000 GTH2 ,&has-next JCN
|
||||||
POP2 LDA null? ,&end-of-string JCN
|
POP2 LDA null? ,&end-of-string JCN
|
||||||
|
@ -110,16 +163,19 @@
|
||||||
&end-of-string #01 JMP2r
|
&end-of-string #01 JMP2r
|
||||||
&has-next ;loop JMP2
|
&has-next ;loop JMP2
|
||||||
|
|
||||||
|
( handle the empty node -- just follow the next pointer )
|
||||||
@do-empty ( str* regex* -> bool^ )
|
@do-empty ( str* regex* -> bool^ )
|
||||||
INC2 LDA2 ( load next )
|
INC2 LDA2 ( load next )
|
||||||
;goto-next JMP2 ( jump to next )
|
;goto-next JMP2 ( jump to next )
|
||||||
|
|
||||||
|
( handle dot -- match any one character )
|
||||||
@do-dot ( str* regex* -> bool^ )
|
@do-dot ( str* regex* -> bool^ )
|
||||||
INC2 LDA2 STH2 ( load and stash next )
|
INC2 LDA2 STH2 ( load and stash next )
|
||||||
LDAk #00 NEQ ,&non-empty JCN ( is there a char? )
|
LDAk #00 NEQ ,&non-empty JCN ( is there a char? )
|
||||||
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
|
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
|
||||||
&non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
|
&non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
|
||||||
|
|
||||||
|
( handle literal -- match one specific character )
|
||||||
@do-literal ( str* regex* -> bool^ )
|
@do-literal ( str* regex* -> bool^ )
|
||||||
INC2
|
INC2
|
||||||
LDAk STH ( store c )
|
LDAk STH ( store c )
|
||||||
|
@ -130,19 +186,23 @@
|
||||||
&matches
|
&matches
|
||||||
INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
|
INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
|
||||||
|
|
||||||
|
( handle or -- try the left branch but backtrack to the right if needed )
|
||||||
|
( )
|
||||||
( this also handles asteration, since it ends up having the same structure )
|
( this also handles asteration, since it ends up having the same structure )
|
||||||
@do-or ( str* regex* -> bool^ )
|
@do-or ( str* regex* -> bool^ )
|
||||||
INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 )
|
INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 )
|
||||||
LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking )
|
LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking )
|
||||||
LDA2 ;loop JMP2 ( continue on left branch )
|
LDA2 ;loop JMP2 ( continue on left branch )
|
||||||
|
|
||||||
|
( REGEX PARSING )
|
||||||
|
|
||||||
( track the position in the input string )
|
( track the position in the input string )
|
||||||
@pos $2
|
@pos $2
|
||||||
|
|
||||||
( track how many levels deep we are in parenthesis )
|
( track how many levels deep we are in parenthesis )
|
||||||
@parens $2
|
@parens $2
|
||||||
|
|
||||||
( read and increments pos )
|
( read and increment pos )
|
||||||
@read ( -> c^ )
|
@read ( -> c^ )
|
||||||
;pos LDA2k ( pos s )
|
;pos LDA2k ( pos s )
|
||||||
LDAk STHk #00 EQU ( pos s c=0 [c] )
|
LDAk STHk #00 EQU ( pos s c=0 [c] )
|
||||||
|
@ -151,25 +211,24 @@
|
||||||
SWP2 STA2 ,&return JMP ( [c] )
|
SWP2 STA2 ,&return JMP ( [c] )
|
||||||
&is-eof POP2 POP2
|
&is-eof POP2 POP2
|
||||||
&return STHr ( c )
|
&return STHr ( c )
|
||||||
;pos LDA2 ;emit-short JSR2 LIT '> emit DUP ;emit-byte JSR2 newline
|
|
||||||
JMP2r
|
JMP2r
|
||||||
|
|
||||||
( read pos )
|
( is pos currently pointing to a star? )
|
||||||
@peek ( -> c^ )
|
|
||||||
;pos LDA2 LDA JMP2r
|
|
||||||
|
|
||||||
@peek-to-star ( -> is-star^ )
|
@peek-to-star ( -> is-star^ )
|
||||||
;pos LDA2 LDA LIT '* EQU JMP2r
|
;pos LDA2 LDA LIT '* EQU JMP2r
|
||||||
|
|
||||||
( pos += 1 )
|
( just increment pos )
|
||||||
@skip
|
@skip
|
||||||
;pos LDA2 ;emit-short JSR2 LIT '! emit
|
;pos LDA2 INC2 ;pos STA2 JMP2r
|
||||||
;pos LDA2 INC2 ;pos STA2
|
|
||||||
;pos LDA2 ;emit-short JSR2 newline
|
|
||||||
JMP2r
|
|
||||||
|
|
||||||
( TODO: [] + ? )
|
( TODO: [] + ? )
|
||||||
( compile an expression string into a regex graph )
|
( compile an expression string into a regex graph )
|
||||||
|
( )
|
||||||
|
( the regex will be allocated in the arena; if there is not )
|
||||||
|
( sufficient space an error will be thrown. )
|
||||||
|
( )
|
||||||
|
( the stack will also be used during parsing although unlike )
|
||||||
|
( the arena it will be released once compilation ends. )
|
||||||
@compile ( expr* -> regex* )
|
@compile ( expr* -> regex* )
|
||||||
;pos STA2
|
;pos STA2
|
||||||
#0000 ;parens STA2
|
#0000 ;parens STA2
|
||||||
|
@ -201,28 +260,59 @@
|
||||||
DUP LIT '* EQU ;c-star JCN2
|
DUP LIT '* EQU ;c-star JCN2
|
||||||
;c-char JMP2
|
;c-char JMP2
|
||||||
|
|
||||||
|
( either finalize the given r0/r1 or else wrap it in )
|
||||||
|
( a star node if a star is coming up next. )
|
||||||
|
( )
|
||||||
|
( we use this look-ahead approach rather than compiling )
|
||||||
|
( star nodes directly since the implementation is simpler. )
|
||||||
@c-peek-and-finalize ( r0* r1* -> r2* )
|
@c-peek-and-finalize ( r0* r1* -> r2* )
|
||||||
;peek-to-star JSR2 ( r0 r1 next-is-star? )
|
;peek-to-star JSR2 ( r0 r1 next-is-star? )
|
||||||
,&next-is-star JCN ,&finally JMP ( r0 r1 )
|
,&next-is-star JCN ,&finally JMP ( r0 r1 )
|
||||||
&next-is-star ;skip JSR2 POP2 ;alloc-star JSR2 DUP2 ( star )
|
&next-is-star ;skip JSR2 POP2 ;alloc-star JSR2 DUP2 ( star )
|
||||||
&finally ;push-next JSR2 ;compile-region-loop JMP2
|
&finally ;push-next JSR2 ;compile-region-loop JMP2
|
||||||
|
|
||||||
|
( called when we reach EOF of the input string )
|
||||||
|
( )
|
||||||
|
( as with c-rpar we have to unroll the current level )
|
||||||
|
( of the stack, building any or-nodes that are needed. )
|
||||||
|
( )
|
||||||
|
( this is where we detect unclosed parenthesis. )
|
||||||
@c-done ( c^ -> r2* )
|
@c-done ( c^ -> r2* )
|
||||||
POP
|
POP
|
||||||
;parens LDA2 #0000 GTH2 ,&mismatched-parens JCN
|
;parens LDA2 #0000 GTH2 ,&mismatched-parens JCN
|
||||||
;unroll-stack JSR2 POP2 JMP2r
|
;unroll-stack JSR2 POP2 JMP2r
|
||||||
&mismatched-parens ;mismatched-parens ;error! JSR2
|
&mismatched-parens ;mismatched-parens ;error! JSR2
|
||||||
|
|
||||||
|
( called when we read "|" )
|
||||||
|
( )
|
||||||
|
( since we defer building or-nodes until the end of the region )
|
||||||
|
( we just start a new stack frame and continue. )
|
||||||
@c-or ( c^ -> r2* )
|
@c-or ( c^ -> r2* )
|
||||||
POP
|
POP
|
||||||
#0000 #0000 ;push4 JSR2
|
#0000 #0000 ;push4 JSR2
|
||||||
;compile-region-loop JMP2
|
;compile-region-loop JMP2
|
||||||
|
|
||||||
|
( called when we read "(" )
|
||||||
|
( )
|
||||||
|
( this causes us to: )
|
||||||
|
( )
|
||||||
|
( 1. increment parens )
|
||||||
|
( 2. start a new region on the stack )
|
||||||
|
( 3. jump to compile-region to start parsing the new region )
|
||||||
@c-lpar ( c^ -> r2* )
|
@c-lpar ( c^ -> r2* )
|
||||||
POP
|
POP
|
||||||
;parens LDA2 INC2 ;parens STA2 ( parens++ )
|
;parens LDA2 INC2 ;parens STA2 ( parens++ )
|
||||||
;compile-region JMP2
|
;compile-region JMP2
|
||||||
|
|
||||||
|
( called when we read ")" )
|
||||||
|
( )
|
||||||
|
( this causes us to: )
|
||||||
|
( )
|
||||||
|
( 1. check for mismatched parens )
|
||||||
|
( 2. decrement parens )
|
||||||
|
( 3. unroll the current region on the stack into one regex node )
|
||||||
|
( 4. finalize that node and append it to the previous region )
|
||||||
|
( 5. continue parsing )
|
||||||
@c-rpar ( c^ -> r2* )
|
@c-rpar ( c^ -> r2* )
|
||||||
POP
|
POP
|
||||||
;parens LDA2 #0000 EQU2 ,&mismatched-parens JCN
|
;parens LDA2 #0000 EQU2 ,&mismatched-parens JCN
|
||||||
|
@ -231,36 +321,48 @@
|
||||||
;c-peek-and-finalize JMP2
|
;c-peek-and-finalize JMP2
|
||||||
&mismatched-parens ;mismatched-parens ;error! JSR2
|
&mismatched-parens ;mismatched-parens ;error! JSR2
|
||||||
|
|
||||||
|
( called when we read "." )
|
||||||
|
( )
|
||||||
|
( allocates a dot-node and continues. )
|
||||||
@c-dot ( c^ -> r2* )
|
@c-dot ( c^ -> r2* )
|
||||||
POP
|
POP
|
||||||
;alloc-dot JSR2 ( dot )
|
;alloc-dot JSR2 ( dot )
|
||||||
DUP2 ;c-peek-and-finalize JMP2
|
DUP2 ;c-peek-and-finalize JMP2
|
||||||
|
|
||||||
@c-char ( c^ -> r2* )
|
|
||||||
;alloc-lit JSR2 ( lit )
|
|
||||||
DUP2 ;c-peek-and-finalize JMP2
|
|
||||||
|
|
||||||
( TODO: escaping rules not quite right )
|
( TODO: escaping rules not quite right )
|
||||||
|
|
||||||
|
( called when we read "\" )
|
||||||
|
( )
|
||||||
|
( allocates a literal of the next character. )
|
||||||
|
( )
|
||||||
|
( this doesn't currently handle any special escape sequences. )
|
||||||
@c-esc ( c^ -> r2* )
|
@c-esc ( c^ -> r2* )
|
||||||
POP
|
POP
|
||||||
;read JSR2
|
;read JSR2
|
||||||
;c-char JMP2
|
;c-char JMP2
|
||||||
|
|
||||||
( we don't expect to actually handle this )
|
( called when we read any other character )
|
||||||
|
( )
|
||||||
|
( allocates a literal-node and continues. )
|
||||||
|
@c-char ( c^ -> r2* )
|
||||||
|
;alloc-lit JSR2 ( lit )
|
||||||
|
DUP2 ;c-peek-and-finalize JMP2
|
||||||
|
|
||||||
|
( called if we parse a "*" )
|
||||||
|
( )
|
||||||
|
( actually calling this means the code broke an invariant somewhere. )
|
||||||
@c-star ( c^ -> regex* )
|
@c-star ( c^ -> regex* )
|
||||||
POP
|
POP
|
||||||
;star-invariant ;error! JSR2
|
;star-invariant ;error! JSR2
|
||||||
|
|
||||||
|
( ALLOCATING REGEX NDOES )
|
||||||
( allocate node types ------ )
|
|
||||||
|
|
||||||
@alloc3 ( mode^ -> r* )
|
@alloc3 ( mode^ -> r* )
|
||||||
#0000 ROT ( 00 00 mode^ )
|
#0000 ROT ( 00 00 mode^ )
|
||||||
#03 ;alloc JSR2 ( 00 00 mode^ addr* )
|
#03 ;alloc JSR2 ( 00 00 mode^ addr* )
|
||||||
STH2k STA
|
STH2k STA ( addr <- mode )
|
||||||
STH2kr INC2 STA2
|
STH2kr INC2 STA2 ( addr+1 <- 0000 )
|
||||||
STH2r
|
STH2r JMP2r ( return addr )
|
||||||
JMP2r
|
|
||||||
|
|
||||||
@alloc-empty ( -> r* )
|
@alloc-empty ( -> r* )
|
||||||
#01 ;alloc3 JMP2
|
#01 ;alloc3 JMP2
|
||||||
|
@ -269,13 +371,12 @@
|
||||||
#02 ;alloc3 JMP2
|
#02 ;alloc3 JMP2
|
||||||
|
|
||||||
@alloc-lit ( c^ -> r* )
|
@alloc-lit ( c^ -> r* )
|
||||||
#03 #0000 SWP2
|
#03 #0000 SWP2 ( 0000 c^ 03 )
|
||||||
#04 ;alloc JSR2
|
#04 ;alloc JSR2 ( 0000 c^ 03 addr* )
|
||||||
STH2k STA
|
STH2k STA ( addr <- 03 )
|
||||||
STH2kr INC2 STA
|
STH2kr INC2 STA ( addr+1 <- c )
|
||||||
STH2kr #0002 ADD2 STA2
|
STH2kr #0002 ADD2 STA2 ( addr+2 <- 0000 )
|
||||||
STH2r
|
STH2r JMP2r ( return addr )
|
||||||
JMP2r
|
|
||||||
|
|
||||||
@alloc-or ( right* left* -> r* )
|
@alloc-or ( right* left* -> r* )
|
||||||
#05 ;alloc JSR2 STH2 ( r l [x] )
|
#05 ;alloc JSR2 STH2 ( r l [x] )
|
||||||
|
@ -293,7 +394,7 @@
|
||||||
;set-next JSR2 ( [r] )
|
;set-next JSR2 ( [r] )
|
||||||
STH2r JMP2r
|
STH2r JMP2r
|
||||||
|
|
||||||
( unroll one part of the parsing stack, returning )
|
( unroll one region of the parsing stack, returning )
|
||||||
( a single node consisting of an alternation of )
|
( a single node consisting of an alternation of )
|
||||||
( all elements on the stack. )
|
( all elements on the stack. )
|
||||||
( )
|
( )
|
||||||
|
@ -332,6 +433,10 @@
|
||||||
JMP2r
|
JMP2r
|
||||||
&is-zero POP2 POP2 ;push4 JSR2 JMP2r
|
&is-zero POP2 POP2 ;push4 JSR2 JMP2r
|
||||||
|
|
||||||
|
( load the given address: )
|
||||||
|
( )
|
||||||
|
( 1. if it points to 0000, update it to target )
|
||||||
|
( 2. otherwise, call set-next on it )
|
||||||
@set-next-addr ( target* addr* -> )
|
@set-next-addr ( target* addr* -> )
|
||||||
LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
|
LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
|
||||||
LDA2 ;set-next JSR2 JMP2r
|
LDA2 ;set-next JSR2 JMP2r
|
||||||
|
@ -349,60 +454,15 @@
|
||||||
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
|
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
|
||||||
&!5 ;unknown-node-type ;error! JSR2
|
&!5 ;unknown-node-type ;error! JSR2
|
||||||
|
|
||||||
@remove-addr ( target* addr* -> )
|
( STACK OPERATIONS )
|
||||||
LDA2k #0000 EQU2 ( t a v=0? ) ,&is-zero JCN
|
( )
|
||||||
OVR2 OVR2 LDA2 EQU2 ( t a t=v? ) ,&is-equal JCN
|
( we always push/pop 4 bytes at a time. the stack has a fixed )
|
||||||
LDA2 ( t v ) ;remove-from JSR2 JMP2r
|
( maximum size it can use, defined by ;stack-top. )
|
||||||
&is-zero POP2 POP2 JMP2r
|
( )
|
||||||
&is-equal NIP2 #0000 SWP2 STA2 JMP2r
|
( the stack can be cleared using ;reset-stack, which resets )
|
||||||
|
( the stack pointers but does not zero out any memory. )
|
||||||
( remove target from regex )
|
|
||||||
@remove-from ( target* regex* -> )
|
|
||||||
LDAk #01 NEQ ,&!1 JCN INC2 ;remove-addr JSR2 JMP2r
|
|
||||||
&!1 LDAk #02 NEQ ,&!2 JCN INC2 ;remove-addr JSR2 JMP2r
|
|
||||||
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;remove-addr JSR2 JMP2r
|
|
||||||
&!3 LDAk #04 NEQ ,&!4 JCN
|
|
||||||
OVR2 OVR2 INC2 ;remove-addr JSR2
|
|
||||||
#0003 ADD2 ;remove-addr JSR2 JMP2r
|
|
||||||
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;remove-addr JSR2 JMP2r
|
|
||||||
&!5 ;unknown-node-type ;error! JSR2
|
|
||||||
|
|
||||||
( test cases -------- )
|
|
||||||
|
|
||||||
( corresponds to regex: a(b|c)d* )
|
|
||||||
@expr1 "a(b|c)d* 00
|
|
||||||
|
|
||||||
( corresponds to regex: a(b|c)d* )
|
|
||||||
( accepts "ab" or "ac" followd by any number of d's )
|
|
||||||
@graph1
|
|
||||||
03 'a :x1
|
|
||||||
@x1 04 :x2 :x3
|
|
||||||
@x2 03 'b :x4
|
|
||||||
@x3 03 'c :x4
|
|
||||||
@x4 05 :x5 0000
|
|
||||||
@x5 03 'd :x4
|
|
||||||
|
|
||||||
@test1 "ab 00 ( yes )
|
|
||||||
@test2 "acdd 00 ( yes )
|
|
||||||
@test3 "add 00 ( no )
|
|
||||||
@test4 "abd 00 ( yes )
|
|
||||||
@test5 "acddddddddddd 00 ( yes )
|
|
||||||
@test6 "bd 00 ( no )
|
|
||||||
@test7 "z 00 ( no )
|
|
||||||
@test8 00 ( no )
|
|
||||||
|
|
||||||
( emit byte/short )
|
|
||||||
|
|
||||||
@emit-short ( byte -- )
|
|
||||||
SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r
|
|
||||||
|
|
||||||
@emit-byte ( byte -- )
|
|
||||||
DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP
|
|
||||||
&hex #30 ADD DUP #39 GTH #27 MUL ADD emit
|
|
||||||
JMP2r
|
|
||||||
|
|
||||||
( stack operations ---- )
|
|
||||||
|
|
||||||
|
( push 4 bytes onto the stack )
|
||||||
@push4 ( str* regex* -> )
|
@push4 ( str* regex* -> )
|
||||||
;assert-stack-avail JSR2 ( check for space )
|
;assert-stack-avail JSR2 ( check for space )
|
||||||
;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex )
|
;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex )
|
||||||
|
@ -410,6 +470,7 @@
|
||||||
;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 )
|
;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 )
|
||||||
JMP2r
|
JMP2r
|
||||||
|
|
||||||
|
( pop 4 bytes from the stack )
|
||||||
@pop4 ( -> str* regex* )
|
@pop4 ( -> str* regex* )
|
||||||
;assert-stack-exist JSR2 ( check for space )
|
;assert-stack-exist JSR2 ( check for space )
|
||||||
;stack-pos LDA2 ( load stack-pos )
|
;stack-pos LDA2 ( load stack-pos )
|
||||||
|
@ -419,18 +480,27 @@
|
||||||
STH2r STH2r ( restore str and regex )
|
STH2r STH2r ( restore str and regex )
|
||||||
JMP2r
|
JMP2r
|
||||||
|
|
||||||
|
( reset stack pointers )
|
||||||
@reset-stack ( -> )
|
@reset-stack ( -> )
|
||||||
;stack-bot ;stack-pos STA2 JMP2r ( pos <- 0 )
|
;stack-bot ;stack-pos STA2 JMP2r ( pos <- 0 )
|
||||||
|
|
||||||
|
( can more stack be allocated? )
|
||||||
@stack-avail ( -> bool^ )
|
@stack-avail ( -> bool^ )
|
||||||
;stack-pos LDA2 ;stack-top LTH2 JMP2r
|
;stack-pos LDA2 ;stack-top LTH2 JMP2r
|
||||||
|
|
||||||
|
( is the stack non-empty? )
|
||||||
@stack-exist ( -> bool^ )
|
@stack-exist ( -> bool^ )
|
||||||
;stack-pos LDA2 ;stack-bot GTH2 JMP2r
|
;stack-pos LDA2 ;stack-bot GTH2 JMP2r
|
||||||
|
|
||||||
|
( error if stack is full )
|
||||||
@assert-stack-avail ( -> )
|
@assert-stack-avail ( -> )
|
||||||
;stack-avail JSR2 ,&ok JCN ;stack-is-full ;error! JSR2 &ok JMP2r
|
;stack-avail JSR2 ,&ok JCN ;stack-is-full ;error! JSR2 &ok JMP2r
|
||||||
|
|
||||||
|
( error is stack is empty )
|
||||||
@assert-stack-exist ( -> )
|
@assert-stack-exist ( -> )
|
||||||
;stack-exist JSR2 ,&ok JCN ;stack-is-empty ;error! JSR2 &ok JMP2r
|
;stack-exist JSR2 ,&ok JCN ;stack-is-empty ;error! JSR2 &ok JMP2r
|
||||||
|
|
||||||
|
( print stack size, followed by contents )
|
||||||
@emit-stack ( -> )
|
@emit-stack ( -> )
|
||||||
space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit
|
space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit
|
||||||
;stack-bot
|
;stack-bot
|
||||||
|
@ -441,14 +511,31 @@
|
||||||
space LDA2k ;emit-short JSR2
|
space LDA2k ;emit-short JSR2
|
||||||
#0002 ADD2 ,&loop JMP
|
#0002 ADD2 ,&loop JMP
|
||||||
|
|
||||||
|
( stack-pos points to the next free stack position (or the top if full). )
|
||||||
@stack-pos :stack-bot ( the next position to insert at )
|
@stack-pos :stack-bot ( the next position to insert at )
|
||||||
|
|
||||||
|
( stack-bot is the address of the first stack position. )
|
||||||
|
( stack-top is the address of the first byte beyond the stack. )
|
||||||
@stack-bot $1000 @stack-top ( holds 1024 steps (4096 bytes) )
|
@stack-bot $1000 @stack-top ( holds 1024 steps (4096 bytes) )
|
||||||
|
|
||||||
( arena operations ---- )
|
( ARENA OPERATIONS )
|
||||||
|
( )
|
||||||
|
( the arena represents a heap of memory that can easily be )
|
||||||
|
( allocated in small amounts. )
|
||||||
|
( )
|
||||||
|
( the entire arena can be reclaimed using ;reset-arena, but )
|
||||||
|
( unlike systems such as malloc/free, the arena cannot relcaim )
|
||||||
|
( smaller amounts of memory. )
|
||||||
|
( )
|
||||||
|
( the arena is used to allocate regex graph nodes, which are )
|
||||||
|
( dynamically-allocated as the regex string is parsed. once )
|
||||||
|
( a regex is no longer needed the arena may be reclaimed. )
|
||||||
|
|
||||||
|
( reclaim all the memory used by the arena )
|
||||||
@reset-arena ( -> )
|
@reset-arena ( -> )
|
||||||
;arena-bot ;arena-pos STA2 JMP2r
|
;arena-bot ;arena-pos STA2 JMP2r
|
||||||
|
|
||||||
|
( currently caller is responsible for zeroing out memory if needed )
|
||||||
@alloc ( size^ -> addr* )
|
@alloc ( size^ -> addr* )
|
||||||
#00 SWP ( size* )
|
#00 SWP ( size* )
|
||||||
;arena-pos LDA2 STH2k ADD2 ( pos+size* [pos] )
|
;arena-pos LDA2 STH2k ADD2 ( pos+size* [pos] )
|
||||||
|
@ -462,6 +549,7 @@
|
||||||
@arena-pos :arena-bot ( the next position to allocate )
|
@arena-pos :arena-bot ( the next position to allocate )
|
||||||
@arena-bot $400 @arena-top ( holds up to 1024 bytes )
|
@arena-bot $400 @arena-top ( holds up to 1024 bytes )
|
||||||
|
|
||||||
|
( emit n bytes from the given address )
|
||||||
@emit-n ( addr* count^ -> addr2* )
|
@emit-n ( addr* count^ -> addr2* )
|
||||||
DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r
|
DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r
|
||||||
&ok
|
&ok
|
||||||
|
@ -469,6 +557,8 @@
|
||||||
STHr #01 SUB ( addr+1 count-1 )
|
STHr #01 SUB ( addr+1 count-1 )
|
||||||
;emit-n JMP2
|
;emit-n JMP2
|
||||||
|
|
||||||
|
( emit the arena, with one line per node )
|
||||||
|
( parses node type, since node size is dynamic (3-5). )
|
||||||
@emit-arena ( -> )
|
@emit-arena ( -> )
|
||||||
;arena-bot
|
;arena-bot
|
||||||
&loop
|
&loop
|
||||||
|
|
|
@ -0,0 +1,56 @@
|
||||||
|
%dbg { #ff #0e DEO }
|
||||||
|
%sp { #20 #18 DEO }
|
||||||
|
%nl { #0a #18 DEO }
|
||||||
|
%exit { #01 #0f DEO BRK }
|
||||||
|
|
||||||
|
|0100
|
||||||
|
;expr1 ;compile JSR2 dbg nl
|
||||||
|
;emit-stack JSR2 nl
|
||||||
|
;emit-arena JSR2 nl
|
||||||
|
|
||||||
|
;test1 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test2 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test3 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test4 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test5 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test6 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test7 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test8 OVR2k ;match JSR2 ;emit-byte JSR2 nl
|
||||||
|
|
||||||
|
;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||||
|
;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 nl
|
||||||
|
|
||||||
|
;reset-arena JSR2
|
||||||
|
exit
|
||||||
|
|
||||||
|
( corresponds to regex: a(b|c)d* )
|
||||||
|
@expr1 "a(b|c)d* 00
|
||||||
|
|
||||||
|
( corresponds to regex: a(b|c)d* )
|
||||||
|
( accepts "ab" or "ac" followd by any number of d's )
|
||||||
|
@graph1
|
||||||
|
03 'a :x1
|
||||||
|
@x1 04 :x2 :x3
|
||||||
|
@x2 03 'b :x4
|
||||||
|
@x3 03 'c :x4
|
||||||
|
@x4 05 :x5 0000
|
||||||
|
@x5 03 'd :x4
|
||||||
|
|
||||||
|
( test case strings to try matching )
|
||||||
|
@test1 "ab 00 ( yes )
|
||||||
|
@test2 "acdd 00 ( yes )
|
||||||
|
@test3 "add 00 ( no )
|
||||||
|
@test4 "abd 00 ( yes )
|
||||||
|
@test5 "acddddddddddd 00 ( yes )
|
||||||
|
@test6 "bd 00 ( no )
|
||||||
|
@test7 "z 00 ( no )
|
||||||
|
@test8 00 ( no )
|
||||||
|
|
||||||
|
~regex.tal
|
||||||
|
|
Loading…
Reference in New Issue