diff --git a/regex.tal b/regex.tal index 0dabf2d..32ec9a0 100644 --- a/regex.tal +++ b/regex.tal @@ -36,41 +36,81 @@ %print { debug newline } %quit! { #01 #0f DEO BRK } -( test -> ) +( TESTING ) +( |0100 - ;expr1 ;compile JSR2 - print - ;emit-stack JSR2 - ;emit-arena JSR2 - newline + ;expr1 ;compile JSR2 print + ;emit-stack JSR2 newline + ;emit-arena JSR2 newline - ;test1 OVR2k ;match JSR2 ;emit-byte JSR2 newline - ;test2 OVR2k ;match JSR2 ;emit-byte JSR2 newline - ;test3 OVR2k ;match JSR2 ;emit-byte JSR2 newline - ;test4 OVR2k ;match JSR2 ;emit-byte JSR2 newline - ;test5 OVR2k ;match JSR2 ;emit-byte JSR2 newline - ;test6 OVR2k ;match JSR2 ;emit-byte JSR2 newline - ;test7 OVR2k ;match JSR2 ;emit-byte JSR2 newline + ;test1 OVR2k ;match JSR2 ;emit-byte JSR2 space + ;test2 OVR2k ;match JSR2 ;emit-byte JSR2 space + ;test3 OVR2k ;match JSR2 ;emit-byte JSR2 space + ;test4 OVR2k ;match JSR2 ;emit-byte JSR2 space + ;test5 OVR2k ;match JSR2 ;emit-byte JSR2 space + ;test6 OVR2k ;match JSR2 ;emit-byte JSR2 space + ;test7 OVR2k ;match JSR2 ;emit-byte JSR2 space ;test8 OVR2k ;match JSR2 ;emit-byte JSR2 newline - newline - ;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 newline - ;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 newline - ;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 newline - ;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 newline - ;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 newline - ;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 newline - ;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 newline + ;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 space + ;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 space + ;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 space + ;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 space + ;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 space + ;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 space + ;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 space ;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 newline quit! +) +( TEST DATA ) +( +( corresponds to regex: a(b|c)d* ) +@expr1 "a(b|c)d* 00 + +( corresponds to regex: a(b|c)d* ) +( accepts "ab" or "ac" followd by any number of d's ) +@graph1 + 03 'a :x1 + @x1 04 :x2 :x3 + @x2 03 'b :x4 + @x3 03 'c :x4 + @x4 05 :x5 0000 + @x5 03 'd :x4 + +( test case strings to try matching ) +@test1 "ab 00 ( yes ) +@test2 "acdd 00 ( yes ) +@test3 "add 00 ( no ) +@test4 "abd 00 ( yes ) +@test5 "acddddddddddd 00 ( yes ) +@test6 "bd 00 ( no ) +@test7 "z 00 ( no ) +@test8 00 ( no ) +) + +( PRINTING DATA ) + +@emit-short ( byte -- ) + SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r + +@emit-byte ( byte -- ) + DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP + &hex #30 ADD DUP #39 GTH #27 MUL ADD emit + JMP2r + +( ERROR HANDLING ) + +( using error! will print the given message before causing ) +( the interpreter to halt. ) @error! ( msg* -> ) LIT '! emit space &loop LDAk ,&continue JCN ,&done JMP &continue LDAk emit INC2 ,&loop JMP &done POP2 newline quit! +( error messages ) @unknown-node-type "unknown 20 "node 20 "type 00 @mismatched-parens "mismatched 20 "parenthesis 00 @stack-is-full "stack 20 "is 20 "full 00 @@ -78,16 +118,27 @@ @arena-is-full "arena 20 "is 20 "full 00 @star-invariant "star 20 "invariant 20 "failed 00 +( REGEX MATCHING ) + +( use stored regex to match against a stored string. ) +( ) +( regex* should be the address of a compiled regex ) +( such as that returned from ;compile. ) +( ) +( str* should be a null-terminated string. ) +( ) +( returns true if the string, and false otherwise. ) @match ( str* regex* -> bool^ ) ;reset-stack JSR2 ;loop JMP2 +( loop used during matching ) +( ) ( we don't use the return stack here since that ) ( complicates the back-tracking we need to do. ) ( ultimately this code will issue a JMP2r to ) ( return a boolean, which is where the stack ) ( effects signature comes from. ) - @loop ( s* r* -> bool^ ) LDAk #01 EQU ;do-empty JCN2 LDAk #02 EQU ;do-dot JCN2 @@ -96,13 +147,15 @@ LDAk #05 EQU ;do-or JCN2 ( same code as the or case ) ;unknown-node-type ;error! JSR2 +( used when we hit a dead-end during matching. ) +( ) +( if stack is non-empty we have a point we can resume from. ) @goto-backtrack ( -> bool^ ) ;stack-exist JSR2 ,&has-stack JCN ( do we have stack? ) #00 JMP2r ( no, return false ) - &has-stack - ;pop4 JSR2 - ;goto-next JMP2 ( yes, resume from the top ) + &has-stack ;pop4 JSR2 ;goto-next JMP2 ( yes, resume from the top ) +( follow the given address (next*) to continue matching ) @goto-next ( str* next* -> bool^ ) DUP2 #0000 GTH2 ,&has-next JCN POP2 LDA null? ,&end-of-string JCN @@ -110,16 +163,19 @@ &end-of-string #01 JMP2r &has-next ;loop JMP2 +( handle the empty node -- just follow the next pointer ) @do-empty ( str* regex* -> bool^ ) INC2 LDA2 ( load next ) ;goto-next JMP2 ( jump to next ) +( handle dot -- match any one character ) @do-dot ( str* regex* -> bool^ ) INC2 LDA2 STH2 ( load and stash next ) LDAk #00 NEQ ,&non-empty JCN ( is there a char? ) POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack ) &non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump ) +( handle literal -- match one specific character ) @do-literal ( str* regex* -> bool^ ) INC2 LDAk STH ( store c ) @@ -130,19 +186,23 @@ &matches INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump ) +( handle or -- try the left branch but backtrack to the right if needed ) +( ) ( this also handles asteration, since it ends up having the same structure ) @do-or ( str* regex* -> bool^ ) INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 ) LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking ) LDA2 ;loop JMP2 ( continue on left branch ) +( REGEX PARSING ) + ( track the position in the input string ) @pos $2 ( track how many levels deep we are in parenthesis ) @parens $2 -( read and increments pos ) +( read and increment pos ) @read ( -> c^ ) ;pos LDA2k ( pos s ) LDAk STHk #00 EQU ( pos s c=0 [c] ) @@ -151,25 +211,24 @@ SWP2 STA2 ,&return JMP ( [c] ) &is-eof POP2 POP2 &return STHr ( c ) - ;pos LDA2 ;emit-short JSR2 LIT '> emit DUP ;emit-byte JSR2 newline JMP2r -( read pos ) -@peek ( -> c^ ) - ;pos LDA2 LDA JMP2r - +( is pos currently pointing to a star? ) @peek-to-star ( -> is-star^ ) ;pos LDA2 LDA LIT '* EQU JMP2r -( pos += 1 ) +( just increment pos ) @skip - ;pos LDA2 ;emit-short JSR2 LIT '! emit - ;pos LDA2 INC2 ;pos STA2 - ;pos LDA2 ;emit-short JSR2 newline - JMP2r + ;pos LDA2 INC2 ;pos STA2 JMP2r ( TODO: [] + ? ) ( compile an expression string into a regex graph ) +( ) +( the regex will be allocated in the arena; if there is not ) +( sufficient space an error will be thrown. ) +( ) +( the stack will also be used during parsing although unlike ) +( the arena it will be released once compilation ends. ) @compile ( expr* -> regex* ) ;pos STA2 #0000 ;parens STA2 @@ -201,28 +260,59 @@ DUP LIT '* EQU ;c-star JCN2 ;c-char JMP2 +( either finalize the given r0/r1 or else wrap it in ) +( a star node if a star is coming up next. ) +( ) +( we use this look-ahead approach rather than compiling ) +( star nodes directly since the implementation is simpler. ) @c-peek-and-finalize ( r0* r1* -> r2* ) ;peek-to-star JSR2 ( r0 r1 next-is-star? ) ,&next-is-star JCN ,&finally JMP ( r0 r1 ) &next-is-star ;skip JSR2 POP2 ;alloc-star JSR2 DUP2 ( star ) &finally ;push-next JSR2 ;compile-region-loop JMP2 +( called when we reach EOF of the input string ) +( ) +( as with c-rpar we have to unroll the current level ) +( of the stack, building any or-nodes that are needed. ) +( ) +( this is where we detect unclosed parenthesis. ) @c-done ( c^ -> r2* ) POP ;parens LDA2 #0000 GTH2 ,&mismatched-parens JCN ;unroll-stack JSR2 POP2 JMP2r &mismatched-parens ;mismatched-parens ;error! JSR2 +( called when we read "|" ) +( ) +( since we defer building or-nodes until the end of the region ) +( we just start a new stack frame and continue. ) @c-or ( c^ -> r2* ) POP #0000 #0000 ;push4 JSR2 ;compile-region-loop JMP2 +( called when we read "(" ) +( ) +( this causes us to: ) +( ) +( 1. increment parens ) +( 2. start a new region on the stack ) +( 3. jump to compile-region to start parsing the new region ) @c-lpar ( c^ -> r2* ) POP ;parens LDA2 INC2 ;parens STA2 ( parens++ ) ;compile-region JMP2 +( called when we read ")" ) +( ) +( this causes us to: ) +( ) +( 1. check for mismatched parens ) +( 2. decrement parens ) +( 3. unroll the current region on the stack into one regex node ) +( 4. finalize that node and append it to the previous region ) +( 5. continue parsing ) @c-rpar ( c^ -> r2* ) POP ;parens LDA2 #0000 EQU2 ,&mismatched-parens JCN @@ -231,36 +321,48 @@ ;c-peek-and-finalize JMP2 &mismatched-parens ;mismatched-parens ;error! JSR2 +( called when we read "." ) +( ) +( allocates a dot-node and continues. ) @c-dot ( c^ -> r2* ) POP ;alloc-dot JSR2 ( dot ) DUP2 ;c-peek-and-finalize JMP2 -@c-char ( c^ -> r2* ) - ;alloc-lit JSR2 ( lit ) - DUP2 ;c-peek-and-finalize JMP2 - ( TODO: escaping rules not quite right ) + +( called when we read "\" ) +( ) +( allocates a literal of the next character. ) +( ) +( this doesn't currently handle any special escape sequences. ) @c-esc ( c^ -> r2* ) POP ;read JSR2 ;c-char JMP2 -( we don't expect to actually handle this ) +( called when we read any other character ) +( ) +( allocates a literal-node and continues. ) +@c-char ( c^ -> r2* ) + ;alloc-lit JSR2 ( lit ) + DUP2 ;c-peek-and-finalize JMP2 + +( called if we parse a "*" ) +( ) +( actually calling this means the code broke an invariant somewhere. ) @c-star ( c^ -> regex* ) POP ;star-invariant ;error! JSR2 - -( allocate node types ------ ) +( ALLOCATING REGEX NDOES ) @alloc3 ( mode^ -> r* ) #0000 ROT ( 00 00 mode^ ) #03 ;alloc JSR2 ( 00 00 mode^ addr* ) - STH2k STA - STH2kr INC2 STA2 - STH2r - JMP2r + STH2k STA ( addr <- mode ) + STH2kr INC2 STA2 ( addr+1 <- 0000 ) + STH2r JMP2r ( return addr ) @alloc-empty ( -> r* ) #01 ;alloc3 JMP2 @@ -269,13 +371,12 @@ #02 ;alloc3 JMP2 @alloc-lit ( c^ -> r* ) - #03 #0000 SWP2 - #04 ;alloc JSR2 - STH2k STA - STH2kr INC2 STA - STH2kr #0002 ADD2 STA2 - STH2r - JMP2r + #03 #0000 SWP2 ( 0000 c^ 03 ) + #04 ;alloc JSR2 ( 0000 c^ 03 addr* ) + STH2k STA ( addr <- 03 ) + STH2kr INC2 STA ( addr+1 <- c ) + STH2kr #0002 ADD2 STA2 ( addr+2 <- 0000 ) + STH2r JMP2r ( return addr ) @alloc-or ( right* left* -> r* ) #05 ;alloc JSR2 STH2 ( r l [x] ) @@ -293,7 +394,7 @@ ;set-next JSR2 ( [r] ) STH2r JMP2r -( unroll one part of the parsing stack, returning ) +( unroll one region of the parsing stack, returning ) ( a single node consisting of an alternation of ) ( all elements on the stack. ) ( ) @@ -332,6 +433,10 @@ JMP2r &is-zero POP2 POP2 ;push4 JSR2 JMP2r +( load the given address: ) +( ) +( 1. if it points to 0000, update it to target ) +( 2. otherwise, call set-next on it ) @set-next-addr ( target* addr* -> ) LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN LDA2 ;set-next JSR2 JMP2r @@ -349,60 +454,15 @@ &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r &!5 ;unknown-node-type ;error! JSR2 -@remove-addr ( target* addr* -> ) - LDA2k #0000 EQU2 ( t a v=0? ) ,&is-zero JCN - OVR2 OVR2 LDA2 EQU2 ( t a t=v? ) ,&is-equal JCN - LDA2 ( t v ) ;remove-from JSR2 JMP2r - &is-zero POP2 POP2 JMP2r - &is-equal NIP2 #0000 SWP2 STA2 JMP2r - -( remove target from regex ) -@remove-from ( target* regex* -> ) - LDAk #01 NEQ ,&!1 JCN INC2 ;remove-addr JSR2 JMP2r - &!1 LDAk #02 NEQ ,&!2 JCN INC2 ;remove-addr JSR2 JMP2r - &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;remove-addr JSR2 JMP2r - &!3 LDAk #04 NEQ ,&!4 JCN - OVR2 OVR2 INC2 ;remove-addr JSR2 - #0003 ADD2 ;remove-addr JSR2 JMP2r - &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;remove-addr JSR2 JMP2r - &!5 ;unknown-node-type ;error! JSR2 - -( test cases -------- ) - -( corresponds to regex: a(b|c)d* ) -@expr1 "a(b|c)d* 00 - -( corresponds to regex: a(b|c)d* ) -( accepts "ab" or "ac" followd by any number of d's ) -@graph1 - 03 'a :x1 - @x1 04 :x2 :x3 - @x2 03 'b :x4 - @x3 03 'c :x4 - @x4 05 :x5 0000 - @x5 03 'd :x4 - -@test1 "ab 00 ( yes ) -@test2 "acdd 00 ( yes ) -@test3 "add 00 ( no ) -@test4 "abd 00 ( yes ) -@test5 "acddddddddddd 00 ( yes ) -@test6 "bd 00 ( no ) -@test7 "z 00 ( no ) -@test8 00 ( no ) - -( emit byte/short ) - -@emit-short ( byte -- ) - SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r - -@emit-byte ( byte -- ) - DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP - &hex #30 ADD DUP #39 GTH #27 MUL ADD emit - JMP2r - -( stack operations ---- ) +( STACK OPERATIONS ) +( ) +( we always push/pop 4 bytes at a time. the stack has a fixed ) +( maximum size it can use, defined by ;stack-top. ) +( ) +( the stack can be cleared using ;reset-stack, which resets ) +( the stack pointers but does not zero out any memory. ) +( push 4 bytes onto the stack ) @push4 ( str* regex* -> ) ;assert-stack-avail JSR2 ( check for space ) ;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex ) @@ -410,6 +470,7 @@ ;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 ) JMP2r +( pop 4 bytes from the stack ) @pop4 ( -> str* regex* ) ;assert-stack-exist JSR2 ( check for space ) ;stack-pos LDA2 ( load stack-pos ) @@ -419,18 +480,27 @@ STH2r STH2r ( restore str and regex ) JMP2r +( reset stack pointers ) @reset-stack ( -> ) ;stack-bot ;stack-pos STA2 JMP2r ( pos <- 0 ) +( can more stack be allocated? ) @stack-avail ( -> bool^ ) ;stack-pos LDA2 ;stack-top LTH2 JMP2r + +( is the stack non-empty? ) @stack-exist ( -> bool^ ) ;stack-pos LDA2 ;stack-bot GTH2 JMP2r + +( error if stack is full ) @assert-stack-avail ( -> ) ;stack-avail JSR2 ,&ok JCN ;stack-is-full ;error! JSR2 &ok JMP2r + +( error is stack is empty ) @assert-stack-exist ( -> ) ;stack-exist JSR2 ,&ok JCN ;stack-is-empty ;error! JSR2 &ok JMP2r +( print stack size, followed by contents ) @emit-stack ( -> ) space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit ;stack-bot @@ -441,14 +511,31 @@ space LDA2k ;emit-short JSR2 #0002 ADD2 ,&loop JMP +( stack-pos points to the next free stack position (or the top if full). ) @stack-pos :stack-bot ( the next position to insert at ) + +( stack-bot is the address of the first stack position. ) +( stack-top is the address of the first byte beyond the stack. ) @stack-bot $1000 @stack-top ( holds 1024 steps (4096 bytes) ) -( arena operations ---- ) +( ARENA OPERATIONS ) +( ) +( the arena represents a heap of memory that can easily be ) +( allocated in small amounts. ) +( ) +( the entire arena can be reclaimed using ;reset-arena, but ) +( unlike systems such as malloc/free, the arena cannot relcaim ) +( smaller amounts of memory. ) +( ) +( the arena is used to allocate regex graph nodes, which are ) +( dynamically-allocated as the regex string is parsed. once ) +( a regex is no longer needed the arena may be reclaimed. ) +( reclaim all the memory used by the arena ) @reset-arena ( -> ) ;arena-bot ;arena-pos STA2 JMP2r +( currently caller is responsible for zeroing out memory if needed ) @alloc ( size^ -> addr* ) #00 SWP ( size* ) ;arena-pos LDA2 STH2k ADD2 ( pos+size* [pos] ) @@ -462,6 +549,7 @@ @arena-pos :arena-bot ( the next position to allocate ) @arena-bot $400 @arena-top ( holds up to 1024 bytes ) +( emit n bytes from the given address ) @emit-n ( addr* count^ -> addr2* ) DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r &ok @@ -469,6 +557,8 @@ STHr #01 SUB ( addr+1 count-1 ) ;emit-n JMP2 +( emit the arena, with one line per node ) +( parses node type, since node size is dynamic (3-5). ) @emit-arena ( -> ) ;arena-bot &loop diff --git a/test-regex.tal b/test-regex.tal new file mode 100644 index 0000000..63ec0ae --- /dev/null +++ b/test-regex.tal @@ -0,0 +1,56 @@ +%dbg { #ff #0e DEO } +%sp { #20 #18 DEO } +%nl { #0a #18 DEO } +%exit { #01 #0f DEO BRK } + +|0100 + ;expr1 ;compile JSR2 dbg nl + ;emit-stack JSR2 nl + ;emit-arena JSR2 nl + + ;test1 OVR2k ;match JSR2 ;emit-byte JSR2 sp + ;test2 OVR2k ;match JSR2 ;emit-byte JSR2 sp + ;test3 OVR2k ;match JSR2 ;emit-byte JSR2 sp + ;test4 OVR2k ;match JSR2 ;emit-byte JSR2 sp + ;test5 OVR2k ;match JSR2 ;emit-byte JSR2 sp + ;test6 OVR2k ;match JSR2 ;emit-byte JSR2 sp + ;test7 OVR2k ;match JSR2 ;emit-byte JSR2 sp + ;test8 OVR2k ;match JSR2 ;emit-byte JSR2 nl + + ;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 sp + ;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 sp + ;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 sp + ;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 sp + ;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 sp + ;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 sp + ;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 sp + ;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 nl + + ;reset-arena JSR2 + exit + +( corresponds to regex: a(b|c)d* ) +@expr1 "a(b|c)d* 00 + +( corresponds to regex: a(b|c)d* ) +( accepts "ab" or "ac" followd by any number of d's ) +@graph1 + 03 'a :x1 + @x1 04 :x2 :x3 + @x2 03 'b :x4 + @x3 03 'c :x4 + @x4 05 :x5 0000 + @x5 03 'd :x4 + +( test case strings to try matching ) +@test1 "ab 00 ( yes ) +@test2 "acdd 00 ( yes ) +@test3 "add 00 ( no ) +@test4 "abd 00 ( yes ) +@test5 "acddddddddddd 00 ( yes ) +@test6 "bd 00 ( no ) +@test7 "z 00 ( no ) +@test8 00 ( no ) + +~regex.tal +