diff --git a/regex.tal b/regex.tal index 32ec9a0..c8583e9 100644 --- a/regex.tal +++ b/regex.tal @@ -33,73 +33,8 @@ %emit { #18 DEO } %space { #20 emit } %newline { #0a emit } -%print { debug newline } %quit! { #01 #0f DEO BRK } -( TESTING ) - -( -|0100 - ;expr1 ;compile JSR2 print - ;emit-stack JSR2 newline - ;emit-arena JSR2 newline - - ;test1 OVR2k ;match JSR2 ;emit-byte JSR2 space - ;test2 OVR2k ;match JSR2 ;emit-byte JSR2 space - ;test3 OVR2k ;match JSR2 ;emit-byte JSR2 space - ;test4 OVR2k ;match JSR2 ;emit-byte JSR2 space - ;test5 OVR2k ;match JSR2 ;emit-byte JSR2 space - ;test6 OVR2k ;match JSR2 ;emit-byte JSR2 space - ;test7 OVR2k ;match JSR2 ;emit-byte JSR2 space - ;test8 OVR2k ;match JSR2 ;emit-byte JSR2 newline - - ;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 space - ;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 space - ;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 space - ;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 space - ;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 space - ;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 space - ;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 space - ;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 newline - quit! -) -( TEST DATA ) - -( -( corresponds to regex: a(b|c)d* ) -@expr1 "a(b|c)d* 00 - -( corresponds to regex: a(b|c)d* ) -( accepts "ab" or "ac" followd by any number of d's ) -@graph1 - 03 'a :x1 - @x1 04 :x2 :x3 - @x2 03 'b :x4 - @x3 03 'c :x4 - @x4 05 :x5 0000 - @x5 03 'd :x4 - -( test case strings to try matching ) -@test1 "ab 00 ( yes ) -@test2 "acdd 00 ( yes ) -@test3 "add 00 ( no ) -@test4 "abd 00 ( yes ) -@test5 "acddddddddddd 00 ( yes ) -@test6 "bd 00 ( no ) -@test7 "z 00 ( no ) -@test8 00 ( no ) -) - -( PRINTING DATA ) - -@emit-short ( byte -- ) - SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r - -@emit-byte ( byte -- ) - DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP - &hex #30 ADD DUP #39 GTH #27 MUL ADD emit - JMP2r - ( ERROR HANDLING ) ( using error! 
will print the given message before causing ) @@ -221,7 +156,19 @@ @skip ;pos LDA2 INC2 ;pos STA2 JMP2r -( TODO: [] + ? ) +( TODO: ) +( 1. character groups: [] and [^] ) +( 2. one-or-more: + ) +( 3. zero-or-one: ? ) +( 4. symbolic escapes, e.g. \n ) + +( STRETCH GOALS: ) +( a. ^ and $ ) +( b. counts: {n} and {m,n} ) +( c. substring matching, i.e. searching ) +( d. subgroup extraction ) +( e. back-references, e.g. \1 ) + ( compile an expression string into a regex graph ) ( ) ( the regex will be allocated in the arena; if there is not ) @@ -500,17 +447,6 @@ @assert-stack-exist ( -> ) ;stack-exist JSR2 ,&ok JCN ;stack-is-empty ;error! JSR2 &ok JMP2r -( print stack size, followed by contents ) -@emit-stack ( -> ) - space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit - ;stack-bot - &loop - DUP2 ;stack-pos LDA2 LTH2 ,&ok JCN - POP2 newline JMP2r - &ok - space LDA2k ;emit-short JSR2 - #0002 ADD2 ,&loop JMP - ( stack-pos points to the next free stack position (or the top if full). ) @stack-pos :stack-bot ( the next position to insert at ) @@ -548,27 +484,3 @@ |1ffe @arena-pos :arena-bot ( the next position to allocate ) @arena-bot $400 @arena-top ( holds up to 1024 bytes ) - -( emit n bytes from the given address ) -@emit-n ( addr* count^ -> addr2* ) - DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r - &ok - STH ( addr [count] ) space LDAk ;emit-byte JSR2 INC2 ( addr+1 [count] ) - STHr #01 SUB ( addr+1 count-1 ) - ;emit-n JMP2 - -( emit the arena, with one line per node ) -( parses node type, since node size is dynamic (3-5). 
) -@emit-arena ( -> ) - ;arena-bot - &loop - DUP2 ;arena-pos LDA2 LTH2 ,&ok JCN POP2 JMP2r - &ok - DUP2 ;emit-short JSR2 - LIT ': emit space - LDAk #01 NEQ ,&!1 JCN #03 ;emit-n JSR2 ,&loop JMP - &!1 LDAk #02 NEQ ,&!2 JCN #03 ;emit-n JSR2 ,&loop JMP - &!2 LDAk #03 NEQ ,&!3 JCN #04 ;emit-n JSR2 ,&loop JMP - &!3 LDAk #04 NEQ ,&!4 JCN #05 ;emit-n JSR2 ,&loop JMP - &!4 LDAk #05 NEQ ,&!5 JCN #05 ;emit-n JSR2 ,&loop JMP - &!5 ;unknown-node-type ;error! JSR2 diff --git a/test-regex.tal b/test-regex.tal index 63ec0ae..57618d2 100644 --- a/test-regex.tal +++ b/test-regex.tal @@ -54,3 +54,45 @@ ~regex.tal +@emit-short ( short* -- ) + SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r + +@emit-byte ( byte^ -- ) + DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP + &hex #30 ADD DUP #39 GTH #27 MUL ADD emit + JMP2r + +( print stack size, followed by contents ) +@emit-stack ( -> ) + space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit + ;stack-bot + &loop + DUP2 ;stack-pos LDA2 LTH2 ,&ok JCN + POP2 newline JMP2r + &ok + space LDA2k ;emit-short JSR2 + #0002 ADD2 ,&loop JMP + +( emit n bytes from the given address ) +@emit-n ( addr* count^ -> addr2* ) + DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r + &ok + STH ( addr [count] ) space LDAk ;emit-byte JSR2 INC2 ( addr+1 [count] ) + STHr #01 SUB ( addr+1 count-1 ) + ;emit-n JMP2 + +( emit the arena, with one line per node ) +( parses node type, since node size is dynamic (3-5). ) +@emit-arena ( -> ) + ;arena-bot + &loop + DUP2 ;arena-pos LDA2 LTH2 ,&ok JCN POP2 JMP2r + &ok + DUP2 ;emit-short JSR2 + LIT ': emit space + LDAk #01 NEQ ,&!1 JCN #03 ;emit-n JSR2 ,&loop JMP + &!1 LDAk #02 NEQ ,&!2 JCN #03 ;emit-n JSR2 ,&loop JMP + &!2 LDAk #03 NEQ ,&!3 JCN #04 ;emit-n JSR2 ,&loop JMP + &!3 LDAk #04 NEQ ,&!4 JCN #05 ;emit-n JSR2 ,&loop JMP + &!4 LDAk #05 NEQ ,&!5 JCN #05 ;emit-n JSR2 ,&loop JMP + &!5 ;unknown-node-type ;error! JSR2