diff --git a/asma.rom b/asma.rom new file mode 100644 index 0000000..8e8c264 Binary files /dev/null and b/asma.rom differ diff --git a/regex.tal b/regex.tal index c428c57..566ba5c 100644 --- a/regex.tal +++ b/regex.tal @@ -23,8 +23,12 @@ ( ) ( when searching, ^ matches the beginning of the string OR a line. ) ( $ matches the end of a line OR the end of the entire string. ) -( (the ^ and $ operators aren't yet supported.) the dot nodes will ) -( not match newline characters, which must be matched explicitly. ) +( the dot nodes will not match newline characters, which must be ) +( matched explicitly. ) +( ) +( finally, search-multiline will cause ^ and $ to use the matching ) +( behavior (i.e. only matching the beginning or end of a string). ) +( however dot nodes will still not match newline characters. ) ( ) ( search returns 01 if the string was matched and 00 otherwise. ) ( additionally, the @search-start and @search-end addresses will ) @@ -44,6 +48,10 @@ ( dollar matches end of line/string [ #07 next* ] ) ( lpar starts subgroup region [ #08 i* next* ] ) ( rpar ends subgroup region [ #09 i* next* ] ) +( class character class, e.g. [a-z] [ #0a next* n^ ... ] ) +( (NOTE: n is the number of pairs in ...) ) +( nclass negative class, e.g. [^a-z] [ #0b next* n^ ... ] ) +( (NOTE: n is the number of pairs in ...) ) ( ) ( `or` and `star` have the same structure and are handled by the ) ( same code (;do-or). however, the node types are kept different ) @@ -93,7 +101,7 @@ LIT '! emit! space &loop LDAk #00 EQU ,&done JCN LDAk emit! INC2 ,&loop JMP - &done POP2 newline #010f DEO BRK + &done POP2 newline #ff0e DEO #010f DEO BRK ( error messages ) @unknown-node-type "unknown 20 "node 20 "type 00 @@ -159,7 +167,9 @@ LDAk #07 EQU ;do-dollar JCN2 LDAk #08 EQU ;do-lpar JCN2 LDAk #09 EQU ;do-rpar JCN2 - LDAk ;unknown-node-type ;error!! JSR2 + LDAk #0a EQU ;do-ccls JCN2 + LDAk #0b EQU ;do-ncls JCN2 + LDAk #dd ;unknown-node-type ;error!! JSR2 ( used when we hit a dead-end during matching. ) ( ) @@ -244,6 +254,36 @@ LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking ) LDA2 ;loop JMP2 ( continue on left branch ) +@matches-cls ( str* regex* -> bool^ ) + OVR2 LDA ,¬-null JCN + ( needs to have a character to match ) + POP2 POP2 ;goto-backtrack JMP2 + ¬-null + DUP2 INC2 LDA2 STH2 ( str regex [next] ) + OVR2 INC2 STH2 ( str regex [str+1 next] ) + SWP2 LDA STH ( regex [c str+1 next] ) + #0003 ADD2 LDAk #00 SWP #0002 MUL2 ( r+3 len*2 [c str+1 next] ) + SWP2 INC2 STH2k ADD2 STH2r ( r+4+len*2 r+4 [c str+1 next] ) + &loop ( limit addr [c str+1 next] ) + EQU2k ,&missing JCN + LDAk STHkr GTH ,&next1 JCN INC2 + LDAk STHkr LTH ,&next2 JCN ,&found JMP + &next1 INC2 + &next2 INC2 ,&loop JMP + &missing POP2 POP2 POPr ,&negated LDR ,&match JCN + &no-match POP2r POP2r ;goto-backtrack JMP2 + &found POP2 POP2 POPr ,&negated LDR ,&no-match JCN + &match STH2r STH2r ;goto-next JMP2 + [ &negated $1 ] + +( ) +@do-ccls ( str* regex* -> bool^ ) + #00 ,matches-cls/negated STR ,matches-cls JMP + +( ) +@do-ncls ( str* regex* -> bool^ ) + #01 ,matches-cls/negated STR ,matches-cls JMP + ( REGEX PARSING ) ( do we match across lines? ) @@ -343,6 +383,8 @@ DUP LIT '$ EQU ;c-dollar JCN2 DUP LIT '( EQU ;c-lpar JCN2 DUP LIT ') EQU ;c-rpar JCN2 + DUP LIT '[ EQU ;c-lbrk JCN2 + DUP LIT '] EQU ;c-rbrk JCN2 DUP LIT '\ EQU ;c-esc JCN2 DUP LIT '* EQU ;c-star JCN2 DUP LIT '+ EQU ;c-plus JCN2 @@ -414,6 +456,42 @@ ;c-peek-and-finalize JMP2 &mismatched-parens ;mismatched-parens ;error!! JSR2 +( doesn't support weird things like []abc] or [-abc] or similar. ) +( doesn't currently handle "special" escapes such as \n ) +@c-lbrk ( c^ -> r2* ) + POP LITr 00 ;pos LDA2 ( pos [0] ) + LDAk LIT '^ NEQ ,&normal JCN INCr INC2 ( pos [negated?^] ) + &normal + #0a STHr ADD ( src* type^ ) + ;arena-pos LDA2 STH2k ( src* type^ dst* [dst*] ) + STA LIT2r 0004 ADD2r ( src* [dst+4] ) + &left-parse ( src* [dst*] ) + LDAk LIT '] EQU ,&done JCN + LDAk LIT '- EQU ,&error JCN + LDAk LIT '\ NEQ ,&left JCN INC2 + &left + LDAk STH2kr STA INC2r + DUP2 INC2 LDA LIT '- NEQ ,&pre-right JCN INC2 INC2 + LDAk LIT '] EQU ,&error JCN + LDAk LIT '- EQU ,&error JCN + &pre-right + LDAk LIT '\ NEQ ,&right JCN INC2 + &right + LDAk STH2kr STA INC2 INC2r ,&left-parse JMP + &done ( src* [dst*] ) + INC2 ;pos STA2 STH2r ( dst* ) + DUP2 ;arena-pos LDA2 ( dst dst a ) + #0004 ADD2 SUB2 #0002 DIV2 NIP ( dst (dst-(a+4))/2 ) + ;arena-pos LDA2 STH2k #0003 ADD2 STA ( dst [a] ) + ;arena-pos STA2 STH2r ( a ) + DUP2 ;c-peek-and-finalize JMP2 + &error + #abcd #0000 DIV ( TODO error here ) + +@c-rbrk ( c^ -> r2* ) + POP + #0000 DIV ( invariant: should never be seen ) + ( called when we read "." ) ( ) ( allocates a dot-node and continues. ) @@ -492,8 +570,8 @@ ( ALLOCATING REGEX NDOES ) @rx-node-sizes - ( 00 01 02 03 04 05 06 07 08 09 ) - [ 00 03 03 04 ] [ 05 05 03 03 ] [ 05 05 ] + ( 00 01 02 03 04 05 06 07 08 09 0a 0b ) + [ 00 03 03 04 ] [ 05 05 03 03 ] [ 05 05 00 00 ] @alloc3 ( mode^ -> r* ) #0000 ROT ( 00 00 mode^ ) @@ -612,11 +690,13 @@ ( you can end up double-appending things. ) @set-next ( target* regex* -> ) LDAk #01 LTH ,&unknown JCN - LDAk #09 GTH ,&unknown JCN + LDAk #0b GTH ,&unknown JCN + LDAk #09 GTH ,&cc JCN LDAk #00 SWP ;rx-node-sizes ADD2 LDA #00 SWP ADD2 #0002 SUB2 ;set-next-addr JMP2 - &unknown LDAk ;unknown-node-type ;error!! JSR2 + &cc INC2 ;set-next-addr JMP2 + &unknown LDAk #ee ;unknown-node-type ;error!! JSR2 @set-next-or-addr ( target* addr* -> ) LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN @@ -776,30 +856,3 @@ @subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup ) @subgroup-bot $400 @subgroup-top ( holds up to 256 subgroups (1024 bytes) ) - -( INTERVAL OPERATIONS ) -( ) -( not baked yet ) -( -@min ( first* last* -> min-addr* ) - SWP2 STH2k ,&incr JMP ( last first [first] ) - &loop LDAk LDAkr STHr LTH ,&replace JCN ,&incr JMP ( last a [c] ) - &replace POP2r STH2k ( last a [a] ) - &incr EQUk ,&done JCN INC2 ,&loop JMP ( last a+1 [c] ) - &done POP2 POP2 STH2r JMP2r ( c ) - -@sort ( first* last* -> ) - SWP2 ( last first ) - &loop ;min JSR2 NEQk ,&swap JCN POP2 ,&incr JMP - &swap STH2 LDA2k ( last first fx [min] ) STH2kr STA STH2r SWP2 ( last min first ) - STH2 LDA2 ( last mx [first] ) STH2kr STA STH2r ( last first ) - &incr EQUk ,&done JCN INC2 ,&loop JMP - &done POP2 POP2 JMP2r - -@iv-in-range ( c^ b0^ b1^ -> bool^ ) - ROT STHk LTH ,&above JCN - STHr GTH ,&below JCN #01 JMP2r - &above POPr POP &below #00 JMP2r - -@iv-find ( c^ iv* -> bool^ ) - ) diff --git a/repl-regex.tal b/repl-regex.tal index 661533e..e436086 100644 --- a/repl-regex.tal +++ b/repl-regex.tal @@ -52,8 +52,8 @@ #00 ;ptr LDA2 STA ( null terminate string ) ;ptr LDA2 ;buffer EQU2 STH ( stash is-empty? ) ;buffer ;ptr STA2 ( reset ptr ) -( ;buffer ;regex LDA2 ;rx-match JSR2 ( match regex ) ) - ;buffer ;regex LDA2 ;rx-search JSR2 ( search regex ) + ;buffer ;regex LDA2 ;rx-match JSR2 ( match regex ) +( ;buffer ;regex LDA2 ;rx-search JSR2 ( search regex ) ) ;emit-byte JSR2 nl ( print result ) STHr ,&was-empty JCN ;s-prompt ;println JSR2 @@ -102,8 +102,12 @@ DUP2 ;emit-short JSR2 LIT ': emit! space LDAk #01 LTH ,&uu JCN - LDAk #09 GTH ,&uu JCN + LDAk #0b GTH ,&uu JCN + LDAk #09 GTH ,&cc JCN LDAk #00 SWP ;rx-node-sizes ADD2 LDA ;emit-n JSR2 ,&loop JMP + &cc + DUP2 #0003 ADD2 LDA #02 MUL #04 ADD + ;emit-n JSR2 ,&loop JMP &uu - ;unknown-node-type ;error!! JSR2 + LDAk #cc ;unknown-node-type ;error!! JSR2