add femto

2023-12-07 17:39:18 -05:00 · 2023-12-07 17:39:18 -05:00 · 768b99d1db
parent 99337665f6
commit 768b99d1db
2 changed files with 2643 additions and 0 deletions
--- a/test/femto.tal
+++ b/test/femto.tal
--- a/test/regex.tal
+++ b/test/regex.tal
@ -0,0 +1,913 @@
+( regex.tal                                                        )
+(                                                                  )
+( compiles regex expression strings into regex nodes, then uses    )
+( regex nodes to match input strings.                              )
+(                                                                  )
+( two methods are currently supported:                             )
+(                                                                  )
+( 1. match                                                         )
+(                                                                  )
+( when matching the regex must match the entire string. this means )
+( that it is unnecessary to use ^ and $ when matching, since their )
+( effect is implied. it also means that that dot nodes will match  )
+( any characters at all including newlines.                        )
+(                                                                  )
+( match returns 01 if the string was matched and 00 otherwise.     )
+(                                                                  )
+( 2. search                                                        )
+(                                                                  )
+( when searching the regex attempts to find matching substrings    )
+( in the given string. this means that after successfully finding  )
+( a match, search may be called on the remaining substring to find )
+( more matches.                                                    )
+(                                                                  )
+( when searching, ^ matches the beginning of the string OR a line. )
+( $ matches the end of a line OR the end of the entire string.     )
+( the dot nodes will not match newline characters, which must be   )
+( matched explicitly.                                              )
+(                                                                  )
+( finally, search-multiline will cause ^ and $ to use the matching )
+( behavior (i.e. only matching the beginning or end of a string).  )
+( however dot nodes will still not match newline characters.       )
+(                                                                  )
+( search returns 01 if the string was matched and 00 otherwise.    )
+( additionally, the @search-start and @search-end addresses will   )
+( contain the starting location and match boundary of the matching )
+( substring.                                                       )
+(                                                                  )
+( regex node types:                                                )
+(                                                                  )
+(   NAME    DESCRIPTION                      STRUCT                )
+(   empty   matches empty string             [ #01 next* ]         )
+(   dot     matches any one char             [ #02 next* ]         )
+(   lit     matches one specific char (c)    [ #03 c^ next* ]      )
+(   or      matches either left or right     [ #04 left* right* ]  )
+(   star    matches expr zero-or-more times  [ #05 expr* next* ]   )
+(           (NOTE: r.expr.next must be r)                          )
+(   caret   matches start of line/string     [ #06 next* ]         )
+(   dollar  matches end of line/string       [ #07 next* ]         )
+(   lpar    starts subgroup region           [ #08 i^ next* ]      )
+(   rpar    ends subgroup region             [ #09 i^ next* ]      )
+(   class   character class, e.g. [a-z]      [ #0a next* n^ ... ]  )
+(           (NOTE: n is the number of pairs in ...)                )
+(   nclass  negative class, e.g. [^a-z]      [ #0b next* n^ ... ]  )
+(           (NOTE: n is the number of pairs in ...)                )
+(                                                                  )
+( `or` and `star` have the same structure and are handled by the   )
+( same code (;do-or). however, the node types are kept different   )
+( to make it clearer how to parse and assemble the nodes.          )
+(                                                                  )
+( dollar nodes contain a next pointer even though this usually     )
+( will not be needed.                                              )
+(                                                                  )
+( lpar and rpar contain addresses pointing between subgroup-bot    )
+( and subgroup-bot. rpar's address will always be +2 relative to   )
+( the corresponding lpar address.                                  )
+(                                                                  )
+( concatenation isn't a node, it is implied by the *next addr.     )
+( a next value of #0000 signals the end of the regex.              )
+(                                                                  )
+( in these docs str* is an address to a null-terminated string.    )
+( regexes should not include nulls and cannot match them (other    )
+( than the null which signals the end of a string).                )
+
+( TODO: we have lpar and rpar nodes but aren't using them yet      )
+( 1. need to modify c-lpar and c-par                               )
+( 2. we need to store subgroup-posd in regions during parsing:     )
+(   a. need to store the current pos in the region                 )
+(   b. need to call start to move subgroup-pos forward             )
+( 3. when finishing parsing a region we need lpar/rpar nodes       )
+( 4. we also need to store "last started subgroup" on the stack    )
+( 5. when backtracking we must rewind to "last started" subgroup   )
+
+%debug { #ff #0e DEO }
+%emit! { #18 DEO }
+%space { #20 emit! }
+%newline { #0a emit! }
+
+( now that uxnasm throws errors about writing into the zero page   )
+( we have to do something like this to be able to compile library  )
+( code. we have to guess what offset to use since it needs to      )
+( avoid conficting with the program we're included in.             )
+(                                                                  )
+( remove this if needed when including it in other projects.       )
+( |2000 )
+
+( ERROR HANDLING )
+
+( using error! will print the given message before causing )
+( the interpreter to halt. )
+@error!! ( msg* -> )
+    LIT "! emit! space
+    &loop LDAk #00 EQU ,&done JCN
+          LDAk emit! INC2 ,&loop JMP
+    &done POP2 newline #ff0e DEO #010f DEO BRK
+
+( error messages )
+@unknown-node-type "unknown 20 "node 20 "type 00
+@mismatched-parens "mismatched 20 "parenthesis 00
+@stack-is-full "stack 20 "is 20 "full 00
+@stack-is-empty "stack 20 "is 20 "empty 00
+@arena-is-full "arena 20 "is 20 "full 00
+@star-invariant "star 20 "invariant 20 "failed 00
+@plus-invariant "plus 20 "invariant 20 "failed 00
+@qmark-invariant "question 20 "mark 20 "invariant 20 "failed 00
+
+( REGEX MATCHING )
+
+( use stored regex to match against a stored string. )
+( )
+( regex* should be the address of a compiled regex )
+( such as that returned from ;compile. )
+( )
+( str* should be a null-terminated string. )
+( )
+( returns true if the string, and false otherwise. )
+@rx-match ( str* regex* -> bool^ )
+    #01 ;match-multiline STA
+    #00 ;search-mode STA
+    ;rx-reset JSR2
+    ;loop JMP2
+
+@rx-search-multiline ( str* regex* -> bool^ )
+    #01 ;match-multiline STA
+    #01 ;search-mode STA
+    ,rx-search/main JMP
+
+@rx-search ( str* regex* -> bool^ )
+    #00 ;match-multiline STA
+    #01 ;search-mode STA
+    &main STH2                    ( s* [r*] )
+          DUP2 ;string-start STA2 ( s* [r*] )
+    &loop LDAk #00 EQU ,&eof JCN  ( s* [r*] )
+          ;rx-reset JSR2          ( s* [r*] )
+          DUP2 ;search-start STA2 ( s* [r*] )
+          DUP2 STH2kr ;loop JSR2  ( s* b^ [r*] )
+          ,&found JCN             ( s* [r*] )
+          INC2 ,&loop JMP         ( s+1* [r*] )
+    &found POP2 POP2r #01 JMP2r   ( 01 )
+    &eof ;rx-reset JSR2           ( s* [r*] )
+         DUP2 ;search-start STA2  ( s* [r*] )
+         STH2r ;loop JMP2         ( b^ )
+
+( reset all "runtime" memory allocated during match/search )
+@rx-reset ( -> )
+    ;reset-stack JSR2
+    ;subgroup-reset JMP2
+
+( loop used during matching )
+( )
+( we don't use the return stack here since that )
+( complicates the back-tracking we need to do. )
+( ultimately this code will issue a JMP2r to )
+( return a boolean, which is where the stack )
+( effects signature comes from. )
+@loop ( s* r* -> bool^ )
+    LDAk #01 EQU ;do-empty   JCN2
+    LDAk #02 EQU ;do-dot     JCN2
+    LDAk #03 EQU ;do-literal JCN2
+    LDAk #04 EQU ;do-or      JCN2
+    LDAk #05 EQU ;do-or      JCN2 ( same code as the or case )
+    LDAk #06 EQU ;do-caret   JCN2
+    LDAk #07 EQU ;do-dollar  JCN2
+    LDAk #08 EQU ;do-lpar    JCN2
+    LDAk #09 EQU ;do-rpar    JCN2
+    LDAk #0a EQU ;do-ccls    JCN2
+    LDAk #0b EQU ;do-ncls    JCN2
+    LDAk #dd ;unknown-node-type ;error!! JSR2
+
+( used when we hit a dead-end during matching. )
+( )
+( if stack is non-empty we have a point we can resume from. )
+@goto-backtrack ( -> bool^ )
+    ;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
+    #00 JMP2r ( no, return false )
+    &has-stack
+        ;pop4 JSR2
+        ;subgroup-backtrack JSR2
+        ;goto-next JMP2 ( yes, resume from the top )
+
+( follow the given address (next*) to continue matching )
+@goto-next ( str* next* -> bool^ )
+    DUP2 #0000 GTH2 ,&has-next JCN
+    POP2 LDAk #00 EQU ,&end-of-string JCN
+    ;search-mode LDA ,&end-of-search JCN
+    POP2 ;goto-backtrack JMP2
+    &end-of-search DUP2 ;search-end STA2
+    &end-of-string POP2 #01 JMP2r
+    &has-next ;loop JMP2
+
+( handle the empty node -- just follow the next pointer )
+@do-empty ( str* regex* -> bool^ )
+    INC2 LDA2 ( load next )
+    ;goto-next JMP2 ( jump to next )
+
+( FIXME: not currently used )
+@do-lpar ( str* regex* -> bool^ )
+    STH2 DUP2 ( s s [r] )
+    INC2r LDA2kr STH2r ( s s i [r+1] )
+    ;subgroup-start JSR2 ( s [r+1] )
+    STH2r INC2 INC2 ( s r+3 )
+    LDA2 ;goto-next JMP2 ( jump to next )
+
+( FIXME: not currently used )
+@do-rpar ( str* regex* -> bool^ )
+    STH2 DUP2 ( s s [r] )
+    INC2r LDA2kr STH2r ( s s i [r+1] )
+    ;subgroup-finish JSR2 ( s [r+1] )
+    STH2r INC2 INC2 ( s r+3 )
+    LDA2 ;goto-next JMP2 ( jump to next )
+
+( handle dot -- match any one character )
+@do-dot ( str* regex* -> bool^ )
+    INC2 LDA2 STH2                             ( load and stash next )
+    LDAk #00 NEQ ,&non-empty JCN               ( is there a char? )
+    &backtrack POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
+    &non-empty LDAk #0a NEQ ,&match JCN        ( yes, match unless \n in search-mode )
+    ;search-mode LDA ,&backtrack JCN           ( if \n and search-mode, treat as EOF )
+    &match INC2 STH2r ;goto-next JMP2          ( on match: inc s, restore and jump )
+
+( hande caret -- match string start (or possibly after newline) without advancing )
+@do-caret ( str* regex* -> bool^ )
+    INC2 LDA2 STH2                              ( load and stash next )
+    DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? )
+    ;match-multiline LDA ,&no-match JCN         ( are we in multi-line mode? )
+    DUP2 #0001 SUB2 LDA #0a EQU ,&at-start JCN      ( just after newline? )
+    &no-match POP2r POP2 ;goto-backtrack JMP2   ( clear stacks and backtrack )
+    &at-start STH2r ;goto-next JMP2             ( go to next without advancing )
+
+( hande dollar -- match string end (or possibly before newline) without advancing )
+@do-dollar  ( str* regex* -> bool^ )
+    INC2 LDA2 STH2                            ( load and stash next )
+    LDAk #00 EQU ,&at-end JCN                 ( at string end? )
+    ;match-multiline LDA ,&no-match JCN       ( are we in multi-line mode? )
+    LDAk #0a EQU ,&at-end JCN                 ( at newline? )
+    &no-match POP2r POP2 ;goto-backtrack JMP2 ( clear stacks and backtrack )
+    &at-end STH2r ;goto-next JMP2             ( go to next without advancing )
+
+( handle literal -- match one specific character )
+@do-literal ( str* regex* -> bool^ )
+    INC2
+    LDAk STH ( store c )
+    INC2 LDA2 STH2 ROTr ( store next, move c to top )
+    LDAk
+    STHr EQU ,&matches JCN ( do we match this char? )
+    POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
+    &matches
+    INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
+
+( handle or -- try the left branch but backtrack to the right if needed )
+( )
+( this also handles asteration, since it ends up having the same structure )
+@do-or ( str* regex* -> bool^ )
+    INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 )
+    LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking )
+    LDA2 ;loop JMP2 ( continue on left branch )
+
+@matches-cls ( str* regex* -> bool^ )
+    OVR2 LDA ,&not-null JCN
+        ( needs to have a character to match )
+        POP2 POP2 ;goto-backtrack JMP2
+    &not-null
+        DUP2 INC2 LDA2 STH2 ( str regex [next] )
+        OVR2 INC2 STH2 ( str regex [str+1 next] )
+        SWP2 LDA STH ( regex [c str+1 next] )
+        #0003 ADD2 LDAk #00 SWP #0002 MUL2 ( r+3 len*2 [c str+1 next] )
+        SWP2 INC2 STH2k ADD2 STH2r         ( r+4+len*2 r+4 [c str+1 next] )
+    &loop ( limit addr [c str+1 next] )
+        EQU2k ,&missing JCN
+        LDAk STHkr GTH ,&next1 JCN INC2
+        LDAk STHkr LTH ,&next2 JCN ,&found JMP
+    &next1 INC2
+    &next2 INC2 ,&loop JMP
+    &missing POP2 POP2 POPr ,&negated LDR ,&match JCN
+    &no-match POP2r POP2r ;goto-backtrack JMP2
+    &found POP2 POP2 POPr ,&negated LDR ,&no-match JCN
+    &match STH2r STH2r ;goto-next JMP2
+    [ &negated $1 ]
+
+( )
+@do-ccls ( str* regex* -> bool^ )
+    #00 ,matches-cls/negated STR ,matches-cls JMP
+
+( )
+@do-ncls  ( str* regex* -> bool^ )
+    #01 ,matches-cls/negated STR ,matches-cls JMP
+
+( REGEX PARSING )
+
+( do we match across lines? )
+( - should be true when matching )
+( - can be true or false when searching )
+( - affects syntax of . ^ and $ )
+@match-multiline $1
+
+( are we in searching mode? )
+( - should be true when searching )
+( - should be false when matching )
+@search-mode $1
+
+( )
+@string-start $2
+@search-start $2
+@search-end   $2
+
+( track the position in the input string )
+@pos $2
+
+( track how many levels deep we are in parenthesis )
+@parens $2
+
+( how many subgroups have we seen so far? )
+@groupnum $1
+
+( read and increment pos )
+@read ( -> c^ )
+    ;pos LDA2k ( pos s )
+    LDAk STHk #00 EQU ( pos s c=0 [c] )
+    ,&is-eof JCN ( pos s [c] )
+    INC2 ( pos s+1 [c] )
+    SWP2 STA2 ,&return JMP ( [c] )
+    &is-eof POP2 POP2
+    &return STHr ( c )
+    JMP2r
+
+( is pos currently pointing to a star? )
+@peek-to-star ( -> is-star^ )
+    ;pos LDA2 LDA LIT "* EQU JMP2r
+
+( is pos currently pointing to a plus? )
+@peek-to-plus ( -> is-plus^ )
+    ;pos LDA2 LDA LIT "+ EQU JMP2r
+
+( is pos currently pointing to a qmark? )
+@peek-to-qmark ( -> is-qmark^ )
+    ;pos LDA2 LDA LIT "? EQU JMP2r
+
+( just increment pos )
+@skip
+    ;pos LDA2 INC2 ;pos STA2 JMP2r
+
+( TODO: )
+( 1. character groups: [] and [^] )
+( 2. symbolic escapes, e.g. \n )
+
+( STRETCH GOALS: )
+( a. ^ and $ )
+( b. counts: {n} and {m,n} )
+( c. substring matching, i.e. searching )
+( d. subgroup extraction )
+( e. back-references, e.g \1 )
+( f. non-capturing groups, e.g. (?:) )
+
+( compile an expression string into a regex graph )
+( )
+( the regex will be allocated in the arena; if there is not )
+( sufficient space an error will be thrown. )
+( )
+( the stack will also be used during parsing although unlike )
+( the arena it will be released once compilation ends. )
+@compile ( expr* -> regex* )
+          ;pos    STA2
+    #0000 ;parens STA2
+    ;rx-reset JSR2
+    ;compile-region JMP2
+
+( the basic strategy here is to build a stack of non-or )
+( expressions to be joined together at the end of the )
+( region. each stack entry has two regex addresses: )
+(   - the start of the regex )
+(   - the current tail of the regex )
+( when we concatenate a new node to a regex we update )
+( the second of these but not the first. )
+( )
+( the bottom of the stack for a given region is denoted )
+( by #ffff #ffff. above that we start with #0000 #0000 )
+( to signal an empty node. )
+@compile-region ( -> r2* )
+    #ffff #ffff ;push4 JSR2 ( stack delimiter )
+    #0000 #0000 ;push4 JSR2 ( stack frame start )
+@compile-region-loop
+    ;read JSR2
+    DUP #00 EQU ;c-done JCN2
+    DUP LIT "| EQU ;c-or JCN2
+    DUP LIT ". EQU ;c-dot JCN2
+    DUP LIT "^ EQU ;c-caret JCN2
+    DUP LIT "$ EQU ;c-dollar JCN2
+    DUP LIT "( EQU ;c-lpar JCN2
+    DUP LIT ") EQU ;c-rpar JCN2
+    DUP LIT "[ EQU ;c-lbrk JCN2
+    DUP LIT "] EQU ;c-rbrk JCN2
+    DUP LIT "\ EQU ;c-esc JCN2
+    DUP LIT "* EQU ;c-star JCN2
+    DUP LIT "+ EQU ;c-plus JCN2
+    DUP LIT "? EQU ;c-qmark JCN2
+                   ;c-char JMP2
+
+( either finalize the given r0/r1 or else wrap it in )
+( a star node if a star is coming up next. )
+( )
+( we use this look-ahead approach rather than compiling )
+( star nodes directly since the implementation is simpler. )
+@c-peek-and-finalize ( r0* r1* -> r2* )
+    ;peek-to-star JSR2 ( r0 r1 next-is-star? ) ,&next-is-star JCN
+    ;peek-to-plus JSR2 ( r0 r1 next-is-plus? ) ,&next-is-plus JCN
+    ;peek-to-qmark JSR2 ( r0 r1 next-is-qmark? ) ,&next-is-qmark JCN
+    ,&finally JMP ( r0 r1 )
+    &next-is-star ;skip JSR2 POP2 ;alloc-star JSR2 DUP2 ,&finally JMP
+    &next-is-plus ;skip JSR2 POP2 ;alloc-plus JSR2 DUP2 ,&finally JMP
+    &next-is-qmark ;skip JSR2 POP2 ;alloc-qmark JSR2 DUP2 ,&finally JMP
+    &finally ;push-next JSR2 ;compile-region-loop JMP2
+
+( called when we reach EOF of the input string )
+( )
+( as with c-rpar we have to unroll the current level )
+( of the stack, building any or-nodes that are needed. )
+( )
+( this is where we detect unclosed parenthesis. )
+@c-done ( c^ -> r2* )
+    POP
+    ;parens LDA2 #0000 GTH2 ,&mismatched-parens JCN
+    ;unroll-stack JSR2 POP2 JMP2r
+    &mismatched-parens ;mismatched-parens ;error!! JSR2
+
+( called when we read "|" )
+( )
+( since we defer building or-nodes until the end of the region )
+( we just start a new stack frame and continue. )
+@c-or ( c^ -> r2* )
+    POP
+    #0000 #0000 ;push4 JSR2
+    ;compile-region-loop JMP2
+
+( called when we read left parenthesis )
+( )
+( this causes us to: )
+( )
+(  1. increment parens )
+(  2. start a new region on the stack )
+(  3. jump to compile-region to start parsing the new region )
+@c-lpar ( c^ -> r2* )
+    POP
+    ;parens LDA2 INC2 ;parens STA2 ( parens++ )
+    ;compile-region JMP2
+
+( called when we read right parenthesis )
+( )
+( this causes us to: )
+( )
+(  1. check for mismatched parens )
+(  2. decrement parens )
+(  3. unroll the current region on the stack into one regex node )
+(  4. finalize that node and append it to the previous region )
+(  5. continue parsing )
+@c-rpar ( c^ -> r2* )
+    POP
+    ;parens LDA2 #0000 EQU2 ,&mismatched-parens JCN
+    ;parens LDA2 #0001 SUB2 ;parens STA2 ( parens-- )
+    ;unroll-stack JSR2
+    ;c-peek-and-finalize JMP2
+    &mismatched-parens ;mismatched-parens ;error!! JSR2
+
+( doesn't support weird things like []abc] or [-abc] or similar. )
+( doesn't currently handle "special" escapes such as \n )
+@c-lbrk ( c^ -> r2* )
+    POP LITr 00 ;pos LDA2 ( pos [0] )
+    LDAk LIT "^ NEQ ,&normal JCN INCr INC2 ( pos [negated?^] )
+    &normal
+        #0a STHr ADD          ( src* type^ )
+        ;arena-pos LDA2 STH2k ( src* type^ dst* [dst*] )
+        STA LIT2r 0004 ADD2r  ( src* [dst+4] )
+    &left-parse ( src* [dst*] )
+        LDAk LIT "] EQU ,&done JCN
+        LDAk LIT "- EQU ,&error JCN
+        LDAk LIT "\ NEQ ,&left JCN INC2
+    &left
+        LDAk STH2kr STA INC2r
+        DUP2 INC2 LDA LIT "- NEQ ,&pre-right JCN INC2 INC2
+        LDAk LIT "] EQU ,&error JCN
+        LDAk LIT "- EQU ,&error JCN
+    &pre-right
+        LDAk LIT "\ NEQ ,&right JCN INC2
+    &right
+        LDAk STH2kr STA INC2 INC2r ,&left-parse JMP
+    &done ( src* [dst*] )
+        INC2 ;pos STA2 STH2r ( dst* )
+        DUP2 ;arena-pos LDA2 ( dst dst a )
+        #0004 ADD2 SUB2 #0002 DIV2 NIP ( dst (dst-(a+4))/2 )
+        ;arena-pos LDA2 STH2k #0003 ADD2 STA ( dst [a] )
+        ;arena-pos STA2 STH2r ( a )
+        #0000 OVR2 INC2 STA2  ( a )
+        DUP2 ;c-peek-and-finalize JMP2
+    &error
+        #abcd #0000 DIV ( TODO error here )
+
+@c-rbrk ( c^ -> r2* )
+    POP
+    #0000 DIV ( invariant: should never be seen )
+
+( called when we read "." )
+( )
+( allocates a dot-node and continues. )
+@c-dot ( c^ -> r2* )
+    POP
+    #02 ;alloc3 JSR2
+    DUP2 ;c-peek-and-finalize JMP2
+
+( called when we read "^" )
+( )
+( allocates a caret-node and continues. )
+@c-caret ( c^ -> r2* )
+    POP
+    #06 ;alloc3 JSR2
+    DUP2 ;c-peek-and-finalize JMP2
+
+( called when we read "$" )
+( )
+( allocates a dollar-node and continues. )
+@c-dollar ( c^ -> r2* )
+    POP
+    #07 ;alloc3 JSR2
+    DUP2 ;c-peek-and-finalize JMP2
+
+( called when we read "\" )
+( )
+( handles special sequences: \a \b \t \n \v \f \r )
+( )
+( otherwise, allocates a literal of the next character. )
+@c-esc ( c^ -> r2* )
+    POP ;read JSR2
+    DUP LIT "a EQU  ,&bel JCN
+    DUP LIT "b EQU   ,&bs JCN
+    DUP LIT "t EQU  ,&tab JCN
+    DUP LIT "n EQU   ,&nl JCN
+    DUP LIT "v EQU ,&vtab JCN
+    DUP LIT "f EQU   ,&ff JCN
+    DUP LIT "r EQU   ,&cr JCN
+    &default ;c-char JMP2
+    &bel  POP #07 ,&default JMP
+    &bs   POP #08 ,&default JMP
+    &tab  POP #09 ,&default JMP
+    &nl   POP #0a ,&default JMP
+    &vtab POP #0b ,&default JMP
+    &ff   POP #0c ,&default JMP
+    &cr   POP #0d ,&default JMP
+
+( called when we read any other character )
+( )
+( allocates a literal-node and continues. )
+@c-char ( c^ -> r2* )
+    ;alloc-lit JSR2 ( lit )
+    DUP2 ;c-peek-and-finalize JMP2
+
+( called if we parse a "*" )
+( )
+( actually calling this means the code broke an invariant somewhere. )
+@c-star ( c^ -> regex* )
+    POP
+    ;star-invariant ;error!! JSR2
+
+( called if we parse a "+" )
+( )
+( actually calling this means the code broke an invariant somewhere. )
+@c-plus ( c^ -> regex* )
+    POP
+    ;plus-invariant ;error!! JSR2
+
+( called if we parse a "?" )
+( )
+( actually calling this means the code broke an invariant somewhere. )
+@c-qmark ( c^ -> regex* )
+    POP
+    ;qmark-invariant ;error!! JSR2
+
+( ALLOCATING REGEX NDOES )
+
+@rx-node-sizes
+    ( 00 01 02 03     04 05 06 07     08 09 0a 0b )
+    [ 00 03 03 04 ] [ 05 05 03 03 ] [ 04 04 00 00 ]
+
+@alloc3 ( mode^ -> r* )
+    #0000 ROT ( 00 00 mode^ )
+    #03 ;alloc JSR2 ( 00 00 mode^ addr* )
+    STH2k STA ( addr <- mode )
+    STH2kr INC2 STA2 ( addr+1 <- 0000 )
+    STH2r JMP2r ( return addr )
+
+@alloc-empty ( -> r* )
+    #01 ;alloc3 JMP2
+
+@alloc-lit ( c^ -> r* )
+    #03 #0000 SWP2 ( 0000 c^ 03 )
+    #04 ;alloc JSR2 ( 0000 c^ 03 addr* )
+    STH2k STA ( addr <- 03 )
+    STH2kr INC2 STA ( addr+1 <- c )
+    STH2kr #0002 ADD2 STA2 ( addr+2 <- 0000 )
+    STH2r JMP2r ( return addr )
+
+@alloc-or ( right* left* -> r* )
+    #05 ;alloc JSR2 STH2 ( r l [x] )
+    #04 STH2kr            STA ( r l [x] )
+        STH2kr       INC2 STA2 ( r [x] )
+        STH2kr #0003 ADD2 STA2 ( [x] )
+    STH2r JMP2r
+
+@alloc-star ( expr* -> r* )
+    #05 ;alloc JSR2 STH2  ( expr [r] )
+    #05 STH2kr STA        ( expr [r] )
+    DUP2 STH2kr INC2 STA2 ( expr [r] )
+    #0000 STH2kr #0003 ADD2 STA2 ( expr [r] )
+    STH2kr SWP2 ( r expr [r] )
+    ;set-next JSR2 ( [r] )
+    STH2r JMP2r
+
+@alloc-plus ( expr* -> r* )
+    #05 ;alloc JSR2 STH2  ( expr [r] )
+    #05 STH2kr STA        ( expr [r] )
+    DUP2 STH2kr INC2 STA2 ( expr [r] )
+    #0000 STH2kr #0003 ADD2 STA2 ( expr [r] )
+    STH2r SWP2 STH2k ( r expr [expr] )
+    ;set-next JSR2 ( [expr] )
+    STH2r JMP2r
+
+@alloc-qmark ( expr* -> r* )
+    ;alloc-empty JSR2 STH2k ( expr e [e] )
+    OVR2 ;set-next JSR2 ( expr [e] )
+    #05 ;alloc JSR2 STH2  ( expr [r e] )
+    #04 STH2kr STA        ( expr [r e] )
+    STH2kr INC2 STA2 ( [r e] )
+    SWP2r STH2r STH2kr ( e r [r] )
+    #0003 ADD2 STA2 ( [r] )
+    STH2r JMP2r
+
+( if r is 0000, allocate an empty node )
+@alloc-if-null ( r* -> r2* )
+    ORAk ,&return JCN POP2 ;alloc-empty JSR2 &return JMP2r
+
+( unroll one region of the parsing stack, returning )
+( a single node consisting of an alternation of )
+( all elements on the stack. )
+( )
+( this unrolls until it hits #ffff #ffff, which it )
+( also removes from the stack. )
+@unroll-stack ( -> start* end* )
+    ;pop4 JSR2 STH2 ( r )
+    #00 STH ( count items in stack frame )
+    ;alloc-if-null JSR2 ( replace 0000 with empty )
+    &loop ( r* )
+    ;pop4 JSR2 POP2 ( r x )
+    DUP2 #ffff EQU2 ( r x x-is-end? ) ,&done JCN
+    INCr ( items++ )
+    ;alloc-or JSR2 ( r|x ) ,&loop JMP
+    &done
+    ( r ffff )
+    POP2
+    STHr ,&is-or JCN
+    STH2r JMP2r
+    &is-or
+    POP2r
+    ;alloc-empty JSR2 OVR2 OVR2 SWP2 ( r empty empty r )
+    ;set-next-or JSR2
+    JMP2r
+
+( add r to the top of the stock. )
+( )
+( in particular, this will write r into tail.next )
+( before replacing tail with r. )
+@push-next ( r0 r1 -> )
+    ;pop4 JSR2 ( r0 r1 x0 x1 )
+    DUP2 #0000 EQU2 ( r0 r1 x0 x1 x1=0? ) ,&is-zero JCN
+    STH2 ROT2 STH2r ( r1 x0 r0 x1 )
+    ;set-next JSR2 SWP2 ( x0 r1 )
+    ;push4 JSR2
+    JMP2r
+    &is-zero POP2 POP2 ;push4 JMP2
+
+( load the given address:  )
+( )
+(  1. if it points to 0000, update it to target )
+(  2. otherwise, call set-next on it )
+@set-next-addr ( target* addr* -> )
+    LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
+    LDA2 ;set-next JMP2
+    &is-zero STA2 JMP2r
+
+( set regex.next to target )
+( )
+( node types 1-7 are defined. )
+( )
+( all node types except star (5) and lit (3) store their next )
+( pointer one byte off of their own address. )
+( )
+( since both branches of an or (4) node are supposed to meet )
+( back up we only bother taking the left branch. otherwise )
+( you can end up double-appending things. )
+@set-next ( target* regex* -> )
+    LDAk #01 LTH ,&unknown JCN
+    LDAk #0b GTH ,&unknown JCN
+    LDAk #09 GTH ,&cc JCN
+    LDAk #00 SWP ;rx-node-sizes ADD2
+        LDA #00 SWP ADD2 #0002 SUB2
+        ;set-next-addr JMP2
+    &cc INC2 ;set-next-addr JMP2
+    &unknown LDAk #ee ;unknown-node-type ;error!! JSR2
+
+@set-next-or-addr ( target* addr* -> )
+    LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
+    LDA2 ;set-next-or JMP2
+    &is-zero STA2 JMP2r
+
+( this is used when first building or-nodes )
+( structure will always be: )
+( [x1, [x2, [x3, ..., [xm, xn]]]] )
+( so we recurse on the right side but not the left. )
+@set-next-or ( target* regex* -> )
+    LDAk #04 NEQ ,&!4 JCN
+    OVR2 OVR2 INC2 ;set-next-addr JSR2
+        #0003 ADD2 ;set-next-or-addr JMP2
+    &!4 ;set-next JMP2
+
+( STACK OPERATIONS )
+( )
+( we always push/pop 4 bytes at a time. the stack has a fixed )
+( maximum size it can use, defined by ;stack-top. )
+( )
+( the stack can be cleared using ;reset-stack, which resets )
+( the stack pointers but does not zero out any memory. )
+( )
+( stack size is 4096 bytes here but is configurable. )
+( in some cases it could be very small but this will limit )
+( how many branches can be parsed and executed. )
+
+( push 4 bytes onto the stack )
+@push4 ( str* regex* -> )
+    ;assert-stack-avail JSR2 ( check for space )
+    ;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex )
+    ;stack-pos LDA2 STA2 ( cell[0:1] <- str )
+    ;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 )
+    JMP2r
+
+( pop 4 bytes from the stack )
+@pop4 ( -> str* regex* )
+    ;assert-stack-exist JSR2 ( check for space )
+    ;stack-pos LDA2 ( load stack-pos )
+    #0002 SUB2 LDA2k STH2 ( pop and stash regex )
+    #0002 SUB2 LDA2k STH2 ( pop and stash str )
+    ;stack-pos STA2 ( save new stack-pos )
+    STH2r STH2r ( restore str and regex )
+    JMP2r
+
+( reset stack pointers )
+@reset-stack ( -> )
+    ;stack-bot ;stack-pos STA2 JMP2r ( pos <- 0 )
+
+( can more stack be allocated? )
+@stack-avail ( -> bool^ )
+    ;stack-pos LDA2 ;stack-top LTH2 JMP2r
+
+( is the stack non-empty? )
+@stack-exist ( -> bool^ )
+    ;stack-pos LDA2 ;stack-bot GTH2 JMP2r
+
+( error if stack is full )
+@assert-stack-avail ( -> )
+    ;stack-avail JSR2 ,&ok JCN ;stack-is-full ;error!! JSR2 &ok JMP2r
+
+( error is stack is empty )
+@assert-stack-exist ( -> )
+    ;stack-exist JSR2 ,&ok JCN ;stack-is-empty ;error!! JSR2 &ok JMP2r
+
+( stack-pos points to the next free stack position (or the top if full). )
+@stack-pos :stack-bot ( the next position to insert at )
+
+( stack-bot is the address of the first stack position. )
+( stack-top is the address of the first byte beyond the stack. )
+@stack-bot $800 @stack-top ( holds 512 steps (2048 bytes) )
+
+( ARENA OPERATIONS )
+( )
+( the arena represents a heap of memory that can easily be )
+( allocated in small amounts. )
+( )
+( the entire arena can be reclaimed using ;reset-arena, but )
+( unlike systems such as malloc/free, the arena cannot relcaim )
+( smaller amounts of memory. )
+( )
+( the arena is used to allocate regex graph nodes, which are )
+( dynamically-allocated as the regex string is parsed. once )
+( a regex is no longer needed the arena may be reclaimed. )
+( )
+( arena size is 1024 bytes here but is configurable. )
+( smaller sizes would likely be fine but will limit the )
+( overall complexity of regexes to be parsed and executed. )
+
+( reclaim all the memory used by the arena )
+@reset-arena ( -> )
+    ;arena-bot ;arena-pos STA2 JMP2r
+
+( currently caller is responsible for zeroing out memory if needed )
+@alloc ( size^ -> addr* )
+    #00 SWP ( size* )
+    ;arena-pos LDA2 STH2k ADD2 ( pos+size* [pos] )
+    DUP2 ;arena-top GTH2 ( pos+size pos+size>top? [pos] )
+    ,&error JCN ( pos+size [pos] )
+    ;arena-pos STA2 ( pos += size [pos] )
+    STH2r JMP2r ( pos )
+    &error POP2 POP2r ;arena-is-full ;error!! JSR2
+
+@arena-pos :arena-bot ( the next position to allocate )
+@arena-bot $400 @arena-top ( holds up to 1024 bytes )
+
+( SUBGROUP OPERATIONS )
+( )
+( subgroups are parts of the input string that are matched by )
+( parenthesized subgroup expressions in a regex. )
+( )
+( for example, (a*)(b*)(c*) has 3 subgroup expressions. )
+( )
+( during matching, subgroups are represented by 5-bytes: )
+( )
+( - byte 1: subgroup index (1-255, 0 is a marker) )
+( - bytes 2-3: absolute address of the start of the subgroup )
+( - bytes 4-5: absolute address of the limit of the subgroup )
+( )
+( this means that to get a null-terminated subgroup string )
+( you will need to copy it somewhere else with enough space, )
+( or else mutate the input string to add a null. )
+( )
+( since input strings themselves are null-terminated, and since )
+( subgroups never include null terminators, we will always have )
+( a valid limit value even for input strings that end at #ffff. )
+( )
+( during regex parsing we will use subgroup-pos to track the )
+( next available subgroup position. )
+( )
+( some regular expressions will write to a subgroup multiple times. )
+( for example when matching ((.)x)+ against "axbx": )
+( )
+( - subgroup 1 will contain "bx" )
+( - subgroup 2 will contain "b" )
+( )
+( this may necessitate backtracking. when matching ((.)x|(.)y)+ )
+( against "axby" we will make the following assignments: )
+( )
+( - position 1: )
+(   + start subgroup 1 )
+(   + start then finish subgroup 2: "a" )
+( - position 2: )
+(   + finish subgroup 1: "ax" )
+( - position 3: )
+(   + start subgroup 1 )
+(   + start then finish subgroup 2: "b" )
+( - position 4: )
+(   + backtrack, reverting subgroup 2 to "a" )
+( - back to position 3 again: )
+(   + start then finish subgroup 3: "b" )
+( - position 4 again: )
+(   + finish subgruop 1: "by" )
+( )
+( the final subgroups will be: {1: "by", 2: "a", 3: "b"} )
+
+@subgroup-start ( s* i^ -> )
+    ;subgroup-pos LDA2 STH2k ( s* i^ pos* [pos*] )
+    ;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups )
+    &next ( s* i^ [pos*] )
+        STH2kr STA
+        STH2r INC2 STA2
+        JMP2r
+
+@subgroup-finish ( s* i^ -> )
+    ;subgroup-pos LDA2 STH2k ( s* i^ pos* [pos*] )
+    ;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups )
+    &next ( s* i^ [pos*] )
+        STH2kr LDA EQU ,&ok JCN #0000 DIV ( mismatched subgroups )
+    &ok ( s* [pos*] )
+        STH2kr #0003 ADD2 STA2
+        STH2r #0005 ADD2 ;subgroup-pos STA2
+        JMP2r
+
+@subgroup-branch ( -> )
+    ;subgroup-pos LDA2 STH2k ( pos* [pos*] )
+    ;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups )
+    &next
+        #00 STH2kr STA ( [*pos] )
+        STH2r #0005 ADD2 ;subgroup-pos STA2
+        JMP2r
+
+@subgroup-backtrack ( -> )
+    ;subgroup-bot ;subgroup-pos LDA2 ( bot* pos* )
+    &loop ( bot* pos* )
+        EQU2k ,&done JCN
+        LDAk #00 EQU ,&done JCN
+        #0005 SUB2 ,&loop JMP
+    &done ( bot* pos* )
+        NIP2 ;subgroup-pos STA2
+        JMP2r
+
+( does not zero out the memory in question )
+@subgroup-reset ( -> )
+    ;subgroup-bot ;subgroup-pos STA2
+    JMP2r
+
+@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup item )
+@subgroup-bot $280 @subgroup-top ( holds up to 128 subgroup assignments (640 bytes) )