nxu/regex.tal

( regex.tal                                                        )
(                                                                  )
( compiles regex expression strings into regex nodes, then uses    )
( regex nodes to match input strings.                              )
(                                                                  )
( two methods are currently supported:                             )
(                                                                  )
( 1. match                                                         )
(                                                                  )
( when matching the regex must match the entire string. this means )
( that it is unnecessary to use ^ and $ when matching, since their )
( effect is implied. it also means that dot nodes will match       )
( any characters at all including newlines.                        )
(                                                                  )
( match returns 01 if the string was matched and 00 otherwise.     )
(                                                                  )
( 2. search                                                        )
(                                                                  )
( when searching the regex attempts to find matching substrings    )
( in the given string. this means that after successfully finding  )
( a match, search may be called on the remaining substring to find )
( more matches.                                                    )
(                                                                  )
( when searching, ^ matches the beginning of the string OR a line. )
( $ matches the end of a line OR the end of the entire string.     )
( (the ^ and $ operators aren't yet supported.) the dot nodes will )
( not match newline characters, which must be matched explicitly.  )
(                                                                  )
( search returns 01 if the string was matched and 00 otherwise.    )
( additionally, the @search-start and @search-end addresses will   )
( contain the starting location and match boundary of the matching )
( substring.                                                       )
(                                                                  )
( regex node types:                                                )
(                                                                  )
(   NAME    DESCRIPTION                      STRUCT                )
(   empty   matches empty string             [ #01 next* ]         )
(   dot     matches any one char             [ #02 next* ]         )
(   lit     matches one specific char (c)    [ #03 c^ next* ]      )
(   or      matches either left or right     [ #04 left* right* ]  )
(   star    matches expr zero-or-more times  [ #05 expr* next* ]   )
(           (NOTE: r.expr.next must be r)                          )
(   caret   matches start of line/string     [ #06 next* ]         )
(   dollar  matches end of line/string       [ #07 next* ]         )
(   lpar    starts subgroup region           [ #08 i* next* ]      )
(   rpar    ends subgroup region             [ #09 i* next* ]      )
(                                                                  )
( `or` and `star` have the same structure and are handled by the   )
( same code (;do-or). however, the node types are kept different   )
( to make it clearer how to parse and assemble the nodes.          )
(                                                                  )
( dollar nodes contain a next pointer even though this usually     )
( will not be needed.                                              )
(                                                                  )
( lpar and rpar contain addresses pointing between subgroup-bot    )
( and subgroup-top. rpar's address will always be +2 relative to   )
( the corresponding lpar address.                                  )
(                                                                  )
( concatenation isn't a node, it is implied by the *next addr.     )
( a next value of #0000 signals the end of the regex.              )
(                                                                  )
( in these docs str* is an address to a null-terminated string.    )
( regexes should not include nulls and cannot match them (other    )
( than the null which signals the end of a string).                )

( TODO: we have lpar and rpar nodes but aren't using them yet      )
( 1. need to modify c-lpar and c-rpar                              )
( 2. we need to store subgroup-pos in regions during parsing:      )
(   a. need to store the current pos in the region                 )
(   b. need to call start to move subgroup-pos forward             )
( 3. when finishing parsing a region we need lpar/rpar nodes       )
( 4. we also need to store "last started subgroup" on the stack    )
( 5. when backtracking we must rewind to "last started" subgroup   )

( console and system helper macros. each macro is expanded )
( inline wherever its name is used. )

( write the byte ff to device port 0e )
%debug { #ff #0e DEO }
( write the byte on top of the stack to device port 18 )
%emit { #18 DEO }
( emit a space character )
%space { #20 emit }
( emit a newline character )
%newline { #0a emit }
( write 01 to device port 0f and stop evaluation with BRK )
%quit! { #01 #0f DEO BRK }

( now that uxnasm throws errors about writing into the zero page   )
( we have to do something like this to be able to compile library  )
( code. we have to guess what offset to use since it needs to      )
( avoid conflicting with the program we're included in.            )
(                                                                  )
( remove this if needed when including it in other projects.       )
|2000

( ERROR HANDLING )

( report a fatal error. )
( )
( writes a bang and a space, then the null-terminated message, )
( then a newline, and finally halts through quit! so control )
( never returns to the caller. )
@error! ( msg* -> )
    LIT '! emit space
    &loop
        LDAk #00 EQU ,&done JCN ( stop at the terminator )
        LDAk emit               ( print the current character )
        INC2 ,&loop JMP         ( advance to the next one )
    &done
        POP2 newline quit!

( error messages )
( )
( each label is a null-terminated string handed to error! by )
( the code that detects the problem. the raw byte 20 inserts )
( a space between words and 00 ends the string. )
@unknown-node-type "unknown 20 "node 20 "type 00
@mismatched-parens "mismatched 20 "parenthesis 00
@stack-is-full "stack 20 "is 20 "full 00
@stack-is-empty "stack 20 "is 20 "empty 00
@arena-is-full "arena 20 "is 20 "full 00
@star-invariant "star 20 "invariant 20 "failed 00
@plus-invariant "plus 20 "invariant 20 "failed 00
@qmark-invariant "question 20 "mark 20 "invariant 20 "failed 00

( REGEX MATCHING )

( use stored regex to match against a stored string. )
( )
( regex* should be the address of a compiled regex )
( such as that returned from ;compile. )
( )
( str* should be a null-terminated string. )
( )
( returns true if the regex matches the entire string, )
( and false otherwise. )
( )
( the start of the string is recorded in string-start so that )
( a caret node in the regex can recognize the beginning of )
( the input instead of comparing against a stale address. )
@match ( str* regex* -> bool^ )
    #01 ;match-multiline STA ( anchors only apply at the string boundaries )
    #00 ;search-mode STA     ( the whole string must be consumed )
    OVR2 ;string-start STA2  ( str regex -- remember where the input begins )
    ;reset-stack JSR2        ( discard any stale backtracking entries )
    ;loop JMP2               ( tail call -- loop returns the boolean )

( find the first substring of str* matched by regex*. )
( )
( match-multiline is cleared, so caret and dollar nodes are )
( also tested against newlines inside the string. )
@search ( str* regex* -> bool^ )
    #01 ;search-mode STA
    #00 ;match-multiline STA
    ;_search JMP2

( find the first substring of str* matched by regex*. )
( )
( match-multiline is set, so caret and dollar nodes only )
( succeed at the start and end of the whole string. )
@search-multiline ( str* regex* -> bool^ )
    #01 ;search-mode STA
    #01 ;match-multiline STA
    ;_search JMP2

( shared implementation of search and search-multiline. )
( )
( records the start of the string in string-start, then tries )
( the regex at each successive position of the string. before )
( every attempt the backtracking stack is reset and the )
( candidate position is written to search-start. )
( )
( once the null terminator is reached one last attempt is made )
( there, and its result is returned directly. )
@_search ( str* regex* -> bool^ )
    STH2                          ( s* [r*] )
    DUP2 ;string-start STA2       ( s* [r*] )
    &loop LDAk #00 EQU ,&eof JCN  ( s* [r*] )
          ;reset-stack JSR2       ( s* [r*] )
          DUP2 ;search-start STA2 ( s* [r*] )
          DUP2 STH2kr ;loop JSR2  ( s* b^ [r*] )
          ,&found JCN             ( s* [r*] )
          INC2 ,&loop JMP         ( s+1* [r*] )
    &found POP2 POP2r #01 JMP2r   ( 01 )
    &eof ;reset-stack JSR2        ( s* [r*] )
         DUP2 ;search-start STA2  ( s* [r*] )
         STH2r ;loop JMP2         ( b^ )

( loop used during matching )
( )
( reads the type byte of the node at r* and jumps to the )
( handler for that node type, leaving s* and r* on the stack. )
( an unrecognized type byte is reported through error!. )
( )
( we don't use the return stack here since that )
( complicates the back-tracking we need to do. )
( ultimately this code will issue a JMP2r to )
( return a boolean, which is where the stack )
( effects signature comes from. )
@loop ( s* r* -> bool^ )
    LDAk #01 EQU ;do-empty   JCN2
    LDAk #02 EQU ;do-dot     JCN2
    LDAk #03 EQU ;do-literal JCN2
    LDAk #04 EQU ;do-or      JCN2
    LDAk #05 EQU ;do-or      JCN2 ( same code as the or case )
    LDAk #06 EQU ;do-caret   JCN2
    LDAk #07 EQU ;do-dollar  JCN2
    LDAk #08 EQU ;do-lpar    JCN2
    LDAk #09 EQU ;do-rpar    JCN2
    ;unknown-node-type ;error! JSR2

( called when the current path through the regex has failed. )
( )
( when the backtracking stack is empty there is nothing left )
( to try and false is returned. otherwise the most recently )
( saved string position and node address are popped and )
( matching resumes from them. )
@goto-backtrack ( -> bool^ )
    ;stack-exist JSR2 #00 EQU ,&exhausted JCN ( nothing saved? )
    ;pop4 JSR2 ;goto-next JMP2                ( resume from the saved entry )
    &exhausted #00 JMP2r                      ( report failure )

( follow the given address to continue matching. )
( )
( a next value of 0000 means the regex is exhausted: )
( )
(  - at the end of the string this is a successful match. )
(  - in search mode it is a successful match that ends just )
(    before the current character. )
(  - otherwise unconsumed input remains, so we backtrack. )
( )
( on every successful exit the match boundary is written to )
( search-end, including when the match runs all the way to the )
( end of the string. in match mode that write is harmless. )
@goto-next ( str* next* -> bool^ )
    DUP2 #0000 GTH2 ,&has-next JCN     ( s next -- more nodes to run? )
    POP2                               ( s )
    LDAk #00 EQU ,&accept JCN          ( s -- input fully consumed? )
    ;search-mode LDA ,&accept JCN      ( s -- partial matches allowed? )
    POP2 ;goto-backtrack JMP2          ( leftover input, try another path )
    &accept ;search-end STA2 #01 JMP2r ( record the boundary, return true )
    &has-next ;loop JMP2

( empty node: consumes no input. skip the type byte, load )
( the next pointer stored after it and continue from there. )
@do-empty ( str* regex* -> bool^ )
    INC2 LDA2 ;goto-next JMP2 ( s next )

( lpar node: pass the current string position and the address )
( stored in the node to subgroup-start, then continue with the )
( next pointer stored three bytes into the node. )
@do-lpar ( str* regex* -> bool^ )
    INC2 STH2k           ( s r+1 [r+1] )
    LDA2                 ( s i [r+1] )
    OVR2 SWP2            ( s s i [r+1] )
    ;subgroup-start JSR2 ( s [r+1] )
    STH2r INC2 INC2 LDA2 ( s next )
    ;goto-next JMP2

( rpar node: pass the current string position and the address )
( stored in the node to subgroup-finish, then continue with the )
( next pointer stored three bytes into the node. )
@do-rpar ( str* regex* -> bool^ )
    INC2 STH2k            ( s r+1 [r+1] )
    LDA2                  ( s i [r+1] )
    OVR2 SWP2             ( s s i [r+1] )
    ;subgroup-finish JSR2 ( s [r+1] )
    STH2r INC2 INC2 LDA2  ( s next )
    ;goto-next JMP2

( handle dot -- match any one character )
( )
( fails at the end of the string. a newline is rejected when )
( search-mode is set and accepted otherwise. )
( )
( NOTE -- review: the flag consulted here is search-mode, while )
( do-caret and do-dollar consult match-multiline. confirm this )
( is intended for search-multiline, which sets both flags. )
@do-dot ( str* regex* -> bool^ )
    INC2 LDA2 STH2                             ( load and stash next )
    LDAk #00 NEQ ,&non-empty JCN               ( is there a char? )
    &backtrack POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
    &non-empty LDAk #0a NEQ ,&match JCN        ( yes, match unless \n in search-mode )
    ;search-mode LDA ,&backtrack JCN           ( if \n and search-mode, treat as EOF )
    &match INC2 STH2r ;goto-next JMP2          ( on match: inc s, restore and jump )

( handle caret -- match string start, or just after a newline )
( when match-multiline is clear, without advancing. )
( )
( the look-behind for a newline works on a copy of s so that )
( the string position handed to goto-next is left unchanged. )
@do-caret ( str* regex* -> bool^ )
    INC2 LDA2 STH2                              ( load and stash next )
    DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? )
    ;match-multiline LDA ,&no-match JCN         ( only the string start counts? )
    DUP2 #0001 SUB2 LDA #0a EQU ,&at-start JCN  ( just after newline? s is kept )
    &no-match POP2r POP2 ;goto-backtrack JMP2   ( clear stacks and backtrack )
    &at-start STH2r ;goto-next JMP2             ( go to next without advancing )

( handle dollar -- match string end, or just before a newline )
( when match-multiline is clear, without advancing. )
@do-dollar  ( str* regex* -> bool^ )
    INC2 LDA2 STH2                            ( load and stash next )
    LDAk #00 EQU ,&at-end JCN                 ( at string end? )
    ;match-multiline LDA ,&no-match JCN       ( only the string end counts? )
    LDAk #0a EQU ,&at-end JCN                 ( at newline? )
    &no-match POP2r POP2 ;goto-backtrack JMP2 ( clear stacks and backtrack )
    &at-end STH2r ;goto-next JMP2             ( go to next without advancing )

( literal node: compare the current input character with the )
( character stored in the node. on success consume it and )
( continue with the node's next pointer, otherwise backtrack. )
@do-literal ( str* regex* -> bool^ )
    INC2 STH2k                      ( s r+1 [r+1] )
    INC2 LDA2 SWP2                  ( next s [r+1] )
    LDAk LDAr STHr                  ( next s char c )
    EQU ,&matches JCN               ( next s )
    POP2 POP2 ;goto-backtrack JMP2  ( mismatch, drop both and backtrack )
    &matches
    INC2 SWP2 ;goto-next JMP2       ( s+1 next )

( handle or -- try the left branch but backtrack to the right if needed )
( )
( the current string position and the right pointer are pushed )
( on the backtracking stack, then matching continues with the )
( left pointer. )
( )
( this also handles asteration, since it ends up having the same structure )
@do-or ( str* regex* -> bool^ )
    INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 )
    LDA2 ;push4 JSR2 ( save s and right on the stack for possible backtracking )
    LDA2 ;loop JMP2 ( continue on left branch )

( REGEX PARSING )

( do we match across lines? )
( - should be true when matching )
( - can be true or false when searching )
( - affects syntax of . ^ and $ )
@match-multiline $1

( are we in searching mode? )
( - should be true when searching )
( - should be false when matching )
@search-mode $1

( string-start: address of the first character of the input, )
( written by _search and read by do-caret. )
( search-start: position of the current search attempt, )
( written by _search. )
( search-end: match boundary, written by goto-next. )
@string-start $2
@search-start $2
@search-end   $2

( track the position in the input string )
( this is the regex expression string being compiled. it is )
( set by compile and advanced by read and skip. )
@pos $2

( track how many levels deep we are in parenthesis )
( incremented by c-lpar and decremented by c-rpar. )
@parens $2

( return the character at pos. )
( )
( pos is advanced past the character unless it is the null )
( terminator, so repeated reads at the end of the expression )
( keep returning 00. )
@read ( -> c^ )
    ;pos LDA2 LDAk      ( s c )
    DUP ,&advance JCN   ( s c -- non-null? )
    POP POP2 #00 JMP2r  ( at the terminator, leave pos alone )
    &advance
    STH INC2 ;pos STA2  ( [c] -- pos <- s+1 )
    STHr JMP2r          ( c )

( look at the character at pos without consuming it )
( and report whether it is an asterisk, byte 2a. )
@peek-to-star ( -> is-star^ )
    ;pos LDA2 LDA ( c )
    #2a EQU JMP2r

( look at the character at pos without consuming it )
( and report whether it is a plus sign, byte 2b. )
@peek-to-plus ( -> is-plus^ )
    ;pos LDA2 LDA ( c )
    #2b EQU JMP2r

( look at the character at pos without consuming it )
( and report whether it is a question mark, byte 3f. )
@peek-to-qmark ( -> is-qmark^ )
    ;pos LDA2 LDA ( c )
    #3f EQU JMP2r

( advance pos by one character without reading it )
@skip ( -> )
    ;pos LDA2k ( pos* s* )
    INC2 SWP2  ( s+1* pos* )
    STA2 JMP2r

( TODO: )
( 1. character groups: [] and [^] )
( 2. symbolic escapes, e.g. \n )

( STRETCH GOALS: )
( a. ^ and $ )
( b. counts: {n} and {m,n} )
( c. substring matching, i.e. searching )
( d. subgroup extraction )
( e. back-references, e.g \1 )
( f. non-capturing groups, e.g. (?:) )

( compile an expression string into a regex graph )
( )
( nodes are allocated from the arena, and running out of )
( arena space is reported through error!. )
( )
( the stack is reset here and used for bookkeeping while )
( parsing. the arena is not reset by this routine. )
@compile ( expr* -> regex* )
    ;pos STA2          ( begin reading at the first character )
    ;reset-stack JSR2  ( start from an empty parsing stack )
    #0000 ;parens STA2 ( no groups are open yet )
    ;compile-region JMP2

( the basic strategy here is to build a stack of non-or )
( expressions to be joined together at the end of the )
( region. each stack entry has two regex addresses: )
(   - the start of the regex )
(   - the current tail of the regex )
( when we concatenate a new node to a regex we update )
( the second of these but not the first. )
( )
( the bottom of the stack for a given region is denoted )
( by #ffff #ffff. above that we start with #0000 #0000 )
( to signal an empty node. )
( )
( compile-region pushes those two entries and then falls )
( through into compile-region-loop, which reads one character )
( and jumps to the handler for it. the character is left on )
( the stack for the handler to consume. )
@compile-region ( -> r2* )
    #ffff #ffff ;push4 JSR2 ( stack delimiter )
    #0000 #0000 ;push4 JSR2 ( stack frame start )
@compile-region-loop
    ;read JSR2
    DUP #00 EQU ;c-done JCN2
    DUP LIT '| EQU ;c-or   JCN2
    DUP LIT '. EQU ;c-dot  JCN2
    DUP LIT '^ EQU ;c-caret  JCN2
    DUP LIT '$ EQU ;c-dollar  JCN2
    DUP LIT '( EQU ;c-lpar JCN2
    DUP LIT ') EQU ;c-rpar JCN2
    DUP LIT '\ EQU ;c-esc  JCN2
    DUP LIT '* EQU ;c-star JCN2
    DUP LIT '+ EQU ;c-plus JCN2
    DUP LIT '? EQU ;c-qmark JCN2
                   ;c-char JMP2

( either finalize the given r0/r1 or else wrap it in )
( a star, plus or question mark node if one of those )
( characters is coming up next. )
( )
( r0 is the start of the expression and r1 is its tail. )
( when a suffix is found it is skipped, r1 is dropped, r0 is )
( handed to the matching alloc routine and the result is used )
( as both start and tail. )
( )
( the resulting pair is appended to the current stack frame )
( with push-next and parsing continues. )
( )
( we use this look-ahead approach rather than compiling )
( star nodes directly since the implementation is simpler. )
@c-peek-and-finalize ( r0* r1* -> r2* )
    ;peek-to-star JSR2 ( r0 r1 next-is-star? ) ,&next-is-star JCN
    ;peek-to-plus JSR2 ( r0 r1 next-is-plus? ) ,&next-is-plus JCN
    ;peek-to-qmark JSR2 ( r0 r1 next-is-qmark? ) ,&next-is-qmark JCN
    ,&finally JMP ( r0 r1 )
    &next-is-star ;skip JSR2 POP2 ;alloc-star JSR2 DUP2 ,&finally JMP
    &next-is-plus ;skip JSR2 POP2 ;alloc-plus JSR2 DUP2 ,&finally JMP
    &next-is-qmark ;skip JSR2 POP2 ;alloc-qmark JSR2 DUP2 ,&finally JMP
    &finally ;push-next JSR2 ;compile-region-loop JMP2

( called when the end of the expression string is reached. )
( )
( a non-zero parens count means a group was never closed, )
( which is reported through error!. otherwise the current )
( level of the stack is unrolled, building any or-nodes that )
( are needed, and the start of the result is returned. )
@c-done ( c^ -> r2* )
    POP
    ;parens LDA2 ORA ,&unclosed JCN ( any group still open? )
    ;unroll-stack JSR2              ( start end )
    POP2 JMP2r                      ( return only the start )
    &unclosed ;mismatched-parens ;error! JSR2

( called when a vertical bar is read. )
( )
( or-nodes are not built until the region is unrolled, so all )
( that happens here is that a fresh empty frame is pushed for )
( the next alternative before parsing continues. )
@c-or ( c^ -> r2* )
    POP
    #0000 DUP2 ;push4 JSR2 ( empty start and tail )
    ;compile-region-loop JMP2

( called when an opening parenthesis is read. )
( )
( the parens counter is incremented and control passes to )
( compile-region, which pushes a new delimiter and an empty )
( frame before parsing the contents of the group. )
@c-lpar ( c^ -> r2* )
    POP
    ;parens LDA2k  ( parens* n* )
    INC2 SWP2 STA2 ( parens <- n+1 )
    ;compile-region JMP2

( called when a closing parenthesis is read. )
( )
( this causes us to: )
( )
(  1. check for mismatched parens )
(  2. decrement parens )
(  3. unroll the current region on the stack into one regex node )
(  4. finalize that node and append it to the previous region )
(  5. continue parsing )
( )
( a closing parenthesis with no group open is reported )
( through error!. )
@c-rpar ( c^ -> r2* )
    POP
    ;parens LDA2 #0000 EQU2 ,&mismatched-parens JCN
    ;parens LDA2 #0001 SUB2 ;parens STA2 ( parens-- )
    ;unroll-stack JSR2
    ;c-peek-and-finalize JMP2
    &mismatched-parens ;mismatched-parens ;error! JSR2

( called when a dot is read. )
( )
( allocates a dot-node, type 02, and hands it to )
( c-peek-and-finalize as both start and tail. )
@c-dot ( c^ -> r2* )
    POP
    #02 ;alloc3 JSR2
    DUP2 ;c-peek-and-finalize JMP2

( called when a caret is read. )
( )
( allocates a caret-node, type 06, and hands it to )
( c-peek-and-finalize as both start and tail. )
( )
( alloc3 must be called with JSR2 so that it returns here. )
( jumping to it instead would return to the caller of compile )
( and abandon the rest of the expression. )
@c-caret ( c^ -> r2* )
    POP
    #06 ;alloc3 JSR2
    DUP2 ;c-peek-and-finalize JMP2

( called when a dollar sign is read. )
( )
( allocates a dollar-node, type 07, and hands it to )
( c-peek-and-finalize as both start and tail. )
( )
( alloc3 must be called with JSR2 so that it returns here. )
( jumping to it instead would return to the caller of compile )
( and abandon the rest of the expression. )
@c-dollar ( c^ -> r2* )
    POP
    #07 ;alloc3 JSR2
    DUP2 ;c-peek-and-finalize JMP2

( called when a backslash is read. )
( )
( reads the following character and handles the special )
( sequences: \a \b \t \n \v \f \r )
( each of those is replaced by its control code before being )
( compiled as a literal by c-char. )
( )
( otherwise, allocates a literal of the next character. )
( )
( NOTE -- review: when the backslash is the last character, )
( read returns 00 and that byte is passed on to c-char. confirm )
( whether a trailing backslash should be rejected instead. )
@c-esc ( c^ -> r2* )
    POP ;read JSR2
    DUP LIT 'a EQU  ,&bel JCN
    DUP LIT 'b EQU   ,&bs JCN
    DUP LIT 't EQU  ,&tab JCN
    DUP LIT 'n EQU   ,&nl JCN
    DUP LIT 'v EQU ,&vtab JCN
    DUP LIT 'f EQU   ,&ff JCN
    DUP LIT 'r EQU   ,&cr JCN
    &default ;c-char JMP2
    &bel  POP #07 ,&default JMP
    &bs   POP #08 ,&default JMP
    &tab  POP #09 ,&default JMP
    &nl   POP #0a ,&default JMP
    &vtab POP #0b ,&default JMP
    &ff   POP #0c ,&default JMP
    &cr   POP #0d ,&default JMP

( called when we read any other character )
( )
( allocates a literal-node for c and hands it to )
( c-peek-and-finalize as both start and tail. )
( also used by c-esc once an escape has been decoded. )
@c-char ( c^ -> r2* )
    ;alloc-lit JSR2 ( lit )
    DUP2 ;c-peek-and-finalize JMP2

( called if compile-region-loop reads an asterisk. )
( )
( an asterisk that follows an expression is consumed by the )
( look-ahead in c-peek-and-finalize, so arriving here means the )
( asterisk had nothing to apply to. halts through error!. )
@c-star ( c^ -> regex* )
    POP
    ;star-invariant ;error! JSR2

( called if compile-region-loop reads a plus sign. )
( )
( a plus that follows an expression is consumed by the )
( look-ahead in c-peek-and-finalize, so arriving here means the )
( plus had nothing to apply to. halts through error!. )
@c-plus ( c^ -> regex* )
    POP
    ;plus-invariant ;error! JSR2

( called if compile-region-loop reads a question mark. )
( )
( a question mark that follows an expression is consumed by )
( the look-ahead in c-peek-and-finalize, so arriving here means )
( it had nothing to apply to. halts through error!. )
@c-qmark ( c^ -> regex* )
    POP
    ;qmark-invariant ;error! JSR2

( ALLOCATING REGEX NODES )

( allocate a three byte node: the type byte given in mode )
( followed by a next pointer initialized to 0000. )
( used for the empty, dot, caret and dollar node types. )
@alloc3 ( mode^ -> r* )
    #0000 ROT ( 00 00 mode^ )
    #03 ;alloc JSR2 ( 00 00 mode^ addr* )
    STH2k STA ( addr <- mode )
    STH2kr INC2 STA2 ( addr+1 <- 0000 )
    STH2r JMP2r ( return addr )

( allocate an empty node, type 01, with a null next pointer. )
( tail call: alloc3 returns the address to our caller. )
@alloc-empty ( -> r* )
    #01 ;alloc3 JMP2

( allocate a four byte literal node: the type byte 03, the )
( character c, and a next pointer initialized to 0000. )
@alloc-lit ( c^ -> r* )
    #03 #0000 SWP2 ( 0000 c^ 03 )
    #04 ;alloc JSR2 ( 0000 c^ 03 addr* )
    STH2k STA ( addr <- 03 )
    STH2kr INC2 STA ( addr+1 <- c )
    STH2kr #0002 ADD2 STA2 ( addr+2 <- 0000 )
    STH2r JMP2r ( return addr )

( allocate a five byte or node: the type byte 04, then the )
( left pointer at offset 1 and the right pointer at offset 3. )
( neither branch is modified. )
@alloc-or ( right* left* -> r* )
    #05 ;alloc JSR2 STH2 ( r l [x] )
    #04 STH2kr            STA ( x <- 04 )
        STH2kr       INC2 STA2 ( x+1 <- left )
        STH2kr #0003 ADD2 STA2 ( x+3 <- right )
    STH2r JMP2r

( wrap expr in a star node. )
( )
( the new node r has type 05, expr at offset 1 and a null next )
( pointer at offset 3. set-next then makes the end of expr )
( point back at r, which forms the loop. r is returned, so the )
( expression may be skipped entirely. )
@alloc-star ( expr* -> r* )
    #05 ;alloc JSR2 STH2  ( expr [r] )
    #05 STH2kr STA        ( expr [r] )
    DUP2 STH2kr INC2 STA2 ( expr [r] )
    #0000 STH2kr #0003 ADD2 STA2 ( expr [r] )
    STH2kr SWP2 ( r expr [r] )
    ;set-next JSR2 ( [r] )
    STH2r JMP2r

( wrap expr so that it matches one or more times. )
( )
( builds the same node as alloc-star and links the end of expr )
( back to it, but returns expr rather than the new node. entry )
( is therefore through expr, which must match once before the )
( looping node is reached. )
@alloc-plus ( expr* -> r* )
    #05 ;alloc JSR2 STH2  ( expr [r] )
    #05 STH2kr STA        ( expr [r] )
    DUP2 STH2kr INC2 STA2 ( expr [r] )
    #0000 STH2kr #0003 ADD2 STA2 ( expr [r] )
    STH2r SWP2 STH2k ( r expr [expr] )
    ;set-next JSR2 ( [expr] )
    STH2r JMP2r

( wrap expr so that it matches zero or one times. )
( )
( allocates an empty node e and makes the end of expr point at )
( it. then allocates an or node r, type 04, whose left branch )
( is expr and whose right branch is e. both branches therefore )
( meet at e. r is returned. )
@alloc-qmark ( expr* -> r* )
    ;alloc-empty JSR2 STH2k ( expr e [e] )
    OVR2 ;set-next JSR2 ( expr [e] )
    #05 ;alloc JSR2 STH2  ( expr -- r is now above e on the return stack )
    #04 STH2kr STA        ( r <- 04 )
    STH2kr INC2 STA2 ( r+1 <- expr )
    SWP2r STH2r STH2kr ( e r [r] )
    #0003 ADD2 STA2 ( r+3 <- e )
    STH2r JMP2r

( return r unchanged unless it is 0000, in which case it is )
( replaced by a freshly allocated empty node. )
@alloc-if-null ( r* -> r2* )
    DUP2 #0000 NEQ2 ,&keep JCN ( already a real node? )
    POP2 ;alloc-empty JMP2     ( tail call returns the new node )
    &keep JMP2r

( unroll one region of the parsing stack, returning )
( a single node consisting of an alternation of )
( all elements on the stack. )
( )
( this unrolls until it hits #ffff #ffff, which it )
( also removes from the stack. )
( )
( the first frame popped is the last alternative. its tail is )
( stashed and a null start is replaced by an empty node. each )
( further frame is combined with the result so far through )
( alloc-or, with the older frame as the left branch. )
( )
( if no or node was built, the start and the stashed tail are )
( returned. otherwise a new empty node is allocated, attached )
( to the alternatives through set-next-or, and returned as the )
( end so that every branch meets at it. )
@unroll-stack ( -> start* end* )
    ;pop4 JSR2 STH2 ( r )
    #00 STH ( count items in stack frame )
    ;alloc-if-null JSR2 ( replace 0000 with empty )
    &loop ( r* )
    ;pop4 JSR2 POP2 ( r x )
    DUP2 #ffff EQU2 ( r x x-is-end? ) ,&done JCN
    INCr ( items++ )
    ;alloc-or JSR2 ( r|x ) ,&loop JMP
    &done
    ( r ffff )
    POP2
    STHr ,&is-or JCN
    STH2r JMP2r
    &is-or
    POP2r
    ;alloc-empty JSR2 OVR2 OVR2 SWP2 ( r empty empty r )
    ;set-next-or JSR2
    JMP2r

( add r to the top of the stack. )
( )
( r0 is the start of the new expression and r1 is its tail. )
( the top frame holds x0, the start so far, and x1, its tail. )
( )
( if x1 is 0000 the frame is replaced by r0 and r1. otherwise )
( r0 is written into the next pointer reached from x1, and the )
( frame becomes x0 and r1. )
@push-next ( r0 r1 -> )
    ;pop4 JSR2 ( r0 r1 x0 x1 )
    DUP2 #0000 EQU2 ( r0 r1 x0 x1 x1=0? ) ,&is-zero JCN
    STH2 ROT2 STH2r ( r1 x0 r0 x1 )
    ;set-next JSR2 SWP2 ( x0 r1 )
    ;push4 JSR2
    JMP2r
    &is-zero POP2 POP2 ;push4 JSR2 JMP2r

( load the pointer stored at addr: )
( )
(  1. if it is 0000, overwrite it with target )
(  2. otherwise, call set-next on the node it points to )
( )
( together with set-next this follows a chain of nodes until )
( an unset pointer is found. )
@set-next-addr ( target* addr* -> )
    LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
    LDA2 ;set-next JSR2 JMP2r
    &is-zero STA2 JMP2r

( set regex.next to target )
( )
( only node types 01 through 07 are accepted. any other type )
( byte is reported through error!. )
( )
( the pointer that gets followed is at offset 3 for a star, )
( type 05, at offset 2 for a literal, type 03, and at offset 1 )
( for every other type. the actual update is delegated to )
( set-next-addr. )
( )
( since both branches of an or node, type 04, are supposed to )
( meet back up we only bother taking the left branch. otherwise )
( you can end up double-appending things. )
@set-next ( target* regex* -> )
        LDAk #01 LTH ,&unknown JCN
        LDAk #07 GTH ,&unknown JCN
        LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ,&continue JMP
    &!5 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ,&continue JMP
    &!3 INC2
    &continue ;set-next-addr JSR2 JMP2r
    &unknown ;unknown-node-type ;error! JSR2

( like set-next-addr, but a non-null pointer is followed with )
( set-next-or instead of set-next. used for the right branch )
( of an or node. )
@set-next-or-addr ( target* addr* -> )
    LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
    LDA2 ;set-next-or JSR2 JMP2r
    &is-zero STA2 JMP2r

( this is used when first building or-nodes )
( structure will always be: )
( [x1, [x2, [x3, ..., [xm, xn]]]] )
( so we recurse on the right side but not the left. )
( )
( for an or node, type 04, target is attached to the left )
( branch through set-next-addr and to the right branch through )
( set-next-or-addr. any other node is passed to set-next. )
@set-next-or ( target* regex* -> )
    LDAk #04 NEQ ,&!4 JCN
    OVR2 OVR2 INC2 ;set-next-addr JSR2
        #0003 ADD2 ;set-next-or-addr JSR2 JMP2r
    &!4 ;set-next JMP2

( STACK OPERATIONS )
( )
( we always push/pop 4 bytes at a time. the stack has a fixed )
( maximum size it can use, defined by ;stack-top. )
( )
( the stack can be cleared using ;reset-stack, which resets )
( the stack pointers but does not zero out any memory. )
( )
( stack size is 4096 bytes here but is configurable. )
( in some cases it could be very small but this will limit )
( how many branches can be parsed and executed. )

( push one four byte entry onto the stack. )
( )
( the first short is stored at the current stack position and )
( the second short two bytes above it, then stack-pos moves up )
( by four. a full stack is reported through error!. )
@push4 ( str* regex* -> )
    ;assert-stack-avail JSR2 ( halts if there is no room )
    ;stack-pos LDA2 STH2k    ( str regex cell [cell] )
    INC2 INC2 STA2           ( str [cell] -- cell+2 <- regex )
    STH2kr STA2              ( [cell] -- cell+0 <- str )
    STH2r #0004 ADD2         ( cell+4 )
    ;stack-pos STA2 JMP2r    ( stack-pos <- cell+4 )

( pop one four byte entry from the stack. )
( )
( returns the two shorts in the order they were given to push4 )
( and moves stack-pos down by four. an empty stack is reported )
( through error!. )
@pop4 ( -> str* regex* )
    ;assert-stack-exist JSR2 ( halts if the stack is empty )
    ;stack-pos LDA2 ( load stack-pos )
    #0002 SUB2 LDA2k STH2 ( pop and stash regex )
    #0002 SUB2 LDA2k STH2 ( pop and stash str )
    ;stack-pos STA2 ( save new stack-pos )
    STH2r STH2r ( restore str and regex )
    JMP2r

(
( -> size^ )
@frame-size
    #00 STH ;stack-pos LDA2
    &loop
    #0004 SUB2 LDA2k #ffff EQU2 ,&done JCN
    INCr ,&loop JMP
    &done
    STHr JMP2r )

( empty the stack by pointing stack-pos back at stack-bot. )
( the memory itself is left untouched. )
@reset-stack ( -> )
    ;stack-bot ;stack-pos STA2 JMP2r ( stack-pos <- stack-bot )

( true while stack-pos is still below stack-top, )
( meaning another entry can be pushed. )
@stack-avail ( -> bool^ )
    ;stack-top ;stack-pos LDA2 ( top pos )
    GTH2 JMP2r                 ( top > pos )

( true while stack-pos is above stack-bot, )
( meaning at least one entry can be popped. )
@stack-exist ( -> bool^ )
    ;stack-bot ;stack-pos LDA2 ( bot pos )
    LTH2 JMP2r                 ( bot < pos )

( halt with the stack-is-full message unless )
( there is room for another entry. )
@assert-stack-avail ( -> )
    ;stack-avail JSR2 ,&ok JCN
    ;stack-is-full ;error! JSR2
    &ok JMP2r

( halt with the stack-is-empty message unless )
( there is at least one entry to pop. )
@assert-stack-exist ( -> )
    ;stack-exist JSR2 ,&ok JCN
    ;stack-is-empty ;error! JSR2
    &ok JMP2r

( stack-pos holds the address of the next free stack position, )
( which equals stack-top once the stack is full. it initially )
( contains the address of stack-bot. )
@stack-pos :stack-bot ( the next position to insert at )

( stack-bot is the address of the first stack position. )
( stack-top is the address of the first byte beyond the stack. )
( the 1000 bytes reserved between them are hexadecimal, which )
( is 4096 bytes or 1024 four byte entries. )
@stack-bot $1000 @stack-top ( holds 1024 steps, 4096 bytes )

( ARENA OPERATIONS )
( )
( the arena represents a heap of memory that can easily be )
( allocated in small amounts. )
( )
( the entire arena can be reclaimed using ;reset-arena, but )
( unlike systems such as malloc/free, the arena cannot reclaim )
( smaller amounts of memory. )
( )
( the arena is used to allocate regex graph nodes, which are )
( dynamically-allocated as the regex string is parsed. once )
( a regex is no longer needed the arena may be reclaimed. )
( )
( arena size is 1024 bytes here but is configurable. )
( smaller sizes would likely be fine but will limit the )
( overall complexity of regexes to be parsed and executed. )

( reclaim all the memory used by the arena )
( )
( points arena-pos back at arena-bot. nodes allocated earlier )
( will be overwritten by later allocations. )
@reset-arena ( -> )
    ;arena-bot ;arena-pos STA2 JMP2r

( reserve size bytes in the arena and return the address of )
( the first one. )
( )
( the request fails through error! when it would extend past )
( arena-top. a request that ends exactly at arena-top succeeds. )
( )
( currently caller is responsible for zeroing out memory if needed )
@alloc ( size^ -> addr* )
    #00 SWP ( size* )
    ;arena-pos LDA2 STH2k ADD2 ( pos+size* [pos] )
    DUP2 ;arena-top GTH2 ( pos+size pos+size>top? [pos] )
    ,&error JCN ( pos+size [pos] )
    ;arena-pos STA2 ( pos += size [pos] )
    STH2r JMP2r ( pos )
    &error POP2 POP2r ;arena-is-full ;error! JSR2

( arena-pos holds the address of the next byte to hand out. )
( it initially contains the address of arena-bot. )
@arena-pos :arena-bot ( the next position to allocate )
( arena-bot is the first byte of the arena and arena-top is )
( the first byte beyond it. 400 hexadecimal is 1024 bytes. )
@arena-bot $400 @arena-top ( holds up to 1024 bytes )

( SUBGROUP OPERATIONS )
( )
( subgroups are parts of the input string that are matched by )
( parenthesized subgroup expressions in a regex. )
( )
( for example, (a*)(b*)(c*) has 3 subgroup expressions. )
( )
( during matching, subgroups are represented by 4-bytes )
( which are interpreted as two short values: )
( )
( - bytes 0-1: absolute address of the start of the subgroup )
( - bytes 2-3: absolute address of the limit of the subgroup )
( )
( this means that to get a null-terminated subgroup string )
( you will need to copy it somewhere else with enough space, )
( or else mutate the input string to add a null. )
( )
( since input strings themselves are null-terminated, and since )
( subgroups never include null terminators, we will always have )
( a valid limit value even for input strings that end at #ffff. )
( )
( during regex parsing we will use subgroup-pos to track the )
( next available subgroup position. )

( record s as the start of the subgroup stored at address i. )
( )
( when i is not below subgroup-pos, subgroup-pos is first moved )
( to the entry after i, four bytes further on. )
@subgroup-start ( s* i* -> )
    DUP2 ;subgroup-pos LDA2 LTH2 ,&write JCN ( s i )
    DUP2 #0004 ADD2 ;subgroup-pos STA2 ( s i )
    &write STA2 JMP2r

( store the string position s at address i. do-rpar calls this )
( with the address held in an rpar node. )
@subgroup-finish ( s* i* -> )
    STA2 JMP2r

( rewind the subgroup table to address i. )
( )
( every four byte entry from the one below subgroup-pos down )
( to the one at i is cleared to zero, and subgroup-pos is then )
( set to i. )
( )
( the loop stops as soon as the entry address drops below i. )
( testing the opposite relation would exit before clearing )
( anything whenever entries exist at or above i, and would )
( otherwise keep walking down through unrelated memory. )
@subgroup-backtrack ( i* -> )
    ;subgroup-pos LDA2                ( i pos )
    &loop #0004 SUB2                  ( i p -- step down one entry )
          GTH2k ,&done JCN            ( stop once p is below i )
          #0000 OVR2 STA2             ( clear the start address )
          #0000 OVR2 #0002 ADD2 STA2  ( clear the limit address )
          ,&loop JMP
    &done POP2 ;subgroup-pos STA2     ( subgroup-pos <- i )
          JMP2r

( forget all subgroups. )
( )
( points subgroup-pos back at subgroup-bot and then writes )
( 0000 to every short from subgroup-bot up to subgroup-top. )
@subgroup-reset ( -> )
    ;subgroup-bot ;subgroup-pos STA2
    ;subgroup-top ;subgroup-bot LIT2r 0000 ( top addr [0000] )
    &loop GTH2k ,&continue JCN ( top still above addr? )
          POP2 POP2 POP2r JMP2r ( finished, clear both stacks )
    &continue STH2kr OVR2 STA2 ( addr <- 0000 )
              INC2 INC2 ,&loop JMP ( addr += 2 )

( subgroup-pos initially contains the address of subgroup-bot. )
@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup )
( subgroup-bot is the first byte of the table and subgroup-top )
( is the first byte beyond it. 800 hexadecimal is 2048 bytes. )
@subgroup-bot $800 @subgroup-top ( holds up to 512 subgroups, 2048 bytes )

( INTERVAL OPERATIONS )
( )
( not baked yet )
(
@min ( first* last* -> min-addr* )
    SWP2 STH2k ,&incr JMP ( last first [first] )
    &loop LDAk LDAkr STHr LTH ,&replace JCN ,&incr JMP ( last a [c] )
    &replace POP2r STH2k ( last a [a] )
    &incr EQUk ,&done JCN INC2 ,&loop JMP ( last a+1 [c] )
    &done POP2 POP2 STH2r JMP2r ( c )

@sort ( first* last* -> )
    SWP2 ( last first )
    &loop ;min JSR2 NEQk ,&swap JCN POP2 ,&incr JMP
    &swap STH2 LDA2k ( last first fx [min] ) STH2kr STA STH2r SWP2 ( last min first )
          STH2 LDA2  ( last mx [first] )     STH2kr STA STH2r ( last first )
    &incr EQUk ,&done JCN INC2 ,&loop JMP
    &done POP2 POP2 JMP2r

@iv-in-range ( c^ b0^ b1^ -> bool^ )
    ROT STHk LTH ,&above JCN
        STHr GTH ,&below JCN #01 JMP2r
    &above POPr POP &below #00 JMP2r

@iv-find ( c^ iv* -> bool^ )
    )