start implementing subgroups

2022-02-21 15:59:13 -05:00 · 2022-02-21 15:59:13 -05:00 · 8acef622f4
parent 03c4aa5120
commit 8acef622f4
1 changed files with 125 additions and 48 deletions
--- a/regex.tal
+++ b/regex.tal
@ -38,16 +38,24 @@
 (   dot     matches any one char             [ #02 next* ]         )
 (   lit     matches one specific char (c)    [ #03 c^ next* ]      )
 (   or      matches either left or right     [ #04 left* right* ]  )
-(   star    matches expr zero-or-more times  [ #05 r* next* ]      )
+(   star    matches expr zero-or-more times  [ #05 expr* next* ]   )
 (           (NOTE: r.expr.next must be r)                          )
-( )
 (   caret   matches start of line/string     [ #06 next* ]         )
 (   dollar  matches end of line/string       [ #07 next* ]         )
+(   lpar    starts subgroup region           [ #08 i* next* ]      )
+(   rpar    ends subgroup region             [ #09 i* next* ]      )
 (                                                                  )
 ( `or` and `star` have the same structure and are handled by the   )
 ( same code (;do-or). however, the node types are kept different   )
 ( to make it clearer how to parse and assemble the nodes.          )
 (                                                                  )
+( dollar nodes contain a next pointer even though this usually     )
+( will not be needed.                                              )
+(                                                                  )
+( lpar and rpar contain addresses pointing between subgroup-bot    )
+( and subgroup-top. rpar's address will always be +2 relative to   )
+( the corresponding lpar address.                                  )
+(                                                                  )
 ( concatenation isn't a node, it is implied by the *next addr.     )
 ( a next value of #0000 signals the end of the regex.              )
 (                                                                  )
@ -55,6 +63,15 @@
 ( regexes should not include nulls and cannot match them (other    )
 ( than the null which signals the end of a string).                )

+( TODO: we have lpar and rpar nodes but aren't using them yet      )
+( 1. need to modify c-lpar and c-rpar                              )
+( 2. we need to store subgroup-pos in regions during parsing:      )
+(   a. need to store the current pos in the region                 )
+(   b. need to call start to move subgroup-pos forward             )
+( 3. when finishing parsing a region we need lpar/rpar nodes       )
+( 4. we also need to store "last started subgroup" on the stack    )
+( 5. when backtracking we must rewind to "last started" subgroup   )
+
 %debug { #ff #0e DEO }
 %emit { #18 DEO }
 %space { #20 emit }
@ -144,6 +161,8 @@
    LDAk #05 EQU ;do-or      JCN2 ( same code as the or case )
    LDAk #06 EQU ;do-caret   JCN2
    LDAk #07 EQU ;do-dollar  JCN2
+    LDAk #08 EQU ;do-lpar    JCN2
+    LDAk #09 EQU ;do-rpar    JCN2
    ;unknown-node-type ;error! JSR2

 ( used when we hit a dead-end during matching. )
@ -169,28 +188,46 @@
    INC2 LDA2 ( load next )
    ;goto-next JMP2 ( jump to next )

+( handle lpar -- pass the current string position and the node's )
+( stored address i to subgroup-start, then continue with the next )
+( node. the string position itself is not advanced. )
+@do-lpar ( str* regex* -> bool^ )
+    STH2 DUP2 ( s s [r] -- keep one copy of s for goto-next )
+    INC2r LDA2kr STH2r ( s s i [r+1] -- i is the short stored at r+1 )
+    ;subgroup-start JSR2 ( s [r+1] -- consumes the extra s and i )
+    STH2r INC2 INC2 ( s r+3 -- address of the next pointer )
+    LDA2 ;goto-next JMP2 ( jump to next )
+
+( handle rpar -- pass the current string position and the node's )
+( stored address i to subgroup-finish, then continue with the next )
+( node. the string position itself is not advanced. )
+@do-rpar ( str* regex* -> bool^ )
+    STH2 DUP2 ( s s [r] -- keep one copy of s for goto-next )
+    INC2r LDA2kr STH2r ( s s i [r+1] -- i is the short stored at r+1 )
+    ;subgroup-finish JSR2 ( s [r+1] -- consumes the extra s and i )
+    STH2r INC2 INC2 ( s r+3 -- address of the next pointer )
+    LDA2 ;goto-next JMP2 ( jump to next )
+
 ( handle dot -- match any one character )
@do-dot ( str* regex* -> bool^ )
-    INC2 LDA2 STH2 ( load and stash next )
-    LDAk #00 NEQ ,&non-empty JCN ( is there a char? )
+    ( next waits on the return stack while the current char is inspected )
+    INC2 LDA2 STH2                             ( load and stash next )
+    LDAk #00 NEQ ,&non-empty JCN               ( is there a char? )
    &backtrack POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
-    &non-empty LDAk #0a NEQ ,&match JCN ( yes, match unless \n in search-mode )
-    ;search-mode LDA ,&backtrack JCN ( if \n and search-mode, treat as EOF )
-    &match INC2 STH2r ;goto-next JMP2 ( on match: inc s, restore and jump )
+    &non-empty LDAk #0a NEQ ,&match JCN        ( yes, match unless \n in search-mode )
+    ;search-mode LDA ,&backtrack JCN           ( if \n and search-mode, treat as EOF )
+    &match INC2 STH2r ;goto-next JMP2          ( on match: inc s, restore and jump )

-( TODO: support multi-line=0 )
+( handle caret -- match string start (or possibly after newline) without advancing )
@do-caret ( str* regex* -> bool^ )
-    INC2 LDA2 STH2 ( load and stash next )
+    INC2 LDA2 STH2                              ( load and stash next )
    DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? )
-    POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
-    &at-start STH2r ;goto-next JMP2 ( yes, goto next without advancing str )    
+    ;match-multiline LDA ,&no-match JCN         ( are we in multi-line mode? )
+    ( peek at the previous char through a copy so that s itself stays on the stack )
+    DUP2 #0001 SUB2 LDA #0a EQU ,&at-start JCN  ( just after newline? )
+    &no-match POP2r POP2 ;goto-backtrack JMP2   ( clear stacks and backtrack )
+    &at-start STH2r ;goto-next JMP2             ( go to next without advancing )    

-( TODO: support multi-line=0 )
+( handle dollar -- match string end (or possibly before newline) without advancing )
@do-dollar  ( str* regex* -> bool^ )
-    INC2 LDA2 STH2 ( load and stash next )
-    LDAk #00 EQU ,&at-end JCN ( at string end? )
-    POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )    
-    &at-end STH2r ;goto-next JMP2 ( yes, goto next without advancing str )
+    INC2 LDA2 STH2                            ( load and stash next )
+    LDAk #00 EQU ,&at-end JCN                 ( at string end? )
+    ;match-multiline LDA ,&no-match JCN       ( are we in multi-line mode? )
+    LDAk #0a EQU ,&at-end JCN                 ( at newline? )
+    &no-match POP2r POP2 ;goto-backtrack JMP2 ( clear stacks and backtrack )    
+    &at-end STH2r ;goto-next JMP2             ( go to next without advancing )

 ( handle literal -- match one specific character )
@do-literal ( str* regex* -> bool^ )
@ -229,17 +266,6 @@
@search-start $2
@search-end   $2

-(
-( used for subgroup match start/end addresses )
-@groups [ &s0 $2 &e0 $2 &s1 $2 &e1 $2 
-          &s2 $2 &e2 $2 &s3 $2 &e3 $2 
-          &s4 $2 &e4 $2 &s5 $2 &e5 $2 
-          &s6 $2 &e6 $2 &s7 $2 &e7 $2 
-          &s8 $2 &e8 $2 &s9 $2 &e9 $2 ]
-
-( position of last finished group )
-@group-pos $2 )
-
 ( track the position in the input string )
@pos $2

@ -397,7 +423,7 @@
 ( allocates a dot-node and continues. )
@c-dot ( c^ -> r2* )
    POP
-    ;alloc-dot JSR2 ( dot )
+    #02 ;alloc3 JSR2 ( dot -- #02 is the dot node type )
    DUP2 ;c-peek-and-finalize JMP2

 ( called when we read "^" )
@ -405,7 +431,7 @@
 ( allocates a caret-node and continues. )
@c-caret ( c^ -> r2* )
    POP
-    ;alloc-caret JSR2 ( caret )
+    #06 ;alloc3 JSR2 ( caret -- must be JSR2 so control returns here for the finalize step )
    DUP2 ;c-peek-and-finalize JMP2

 ( called when we read "$" )
@ -413,7 +439,7 @@
 ( allocates a dollar-node and continues. )
@c-dollar ( c^ -> r2* )
    POP
-    ;alloc-dollar JSR2 ( dollar )
+    #07 ;alloc3 JSR2 ( dollar -- must be JSR2 so control returns here for the finalize step )
    DUP2 ;c-peek-and-finalize JMP2

 ( called when we read "\" )
@ -479,15 +505,6 @@
@alloc-empty ( -> r* )
    #01 ;alloc3 JMP2 ( jumps into alloc3 with type #01; alloc3 presumably returns straight to our caller )

-@alloc-dot ( -> r* )
-    #02 ;alloc3 JMP2
-
-@alloc-caret ( -> r* )
-    #06 ;alloc3 JMP2
-
-@alloc-dollar ( -> r* )
-    #07 ;alloc3 JMP2
-
@alloc-lit ( c^ -> r* )
    #03 #0000 SWP2 ( 0000 c^ 03 )
    #04 ;alloc JSR2 ( 0000 c^ 03 addr* )
@ -584,16 +601,23 @@
    &is-zero STA2 JMP2r

 ( set regex.next to target )
+( )
+( node types 1-9 are defined. )
+( )
+( lit (3) stores its next pointer two bytes off of its own address. )
+( star (5), lpar (8) and rpar (9) store it three bytes off. )
+( all other node types store it one byte off. )
+( )
+( since both branches of an or (4) node are supposed to meet )
+( back up we only bother taking the left branch. otherwise )
+( you can end up double-appending things. )
@set-next ( target* regex* -> )
-        LDAk #01 NEQ ,&!1 JCN       INC2 ;set-next-addr JSR2 JMP2r
-    &!1 LDAk #02 NEQ ,&!2 JCN       INC2 ;set-next-addr JSR2 JMP2r
-    &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
-    &!3 LDAk #04 NEQ ,&!4 JCN       INC2 ;set-next-addr JSR2 JMP2r
-    &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
-(    &!5 ;unknown-node-type ;error! JSR2 )
-    &!5 LDAk #06 NEQ ,&!6 JCN       INC2 ;set-next-addr JSR2 JMP2r
-    &!6 LDAk #07 NEQ ,&!7 JCN       INC2 ;set-next-addr JSR2 JMP2r
-    &!7 ;unknown-node-type ;error! JSR2
+        LDAk #01 LTH ,&unknown JCN                       ( type < 1 is invalid )
+        LDAk #09 GTH ,&unknown JCN                       ( type > 9 is invalid )
+        LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ,&continue JMP  ( lit: next at +2 )
+    &!3 LDAk #05 EQU ,&off3 JCN                          ( star: next at +3 )
+        LDAk #08 LTH ,&off1 JCN                          ( 1, 2, 4, 6, 7: next at +1 )
+    &off3 #0003 ADD2 ,&continue JMP                      ( star, lpar, rpar )
+    &off1 INC2
+    &continue ;set-next-addr JSR2 JMP2r
+    &unknown ;unknown-node-type ;error! JSR2

@set-next-or-addr ( target* addr* -> )
    LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
@ -711,6 +735,59 @@
@arena-pos :arena-bot ( the next position to allocate )
@arena-bot $400 @arena-top ( holds up to 1024 bytes )

+( SUBGROUP OPERATIONS )
+( )
+( subgroups are parts of the input string that are matched by )
+( parenthesized subgroup expressions in a regex. )
+( )
+( for example, (a*)(b*)(c*) has 3 subgroup expressions. )
+( )
+( during matching, subgroups are represented by 4 bytes )
+( which are interpreted as two short values: )
+( )
+( - bytes 0-1: absolute address of the start of the subgroup )
+( - bytes 2-3: absolute address of the limit of the subgroup )
+( )
+( this means that to get a null-terminated subgroup string )
+( you will need to copy it somewhere else with enough space, )
+( or else mutate the input string to add a null. )
+( )
+( since input strings themselves are null-terminated, and since )
+( subgroups never include null terminators, we will always have )
+( a valid limit value even for input strings that end at #ffff. )
+( )
+( during regex parsing we will use subgroup-pos to track the )
+( next available subgroup position. )
+
+( store s at address i. if i is not below subgroup-pos, )
+( subgroup-pos is first moved forward to i+4. )
+@subgroup-start ( s* i* -> )
+    DUP2 ;subgroup-pos LDA2 LTH2 ,&write JCN ( s i -- i < subgroup-pos? then just write )
+    DUP2 #0004 ADD2 ;subgroup-pos STA2 ( s i -- otherwise subgroup-pos = i+4 )
+    &write STA2 JMP2r ( store s at i )
+
+( store s at address i. unlike subgroup-start this never )
+( touches subgroup-pos. )
+@subgroup-finish ( s* i* -> )
+    STA2 JMP2r ( store s at i )
+
+( rewind subgroup allocation to i: zero every 4-byte subgroup slot )
+( from subgroup-pos - 4 down to i (inclusive), then set )
+( subgroup-pos to i. )
+@subgroup-backtrack ( i* -> )
+    ;subgroup-pos LDA2               ( i pos )
+    &loop #0004 SUB2                 ( i p -- step down to the previous slot )
+          GTH2k ,&done JCN           ( i p -- stop once p has dropped below i )
+          #0000 OVR2 STA2            ( i p -- clear bytes 0-1 of the slot )
+          #0000 OVR2 #0002 ADD2 STA2 ( i p -- clear bytes 2-3 of the slot )
+          ,&loop JMP
+    &done POP2 ;subgroup-pos STA2    ( drop p, subgroup-pos = i )
+          JMP2r
+
+( set subgroup-pos back to subgroup-bot and write #0000 to every )
+( short from subgroup-bot up to (not including) subgroup-top. )
+@subgroup-reset ( -> )
+    ;subgroup-bot ;subgroup-pos STA2 ( subgroup-pos = subgroup-bot )
+    ;subgroup-top ;subgroup-bot LIT2r 0000 ( top addr [0000] )
+    &loop GTH2k ,&continue JCN ( keep going while top > addr )
+          POP2 POP2 POP2r JMP2r ( done: clear both stacks and return )
+    &continue STH2kr OVR2 STA2 ( write 0000 at addr )
+              INC2 INC2 ,&loop JMP ( addr += 2 )
+
+@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup )
+@subgroup-bot $800 @subgroup-top ( holds up to 512 subgroups (2048 bytes) )
+
 ( INTERVAL OPERATIONS )
 ( )
 ( not baked yet )