From 8acef622f4d0960bcbdc8b2a7266e219dd84ef76 Mon Sep 17 00:00:00 2001 From: d6 Date: Mon, 21 Feb 2022 15:59:13 -0500 Subject: [PATCH] start implementing subgroups --- regex.tal | 173 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 125 insertions(+), 48 deletions(-) diff --git a/regex.tal b/regex.tal index 3d91337..e74abb7 100644 --- a/regex.tal +++ b/regex.tal @@ -38,16 +38,24 @@ ( dot matches any one char [ #02 next* ] ) ( lit matches one specific char (c) [ #03 c^ next* ] ) ( or matches either left or right [ #04 left* right* ] ) -( star matches expr zero-or-more times [ #05 r* next* ] ) +( star matches expr zero-or-more times [ #05 expr* next* ] ) ( (NOTE: r.expr.next must be r) ) -( ) ( caret matches start of line/string [ #06 next* ] ) ( dollar matches end of line/string [ #07 next* ] ) +( lpar starts subgroup region [ #08 i* next* ] ) +( rpar ends subgroup region [ #09 i* next* ] ) ( ) ( `or` and `star` have the same structure and are handled by the ) ( same code (;do-or). however, the node types are kept different ) ( to make it clearer how to parse and assemble the nodes. ) ( ) +( dollar nodes contain a next pointer even though this usually ) +( will not be needed. ) +( ) +( lpar and rpar contain addresses pointing between subgroup-bot ) +( and subgroup-top. rpar's address will always be +2 relative to ) +( the corresponding lpar address. ) +( ) ( concatenation isn't a node, it is implied by the *next addr. ) ( a next value of #0000 signals the end of the regex. ) ( ) @@ -55,6 +63,15 @@ ( regexes should not include nulls and cannot match them (other ) ( than the null which signals the end of a string). ) +( TODO: we have lpar and rpar nodes but aren't using them yet ) +( 1. need to modify c-lpar and c-rpar ) +( 2. we need to store subgroup-pos in regions during parsing: ) +( a. need to store the current pos in the region ) +( b. need to call start to move subgroup-pos forward ) +( 3. 
when finishing parsing a region we need lpar/rpar nodes ) +( 4. we also need to store "last started subgroup" on the stack ) +( 5. when backtracking we must rewind to "last started" subgroup ) + %debug { #ff #0e DEO } %emit { #18 DEO } %space { #20 emit } @@ -144,6 +161,8 @@ LDAk #05 EQU ;do-or JCN2 ( same code as the or case ) LDAk #06 EQU ;do-caret JCN2 LDAk #07 EQU ;do-dollar JCN2 + LDAk #08 EQU ;do-lpar JCN2 + LDAk #09 EQU ;do-rpar JCN2 ;unknown-node-type ;error! JSR2 ( used when we hit a dead-end during matching. ) @@ -169,28 +188,46 @@ INC2 LDA2 ( load next ) ;goto-next JMP2 ( jump to next ) +@do-lpar ( str* regex* -> bool^ ) ( handle lpar -- pass str and the node's i* to ;subgroup-start, then go to next without advancing str ) + STH2 DUP2 ( s s [r] ) + INC2r LDA2kr STH2r ( s s i [r+1] ) + ;subgroup-start JSR2 ( s [r+1] ) + STH2r INC2 INC2 ( s r+3 ) + LDA2 ;goto-next JMP2 ( jump to next ) + +@do-rpar ( str* regex* -> bool^ ) ( handle rpar -- pass str and the node's i* to ;subgroup-finish, then go to next without advancing str ) + STH2 DUP2 ( s s [r] ) + INC2r LDA2kr STH2r ( s s i [r+1] ) + ;subgroup-finish JSR2 ( s [r+1] ) + STH2r INC2 INC2 ( s r+3 ) + LDA2 ;goto-next JMP2 ( jump to next ) + ( handle dot -- match any one character ) @do-dot ( str* regex* -> bool^ ) - INC2 LDA2 STH2 ( load and stash next ) - LDAk #00 NEQ ,&non-empty JCN ( is there a char? ) + INC2 LDA2 STH2 ( load and stash next ) + LDAk #00 NEQ ,&non-empty JCN ( is there a char? 
) &backtrack POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack ) - &non-empty LDAk #0a NEQ ,&match JCN ( yes, match unless \n in search-mode ) - ;search-mode LDA ,&backtrack JCN ( if \n and search-mode, treat as EOF ) - &match INC2 STH2r ;goto-next JMP2 ( on match: inc s, restore and jump ) + &non-empty LDAk #0a NEQ ,&match JCN ( yes, match unless \n in search-mode ) + ;search-mode LDA ,&backtrack JCN ( if \n and search-mode, treat as EOF ) + &match INC2 STH2r ;goto-next JMP2 ( on match: inc s, restore and jump ) -( TODO: support multi-line=0 ) +( handle caret -- match string start (or possibly after newline) without advancing ) @do-caret ( str* regex* -> bool^ ) - INC2 LDA2 STH2 ( load and stash next ) + INC2 LDA2 STH2 ( load and stash next ) DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? ) - POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack ) - &at-start STH2r ;goto-next JMP2 ( yes, goto next without advancing str ) + ;match-multiline LDA ,&no-match JCN ( are we in multi-line mode? ) + DUP2 #0001 SUB2 LDA #0a EQU ,&at-start JCN ( just after newline? peek at s-1 on a copy so s itself is kept ) + &no-match POP2r POP2 ;goto-backtrack JMP2 ( clear stacks and backtrack ) + &at-start STH2r ;goto-next JMP2 ( go to next without advancing ) -( TODO: support multi-line=0 ) +( handle dollar -- match string end (or possibly before newline) without advancing ) @do-dollar ( str* regex* -> bool^ ) - INC2 LDA2 STH2 ( load and stash next ) - LDAk #00 EQU ,&at-end JCN ( at string end? ) - POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack ) - &at-end STH2r ;goto-next JMP2 ( yes, goto next without advancing str ) + INC2 LDA2 STH2 ( load and stash next ) + LDAk #00 EQU ,&at-end JCN ( at string end? ) + ;match-multiline LDA ,&no-match JCN ( are we in multi-line mode? ) + LDAk #0a EQU ,&at-end JCN ( at newline? 
) + &no-match POP2r POP2 ;goto-backtrack JMP2 ( clear stacks and backtrack ) + &at-end STH2r ;goto-next JMP2 ( go to next without advancing ) ( handle literal -- match one specific character ) @do-literal ( str* regex* -> bool^ ) @@ -229,17 +266,6 @@ @search-start $2 @search-end $2 -( -( used for subgroup match start/end addresses ) -@groups [ &s0 $2 &e0 $2 &s1 $2 &e1 $2 - &s2 $2 &e2 $2 &s3 $2 &e3 $2 - &s4 $2 &e4 $2 &s5 $2 &e5 $2 - &s6 $2 &e6 $2 &s7 $2 &e7 $2 - &s8 $2 &e8 $2 &s9 $2 &e9 $2 ] - -( position of last finished group ) -@group-pos $2 ) - ( track the position in the input string ) @pos $2 @@ -397,7 +423,7 @@ ( allocates a dot-node and continues. ) @c-dot ( c^ -> r2* ) POP - ;alloc-dot JSR2 ( dot ) + #02 ;alloc3 JSR2 DUP2 ;c-peek-and-finalize JMP2 ( called when we read "^" ) @@ -405,7 +431,7 @@ ( allocates a caret-node and continues. ) @c-caret ( c^ -> r2* ) POP - ;alloc-caret JSR2 ( caret ) + #06 ;alloc3 JSR2 ( caret -- JSR2, not JMP2, so control returns here for the DUP2 below ) DUP2 ;c-peek-and-finalize JMP2 ( called when we read "$" ) @@ -413,7 +439,7 @@ ( allocates a dollar-node and continues. ) @c-dollar ( c^ -> r2* ) POP - ;alloc-dollar JSR2 ( dollar ) + #07 ;alloc3 JSR2 ( dollar -- JSR2, not JMP2, so control returns here for the DUP2 below ) DUP2 ;c-peek-and-finalize JMP2 ( called when we read "\" ) @@ -479,15 +505,6 @@ @alloc-empty ( -> r* ) #01 ;alloc3 JMP2 -@alloc-dot ( -> r* ) - #02 ;alloc3 JMP2 - -@alloc-caret ( -> r* ) - #06 ;alloc3 JMP2 - -@alloc-dollar ( -> r* ) - #07 ;alloc3 JMP2 - @alloc-lit ( c^ -> r* ) #03 #0000 SWP2 ( 0000 c^ 03 ) #04 ;alloc JSR2 ( 0000 c^ 03 addr* ) @@ -584,16 +601,23 @@ &is-zero STA2 JMP2r ( set regex.next to target ) +( ) +( only node types 1-7 are handled; lpar/rpar (8-9) are not yet. ) +( ) +( all node types except star (5) and lit (3) store their next ) +( pointer one byte off of their own address. ) +( ) +( since both branches of an or (4) node are supposed to meet ) +( back up we only bother taking the left branch. otherwise ) +( you can end up double-appending things. 
) @set-next ( target* regex* -> ) - LDAk #01 NEQ ,&!1 JCN INC2 ;set-next-addr JSR2 JMP2r - &!1 LDAk #02 NEQ ,&!2 JCN INC2 ;set-next-addr JSR2 JMP2r - &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r - &!3 LDAk #04 NEQ ,&!4 JCN INC2 ;set-next-addr JSR2 JMP2r - &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r -( &!5 ;unknown-node-type ;error! JSR2 ) - &!5 LDAk #06 NEQ ,&!6 JCN INC2 ;set-next-addr JSR2 JMP2r - &!6 LDAk #07 NEQ ,&!7 JCN INC2 ;set-next-addr JSR2 JMP2r - &!7 ;unknown-node-type ;error! JSR2 + LDAk #01 LTH ,&unknown JCN ( node type below 1? ) + LDAk #07 GTH ,&unknown JCN ( node type above 7? ) + LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ,&continue JMP ( star: next pointer is at +3 ) + &!5 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ,&continue JMP ( lit: next pointer is at +2 ) + &!3 INC2 ( every other type: pointer is at +1 ) + &continue ;set-next-addr JSR2 JMP2r ( hand target and pointer address to ;set-next-addr ) + &unknown ;unknown-node-type ;error! JSR2 @set-next-or-addr ( target* addr* -> ) LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN @@ -711,6 +735,59 @@ @arena-pos :arena-bot ( the next position to allocate ) @arena-bot $400 @arena-top ( holds up to 1024 bytes ) +( SUBGROUP OPERATIONS ) +( ) +( subgroups are parts of the input string that are matched by ) +( parenthesized subgroup expressions in a regex. ) +( ) +( for example, (a*)(b*)(c*) has 3 subgroup expressions. ) +( ) +( during matching, subgroups are represented by 4 bytes ) +( which are interpreted as two short values: ) +( ) +( - bytes 0-1: absolute address of the start of the subgroup ) +( - bytes 2-3: absolute address of the limit of the subgroup ) +( ) +( this means that to get a null-terminated subgroup string ) +( you will need to copy it somewhere else with enough space, ) +( or else mutate the input string to add a null. ) +( ) +( since input strings themselves are null-terminated, and since ) +( subgroups never include null terminators, we will always have ) +( a valid limit value even for input strings that end at #ffff. ) +( ) +( during regex parsing we will use subgroup-pos to track the ) +( next available subgroup position. 
) + +@subgroup-start ( s* i* -> ) ( store s at subgroup address i; if i is at or past subgroup-pos, first move subgroup-pos to i+4 ) + DUP2 ;subgroup-pos LDA2 LTH2 ,&write JCN ( s i ) + DUP2 #0004 ADD2 ;subgroup-pos STA2 ( s i ) + &write STA2 JMP2r + +@subgroup-finish ( s* i* -> ) ( store s at address i ) + STA2 JMP2r + +@subgroup-backtrack ( i* -> ) ( zero every 4-byte subgroup entry from subgroup-pos down to i, then set subgroup-pos to i ) + ;subgroup-pos LDA2 ( i pos ) + &loop #0004 SUB2 ( i pos-4 ) + GTH2k ,&done JCN ( stop once the entry address drops below i ) + #0000 OVR2 STA2 ( clear bytes 0-1 of the entry ) + #0000 OVR2 #0002 ADD2 STA2 ( clear bytes 2-3 of the entry ) + ,&loop JMP + &done POP2 ;subgroup-pos STA2 ( subgroup-pos = i ) + JMP2r + +@subgroup-reset ( -> ) ( point subgroup-pos at subgroup-bot and zero subgroup-bot up to subgroup-top ) + ;subgroup-bot ;subgroup-pos STA2 + ;subgroup-top ;subgroup-bot LIT2r 0000 + &loop GTH2k ,&continue JCN + POP2 POP2 POP2r JMP2r + &continue STH2kr OVR2 STA2 + INC2 INC2 ,&loop JMP + +@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup ) +@subgroup-bot $800 @subgroup-top ( holds up to 512 subgroups (2048 bytes) ) + ( INTERVAL OPERATIONS ) ( ) ( not baked yet )