From 5172fc95d799f9b3e6fcb1093b7094ee565fdc77 Mon Sep 17 00:00:00 2001 From: d6 Date: Mon, 18 Apr 2022 23:17:44 -0400 Subject: [PATCH] prepare subgroups --- regex.tal | 132 ++++++++++++++++++++++++++++++++++--------------- repl-regex.tal | 22 ++++++++- 2 files changed, 114 insertions(+), 40 deletions(-) diff --git a/regex.tal b/regex.tal index 2f6248e..c1b03b5 100644 --- a/regex.tal +++ b/regex.tal @@ -46,8 +46,8 @@ ( (NOTE: r.expr.next must be r) ) ( caret matches start of line/string [ #06 next* ] ) ( dollar matches end of line/string [ #07 next* ] ) -( lpar starts subgroup region [ #08 i* next* ] ) -( rpar ends subgroup region [ #09 i* next* ] ) +( lpar starts subgroup region [ #08 i^ next* ] ) +( rpar ends subgroup region [ #09 i^ next* ] ) ( class character class, e.g. [a-z] [ #0a next* n^ ... ] ) ( (NOTE: n is the number of pairs in ...) ) ( nclass negative class, e.g. [^a-z] [ #0b next* n^ ... ] ) @@ -126,7 +126,7 @@ @rx-match ( str* regex* -> bool^ ) #01 ;match-multiline STA #00 ;search-mode STA - ;reset-stack JSR2 + ;rx-reset JSR2 ;loop JMP2 @rx-search-multiline ( str* regex* -> bool^ ) @@ -137,19 +137,24 @@ @rx-search ( str* regex* -> bool^ ) #00 ;match-multiline STA #01 ;search-mode STA - &main STH2 ( s* [r*] ) - DUP2 ;string-start STA2 ( s* [r*] ) + &main STH2 ( s* [r*] ) + DUP2 ;string-start STA2 ( s* [r*] ) &loop LDAk #00 EQU ,&eof JCN ( s* [r*] ) - ;reset-stack JSR2 ( s* [r*] ) + ;rx-reset JSR2 ( s* [r*] ) DUP2 ;search-start STA2 ( s* [r*] ) DUP2 STH2kr ;loop JSR2 ( s* b^ [r*] ) ,&found JCN ( s* [r*] ) INC2 ,&loop JMP ( s+1* [r*] ) &found POP2 POP2r #01 JMP2r ( 01 ) - &eof ;reset-stack JSR2 ( s* [r*] ) + &eof ;rx-reset JSR2 ( s* [r*] ) DUP2 ;search-start STA2 ( s* [r*] ) STH2r ;loop JMP2 ( b^ ) +( reset all "runtime" memory allocated during match/search ) +@rx-reset ( -> ) + ;reset-stack JSR2 + ;subgroup-reset JMP2 + ( loop used during matching ) ( ) ( we don't use the return stack here since that ) @@ -177,7 +182,10 @@ @goto-backtrack ( -> bool^ ) ;stack-exist JSR2 ,&has-stack JCN ( do we have stack? ) #00 JMP2r ( no, return false ) - &has-stack ;pop4 JSR2 ;goto-next JMP2 ( yes, resume from the top ) + &has-stack + ;pop4 JSR2 + ;subgroup-backtrack JSR2 + ;goto-next JMP2 ( yes, resume from the top ) ( follow the given address (next*) to continue matching ) @goto-next ( str* next* -> bool^ ) @@ -194,6 +202,7 @@ INC2 LDA2 ( load next ) ;goto-next JMP2 ( jump to next ) +( FIXME: not currently used ) @do-lpar ( str* regex* -> bool^ ) STH2 DUP2 ( s s [r] ) INC2r LDA2kr STH2r ( s s i [r+1] ) @@ -201,6 +210,7 @@ STH2r INC2 INC2 ( s r+3 ) LDA2 ;goto-next JMP2 ( jump to next ) +( FIXME: not currently used ) @do-rpar ( str* regex* -> bool^ ) STH2 DUP2 ( s s [r] ) INC2r LDA2kr STH2r ( s s i [r+1] ) @@ -308,6 +318,9 @@ ( track how many levels deep we are in parenthesis ) @parens $2 +( how many subgroups have we seen so far? ) +@groupnum $1 + ( read and increment pos ) @read ( -> c^ ) ;pos LDA2k ( pos s ) @@ -357,7 +370,7 @@ @compile ( expr* -> regex* ) ;pos STA2 #0000 ;parens STA2 - ;reset-stack JSR2 + ;rx-reset JSR2 ;compile-region JMP2 ( the basic strategy here is to build a stack of non-or ) @@ -427,7 +440,7 @@ #0000 #0000 ;push4 JSR2 ;compile-region-loop JMP2 -( called when we read "(" ) +( called when we read left parenthesis ) ( ) ( this causes us to: ) ( ) @@ -439,7 +452,7 @@ ;parens LDA2 INC2 ;parens STA2 ( parens++ ) ;compile-region JMP2 -( called when we read ")" ) +( called when we read right parenthesis ) ( ) ( this causes us to: ) ( ) @@ -572,7 +585,7 @@ @rx-node-sizes ( 00 01 02 03 04 05 06 07 08 09 0a 0b ) - [ 00 03 03 04 ] [ 05 05 03 03 ] [ 05 05 00 00 ] + [ 00 03 03 04 ] [ 05 05 03 03 ] [ 04 04 00 00 ] @alloc3 ( mode^ -> r* ) #0000 ROT ( 00 00 mode^ ) @@ -769,7 +782,7 @@ ( stack-bot is the address of the first stack position. ) ( stack-top is the address of the first byte beyond the stack. ) -@stack-bot $0800 @stack-top ( holds 512 steps (2048 bytes) ) +@stack-bot $800 @stack-top ( holds 512 steps (2048 bytes) ) ( ARENA OPERATIONS ) ( ) @@ -812,11 +825,11 @@ ( ) ( for example, (a*)(b*)(c*) has 3 subgroup expressions. ) ( ) -( during matching, subgroups are represented by 4-bytes ) -( which are interpreted as two short values: ) +( during matching, subgroups are represented by 5-bytes: ) ( ) -( - bytes 0-1: absolute address of the start of the subgroup ) -( - bytes 2-3: absolute address of the limit of the subgroup ) +( - byte 1: subgroup index (1-255, 0 is a marker) ) +( - bytes 2-3: absolute address of the start of the subgroup ) +( - bytes 4-5: absolute address of the limit of the subgroup ) ( ) ( this means that to get a null-terminated subgroup string ) ( you will need to copy it somewhere else with enough space, ) @@ -828,32 +841,73 @@ ( ) ( during regex parsing we will use subgroup-pos to track the ) ( next available subgroup position. ) +( ) +( some regular expressions will write to a subgroup multiple times. ) +( for example when matching ((.)x)+ against "axbx": ) +( ) +( - subgroup 1 will contain "bx" ) +( - subgroup 2 will contain "b" ) +( ) +( this may necessitate backtracking. when matching ((.)x|(.)y)+ ) +( against "axby" we will make the following assignments: ) +( ) +( - position 1: ) +( + start subgroup 1 ) +( + start then finish subgroup 2: "a" ) +( - position 2: ) +( + finish subgroup 1: "ax" ) +( - position 3: ) +( + start subgroup 1 ) +( + start then finish subgroup 2: "b" ) +( - position 4: ) +( + backtrack, reverting subgroup 2 to "a" ) +( - back to position 3 again: ) +( + start then finish subgroup 3: "b" ) +( - position 4 again: ) +( + finish subgruop 1: "by" ) +( ) +( the final subgroups will be: {1: "by", 2: "a", 3: "b"} ) -@subgroup-start ( s* i* -> ) - DUP2 ;subgroup-pos LDA2 LTH2 ,&write JCN ( s i ) - DUP2 #0004 ADD2 ;subgroup-pos STA2 ( s i ) - &write STA2 JMP2r +@subgroup-start ( s* i^ -> ) + ;subgroup-pos LDA2 STH2k ( s* i^ pos* [pos*] ) + ;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups ) + &next ( s* i^ [pos*] ) + STH2kr STA + STH2r INC2 STA2 + JMP2r -@subgroup-finish ( s* i* -> ) - STA2 JMP2r +@subgroup-finish ( s* i^ -> ) + ;subgroup-pos LDA2 STH2k ( s* i^ pos* [pos*] ) + ;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups ) + &next ( s* i^ [pos*] ) + STH2kr LDA EQU ,&ok JCN #0000 DIV ( mismatched subgroups ) + &ok ( s* [pos*] ) + STH2kr #0003 ADD2 STA2 + STH2r #0005 ADD2 ;subgroup-pos STA2 + JMP2r -@subgroup-backtrack ( i* -> ) - ;subgroup-pos LDA2 - &loop #0004 SUB2 - LTH2k ,&done JCN - #0000 OVR2 STA2 - #0000 OVR2 #0002 ADD2 STA2 - ,&loop JMP - &done POP2 ;subgroup-pos STA2 - JMP2r +@subgroup-branch ( -> ) + ;subgroup-pos LDA2 STH2k ( pos* [pos*] ) + ;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups ) + &next + #00 STH2kr STA ( [*pos] ) + STH2r #0005 ADD2 ;subgroup-pos STA2 + JMP2r +@subgroup-backtrack ( -> ) + ;subgroup-bot ;subgroup-pos LDA2 ( bot* pos* ) + &loop ( bot* pos* ) + EQU2k ,&done JCN + LDAk #00 EQU ,&done JCN + #0005 SUB2 ,&loop JMP + &done ( bot* pos* ) + NIP2 ;subgroup-pos STA2 + JMP2r + +( does not zero out the memory in question ) @subgroup-reset ( -> ) ;subgroup-bot ;subgroup-pos STA2 - ;subgroup-top ;subgroup-bot LIT2r 0000 - &loop GTH2k ,&continue JCN - POP2 POP2 POP2r JMP2r - &continue STH2kr OVR2 STA2 - INC2 INC2 ,&loop JMP + JMP2r -@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup ) -@subgroup-bot $400 @subgroup-top ( holds up to 256 subgroups (1024 bytes) ) +@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup item ) +@subgroup-bot $280 @subgroup-top ( holds up to 128 subgroup assignments (640 bytes) ) diff --git a/repl-regex.tal b/repl-regex.tal index e436086..eefb144 100644 --- a/repl-regex.tal +++ b/repl-regex.tal @@ -53,7 +53,6 @@ ;ptr LDA2 ;buffer EQU2 STH ( stash is-empty? ) ;buffer ;ptr STA2 ( reset ptr ) ;buffer ;regex LDA2 ;rx-match JSR2 ( match regex ) -( ;buffer ;regex LDA2 ;rx-search JSR2 ( search regex ) ) ;emit-byte JSR2 nl ( print result ) STHr ,&was-empty JCN ;s-prompt ;println JSR2 @@ -111,3 +110,24 @@ ;emit-n JSR2 ,&loop JMP &uu LDAk #cc ;unknown-node-type ;error!! JSR2 + +@emit-subgroups ( -> ) + ;groups-top ;groups-bot + &init ( top* i* ) + GTH2k #00 EQU ,&start JCN ( top* i* ) + #0000 OVR2 STA2 INC2 INC2 ,&init JMP ( top* i+2* ) + &start + POP2 POP2 ( ) + ;subgroup-pos LDA2 ;subgroup-bot ( limit* pos* ) + &loop + GTH2k ,&ok JCN JMP2r ( limit* pos* ) + &ok + LDAk #13 GTH ,&next JCN ( limit* pos* ) + LDAk #00 SWP ;groups-bot ADD2 STH2 ( limit* pos* [dest*] ) + INC2k LDA2k STH2kr STA2 ( limit* pos* pos+1* [dest*] ) + INC2 INC2 INC2r INC2r ( limit* pos* pos+3* [dest+2*] ) + LDA2 STH2r STA2 ( limit* pos* ) + &next + #0005 ADD2 ,&loop JMP ( limit* pos+5* ) + +@groups-bot $50 @groups-top