prepare subgroups

This commit is contained in:
~d6 2022-04-18 23:17:44 -04:00
parent e8b8ac7445
commit 5172fc95d7
2 changed files with 114 additions and 40 deletions

132
regex.tal
View File

@ -46,8 +46,8 @@
( (NOTE: r.expr.next must be r) ) ( (NOTE: r.expr.next must be r) )
( caret matches start of line/string [ #06 next* ] ) ( caret matches start of line/string [ #06 next* ] )
( dollar matches end of line/string [ #07 next* ] ) ( dollar matches end of line/string [ #07 next* ] )
( lpar starts subgroup region [ #08 i* next* ] ) ( lpar starts subgroup region [ #08 i^ next* ] )
( rpar ends subgroup region [ #09 i* next* ] ) ( rpar ends subgroup region [ #09 i^ next* ] )
( class character class, e.g. [a-z] [ #0a next* n^ ... ] ) ( class character class, e.g. [a-z] [ #0a next* n^ ... ] )
( (NOTE: n is the number of pairs in ...) ) ( (NOTE: n is the number of pairs in ...) )
( nclass negative class, e.g. [^a-z] [ #0b next* n^ ... ] ) ( nclass negative class, e.g. [^a-z] [ #0b next* n^ ... ] )
@ -126,7 +126,7 @@
@rx-match ( str* regex* -> bool^ ) @rx-match ( str* regex* -> bool^ )
#01 ;match-multiline STA #01 ;match-multiline STA
#00 ;search-mode STA #00 ;search-mode STA
;reset-stack JSR2 ;rx-reset JSR2
;loop JMP2 ;loop JMP2
@rx-search-multiline ( str* regex* -> bool^ ) @rx-search-multiline ( str* regex* -> bool^ )
@ -137,19 +137,24 @@
@rx-search ( str* regex* -> bool^ ) @rx-search ( str* regex* -> bool^ )
#00 ;match-multiline STA #00 ;match-multiline STA
#01 ;search-mode STA #01 ;search-mode STA
&main STH2 ( s* [r*] ) &main STH2 ( s* [r*] )
DUP2 ;string-start STA2 ( s* [r*] ) DUP2 ;string-start STA2 ( s* [r*] )
&loop LDAk #00 EQU ,&eof JCN ( s* [r*] ) &loop LDAk #00 EQU ,&eof JCN ( s* [r*] )
;reset-stack JSR2 ( s* [r*] ) ;rx-reset JSR2 ( s* [r*] )
DUP2 ;search-start STA2 ( s* [r*] ) DUP2 ;search-start STA2 ( s* [r*] )
DUP2 STH2kr ;loop JSR2 ( s* b^ [r*] ) DUP2 STH2kr ;loop JSR2 ( s* b^ [r*] )
,&found JCN ( s* [r*] ) ,&found JCN ( s* [r*] )
INC2 ,&loop JMP ( s+1* [r*] ) INC2 ,&loop JMP ( s+1* [r*] )
&found POP2 POP2r #01 JMP2r ( 01 ) &found POP2 POP2r #01 JMP2r ( 01 )
&eof ;reset-stack JSR2 ( s* [r*] ) &eof ;rx-reset JSR2 ( s* [r*] )
DUP2 ;search-start STA2 ( s* [r*] ) DUP2 ;search-start STA2 ( s* [r*] )
STH2r ;loop JMP2 ( b^ ) STH2r ;loop JMP2 ( b^ )
( reset all "runtime" memory allocated during match/search )
@rx-reset ( -> )
;reset-stack JSR2
;subgroup-reset JMP2
( loop used during matching ) ( loop used during matching )
( ) ( )
( we don't use the return stack here since that ) ( we don't use the return stack here since that )
@ -177,7 +182,10 @@
@goto-backtrack ( -> bool^ ) @goto-backtrack ( -> bool^ )
;stack-exist JSR2 ,&has-stack JCN ( do we have stack? ) ;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
#00 JMP2r ( no, return false ) #00 JMP2r ( no, return false )
&has-stack ;pop4 JSR2 ;goto-next JMP2 ( yes, resume from the top ) &has-stack
;pop4 JSR2
;subgroup-backtrack JSR2
;goto-next JMP2 ( yes, resume from the top )
( follow the given address (next*) to continue matching ) ( follow the given address (next*) to continue matching )
@goto-next ( str* next* -> bool^ ) @goto-next ( str* next* -> bool^ )
@ -194,6 +202,7 @@
INC2 LDA2 ( load next ) INC2 LDA2 ( load next )
;goto-next JMP2 ( jump to next ) ;goto-next JMP2 ( jump to next )
( FIXME: not currently used )
@do-lpar ( str* regex* -> bool^ ) @do-lpar ( str* regex* -> bool^ )
STH2 DUP2 ( s s [r] ) STH2 DUP2 ( s s [r] )
INC2r LDA2kr STH2r ( s s i [r+1] ) INC2r LDA2kr STH2r ( s s i [r+1] )
@ -201,6 +210,7 @@
STH2r INC2 INC2 ( s r+3 ) STH2r INC2 INC2 ( s r+3 )
LDA2 ;goto-next JMP2 ( jump to next ) LDA2 ;goto-next JMP2 ( jump to next )
( FIXME: not currently used )
@do-rpar ( str* regex* -> bool^ ) @do-rpar ( str* regex* -> bool^ )
STH2 DUP2 ( s s [r] ) STH2 DUP2 ( s s [r] )
INC2r LDA2kr STH2r ( s s i [r+1] ) INC2r LDA2kr STH2r ( s s i [r+1] )
@ -308,6 +318,9 @@
( track how many levels deep we are in parenthesis ) ( track how many levels deep we are in parenthesis )
@parens $2 @parens $2
( how many subgroups have we seen so far? )
@groupnum $1
( read and increment pos ) ( read and increment pos )
@read ( -> c^ ) @read ( -> c^ )
;pos LDA2k ( pos s ) ;pos LDA2k ( pos s )
@ -357,7 +370,7 @@
@compile ( expr* -> regex* ) @compile ( expr* -> regex* )
;pos STA2 ;pos STA2
#0000 ;parens STA2 #0000 ;parens STA2
;reset-stack JSR2 ;rx-reset JSR2
;compile-region JMP2 ;compile-region JMP2
( the basic strategy here is to build a stack of non-or ) ( the basic strategy here is to build a stack of non-or )
@ -427,7 +440,7 @@
#0000 #0000 ;push4 JSR2 #0000 #0000 ;push4 JSR2
;compile-region-loop JMP2 ;compile-region-loop JMP2
( called when we read "(" ) ( called when we read left parenthesis )
( ) ( )
( this causes us to: ) ( this causes us to: )
( ) ( )
@ -439,7 +452,7 @@
;parens LDA2 INC2 ;parens STA2 ( parens++ ) ;parens LDA2 INC2 ;parens STA2 ( parens++ )
;compile-region JMP2 ;compile-region JMP2
( called when we read ")" ) ( called when we read right parenthesis )
( ) ( )
( this causes us to: ) ( this causes us to: )
( ) ( )
@ -572,7 +585,7 @@
@rx-node-sizes @rx-node-sizes
( 00 01 02 03 04 05 06 07 08 09 0a 0b ) ( 00 01 02 03 04 05 06 07 08 09 0a 0b )
[ 00 03 03 04 ] [ 05 05 03 03 ] [ 05 05 00 00 ] [ 00 03 03 04 ] [ 05 05 03 03 ] [ 04 04 00 00 ]
@alloc3 ( mode^ -> r* ) @alloc3 ( mode^ -> r* )
#0000 ROT ( 00 00 mode^ ) #0000 ROT ( 00 00 mode^ )
@ -769,7 +782,7 @@
( stack-bot is the address of the first stack position. ) ( stack-bot is the address of the first stack position. )
( stack-top is the address of the first byte beyond the stack. ) ( stack-top is the address of the first byte beyond the stack. )
@stack-bot $0800 @stack-top ( holds 512 steps (2048 bytes) ) @stack-bot $800 @stack-top ( holds 512 steps (2048 bytes) )
( ARENA OPERATIONS ) ( ARENA OPERATIONS )
( ) ( )
@ -812,11 +825,11 @@
( ) ( )
( for example, (a*)(b*)(c*) has 3 subgroup expressions. ) ( for example, (a*)(b*)(c*) has 3 subgroup expressions. )
( ) ( )
( during matching, subgroups are represented by 4-bytes ) ( during matching, subgroups are represented by 5-bytes: )
( which are interpreted as two short values: )
( ) ( )
( - bytes 0-1: absolute address of the start of the subgroup ) ( - byte 1: subgroup index (1-255, 0 is a marker) )
( - bytes 2-3: absolute address of the limit of the subgroup ) ( - bytes 2-3: absolute address of the start of the subgroup )
( - bytes 4-5: absolute address of the limit of the subgroup )
( ) ( )
( this means that to get a null-terminated subgroup string ) ( this means that to get a null-terminated subgroup string )
( you will need to copy it somewhere else with enough space, ) ( you will need to copy it somewhere else with enough space, )
@ -828,32 +841,73 @@
( ) ( )
( during regex parsing we will use subgroup-pos to track the ) ( during regex parsing we will use subgroup-pos to track the )
( next available subgroup position. ) ( next available subgroup position. )
( )
( some regular expressions will write to a subgroup multiple times. )
( for example when matching ((.)x)+ against "axbx": )
( )
( - subgroup 1 will contain "bx" )
( - subgroup 2 will contain "b" )
( )
( this may necessitate backtracking. when matching ((.)x|(.)y)+ )
( against "axby" we will make the following assignments: )
( )
( - position 1: )
( + start subgroup 1 )
( + start then finish subgroup 2: "a" )
( - position 2: )
( + finish subgroup 1: "ax" )
( - position 3: )
( + start subgroup 1 )
( + start then finish subgroup 2: "b" )
( - position 4: )
( + backtrack, reverting subgroup 2 to "a" )
( - back to position 3 again: )
( + start then finish subgroup 3: "b" )
( - position 4 again: )
( + finish subgruop 1: "by" )
( )
( the final subgroups will be: {1: "by", 2: "a", 3: "b"} )
@subgroup-start ( s* i* -> ) @subgroup-start ( s* i^ -> )
DUP2 ;subgroup-pos LDA2 LTH2 ,&write JCN ( s i ) ;subgroup-pos LDA2 STH2k ( s* i^ pos* [pos*] )
DUP2 #0004 ADD2 ;subgroup-pos STA2 ( s i ) ;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups )
&write STA2 JMP2r &next ( s* i^ [pos*] )
STH2kr STA
STH2r INC2 STA2
JMP2r
@subgroup-finish ( s* i* -> ) @subgroup-finish ( s* i^ -> )
STA2 JMP2r ;subgroup-pos LDA2 STH2k ( s* i^ pos* [pos*] )
;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups )
&next ( s* i^ [pos*] )
STH2kr LDA EQU ,&ok JCN #0000 DIV ( mismatched subgroups )
&ok ( s* [pos*] )
STH2kr #0003 ADD2 STA2
STH2r #0005 ADD2 ;subgroup-pos STA2
JMP2r
@subgroup-backtrack ( i* -> ) @subgroup-branch ( -> )
;subgroup-pos LDA2 ;subgroup-pos LDA2 STH2k ( pos* [pos*] )
&loop #0004 SUB2 ;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups )
LTH2k ,&done JCN &next
#0000 OVR2 STA2 #00 STH2kr STA ( [*pos] )
#0000 OVR2 #0002 ADD2 STA2 STH2r #0005 ADD2 ;subgroup-pos STA2
,&loop JMP JMP2r
&done POP2 ;subgroup-pos STA2
JMP2r
@subgroup-backtrack ( -> )
;subgroup-bot ;subgroup-pos LDA2 ( bot* pos* )
&loop ( bot* pos* )
EQU2k ,&done JCN
LDAk #00 EQU ,&done JCN
#0005 SUB2 ,&loop JMP
&done ( bot* pos* )
NIP2 ;subgroup-pos STA2
JMP2r
( does not zero out the memory in question )
@subgroup-reset ( -> ) @subgroup-reset ( -> )
;subgroup-bot ;subgroup-pos STA2 ;subgroup-bot ;subgroup-pos STA2
;subgroup-top ;subgroup-bot LIT2r 0000 JMP2r
&loop GTH2k ,&continue JCN
POP2 POP2 POP2r JMP2r
&continue STH2kr OVR2 STA2
INC2 INC2 ,&loop JMP
@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup ) @subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup item )
@subgroup-bot $400 @subgroup-top ( holds up to 256 subgroups (1024 bytes) ) @subgroup-bot $280 @subgroup-top ( holds up to 128 subgroup assignments (640 bytes) )

View File

@ -53,7 +53,6 @@
;ptr LDA2 ;buffer EQU2 STH ( stash is-empty? ) ;ptr LDA2 ;buffer EQU2 STH ( stash is-empty? )
;buffer ;ptr STA2 ( reset ptr ) ;buffer ;ptr STA2 ( reset ptr )
;buffer ;regex LDA2 ;rx-match JSR2 ( match regex ) ;buffer ;regex LDA2 ;rx-match JSR2 ( match regex )
( ;buffer ;regex LDA2 ;rx-search JSR2 ( search regex ) )
;emit-byte JSR2 nl ( print result ) ;emit-byte JSR2 nl ( print result )
STHr ,&was-empty JCN STHr ,&was-empty JCN
;s-prompt ;println JSR2 ;s-prompt ;println JSR2
@ -111,3 +110,24 @@
;emit-n JSR2 ,&loop JMP ;emit-n JSR2 ,&loop JMP
&uu &uu
LDAk #cc ;unknown-node-type ;error!! JSR2 LDAk #cc ;unknown-node-type ;error!! JSR2
@emit-subgroups ( -> )
;groups-top ;groups-bot
&init ( top* i* )
GTH2k #00 EQU ,&start JCN ( top* i* )
#0000 OVR2 STA2 INC2 INC2 ,&init JMP ( top* i+2* )
&start
POP2 POP2 ( )
;subgroup-pos LDA2 ;subgroup-bot ( limit* pos* )
&loop
GTH2k ,&ok JCN JMP2r ( limit* pos* )
&ok
LDAk #13 GTH ,&next JCN ( limit* pos* )
LDAk #00 SWP ;groups-bot ADD2 STH2 ( limit* pos* [dest*] )
INC2k LDA2k STH2kr STA2 ( limit* pos* pos+1* [dest*] )
INC2 INC2 INC2r INC2r ( limit* pos* pos+3* [dest+2*] )
LDA2 STH2r STA2 ( limit* pos* )
&next
#0005 ADD2 ,&loop JMP ( limit* pos+5* )
@groups-bot $50 @groups-top