prepare subgroups
This commit is contained in:
parent
e8b8ac7445
commit
5172fc95d7
132
regex.tal
132
regex.tal
|
@ -46,8 +46,8 @@
|
||||||
( (NOTE: r.expr.next must be r) )
|
( (NOTE: r.expr.next must be r) )
|
||||||
( caret matches start of line/string [ #06 next* ] )
|
( caret matches start of line/string [ #06 next* ] )
|
||||||
( dollar matches end of line/string [ #07 next* ] )
|
( dollar matches end of line/string [ #07 next* ] )
|
||||||
( lpar starts subgroup region [ #08 i* next* ] )
|
( lpar starts subgroup region [ #08 i^ next* ] )
|
||||||
( rpar ends subgroup region [ #09 i* next* ] )
|
( rpar ends subgroup region [ #09 i^ next* ] )
|
||||||
( class character class, e.g. [a-z] [ #0a next* n^ ... ] )
|
( class character class, e.g. [a-z] [ #0a next* n^ ... ] )
|
||||||
( (NOTE: n is the number of pairs in ...) )
|
( (NOTE: n is the number of pairs in ...) )
|
||||||
( nclass negative class, e.g. [^a-z] [ #0b next* n^ ... ] )
|
( nclass negative class, e.g. [^a-z] [ #0b next* n^ ... ] )
|
||||||
|
@ -126,7 +126,7 @@
|
||||||
@rx-match ( str* regex* -> bool^ )
|
@rx-match ( str* regex* -> bool^ )
|
||||||
#01 ;match-multiline STA
|
#01 ;match-multiline STA
|
||||||
#00 ;search-mode STA
|
#00 ;search-mode STA
|
||||||
;reset-stack JSR2
|
;rx-reset JSR2
|
||||||
;loop JMP2
|
;loop JMP2
|
||||||
|
|
||||||
@rx-search-multiline ( str* regex* -> bool^ )
|
@rx-search-multiline ( str* regex* -> bool^ )
|
||||||
|
@ -137,19 +137,24 @@
|
||||||
@rx-search ( str* regex* -> bool^ )
|
@rx-search ( str* regex* -> bool^ )
|
||||||
#00 ;match-multiline STA
|
#00 ;match-multiline STA
|
||||||
#01 ;search-mode STA
|
#01 ;search-mode STA
|
||||||
&main STH2 ( s* [r*] )
|
&main STH2 ( s* [r*] )
|
||||||
DUP2 ;string-start STA2 ( s* [r*] )
|
DUP2 ;string-start STA2 ( s* [r*] )
|
||||||
&loop LDAk #00 EQU ,&eof JCN ( s* [r*] )
|
&loop LDAk #00 EQU ,&eof JCN ( s* [r*] )
|
||||||
;reset-stack JSR2 ( s* [r*] )
|
;rx-reset JSR2 ( s* [r*] )
|
||||||
DUP2 ;search-start STA2 ( s* [r*] )
|
DUP2 ;search-start STA2 ( s* [r*] )
|
||||||
DUP2 STH2kr ;loop JSR2 ( s* b^ [r*] )
|
DUP2 STH2kr ;loop JSR2 ( s* b^ [r*] )
|
||||||
,&found JCN ( s* [r*] )
|
,&found JCN ( s* [r*] )
|
||||||
INC2 ,&loop JMP ( s+1* [r*] )
|
INC2 ,&loop JMP ( s+1* [r*] )
|
||||||
&found POP2 POP2r #01 JMP2r ( 01 )
|
&found POP2 POP2r #01 JMP2r ( 01 )
|
||||||
&eof ;reset-stack JSR2 ( s* [r*] )
|
&eof ;rx-reset JSR2 ( s* [r*] )
|
||||||
DUP2 ;search-start STA2 ( s* [r*] )
|
DUP2 ;search-start STA2 ( s* [r*] )
|
||||||
STH2r ;loop JMP2 ( b^ )
|
STH2r ;loop JMP2 ( b^ )
|
||||||
|
|
||||||
|
( reset all "runtime" memory allocated during match/search )
|
||||||
|
@rx-reset ( -> )
|
||||||
|
;reset-stack JSR2
|
||||||
|
;subgroup-reset JMP2
|
||||||
|
|
||||||
( loop used during matching )
|
( loop used during matching )
|
||||||
( )
|
( )
|
||||||
( we don't use the return stack here since that )
|
( we don't use the return stack here since that )
|
||||||
|
@ -177,7 +182,10 @@
|
||||||
@goto-backtrack ( -> bool^ )
|
@goto-backtrack ( -> bool^ )
|
||||||
;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
|
;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
|
||||||
#00 JMP2r ( no, return false )
|
#00 JMP2r ( no, return false )
|
||||||
&has-stack ;pop4 JSR2 ;goto-next JMP2 ( yes, resume from the top )
|
&has-stack
|
||||||
|
;pop4 JSR2
|
||||||
|
;subgroup-backtrack JSR2
|
||||||
|
;goto-next JMP2 ( yes, resume from the top )
|
||||||
|
|
||||||
( follow the given address (next*) to continue matching )
|
( follow the given address (next*) to continue matching )
|
||||||
@goto-next ( str* next* -> bool^ )
|
@goto-next ( str* next* -> bool^ )
|
||||||
|
@ -194,6 +202,7 @@
|
||||||
INC2 LDA2 ( load next )
|
INC2 LDA2 ( load next )
|
||||||
;goto-next JMP2 ( jump to next )
|
;goto-next JMP2 ( jump to next )
|
||||||
|
|
||||||
|
( FIXME: not currently used )
|
||||||
@do-lpar ( str* regex* -> bool^ )
|
@do-lpar ( str* regex* -> bool^ )
|
||||||
STH2 DUP2 ( s s [r] )
|
STH2 DUP2 ( s s [r] )
|
||||||
INC2r LDA2kr STH2r ( s s i [r+1] )
|
INC2r LDA2kr STH2r ( s s i [r+1] )
|
||||||
|
@ -201,6 +210,7 @@
|
||||||
STH2r INC2 INC2 ( s r+3 )
|
STH2r INC2 INC2 ( s r+3 )
|
||||||
LDA2 ;goto-next JMP2 ( jump to next )
|
LDA2 ;goto-next JMP2 ( jump to next )
|
||||||
|
|
||||||
|
( FIXME: not currently used )
|
||||||
@do-rpar ( str* regex* -> bool^ )
|
@do-rpar ( str* regex* -> bool^ )
|
||||||
STH2 DUP2 ( s s [r] )
|
STH2 DUP2 ( s s [r] )
|
||||||
INC2r LDA2kr STH2r ( s s i [r+1] )
|
INC2r LDA2kr STH2r ( s s i [r+1] )
|
||||||
|
@ -308,6 +318,9 @@
|
||||||
( track how many levels deep we are in parenthesis )
|
( track how many levels deep we are in parenthesis )
|
||||||
@parens $2
|
@parens $2
|
||||||
|
|
||||||
|
( how many subgroups have we seen so far? )
|
||||||
|
@groupnum $1
|
||||||
|
|
||||||
( read and increment pos )
|
( read and increment pos )
|
||||||
@read ( -> c^ )
|
@read ( -> c^ )
|
||||||
;pos LDA2k ( pos s )
|
;pos LDA2k ( pos s )
|
||||||
|
@ -357,7 +370,7 @@
|
||||||
@compile ( expr* -> regex* )
|
@compile ( expr* -> regex* )
|
||||||
;pos STA2
|
;pos STA2
|
||||||
#0000 ;parens STA2
|
#0000 ;parens STA2
|
||||||
;reset-stack JSR2
|
;rx-reset JSR2
|
||||||
;compile-region JMP2
|
;compile-region JMP2
|
||||||
|
|
||||||
( the basic strategy here is to build a stack of non-or )
|
( the basic strategy here is to build a stack of non-or )
|
||||||
|
@ -427,7 +440,7 @@
|
||||||
#0000 #0000 ;push4 JSR2
|
#0000 #0000 ;push4 JSR2
|
||||||
;compile-region-loop JMP2
|
;compile-region-loop JMP2
|
||||||
|
|
||||||
( called when we read "(" )
|
( called when we read left parenthesis )
|
||||||
( )
|
( )
|
||||||
( this causes us to: )
|
( this causes us to: )
|
||||||
( )
|
( )
|
||||||
|
@ -439,7 +452,7 @@
|
||||||
;parens LDA2 INC2 ;parens STA2 ( parens++ )
|
;parens LDA2 INC2 ;parens STA2 ( parens++ )
|
||||||
;compile-region JMP2
|
;compile-region JMP2
|
||||||
|
|
||||||
( called when we read ")" )
|
( called when we read right parenthesis )
|
||||||
( )
|
( )
|
||||||
( this causes us to: )
|
( this causes us to: )
|
||||||
( )
|
( )
|
||||||
|
@ -572,7 +585,7 @@
|
||||||
|
|
||||||
@rx-node-sizes
|
@rx-node-sizes
|
||||||
( 00 01 02 03 04 05 06 07 08 09 0a 0b )
|
( 00 01 02 03 04 05 06 07 08 09 0a 0b )
|
||||||
[ 00 03 03 04 ] [ 05 05 03 03 ] [ 05 05 00 00 ]
|
[ 00 03 03 04 ] [ 05 05 03 03 ] [ 04 04 00 00 ]
|
||||||
|
|
||||||
@alloc3 ( mode^ -> r* )
|
@alloc3 ( mode^ -> r* )
|
||||||
#0000 ROT ( 00 00 mode^ )
|
#0000 ROT ( 00 00 mode^ )
|
||||||
|
@ -769,7 +782,7 @@
|
||||||
|
|
||||||
( stack-bot is the address of the first stack position. )
|
( stack-bot is the address of the first stack position. )
|
||||||
( stack-top is the address of the first byte beyond the stack. )
|
( stack-top is the address of the first byte beyond the stack. )
|
||||||
@stack-bot $0800 @stack-top ( holds 512 steps (2048 bytes) )
|
@stack-bot $800 @stack-top ( holds 512 steps (2048 bytes) )
|
||||||
|
|
||||||
( ARENA OPERATIONS )
|
( ARENA OPERATIONS )
|
||||||
( )
|
( )
|
||||||
|
@ -812,11 +825,11 @@
|
||||||
( )
|
( )
|
||||||
( for example, (a*)(b*)(c*) has 3 subgroup expressions. )
|
( for example, (a*)(b*)(c*) has 3 subgroup expressions. )
|
||||||
( )
|
( )
|
||||||
( during matching, subgroups are represented by 4-bytes )
|
( during matching, subgroups are represented by 5-bytes: )
|
||||||
( which are interpreted as two short values: )
|
|
||||||
( )
|
( )
|
||||||
( - bytes 0-1: absolute address of the start of the subgroup )
|
( - byte 1: subgroup index (1-255, 0 is a marker) )
|
||||||
( - bytes 2-3: absolute address of the limit of the subgroup )
|
( - bytes 2-3: absolute address of the start of the subgroup )
|
||||||
|
( - bytes 4-5: absolute address of the limit of the subgroup )
|
||||||
( )
|
( )
|
||||||
( this means that to get a null-terminated subgroup string )
|
( this means that to get a null-terminated subgroup string )
|
||||||
( you will need to copy it somewhere else with enough space, )
|
( you will need to copy it somewhere else with enough space, )
|
||||||
|
@ -828,32 +841,73 @@
|
||||||
( )
|
( )
|
||||||
( during regex parsing we will use subgroup-pos to track the )
|
( during regex parsing we will use subgroup-pos to track the )
|
||||||
( next available subgroup position. )
|
( next available subgroup position. )
|
||||||
|
( )
|
||||||
|
( some regular expressions will write to a subgroup multiple times. )
|
||||||
|
( for example when matching ((.)x)+ against "axbx": )
|
||||||
|
( )
|
||||||
|
( - subgroup 1 will contain "bx" )
|
||||||
|
( - subgroup 2 will contain "b" )
|
||||||
|
( )
|
||||||
|
( this may necessitate backtracking. when matching ((.)x|(.)y)+ )
|
||||||
|
( against "axby" we will make the following assignments: )
|
||||||
|
( )
|
||||||
|
( - position 1: )
|
||||||
|
( + start subgroup 1 )
|
||||||
|
( + start then finish subgroup 2: "a" )
|
||||||
|
( - position 2: )
|
||||||
|
( + finish subgroup 1: "ax" )
|
||||||
|
( - position 3: )
|
||||||
|
( + start subgroup 1 )
|
||||||
|
( + start then finish subgroup 2: "b" )
|
||||||
|
( - position 4: )
|
||||||
|
( + backtrack, reverting subgroup 2 to "a" )
|
||||||
|
( - back to position 3 again: )
|
||||||
|
( + start then finish subgroup 3: "b" )
|
||||||
|
( - position 4 again: )
|
||||||
|
( + finish subgruop 1: "by" )
|
||||||
|
( )
|
||||||
|
( the final subgroups will be: {1: "by", 2: "a", 3: "b"} )
|
||||||
|
|
||||||
@subgroup-start ( s* i* -> )
|
@subgroup-start ( s* i^ -> )
|
||||||
DUP2 ;subgroup-pos LDA2 LTH2 ,&write JCN ( s i )
|
;subgroup-pos LDA2 STH2k ( s* i^ pos* [pos*] )
|
||||||
DUP2 #0004 ADD2 ;subgroup-pos STA2 ( s i )
|
;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups )
|
||||||
&write STA2 JMP2r
|
&next ( s* i^ [pos*] )
|
||||||
|
STH2kr STA
|
||||||
|
STH2r INC2 STA2
|
||||||
|
JMP2r
|
||||||
|
|
||||||
@subgroup-finish ( s* i* -> )
|
@subgroup-finish ( s* i^ -> )
|
||||||
STA2 JMP2r
|
;subgroup-pos LDA2 STH2k ( s* i^ pos* [pos*] )
|
||||||
|
;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups )
|
||||||
|
&next ( s* i^ [pos*] )
|
||||||
|
STH2kr LDA EQU ,&ok JCN #0000 DIV ( mismatched subgroups )
|
||||||
|
&ok ( s* [pos*] )
|
||||||
|
STH2kr #0003 ADD2 STA2
|
||||||
|
STH2r #0005 ADD2 ;subgroup-pos STA2
|
||||||
|
JMP2r
|
||||||
|
|
||||||
@subgroup-backtrack ( i* -> )
|
@subgroup-branch ( -> )
|
||||||
;subgroup-pos LDA2
|
;subgroup-pos LDA2 STH2k ( pos* [pos*] )
|
||||||
&loop #0004 SUB2
|
;subgroup-top LTH2 ,&next JCN #0000 DIV ( too many subgroups )
|
||||||
LTH2k ,&done JCN
|
&next
|
||||||
#0000 OVR2 STA2
|
#00 STH2kr STA ( [*pos] )
|
||||||
#0000 OVR2 #0002 ADD2 STA2
|
STH2r #0005 ADD2 ;subgroup-pos STA2
|
||||||
,&loop JMP
|
JMP2r
|
||||||
&done POP2 ;subgroup-pos STA2
|
|
||||||
JMP2r
|
|
||||||
|
|
||||||
|
@subgroup-backtrack ( -> )
|
||||||
|
;subgroup-bot ;subgroup-pos LDA2 ( bot* pos* )
|
||||||
|
&loop ( bot* pos* )
|
||||||
|
EQU2k ,&done JCN
|
||||||
|
LDAk #00 EQU ,&done JCN
|
||||||
|
#0005 SUB2 ,&loop JMP
|
||||||
|
&done ( bot* pos* )
|
||||||
|
NIP2 ;subgroup-pos STA2
|
||||||
|
JMP2r
|
||||||
|
|
||||||
|
( does not zero out the memory in question )
|
||||||
@subgroup-reset ( -> )
|
@subgroup-reset ( -> )
|
||||||
;subgroup-bot ;subgroup-pos STA2
|
;subgroup-bot ;subgroup-pos STA2
|
||||||
;subgroup-top ;subgroup-bot LIT2r 0000
|
JMP2r
|
||||||
&loop GTH2k ,&continue JCN
|
|
||||||
POP2 POP2 POP2r JMP2r
|
|
||||||
&continue STH2kr OVR2 STA2
|
|
||||||
INC2 INC2 ,&loop JMP
|
|
||||||
|
|
||||||
@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup )
|
@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup item )
|
||||||
@subgroup-bot $400 @subgroup-top ( holds up to 256 subgroups (1024 bytes) )
|
@subgroup-bot $280 @subgroup-top ( holds up to 128 subgroup assignments (640 bytes) )
|
||||||
|
|
|
@ -53,7 +53,6 @@
|
||||||
;ptr LDA2 ;buffer EQU2 STH ( stash is-empty? )
|
;ptr LDA2 ;buffer EQU2 STH ( stash is-empty? )
|
||||||
;buffer ;ptr STA2 ( reset ptr )
|
;buffer ;ptr STA2 ( reset ptr )
|
||||||
;buffer ;regex LDA2 ;rx-match JSR2 ( match regex )
|
;buffer ;regex LDA2 ;rx-match JSR2 ( match regex )
|
||||||
( ;buffer ;regex LDA2 ;rx-search JSR2 ( search regex ) )
|
|
||||||
;emit-byte JSR2 nl ( print result )
|
;emit-byte JSR2 nl ( print result )
|
||||||
STHr ,&was-empty JCN
|
STHr ,&was-empty JCN
|
||||||
;s-prompt ;println JSR2
|
;s-prompt ;println JSR2
|
||||||
|
@ -111,3 +110,24 @@
|
||||||
;emit-n JSR2 ,&loop JMP
|
;emit-n JSR2 ,&loop JMP
|
||||||
&uu
|
&uu
|
||||||
LDAk #cc ;unknown-node-type ;error!! JSR2
|
LDAk #cc ;unknown-node-type ;error!! JSR2
|
||||||
|
|
||||||
|
@emit-subgroups ( -> )
|
||||||
|
;groups-top ;groups-bot
|
||||||
|
&init ( top* i* )
|
||||||
|
GTH2k #00 EQU ,&start JCN ( top* i* )
|
||||||
|
#0000 OVR2 STA2 INC2 INC2 ,&init JMP ( top* i+2* )
|
||||||
|
&start
|
||||||
|
POP2 POP2 ( )
|
||||||
|
;subgroup-pos LDA2 ;subgroup-bot ( limit* pos* )
|
||||||
|
&loop
|
||||||
|
GTH2k ,&ok JCN JMP2r ( limit* pos* )
|
||||||
|
&ok
|
||||||
|
LDAk #13 GTH ,&next JCN ( limit* pos* )
|
||||||
|
LDAk #00 SWP ;groups-bot ADD2 STH2 ( limit* pos* [dest*] )
|
||||||
|
INC2k LDA2k STH2kr STA2 ( limit* pos* pos+1* [dest*] )
|
||||||
|
INC2 INC2 INC2r INC2r ( limit* pos* pos+3* [dest+2*] )
|
||||||
|
LDA2 STH2r STA2 ( limit* pos* )
|
||||||
|
&next
|
||||||
|
#0005 ADD2 ,&loop JMP ( limit* pos+5* )
|
||||||
|
|
||||||
|
@groups-bot $50 @groups-top
|
||||||
|
|
Loading…
Reference in New Issue