start implementing subgroups

This commit is contained in:
~d6 2022-02-21 15:59:13 -05:00
parent 03c4aa5120
commit 8acef622f4
1 changed files with 125 additions and 48 deletions

157
regex.tal
View File

@ -38,16 +38,24 @@
( dot matches any one char [ #02 next* ] )
( lit matches one specific char (c) [ #03 c^ next* ] )
( or matches either left or right [ #04 left* right* ] )
( star matches expr zero-or-more times [ #05 r* next* ] )
( star matches expr zero-or-more times [ #05 expr* next* ] )
( (NOTE: r.expr.next must be r) )
( )
( caret matches start of line/string [ #06 next* ] )
( dollar matches end of line/string [ #07 next* ] )
( lpar starts subgroup region [ #08 i* next* ] )
( rpar ends subgroup region [ #09 i* next* ] )
( )
( `or` and `star` have the same structure and are handled by the )
( same code (;do-or). however, the node types are kept different )
( to make it clearer how to parse and assemble the nodes. )
( )
( dollar nodes contain a next pointer even though this usually )
( will not be needed. )
( )
( lpar and rpar contain addresses pointing between subgroup-bot )
( and subgroup-top. rpar's address will always be +2 relative to )
( the corresponding lpar address. )
( )
( concatenation isn't a node, it is implied by the *next addr. )
( a next value of #0000 signals the end of the regex. )
( )
@ -55,6 +63,15 @@
( regexes should not include nulls and cannot match them (other )
( than the null which signals the end of a string). )
( TODO: we have lpar and rpar nodes but aren't using them yet )
( 1. need to modify c-lpar and c-rpar )
( 2. we need to store subgroup-pos in regions during parsing: )
( a. need to store the current pos in the region )
( b. need to call start to move subgroup-pos forward )
( 3. when finishing parsing a region we need lpar/rpar nodes )
( 4. we also need to store "last started subgroup" on the stack )
( 5. when backtracking we must rewind to "last started" subgroup )
%debug { #ff #0e DEO }
%emit { #18 DEO }
%space { #20 emit }
@ -144,6 +161,8 @@
LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
LDAk #06 EQU ;do-caret JCN2
LDAk #07 EQU ;do-dollar JCN2
LDAk #08 EQU ;do-lpar JCN2
LDAk #09 EQU ;do-rpar JCN2
;unknown-node-type ;error! JSR2
( used when we hit a dead-end during matching. )
@ -169,6 +188,20 @@
INC2 LDA2 ( load next )
;goto-next JMP2 ( jump to next )
( handle lpar -- store the current string position at the address )
( held in the node, via ;subgroup-start, then continue with next. )
( node layout: type byte, i*, next*. )
@do-lpar ( str* regex* -> bool^ )
INC2 STH2k ( s r+1 [r+1] )
LDA2 OVR2 SWP2 ( s s i [r+1] )
;subgroup-start JSR2 ( s [r+1] )
STH2r #0002 ADD2 ( s r+3 )
LDA2 ;goto-next JMP2 ( load next and jump to it )
( handle rpar -- store the current string position at the address )
( held in the node, via ;subgroup-finish, then continue with next. )
( node layout: type byte, i*, next*. )
@do-rpar ( str* regex* -> bool^ )
INC2 STH2k ( s r+1 [r+1] )
LDA2 OVR2 SWP2 ( s s i [r+1] )
;subgroup-finish JSR2 ( s [r+1] )
STH2r #0002 ADD2 ( s r+3 )
LDA2 ;goto-next JMP2 ( load next and jump to it )
( handle dot -- match any one character )
@do-dot ( str* regex* -> bool^ )
INC2 LDA2 STH2 ( load and stash next )
@ -178,19 +211,23 @@
;search-mode LDA ,&backtrack JCN ( if \n and search-mode, treat as EOF )
&match INC2 STH2r ;goto-next JMP2 ( on match: inc s, restore and jump )
( TODO: support multi-line=0 )
( handle caret -- match string start (or possibly after newline) without advancing )
@do-caret ( str* regex* -> bool^ )
INC2 LDA2 STH2 ( load and stash next )
DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? )
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
&at-start STH2r ;goto-next JMP2 ( yes, goto next without advancing str )
;match-multiline LDA #00 EQU ,&no-match JCN ( not in multi-line mode? then only string start matches )
DUP2 #0001 SUB2 LDA #0a EQU ,&at-start JCN ( just after newline? peek at s-1 on a copy so s itself is unchanged )
&no-match POP2r POP2 ;goto-backtrack JMP2 ( clear stacks and backtrack )
&at-start STH2r ;goto-next JMP2 ( go to next without advancing )
( TODO: support multi-line=0 )
( handle dollar -- match string end (or possibly before newline) without advancing )
@do-dollar ( str* regex* -> bool^ )
INC2 LDA2 STH2 ( load and stash next )
LDAk #00 EQU ,&at-end JCN ( at string end? )
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
&at-end STH2r ;goto-next JMP2 ( yes, goto next without advancing str )
;match-multiline LDA #00 EQU ,&no-match JCN ( not in multi-line mode? then only string end matches )
LDAk #0a EQU ,&at-end JCN ( at newline? LDAk keeps s on the stack )
&no-match POP2r POP2 ;goto-backtrack JMP2 ( clear stacks and backtrack )
&at-end STH2r ;goto-next JMP2 ( go to next without advancing )
( handle literal -- match one specific character )
@do-literal ( str* regex* -> bool^ )
@ -229,17 +266,6 @@
@search-start $2
@search-end $2
(
( used for subgroup match start/end addresses )
@groups [ &s0 $2 &e0 $2 &s1 $2 &e1 $2
&s2 $2 &e2 $2 &s3 $2 &e3 $2
&s4 $2 &e4 $2 &s5 $2 &e5 $2
&s6 $2 &e6 $2 &s7 $2 &e7 $2
&s8 $2 &e8 $2 &s9 $2 &e9 $2 ]
( position of last finished group )
@group-pos $2 )
( track the position in the input string )
@pos $2
@ -397,7 +423,7 @@
( allocates a dot-node and continues. )
@c-dot ( c^ -> r2* )
POP ( drop the character that was read )
;alloc-dot JSR2 ( dot )
#02 ;alloc3 JSR2 ( #02 is the dot node type; called with JSR2 so control comes back here )
DUP2 ;c-peek-and-finalize JMP2 ( duplicate the node address, then tail-jump )
( called when we read "^" )
@ -405,7 +431,7 @@
( allocates a caret-node and continues. )
@c-caret ( c^ -> r2* )
POP ( drop the character that was read )
;alloc-caret JSR2 ( caret )
#06 ;alloc3 JSR2 ( #06 is the caret node type; must be JSR2 so the next line still runs )
DUP2 ;c-peek-and-finalize JMP2 ( duplicate the node address, then tail-jump )
( called when we read "$" )
@ -413,7 +439,7 @@
( allocates a dollar-node and continues. )
@c-dollar ( c^ -> r2* )
POP ( drop the character that was read )
;alloc-dollar JSR2 ( dollar )
#07 ;alloc3 JSR2 ( #07 is the dollar node type; must be JSR2 so the next line still runs )
DUP2 ;c-peek-and-finalize JMP2 ( duplicate the node address, then tail-jump )
( called when we read "\" )
@ -479,15 +505,6 @@
@alloc-empty ( -> r* )
#01 ;alloc3 JMP2
@alloc-dot ( -> r* )
#02 ;alloc3 JMP2
@alloc-caret ( -> r* )
#06 ;alloc3 JMP2
@alloc-dollar ( -> r* )
#07 ;alloc3 JMP2
@alloc-lit ( c^ -> r* )
#03 #0000 SWP2 ( 0000 c^ 03 )
#04 ;alloc JSR2 ( 0000 c^ 03 addr* )
@ -584,16 +601,23 @@
&is-zero STA2 JMP2r
( set regex.next to target )
( )
( node types 1-7 are defined. )
( )
( all node types except star (5) and lit (3) store their next )
( pointer one byte off of their own address. )
( )
( since both branches of an or (4) node are supposed to meet )
( back up we only bother taking the left branch. otherwise )
( you can end up double-appending things. )
( looks at the node type byte at regex* to pick the offset of the )
( node's next pointer, then calls ;set-next-addr on that address. )
( offsets used: +3 for star #05, +2 for lit #03, +1 otherwise. )
( NOTE -- review: the range check sends every type above #07 to )
( &unknown, so lpar #08 and rpar #09 nodes are rejected here. )
@set-next ( target* regex* -> )
LDAk #01 NEQ ,&!1 JCN INC2 ;set-next-addr JSR2 JMP2r
&!1 LDAk #02 NEQ ,&!2 JCN INC2 ;set-next-addr JSR2 JMP2r
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
&!3 LDAk #04 NEQ ,&!4 JCN INC2 ;set-next-addr JSR2 JMP2r
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
( &!5 ;unknown-node-type ;error! JSR2 )
&!5 LDAk #06 NEQ ,&!6 JCN INC2 ;set-next-addr JSR2 JMP2r
&!6 LDAk #07 NEQ ,&!7 JCN INC2 ;set-next-addr JSR2 JMP2r
&!7 ;unknown-node-type ;error! JSR2
LDAk #01 LTH ,&unknown JCN
LDAk #07 GTH ,&unknown JCN
LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ,&continue JMP
&!5 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ,&continue JMP
&!3 INC2
&continue ;set-next-addr JSR2 JMP2r
&unknown ;unknown-node-type ;error! JSR2
@set-next-or-addr ( target* addr* -> )
LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
@ -711,6 +735,59 @@
@arena-pos :arena-bot ( the next position to allocate )
@arena-bot $400 @arena-top ( holds up to 1024 bytes )
( SUBGROUP OPERATIONS )
( )
( subgroups are parts of the input string that are matched by )
( parenthesized subgroup expressions in a regex. )
( )
( for example, (a*)(b*)(c*) has 3 subgroup expressions. )
( )
( during matching, subgroups are represented by 4 bytes )
( which are interpreted as two short values: )
( )
( - bytes 0-1: absolute address of the start of the subgroup )
( - bytes 2-3: absolute address of the limit of the subgroup )
( )
( this means that to get a null-terminated subgroup string )
( you will need to copy it somewhere else with enough space, )
( or else mutate the input string to add a null. )
( )
( since input strings themselves are null-terminated, and since )
( subgroups never include null terminators, we will always have )
( a valid limit value even for input strings that end at #ffff. )
( )
( during regex parsing we will use subgroup-pos to track the )
( next available subgroup position. )
( store s as a short at address i. )
( when i is not below subgroup-pos, subgroup-pos is first moved )
( to i + 4, i.e. just past the 4-byte record at i. )
@subgroup-start ( s* i* -> )
;subgroup-pos LDA2 OVR2 GTH2 ( s i pos>i? )
,&store JCN ( s i )
DUP2 #0004 ADD2 ( s i i+4 )
;subgroup-pos STA2 ( s i )
&store STA2 JMP2r
( store the string position s as a short at address i. )
( called from ;do-rpar with the address held in the rpar node. )
@subgroup-finish ( s* i* -> )
STA2 JMP2r
( rewind subgroup state back to position i. )
( )
( p walks downward from subgroup-pos - 4 in 4-byte steps. every )
( record with p >= i has both of its shorts zeroed. the walk stops )
( as soon as p drops below i, and subgroup-pos is then set to i. )
@subgroup-backtrack ( i* -> )
;subgroup-pos LDA2 ( i pos )
&loop #0004 SUB2 ( i p )
GTH2k ,&done JCN ( i p -- stop once i > p, i.e. p is below the rewind point )
#0000 OVR2 STA2 ( i p -- clear the start address )
#0000 OVR2 #0002 ADD2 STA2 ( i p -- clear the limit address )
,&loop JMP
&done POP2 ;subgroup-pos STA2 ( subgroup-pos = i )
JMP2r
( reset all subgroup state: point subgroup-pos back at subgroup-bot )
( and write #0000 to every short from subgroup-bot up to, but not )
( including, subgroup-top. )
@subgroup-reset ( -> )
;subgroup-bot DUP2 ;subgroup-pos STA2 ( p )
&loop DUP2 ;subgroup-top LTH2 ,&zero JCN ( p -- still below top? )
POP2 JMP2r
&zero #0000 OVR2 STA2 ( p -- clear the short at p )
INC2 INC2 ,&loop JMP
@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup )
@subgroup-bot $800 @subgroup-top ( holds up to 512 subgroups (2048 bytes) )
( INTERVAL OPERATIONS )
( )
( not baked yet )