start implementing subgroups
This commit is contained in:
parent
03c4aa5120
commit
8acef622f4
157
regex.tal
157
regex.tal
|
@ -38,16 +38,24 @@
|
||||||
( dot matches any one char [ #02 next* ] )
|
( dot matches any one char [ #02 next* ] )
|
||||||
( lit matches one specific char (c) [ #03 c^ next* ] )
|
( lit matches one specific char (c) [ #03 c^ next* ] )
|
||||||
( or matches either left or right [ #04 left* right* ] )
|
( or matches either left or right [ #04 left* right* ] )
|
||||||
( star matches expr zero-or-more times [ #05 r* next* ] )
|
( star matches expr zero-or-more times [ #05 expr* next* ] )
|
||||||
( (NOTE: r.expr.next must be r) )
|
( (NOTE: r.expr.next must be r) )
|
||||||
( )
|
|
||||||
( caret matches start of line/string [ #06 next* ] )
|
( caret matches start of line/string [ #06 next* ] )
|
||||||
( dollar matches end of line/string [ #07 next* ] )
|
( dollar matches end of line/string [ #07 next* ] )
|
||||||
|
( lpar starts subgroup region [ #08 i* next* ] )
|
||||||
|
( rpar ends subgroup region [ #09 i* next* ] )
|
||||||
( )
|
( )
|
||||||
( `or` and `star` have the same structure and are handled by the )
|
( `or` and `star` have the same structure and are handled by the )
|
||||||
( same code (;do-or). however, the node types are kept different )
|
( same code (;do-or). however, the node types are kept different )
|
||||||
( to make it clearer how to parse and assemble the nodes. )
|
( to make it clearer how to parse and assemble the nodes. )
|
||||||
( )
|
( )
|
||||||
|
( dollar nodes contain a next pointer even though this usually )
|
||||||
|
( will not be needed. )
|
||||||
|
( )
|
||||||
|
( lpar and rpar contain addresses pointing between subgroup-bot )
|
||||||
|
( and subgroup-top. rpar's address will always be +2 relative to )
|
||||||
|
( the corresponding lpar address. )
|
||||||
|
( )
|
||||||
( concatenation isn't a node, it is implied by the *next addr. )
|
( concatenation isn't a node, it is implied by the *next addr. )
|
||||||
( a next value of #0000 signals the end of the regex. )
|
( a next value of #0000 signals the end of the regex. )
|
||||||
( )
|
( )
|
||||||
|
@ -55,6 +63,15 @@
|
||||||
( regexes should not include nulls and cannot match them (other )
|
( regexes should not include nulls and cannot match them (other )
|
||||||
( than the null which signals the end of a string). )
|
( than the null which signals the end of a string). )
|
||||||
|
|
||||||
|
( TODO: we have lpar and rpar nodes but aren't using them yet )
|
||||||
|
( 1. need to modify c-lpar and c-rpar )
|
||||||
|
( 2. we need to store subgroup-pos in regions during parsing: )
|
||||||
|
( a. need to store the current pos in the region )
|
||||||
|
( b. need to call start to move subgroup-pos forward )
|
||||||
|
( 3. when finishing parsing a region we need lpar/rpar nodes )
|
||||||
|
( 4. we also need to store "last started subgroup" on the stack )
|
||||||
|
( 5. when backtracking we must rewind to "last started" subgroup )
|
||||||
|
|
||||||
%debug { #ff #0e DEO }
|
%debug { #ff #0e DEO }
|
||||||
%emit { #18 DEO }
|
%emit { #18 DEO }
|
||||||
%space { #20 emit }
|
%space { #20 emit }
|
||||||
|
@ -144,6 +161,8 @@
|
||||||
LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
|
LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
|
||||||
LDAk #06 EQU ;do-caret JCN2
|
LDAk #06 EQU ;do-caret JCN2
|
||||||
LDAk #07 EQU ;do-dollar JCN2
|
LDAk #07 EQU ;do-dollar JCN2
|
||||||
|
LDAk #08 EQU ;do-lpar JCN2
|
||||||
|
LDAk #09 EQU ;do-rpar JCN2
|
||||||
;unknown-node-type ;error! JSR2
|
;unknown-node-type ;error! JSR2
|
||||||
|
|
||||||
( used when we hit a dead-end during matching. )
|
( used when we hit a dead-end during matching. )
|
||||||
|
@ -169,6 +188,20 @@
|
||||||
INC2 LDA2 ( load next )
|
INC2 LDA2 ( load next )
|
||||||
;goto-next JMP2 ( jump to next )
|
;goto-next JMP2 ( jump to next )
|
||||||
|
|
||||||
|
@do-lpar ( str* regex* -> bool^ ) ( handle lpar -- record s at the node's i address without advancing )
|
||||||
|
STH2 DUP2 ( s s [r] -- stash the node address, copy s so one copy can be stored )
|
||||||
|
INC2r LDA2kr STH2r ( s s i [r+1] -- i is the short that follows the type byte )
|
||||||
|
;subgroup-start JSR2 ( s [r+1] -- consumes one s and i )
|
||||||
|
STH2r INC2 INC2 ( s r+3 -- step over i to reach the next pointer )
|
||||||
|
LDA2 ;goto-next JMP2 ( jump to next )
|
||||||
|
|
||||||
|
@do-rpar ( str* regex* -> bool^ ) ( handle rpar -- record s at the node's i address without advancing )
|
||||||
|
STH2 DUP2 ( s s [r] -- stash the node address, copy s so one copy can be stored )
|
||||||
|
INC2r LDA2kr STH2r ( s s i [r+1] -- i is the short that follows the type byte )
|
||||||
|
;subgroup-finish JSR2 ( s [r+1] -- consumes one s and i )
|
||||||
|
STH2r INC2 INC2 ( s r+3 -- step over i to reach the next pointer )
|
||||||
|
LDA2 ;goto-next JMP2 ( jump to next )
|
||||||
|
|
||||||
( handle dot -- match any one character )
|
( handle dot -- match any one character )
|
||||||
@do-dot ( str* regex* -> bool^ )
|
@do-dot ( str* regex* -> bool^ )
|
||||||
INC2 LDA2 STH2 ( load and stash next )
|
INC2 LDA2 STH2 ( load and stash next )
|
||||||
|
@ -178,19 +211,23 @@
|
||||||
;search-mode LDA ,&backtrack JCN ( if \n and search-mode, treat as EOF )
|
;search-mode LDA ,&backtrack JCN ( if \n and search-mode, treat as EOF )
|
||||||
&match INC2 STH2r ;goto-next JMP2 ( on match: inc s, restore and jump )
|
&match INC2 STH2r ;goto-next JMP2 ( on match: inc s, restore and jump )
|
||||||
|
|
||||||
( TODO: support multi-line=0 )
|
( handle caret -- match string start (or possibly after newline) without advancing )
|
||||||
@do-caret ( str* regex* -> bool^ )
|
@do-caret ( str* regex* -> bool^ )
|
||||||
INC2 LDA2 STH2 ( load and stash next )
|
INC2 LDA2 STH2 ( load and stash next )
|
||||||
DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? )
|
DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? )
|
||||||
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
|
;match-multiline LDA ,&no-match JCN ( are we in multi-line mode? )
|
||||||
&at-start STH2r ;goto-next JMP2 ( yes, goto next without advancing str )
|
DUP2 #0001 SUB2 LDA #0a EQU ,&at-start JCN ( just after newline? peek at s-1 on a copy so s itself is unchanged )
|
||||||
|
&no-match POP2r POP2 ;goto-backtrack JMP2 ( clear stacks and backtrack )
|
||||||
|
&at-start STH2r ;goto-next JMP2 ( go to next without advancing )
|
||||||
|
|
||||||
( TODO: support multi-line=0 )
|
( handle dollar -- match string end (or possibly before newline) without advancing )
|
||||||
@do-dollar ( str* regex* -> bool^ )
|
@do-dollar ( str* regex* -> bool^ )
|
||||||
INC2 LDA2 STH2 ( load and stash next )
|
INC2 LDA2 STH2 ( load and stash next )
|
||||||
LDAk #00 EQU ,&at-end JCN ( at string end? )
|
LDAk #00 EQU ,&at-end JCN ( at string end? )
|
||||||
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
|
;match-multiline LDA ,&no-match JCN ( non-zero match-multiline skips the newline check -- NOTE: confirm the flag's polarity where it is defined )
|
||||||
&at-end STH2r ;goto-next JMP2 ( yes, goto next without advancing str )
|
LDAk #0a EQU ,&at-end JCN ( at newline? LDAk keeps s on the stack )
|
||||||
|
&no-match POP2r POP2 ;goto-backtrack JMP2 ( drop stashed next and s, then backtrack )
|
||||||
|
&at-end STH2r ;goto-next JMP2 ( restore next and continue with s unchanged )
|
||||||
|
|
||||||
( handle literal -- match one specific character )
|
( handle literal -- match one specific character )
|
||||||
@do-literal ( str* regex* -> bool^ )
|
@do-literal ( str* regex* -> bool^ )
|
||||||
|
@ -229,17 +266,6 @@
|
||||||
@search-start $2
|
@search-start $2
|
||||||
@search-end $2
|
@search-end $2
|
||||||
|
|
||||||
(
|
|
||||||
( used for subgroup match start/end addresses )
|
|
||||||
@groups [ &s0 $2 &e0 $2 &s1 $2 &e1 $2
|
|
||||||
&s2 $2 &e2 $2 &s3 $2 &e3 $2
|
|
||||||
&s4 $2 &e4 $2 &s5 $2 &e5 $2
|
|
||||||
&s6 $2 &e6 $2 &s7 $2 &e7 $2
|
|
||||||
&s8 $2 &e8 $2 &s9 $2 &e9 $2 ]
|
|
||||||
|
|
||||||
( position of last finished group )
|
|
||||||
@group-pos $2 )
|
|
||||||
|
|
||||||
( track the position in the input string )
|
( track the position in the input string )
|
||||||
@pos $2
|
@pos $2
|
||||||
|
|
||||||
|
@ -397,7 +423,7 @@
|
||||||
( allocates a dot-node and continues. )
|
( allocates a dot-node and continues. )
|
||||||
@c-dot ( c^ -> r2* )
|
@c-dot ( c^ -> r2* )
|
||||||
POP ( discard c )
|
POP ( discard c )
|
||||||
;alloc-dot JSR2 ( dot )
|
#02 ;alloc3 JSR2 ( dot -- same call the old alloc-dot helper made )
|
||||||
DUP2 ;c-peek-and-finalize JMP2 ( dot dot -- tail-call the finalizer )
|
DUP2 ;c-peek-and-finalize JMP2 ( dot dot -- tail-call the finalizer )
|
||||||
|
|
||||||
( called when we read "^" )
|
( called when we read "^" )
|
||||||
|
@ -405,7 +431,7 @@
|
||||||
( allocates a caret-node and continues. )
|
( allocates a caret-node and continues. )
|
||||||
@c-caret ( c^ -> r2* )
|
@c-caret ( c^ -> r2* )
|
||||||
POP
|
POP
|
||||||
;alloc-caret JSR2 ( caret )
|
#06 ;alloc3 JSR2 ( caret -- must be JSR2 so control returns here for the finalize step )
|
||||||
DUP2 ;c-peek-and-finalize JMP2
|
DUP2 ;c-peek-and-finalize JMP2
|
||||||
|
|
||||||
( called when we read "$" )
|
( called when we read "$" )
|
||||||
|
@ -413,7 +439,7 @@
|
||||||
( allocates a dollar-node and continues. )
|
( allocates a dollar-node and continues. )
|
||||||
@c-dollar ( c^ -> r2* )
|
@c-dollar ( c^ -> r2* )
|
||||||
POP
|
POP
|
||||||
;alloc-dollar JSR2 ( dollar )
|
#07 ;alloc3 JSR2 ( dollar -- must be JSR2 so control returns here for the finalize step )
|
||||||
DUP2 ;c-peek-and-finalize JMP2
|
DUP2 ;c-peek-and-finalize JMP2
|
||||||
|
|
||||||
( called when we read "\" )
|
( called when we read "\" )
|
||||||
|
@ -479,15 +505,6 @@
|
||||||
@alloc-empty ( -> r* )
|
@alloc-empty ( -> r* )
|
||||||
#01 ;alloc3 JMP2 ( tail-jump into alloc3 with #01 on the stack )
|
#01 ;alloc3 JMP2 ( tail-jump into alloc3 with #01 on the stack )
|
||||||
|
|
||||||
@alloc-dot ( -> r* )
|
|
||||||
#02 ;alloc3 JMP2
|
|
||||||
|
|
||||||
@alloc-caret ( -> r* )
|
|
||||||
#06 ;alloc3 JMP2
|
|
||||||
|
|
||||||
@alloc-dollar ( -> r* )
|
|
||||||
#07 ;alloc3 JMP2
|
|
||||||
|
|
||||||
@alloc-lit ( c^ -> r* )
|
@alloc-lit ( c^ -> r* )
|
||||||
#03 #0000 SWP2 ( 0000 c^ 03 )
|
#03 #0000 SWP2 ( 0000 c^ 03 )
|
||||||
#04 ;alloc JSR2 ( 0000 c^ 03 addr* )
|
#04 ;alloc JSR2 ( 0000 c^ 03 addr* )
|
||||||
|
@ -584,16 +601,23 @@
|
||||||
&is-zero STA2 JMP2r
|
&is-zero STA2 JMP2r
|
||||||
|
|
||||||
( set regex.next to target )
|
( set regex.next to target )
|
||||||
|
( )
|
||||||
|
( node types 1-7 are defined. )
|
||||||
|
( )
|
||||||
|
( all node types except star (5) and lit (3) store their next )
|
||||||
|
( pointer one byte off of their own address. )
|
||||||
|
( )
|
||||||
|
( since both branches of an or (4) node are supposed to meet )
|
||||||
|
( back up we only bother taking the left branch. otherwise )
|
||||||
|
( you can end up double-appending things. )
|
||||||
@set-next ( target* regex* -> )
|
@set-next ( target* regex* -> )
|
||||||
LDAk #01 NEQ ,&!1 JCN INC2 ;set-next-addr JSR2 JMP2r
|
LDAk #01 LTH ,&unknown JCN ( type byte below 1 is rejected )
|
||||||
&!1 LDAk #02 NEQ ,&!2 JCN INC2 ;set-next-addr JSR2 JMP2r
|
LDAk #07 GTH ,&unknown JCN ( type byte above 7 is rejected -- NOTE: this includes lpar #08 and rpar #09, confirm once subgroups are wired in )
|
||||||
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
|
LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ,&continue JMP ( star: next pointer is at regex+3 )
|
||||||
&!3 LDAk #04 NEQ ,&!4 JCN INC2 ;set-next-addr JSR2 JMP2r
|
&!5 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ,&continue JMP ( lit: next pointer is at regex+2 )
|
||||||
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
|
&!3 INC2 ( every other type: pointer at regex+1, which is the left branch for or )
|
||||||
( &!5 ;unknown-node-type ;error! JSR2 )
|
&continue ;set-next-addr JSR2 JMP2r ( target addr -- hand off to set-next-addr )
|
||||||
&!5 LDAk #06 NEQ ,&!6 JCN INC2 ;set-next-addr JSR2 JMP2r
|
&unknown ;unknown-node-type ;error! JSR2 ( report the unknown node type )
|
||||||
&!6 LDAk #07 NEQ ,&!7 JCN INC2 ;set-next-addr JSR2 JMP2r
|
|
||||||
&!7 ;unknown-node-type ;error! JSR2
|
|
||||||
|
|
||||||
@set-next-or-addr ( target* addr* -> )
|
@set-next-or-addr ( target* addr* -> )
|
||||||
LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
|
LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
|
||||||
|
@ -711,6 +735,59 @@
|
||||||
@arena-pos :arena-bot ( the next position to allocate )
|
@arena-pos :arena-bot ( the next position to allocate )
|
||||||
@arena-bot $400 @arena-top ( holds up to 1024 bytes )
|
@arena-bot $400 @arena-top ( holds up to 1024 bytes )
|
||||||
|
|
||||||
|
( SUBGROUP OPERATIONS )
|
||||||
|
( )
|
||||||
|
( subgroups are parts of the input string that are matched by )
|
||||||
|
( parenthesized subgroup expressions in a regex. )
|
||||||
|
( )
|
||||||
|
( for example, (a*)(b*)(c*) has 3 subgroup expressions. )
|
||||||
|
( )
|
||||||
|
( during matching, subgroups are represented by 4-bytes )
|
||||||
|
( which are interpreted as two short values: )
|
||||||
|
( )
|
||||||
|
( - bytes 0-1: absolute address of the start of the subgroup )
|
||||||
|
( - bytes 2-3: absolute address of the limit of the subgroup )
|
||||||
|
( )
|
||||||
|
( this means that to get a null-terminated subgroup string )
|
||||||
|
( you will need to copy it somewhere else with enough space, )
|
||||||
|
( or else mutate the input string to add a null. )
|
||||||
|
( )
|
||||||
|
( since input strings themselves are null-terminated, and since )
|
||||||
|
( subgroups never include null terminators, we will always have )
|
||||||
|
( a valid limit value even for input strings that end at #ffff. )
|
||||||
|
( )
|
||||||
|
( during regex parsing we will use subgroup-pos to track the )
|
||||||
|
( next available subgroup position. )
|
||||||
|
|
||||||
|
@subgroup-start ( s* i* -> ) ( store s at address i, advancing subgroup-pos past i when needed )
|
||||||
|
DUP2 ;subgroup-pos LDA2 LTH2 ,&write JCN ( s i -- i below subgroup-pos: skip the update )
|
||||||
|
DUP2 #0004 ADD2 ;subgroup-pos STA2 ( s i -- otherwise subgroup-pos becomes i+4 )
|
||||||
|
&write STA2 JMP2r ( write s to the two bytes at i )
|
||||||
|
|
||||||
|
@subgroup-finish ( s* i* -> ) ( store s at address i -- subgroup-pos is not touched here )
|
||||||
|
STA2 JMP2r ( write s to the two bytes at i )
|
||||||
|
|
||||||
|
@subgroup-backtrack ( i* -> ) ( zero every subgroup entry from i up to subgroup-pos, then set subgroup-pos to i )
|
||||||
|
;subgroup-pos LDA2 ( i pos )
|
||||||
|
&loop #0004 SUB2 ( i p -- step down to the previous 4-byte entry )
|
||||||
|
GTH2k ,&done JCN ( i p -- stop once p has dropped below i )
|
||||||
|
#0000 OVR2 STA2 ( i p -- zero the start address at p )
|
||||||
|
#0000 OVR2 #0002 ADD2 STA2 ( i p -- zero the limit address at p+2 )
|
||||||
|
,&loop JMP
|
||||||
|
&done POP2 ;subgroup-pos STA2 ( drop p and store i as the new subgroup-pos )
|
||||||
|
JMP2r
|
||||||
|
|
||||||
|
@subgroup-reset ( -> ) ( point subgroup-pos at subgroup-bot and zero the region up to subgroup-top )
|
||||||
|
;subgroup-bot ;subgroup-pos STA2 ( subgroup-pos = subgroup-bot )
|
||||||
|
;subgroup-top ;subgroup-bot LIT2r 0000 ( top p [0000] -- the zero to write lives on the return stack )
|
||||||
|
&loop GTH2k ,&continue JCN ( keep going while top > p )
|
||||||
|
POP2 POP2 POP2r JMP2r ( drop p, top and the stashed zero, then return )
|
||||||
|
&continue STH2kr OVR2 STA2 ( top p -- write 0000 at p )
|
||||||
|
INC2 INC2 ,&loop JMP ( p += 2 )
|
||||||
|
|
||||||
|
@subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup )
|
||||||
|
@subgroup-bot $800 @subgroup-top ( holds up to 512 subgroups (2048 bytes) )
|
||||||
|
|
||||||
( INTERVAL OPERATIONS )
|
( INTERVAL OPERATIONS )
|
||||||
( )
|
( )
|
||||||
( not baked yet )
|
( not baked yet )
|
||||||
|
|
Loading…
Reference in New Issue