460 lines
14 KiB
Tal
460 lines
14 KiB
Tal
( regex.tal )
|
|
( )
|
|
( compiles regex expression strings into regex nodes, then uses )
|
|
( regex ndoes to match input strings. )
|
|
( )
|
|
( this currently only supports matching an entire string, as )
|
|
( opposed to searching for a matching substring, or extracting )
|
|
( matching subgroups. )
|
|
( )
|
|
( regex node types: )
|
|
( )
|
|
( NAME DESCRIPTION STRUCT )
|
|
( empty matches empty string [ #01 next* ] )
|
|
( dot matches any one char [ #02 next* ] )
|
|
( lit matches one specific char (c) [ #03 c^ next* ] )
|
|
( or matches either left or right [ #04 left* right* ] )
|
|
( star matches expr zero-or-more times [ #05 r* next* ] )
|
|
( (NOTE: r.expr.next must be r) )
|
|
( )
|
|
( `or` and `star` have the same structure and are handled by the )
|
|
( same code (;do-or). however, the node types are kept different )
|
|
( to make it clearer how to parse and assemble the nodes. )
|
|
( )
|
|
( concatenation isn't a node, it is implied by the *next addr. )
|
|
( a next value of #0000 signals the end of the regex. )
|
|
( )
|
|
( in these docs str* is an address to a null-terminated string. )
|
|
( regexes should not include nulls and cannot match them (other )
|
|
( than the null which signals the end of a string). )
|
|
|
|
%null? { #00 EQU }
|
|
%debug { #ff #0e DEO }
|
|
%emit { #18 DEO }
|
|
%space { #20 emit }
|
|
%newline { #0a emit }
|
|
%print { debug newline }
|
|
%error! { #00 #00 DIV }
|
|
|
|
( test -> )
|
|
|
|
|0100
|
|
;expr1 ;compile JSR2
|
|
print
|
|
;emit-stack JSR2
|
|
;emit-arena JSR2
|
|
newline
|
|
|
|
;test1 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
|
;test2 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
|
;test3 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
|
;test4 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
|
;test5 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
|
;test6 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
|
;test7 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
|
;test8 OVR2k ;match JSR2 ;emit-byte JSR2 newline
|
|
newline
|
|
|
|
;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
|
;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
|
;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
|
;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
|
;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
|
;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
|
;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
|
;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
|
|
BRK
|
|
|
|
@match ( str* regex* -> bool^ )
|
|
;reset-stack JSR2
|
|
;loop JMP2
|
|
|
|
( we don't use the return stack here since that )
|
|
( complicates the back-tracking we need to do. )
|
|
( ultimately this code will issue a JMP2r to )
|
|
( return a boolean, which is where the stack )
|
|
( effects signature comes from. )
|
|
|
|
@loop ( s* r* -> bool^ )
|
|
LDAk #01 EQU ;do-empty JCN2
|
|
LDAk #02 EQU ;do-dot JCN2
|
|
LDAk #03 EQU ;do-literal JCN2
|
|
LDAk #04 EQU ;do-or JCN2
|
|
LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
|
|
error!
|
|
|
|
@goto-backtrack ( -> bool^ )
|
|
;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
|
|
#00 JMP2r ( no, return false )
|
|
&has-stack
|
|
;pop4 JSR2
|
|
;goto-next JMP2 ( yes, resume from the top )
|
|
|
|
@goto-next ( str* next* -> bool^ )
|
|
DUP2 #0000 GTH2 ,&has-next JCN
|
|
POP2 LDA null? ,&end-of-string JCN
|
|
;goto-backtrack JMP2
|
|
&end-of-string #01 JMP2r
|
|
&has-next ;loop JMP2
|
|
|
|
@do-empty ( str* regex* -> bool^ )
|
|
INC2 LDA2 ( load next )
|
|
;goto-next JMP2 ( jump to next )
|
|
|
|
@do-dot ( str* regex* -> bool^ )
|
|
INC2 LDA2 STH2 ( load and stash next )
|
|
LDAk #00 NEQ ,&non-empty JCN ( is there a char? )
|
|
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
|
|
&non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
|
|
|
|
@do-literal ( str* regex* -> bool^ )
|
|
INC2
|
|
LDAk STH ( store c )
|
|
INC2 LDA2 STH2 ROTr ( store next, move c to top )
|
|
LDAk
|
|
STHr EQU ,&matches JCN ( do we match this char? )
|
|
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
|
|
&matches
|
|
INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
|
|
|
|
( this also handles asteration, since it ends up having the same structure )
|
|
@do-or ( str* regex* -> bool^ )
|
|
INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 )
|
|
LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking )
|
|
LDA2 ;loop JMP2 ( continue on left branch )
|
|
|
|
( track the position in the input string )
|
|
@pos $2
|
|
|
|
( track how many levels deep we are in parenthesis )
|
|
@parens $2
|
|
|
|
( read and increments pos )
|
|
@read ( -> c^ )
|
|
;pos LDA2k ( pos s )
|
|
LDAk STHk #00 EQU ( pos s c=0 [c] )
|
|
,&is-eof JCN ( pos s [c] )
|
|
INC2 ( pos s+1 [c] )
|
|
SWP2 STA2 ,&return JMP ( [c] )
|
|
&is-eof POP2 POP2
|
|
&return STHr ( c ) JMP2r
|
|
|
|
( read pos )
|
|
@peek ( -> c^ )
|
|
;pos LDA2 LDA JMP2r
|
|
|
|
( TODO: [] + ? )
|
|
( compile an expression string into a regex graph )
|
|
@compile ( expr* -> regex* )
|
|
;pos STA2
|
|
#0000 ;parens STA2
|
|
;reset-stack JSR2
|
|
;compile-region JMP2
|
|
|
|
( the basic strategy here is to build a stack of non-or )
|
|
( expressions to be joined together at the end of the )
|
|
( region. each stack entry has two regex addresses: )
|
|
( - the start of the regex )
|
|
( - the current tail of the regex )
|
|
( when we concatenate a new node to a regex we update )
|
|
( the second of these but not the first. )
|
|
( )
|
|
( the bottom of the stack for a given region is denoted )
|
|
( by #ffff #ffff. above that we start with #0000 #0000 )
|
|
( to signal an empty node. )
|
|
@compile-region ( -> r2* )
|
|
#ffff #ffff ;push4 JSR2 ( stack delimiter )
|
|
#0000 #0000 ;push4 JSR2 ( stack frame start )
|
|
@compile-region-loop
|
|
;read JSR2
|
|
DUP #00 EQU ;c-done JCN2
|
|
DUP LIT '* EQU ;c-star JCN2
|
|
DUP LIT '| EQU ;c-or JCN2
|
|
DUP LIT '. EQU ;c-dot JCN2
|
|
DUP LIT '( EQU ;c-lpar JCN2
|
|
DUP LIT ') EQU ;c-rpar JCN2
|
|
DUP LIT '\ EQU ;c-esc JCN2
|
|
;c-char JMP2
|
|
|
|
@c-done ( c^ -> r2* )
|
|
POP
|
|
;parens LDA2 #0000 GTH2 ,&mismatched-parens JCN
|
|
;unroll-stack JSR2 POP2 JMP2r
|
|
&mismatched-parens error!
|
|
|
|
@c-or ( c^ -> r2* )
|
|
POP
|
|
#0000 #0000 ;push4 JSR2
|
|
;compile-region-loop JMP2
|
|
|
|
@c-lpar ( c^ -> r2* )
|
|
POP
|
|
;parens LDA2 INC2 ;parens STA2 ( parens++ )
|
|
;compile-region JMP2
|
|
|
|
@c-rpar ( c^ -> r2* )
|
|
POP
|
|
;parens LDA2 #0000 EQU2 ,&mismatched-parens JCN
|
|
;parens LDA2 #0001 SUB2 ;parens STA2 ( parens-- )
|
|
;unroll-stack JSR2
|
|
;push-next JSR2
|
|
;compile-region-loop JMP2
|
|
&mismatched-parens error!
|
|
|
|
@c-dot ( c^ -> r2* )
|
|
POP
|
|
;alloc-dot JSR2 DUP2 ;push-next JSR2
|
|
;compile-region-loop JMP2
|
|
|
|
@c-char ( c^ -> r2* )
|
|
;alloc-lit JSR2
|
|
DUP2 ;push-next JSR2
|
|
;compile-region-loop JMP2
|
|
|
|
( TODO: escaping rules not quite right )
|
|
@c-esc ( c^ -> r2* )
|
|
POP
|
|
;read JSR2
|
|
;c-char JMP2
|
|
|
|
@c-star ( c^ -> regex* )
|
|
POP
|
|
;pop4 JSR2 SWP2 STH2 STH2k ( x1 [x0 x1] )
|
|
;alloc-star JSR2 ( r )
|
|
STH2r STH2kr ( r x1 x0 [x0] )
|
|
;remove-from JSR2 ( r [x0] )
|
|
STH2r OVR2 OVR2 ( r x0 r x0 ) ;set-next JSR2
|
|
OVR2 #0003 ADD2 #0000 SWP2 STA2 ( fixme: manually zeroing next )
|
|
( r x0 ) SWP2 ;push4 JSR2
|
|
;compile-region-loop JMP2
|
|
|
|
|
|
( allocate node types ------ )
|
|
|
|
@alloc3 ( mode^ -> r* )
|
|
#0000 ROT ( 00 00 mode^ )
|
|
#03 ;alloc JSR2 ( 00 00 mode^ addr* ) ( LIT 'a emit print )
|
|
STH2k STA
|
|
STH2kr INC2 STA2
|
|
STH2r
|
|
JMP2r
|
|
|
|
@alloc-empty ( -> r* )
|
|
#01 ;alloc3 JMP2
|
|
|
|
@alloc-dot ( -> r* )
|
|
#02 ;alloc3 JMP2
|
|
|
|
@alloc-lit ( c^ -> r* )
|
|
#03 #0000 SWP2
|
|
#04 ;alloc JSR2
|
|
STH2k STA
|
|
STH2kr INC2 STA
|
|
STH2kr #0002 ADD2 STA2
|
|
STH2r
|
|
JMP2r
|
|
|
|
@alloc-or ( right* left* -> r* )
|
|
#05 ;alloc JSR2 STH2 ( r l [x] )
|
|
#04 STH2kr STA ( r l [x] )
|
|
STH2kr INC2 STA2 ( r [x] )
|
|
STH2kr #0003 ADD2 STA2 ( [x] )
|
|
STH2r JMP2r
|
|
|
|
@alloc-star ( expr* -> r* )
|
|
#05 ;alloc JSR2 STH2 ( expr [r] )
|
|
#05 STH2kr STA ( expr [r] )
|
|
DUP2 STH2kr INC2 STA2 ( expr [r] )
|
|
#0000 STH2kr #0003 ADD2 STA2 ( expr [r] )
|
|
STH2kr SWP2 ( r expr [r] )
|
|
;set-next JSR2 ( [r] )
|
|
STH2r JMP2r
|
|
|
|
( unroll one part of the parsing stack, returning )
|
|
( a single node consisting of an alternation of )
|
|
( all elements on the stack. )
|
|
( )
|
|
( this unrolls until it hits #ffff #ffff, which it )
|
|
( also removes from the stack. )
|
|
@unroll-stack ( -> start* end* )
|
|
;pop4 JSR2 STH2 ( r )
|
|
#00 STH ( count items in stack frame )
|
|
DUP2 #0000 NEQ2 ,&loop JCN ;alloc-empty JSR2
|
|
&loop ( r* )
|
|
;pop4 JSR2 POP2 ( r x )
|
|
DUP2 #ffff EQU2 ( r x x-is-end? ) ,&done JCN
|
|
INCr ( items++ )
|
|
;alloc-or JSR2 ( r|x ) ,&loop JMP
|
|
&done
|
|
( r ffff )
|
|
POP2
|
|
STHr ,&is-or JCN
|
|
STH2r JMP2r
|
|
&is-or
|
|
POP2r
|
|
;alloc-empty JSR2 OVR2 OVR2 SWP2 ( r empty empty r )
|
|
;set-next JSR2
|
|
JMP2r
|
|
|
|
( add r to the top of the stock. )
|
|
( )
|
|
( in particular, this will write r into tail.next )
|
|
( before replacing tail with r. )
|
|
@push-next ( r0 r1 -> )
|
|
;pop4 JSR2 ( r0 r1 x0 x1 )
|
|
DUP2 #0000 EQU2 ( r0 r1 x0 x1 x1=0? ) ,&is-zero JCN
|
|
STH2 ROT2 STH2r ( r1 x0 r0 x1 )
|
|
;set-next JSR2 SWP2 ( x0 r1 )
|
|
;push4 JSR2
|
|
JMP2r
|
|
&is-zero POP2 POP2 ;push4 JSR2 JMP2r
|
|
|
|
@set-next-addr ( target* addr* -> )
|
|
LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
|
|
LDA2 ;set-next JSR2 JMP2r
|
|
&is-zero STA2 JMP2r
|
|
|
|
( set regex.next to target )
|
|
@set-next ( target* regex* -> )
|
|
LDAk #01 NEQ ,&!1 JCN INC2 ;set-next-addr JSR2 JMP2r
|
|
&!1 LDAk #02 NEQ ,&!2 JCN INC2 ;set-next-addr JSR2 JMP2r
|
|
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
|
|
&!3 LDAk #04 NEQ ,&!4 JCN
|
|
( todo: this is probably broken )
|
|
OVR2 OVR2 INC2 ;set-next-addr JSR2
|
|
#0003 ADD2 ;set-next-addr JSR2 JMP2r
|
|
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
|
|
&!5 error!
|
|
|
|
@remove-addr ( target* addr* -> )
|
|
LDA2k #0000 EQU2 ( t a v=0? ) ,&is-zero JCN
|
|
OVR2 OVR2 LDA2 EQU2 ( t a t=v? ) ,&is-equal JCN
|
|
LDA2 ( t v ) ;remove-from JSR2 JMP2r
|
|
&is-zero POP2 POP2 JMP2r
|
|
&is-equal NIP2 #0000 SWP2 STA2 JMP2r
|
|
|
|
( remove target from regex )
|
|
@remove-from ( target* regex* -> )
|
|
LDAk #01 NEQ ,&!1 JCN INC2 ;remove-addr JSR2 JMP2r
|
|
&!1 LDAk #02 NEQ ,&!2 JCN INC2 ;remove-addr JSR2 JMP2r
|
|
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;remove-addr JSR2 JMP2r
|
|
&!3 LDAk #04 NEQ ,&!4 JCN
|
|
OVR2 OVR2 INC2 ;remove-addr JSR2
|
|
#0003 ADD2 ;remove-addr JSR2 JMP2r
|
|
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;remove-addr JSR2 JMP2r
|
|
&!5 error!
|
|
|
|
( test cases -------- )
|
|
|
|
( corresponds to regex: a(b|c)d* )
|
|
@expr1 "a(b|c)d* 00
|
|
|
|
( corresponds to regex: a(b|c)d* )
|
|
( accepts "ab" or "ac" followd by any number of d's )
|
|
@graph1
|
|
03 'a :x1
|
|
@x1 04 :x2 :x3
|
|
@x2 03 'b :x4
|
|
@x3 03 'c :x4
|
|
@x4 05 :x5 0000
|
|
@x5 03 'd :x4
|
|
|
|
@test1 "ab 00 ( yes )
|
|
@test2 "acdd 00 ( yes )
|
|
@test3 "add 00 ( no )
|
|
@test4 "abd 00 ( yes )
|
|
@test5 "acddddddddddd 00 ( yes )
|
|
@test6 "bd 00 ( no )
|
|
@test7 "z 00 ( no )
|
|
@test8 00 ( no )
|
|
|
|
( emit byte/short )
|
|
|
|
@emit-short ( byte -- )
|
|
SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r
|
|
|
|
@emit-byte ( byte -- )
|
|
DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP
|
|
&hex #30 ADD DUP #39 GTH #27 MUL ADD emit
|
|
JMP2r
|
|
|
|
( stack operations ---- )
|
|
|
|
@push4 ( str* regex* -> )
|
|
;assert-avail JSR2 ( check for space )
|
|
;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex )
|
|
;stack-pos LDA2 STA2 ( cell[0:1] <- str )
|
|
;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 )
|
|
JMP2r
|
|
|
|
@pop4 ( -> str* regex* )
|
|
;assert-exist JSR2 ( check for space )
|
|
;stack-pos LDA2 ( load stack-pos )
|
|
#0002 SUB2 LDA2k STH2 ( pop and stash regex )
|
|
#0002 SUB2 LDA2k STH2 ( pop and stash str )
|
|
;stack-pos STA2 ( save new stack-pos )
|
|
STH2r STH2r ( restore str and regex )
|
|
JMP2r
|
|
|
|
@reset-stack ( -> )
|
|
;stack-bot ;stack-pos STA2 JMP2r ( pos <- 0 )
|
|
|
|
@stack-avail ( -> bool^ )
|
|
;stack-pos LDA2 ;stack-top LTH2 JMP2r
|
|
@stack-exist ( -> bool^ )
|
|
;stack-pos LDA2 ;stack-bot GTH2 JMP2r
|
|
@assert-avail ( -> )
|
|
;stack-avail JSR2 ,&ok JCN error! &ok JMP2r
|
|
@assert-exist ( -> )
|
|
;stack-exist JSR2 ,&ok JCN error! &ok JMP2r
|
|
|
|
@emit-stack ( -> )
|
|
space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit
|
|
;stack-bot
|
|
&loop
|
|
DUP2 ;stack-pos LDA2 LTH2 ,&ok JCN
|
|
POP2 newline JMP2r
|
|
&ok
|
|
space LDA2k ;emit-short JSR2
|
|
#0002 ADD2 ,&loop JMP
|
|
|
|
@stack-pos :stack-bot ( the next position to insert at )
|
|
@stack-bot $1000 @stack-top ( holds 1024 steps (4096 bytes) )
|
|
|
|
( arena operations ---- )
|
|
|
|
@reset-arena ( -> )
|
|
;arena-bot ;arena-pos STA2 JMP2r
|
|
|
|
@alloc ( size^ -> addr* )
|
|
#00 SWP ( size* )
|
|
;arena-pos LDA2 STH2k ADD2 ( pos+size* {pos} )
|
|
( TODO: ensure we don't exceed our space )
|
|
;arena-pos STA2 ( pos <- pos+size )
|
|
STH2r JMP2r ( return old pos )
|
|
|
|
|1ffe
|
|
@arena-pos :arena-bot ( the next position to allocate )
|
|
@arena-bot $400 @arena-top ( holds up to 1024 bytes )
|
|
|
|
@emit-n ( addr* count^ -> addr2* )
|
|
DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r
|
|
&ok
|
|
STH ( addr [count] ) space LDAk ;emit-byte JSR2 INC2 ( addr+1 [count] )
|
|
STHr #01 SUB ( addr+1 count-1 )
|
|
;emit-n JMP2
|
|
|
|
@emit-arena ( -> )
|
|
;arena-bot
|
|
&loop
|
|
DUP2 ;arena-pos LDA2 LTH2 ,&ok JCN POP2 JMP2r
|
|
&ok
|
|
DUP2 ;emit-short JSR2
|
|
LIT ': emit space
|
|
LDAk #01 NEQ ,&!1 JCN #03 ;emit-n JSR2 ,&loop JMP
|
|
&!1 LDAk #02 NEQ ,&!2 JCN #03 ;emit-n JSR2 ,&loop JMP
|
|
&!2 LDAk #03 NEQ ,&!3 JCN #04 ;emit-n JSR2 ,&loop JMP
|
|
&!3 LDAk #04 NEQ ,&!4 JCN #05 ;emit-n JSR2 ,&loop JMP
|
|
&!4 LDAk #05 NEQ ,&!5 JCN #05 ;emit-n JSR2 ,&loop JMP
|
|
&!5 error!
|