nxu/regex.tal

203 lines
7.0 KiB
Tal
Raw Normal View History

2022-01-22 22:54:51 -05:00
( regex.tal )
( )
( compiles regex expression strings into regex nodes, then uses )
( regex nodes to match input strings. )
( )
( this currently only supports matching an entire string, as )
( opposed to searching for a matching substring, or extracting )
( matching subgroups. )
( )
( regex node types: )
( )
( NAME DESCRIPTION STRUCT )
( empty matches empty string [ #01 next* ] )
( dot matches any one char [ #02 next* ] )
( lit matches one specific char (c) [ #03 c^ next* ] )
( or matches either left or right [ #04 left* right* ] )
( star matches expr zero-or-more times [ #05 r* next* ] )
( (NOTE: r.expr.next must be r) )
( )
( `or` and `star` have the same structure and are handled by the )
( same code (;do-or). however, the node types are kept different )
( to make it clearer how to parse and assemble the nodes. )
( )
( concatenation isn't a node, it is implied by the *next addr. )
( a next value of #0000 signals the end of the regex. )
( )
( in these docs str* is an address to a null-terminated string. )
( regexes should not include nulls and cannot match them (other )
( than the null which signals the end of a string). )
( null? -- compare the top byte with #00, leaving a boolean byte )
%null? { #00 EQU }
( debug -- write #ff to device port #0e, the System debug port on Varvara )
%debug { #ff #0e DEO }
( emit -- write the top byte to device port #18, the Console write port on Varvara )
%emit { #18 DEO }
( space -- emit an ascii space, #20 )
%space { #20 emit }
( newline -- emit an ascii line feed, #0a )
%newline { #0a emit }
( program entry point at the reset vector. )
( each line matches one test string against expr1, prints the boolean )
( result as two hex digits and ends the line. the expected answers are )
( noted next to the test strings themselves. )
|0100
;test1 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test2 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test3 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test4 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test5 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test6 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test7 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test8 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
BRK
( public entry point: returns #01 if the whole string at str* is matched )
( by the node graph at regex*, otherwise #00. )
@match ( str* regex* -> bool^ )
;reset-stack JSR2 ( start with an empty backtracking stack )
;loop JMP2 ( jump, not call: the handlers return to our caller )
( we don't use the return stack here since that )
( complicates the back-tracking we need to do. )
( ultimately this code will issue a JMP2r to )
( return a boolean, which is where the stack )
( effects signature comes from. )
( main dispatch: reads the node type byte at r* and jumps to its handler. )
( LDAk keeps r* on the stack, so each handler receives s* r* unchanged. )
@loop ( s* r* -> bool^ )
LDAk #01 EQU ;do-empty JCN2
LDAk #02 EQU ;do-dot JCN2
LDAk #03 EQU ;do-literal JCN2
LDAk #04 EQU ;do-or JCN2
LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
( unknown node type: deliberately divides by zero. presumably this is )
( meant to halt the vm -- whether it does depends on the emulator. )
#00 #00 DIV ( should not happen )
( called when the current path fails. resumes from the most recently )
( pushed alternative, or returns #00 when no alternative is left. )
@goto-backtrack ( -> bool^ )
;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
#00 JMP2r ( no, return false )
&has-stack
;pop JSR2 ( restore the saved str* and regex* )
;goto-next JMP2 ( yes, resume from the top )
( continue with the node at next*. a next of #0000 marks the end of the )
( regex: the match succeeds only if the string is also exhausted, )
( otherwise we backtrack. )
@goto-next ( str* next* -> bool^ )
    ORAk ,&more JCN                ( either byte of next non-zero: keep going )
    POP2 LDA #00 EQU ,&accept JCN  ( regex done: is the string done too? )
    ;goto-backtrack JMP2           ( leftover input, try another path )
    &accept #01 JMP2r              ( both exhausted, return true )
    &more ;loop JMP2               ( dispatch on the next node )
( empty node, laid out as [ #01 next* ]: consumes no input. )
@do-empty ( str* regex* -> bool^ )
INC2 LDA2 ( load next )
;goto-next JMP2 ( jump to next )
( dot node, laid out as [ #02 next* ]: consumes any one character and )
( fails only when the string is already at its terminating null. )
( next* is parked on the return stack while the string is inspected. )
@do-dot ( str* regex* -> bool^ )
INC2 LDA2 STH2 ( load and stash next )
LDAk #00 NEQ ,&non-empty JCN ( is there a char? )
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
&non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
( lit node, laid out as [ #03 c^ next* ]: consumes one character if it )
( equals c, otherwise backtracks. c and next* are parked on the return )
( stack while the current character of the string is loaded. )
@do-literal ( str* regex* -> bool^ )
INC2
LDAk STH ( store c )
INC2 LDA2 STH2 ROTr ( store next, move c to top )
LDAk ( load the current char of s, keeping s )
STHr EQU ,&matches JCN ( do we match this char? )
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
&matches
INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
( or node, laid out as [ #04 left* right* ]: tries left first and saves )
( the pair s, right on the backtracking stack as the alternative. )
( this also handles asteration, since it ends up having the same structure )
@do-or ( str* regex* -> bool^ )
INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 )
LDA2 ;push JSR2 ( save s and right in the stack for possible backtracking )
LDA2 ;loop JMP2 ( continue on left branch )
( compile an expression string into a regex graph )
( NOTE: not implemented yet. this label has no body, so it shares its )
( address with the expr1 data that follows. calling it would execute )
( that data as code. )
@compile ( expr* -> regex* )
( hand-assembled node graph used by the tests. )
( corresponds to regex: a[b|c]d* )
( accepts "ab" or "ac" followed by any number of d's )
@expr1
03 'a :x1 ( lit a, then x1 )
@x1 04 :x2 :x3 ( or: left is x2, right is x3 )
@x2 03 'b :x4 ( lit b, then x4 )
@x3 03 'c :x4 ( lit c, then x4 )
@x4 05 :x5 0000 ( star over x5, next is #0000 which ends the regex )
@x5 03 'd :x4 ( lit d, looping back to the star node )
( null-terminated test inputs for expr1. the trailing note on each line )
( is the expected match result. )
@test1 "ab 00 ( yes )
@test2 "acdd 00 ( yes )
@test3 "add 00 ( no )
@test4 "abd 00 ( yes )
@test5 "acddddddddddd 00 ( yes )
@test6 "bd 00 ( no )
@test7 "z 00 ( no )
@test8 00 ( no -- this is the empty string )
( print a 16-bit value as four lowercase hex digits, high byte first. )
@emit-short ( short* -- )
    SWP ;emit-byte JSR2   ( bring the high byte to the top and print it )
    ;emit-byte JMP2       ( tail call: emit-byte returns to our caller )
( print one byte as two lowercase hex digits. )
@emit-byte ( byte -- )
    DUP #04 SFT ,&hex JSR    ( print the high nibble via a local call )
    #0f AND                  ( keep the low nibble and fall through )
    &hex
        ( nibble to ascii: add the digit zero, then #27 more if past the digit nine )
        #30 ADD DUP #39 GTH #27 MUL ADD emit
    JMP2r
( debug printer for a 3-byte record: prints the byte at addr, a space, )
( the short at addr+1 and a newline. addr is left on the stack. )
( presumably meant for empty and dot nodes -- not called in the code shown here. )
@emit3 ( addr* -> addr* )
DUP2
LDAk ;emit-byte JSR2 space INC2
LDA2 ;emit-short JSR2 newline JMP2r
( debug printer for a 4-byte record: prints the bytes at addr and addr+1, )
( then the short at addr+2, separated by spaces and ended by a newline. )
( addr is left on the stack. )
( presumably meant for lit nodes -- not called in the code shown here. )
@emit4 ( addr* -> addr* )
DUP2
LDAk ;emit-byte JSR2 space INC2
LDAk ;emit-byte JSR2 space INC2
LDA2 ;emit-short JSR2 newline JMP2r
( debug printer for a 5-byte record: prints the byte at addr, then the )
( shorts at addr+1 and addr+3, separated by spaces and ended by a newline. )
( addr is left on the stack. )
( presumably meant for or and star nodes -- not called in the code shown here. )
@emit5 ( addr* -> addr* )
DUP2
LDAk ;emit-byte JSR2 space INC2
LDA2k ;emit-short JSR2 space #0002 ADD2
LDA2 ;emit-short JSR2 newline JMP2r
( save a str*, regex* pair as a 4-byte cell on the backtracking stack. )
( regex* is on top of the working stack, so it is stored first. )
@push ( str* regex* -> )
;assert-avail JSR2 ( check for space )
;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex )
;stack-pos LDA2 STA2 ( cell[0:1] <- str )
;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 )
JMP2r
( remove the most recent 4-byte cell from the backtracking stack and )
( return its str*, regex* pair. there is no underflow check here: the )
( visible caller, goto-backtrack, tests stack-exist first. )
@pop ( -> str* regex* )
;stack-pos LDA2 ( load stack-pos )
#0002 SUB2 LDA2k STH2 ( pop and stash regex )
#0002 SUB2 LDA2k STH2 ( pop and stash str )
;stack-pos STA2 ( save new stack-pos )
STH2r STH2r ( restore str and regex )
JMP2r
( empty the backtracking stack. )
@reset-stack ( -> )
;stack-bot ;stack-pos STA2 JMP2r ( pos <- stack-bot, i.e. no cells in use )
( true while stack-pos is below stack-top, i.e. another cell can be pushed. )
@stack-avail ( -> bool^ )
;stack-pos LDA2 ;stack-top LTH2 JMP2r
( true while stack-pos is above stack-bot, i.e. at least one cell is stored. )
@stack-exist ( -> bool^ )
;stack-pos LDA2 ;stack-bot GTH2 JMP2r
( guard used by push: when the backtracking stack is full it deliberately )
( divides by zero. presumably this is meant to halt the vm -- whether it )
( does depends on the emulator. if execution continues, the quotient byte )
( is left on the working stack before returning. )
@assert-avail ( -> )
;stack-avail JSR2 ,&ok JCN #00 #00 DIV &ok JMP2r
( debug dump of the backtracking stack. prints " n " followed by the )
( number of stored cells as a hex short, then one line per cell from )
( stack-bot up to stack-pos: two spaces, the str address, a space and )
( the regex address. )
@emit-stack ( -> )
space LIT 'n emit space ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 newline
;stack-bot ( cursor starts at the oldest cell )
&loop
DUP2 ;stack-pos LDA2 LTH2 ,&ok JCN ( cursor still below stack-pos? )
POP2 JMP2r ( no, drop the cursor and return )
&ok
space space LDA2k ;emit-short JSR2 ( print cell[0:1], the str address )
#0002 ADD2 DUP2 LDA2 space ;emit-short JSR2 newline ( print cell[2:3], the regex address )
#0002 ADD2 ,&loop JMP ( advance to the next cell )
( backtracking stack storage. stack-pos is a 2-byte pointer that is )
( assembled pointing at stack-bot; cells live between stack-bot and stack-top. )
@stack-pos :stack-bot ( the next position to insert at )
@stack-bot $1000 @stack-top ( holds 1024 steps, 4 bytes each, 4096 bytes in total )
( discard all arena allocations by pointing arena-pos back at arena-bot. )
@reset-arena ( -> )
;arena-bot ;arena-pos STA2 JMP2r
( bump allocator: reserves size bytes at the current arena position and )
( returns the address where the reserved region starts. )
( fix: the routine used to end with JMP2, which consumed the address it )
( had just restored and jumped into the arena. it now returns to the )
( caller with JMP2r, leaving the address on the working stack. )
@alloc ( size^ -> addr* )
    #00 SWP                       ( widen size to a short: size* )
    ;arena-pos LDA2 STH2k ADD2    ( pos+size* {pos} )
    ( TODO: ensure we don't exceed our space )
    ;arena-pos STA2               ( pos <- pos+size )
    STH2r JMP2r                   ( return old pos )
( arena storage. arena-pos is a 2-byte pointer that is assembled pointing )
( at arena-bot; allocations grow upward from arena-bot toward arena-top. )
@arena-pos :arena-bot ( the next position to allocate )
@arena-bot $400 @arena-top ( holds up to 1024 bytes )