regex appears to be working

This commit is contained in:
~d6 2022-01-29 23:13:10 -05:00
parent a1adf78fc1
commit 6d0d309a1c
2 changed files with 374 additions and 35 deletions

406
regex.tal
View File

@ -33,16 +33,32 @@
%emit { #18 DEO }
%space { #20 emit }
%newline { #0a emit }
%print { debug newline }
|0100
;test1 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test2 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test3 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test4 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test5 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test6 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test7 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;test8 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
;expr1 ;compile JSR2 print newline
;emit-stack JSR2 newline
;emit-arena JSR2 newline
newline
;test1 OVR2k ;match JSR2 ;emit-byte JSR2 newline
;test2 OVR2k ;match JSR2 ;emit-byte JSR2 newline
;test3 OVR2k ;match JSR2 ;emit-byte JSR2 newline
;test4 OVR2k ;match JSR2 ;emit-byte JSR2 newline
;test5 OVR2k ;match JSR2 ;emit-byte JSR2 newline
;test6 OVR2k ;match JSR2 ;emit-byte JSR2 newline
;test7 OVR2k ;match JSR2 ;emit-byte JSR2 newline
;test8 OVR2k ;match JSR2 ;emit-byte JSR2 newline
newline
;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 newline
BRK
@match ( str* regex* -> bool^ )
@ -61,13 +77,13 @@
LDAk #03 EQU ;do-literal JCN2
LDAk #04 EQU ;do-or JCN2
LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
#00 #00 DIV ( should not happen )
( LIT 'x emit ) #00 #00 DIV ( should not happen )
@goto-backtrack ( -> bool^ )
;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
#00 JMP2r ( no, return false )
&has-stack
;pop JSR2
;pop4 JSR2
;goto-next JMP2 ( yes, resume from the top )
@goto-next ( str* next* -> bool^ )
@ -100,15 +116,308 @@
( this also handles asteration, since it ends up having the same structure )
@do-or ( str* regex* -> bool^ )
INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 )
LDA2 ;push JSR2 ( save (s, right) in the stack for possible backtracking )
LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking )
LDA2 ;loop JMP2 ( continue on left branch )
@pos $2
@parens $2
( read and increments pos )
@read ( -> c^ )
;pos LDA2k ( pos s )
LDAk STHk #00 EQU ( pos s c=0 [c] )
,&is-eof JCN ( pos s [c] )
INC2 ( pos s+1 [c] )
SWP2 STA2 ,&return JMP ( [c] )
&is-eof POP2 POP2
&return STHr ( c ) JMP2r
( ( read pos )
@peek ( -> c^ )
;pos LDA2 LDA JMP2r
@skip ( -> )
;pos LDA2 INC2 ;pos STA2 JMP2r )
( TODO: [] + ? )
( compile an expression string into a regex graph )
@compile ( expr* -> regex* )
;pos STA2
#0000 ;parens STA2
;reset-stack JSR2
;compile-region JMP2
( the basic strategy here is to build a stack of non-or )
( expressions to be joined together at the end of the )
( region. each stack entry has two regex addresses: )
( - the start of the regex )
( - the current tail of the regex )
( when we concatenate a new node to a regex we update )
( the second of these but not the first. )
( )
( the bottom of the stack for a given region is denoted )
( by #ffff #ffff. above that we start with #0000 #0000 )
( to signal an empty node. )
@compile-region ( -> r2* )
( #abcd #1234 ;emit-short JSR2 ;emit-short JSR2 space newline )
#ffff #ffff ;push4 JSR2
#0000 #0000 ;push4 JSR2
@compile-region-loop
;read JSR2
( LIT '> emit space DUP ;emit-byte JSR2 space print
;emit-stack JSR2
;emit-arena JSR2 newline )
DUP #00 EQU ;c-done JCN2
DUP LIT '* EQU ;c-star JCN2
DUP LIT '| EQU ;c-or JCN2
DUP LIT '. EQU ;c-dot JCN2
DUP LIT '( EQU ;c-lpar JCN2
DUP LIT ') EQU ;c-rpar JCN2
DUP LIT '\ EQU ;c-esc JCN2
;c-char JMP2
@c-done ( c^ -> r2* )
( LIT '$ emit print )
POP
;parens LDA2 #0000 GTH2 ,&mismatched-parens JCN
;unroll-stack JSR2 POP2
JMP2r
&mismatched-parens ( LIT 'v emit ) #00 #00 DIV
@c-or ( c^ -> r2* )
( LIT '| emit newline )
POP
#0000 #0000 ;push4 JSR2
;compile-region-loop JMP2
@c-lpar ( c^ -> r2* )
( LIT '( emit newline )
POP
;parens LDA2 INC2 ;parens STA2 ( parens++ )
;compile-region JMP2
@c-rpar ( c^ -> r2* )
( LIT ') emit newline )
POP ( print )
;parens LDA2 #0000 EQU2 ,&mismatched-parens JCN ( print )
;parens LDA2 #0001 SUB2 ;parens STA2 ( parens-- ) ( print )
( LIT 'x emit newline )
;unroll-stack JSR2
( LIT 'u emit print )
;push-next JSR2 ( print )
( LIT 'z emit newline )
;compile-region-loop JMP2
&mismatched-parens ( LIT 'Z emit ) #00 #00 DIV
@c-dot ( c^ -> r2* )
( LIT '. emit newline )
POP
;alloc-dot JSR2 DUP2 ;push-next JSR2
;compile-region-loop JMP2
@c-char ( c^ -> r2* )
( LIT '@ emit print )
( LIT '@ emit DUP ;emit-byte JSR2 newline )
;alloc-lit JSR2
( LIT '# emit print )
;emit-stack JSR2
;emit-arena JSR2
DUP2 ;push-next JSR2 ( print )
( LIT '& emit print )
;compile-region-loop JMP2
( TODO: escaping rules not quite right )
@c-esc ( c^ -> r2* )
( LIT '\ emit newline )
POP ;read JSR2
;c-char JMP2
@c-star ( c^ -> regex* )
LIT '* emit print
POP print
;pop4 JSR2 SWP2 STH2 STH2k ( x1 [x0 x1] )
LIT '! emit space print
;alloc-star JSR2 ( r )
STH2r STH2kr ( r x1 x0 [x0] )
LIT '0 emit newline
;emit-arena JSR2
;remove-from JSR2 ( r [x0] )
LIT '1 emit newline
;emit-arena JSR2
STH2r OVR2 OVR2 ( r x0 r x0 ) ;set-next JSR2
OVR2 #0003 ADD2 #0000 SWP2 STA2 ( fixme )
LIT '2 emit print
;emit-arena JSR2
( r x0 ) SWP2 ;push4 JSR2
LIT '* emit print
;compile-region-loop JMP2
( allocate node types ------ )
@alloc3 ( mode^ -> r* )
#0000 ROT ( 00 00 mode^ )
#03 ;alloc JSR2 ( 00 00 mode^ addr* ) ( LIT 'a emit print )
STH2k STA
STH2kr INC2 STA2
STH2r
( LIT 'e emit print )
JMP2r
@alloc-empty ( -> r* )
#01 ;alloc3 JMP2
@alloc-dot ( -> r* )
#02 ;alloc3 JMP2
@alloc-lit ( c^ -> r* )
( print )
#03 #0000 SWP2 ( print )
#04 ;alloc JSR2 ( print )
STH2k STA ( print )
STH2kr INC2 STA ( print )
STH2kr #0002 ADD2 STA2 ( print )
STH2r ( print )
JMP2r
@alloc-or ( right* left* -> r* )
#05 ;alloc JSR2 STH2 ( r l [x] )
#04 STH2kr STA ( r l [x] )
STH2kr INC2 STA2 ( r [x] )
STH2kr #0003 ADD2 STA2 ( [x] )
STH2r JMP2r
@alloc-star ( expr* -> r* )
( LIT '& emit space print )
LIT 'a emit print
#05 ;alloc JSR2 STH2 print ( expr [r] )
#05 STH2kr STA print ( expr [r] )
DUP2 STH2kr INC2 STA2 print ( expr [r] )
#0000 STH2kr #0003 ADD2 STA2 print ( expr [r] )
LIT 'a emit ;emit-arena JSR2 ;emit-stack JSR2
STH2kr SWP2 print ( r expr [r] )
LIT 'x emit print
;set-next JSR2 ( [r] )
STH2r JMP2r
( unroll one part of the parsing stack, returning )
( a single node consisting of an alternation of )
( all elements on the stack. )
( )
( this unrolls until it hits #ffff #ffff, which it )
( also removes from the stack. )
@unroll-stack ( -> start* end* )
( LIT 'p emit newline )
( #fedc #9876 ;emit-short JSR2 ;emit-short JSR2 space print )
;pop4 JSR2 STH2 ( r ) ( print )
#00 STH
DUP2 #0000 NEQ2 ,&loop JCN ;alloc-empty JSR2 ( print )
&loop ( r* ) LIT 'L emit print
;pop4 JSR2 POP2 ( r x ) ( print )
DUP2 #ffff EQU2 ( print ) ( r x x-is-end? ) ,&done JCN
INCr
( print ) ;alloc-or JSR2 ( r|x ) ( print ) ,&loop JMP
&done
( LIT 'q emit newline )
( r ffff ) ( print )
POP2
STHr ,&is-or JCN
STH2r JMP2r
&is-or
POP2r
LIT 'b emit print
;alloc-empty JSR2 OVR2 OVR2 SWP2 ( r empty empty r )
LIT 'c emit print
;set-next JSR2 LIT 'd emit print
( STH2 ;pop4 JSR2 POP2 STH2r ;push4 JSR2 )
JMP2r
( add r to the top of the stock. )
( )
( in particular, this will write r into tail.next )
( before replacing tail with r. )
@push-next ( r0 r1 -> )
;pop4 JSR2 ( r0 r1 x0 x1 )
DUP2 #0000 EQU2 ( r0 r1 x0 x1 x1=0? ) ,&is-zero JCN
STH2 ROT2 STH2r ( r1 x0 r0 x1 )
LIT 'p emit print
;set-next JSR2 SWP2 ( x0 r1 )
LIT 'q emit print
;push4 JSR2
JMP2r
&is-zero POP2 POP2 ;push4 JSR2 JMP2r
( ( r -> )
( LIT 'N emit space print 0 )
STH2k ( r [r] ) ( print )
;pop4 JSR2 ( r x0 x1 [r] ) ( print )
DUP2 #0000 EQU2 ( r x0 x1 x1=0? [r] ) ( print ) ,&is-zero JCN
ROT2 SWP2 ( x0 r x1 [r] ) ( print )
;set-next JSR2 ( x0 [r] ) STH2r ( x0 r ) ( print )
;push4 JSR2 ( )
JMP2r
&is-zero POP2 POP2 STH2r ;push4 JSR2 ( print LIT '* emit newline ) JMP2r )
( for nodes (other than 'or') read their 'next' pointer )
@get-next ( r* -> r.next* )
LDAk #01 NEQ ,&!1 JCN INC2 LDA2 JMP2r
&!1 LDAk #02 NEQ ,&!2 JCN INC2 LDA2 JMP2r
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 LDA2 JMP2r
&!3 LDAk #04 NEQ ,&!4 JCN INC2 LDA2 JMP2r
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 LDA2 JMP2r
&!5 ( either #04 (or) or ??? ) LIT 'q emit #00 #00 DIV
@set-next-addr ( target* addr* -> )
LIT 'Z emit print
LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
LIT 'N emit print LDA2 ;set-next JSR2 JMP2r
&is-zero ( print ) LIT 'z emit print STA2 JMP2r
( set regex.next to target )
@set-next ( target* regex* -> )
LIT 'n emit space LDAk ;emit-byte JSR2 print
LDAk #01 NEQ ,&!1 JCN INC2 ( STA2 ) LIT 't emit print ;set-next-addr JSR2 JMP2r
&!1 LDAk #02 NEQ ,&!2 JCN INC2 ( STA2 ) ;set-next-addr JSR2 JMP2r
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ( STA2 ) LIT 'y emit print ;set-next-addr JSR2 JMP2r
&!3 LDAk #04 NEQ ,&!4 JCN
LIT 'w emit print
( INC2k LDA2 space LIT '{ emit DUP2 ;emit-short JSR2 ;set-next JSR2
#0003 ADD2 LDA2 space LIT '} emit DUP2 ;emit-short JSR2 ;set-next JSR2 JMP2r )
OVR2 OVR2 INC2 ( LIT '{ emit space ) ;set-next-addr JSR2
#0003 ADD2 ( LIT '} emit space ) ;set-next-addr JSR2 JMP2r
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ( STA2 ) ;set-next-addr JSR2 JMP2r
&!5 LIT '? emit LDAk ;emit-byte JSR2 #00 #00 DIV
@remove-addr ( target* addr* -> )
LIT 'A emit print
LDA2k #0000 EQU2 ( t a v=0? ) ,&is-zero JCN
OVR2 OVR2 LDA2 EQU2 ( t a t=v? ) ,&is-equal JCN
LDA2 ( t v ) ;remove-from JSR2 JMP2r
&is-zero LIT 'r emit print POP2 POP2 JMP2r
&is-equal LIT 's emit print NIP2 #0000 SWP2 STA2 JMP2r
( remove target from regex )
@remove-from ( target* regex* -> )
( LIT 'R emit print )
LDAk #01 NEQ ,&!1 JCN INC2 ;remove-addr JSR2 JMP2r
&!1 LDAk #02 NEQ ,&!2 JCN INC2 ;remove-addr JSR2 JMP2r
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;remove-addr JSR2 JMP2r
&!3 LDAk #04 NEQ ,&!4 JCN
( LIT 'Q emit print )
OVR2 OVR2 INC2 ;remove-addr JSR2
( LIT 'q emit print )
#0003 ADD2 ;remove-addr JSR2 JMP2r
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;remove-addr JSR2 JMP2r
&!5 LIT '? emit LDAk ;emit-byte JSR2 #00 #00 DIV
( test cases -------- )
( corresponds to regex: a(b|c)d* )
@expr1 "a(b|c)d* 00
( corresponds to regex: a(b|c)d* )
( accepts "ab" or "ac" followd by any number of d's )
@expr1
@graph1
03 'a :x1
@x1 04 :x2 :x3
@x2 03 'b :x4
@ -125,6 +434,8 @@
@test7 "z 00 ( no )
@test8 00 ( no )
( emit byte/short )
@emit-short ( byte -- )
SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r
@ -133,31 +444,30 @@
&hex #30 ADD DUP #39 GTH #27 MUL ADD emit
JMP2r
@emit3 ( addr* -> addr* )
DUP2
LDAk ;emit-byte JSR2 space INC2
LDA2 ;emit-short JSR2 newline JMP2r
( stack operations ---- )
@emit4 ( addr* -> addr* )
DUP2
LDAk ;emit-byte JSR2 space INC2
LDAk ;emit-byte JSR2 space INC2
LDA2 ;emit-short JSR2 newline JMP2r
( @peek2 ( -> regex* )
;assert-exist JSR2 ( check for space )
;stack-pos LDA2 ( load stack-pos )
#0002 SUB2 LDA2 ( get regex )
JMP2r )
@emit5 ( addr* -> addr* )
DUP2
LDAk ;emit-byte JSR2 space INC2
LDA2k ;emit-short JSR2 space #0002 ADD2
LDA2 ;emit-short JSR2 newline JMP2r
@push ( str* regex* -> )
@push4 ( str* regex* -> )
;assert-avail JSR2 ( check for space )
;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex )
;stack-pos LDA2 STA2 ( cell[0:1] <- str )
;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 )
JMP2r
@pop ( -> str* regex* )
( @peek4 ( -> str* regex* )
;assert-exist JSR2 ( check for space )
;stack-pos LDA2 ( load stack-pos )
#0002 SUB2 LDA2k STH2 ( pop and stash regex )
#0002 SUB2 LDA2 STH2r ( pop the str, restore the regex )
JMP2r )
@pop4 ( -> str* regex* )
;assert-exist JSR2 ( check for space )
;stack-pos LDA2 ( load stack-pos )
#0002 SUB2 LDA2k STH2 ( pop and stash regex )
#0002 SUB2 LDA2k STH2 ( pop and stash str )
@ -167,27 +477,31 @@
@reset-stack ( -> )
;stack-bot ;stack-pos STA2 JMP2r ( pos <- 0 )
@stack-avail ( -> bool^ )
;stack-pos LDA2 ;stack-top LTH2 JMP2r
@stack-exist ( -> bool^ )
;stack-pos LDA2 ;stack-bot GTH2 JMP2r
@assert-avail ( -> )
;stack-avail JSR2 ,&ok JCN #00 #00 DIV &ok JMP2r
@assert-exist ( -> )
;stack-exist JSR2 ,&ok JCN #00 #00 DIV &ok JMP2r
@emit-stack ( -> )
space LIT 'n emit space ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 newline
space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit
;stack-bot
&loop
DUP2 ;stack-pos LDA2 LTH2 ,&ok JCN
POP2 JMP2r
POP2 newline JMP2r
&ok
space space LDA2k ;emit-short JSR2
#0002 ADD2 DUP2 LDA2 space ;emit-short JSR2 newline
space LDA2k ;emit-short JSR2
#0002 ADD2 ,&loop JMP
@stack-pos :stack-bot ( the next position to insert at )
@stack-bot $1000 @stack-top ( holds 1024 steps (4096 bytes) )
( arena operations ---- )
@reset-arena ( -> )
;arena-bot ;arena-pos STA2 JMP2r
@ -196,7 +510,31 @@
;arena-pos LDA2 STH2k ADD2 ( pos+size* {pos} )
( TODO: ensure we don't exceed our space )
;arena-pos STA2 ( pos <- pos+size )
STH2r JMP2 ( return old pos )
STH2r JMP2r ( return old pos )
|1ffe
@arena-pos :arena-bot ( the next position to allocate )
@arena-bot $400 @arena-top ( holds up to 1024 bytes )
@emit-n ( addr* count^ -> addr2* )
DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r
&ok
STH ( addr [count] ) space LDAk ;emit-byte JSR2 INC2 ( addr+1 [count] )
STHr #01 SUB ( addr+1 count-1 )
;emit-n JMP2
@emit-arena ( -> )
;arena-bot
&loop
( print )
DUP2 ;arena-pos LDA2 LTH2 ,&ok JCN POP2 JMP2r
&ok
DUP2 ;emit-short JSR2
LIT ': emit space
LDAk #01 NEQ ,&!1 JCN #03 ;emit-n JSR2 ,&loop JMP
&!1 LDAk #02 NEQ ,&!2 JCN #03 ;emit-n JSR2 ,&loop JMP
&!2 LDAk #03 NEQ ,&!3 JCN #04 ;emit-n JSR2 ,&loop JMP
&!3 LDAk #04 NEQ ,&!4 JCN #05 ;emit-n JSR2 ,&loop JMP
&!4 LDAk #05 NEQ ,&!5 JCN #05 ;emit-n JSR2 ,&loop JMP
&!5 LDAk ;emit-byte JSR2 LIT '! emit newline #00 #00 DIV

3
uxnrun
View File

@ -1,13 +1,14 @@
#!/bin/sh
BIN="$HOME/w/uxn/bin"
DEST="run.rom"
if [ $# -lt 1 ]; then
echo "usage: $0 [-c] FILE.tal"
exit 1
fi
DEST=$( echo "$1" | sed -re 's#\.tal$#.rom#' )
case "$1" in
-c) $BIN/uxnasm $2 $DEST && $BIN/uxncli $DEST;;
-s) $BIN/uxnasm $3 $DEST && $BIN/uxnemu -s $2 $DEST;;