From 6d0d309a1c3e2b523b1edace5170065bf25fe58c Mon Sep 17 00:00:00 2001 From: d6 Date: Sat, 29 Jan 2022 23:13:10 -0500 Subject: [PATCH] regex appears to be working --- regex.tal | 406 +++++++++++++++++++++++++++++++++++++++++++++++++----- uxnrun | 3 +- 2 files changed, 374 insertions(+), 35 deletions(-) diff --git a/regex.tal b/regex.tal index 87a50a2..f8c1c90 100644 --- a/regex.tal +++ b/regex.tal @@ -33,16 +33,32 @@ %emit { #18 DEO } %space { #20 emit } %newline { #0a emit } +%print { debug newline } |0100 - ;test1 ;expr1 ;match JSR2 ;emit-byte JSR2 newline - ;test2 ;expr1 ;match JSR2 ;emit-byte JSR2 newline - ;test3 ;expr1 ;match JSR2 ;emit-byte JSR2 newline - ;test4 ;expr1 ;match JSR2 ;emit-byte JSR2 newline - ;test5 ;expr1 ;match JSR2 ;emit-byte JSR2 newline - ;test6 ;expr1 ;match JSR2 ;emit-byte JSR2 newline - ;test7 ;expr1 ;match JSR2 ;emit-byte JSR2 newline - ;test8 ;expr1 ;match JSR2 ;emit-byte JSR2 newline + ;expr1 ;compile JSR2 print newline + ;emit-stack JSR2 newline + ;emit-arena JSR2 newline + newline + + ;test1 OVR2k ;match JSR2 ;emit-byte JSR2 newline + ;test2 OVR2k ;match JSR2 ;emit-byte JSR2 newline + ;test3 OVR2k ;match JSR2 ;emit-byte JSR2 newline + ;test4 OVR2k ;match JSR2 ;emit-byte JSR2 newline + ;test5 OVR2k ;match JSR2 ;emit-byte JSR2 newline + ;test6 OVR2k ;match JSR2 ;emit-byte JSR2 newline + ;test7 OVR2k ;match JSR2 ;emit-byte JSR2 newline + ;test8 OVR2k ;match JSR2 ;emit-byte JSR2 newline + newline + + ;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 newline + ;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 newline + ;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 newline + ;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 newline + ;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 newline + ;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 newline + ;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 newline + ;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 newline BRK @match ( str* regex* -> bool^ ) @@ -61,13 +77,13 @@ LDAk #03 EQU ;do-literal JCN2 LDAk #04 EQU ;do-or JCN2 LDAk #05 EQU ;do-or JCN2 ( same code as the or case ) - #00 #00 DIV ( should not happen ) + ( LIT 'x emit ) #00 #00 DIV ( should not happen ) @goto-backtrack ( -> bool^ ) ;stack-exist JSR2 ,&has-stack JCN ( do we have stack? ) #00 JMP2r ( no, return false ) &has-stack - ;pop JSR2 + ;pop4 JSR2 ;goto-next JMP2 ( yes, resume from the top ) @goto-next ( str* next* -> bool^ ) @@ -100,15 +116,308 @@ ( this also handles asteration, since it ends up having the same structure ) @do-or ( str* regex* -> bool^ ) INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 ) - LDA2 ;push JSR2 ( save (s, right) in the stack for possible backtracking ) + LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking ) LDA2 ;loop JMP2 ( continue on left branch ) +@pos $2 +@parens $2 + +( read and increments pos ) +@read ( -> c^ ) + ;pos LDA2k ( pos s ) + LDAk STHk #00 EQU ( pos s c=0 [c] ) + ,&is-eof JCN ( pos s [c] ) + INC2 ( pos s+1 [c] ) + SWP2 STA2 ,&return JMP ( [c] ) + &is-eof POP2 POP2 + &return STHr ( c ) JMP2r + +( ( read pos ) +@peek ( -> c^ ) + ;pos LDA2 LDA JMP2r + +@skip ( -> ) + ;pos LDA2 INC2 ;pos STA2 JMP2r ) + +( TODO: [] + ? ) ( compile an expression string into a regex graph ) @compile ( expr* -> regex* ) + ;pos STA2 + #0000 ;parens STA2 + ;reset-stack JSR2 + ;compile-region JMP2 + +( the basic strategy here is to build a stack of non-or ) +( expressions to be joined together at the end of the ) +( region. each stack entry has two regex addresses: ) +( - the start of the regex ) +( - the current tail of the regex ) +( when we concatenate a new node to a regex we update ) +( the second of these but not the first. ) +( ) +( the bottom of the stack for a given region is denoted ) +( by #ffff #ffff. above that we start with #0000 #0000 ) +( to signal an empty node. ) +@compile-region ( -> r2* ) +( #abcd #1234 ;emit-short JSR2 ;emit-short JSR2 space newline ) + #ffff #ffff ;push4 JSR2 + #0000 #0000 ;push4 JSR2 +@compile-region-loop + ;read JSR2 +( LIT '> emit space DUP ;emit-byte JSR2 space print + ;emit-stack JSR2 + ;emit-arena JSR2 newline ) + DUP #00 EQU ;c-done JCN2 + DUP LIT '* EQU ;c-star JCN2 + DUP LIT '| EQU ;c-or JCN2 + DUP LIT '. EQU ;c-dot JCN2 + DUP LIT '( EQU ;c-lpar JCN2 + DUP LIT ') EQU ;c-rpar JCN2 + DUP LIT '\ EQU ;c-esc JCN2 + ;c-char JMP2 + +@c-done ( c^ -> r2* ) +( LIT '$ emit print ) + POP + ;parens LDA2 #0000 GTH2 ,&mismatched-parens JCN + ;unroll-stack JSR2 POP2 + JMP2r + &mismatched-parens ( LIT 'v emit ) #00 #00 DIV + +@c-or ( c^ -> r2* ) +( LIT '| emit newline ) + POP + #0000 #0000 ;push4 JSR2 + ;compile-region-loop JMP2 + +@c-lpar ( c^ -> r2* ) +( LIT '( emit newline ) + POP + ;parens LDA2 INC2 ;parens STA2 ( parens++ ) + ;compile-region JMP2 + +@c-rpar ( c^ -> r2* ) +( LIT ') emit newline ) + POP ( print ) + ;parens LDA2 #0000 EQU2 ,&mismatched-parens JCN ( print ) + ;parens LDA2 #0001 SUB2 ;parens STA2 ( parens-- ) ( print ) +( LIT 'x emit newline ) + ;unroll-stack JSR2 +( LIT 'u emit print ) + ;push-next JSR2 ( print ) +( LIT 'z emit newline ) + ;compile-region-loop JMP2 + &mismatched-parens ( LIT 'Z emit ) #00 #00 DIV + +@c-dot ( c^ -> r2* ) +( LIT '. emit newline ) + POP + ;alloc-dot JSR2 DUP2 ;push-next JSR2 + ;compile-region-loop JMP2 + +@c-char ( c^ -> r2* ) +( LIT '@ emit print ) +( LIT '@ emit DUP ;emit-byte JSR2 newline ) + ;alloc-lit JSR2 +( LIT '# emit print ) + ;emit-stack JSR2 + ;emit-arena JSR2 + DUP2 ;push-next JSR2 ( print ) +( LIT '& emit print ) + ;compile-region-loop JMP2 + +( TODO: escaping rules not quite right ) +@c-esc ( c^ -> r2* ) +( LIT '\ emit newline ) + POP ;read JSR2 + ;c-char JMP2 + +@c-star ( c^ -> regex* ) + LIT '* emit print + POP print + ;pop4 JSR2 SWP2 STH2 STH2k ( x1 [x0 x1] ) + LIT '! emit space print + ;alloc-star JSR2 ( r ) + STH2r STH2kr ( r x1 x0 [x0] ) + LIT '0 emit newline + ;emit-arena JSR2 + ;remove-from JSR2 ( r [x0] ) + LIT '1 emit newline + ;emit-arena JSR2 + STH2r OVR2 OVR2 ( r x0 r x0 ) ;set-next JSR2 + OVR2 #0003 ADD2 #0000 SWP2 STA2 ( fixme ) + LIT '2 emit print + ;emit-arena JSR2 + ( r x0 ) SWP2 ;push4 JSR2 + LIT '* emit print + ;compile-region-loop JMP2 + + +( allocate node types ------ ) + +@alloc3 ( mode^ -> r* ) + #0000 ROT ( 00 00 mode^ ) + #03 ;alloc JSR2 ( 00 00 mode^ addr* ) ( LIT 'a emit print ) + STH2k STA + STH2kr INC2 STA2 + STH2r +( LIT 'e emit print ) + JMP2r + +@alloc-empty ( -> r* ) + #01 ;alloc3 JMP2 + +@alloc-dot ( -> r* ) + #02 ;alloc3 JMP2 + +@alloc-lit ( c^ -> r* ) + ( print ) + #03 #0000 SWP2 ( print ) + #04 ;alloc JSR2 ( print ) + STH2k STA ( print ) + STH2kr INC2 STA ( print ) + STH2kr #0002 ADD2 STA2 ( print ) + STH2r ( print ) + JMP2r + +@alloc-or ( right* left* -> r* ) + #05 ;alloc JSR2 STH2 ( r l [x] ) + #04 STH2kr STA ( r l [x] ) + STH2kr INC2 STA2 ( r [x] ) + STH2kr #0003 ADD2 STA2 ( [x] ) + STH2r JMP2r + +@alloc-star ( expr* -> r* ) +( LIT '& emit space print ) + LIT 'a emit print + #05 ;alloc JSR2 STH2 print ( expr [r] ) + #05 STH2kr STA print ( expr [r] ) + DUP2 STH2kr INC2 STA2 print ( expr [r] ) + #0000 STH2kr #0003 ADD2 STA2 print ( expr [r] ) + LIT 'a emit ;emit-arena JSR2 ;emit-stack JSR2 + + STH2kr SWP2 print ( r expr [r] ) + LIT 'x emit print + ;set-next JSR2 ( [r] ) + STH2r JMP2r + +( unroll one part of the parsing stack, returning ) +( a single node consisting of an alternation of ) +( all elements on the stack. ) +( ) +( this unrolls until it hits #ffff #ffff, which it ) +( also removes from the stack. ) +@unroll-stack ( -> start* end* ) +( LIT 'p emit newline ) +( #fedc #9876 ;emit-short JSR2 ;emit-short JSR2 space print ) + ;pop4 JSR2 STH2 ( r ) ( print ) + #00 STH + DUP2 #0000 NEQ2 ,&loop JCN ;alloc-empty JSR2 ( print ) + &loop ( r* ) LIT 'L emit print + ;pop4 JSR2 POP2 ( r x ) ( print ) + DUP2 #ffff EQU2 ( print ) ( r x x-is-end? ) ,&done JCN + INCr + ( print ) ;alloc-or JSR2 ( r|x ) ( print ) ,&loop JMP + &done +( LIT 'q emit newline ) + ( r ffff ) ( print ) + POP2 + STHr ,&is-or JCN + STH2r JMP2r + &is-or + POP2r + LIT 'b emit print + ;alloc-empty JSR2 OVR2 OVR2 SWP2 ( r empty empty r ) + LIT 'c emit print + ;set-next JSR2 LIT 'd emit print +( STH2 ;pop4 JSR2 POP2 STH2r ;push4 JSR2 ) + JMP2r + +( add r to the top of the stock. ) +( ) +( in particular, this will write r into tail.next ) +( before replacing tail with r. ) +@push-next ( r0 r1 -> ) + ;pop4 JSR2 ( r0 r1 x0 x1 ) + DUP2 #0000 EQU2 ( r0 r1 x0 x1 x1=0? ) ,&is-zero JCN + STH2 ROT2 STH2r ( r1 x0 r0 x1 ) + LIT 'p emit print + ;set-next JSR2 SWP2 ( x0 r1 ) + LIT 'q emit print + ;push4 JSR2 + JMP2r + &is-zero POP2 POP2 ;push4 JSR2 JMP2r +( ( r -> ) +( LIT 'N emit space print 0 ) + STH2k ( r [r] ) ( print ) + ;pop4 JSR2 ( r x0 x1 [r] ) ( print ) + DUP2 #0000 EQU2 ( r x0 x1 x1=0? [r] ) ( print ) ,&is-zero JCN + ROT2 SWP2 ( x0 r x1 [r] ) ( print ) + ;set-next JSR2 ( x0 [r] ) STH2r ( x0 r ) ( print ) + ;push4 JSR2 ( ) + JMP2r + &is-zero POP2 POP2 STH2r ;push4 JSR2 ( print LIT '* emit newline ) JMP2r ) + +( for nodes (other than 'or') read their 'next' pointer ) +@get-next ( r* -> r.next* ) + LDAk #01 NEQ ,&!1 JCN INC2 LDA2 JMP2r + &!1 LDAk #02 NEQ ,&!2 JCN INC2 LDA2 JMP2r + &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 LDA2 JMP2r + &!3 LDAk #04 NEQ ,&!4 JCN INC2 LDA2 JMP2r + &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 LDA2 JMP2r + &!5 ( either #04 (or) or ??? ) LIT 'q emit #00 #00 DIV + +@set-next-addr ( target* addr* -> ) + LIT 'Z emit print + LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN + LIT 'N emit print LDA2 ;set-next JSR2 JMP2r + &is-zero ( print ) LIT 'z emit print STA2 JMP2r + +( set regex.next to target ) +@set-next ( target* regex* -> ) + LIT 'n emit space LDAk ;emit-byte JSR2 print + LDAk #01 NEQ ,&!1 JCN INC2 ( STA2 ) LIT 't emit print ;set-next-addr JSR2 JMP2r + &!1 LDAk #02 NEQ ,&!2 JCN INC2 ( STA2 ) ;set-next-addr JSR2 JMP2r + &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ( STA2 ) LIT 'y emit print ;set-next-addr JSR2 JMP2r + &!3 LDAk #04 NEQ ,&!4 JCN + LIT 'w emit print +( INC2k LDA2 space LIT '{ emit DUP2 ;emit-short JSR2 ;set-next JSR2 + #0003 ADD2 LDA2 space LIT '} emit DUP2 ;emit-short JSR2 ;set-next JSR2 JMP2r ) + OVR2 OVR2 INC2 ( LIT '{ emit space ) ;set-next-addr JSR2 + #0003 ADD2 ( LIT '} emit space ) ;set-next-addr JSR2 JMP2r + &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ( STA2 ) ;set-next-addr JSR2 JMP2r + &!5 LIT '? emit LDAk ;emit-byte JSR2 #00 #00 DIV + +@remove-addr ( target* addr* -> ) + LIT 'A emit print + LDA2k #0000 EQU2 ( t a v=0? ) ,&is-zero JCN + OVR2 OVR2 LDA2 EQU2 ( t a t=v? ) ,&is-equal JCN + LDA2 ( t v ) ;remove-from JSR2 JMP2r + &is-zero LIT 'r emit print POP2 POP2 JMP2r + &is-equal LIT 's emit print NIP2 #0000 SWP2 STA2 JMP2r + +( remove target from regex ) +@remove-from ( target* regex* -> ) +( LIT 'R emit print ) + LDAk #01 NEQ ,&!1 JCN INC2 ;remove-addr JSR2 JMP2r + &!1 LDAk #02 NEQ ,&!2 JCN INC2 ;remove-addr JSR2 JMP2r + &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;remove-addr JSR2 JMP2r + &!3 LDAk #04 NEQ ,&!4 JCN +( LIT 'Q emit print ) + OVR2 OVR2 INC2 ;remove-addr JSR2 +( LIT 'q emit print ) + #0003 ADD2 ;remove-addr JSR2 JMP2r + &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;remove-addr JSR2 JMP2r + &!5 LIT '? emit LDAk ;emit-byte JSR2 #00 #00 DIV + +( test cases -------- ) + +( corresponds to regex: a(b|c)d* ) +@expr1 "a(b|c)d* 00 ( corresponds to regex: a(b|c)d* ) ( accepts "ab" or "ac" followd by any number of d's ) -@expr1 +@graph1 03 'a :x1 @x1 04 :x2 :x3 @x2 03 'b :x4 @@ -125,6 +434,8 @@ @test7 "z 00 ( no ) @test8 00 ( no ) +( emit byte/short ) + @emit-short ( byte -- ) SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r @@ -133,31 +444,30 @@ &hex #30 ADD DUP #39 GTH #27 MUL ADD emit JMP2r -@emit3 ( addr* -> addr* ) - DUP2 - LDAk ;emit-byte JSR2 space INC2 - LDA2 ;emit-short JSR2 newline JMP2r +( stack operations ---- ) -@emit4 ( addr* -> addr* ) - DUP2 - LDAk ;emit-byte JSR2 space INC2 - LDAk ;emit-byte JSR2 space INC2 - LDA2 ;emit-short JSR2 newline JMP2r +( @peek2 ( -> regex* ) + ;assert-exist JSR2 ( check for space ) + ;stack-pos LDA2 ( load stack-pos ) + #0002 SUB2 LDA2 ( get regex ) + JMP2r ) -@emit5 ( addr* -> addr* ) - DUP2 - LDAk ;emit-byte JSR2 space INC2 - LDA2k ;emit-short JSR2 space #0002 ADD2 - LDA2 ;emit-short JSR2 newline JMP2r - -@push ( str* regex* -> ) +@push4 ( str* regex* -> ) ;assert-avail JSR2 ( check for space ) ;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex ) ;stack-pos LDA2 STA2 ( cell[0:1] <- str ) ;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 ) JMP2r -@pop ( -> str* regex* ) +( @peek4 ( -> str* regex* ) + ;assert-exist JSR2 ( check for space ) + ;stack-pos LDA2 ( load stack-pos ) + #0002 SUB2 LDA2k STH2 ( pop and stash regex ) + #0002 SUB2 LDA2 STH2r ( pop the str, restore the regex ) + JMP2r ) + +@pop4 ( -> str* regex* ) + ;assert-exist JSR2 ( check for space ) ;stack-pos LDA2 ( load stack-pos ) #0002 SUB2 LDA2k STH2 ( pop and stash regex ) #0002 SUB2 LDA2k STH2 ( pop and stash str ) @@ -167,27 +477,31 @@ @reset-stack ( -> ) ;stack-bot ;stack-pos STA2 JMP2r ( pos <- 0 ) + @stack-avail ( -> bool^ ) ;stack-pos LDA2 ;stack-top LTH2 JMP2r @stack-exist ( -> bool^ ) ;stack-pos LDA2 ;stack-bot GTH2 JMP2r @assert-avail ( -> ) ;stack-avail JSR2 ,&ok JCN #00 #00 DIV &ok JMP2r +@assert-exist ( -> ) + ;stack-exist JSR2 ,&ok JCN #00 #00 DIV &ok JMP2r @emit-stack ( -> ) - space LIT 'n emit space ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 newline + space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit ;stack-bot &loop DUP2 ;stack-pos LDA2 LTH2 ,&ok JCN - POP2 JMP2r + POP2 newline JMP2r &ok - space space LDA2k ;emit-short JSR2 - #0002 ADD2 DUP2 LDA2 space ;emit-short JSR2 newline + space LDA2k ;emit-short JSR2 #0002 ADD2 ,&loop JMP @stack-pos :stack-bot ( the next position to insert at ) @stack-bot $1000 @stack-top ( holds 1024 steps (4096 bytes) ) +( arena operations ---- ) + @reset-arena ( -> ) ;arena-bot ;arena-pos STA2 JMP2r @@ -196,7 +510,31 @@ ;arena-pos LDA2 STH2k ADD2 ( pos+size* {pos} ) ( TODO: ensure we don't exceed our space ) ;arena-pos STA2 ( pos <- pos+size ) - STH2r JMP2 ( return old pos ) + STH2r JMP2r ( return old pos ) +|1ffe @arena-pos :arena-bot ( the next position to allocate ) @arena-bot $400 @arena-top ( holds up to 1024 bytes ) + +@emit-n ( addr* count^ -> addr2* ) + DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r + &ok + STH ( addr [count] ) space LDAk ;emit-byte JSR2 INC2 ( addr+1 [count] ) + STHr #01 SUB ( addr+1 count-1 ) + ;emit-n JMP2 + +@emit-arena ( -> ) + ;arena-bot + &loop +( print ) + DUP2 ;arena-pos LDA2 LTH2 ,&ok JCN POP2 JMP2r + &ok + DUP2 ;emit-short JSR2 + LIT ': emit space + LDAk #01 NEQ ,&!1 JCN #03 ;emit-n JSR2 ,&loop JMP + &!1 LDAk #02 NEQ ,&!2 JCN #03 ;emit-n JSR2 ,&loop JMP + &!2 LDAk #03 NEQ ,&!3 JCN #04 ;emit-n JSR2 ,&loop JMP + &!3 LDAk #04 NEQ ,&!4 JCN #05 ;emit-n JSR2 ,&loop JMP + &!4 LDAk #05 NEQ ,&!5 JCN #05 ;emit-n JSR2 ,&loop JMP + &!5 LDAk ;emit-byte JSR2 LIT '! emit newline #00 #00 DIV + \ No newline at end of file diff --git a/uxnrun b/uxnrun index fb14da0..78a07a5 100755 --- a/uxnrun +++ b/uxnrun @@ -1,13 +1,14 @@ #!/bin/sh BIN="$HOME/w/uxn/bin" -DEST="run.rom" if [ $# -lt 1 ]; then echo "usage: $0 [-c] FILE.tal" exit 1 fi +DEST=$( echo "$1" | sed -re 's#\.tal$#.rom#' ) + case "$1" in -c) $BIN/uxnasm $2 $DEST && $BIN/uxncli $DEST;; -s) $BIN/uxnasm $3 $DEST && $BIN/uxnemu -s $2 $DEST;;