From a5eb54f87649a1fc0ed29b4307d490ff491753c3 Mon Sep 17 00:00:00 2001
From: d6
Date: Sat, 22 Jan 2022 22:54:51 -0500
Subject: [PATCH] regex graph seems to work

---
 regex.tal | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 regex.tal

diff --git a/regex.tal b/regex.tal
new file mode 100644
index 0000000..87a50a2
--- /dev/null
+++ b/regex.tal
@@ -0,0 +1,202 @@
+( regex.tal )
+( )
+( compiles regex expression strings into regex nodes, then uses )
+( regex nodes to match input strings. )
+( )
+( this currently only supports matching an entire string, as )
+( opposed to searching for a matching substring, or extracting )
+( matching subgroups. )
+( )
+( regex node types: )
+( )
+(   NAME   DESCRIPTION                       STRUCT               )
+(   empty  matches empty string              [ #01 next* ]        )
+(   dot    matches any one char              [ #02 next* ]        )
+(   lit    matches one specific char (c)     [ #03 c^ next* ]     )
+(   or     matches either left or right      [ #04 left* right* ] )
+(   star   matches expr zero-or-more times   [ #05 r* next* ]     )
+(          (NOTE: r.expr.next must be r)                          )
+( )
+( `or` and `star` have the same structure and are handled by the )
+( same code (;do-or). however, the node types are kept different )
+( to make it clearer how to parse and assemble the nodes. )
+( )
+( concatenation isn't a node, it is implied by the *next addr. )
+( a next value of #0000 signals the end of the regex. )
+( )
+( in these docs str* is an address to a null-terminated string. )
+( regexes should not include nulls and cannot match them (other )
+( than the null which signals the end of a string). )
+
+%null? { #00 EQU }
+%debug { #ff #0e DEO }
+%emit { #18 DEO }
+%space { #20 emit }
+%newline { #0a emit }
+
+|0100
+    ;test1 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
+    ;test2 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
+    ;test3 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
+    ;test4 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
+    ;test5 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
+    ;test6 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
+    ;test7 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
+    ;test8 ;expr1 ;match JSR2 ;emit-byte JSR2 newline
+    BRK
+
+@match ( str* regex* -> bool^ )
+    ;reset-stack JSR2 ( start with an empty backtracking stack )
+    ;loop JMP2 ( tail-jump: the matcher issues the final JMP2r for us )
+
+( we don't use the return stack here since that )
+( complicates the back-tracking we need to do. )
+( ultimately this code will issue a JMP2r to )
+( return a boolean, which is where the stack )
+( effects signature comes from. )
+
+@loop ( s* r* -> bool^ )
+    LDAk #01 EQU ;do-empty JCN2
+    LDAk #02 EQU ;do-dot JCN2
+    LDAk #03 EQU ;do-literal JCN2
+    LDAk #04 EQU ;do-or JCN2
+    LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
+    #00 #00 DIV ( unknown node type: should not happen )
+
+@goto-backtrack ( -> bool^ )
+    ;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
+    #00 JMP2r ( no, return false )
+    &has-stack
+    ;pop JSR2
+    ;goto-next JMP2 ( yes, resume from the top )
+
+@goto-next ( str* next* -> bool^ )
+    DUP2 #0000 GTH2 ,&has-next JCN
+    POP2 LDA null? ,&end-of-string JCN
+    ;goto-backtrack JMP2
+    &end-of-string #01 JMP2r
+    &has-next ;loop JMP2
+
+@do-empty ( str* regex* -> bool^ )
+    INC2 LDA2 ( load next )
+    ;goto-next JMP2 ( jump to next )
+
+@do-dot ( str* regex* -> bool^ )
+    INC2 LDA2 STH2 ( load and stash next )
+    LDAk #00 NEQ ,&non-empty JCN ( is there a char? )
+    POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
+    &non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
+
+@do-literal ( str* regex* -> bool^ )
+    INC2
+    LDAk STH ( store c )
+    INC2 LDA2 STH2 ROTr ( store next, move c to top )
+    LDAk
+    STHr EQU ,&matches JCN ( do we match this char? )
+    POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
+    &matches
+    INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
+
+( this also handles asteration, since it ends up having the same structure )
+@do-or ( str* regex* -> bool^ )
+    INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 )
+    LDA2 ;push JSR2 ( save (s, right) in the stack for possible backtracking )
+    LDA2 ;loop JMP2 ( continue on left branch )
+
+( compile an expression string into a regex graph. TODO: not implemented yet, do not call )
+@compile ( expr* -> regex* )
+
+( corresponds to regex: a(b|c)d* )
+( accepts "ab" or "ac" followed by any number of d's )
+@expr1
+    03 'a :x1
+    @x1 04 :x2 :x3
+    @x2 03 'b :x4
+    @x3 03 'c :x4
+    @x4 05 :x5 0000
+    @x5 03 'd :x4
+
+@test1 "ab 00 ( yes )
+@test2 "acdd 00 ( yes )
+@test3 "add 00 ( no )
+@test4 "abd 00 ( yes )
+@test5 "acddddddddddd 00 ( yes )
+@test6 "bd 00 ( no )
+@test7 "z 00 ( no )
+@test8 00 ( no )
+
+@emit-short ( short* -> )
+    SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r ( high byte first, then low byte )
+
+@emit-byte ( byte^ -> )
+    DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP
+    &hex #30 ADD DUP #39 GTH #27 MUL ADD emit
+    JMP2r
+
+@emit3 ( addr* -> addr* )
+    DUP2
+    LDAk ;emit-byte JSR2 space INC2
+    LDA2 ;emit-short JSR2 newline JMP2r
+
+@emit4 ( addr* -> addr* )
+    DUP2
+    LDAk ;emit-byte JSR2 space INC2
+    LDAk ;emit-byte JSR2 space INC2
+    LDA2 ;emit-short JSR2 newline JMP2r
+
+@emit5 ( addr* -> addr* )
+    DUP2
+    LDAk ;emit-byte JSR2 space INC2
+    LDA2k ;emit-short JSR2 space #0002 ADD2
+    LDA2 ;emit-short JSR2 newline JMP2r
+
+@push ( str* regex* -> )
+    ;assert-avail JSR2 ( check for space )
+    ;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex )
+    ;stack-pos LDA2 STA2 ( cell[0:1] <- str )
+    ;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 )
+    JMP2r
+
+@pop ( -> str* regex* )
+    ;stack-pos LDA2 ( load stack-pos )
+    #0002 SUB2 LDA2k STH2 ( pop and stash regex )
+    #0002 SUB2 LDA2k STH2 ( pop and stash str )
+    ;stack-pos STA2 ( save new stack-pos )
+    STH2r STH2r ( restore str and regex )
+    JMP2r
+
+@reset-stack ( -> )
+    ;stack-bot ;stack-pos STA2 JMP2r ( pos <- stack-bot, i.e. stack is empty )
+@stack-avail ( -> bool^ )
+    ;stack-pos LDA2 ;stack-top LTH2 JMP2r
+@stack-exist ( -> bool^ )
+    ;stack-pos LDA2 ;stack-bot GTH2 JMP2r
+@assert-avail ( -> )
+    ;stack-avail JSR2 ,&ok JCN #00 #00 DIV &ok JMP2r
+
+@emit-stack ( -> )
+    space LIT 'n emit space ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 newline
+    ;stack-bot
+    &loop
+    DUP2 ;stack-pos LDA2 LTH2 ,&ok JCN
+    POP2 JMP2r
+    &ok
+    space space LDA2k ;emit-short JSR2
+    #0002 ADD2 DUP2 LDA2 space ;emit-short JSR2 newline
+    #0002 ADD2 ,&loop JMP
+
+@stack-pos :stack-bot ( the next position to insert at )
+@stack-bot $1000 @stack-top ( holds 1024 steps (4096 bytes) )
+
+@reset-arena ( -> )
+    ;arena-bot ;arena-pos STA2 JMP2r
+
+@alloc ( size^ -> addr* )
+    #00 SWP ( size* )
+    ;arena-pos LDA2 STH2k ADD2 ( pos+size* {pos} )
+    ( TODO: ensure we don't exceed our space )
+    ;arena-pos STA2 ( pos <- pos+size )
+    STH2r JMP2r ( return old pos to the caller )
+
+@arena-pos :arena-bot ( the next position to allocate )
+@arena-bot $400 @arena-top ( holds up to 1024 bytes )