From c681c0c52bc728b792e7af63430b5f607a64870b Mon Sep 17 00:00:00 2001 From: d6 Date: Sun, 20 Feb 2022 15:06:57 -0500 Subject: [PATCH] add ^ and $ --- grep.tal | 13 +++- regex.tal | 161 ++++++++++++++++++++++++++++++++++++++++++------- test-regex.tal | 50 ++++++++++----- 3 files changed, 185 insertions(+), 39 deletions(-) diff --git a/grep.tal b/grep.tal index 061e37f..141c7c2 100644 --- a/grep.tal +++ b/grep.tal @@ -2,6 +2,10 @@ ( ) ( by d_m ) +( NOTE: currently uxncli doesn't write 00 when its stdin ) +( is closed. that means that grep.tal only "sees" lines ) +( with a trailing newline, and also that it never exits. ) + ( print a character to STDOUT ) %emitt { #18 DEO } @@ -15,6 +19,7 @@ @regex 0000 ( compiled regex address (if any) ) @buffer $1000 ( buffer to read user input ) @ptr :buffer ( next byte to write in buffer ) +@done 00 ( set to 01 once a 00 byte is read from stdin; checked before returning ) @println ( s* -> ) &loop LDAk #00 EQU ,&eof JCN ( did we reach \0 ? ) @@ -22,11 +27,15 @@ &eof #0a emitt POP2 JMP2r ( yes so emit \n and return ) @r-read-stdin ( -> ) + #12 DEI #00 EQU ,&finishing JCN ( did we read 00 ? ) #12 DEI #0a EQU ,&execute JCN ( did we read \n ? ) #12 DEI ;ptr LDA2 STA ( no, so save in buffer ) ;ptr LDA2k INC2 SWP2 STA2 ( ptr++ ) BRK ( return ) + &finishing + #01 ;done STA ( this will be our last iteration; falls through to &execute ) + &execute ( we saw a newline, so do something ) #00 ;ptr LDA2 STA ( null terminate str ) ;buffer ;ptr STA2 ( reset ptr ) @@ -37,7 +46,9 @@ ;buffer ;regex LDA2 ;search JSR2 ( search line for a regex match ) #00 EQU ,&no-match JCN ( did we match? ) ;buffer ;println JSR2 ( print any match ) - &no-match BRK ( return ) + &no-match ;done LDA ,&exit JCN ( ready to exit? 
) + &return BRK ( no, return ) + &exit #01 #0f DEO BRK ( yes, exit ) ( regex is unset ) &need-regex ;buffer ;compile JSR2 ( compile regex ) diff --git a/regex.tal b/regex.tal index d89fd29..3d91337 100644 --- a/regex.tal +++ b/regex.tal @@ -1,11 +1,35 @@ ( regex.tal ) ( ) ( compiles regex expression strings into regex nodes, then uses ) -( regex ndoes to match input strings. ) +( regex nodes to match input strings. ) ( ) -( this currently only supports matching an entire string, as ) -( opposed to searching for a matching substring, or extracting ) -( matching subgroups. ) +( two methods are currently supported: ) +( ) +( 1. match ) +( ) +( when matching the regex must match the entire string. this means ) +( that it is unnecessary to use ^ and $ when matching, since their ) +( effect is implied. it also means that dot nodes will match ) +( any characters at all including newlines. ) +( ) +( match returns 01 if the string was matched and 00 otherwise. ) +( ) +( 2. search ) +( ) +( when searching the regex attempts to find matching substrings ) +( in the given string. this means that after successfully finding ) +( a match, search may be called on the remaining substring to find ) +( more matches. ) +( ) +( when searching, ^ matches the beginning of the string OR a line. ) +( $ matches the end of a line OR the end of the entire string. ) +( (line boundaries aren't yet supported: for now ^ only matches at the start of the string and $ only at its end.) the dot nodes will ) +( not match newline characters, which must be matched explicitly. ) +( ) +( search returns 01 if the string was matched and 00 otherwise. ) +( additionally, the @search-start and @search-end addresses will ) +( contain the starting location and match boundary of the matching ) +( substring. 
) ( ) ( regex node types: ) ( ) @@ -16,6 +40,9 @@ ( or matches either left or right [ #04 left* right* ] ) ( star matches expr zero-or-more times [ #05 r* next* ] ) ( (NOTE: r.expr.next must be r) ) +( ) +( caret matches start of line/string [ #06 next* ] ) +( dollar matches end of line/string [ #07 next* ] ) ( ) ( `or` and `star` have the same structure and are handled by the ) ( same code (;do-or). however, the node types are kept different ) @@ -28,7 +55,6 @@ ( regexes should not include nulls and cannot match them (other ) ( than the null which signals the end of a string). ) -%null? { #00 EQU } %debug { #ff #0e DEO } %emit { #18 DEO } %space { #20 emit } @@ -74,13 +100,24 @@ ( ) ( returns true if the string, and false otherwise. ) @match ( str* regex* -> bool^ ) + #01 ;match-multiline STA OVR2 ;string-start STA2 ( copy str* into string-start so a ^ node can match at the start; stack stays str* regex* ) #00 ;search-mode STA ;reset-stack JSR2 ;loop JMP2 -( ) @search ( str* regex* -> bool^ ) - #01 ;search-mode STA STH2 ( s* [r*] ) + #00 ;match-multiline STA + #01 ;search-mode STA + ;_search JMP2 ( shared search loop ) + +@search-multiline ( str* regex* -> bool^ ) + #01 ;match-multiline STA + #01 ;search-mode STA + ;_search JMP2 ( shared search loop ) + +@_search ( str* regex* -> bool^ ) + STH2 ( s* [r*] ) + DUP2 ;string-start STA2 ( s* [r*] -- remember where the string begins for ^ ) &loop LDAk #00 EQU ,&eof JCN ( s* [r*] ) ;reset-stack JSR2 ( s* [r*] ) DUP2 ;search-start STA2 ( s* [r*] ) @@ -100,11 +137,13 @@ ( return a boolean, which is where the stack ) ( effects signature comes from. ) @loop ( s* r* -> bool^ ) - LDAk #01 EQU ;do-empty JCN2 - LDAk #02 EQU ;do-dot JCN2 + LDAk #01 EQU ;do-empty JCN2 + LDAk #02 EQU ;do-dot JCN2 LDAk #03 EQU ;do-literal JCN2 - LDAk #04 EQU ;do-or JCN2 - LDAk #05 EQU ;do-or JCN2 ( same code as the or case ) + LDAk #04 EQU ;do-or JCN2 + LDAk #05 EQU ;do-or JCN2 ( same code as the or case ) + LDAk #06 EQU ;do-caret JCN2 + LDAk #07 EQU ;do-dollar JCN2 ;unknown-node-type ;error! JSR2 ( used when we hit a dead-end during matching. 
) @@ -118,7 +157,7 @@ ( follow the given address (next*) to continue matching ) @goto-next ( str* next* -> bool^ ) DUP2 #0000 GTH2 ,&has-next JCN - POP2 LDAk null? ,&end-of-string JCN + POP2 LDAk #00 EQU ,&end-of-string JCN ;search-mode LDA ,&end-of-search JCN POP2 ;goto-backtrack JMP2 &end-of-search DUP2 ;search-end STA2 @@ -134,8 +173,24 @@ @do-dot ( str* regex* -> bool^ ) INC2 LDA2 STH2 ( load and stash next ) LDAk #00 NEQ ,&non-empty JCN ( is there a char? ) + &backtrack POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack ) + &non-empty LDAk #0a NEQ ,&match JCN ( yes, match unless \n in search-mode ) + ;search-mode LDA ,&backtrack JCN ( if \n and search-mode, treat as EOF; match-multiline is not consulted here ) + &match INC2 STH2r ;goto-next JMP2 ( on match: inc s, restore and jump ) + +( handle caret -- currently matches only where str* equals string-start. TODO: support multi-line=0 ) +@do-caret ( str* regex* -> bool^ ) + INC2 LDA2 STH2 ( load and stash next ) + DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? ) POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack ) - &non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump ) + &at-start STH2r ;goto-next JMP2 ( yes, goto next without advancing str ) + +( handle dollar -- currently matches only at the terminating 00. TODO: support multi-line=0 ) +@do-dollar ( str* regex* -> bool^ ) + INC2 LDA2 STH2 ( load and stash next ) + LDAk #00 EQU ,&at-end JCN ( at string end? ) + POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack ) + &at-end STH2r ;goto-next JMP2 ( yes, goto next without advancing str ) ( handle literal -- match one specific character ) @do-literal ( str* regex* -> bool^ ) @@ -158,10 +213,32 @@ ( REGEX PARSING ) +( do we match across lines? ) +( - should be true when matching ) +( - can be true or false when searching ) +( - affects syntax of . ^ and $ ) +@match-multiline $1 + ( are we in searching mode? 
) +( - should be true when searching ) +( - should be false when matching ) @search-mode $1 + +( address where the string being searched begins; ^ compares the current position against it ) +@string-start $2 @search-start $2 -@search-end $2 +@search-end $2 + +( +( used for subgroup match start/end addresses ) +@groups [ &s0 $2 &e0 $2 &s1 $2 &e1 $2 + &s2 $2 &e2 $2 &s3 $2 &e3 $2 + &s4 $2 &e4 $2 &s5 $2 &e5 $2 + &s6 $2 &e6 $2 &s7 $2 &e7 $2 + &s8 $2 &e8 $2 &s9 $2 &e9 $2 ] + +( position of last finished group ) +@group-pos $2 ) ( track the position in the input string ) @pos $2 @@ -206,6 +283,7 @@ ( c. substring matching, i.e. searching ) ( d. subgroup extraction ) ( e. back-references, e.g \1 ) +( f. non-capturing groups, e.g. (?:) ) ( compile an expression string into a regex graph ) ( ) @@ -239,6 +317,8 @@ DUP #00 EQU ;c-done JCN2 DUP LIT '| EQU ;c-or JCN2 DUP LIT '. EQU ;c-dot JCN2 + DUP LIT '^ EQU ;c-caret JCN2 + DUP LIT '$ EQU ;c-dollar JCN2 DUP LIT '( EQU ;c-lpar JCN2 DUP LIT ') EQU ;c-rpar JCN2 DUP LIT '\ EQU ;c-esc JCN2 @@ -320,17 +400,44 @@ ;alloc-dot JSR2 ( dot ) DUP2 ;c-peek-and-finalize JMP2 -( TODO: escaping rules not quite right ) +( called when we read "^" ) +( ) +( allocates a caret-node and continues. ) +@c-caret ( c^ -> r2* ) + POP + ;alloc-caret JSR2 ( caret ) + DUP2 ;c-peek-and-finalize JMP2 + +( called when we read "$" ) +( ) +( allocates a dollar-node and continues. ) +@c-dollar ( c^ -> r2* ) + POP + ;alloc-dollar JSR2 ( dollar ) + DUP2 ;c-peek-and-finalize JMP2 ( called when we read "\" ) ( ) -( allocates a literal of the next character. ) +( handles special sequences: \a \b \t \n \v \f \r ) ( ) -( this doesn't currently handle any special escape sequences. ) +( otherwise, allocates a literal of the next character. 
) @c-esc ( c^ -> r2* ) - POP - ;read JSR2 - ;c-char JMP2 + POP ;read JSR2 + DUP LIT 'a EQU ,&bel JCN + DUP LIT 'b EQU ,&bs JCN + DUP LIT 't EQU ,&tab JCN + DUP LIT 'n EQU ,&nl JCN + DUP LIT 'v EQU ,&vtab JCN + DUP LIT 'f EQU ,&ff JCN + DUP LIT 'r EQU ,&cr JCN + &default ;c-char JMP2 ( the byte left on the stack is handed to c-char ) + &bel POP #07 ,&default JMP ( \a -> 07 ) + &bs POP #08 ,&default JMP ( \b -> 08 ) + &tab POP #09 ,&default JMP ( \t -> 09 ) + &nl POP #0a ,&default JMP ( \n -> 0a ) + &vtab POP #0b ,&default JMP ( \v -> 0b ) + &ff POP #0c ,&default JMP ( \f -> 0c ) + &cr POP #0d ,&default JMP ( \r -> 0d ) ( called when we read any other character ) ( ) @@ -375,6 +482,12 @@ @alloc-dot ( -> r* ) #02 ;alloc3 JMP2 +@alloc-caret ( -> r* ) + #06 ;alloc3 JMP2 ( caret node: type byte 06 ) + +@alloc-dollar ( -> r* ) + #07 ;alloc3 JMP2 ( dollar node: type byte 07 ) + @alloc-lit ( c^ -> r* ) #03 #0000 SWP2 ( 0000 c^ 03 ) #04 ;alloc JSR2 ( 0000 c^ 03 addr* ) @@ -477,7 +590,10 @@ &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r &!3 LDAk #04 NEQ ,&!4 JCN INC2 ;set-next-addr JSR2 JMP2r &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r - &!5 ;unknown-node-type ;error! JSR2 +( &!5 ;unknown-node-type ;error! JSR2 ) + &!5 LDAk #06 NEQ ,&!6 JCN INC2 ;set-next-addr JSR2 JMP2r ( caret: next* sits right after the type byte ) + &!6 LDAk #07 NEQ ,&!7 JCN INC2 ;set-next-addr JSR2 JMP2r ( dollar: next* sits right after the type byte ) + &!7 ;unknown-node-type ;error! JSR2 @set-next-or-addr ( target* addr* -> ) LDA2k #0000 EQU2 ( target addr addr=0? 
) ,&is-zero JCN @@ -524,6 +640,7 @@ STH2r STH2r ( restore str and regex ) JMP2r +( frame-size is disabled by this enclosing comment ( -> size^ ) @frame-size #00 STH ;stack-pos LDA2 @@ -531,7 +648,7 @@ #0004 SUB2 LDA2k #ffff EQU2 ,&done JCN INCr ,&loop JMP &done - STHr JMP2r + STHr JMP2r ) ( reset stack pointers ) @reset-stack ( -> ) diff --git a/test-regex.tal b/test-regex.tal index 57618d2..6ce4800 100644 --- a/test-regex.tal +++ b/test-regex.tal @@ -8,23 +8,35 @@ ;emit-stack JSR2 nl ;emit-arena JSR2 nl - ;test1 OVR2k ;match JSR2 ;emit-byte JSR2 sp - ;test2 OVR2k ;match JSR2 ;emit-byte JSR2 sp - ;test3 OVR2k ;match JSR2 ;emit-byte JSR2 sp - ;test4 OVR2k ;match JSR2 ;emit-byte JSR2 sp - ;test5 OVR2k ;match JSR2 ;emit-byte JSR2 sp - ;test6 OVR2k ;match JSR2 ;emit-byte JSR2 sp - ;test7 OVR2k ;match JSR2 ;emit-byte JSR2 sp - ;test8 OVR2k ;match JSR2 ;emit-byte JSR2 nl + LIT '= ;emit JSR2 sp ( row "=": hard-coded values, presumably the expected results -- confirm ) + #01 ;emit-bool JSR2 sp + #01 ;emit-bool JSR2 sp + #00 ;emit-bool JSR2 sp + #01 ;emit-bool JSR2 sp + #01 ;emit-bool JSR2 sp + #00 ;emit-bool JSR2 sp + #00 ;emit-bool JSR2 sp + #00 ;emit-bool JSR2 nl - ;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 sp - ;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 sp - ;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 sp - ;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 sp - ;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 sp - ;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 sp - ;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 sp - ;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 nl + LIT 'A ;emit JSR2 sp ( row "A": match results using the regex address already on the stack ) + ;test1 OVR2k ;match JSR2 ;emit-bool JSR2 sp + ;test2 OVR2k ;match JSR2 ;emit-bool JSR2 sp + ;test3 OVR2k ;match JSR2 ;emit-bool JSR2 sp + ;test4 OVR2k ;match JSR2 ;emit-bool JSR2 sp + ;test5 OVR2k ;match JSR2 ;emit-bool JSR2 sp + ;test6 OVR2k ;match JSR2 ;emit-bool JSR2 sp + ;test7 OVR2k ;match JSR2 ;emit-bool JSR2 sp + ;test8 OVR2k ;match JSR2 ;emit-bool JSR2 nl + + LIT 'B ;emit JSR2 sp ( row "B": match results using ;graph1 ) + ;test1 ;graph1 ;match JSR2 ;emit-bool JSR2 sp + ;test2 ;graph1 ;match JSR2 ;emit-bool JSR2 sp + ;test3 ;graph1 ;match JSR2 ;emit-bool JSR2 sp + 
;test4 ;graph1 ;match JSR2 ;emit-bool JSR2 sp + ;test5 ;graph1 ;match JSR2 ;emit-bool JSR2 sp + ;test6 ;graph1 ;match JSR2 ;emit-bool JSR2 sp + ;test7 ;graph1 ;match JSR2 ;emit-bool JSR2 sp + ;test8 ;graph1 ;match JSR2 ;emit-bool JSR2 nl ;reset-arena JSR2 exit @@ -54,6 +66,9 @@ ~regex.tal +@emit ( c^ -- ) + emit JMP2r ( subroutine form of emit, so callers can use ;emit JSR2 ) + @emit-short ( short* -- ) SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r @@ -62,6 +77,9 @@ &hex #30 ADD DUP #39 GTH #27 MUL ADD emit JMP2r +@emit-bool ( byte^ -- ) + LIT '0 ADD emit JMP2r ( add ascii '0' so 00/01 print as 0/1 ) + ( print stack size, followed by contents ) @emit-stack ( -> ) space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit