add ^ and $

This commit is contained in:
~d6 2022-02-20 15:06:57 -05:00
parent 21a31755b6
commit c681c0c52b
3 changed files with 185 additions and 39 deletions

View File

@ -2,6 +2,10 @@
( ) ( )
( by d_m ) ( by d_m )
( NOTE: currently uxncli doesn't write 00 when its stdin )
( is closed. that means that grep.tal only "sees" lines )
( with a trailing newline, and also that it never exits. )
( print a character to STDOUT ) ( print a character to STDOUT )
%emitt { #18 DEO } %emitt { #18 DEO }
@ -15,6 +19,7 @@
@regex 0000 ( compiled regex address (if any) ) @regex 0000 ( compiled regex address (if any) )
@buffer $1000 ( buffer to read user input ) @buffer $1000 ( buffer to read user input )
@ptr :buffer ( next byte to write in buffer ) @ptr :buffer ( next byte to write in buffer )
@done 00
@println ( s* -> ) @println ( s* -> )
&loop LDAk #00 EQU ,&eof JCN ( did we reach \0 ? ) &loop LDAk #00 EQU ,&eof JCN ( did we reach \0 ? )
@ -22,11 +27,15 @@
&eof #0a emitt POP2 JMP2r ( yes so emit \n and return ) &eof #0a emitt POP2 JMP2r ( yes so emit \n and return )
@r-read-stdin ( -> ) @r-read-stdin ( -> )
#12 DEI #00 EQU ,&finishing JCN ( did we read 00 ? )
#12 DEI #0a EQU ,&execute JCN ( did we read \n ? ) #12 DEI #0a EQU ,&execute JCN ( did we read \n ? )
#12 DEI ;ptr LDA2 STA ( no, so save in buffer ) #12 DEI ;ptr LDA2 STA ( no, so save in buffer )
;ptr LDA2k INC2 SWP2 STA2 ( ptr++ ) ;ptr LDA2k INC2 SWP2 STA2 ( ptr++ )
BRK ( return ) BRK ( return )
&finishing
#01 ;done STA ( this will be our last iteration )
&execute ( we saw a newline, so do something ) &execute ( we saw a newline, so do something )
#00 ;ptr LDA2 STA ( null terminate str ) #00 ;ptr LDA2 STA ( null terminate str )
;buffer ;ptr STA2 ( reset ptr ) ;buffer ;ptr STA2 ( reset ptr )
@ -37,7 +46,9 @@
;buffer ;regex LDA2 ;search JSR2 ( search line for a regex match ) ;buffer ;regex LDA2 ;search JSR2 ( search line for a regex match )
#00 EQU ,&no-match JCN ( did we match? ) #00 EQU ,&no-match JCN ( did we match? )
;buffer ;println JSR2 ( print any match ) ;buffer ;println JSR2 ( print any match )
&no-match BRK ( return ) &no-match ;done LDA ,&exit JCN ( ready to exit? )
&return BRK ( no, return )
&exit #01 #0f DEO BRK ( yes, exit )
( regex is unset ) ( regex is unset )
&need-regex ;buffer ;compile JSR2 ( compile regex ) &need-regex ;buffer ;compile JSR2 ( compile regex )

161
regex.tal
View File

@ -1,11 +1,35 @@
( regex.tal ) ( regex.tal )
( ) ( )
( compiles regex expression strings into regex nodes, then uses ) ( compiles regex expression strings into regex nodes, then uses )
( regex ndoes to match input strings. ) ( regex nodes to match input strings. )
( ) ( )
( this currently only supports matching an entire string, as ) ( two methods are currently supported: )
( opposed to searching for a matching substring, or extracting ) ( )
( matching subgroups. ) ( 1. match )
( )
( when matching the regex must match the entire string. this means )
( that it is unnecessary to use ^ and $ when matching, since their )
( effect is implied. it also means that dot nodes will match )
( any characters at all including newlines. )
( )
( match returns 01 if the string was matched and 00 otherwise. )
( )
( 2. search )
( )
( when searching the regex attempts to find matching substrings )
( in the given string. this means that after successfully finding )
( a match, search may be called on the remaining substring to find )
( more matches. )
( )
( when searching, ^ matches the beginning of the string OR a line. )
( $ matches the end of a line OR the end of the entire string. )
( (line-level ^ and $ aren't yet supported.) the dot nodes will )
( not match newline characters, which must be matched explicitly. )
( )
( search returns 01 if the string was matched and 00 otherwise. )
( additionally, the @search-start and @search-end addresses will )
( contain the starting location and match boundary of the matching )
( substring. )
( ) ( )
( regex node types: ) ( regex node types: )
( ) ( )
@ -16,6 +40,9 @@
( or matches either left or right [ #04 left* right* ] ) ( or matches either left or right [ #04 left* right* ] )
( star matches expr zero-or-more times [ #05 r* next* ] ) ( star matches expr zero-or-more times [ #05 r* next* ] )
( (NOTE: r.expr.next must be r) ) ( (NOTE: r.expr.next must be r) )
( )
( caret matches start of line/string [ #06 next* ] )
( dollar matches end of line/string [ #07 next* ] )
( ) ( )
( `or` and `star` have the same structure and are handled by the ) ( `or` and `star` have the same structure and are handled by the )
( same code (;do-or). however, the node types are kept different ) ( same code (;do-or). however, the node types are kept different )
@ -28,7 +55,6 @@
( regexes should not include nulls and cannot match them (other ) ( regexes should not include nulls and cannot match them (other )
( than the null which signals the end of a string). ) ( than the null which signals the end of a string). )
%null? { #00 EQU }
%debug { #ff #0e DEO } %debug { #ff #0e DEO }
%emit { #18 DEO } %emit { #18 DEO }
%space { #20 emit } %space { #20 emit }
@ -74,13 +100,24 @@
( ) ( )
( returns true if the string matches, and false otherwise. ) ( returns true if the string matches, and false otherwise. )
@match ( str* regex* -> bool^ ) @match ( str* regex* -> bool^ )
#01 ;match-multiline STA
#00 ;search-mode STA #00 ;search-mode STA
;reset-stack JSR2 ;reset-stack JSR2
;loop JMP2 ;loop JMP2
( )
@search ( str* regex* -> bool^ ) @search ( str* regex* -> bool^ )
#01 ;search-mode STA STH2 ( s* [r*] ) #00 ;match-multiline STA
#01 ;search-mode STA
;_search JMP2
@search-multiline ( str* regex* -> bool^ )
#01 ;match-multiline STA
#01 ;search-mode STA
;_search JMP2
@_search ( str* regex* -> bool^ )
STH2 ( s* [r*] )
DUP2 ;string-start STA2 ( s* [r*] )
&loop LDAk #00 EQU ,&eof JCN ( s* [r*] ) &loop LDAk #00 EQU ,&eof JCN ( s* [r*] )
;reset-stack JSR2 ( s* [r*] ) ;reset-stack JSR2 ( s* [r*] )
DUP2 ;search-start STA2 ( s* [r*] ) DUP2 ;search-start STA2 ( s* [r*] )
@ -100,11 +137,13 @@
( return a boolean, which is where the stack ) ( return a boolean, which is where the stack )
( effects signature comes from. ) ( effects signature comes from. )
@loop ( s* r* -> bool^ ) @loop ( s* r* -> bool^ )
LDAk #01 EQU ;do-empty JCN2 LDAk #01 EQU ;do-empty JCN2
LDAk #02 EQU ;do-dot JCN2 LDAk #02 EQU ;do-dot JCN2
LDAk #03 EQU ;do-literal JCN2 LDAk #03 EQU ;do-literal JCN2
LDAk #04 EQU ;do-or JCN2 LDAk #04 EQU ;do-or JCN2
LDAk #05 EQU ;do-or JCN2 ( same code as the or case ) LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
LDAk #06 EQU ;do-caret JCN2
LDAk #07 EQU ;do-dollar JCN2
;unknown-node-type ;error! JSR2 ;unknown-node-type ;error! JSR2
( used when we hit a dead-end during matching. ) ( used when we hit a dead-end during matching. )
@ -118,7 +157,7 @@
( follow the given address (next*) to continue matching ) ( follow the given address (next*) to continue matching )
@goto-next ( str* next* -> bool^ ) @goto-next ( str* next* -> bool^ )
DUP2 #0000 GTH2 ,&has-next JCN DUP2 #0000 GTH2 ,&has-next JCN
POP2 LDAk null? ,&end-of-string JCN POP2 LDAk #00 EQU ,&end-of-string JCN
;search-mode LDA ,&end-of-search JCN ;search-mode LDA ,&end-of-search JCN
POP2 ;goto-backtrack JMP2 POP2 ;goto-backtrack JMP2
&end-of-search DUP2 ;search-end STA2 &end-of-search DUP2 ;search-end STA2
@ -134,8 +173,24 @@
@do-dot ( str* regex* -> bool^ ) @do-dot ( str* regex* -> bool^ )
INC2 LDA2 STH2 ( load and stash next ) INC2 LDA2 STH2 ( load and stash next )
LDAk #00 NEQ ,&non-empty JCN ( is there a char? ) LDAk #00 NEQ ,&non-empty JCN ( is there a char? )
&backtrack POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
&non-empty LDAk #0a NEQ ,&match JCN ( yes, match unless \n in search-mode )
;search-mode LDA ,&backtrack JCN ( if \n and search-mode, treat as EOF )
&match INC2 STH2r ;goto-next JMP2 ( on match: inc s, restore and jump )
( TODO: support multi-line=0 )
@do-caret ( str* regex* -> bool^ )
INC2 LDA2 STH2 ( load and stash next )
DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? )
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack ) POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
&non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump ) &at-start STH2r ;goto-next JMP2 ( yes, goto next without advancing str )
( TODO: support multi-line=0 )
( handle dollar -- succeed only when str* points at the end of the string )
( )
( str* is not advanced. only the terminating null byte counts as the )
( end here; a newline before it is not treated as an end-of-line. )
@do-dollar ( str* regex* -> bool^ )
INC2 LDA2 STH2 ( load and stash next )
LDAk #00 EQU ,&at-end JCN ( at string end? )
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
&at-end STH2r ;goto-next JMP2 ( yes, goto next without advancing str )
( handle literal -- match one specific character ) ( handle literal -- match one specific character )
@do-literal ( str* regex* -> bool^ ) @do-literal ( str* regex* -> bool^ )
@ -158,10 +213,32 @@
( REGEX PARSING ) ( REGEX PARSING )
( do we match across lines? )
( - should be true when matching )
( - can be true or false when searching )
( - affects syntax of . ^ and $ )
@match-multiline $1
( are we in searching mode? ) ( are we in searching mode? )
( - should be true when searching )
( - should be false when matching )
@search-mode $1 @search-mode $1
( )
@string-start $2
@search-start $2 @search-start $2
@search-end $2 @search-end $2
(
( used for subgroup match start/end addresses )
@groups [ &s0 $2 &e0 $2 &s1 $2 &e1 $2
&s2 $2 &e2 $2 &s3 $2 &e3 $2
&s4 $2 &e4 $2 &s5 $2 &e5 $2
&s6 $2 &e6 $2 &s7 $2 &e7 $2
&s8 $2 &e8 $2 &s9 $2 &e9 $2 ]
( position of last finished group )
@group-pos $2 )
( track the position in the input string ) ( track the position in the input string )
@pos $2 @pos $2
@ -206,6 +283,7 @@
( c. substring matching, i.e. searching ) ( c. substring matching, i.e. searching )
( d. subgroup extraction ) ( d. subgroup extraction )
( e. back-references, e.g \1 ) ( e. back-references, e.g \1 )
( f. non-capturing groups, e.g. (?:) )
( compile an expression string into a regex graph ) ( compile an expression string into a regex graph )
( ) ( )
@ -239,6 +317,8 @@
DUP #00 EQU ;c-done JCN2 DUP #00 EQU ;c-done JCN2
DUP LIT '| EQU ;c-or JCN2 DUP LIT '| EQU ;c-or JCN2
DUP LIT '. EQU ;c-dot JCN2 DUP LIT '. EQU ;c-dot JCN2
DUP LIT '^ EQU ;c-caret JCN2
DUP LIT '$ EQU ;c-dollar JCN2
DUP LIT '( EQU ;c-lpar JCN2 DUP LIT '( EQU ;c-lpar JCN2
DUP LIT ') EQU ;c-rpar JCN2 DUP LIT ') EQU ;c-rpar JCN2
DUP LIT '\ EQU ;c-esc JCN2 DUP LIT '\ EQU ;c-esc JCN2
@ -320,17 +400,44 @@
;alloc-dot JSR2 ( dot ) ;alloc-dot JSR2 ( dot )
DUP2 ;c-peek-and-finalize JMP2 DUP2 ;c-peek-and-finalize JMP2
( TODO: escaping rules not quite right ) ( called when we read "^" )
( )
( allocates a caret-node and continues. )
( entered from the compiler's character dispatch with the "^" byte on the stack )
@c-caret ( c^ -> r2* )
POP ( drop the "^" character, it carries no data )
;alloc-caret JSR2 ( caret )
( duplicate the node address and hand both copies to c-peek-and-finalize )
( NOTE: presumably one copy is the fragment head and one its tail -- confirm )
DUP2 ;c-peek-and-finalize JMP2
( called when we read "$" )
( )
( allocates a dollar-node and continues. )
( entered from the compiler's character dispatch with the "$" byte on the stack )
@c-dollar ( c^ -> r2* )
POP ( drop the "$" character, it carries no data )
;alloc-dollar JSR2 ( dollar )
( duplicate the node address and hand both copies to c-peek-and-finalize )
( NOTE: presumably one copy is the fragment head and one its tail -- confirm )
DUP2 ;c-peek-and-finalize JMP2
( called when we read "\" ) ( called when we read "\" )
( ) ( )
( allocates a literal of the next character. ) ( handles special sequences: \a \b \t \n \v \f \r )
( ) ( )
( this doesn't currently handle any special escape sequences. ) ( otherwise, allocates a literal of the next character. )
@c-esc ( c^ -> r2* ) @c-esc ( c^ -> r2* )
POP POP ;read JSR2
;read JSR2 DUP LIT 'a EQU ,&bel JCN
;c-char JMP2 DUP LIT 'b EQU ,&bs JCN
DUP LIT 't EQU ,&tab JCN
DUP LIT 'n EQU ,&nl JCN
DUP LIT 'v EQU ,&vtab JCN
DUP LIT 'f EQU ,&ff JCN
DUP LIT 'r EQU ,&cr JCN
&default ;c-char JMP2
&bel POP #07 ,&default JMP
&bs POP #08 ,&default JMP
&tab POP #09 ,&default JMP
&nl POP #0a ,&default JMP
&vtab POP #0b ,&default JMP
&ff POP #0c ,&default JMP
&cr POP #0d ,&default JMP
( called when we read any other character ) ( called when we read any other character )
( ) ( )
@ -375,6 +482,12 @@
@alloc-dot ( -> r* ) @alloc-dot ( -> r* )
#02 ;alloc3 JMP2 #02 ;alloc3 JMP2
( allocate a caret node: type byte #06 followed by next* )
( tail-jumps to alloc3 with the type byte on the stack, like alloc-dot )
( NOTE: presumably alloc3 reserves 3 bytes and zeroes next* -- confirm )
@alloc-caret ( -> r* )
#06 ;alloc3 JMP2
( allocate a dollar node: type byte #07 followed by next* )
( tail-jumps to alloc3 with the type byte on the stack, like alloc-dot )
( NOTE: presumably alloc3 reserves 3 bytes and zeroes next* -- confirm )
@alloc-dollar ( -> r* )
#07 ;alloc3 JMP2
@alloc-lit ( c^ -> r* ) @alloc-lit ( c^ -> r* )
#03 #0000 SWP2 ( 0000 c^ 03 ) #03 #0000 SWP2 ( 0000 c^ 03 )
#04 ;alloc JSR2 ( 0000 c^ 03 addr* ) #04 ;alloc JSR2 ( 0000 c^ 03 addr* )
@ -477,7 +590,10 @@
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
&!3 LDAk #04 NEQ ,&!4 JCN INC2 ;set-next-addr JSR2 JMP2r &!3 LDAk #04 NEQ ,&!4 JCN INC2 ;set-next-addr JSR2 JMP2r
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
&!5 ;unknown-node-type ;error! JSR2 ( &!5 ;unknown-node-type ;error! JSR2 )
&!5 LDAk #06 NEQ ,&!6 JCN INC2 ;set-next-addr JSR2 JMP2r
&!6 LDAk #07 NEQ ,&!7 JCN INC2 ;set-next-addr JSR2 JMP2r
&!7 ;unknown-node-type ;error! JSR2
@set-next-or-addr ( target* addr* -> ) @set-next-or-addr ( target* addr* -> )
LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
@ -524,6 +640,7 @@
STH2r STH2r ( restore str and regex ) STH2r STH2r ( restore str and regex )
JMP2r JMP2r
(
( -> size^ ) ( -> size^ )
@frame-size @frame-size
#00 STH ;stack-pos LDA2 #00 STH ;stack-pos LDA2
@ -531,7 +648,7 @@
#0004 SUB2 LDA2k #ffff EQU2 ,&done JCN #0004 SUB2 LDA2k #ffff EQU2 ,&done JCN
INCr ,&loop JMP INCr ,&loop JMP
&done &done
STHr JMP2r STHr JMP2r )
( reset stack pointers ) ( reset stack pointers )
@reset-stack ( -> ) @reset-stack ( -> )

View File

@ -8,23 +8,35 @@
;emit-stack JSR2 nl ;emit-stack JSR2 nl
;emit-arena JSR2 nl ;emit-arena JSR2 nl
;test1 OVR2k ;match JSR2 ;emit-byte JSR2 sp LIT '= ;emit JSR2 sp
;test2 OVR2k ;match JSR2 ;emit-byte JSR2 sp #01 ;emit-bool JSR2 sp
;test3 OVR2k ;match JSR2 ;emit-byte JSR2 sp #01 ;emit-bool JSR2 sp
;test4 OVR2k ;match JSR2 ;emit-byte JSR2 sp #00 ;emit-bool JSR2 sp
;test5 OVR2k ;match JSR2 ;emit-byte JSR2 sp #01 ;emit-bool JSR2 sp
;test6 OVR2k ;match JSR2 ;emit-byte JSR2 sp #01 ;emit-bool JSR2 sp
;test7 OVR2k ;match JSR2 ;emit-byte JSR2 sp #00 ;emit-bool JSR2 sp
;test8 OVR2k ;match JSR2 ;emit-byte JSR2 nl #00 ;emit-bool JSR2 sp
#00 ;emit-bool JSR2 nl
;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 sp LIT 'A ;emit JSR2 sp
;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 sp ;test1 OVR2k ;match JSR2 ;emit-bool JSR2 sp
;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 sp ;test2 OVR2k ;match JSR2 ;emit-bool JSR2 sp
;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 sp ;test3 OVR2k ;match JSR2 ;emit-bool JSR2 sp
;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 sp ;test4 OVR2k ;match JSR2 ;emit-bool JSR2 sp
;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 sp ;test5 OVR2k ;match JSR2 ;emit-bool JSR2 sp
;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 sp ;test6 OVR2k ;match JSR2 ;emit-bool JSR2 sp
;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 nl ;test7 OVR2k ;match JSR2 ;emit-bool JSR2 sp
;test8 OVR2k ;match JSR2 ;emit-bool JSR2 nl
LIT 'B ;emit JSR2 sp
;test1 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
;test2 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
;test3 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
;test4 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
;test5 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
;test6 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
;test7 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
;test8 ;graph1 ;match JSR2 ;emit-bool JSR2 nl
;reset-arena JSR2 ;reset-arena JSR2
exit exit
@ -54,6 +66,9 @@
~regex.tal ~regex.tal
( subroutine form of character output, callable via ;emit JSR2 )
( NOTE: the bare `emit` in the body is presumably the %emit macro )
( { #18 DEO } from regex.tal, not a reference to this label -- confirm )
@emit ( c^ -- )
emit JMP2r
@emit-short ( short* -- ) @emit-short ( short* -- )
SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r
@ -62,6 +77,9 @@
&hex #30 ADD DUP #39 GTH #27 MUL ADD emit &hex #30 ADD DUP #39 GTH #27 MUL ADD emit
JMP2r JMP2r
( print a boolean byte as one character by adding it to ASCII '0', )
( so 00 prints "0" and 01 prints "1" )
@emit-bool ( byte^ -- )
LIT '0 ADD emit JMP2r
( print stack size, followed by contents ) ( print stack size, followed by contents )
@emit-stack ( -> ) @emit-stack ( -> )
space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit