add ^ and $
This commit is contained in:
parent
21a31755b6
commit
c681c0c52b
13
grep.tal
13
grep.tal
|
@ -2,6 +2,10 @@
|
|||
( )
|
||||
( by d_m )
|
||||
|
||||
( NOTE: currently uxncli doesn't write 00 when its stdin )
|
||||
( is closed. that means that grep.tal only "sees" lines )
|
||||
( with a trailing newline, and also that it never exits. )
|
||||
|
||||
( print a character to STDOUT )
|
||||
%emitt { #18 DEO }
|
||||
|
||||
|
@ -15,6 +19,7 @@
|
|||
@regex 0000 ( compiled regex address (if any) )
|
||||
@buffer $1000 ( buffer to read user input )
|
||||
@ptr :buffer ( next byte to write in buffer )
|
||||
@done 00
|
||||
|
||||
@println ( s* -> )
|
||||
&loop LDAk #00 EQU ,&eof JCN ( did we reach \0 ? )
|
||||
|
@ -22,11 +27,15 @@
|
|||
&eof #0a emitt POP2 JMP2r ( yes so emit \n and return )
|
||||
|
||||
@r-read-stdin ( -> )
|
||||
#12 DEI #00 EQU ,&finishing JCN ( did we read 00 ? )
|
||||
#12 DEI #0a EQU ,&execute JCN ( did we read \n ? )
|
||||
#12 DEI ;ptr LDA2 STA ( no, so save in buffer )
|
||||
;ptr LDA2k INC2 SWP2 STA2 ( ptr++ )
|
||||
BRK ( return )
|
||||
|
||||
&finishing
|
||||
#01 ;done STA ( this will be our last iteration )
|
||||
|
||||
&execute ( we saw a newline, so do something )
|
||||
#00 ;ptr LDA2 STA ( null terminate str )
|
||||
;buffer ;ptr STA2 ( reset ptr )
|
||||
|
@ -37,7 +46,9 @@
|
|||
;buffer ;regex LDA2 ;search JSR2 ( search line for a regex match )
|
||||
#00 EQU ,&no-match JCN ( did we match? )
|
||||
;buffer ;println JSR2 ( print any match )
|
||||
&no-match BRK ( return )
|
||||
&no-match ;done LDA ,&exit JCN ( ready to exit? )
|
||||
&return BRK ( no, return )
|
||||
&exit #01 #0f DEO BRK ( yes, exit )
|
||||
|
||||
( regex is unset )
|
||||
&need-regex ;buffer ;compile JSR2 ( compile regex )
|
||||
|
|
151
regex.tal
151
regex.tal
|
@ -1,11 +1,35 @@
|
|||
( regex.tal )
|
||||
( )
|
||||
( compiles regex expression strings into regex nodes, then uses )
|
||||
( regex ndoes to match input strings. )
|
||||
( regex nodes to match input strings. )
|
||||
( )
|
||||
( this currently only supports matching an entire string, as )
|
||||
( opposed to searching for a matching substring, or extracting )
|
||||
( matching subgroups. )
|
||||
( two methods are currently supported: )
|
||||
( )
|
||||
( 1. match )
|
||||
( )
|
||||
( when matching the regex must match the entire string. this means )
|
||||
( that it is unnecessary to use ^ and $ when matching, since their )
|
||||
( effect is implied. it also means that dot nodes will match )
|
||||
( any characters at all including newlines. )
|
||||
( )
|
||||
( match returns 01 if the string was matched and 00 otherwise. )
|
||||
( )
|
||||
( 2. search )
|
||||
( )
|
||||
( when searching the regex attempts to find matching substrings )
|
||||
( in the given string. this means that after successfully finding )
|
||||
( a match, search may be called on the remaining substring to find )
|
||||
( more matches. )
|
||||
( )
|
||||
( when searching, ^ matches the beginning of the string OR a line. )
|
||||
( $ matches the end of a line OR the end of the entire string. )
|
||||
( (^ and $ currently match only at string boundaries.) the dot nodes will )
|
||||
( not match newline characters, which must be matched explicitly. )
|
||||
( )
|
||||
( search returns 01 if the string was matched and 00 otherwise. )
|
||||
( additionally, the @search-start and @search-end addresses will )
|
||||
( contain the starting location and match boundary of the matching )
|
||||
( substring. )
|
||||
( )
|
||||
( regex node types: )
|
||||
( )
|
||||
|
@ -17,6 +41,9 @@
|
|||
( star matches expr zero-or-more times [ #05 r* next* ] )
|
||||
( (NOTE: r.expr.next must be r) )
|
||||
( )
|
||||
( caret matches start of line/string [ #06 next* ] )
|
||||
( dollar matches end of line/string [ #07 next* ] )
|
||||
( )
|
||||
( `or` and `star` have the same structure and are handled by the )
|
||||
( same code (;do-or). however, the node types are kept different )
|
||||
( to make it clearer how to parse and assemble the nodes. )
|
||||
|
@ -28,7 +55,6 @@
|
|||
( regexes should not include nulls and cannot match them (other )
|
||||
( than the null which signals the end of a string). )
|
||||
|
||||
%null? { #00 EQU }
|
||||
%debug { #ff #0e DEO }
|
||||
%emit { #18 DEO }
|
||||
%space { #20 emit }
|
||||
|
@ -74,13 +100,24 @@
|
|||
( )
|
||||
( returns true if the string matches, and false otherwise. )
|
||||
@match ( str* regex* -> bool^ )
|
||||
#01 ;match-multiline STA
|
||||
#00 ;search-mode STA
|
||||
;reset-stack JSR2
|
||||
;loop JMP2
|
||||
|
||||
( )
|
||||
@search ( str* regex* -> bool^ )
|
||||
#01 ;search-mode STA STH2 ( s* [r*] )
|
||||
#00 ;match-multiline STA
|
||||
#01 ;search-mode STA
|
||||
;_search JMP2
|
||||
|
||||
@search-multiline ( str* regex* -> bool^ )
|
||||
#01 ;match-multiline STA
|
||||
#01 ;search-mode STA
|
||||
;_search JMP2
|
||||
|
||||
@_search ( str* regex* -> bool^ )
|
||||
STH2 ( s* [r*] )
|
||||
DUP2 ;string-start STA2 ( s* [r*] )
|
||||
&loop LDAk #00 EQU ,&eof JCN ( s* [r*] )
|
||||
;reset-stack JSR2 ( s* [r*] )
|
||||
DUP2 ;search-start STA2 ( s* [r*] )
|
||||
|
@ -105,6 +142,8 @@
|
|||
LDAk #03 EQU ;do-literal JCN2
|
||||
LDAk #04 EQU ;do-or JCN2
|
||||
LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
|
||||
LDAk #06 EQU ;do-caret JCN2
|
||||
LDAk #07 EQU ;do-dollar JCN2
|
||||
;unknown-node-type ;error! JSR2
|
||||
|
||||
( used when we hit a dead-end during matching. )
|
||||
|
@ -118,7 +157,7 @@
|
|||
( follow the given address (next*) to continue matching )
|
||||
@goto-next ( str* next* -> bool^ )
|
||||
DUP2 #0000 GTH2 ,&has-next JCN
|
||||
POP2 LDAk null? ,&end-of-string JCN
|
||||
POP2 LDAk #00 EQU ,&end-of-string JCN
|
||||
;search-mode LDA ,&end-of-search JCN
|
||||
POP2 ;goto-backtrack JMP2
|
||||
&end-of-search DUP2 ;search-end STA2
|
||||
|
@ -134,8 +173,24 @@
|
|||
@do-dot ( str* regex* -> bool^ )
|
||||
INC2 LDA2 STH2 ( load and stash next )
|
||||
LDAk #00 NEQ ,&non-empty JCN ( is there a char? )
|
||||
&backtrack POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
|
||||
&non-empty LDAk #0a NEQ ,&match JCN ( yes, match unless \n in search-mode )
|
||||
;search-mode LDA ,&backtrack JCN ( if \n and search-mode, treat as EOF )
|
||||
&match INC2 STH2r ;goto-next JMP2 ( on match: inc s, restore and jump )
|
||||
|
||||
( TODO: support multi-line=0 )
|
||||
@do-caret ( str* regex* -> bool^ )
|
||||
INC2 LDA2 STH2 ( load and stash next )
|
||||
DUP2 ;string-start LDA2 EQU2 ,&at-start JCN ( at string start? )
|
||||
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
|
||||
&non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )
|
||||
&at-start STH2r ;goto-next JMP2 ( yes, goto next without advancing str )
|
||||
|
||||
( TODO: support multi-line=0 )
|
||||
@do-dollar ( str* regex* -> bool^ )
|
||||
INC2 LDA2 STH2 ( load and stash next )
|
||||
LDAk #00 EQU ,&at-end JCN ( at string end? )
|
||||
POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
|
||||
&at-end STH2r ;goto-next JMP2 ( yes, goto next without advancing str )
|
||||
|
||||
( handle literal -- match one specific character )
|
||||
@do-literal ( str* regex* -> bool^ )
|
||||
|
@ -158,11 +213,33 @@
|
|||
|
||||
( REGEX PARSING )
|
||||
|
||||
( do we match across lines? )
|
||||
( - should be true when matching )
|
||||
( - can be true or false when searching )
|
||||
( - affects syntax of . ^ and $ )
|
||||
@match-multiline $1
|
||||
|
||||
( are we in searching mode? )
|
||||
( - should be true when searching )
|
||||
( - should be false when matching )
|
||||
@search-mode $1
|
||||
|
||||
( )
|
||||
@string-start $2
|
||||
@search-start $2
|
||||
@search-end $2
|
||||
|
||||
(
|
||||
( used for subgroup match start/end addresses )
|
||||
@groups [ &s0 $2 &e0 $2 &s1 $2 &e1 $2
|
||||
&s2 $2 &e2 $2 &s3 $2 &e3 $2
|
||||
&s4 $2 &e4 $2 &s5 $2 &e5 $2
|
||||
&s6 $2 &e6 $2 &s7 $2 &e7 $2
|
||||
&s8 $2 &e8 $2 &s9 $2 &e9 $2 ]
|
||||
|
||||
( position of last finished group )
|
||||
@group-pos $2 )
|
||||
|
||||
( track the position in the input string )
|
||||
@pos $2
|
||||
|
||||
|
@ -206,6 +283,7 @@
|
|||
( c. substring matching, i.e. searching )
|
||||
( d. subgroup extraction )
|
||||
( e. back-references, e.g \1 )
|
||||
( f. non-capturing groups, e.g. (?:) )
|
||||
|
||||
( compile an expression string into a regex graph )
|
||||
( )
|
||||
|
@ -239,6 +317,8 @@
|
|||
DUP #00 EQU ;c-done JCN2
|
||||
DUP LIT '| EQU ;c-or JCN2
|
||||
DUP LIT '. EQU ;c-dot JCN2
|
||||
DUP LIT '^ EQU ;c-caret JCN2
|
||||
DUP LIT '$ EQU ;c-dollar JCN2
|
||||
DUP LIT '( EQU ;c-lpar JCN2
|
||||
DUP LIT ') EQU ;c-rpar JCN2
|
||||
DUP LIT '\ EQU ;c-esc JCN2
|
||||
|
@ -320,17 +400,44 @@
|
|||
;alloc-dot JSR2 ( dot )
|
||||
DUP2 ;c-peek-and-finalize JMP2
|
||||
|
||||
( TODO: escaping rules not quite right )
|
||||
( called when we read "^" )
|
||||
( )
|
||||
( allocates a caret-node and continues. )
|
||||
@c-caret ( c^ -> r2* )
|
||||
POP
|
||||
;alloc-caret JSR2 ( caret )
|
||||
DUP2 ;c-peek-and-finalize JMP2
|
||||
|
||||
( called when we read "$" )
|
||||
( )
|
||||
( allocates a dollar-node and continues. )
|
||||
@c-dollar ( c^ -> r2* )
|
||||
POP
|
||||
;alloc-dollar JSR2 ( dollar )
|
||||
DUP2 ;c-peek-and-finalize JMP2
|
||||
|
||||
( called when we read "\" )
|
||||
( )
|
||||
( allocates a literal of the next character. )
|
||||
( handles special sequences: \a \b \t \n \v \f \r )
|
||||
( )
|
||||
( this doesn't currently handle any special escape sequences. )
|
||||
( otherwise, allocates a literal of the next character. )
|
||||
@c-esc ( c^ -> r2* )
|
||||
POP
|
||||
;read JSR2
|
||||
;c-char JMP2
|
||||
POP ;read JSR2
|
||||
DUP LIT 'a EQU ,&bel JCN
|
||||
DUP LIT 'b EQU ,&bs JCN
|
||||
DUP LIT 't EQU ,&tab JCN
|
||||
DUP LIT 'n EQU ,&nl JCN
|
||||
DUP LIT 'v EQU ,&vtab JCN
|
||||
DUP LIT 'f EQU ,&ff JCN
|
||||
DUP LIT 'r EQU ,&cr JCN
|
||||
&default ;c-char JMP2
|
||||
&bel POP #07 ,&default JMP
|
||||
&bs POP #08 ,&default JMP
|
||||
&tab POP #09 ,&default JMP
|
||||
&nl POP #0a ,&default JMP
|
||||
&vtab POP #0b ,&default JMP
|
||||
&ff POP #0c ,&default JMP
|
||||
&cr POP #0d ,&default JMP
|
||||
|
||||
( called when we read any other character )
|
||||
( )
|
||||
|
@ -375,6 +482,12 @@
|
|||
@alloc-dot ( -> r* )
|
||||
#02 ;alloc3 JMP2
|
||||
|
||||
@alloc-caret ( -> r* )
|
||||
#06 ;alloc3 JMP2
|
||||
|
||||
@alloc-dollar ( -> r* )
|
||||
#07 ;alloc3 JMP2
|
||||
|
||||
@alloc-lit ( c^ -> r* )
|
||||
#03 #0000 SWP2 ( 0000 c^ 03 )
|
||||
#04 ;alloc JSR2 ( 0000 c^ 03 addr* )
|
||||
|
@ -477,7 +590,10 @@
|
|||
&!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
|
||||
&!3 LDAk #04 NEQ ,&!4 JCN INC2 ;set-next-addr JSR2 JMP2r
|
||||
&!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
|
||||
&!5 ;unknown-node-type ;error! JSR2
|
||||
( &!5 ;unknown-node-type ;error! JSR2 )
|
||||
&!5 LDAk #06 NEQ ,&!6 JCN INC2 ;set-next-addr JSR2 JMP2r
|
||||
&!6 LDAk #07 NEQ ,&!7 JCN INC2 ;set-next-addr JSR2 JMP2r
|
||||
&!7 ;unknown-node-type ;error! JSR2
|
||||
|
||||
@set-next-or-addr ( target* addr* -> )
|
||||
LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
|
||||
|
@ -524,6 +640,7 @@
|
|||
STH2r STH2r ( restore str and regex )
|
||||
JMP2r
|
||||
|
||||
(
|
||||
( -> size^ )
|
||||
@frame-size
|
||||
#00 STH ;stack-pos LDA2
|
||||
|
@ -531,7 +648,7 @@
|
|||
#0004 SUB2 LDA2k #ffff EQU2 ,&done JCN
|
||||
INCr ,&loop JMP
|
||||
&done
|
||||
STHr JMP2r
|
||||
STHr JMP2r )
|
||||
|
||||
( reset stack pointers )
|
||||
@reset-stack ( -> )
|
||||
|
|
|
@ -8,23 +8,35 @@
|
|||
;emit-stack JSR2 nl
|
||||
;emit-arena JSR2 nl
|
||||
|
||||
;test1 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test2 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test3 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test4 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test5 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test6 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test7 OVR2k ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test8 OVR2k ;match JSR2 ;emit-byte JSR2 nl
|
||||
LIT '= ;emit JSR2 sp
|
||||
#01 ;emit-bool JSR2 sp
|
||||
#01 ;emit-bool JSR2 sp
|
||||
#00 ;emit-bool JSR2 sp
|
||||
#01 ;emit-bool JSR2 sp
|
||||
#01 ;emit-bool JSR2 sp
|
||||
#00 ;emit-bool JSR2 sp
|
||||
#00 ;emit-bool JSR2 sp
|
||||
#00 ;emit-bool JSR2 nl
|
||||
|
||||
;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
|
||||
;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 nl
|
||||
LIT 'A ;emit JSR2 sp
|
||||
;test1 OVR2k ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test2 OVR2k ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test3 OVR2k ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test4 OVR2k ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test5 OVR2k ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test6 OVR2k ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test7 OVR2k ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test8 OVR2k ;match JSR2 ;emit-bool JSR2 nl
|
||||
|
||||
LIT 'B ;emit JSR2 sp
|
||||
;test1 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test2 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test3 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test4 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test5 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test6 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test7 ;graph1 ;match JSR2 ;emit-bool JSR2 sp
|
||||
;test8 ;graph1 ;match JSR2 ;emit-bool JSR2 nl
|
||||
|
||||
;reset-arena JSR2
|
||||
exit
|
||||
|
@ -54,6 +66,9 @@
|
|||
|
||||
~regex.tal
|
||||
|
||||
@emit ( c^ -- )
|
||||
emit JMP2r
|
||||
|
||||
@emit-short ( short* -- )
|
||||
SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r
|
||||
|
||||
|
@ -62,6 +77,9 @@
|
|||
&hex #30 ADD DUP #39 GTH #27 MUL ADD emit
|
||||
JMP2r
|
||||
|
||||
@emit-bool ( byte^ -- )
|
||||
LIT '0 ADD emit JMP2r
|
||||
|
||||
( print stack size, followed by contents )
|
||||
@emit-stack ( -> )
|
||||
space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit
|
||||
|
|
Loading…
Reference in New Issue