From cae3ce4a7c1380ab1972f08ad0eea8b8d42e9b46 Mon Sep 17 00:00:00 2001 From: d6 Date: Fri, 15 Apr 2022 01:40:52 -0400 Subject: [PATCH] commit asma.rom --- asma.rom | Bin 0 -> 3095 bytes regex.tal | 123 +++++++++++++++++++++++++++++++++++-------------- repl-regex.tal | 12 +++-- 3 files changed, 96 insertions(+), 39 deletions(-) create mode 100644 asma.rom diff --git a/asma.rom b/asma.rom new file mode 100644 index 0000000000000000000000000000000000000000..8e8c264e9700295b2bfc63106ab7c11c530222ca GIT binary patch literal 3095 zcma)8Z)_Y#6`$Mn-rn8L?xo3HFW30$-96u(le%kT@6~onlz$h3Q`_hIPc5aY%GtL` zt!{3^*-0Eqk_UW!Q3O)d36Y4EqFP8sRrt~fA&mG^kRk%8&_o~tg2fr1U5e|Lo5P{&Qx%S{3IV>Q%?pW zxN_D7rI&CGLRjda!$R>CPOhY~u++yw-%c?lQ)J4vB9rGFrj9Ws%}JNZYYtP|+zcGl znY@Ioi3Tf!ODY=n{!=o8%cCxPKn9TKqPoqbpE^v6;JeH5S?C9j4@=K5*};X77?cQl z+U2mF=@c{ZMUP^AgMO?*JIITrQ@-iX);yD777`F@1rmX{&Xicgh?L{H4VgJJIAYEP zC+ahvITKVpC)QAk2&KHp3}$6Oc$YpaoZcclU(KcS4O`Qv=RINT6|ZvKJTd20J56FX zR*%#@Zr8mCl5YE1_{d<#z7W{=mHxFe2 z|H9DH1_?ufcq{0J%&D-!34%gfW!R~qA{4fwsLyQaNfthnWc&E|l_dKjA1@?fohjiY zqUtgp8bZd#x4pEq!4dwO#bCH)^f(NZ9lhf z>^bu_v%A|oW9|uRPuIOUZ{9>EY8Kcn)P6t;0|nS@u<-j$U=rQtkI#)4x(aIu=w;~X z?OKNJ-YkstLe9utY0txV`rup7c-T7Oe~f0jyHg$KO>%QXa1M zL9z!m@&j;}a!TbCPJE1`E8m~TSUTc7n(gIQrs&Ds@*-Foo{t3|(W{^%8ubJWV`xE; z^A{l&1(Ht&P%|3Ul-IF(U{8+)pX_A{O<>tt%kX_F=AkqWNvj<-CEH$ON~9Q|L*E9X zs%T`pAxQFnJecly5&s@5E(&y7%Qu#R+S5!q z*6(&|tJ;(n#GX)BBfA6|BDxHpiD9fhbf2TjtSs>o>}`NN54eLYJl$UxuD2o|yzB z1HYp()HkS~Q+PGXD;zmL!EInn3LG|I3Y~6oVH4_L5o|%!Cs!;hSH$>}E0~&rToD+! z27JN*2Lr<+*}|EqBk9tkQUHbl+J*@H3Q?4Fv3%UOFo{dB1^4rWVZq-h(~`UnZCmnf zt66KQB4m29Y<;~06SSNREHHX~gzh!bp4;J1lo0F=SNre|aQSucfWX^*exscskuCJ+ z7GZ13x(fFZ&{zWSkDg0#zc=Z_hdq}pxZ7LAbGy?6AzHbye|gfnet&oxZlV)RI#cdB z86KfxEQS!iAaUencnf%h|5w3>P+HC<;rBX|)}Q2JSX)9YoPcLhqMaWUBFSbNCaVpT zg^fDL+X_>^>^vN}Q}t^Ow0!Ic5+Bd`3^uj$Yq2TmV>RGq$rAtOR5TbHEQ8;34?Rd& zEFI`qFgsk34Q#9;sxA5oKa>tGt?lN~ArAgme4Hna5kp7B3X_=H2yGZox)_!6I^Qmf zeN!B&kIs`EA#pds!t0%U^02uh?O1S+0^>87!UlP%6Gm}Zav-Fm66TOZ(2koCH+Sr9 zkwJJ`U}*pgf+>kadH|~lp2F6)er`uOBXS*E6_VT%MHu4!2^^vw@>Fyi@s&eVIel z>d!G{ha>XCZ@u4A+qYTlZe!~jwg`k<)5Kciv!RKe#wQXL;PwY=CoBGJ)qKTUs8qd~ z*<%%-?5$Su9-A|3!Mr(JGe76m1M_(0sJFogPR!5ERz0&`sp0BSW!m$F@R8s|tx6_q zb#Ep(TARg>4s$+u)vJ-=$_(15_;s&RJ#Bgmvvc!vWY{|v)K8niNw0pYK0EJ`F>mHX zeQq|WnZ9=tP0j~_S@pc*O%HXi>K(z)s4C%42T_-_G9lZE`r{_OqP;BW+uBj>)N4dt zUDAvi2_8 zzs=V>N%T4GO>NB~(GE`UBvDIykC10cbenclyR(NxL|`wXU+n{`zSQ*3*SndbUg5?i20X?bXHf}10QBm2cTS{C7w(&Q+icj&>9!3I4(h4eZtlwTMlw2v;2 zl^c2oo}UzYBL|)*bd8=rIPqyg4>iX_!>HFrPaWDnLg+bq<lQ1DJe#_LfS7zDakFPwpc$pf3+k h4iUPltEIsg2)&8?k&=)rkD-1SojOo%`v24f`4`xPLq-4q literal 0 HcmV?d00001 diff --git a/regex.tal b/regex.tal index c428c57..566ba5c 100644 --- a/regex.tal +++ b/regex.tal @@ -23,8 +23,12 @@ ( ) ( when searching, ^ matches the beginning of the string OR a line. ) ( $ matches the end of a line OR the end of the entire string. ) -( (the ^ and $ operators aren't yet supported.) the dot nodes will ) -( not match newline characters, which must be matched explicitly. ) +( the dot nodes will not match newline characters, which must be ) +( matched explicitly. ) +( ) +( finally, search-multiline will cause ^ and $ to use the matching ) +( behavior (i.e. only matching the beginning or end of a string). ) +( however dot nodes will still not match newline characters. ) ( ) ( search returns 01 if the string was matched and 00 otherwise. ) ( additionally, the @search-start and @search-end addresses will ) @@ -44,6 +48,10 @@ ( dollar matches end of line/string [ #07 next* ] ) ( lpar starts subgroup region [ #08 i* next* ] ) ( rpar ends subgroup region [ #09 i* next* ] ) +( class character class, e.g. [a-z] [ #0a next* n^ ... ] ) +( (NOTE: n is the number of pairs in ...) ) +( nclass negative class, e.g. [^a-z] [ #0b next* n^ ... ] ) +( (NOTE: n is the number of pairs in ...) ) ( ) ( `or` and `star` have the same structure and are handled by the ) ( same code (;do-or). however, the node types are kept different ) @@ -93,7 +101,7 @@ LIT '! emit! space &loop LDAk #00 EQU ,&done JCN LDAk emit! INC2 ,&loop JMP - &done POP2 newline #010f DEO BRK + &done POP2 newline #ff0e DEO #010f DEO BRK ( error messages ) @unknown-node-type "unknown 20 "node 20 "type 00 @@ -159,7 +167,9 @@ LDAk #07 EQU ;do-dollar JCN2 LDAk #08 EQU ;do-lpar JCN2 LDAk #09 EQU ;do-rpar JCN2 - LDAk ;unknown-node-type ;error!! JSR2 + LDAk #0a EQU ;do-ccls JCN2 + LDAk #0b EQU ;do-ncls JCN2 + LDAk #dd ;unknown-node-type ;error!! JSR2 ( used when we hit a dead-end during matching. ) ( ) @@ -244,6 +254,36 @@ LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking ) LDA2 ;loop JMP2 ( continue on left branch ) +@matches-cls ( str* regex* -> bool^ ) + OVR2 LDA ,¬-null JCN + ( needs to have a character to match ) + POP2 POP2 ;goto-backtrack JMP2 + ¬-null + DUP2 INC2 LDA2 STH2 ( str regex [next] ) + OVR2 INC2 STH2 ( str regex [str+1 next] ) + SWP2 LDA STH ( regex [c str+1 next] ) + #0003 ADD2 LDAk #00 SWP #0002 MUL2 ( r+3 len*2 [c str+1 next] ) + SWP2 INC2 STH2k ADD2 STH2r ( r+4+len*2 r+4 [c str+1 next] ) + &loop ( limit addr [c str+1 next] ) + EQU2k ,&missing JCN + LDAk STHkr GTH ,&next1 JCN INC2 + LDAk STHkr LTH ,&next2 JCN ,&found JMP + &next1 INC2 + &next2 INC2 ,&loop JMP + &missing POP2 POP2 POPr ,&negated LDR ,&match JCN + &no-match POP2r POP2r ;goto-backtrack JMP2 + &found POP2 POP2 POPr ,&negated LDR ,&no-match JCN + &match STH2r STH2r ;goto-next JMP2 + [ &negated $1 ] + +( ) +@do-ccls ( str* regex* -> bool^ ) + #00 ,matches-cls/negated STR ,matches-cls JMP + +( ) +@do-ncls ( str* regex* -> bool^ ) + #01 ,matches-cls/negated STR ,matches-cls JMP + ( REGEX PARSING ) ( do we match across lines? ) @@ -343,6 +383,8 @@ DUP LIT '$ EQU ;c-dollar JCN2 DUP LIT '( EQU ;c-lpar JCN2 DUP LIT ') EQU ;c-rpar JCN2 + DUP LIT '[ EQU ;c-lbrk JCN2 + DUP LIT '] EQU ;c-rbrk JCN2 DUP LIT '\ EQU ;c-esc JCN2 DUP LIT '* EQU ;c-star JCN2 DUP LIT '+ EQU ;c-plus JCN2 @@ -414,6 +456,42 @@ ;c-peek-and-finalize JMP2 &mismatched-parens ;mismatched-parens ;error!! JSR2 +( doesn't support weird things like []abc] or [-abc] or similar. ) +( doesn't currently handle "special" escapes such as \n ) +@c-lbrk ( c^ -> r2* ) + POP LITr 00 ;pos LDA2 ( pos [0] ) + LDAk LIT '^ NEQ ,&normal JCN INCr INC2 ( pos [negated?^] ) + &normal + #0a STHr ADD ( src* type^ ) + ;arena-pos LDA2 STH2k ( src* type^ dst* [dst*] ) + STA LIT2r 0004 ADD2r ( src* [dst+4] ) + &left-parse ( src* [dst*] ) + LDAk LIT '] EQU ,&done JCN + LDAk LIT '- EQU ,&error JCN + LDAk LIT '\ NEQ ,&left JCN INC2 + &left + LDAk STH2kr STA INC2r + DUP2 INC2 LDA LIT '- NEQ ,&pre-right JCN INC2 INC2 + LDAk LIT '] EQU ,&error JCN + LDAk LIT '- EQU ,&error JCN + &pre-right + LDAk LIT '\ NEQ ,&right JCN INC2 + &right + LDAk STH2kr STA INC2 INC2r ,&left-parse JMP + &done ( src* [dst*] ) + INC2 ;pos STA2 STH2r ( dst* ) + DUP2 ;arena-pos LDA2 ( dst dst a ) + #0004 ADD2 SUB2 #0002 DIV2 NIP ( dst (dst-(a+4))/2 ) + ;arena-pos LDA2 STH2k #0003 ADD2 STA ( dst [a] ) + ;arena-pos STA2 STH2r ( a ) + DUP2 ;c-peek-and-finalize JMP2 + &error + #abcd #0000 DIV ( TODO error here ) + +@c-rbrk ( c^ -> r2* ) + POP + #0000 DIV ( invariant: should never be seen ) + ( called when we read "." ) ( ) ( allocates a dot-node and continues. ) @@ -492,8 +570,8 @@ ( ALLOCATING REGEX NDOES ) @rx-node-sizes - ( 00 01 02 03 04 05 06 07 08 09 ) - [ 00 03 03 04 ] [ 05 05 03 03 ] [ 05 05 ] + ( 00 01 02 03 04 05 06 07 08 09 0a 0b ) + [ 00 03 03 04 ] [ 05 05 03 03 ] [ 05 05 00 00 ] @alloc3 ( mode^ -> r* ) #0000 ROT ( 00 00 mode^ ) @@ -612,11 +690,13 @@ ( you can end up double-appending things. ) @set-next ( target* regex* -> ) LDAk #01 LTH ,&unknown JCN - LDAk #09 GTH ,&unknown JCN + LDAk #0b GTH ,&unknown JCN + LDAk #09 GTH ,&cc JCN LDAk #00 SWP ;rx-node-sizes ADD2 LDA #00 SWP ADD2 #0002 SUB2 ;set-next-addr JMP2 - &unknown LDAk ;unknown-node-type ;error!! JSR2 + &cc INC2 ;set-next-addr JMP2 + &unknown LDAk #ee ;unknown-node-type ;error!! JSR2 @set-next-or-addr ( target* addr* -> ) LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN @@ -776,30 +856,3 @@ @subgroup-pos :subgroup-bot ( the position of the first unallocated subgroup ) @subgroup-bot $400 @subgroup-top ( holds up to 256 subgroups (1024 bytes) ) - -( INTERVAL OPERATIONS ) -( ) -( not baked yet ) -( -@min ( first* last* -> min-addr* ) - SWP2 STH2k ,&incr JMP ( last first [first] ) - &loop LDAk LDAkr STHr LTH ,&replace JCN ,&incr JMP ( last a [c] ) - &replace POP2r STH2k ( last a [a] ) - &incr EQUk ,&done JCN INC2 ,&loop JMP ( last a+1 [c] ) - &done POP2 POP2 STH2r JMP2r ( c ) - -@sort ( first* last* -> ) - SWP2 ( last first ) - &loop ;min JSR2 NEQk ,&swap JCN POP2 ,&incr JMP - &swap STH2 LDA2k ( last first fx [min] ) STH2kr STA STH2r SWP2 ( last min first ) - STH2 LDA2 ( last mx [first] ) STH2kr STA STH2r ( last first ) - &incr EQUk ,&done JCN INC2 ,&loop JMP - &done POP2 POP2 JMP2r - -@iv-in-range ( c^ b0^ b1^ -> bool^ ) - ROT STHk LTH ,&above JCN - STHr GTH ,&below JCN #01 JMP2r - &above POPr POP &below #00 JMP2r - -@iv-find ( c^ iv* -> bool^ ) - ) diff --git a/repl-regex.tal b/repl-regex.tal index 661533e..e436086 100644 --- a/repl-regex.tal +++ b/repl-regex.tal @@ -52,8 +52,8 @@ #00 ;ptr LDA2 STA ( null terminate string ) ;ptr LDA2 ;buffer EQU2 STH ( stash is-empty? ) ;buffer ;ptr STA2 ( reset ptr ) -( ;buffer ;regex LDA2 ;rx-match JSR2 ( match regex ) ) - ;buffer ;regex LDA2 ;rx-search JSR2 ( search regex ) + ;buffer ;regex LDA2 ;rx-match JSR2 ( match regex ) +( ;buffer ;regex LDA2 ;rx-search JSR2 ( search regex ) ) ;emit-byte JSR2 nl ( print result ) STHr ,&was-empty JCN ;s-prompt ;println JSR2 @@ -102,8 +102,12 @@ DUP2 ;emit-short JSR2 LIT ': emit! space LDAk #01 LTH ,&uu JCN - LDAk #09 GTH ,&uu JCN + LDAk #0b GTH ,&uu JCN + LDAk #09 GTH ,&cc JCN LDAk #00 SWP ;rx-node-sizes ADD2 LDA ;emit-n JSR2 ,&loop JMP + &cc + DUP2 #0003 ADD2 LDA #02 MUL #04 ADD + ;emit-n JSR2 ,&loop JMP &uu - ;unknown-node-type ;error!! JSR2 + LDAk #cc ;unknown-node-type ;error!! JSR2