diff --git a/math32.tal b/math32.tal
index 87ffd7c..0afb5df 100644
--- a/math32.tal
+++ b/math32.tal
@@ -27,7 +27,7 @@
 (   div32           x** y** -> q**      x / y                     )
 (   mod32           x** y** -> r**      x % y                     )
 (   divmod32        x** y** -> q** r**  x / y, x % y              )
-(   gcd32           x** y** -> z**      gcd(x, y)                 )
+(   gcd32           x** y** -> z**      gcd[x, y]                 )
 (   negate32        x**     -> z**      -x                        )
 (   lshift32        x** n^  -> z**      x<<n                      )
 (   rshift32        x** n^  -> z**      x>>n                      )
@@ -43,19 +43,12 @@
 (   gt32            x** y** -> bool^    x > y                     )
 (   lteq32          x** y** -> bool^    x <= y                    )
 (   gteq32          x** y** -> bool^    x >= y                    )
-(   bitcount8       x^      -> bool^    floor(log2(x))+1          )
-(   bitcount16      x*      -> bool^    floor(log2(x))+1          )
-(   bitcount32      x**     -> bool^    floor(log2(x))+1          )
+(   bitcount8       x^      -> bool^    floor[log2[x]]+1          )
+(   bitcount16      x*      -> bool^    floor[log2[x]]+1          )
+(   bitcount32      x**     -> bool^    floor[log2[x]]+1          )
 (                                                                 )
-( In addition to the code this file uses 44 bytes of registers    )
-( to store temporary state:                                       )
-(                                                                 )
-(   - shared memory, 16 bytes                                     )
-(   - mul32 memory, 12 bytes                                      )
-(   - z_divmod32 memory, 16 bytes                                  )
-
-( bitcount: number of bits needed to represent number )
-( equivalent to floor[log2[x]] + 1 )
+( bitcount: number of bits needed to represent the number.        )
+( this is equivalent to floor[log2[x]] + 1 )
 
 @bitcount8 ( x^ -> n^ )
     LITr 00 &loop DUP ?{ POP STHr JMP2r } #01 SFT INCr !&loop
@@ -66,7 +59,7 @@
 @bitcount32 ( x** -> n^ )
     SWP2 bitcount16 DUP ?{ POP !bitcount16 } #10 NIP2 ADD JMP2r
 
-( equality )
+( -- equality )
 
 ( x == y )
 @eq32 ( xhi* xlo* yhi* ylo* -> bool^ )
@@ -84,7 +77,7 @@
 @non-zero32 ( x** -> bool^ )
     ORA2 ORA JMP2r
 
-( comparisons )
+( -- comparisons )
 
 ( x < y )
 @lt32 ( x** y** -> bool^ )
@@ -102,10 +95,10 @@
 @gteq32 ( x** y** -> bool^ )
     ROT2 SWP2 LTH2 ?{ LTH2 #00 EQU JMP2r } GTH2 JMP2r
 
-( bitwise operations )
+( -- bitwise operations )
 
 ( x & y )
-@and32 ( xhi* xlo* yhi* ylo* -> xhi|yhi* xlo|ylo* )
+@and32 ( xhi* xlo* yhi* ylo* -> xhi&yhi* xlo&ylo* )
     ROT2 AND2 STH2 AND2 STH2r JMP2r
 
 ( x | y )
@@ -113,14 +106,14 @@
     ROT2 ORA2 STH2 ORA2 STH2r JMP2r
 
 ( x ^ y )
-@xor32 ( xhi* xlo* yhi* ylo* -> xhi|yhi* xlo|ylo* )
+@xor32 ( xhi* xlo* yhi* ylo* -> xhi^yhi* xlo^ylo* )
     ROT2 EOR2 STH2 EOR2 STH2r JMP2r
 
 ( ~x )
-@complement32 ( x** -> ~x** )
+@complement32 ( x** -> ~xhi* ~xlo* )
     SWP2 #ffff EOR2 SWP2 #ffff EOR2 JMP2r
 
-( bit shifting )
+( -- bit-shifting )
 
 ( x >> n )
 @rshift32 ( x** n^ -> x>>n )
@@ -181,7 +174,7 @@
     #18 SUB #40 SFT ( stash [n-24]<<4 )
     SFT NIP2 NIP #0000 #00 JMP2r
 
-( arithmetic )
+( -- arithmetic )
 
 ( x + y )
 @add32 ( xhi* xlo* yhi* ylo* -> zhi* zlo* )
@@ -228,70 +221,57 @@
     LIT2 [ &z0 $2 ] ADD2 ( sum += [x0*y1+x1*y0]<<16 )
     LIT2 [ &z1 $2 ] JMP2r
 
+( x / y )
 @div32 ( x** y** -> q** )
     z_divmod32 ;z_divmod32/quo0 LDA2 ;z_divmod32/quo1 LDA2 JMP2r
 
+( x % y )
 @mod32 ( x** y** -> r** )
     z_divmod32 ;z_divmod32/rem0 LDA2 ;z_divmod32/rem1 LDA2 JMP2r
 
+( x / y, x % y )
 @divmod32 ( x** y** -> q** r** )
     z_divmod32
     ;z_divmod32/quo0 LDA2 ;z_divmod32/quo1 LDA2
     ;z_divmod32/rem0 LDA2 ;z_divmod32/rem1 LDA2
     JMP2r
 
-( calculate and store x / y and x % y )
+( private: calculate and store x / y and x % y )
 @z_divmod32 ( x** y** -> )
-    ( store y and x for repeated use )
-    ,&div1 STR2 ,&div0 STR2 ( y -> div )
-    ,&rem1 STR2 ,&rem0 STR2 ( x -> rem )
+    ( ; store y and x for repeated use )
+    #0000 DUP2 ,&quo0 STR2 ,&quo1 STR2      ( x** y**         ; quo<-0 )
+    STH2k ,&div1 STR2 STH2k ,&div0 STR2     ( x** [ylo* yhi*] ; div<-y )
+    OVR2 OVR2 ,&rem1 STR2 ,&rem0 STR2       ( x** [ylo* yhi*] ; rem<-x )
+    OVR2 OVR2 STH2r STH2r                   ( x** x** y** )
+    OVR2 OVR2 STH2 STH2                     ( x** x** y** [ylo* yhi*] )
+    gteq32 ?{ POP2 POP2 POP2r POP2r JMP2r } ( x** [ylo* yhi*] ; return if x < y )
 
-    ( if x < y then the answer is 0 )
-    ,&rem0 LDR2 ,&rem1 LDR2
-    ,&div0 LDR2 ,&div1 LDR2
-    lt32 ?&is-zero !&not-zero
-    &is-zero
-    #0000 ,&quo0 STR2 #0000 ,&quo1 STR2 JMP2r
-
-    ( x >= y so the answer is >= 1 )
-    &not-zero
-    #0000 ,&quo0 STR2 #0000 ,&quo1 STR2 ( 0 -> quo )
-
-    ( bitcount[x] - bitcount[y] determines the largest multiple of y to try )
-    ,&rem0 LDR2 ,&rem1 LDR2 bitcount32 ( rbits^ )
-    ,&div0 LDR2 ,&div1 LDR2 bitcount32 ( rbits^ dbits^ )
-    SUB ( shift=rbits-dits )
-    #00 DUP2 ( shift 0 shift 0 )
-
-    ( 1<<shift -> cur )
-    #0000 INC2k ROT2 POP
-    lshift32 ,&cur1 STR2 ,&cur0 STR2
-
-    ( div<<shift -> div )
-    ,&div0 LDR2 ,&div1 LDR2 ROT2 POP
-    lshift32 ,&div1 STR2 ,&div0 STR2
-
-    !&loop
-
-    [ &div0 $2 &div1 $2
-      &rem0 $2 &rem1 $2
-      &quo0 $2 &quo1 $2
-      &cur0 $2 &cur1 $2 ]
+    ( ; bitcount[x] - bitcount[y] determines largest multiple of y to try )
+    bitcount32 STH2r STH2r bitcount32 SUB ( shift=rbits-dits^ )
+    #00 DUP2                              ( shift^ 0^ shift^ 0^ )
+    #0000 INC2k ROT2 POP                  ( shift^ 0^ 0* 1* shift^ )
+    lshift32 ,&cur1 STR2 ,&cur0 STR2      ( shift^ 0^ ; cur<-1<<shift )
+    ,&div0 LDR2 ,&div1 LDR2 ROT2 POP      ( div** shift^ )
+    lshift32 ,&div1 STR2 ,&div0 STR2      ( ; div<-div<<shift )
 
     &loop
-    ( if rem >= the current divisor, we can subtract it and add to quotient )
-    ,&rem0 LDR2 ,&rem1 LDR2 ,&div0 LDR2 ,&div1 LDR2 lt32 ( is rem < div? )
-    ?&rem-lt ( if rem < div skip this iteration )
-
-    ( since rem >= div, we have found a multiple of y that divides x )
-    ,&rem0 LDR2 ,&rem1 LDR2 ,&div0 LDR2 ,&div1 LDR2 sub32 ,&rem1 STR2 ,&rem0 STR2 ( rem -= div )
-    ,&quo0 LDR2 ,&quo1 LDR2 ,&cur0 LDR2 ,&cur1 LDR2 add32 ,&quo1 STR2 ,&quo0 STR2 ( quo += cur )
-
-    &rem-lt
-    ,&div0 LDR2 ,&div1 LDR2 #01 rshift32 ,&div1 STR2 ,&div0 STR2 ( div >>= 1 )
-    ,&cur0 LDR2 ,&cur1 LDR2 #01 rshift32 ,&cur1 STR2 ,&cur0 STR2 ( cur >>= 1 )
-    ,&cur0 LDR2 ,&cur1 LDR2 non-zero32 ?&loop ( if cur>0, loop. else we're done )
-    JMP2r
+      ( ; if rem >= cur [current divisor], we can subtract it and add to quotient )
+      ( ; otherwise, skip that iteration and reduce cur. )
+      LIT2 [ &rem0 $2 ] LIT2 [ &rem1 $2 ] ,&div0 LDR2 ,&div1 LDR2
+      lt32 ?{
+        ( ; since rem >= div, we have found a multiple of y that divides x )
+        ,&rem0 LDR2 ,&rem1 LDR2             ( rem** )
+        LIT2 [ &div0 $2 ] LIT2 [ &div1 $2 ] ( rem** div** )
+        sub32 ,&rem1 STR2 ,&rem0 STR2       ( ; rem<-rem-div** )
+        LIT2 [ &quo0 $2 ] LIT2 [ &quo1 $2 ] ( quo** )
+        LIT2 [ &cur0 $2 ] LIT2 [ &cur1 $2 ] ( quo** cur** )
+        add32 ,&quo1 STR2 ,&quo0 STR2       ( ; quo<-quo+cur** )
+      }
+      ,&div0 LDR2 ,&div1 LDR2 #01 rshift32 ( div>>1** )
+      ,&div1 STR2 ,&div0 STR2              ( ; div<-div>>1 )
+      ,&cur0 LDR2 ,&cur1 LDR2 #01 rshift32 ( cur>>1** )
+      OVR2 OVR2 ,&cur1 STR2 ,&cur0 STR2    ( cur>>1** ; cur<-cur>>1 )
+      non-zero32 ?&loop JMP2r              ( ; loop if cur>0, else we're done )
 
 ( greatest common divisor - euclidean algorithm )
 @gcd32 ( x** y** -> z** )