nxu/math32.tal

( math32.tal                                                      )
(                                                                 )
( This library supports arithmetic on 32-bit unsigned integers,   )
( also known as long values.                                      )
(                                                                 )
( 32-bit long values are represented by two 16-bit short values:  )
(                                                                 )
(      decimal  hexadecimal  uxn literals                         )
(            0   0x00000000   #0000 #0000                         )
(            1   0x00000001   #0000 #0001                         )
(         4660   0x00001234   #0000 #1234                         )
(        65535   0x0000ffff   #0000 #ffff                         )
(        65536   0x00010000   #0001 #0000                         )
(     16777215   0x00ffffff   #00ff #ffff                         )
(   4294967295   0xffffffff   #ffff #ffff                         )
(                                                                 )
( The most significant 16-bit, the "high bits", are stored first. )
( We document long values as x** -- equivalent to xhi* xlo*.      )
(                                                                 )
( Operations supported:                                           )
(                                                                 )
(   NAME            STACK EFFECT        DEFINITION                )
(   add32           x** y** -> z**      x + y                     )
(   sub32           x** y** -> z**      x - y                     )
(   mul16           x*  y*  -> z**      x * y                     )
(   mul32           x** y** -> z**      x * y                     )
(   div32           x** y** -> q**      x / y                     )
(   mod32           x** y** -> r**      x % y                     )
(   divmod32        x** y** -> q** r**  x / y, x % y              )
(   gcd32           x** y** -> z**      gcd(x, y)                 )
(   negate32        x**     -> z**      -x                        )
(   lshift32        x** n^  -> z**      x<<n                      )
(   rshift32        x** n^  -> z**      x>>n                      )
(   and32           x** y** -> z**      x & y                     )
(   or32            x** y** -> z**      x | y                     )
(   xor32           x** y** -> z**      x ^ y                     )
(   complement32    x**     -> z**      ~x                        )
(   eq32            x** y** -> bool^    x == y                    )
(   ne32            x** y** -> bool^    x != y                    )
(   is-zero32       x**     -> bool^    x == 0                    )
(   non-zero32      x**     -> bool^    x != 0                    )
(   lt32            x** y** -> bool^    x < y                     )
(   gt32            x** y** -> bool^    x > y                     )
(   lteq32          x** y** -> bool^    x <= y                    )
(   gteq32          x** y** -> bool^    x >= y                    )
(   bitcount8       x^      -> n^       floor(log2(x))+1          )
(   bitcount16      x*      -> n^       floor(log2(x))+1          )
(   bitcount32      x**     -> n^       floor(log2(x))+1          )
(                                                                 )
( In addition to the code this file uses 44 bytes of registers    )
( to store temporary state:                                       )
(                                                                 )
(   - shared memory, 16 bytes                                     )
(   - mul32 memory, 12 bytes                                      )
(   - z_divmod32 memory, 16 bytes                                  )

( bitcount: number of bits needed to represent number )
( equivalent to floor[log2[x]] + 1 )

@bitcount8 ( x^ -> n^ )
    ( counts how many right shifts it takes to reduce x to zero )
    #00 STH                     ( x^ [n^] with n = 0 )
    &loop
        DUP ?&more              ( are any bits still set? )
        POP STHr JMP2r          ( no - drop x and return n )
    &more
        #01 SFT INCr !&loop     ( x >>= 1, n += 1 )

@bitcount16 ( x* -> n^ )
    ( counts how many right shifts it takes to reduce x to zero )
    #00 STH                     ( x* [n^] with n = 0 )
    &loop
        ORAk ?&more             ( hi|lo is non-zero while bits remain )
        POP2 STHr JMP2r         ( x is zero - drop it and return n )
    &more
        #01 SFT2 INCr !&loop    ( x >>= 1, n += 1 )

@bitcount32 ( x** -> n^ )
    ( the high short is counted first; the low short only matters )
    ( when the high short is zero )
    SWP2 bitcount16             ( xlo* nhi^ )
    DUP ?&high
    POP !bitcount16             ( hi is zero: tail-call bitcount16 on lo )
    &high
        #10 ADD NIP NIP JMP2r   ( hi is non-zero: result is nhi + 16 )

( equality )

( x == y )
@eq32 ( xhi* xlo* yhi* ylo* -> bool^ )
    ( x and y are equal exactly when x ^ y has no bits set )
    ROT2 EOR2 STH2              ( xhi* yhi* [xlo^ylo*] )
    EOR2 STH2r                  ( xhi^yhi* xlo^ylo* )
    ORA2 ORA #00 EQU JMP2r      ( 01 when all four bytes are zero, else 00 )

( x != y )
@ne32 ( xhi* xlo* yhi* ylo* -> bool^ )
    ( x and y differ exactly when x ^ y has at least one bit set )
    ROT2 EOR2 STH2              ( xhi* yhi* [xlo^ylo*] )
    EOR2 STH2r                  ( xhi^yhi* xlo^ylo* )
    ORA2 ORA #00 NEQ JMP2r      ( 01 when any byte is non-zero, else 00 )

( x == 0 )
@is-zero32 ( x** -> bool^ )
    ( fold all four bytes together; x is zero when the fold is zero )
    ORA2                        ( xhi|xlo* )
    ORA                         ( single byte, zero only when x is zero )
    #00 EQU JMP2r               ( 01 or 00 )

( x != 0 )
@non-zero32 ( x** -> bool^ )
    ( Returns 01 when any bit of x is set, 00 otherwise. )
    ( The folded byte is normalised with NEQ so the result is always 00 or )
    ( 01, like the other predicates in this file; returning the raw fold   )
    ( would give values such as 02, which break flag combination with AND. )
    ORA2 ORA                    ( single byte, non-zero when x is non-zero )
    #00 NEQ JMP2r               ( 01 or 00 )

( comparisons )

( x < y )
@lt32 ( x** y** -> bool^ )
    ( compare the low shorts first, then let the high shorts decide )
    ROT2 SWP2                   ( xhi* yhi* xlo* ylo* )
    LTH2 ?&lo-lt                ( branch when xlo < ylo )
    LTH2 JMP2r                  ( xlo >= ylo: true only when xhi < yhi )
    &lo-lt
        GTH2 #00 EQU JMP2r      ( xlo < ylo: true when xhi <= yhi )

( x <= y )
@lteq32 ( x** y** -> bool^ )
    ( compare the low shorts first, then let the high shorts decide )
    ROT2 SWP2                   ( xhi* yhi* xlo* ylo* )
    GTH2 ?&lo-gt                ( branch when xlo > ylo )
    GTH2 #00 EQU JMP2r          ( xlo <= ylo: true when xhi <= yhi )
    &lo-gt
        LTH2 JMP2r              ( xlo > ylo: true only when xhi < yhi )

( x > y )
@gt32 ( x** y** -> bool^ )
    ( compare the low shorts first, then let the high shorts decide )
    ROT2 SWP2                   ( xhi* yhi* xlo* ylo* )
    GTH2 ?&lo-gt                ( branch when xlo > ylo )
    GTH2 JMP2r                  ( xlo <= ylo: true only when xhi > yhi )
    &lo-gt
        LTH2 #00 EQU JMP2r      ( xlo > ylo: true when xhi >= yhi )

( x >= y )
@gteq32 ( x** y** -> bool^ )
    ( compare the low shorts first, then let the high shorts decide )
    ROT2 SWP2                   ( xhi* yhi* xlo* ylo* )
    LTH2 ?&lo-lt                ( branch when xlo < ylo )
    LTH2 #00 EQU JMP2r          ( xlo >= ylo: true when xhi >= yhi )
    &lo-lt
        GTH2 JMP2r              ( xlo < ylo: true only when xhi > yhi )

( bitwise operations )

( x & y )
@and32 ( xhi* xlo* yhi* ylo* -> xhi&yhi* xlo&ylo* )
    ( combine the high shorts first, then the low shorts )
    STH2 ROT2                   ( xlo* yhi* xhi* [ylo*] )
    AND2                        ( xlo* zhi* [ylo*] )
    STH2r ROT2                  ( zhi* ylo* xlo* )
    AND2 JMP2r                  ( zhi* zlo* )

( x | y )
@or32 ( xhi* xlo* yhi* ylo* -> xhi|yhi* xlo|ylo* )
    ( combine the high shorts first, then the low shorts )
    STH2 ROT2                   ( xlo* yhi* xhi* [ylo*] )
    ORA2                        ( xlo* zhi* [ylo*] )
    STH2r ROT2                  ( zhi* ylo* xlo* )
    ORA2 JMP2r                  ( zhi* zlo* )

( x ^ y )
@xor32 ( xhi* xlo* yhi* ylo* -> xhi^yhi* xlo^ylo* )
    ( combine the high shorts first, then the low shorts )
    STH2 ROT2                   ( xlo* yhi* xhi* [ylo*] )
    EOR2                        ( xlo* zhi* [ylo*] )
    STH2r ROT2                  ( zhi* ylo* xlo* )
    EOR2 JMP2r                  ( zhi* zlo* )

( ~x )
@complement32 ( x** -> ~x** )
    ( flip every bit, one short at a time, low short first )
    #ffff EOR2                  ( xhi* ~xlo* )
    SWP2 #ffff EOR2             ( ~xlo* ~xhi* )
    SWP2 JMP2r                  ( ~xhi* ~xlo* )

( bit shifting )

( x >> n )
@rshift32 ( x** n^ -> x>>n )
    ( Dispatches on the shift count; each helper handles an 8-bit window. )
    ( Counts of 32 or more shift every bit out, so the result is zero.    )
    ( They must not reach rshift32-3: it subtracts 24 and feeds the rest  )
    ( to SFT, which reads bits 4-7 of its operand as a left shift, so a   )
    ( count of 40 or more would produce a non-zero result.                )
    DUP #08 LTH ?shift32-0  ( n in 0-7 )
    DUP #10 LTH ?rshift32-1 ( n in 8-15 )
    DUP #18 LTH ?rshift32-2 ( n in 16-23 )
    DUP #20 LTH ?rshift32-3 ( n in 24-31 )
    POP POP2 POP2 #0000 #0000 JMP2r ( n >= 32: drop n and x, return 0 )

( shift by 0-7 bits; used by both lshift and rshift )
@shift32-0 ( x** n^ -> x>>n )
    ( n is handed to SFT2 unchanged, so this routine shifts right when n is  )
    ( 0-7 and shifts left when the caller has moved the count into the high  )
    ( nibble, as lshift32-0 does. The bytes x0 x1 x2 x3 are shifted as three )
    ( overlapping shorts. The pieces are ORed together in the literal slots  )
    ( z1 and z2 of the last line, which then push the result.                )
    ( NOTE: this self-modifying code is not reentrant.                       )

    ( shift x2x3 and store the short in z2 and the byte after it )
    STH DUP2 STHkr SFT2                       ,&z2 STR2
    ( shift x1x2: OR the low byte into z2, store the high byte in z1 )
    POP DUP2 STHkr SFT2 ,&z2 LDR ORA ,&z2 STR ,&z1 STR
    ( shift x0x1: OR the low byte into z1, leave the high byte on the stack )
    POP      STHr  SFT2 ,&z1 LDR ORA ,&z1 STR
    ( stack already holds byte 0; the literals push bytes 1, 2 and 3 )
    LIT [ &z1 $1 ] LIT2 [ &z2 $2 ] JMP2r

( shift right by 8-15 bits )
@rshift32-1 ( x** n^ -> x>>n )
    ( Shifting right by 8 or more discards x3, so only x0 x1 x2 take part. )
    ( The remaining n-8 bits are applied to two overlapping shorts whose   )
    ( results are ORed together in the literal slot z2 of the last line.   )
    ( NOTE: this self-modifying code is not reentrant.                     )
    #08 SUB STH                 ( stash [n-8] )
    ( drop x3; shift x1x2 and store it as the low short of the result )
    POP DUP2 STHkr SFT2 ,&z2 STR2
    ( drop x2; shift x0x1, OR its low byte into z2, keep its high byte )
    POP      STHr  SFT2 ,&z2 LDR ORA ,&z2 STR
    ( result bytes: 00, high byte from the stack, then the z2 short )
    #00 SWP LIT2 [ &z2 $2 ] JMP2r

( shift right by 16-23 bits )
@rshift32-2 ( x** n^ -> x>>n )
    ( only the high short survives; it lands in the low short of the result )
    #10 SUB                     ( x0 x1 x2 x3 m^ with m = n-16 )
    ROT ROT POP2                ( xhi* m^ - the low short is discarded )
    SFT2                        ( xhi>>m* )
    #0000 SWP2 JMP2r            ( 0000 xhi>>m* )

( shift right by 24-31 bits )
@rshift32-3 ( x** n^ -> x>>n )
    ( only the top byte survives; it lands in the lowest byte of the result )
    #18 SUB STH                 ( x0 x1 x2 x3 [m^] with m = n-24 )
    POP2 POP                    ( x0^ [m^] )
    STHr SFT                    ( x0>>m^ )
    #00 SWP #0000 SWP2 JMP2r    ( 0000 00 x0>>m )

( x << n )
@lshift32 ( x** n^ -> x<<n )
    ( Dispatches on the shift count; each helper handles an 8-bit window. )
    ( Counts of 32 or more shift every bit out, so the result is zero.    )
    ( They must not reach lshift32-3: it subtracts 24 and moves the rest  )
    ( into the high nibble, which drops bits for a count of 40 or more    )
    ( and would shift by the wrong amount.                                )
    DUP #08 LTH ?lshift32-0 ( n in 0-7 )
    DUP #10 LTH ?lshift32-1 ( n in 8-15 )
    DUP #18 LTH ?lshift32-2 ( n in 16-23 )
    DUP #20 LTH ?lshift32-3 ( n in 24-31 )
    POP POP2 POP2 #0000 #0000 JMP2r ( n >= 32: drop n and x, return 0 )

( shift left by 0-7 bits )
@lshift32-0 ( x** n^ -> x<<n )
    ( SFT reads its left-shift count from the high nibble, so the count is )
    ( moved up four bits before reusing the shared 0-7 bit routine         )
    #10 MUL                     ( n*16 is the same byte as n<<4 )
    !shift32-0

( shift left by 8-15 bits )
@lshift32-1 ( x** n^ -> x<<n )
    ( Shifting left by 8 or more discards x0 and leaves the lowest byte    )
    ( zero. The remaining n-8 bits are applied to two overlapping shorts   )
    ( whose results are ORed together in the literal slots z1 and z2 of    )
    ( the last line.                                                       )
    ( NOTE: this self-modifying code is not reentrant.                     )
    #08 SUB #40 SFT STH ( stash [n-8]<<4 )
    ( shift x2x3 and store the short in z1 and z2 )
        DUP2 STHkr SFT2 ,&z1 STR2
    ( drop x3; shift x1x2, OR its low byte into z1, keep its high byte )
    POP      STHr  SFT2 ,&z1 LDR ORA ,&z1 STR
    ( drop x0; result bytes: high byte from the stack, z1, z2, then 00 )
    NIP LIT2 [ &z1 $1 &z2 $1 ] #00 JMP2r

( shift left by 16-23 bits )
@lshift32-2 ( x** n^ -> x<<n )
    ( only the low short survives; it lands in the high short of the result )
    #10 SUB #40 SFT             ( xhi* xlo* m^ with m = [n-16]<<4 )
    SFT2                        ( xhi* xlo<<[n-16]* )
    NIP2                        ( the old high short is discarded )
    #0000 JMP2r                 ( zhi* 0000 )

( shift left by 24-31 bits )
@lshift32-3 ( x** n^ -> x<<n )
    ( only the lowest byte survives; it lands in the top byte of the result )
    #18 SUB #40 SFT             ( x0 x1 x2 x3 m^ with m = [n-24]<<4 )
    SFT STH                     ( x0 x1 x2 [x3<<[n-24]^] )
    POP2 POP                    ( the other three bytes are discarded )
    STHr #0000 #00 JMP2r        ( z0 00 00 00 )

( arithmetic )

( x + y )
@add32 ( xhi* xlo* yhi* ylo* -> zhi* zlo* )
    ( Adds the low shorts first. Copies of xlo and then zlo are kept on the )
    ( return stack, so GTH2r yields 01 when xlo > zlo, which happens only   )
    ( when the low addition wrapped around. That carry byte is widened to a )
    ( short and added to yhi, and the sum is added to xhi. The final SWP2   )
    ( puts the high short back in front of the low short.                   )
    ( Overflow beyond 32 bits is discarded.                                 )
    ROT2 STH2k ADD2 STH2k ROT2 ROT2 GTH2r #00 STHr ADD2 ADD2 SWP2 JMP2r

( -x )
@negate32 ( x** -> -x** )
    ( two's complement: flip every bit, then add one )
    complement32 INC2           ( ~xhi* ~xlo+1* )
    ORAk ?&done                 ( low short non-zero: no carry to propagate )
    SWP2 INC2 SWP2              ( low short wrapped to zero: carry into hi )
    &done
        JMP2r

( x - y )
@sub32 ( x** y** -> z** )
    ( Subtracts the low shorts first. Copies of xlo and then zlo are kept on )
    ( the return stack, so LTH2r yields 01 when xlo < zlo, which happens     )
    ( only when the low subtraction wrapped around. That borrow byte is      )
    ( widened to a short and added to yhi, and the sum is subtracted from    )
    ( xhi. The final SWP2 puts the high short back in front of the low one.  )
    ( The result wraps modulo 2^32 when y > x.                               )
    ROT2 STH2k SWP2 SUB2 STH2k ROT2 ROT2 LTH2r #00 STHr ADD2 SUB2 SWP2 JMP2r

( 16-bit multiplication )
@mul16 ( x* y* -> z** )
    ( Schoolbook multiplication on bytes. With x = x0 x1 and y = y0 y1,     )
    ( where x0 and y0 are the high bytes:                                   )
    (   x*y = x1*y1 + [x0*y1 + x1*y0]<<8 + [x0*y0]<<16                      )
    ( Two 32-bit partial sums are built in the literal slots near the end:  )
    (   z = 00 z1 z2 z3 holds x1*y1 + [x0*y1]<<8                            )
    (   w = w0 w1 w2 00 holds [x1*y0]<<8 + [x0*y0]<<16                      )
    ( and add32 produces the final result.                                  )
    ( NOTE: this self-modifying code is not reentrant.                      )
    ,&y1 STR ,&y0 STR ( save ylo, yhi )
    ,&x1 STR ,&x0 STR ( save xlo, xhi )
    #0000 ,&z1 STR ,&w0 STR ( reset z1 and w0 )

    ( x1 * y1 => z2z3 )
    LIT2 00 [ &x1 $1 ] LIT2 00 [ &y1 $1 ] MUL2 ,&z3 STR ,&z2 STR

    ( x0 * y1, plus the existing z1z2 => z1z2 )
    #00 ,&x0 LDR #00 ,&y1 LDR MUL2 ,&z1 LDR2 ADD2 ,&z1 STR2

    ( x1 * y0 => w1w2 )
    #00 ,&x1 LDR #00 ,&y0 LDR MUL2 ,&w2 STR ,&w1 STR

    ( x0 * y0, plus the existing w0w1 => w0w1 )
    LIT2 00 [ &x0 $1 ] LIT2 00 [ &y0 $1 ] MUL2 ,&w0 LDR2 ADD2 ,&w0 STR2

    ( push z as 00 z1 z2 z3 and w as w0 w1 w2 00, then add them )
    #00 LIT2 [ &z1 $1 &z2 $1 ] LIT [ &z3 $1 ]
    LIT2 [ &w0 $1 &w1 $1 ] LIT [ &w2 $1 ] #00
    !add32

( x * y )
@mul32 ( x** y** -> z** )
    ( With x = x0* x1* and y = y0* y1*, where x0 and y0 are the high shorts: )
    (   x*y mod 2^32 = x1*y1 + [[x0*y1 + x1*y0] mod 2^16]<<16                )
    ( mul16 supplies the full 32-bit x1*y1, which is parked in the literal   )
    ( slots z0 and z1 below. The two cross products are plain 16-bit MUL2    )
    ( results, added to the high short only.                                 )
    ( NOTE: this self-modifying code is not reentrant.                       )
    ROT2k ( x0* x1* y0* y1* y0* y1* x1* )
    mul16 ,&z1 STR2 ,&z0 STR2 POP2 ( x0* x1* y0* y1* ; sum = [x1*y1] )
    STH2 ROT2 STH2                 ( x1* y0* [y1* x0*] )
    MUL2r MUL2 STH2r ADD2          ( x1*y0+y1*x0* )
    ( [x0*y0]<<32 will completely overflow )
    LIT2 [ &z0 $2 ] ADD2 ( sum += [x0*y1+x1*y0]<<16 )
    LIT2 [ &z1 $2 ] JMP2r

@div32 ( x** y** -> q** )
    ( run the shared division, then fetch only the quotient it stored )
    z_divmod32
    ;z_divmod32/quo0 LDA2       ( qhi* )
    ;z_divmod32/quo1 LDA2       ( qhi* qlo* )
    JMP2r

@mod32 ( x** y** -> r** )
    ( run the shared division, then fetch only the remainder it stored )
    z_divmod32
    ;z_divmod32/rem0 LDA2       ( rhi* )
    ;z_divmod32/rem1 LDA2       ( rhi* rlo* )
    JMP2r

@divmod32 ( x** y** -> q** r** )
    ( run the shared division, then fetch the quotient and the remainder )
    z_divmod32
    ;z_divmod32/quo0 LDA2       ( qhi* )
    ;z_divmod32/quo1 LDA2       ( q** )
    ;z_divmod32/rem0 LDA2       ( q** rhi* )
    ;z_divmod32/rem1 LDA2       ( q** r** )
    JMP2r

( calculate and store x / y and x % y )
@z_divmod32 ( x** y** -> )
    ( Shift-and-subtract long division. Nothing is returned on the stack:   )
    ( the quotient is left in quo0/quo1 and the remainder in rem0/rem1,     )
    ( which div32, mod32 and divmod32 read back with absolute loads.        )
    ( The working state lives in the data block in the middle of this       )
    ( routine, so the code is not reentrant. That block sits between the    )
    ( setup code and the loop, presumably to keep every relative reference  )
    ( within range - keep this in mind before adding code here.             )
    ( NOTE [review]: y = 0 is not special-cased anywhere in this routine;   )
    ( confirm what callers expect before relying on the result.             )

    ( store y and x for repeated use )
    ,&div1 STR2 ,&div0 STR2 ( y -> div )
    ,&rem1 STR2 ,&rem0 STR2 ( x -> rem )

    ( if x < y then the answer is 0 )
    ,&rem0 LDR2 ,&rem1 LDR2
    ,&div0 LDR2 ,&div1 LDR2
    lt32 ?&is-zero !&not-zero
    &is-zero
    ( the quotient is 0 and rem already holds x, which is the remainder )
    #0000 ,&quo0 STR2 #0000 ,&quo1 STR2 JMP2r

    ( x >= y so the answer is >= 1 )
    &not-zero
    #0000 ,&quo0 STR2 #0000 ,&quo1 STR2 ( 0 -> quo )

    ( bitcount[x] - bitcount[y] determines the largest multiple of y to try )
    ,&rem0 LDR2 ,&rem1 LDR2 bitcount32 ( rbits^ )
    ,&div0 LDR2 ,&div1 LDR2 bitcount32 ( rbits^ dbits^ )
    SUB ( shift=rbits-dbits )
    #00 DUP2 ( shift 0 shift 0 )

    ( 1<<shift -> cur )
    ( ROT2 POP brings one [shift 0] pair to the top and drops its 0 byte )
    #0000 INC2k ROT2 POP
    lshift32 ,&cur1 STR2 ,&cur0 STR2

    ( div<<shift -> div )
    ,&div0 LDR2 ,&div1 LDR2 ROT2 POP
    lshift32 ,&div1 STR2 ,&div0 STR2

    !&loop

    ( working state: each value is a high short followed by a low short )
    [ &div0 $2 &div1 $2
      &rem0 $2 &rem1 $2
      &quo0 $2 &quo1 $2
      &cur0 $2 &cur1 $2 ]

    &loop
    ( if rem >= the current divisor, we can subtract it and add to quotient )
    ,&rem0 LDR2 ,&rem1 LDR2 ,&div0 LDR2 ,&div1 LDR2 lt32 ( is rem < div? )
    ?&rem-lt ( if rem < div skip this iteration )

    ( since rem >= div, the current multiple of y fits into the remainder )
    ,&rem0 LDR2 ,&rem1 LDR2 ,&div0 LDR2 ,&div1 LDR2 sub32 ,&rem1 STR2 ,&rem0 STR2 ( rem -= div )
    ,&quo0 LDR2 ,&quo1 LDR2 ,&cur0 LDR2 ,&cur1 LDR2 add32 ,&quo1 STR2 ,&quo0 STR2 ( quo += cur )

    &rem-lt
    ( move on to the next lower multiple: halve both div and cur )
    ,&div0 LDR2 ,&div1 LDR2 #01 rshift32 ,&div1 STR2 ,&div0 STR2 ( div >>= 1 )
    ,&cur0 LDR2 ,&cur1 LDR2 #01 rshift32 ,&cur1 STR2 ,&cur0 STR2 ( cur >>= 1 )
    ,&cur0 LDR2 ,&cur1 LDR2 non-zero32 ?&loop ( if cur>0, loop. else we're done )
    JMP2r

( greatest common divisor - euclidean algorithm )
@gcd32 ( x** y** -> z** )
    ( repeatedly replace [x, y] with [y, x mod y] until y is zero )
    &loop
        OVR2 OVR2 is-zero32 ?&done  ( x** y** - stop once y is zero )
        OVR2 OVR2 STH2 STH2         ( x** y** [ylo* yhi*] )
        mod32                       ( r** [ylo* yhi*] )
        STH2r ROT2 ROT2             ( yhi* rhi* rlo* [ylo*] )
        STH2r ROT2 ROT2             ( y** r** )
        !&loop
    &done
        POP2 POP2 JMP2r             ( drop the zero y, leaving x as z** )