major fixes to fix16 accuracy

2023-11-05 20:17:26 -05:00 · 2023-11-05 20:17:26 -05:00 · 2fe1d7d770
parent 1d0bb3da6e
commit 2fe1d7d770
2 changed files with 114 additions and 50 deletions
--- a/fix16.tal
+++ b/fix16.tal
@ -22,12 +22,17 @@
 ( #0700     1792/256      7.0000                          )
 ( #7f00    32512/256    127.0000                          )
 ( #7fff    32767/256    127.9961                          )
-( #8000   -32768/256   -128.0000                          )
+( #8000      invalid     invalid                          )
 ( #8001   -32767/256   -127.9961                          )
 ( #8100   -32767/256   -127.0000                          )
 ( #ff00     -256/256     -1.0000                          )
 ( #ffff       -1/256     -0.0039                          )
 (                                                         )
+( due to the limited range operations saturate at the     )
+( terminal values (i.e. #7fff and #8001). #8000 should    )
+( never be generated, and can be considered a Nan value   )
+( although this library does not use it as such.          )
+(                                                         )
 ( many 8.8 operations are equivalent to unsigned int16:   )
 (   * addition                                            )
 (   * subtraction                                         )
@ -60,7 +65,7 @@
 (  - tan() is mostly supported                            )
 (  - tan(#0192) and tan(#04b6) throw an error             )
 (  - a few tan() values are inaccurate due to range       )
-(    + values "next to" pi/2 and 3pi/2 are affected       ) 
+(    + values "next to" pi/2 and 3pi/2 are affected       )
 (    + tan(#0191) returns 127.996 not 227.785             )
 (    + tan(#0193) returns -127.996 not -292.189           )
 (    + etc.                                               )
@ -198,43 +203,66 @@
@x16-mul ( x* y* -- xy* )
    ;x16-mul-unsigned !x16-signed-op

-@x16-mul-unsigned ( x* y* -- xy* )
-         DUP #00 EQU ?x16-mul-unsigned-rhs-whole
-    SWP2 DUP #00 EQU ?x16-mul-unsigned-rhs-whole
-    ,&y0 STR2 ,&x0 STR2
-    #00 ,&x0 LDR #00 ,&y0 LDR MUL2      ( acc* )
-    OVR ?&overflow SWP                  ( acc* )
-    #00 ,&x1 LDR #00 ,&y0 LDR MUL2 ADD2 ( acc* )
-    #00 ,&x0 LDR #00 ,&y1 LDR MUL2 ADD2 ( acc* )
-    #00 ,&x1 LDR #00 ,&y1 LDR MUL2 #08 SFT2 ADD2 ( acc* )
-    DUP2 #7fff GTH2 ?&overflow
-    JMP2r [ &x0 $1 &x1 $1 &y0 $1 &y1 $1 ]
-    &overflow POP2 #7fff JMP2r
+@x16-mul8 ( x^ y^ -> xy* )
+    #0000 SWP2 ROT SWP MUL2 JMP2r

-@x16-mul-unsigned-rhs-whole ( x0_x1* y0_00* -- xy* )
-    #08 SFT2 MUL2 #7fff !unsigned-min
+@x16-mul-unsigned ( ab* cd* -> ac+ad+bc+bd* )
+    OVR2 OVR2 STH2 STH2 ROT SWPr ROTr             ( a c d b [c b a d] )
+    SWP2  x16-mul8 DUP2 #007f GTH2 ?&o1 SWP       ( d b acc* [c b a d] )
+    SWP2  x16-mul8 #08 SFT2 ADD2                  ( acc* [c b a d] )
+    STH2r x16-mul8 ADD2 OVR #7f GTH ?&o3          ( acc* [c d] )
+    STH2r x16-mul8 ADD2 OVR #7f GTH ?&o4 JMP2r    ( acc* )
+    &o1 POP2 POP2r &o3 POP2r &o4 POP2 #7fff JMP2r ( ; handle overflow )

@x16-div ( x* y* -- x/y* )
    ;x16-div-unsigned !x16-signed-op

+( The idea here is a bit complicated. )
+( )
+( First we look at `x y DIV2`. If that is >255 then we are )
+( going to overflow and we can return 7fff and be done. )
+( )
+( If not, we multiply that result by 256 and start looking )
+( for smaller and smaller fractions of y to star chipping )
+( away at the remainder, if any. )
+( )
+( However we want to avoid rounding errors caused by needlessly )
+( truncating y. For that reason, if the remainder is less than )
+( 0x8000 we will double it rather than diving y. Once it cannot )
+( be safely multiplied we will start dividing y by 2. After 16 )
+( rounds of multiplying the remainder or dividing the quotient )
+( we get our final answer. )
@x16-div-unsigned ( x* y* -> x/y* )
-    DIV2k STH2k       ( x y x/y [x/y] )
-    LITr 80 SFT2r     ( x y x/y [div=(x/y)<<8] )
-    OVR2 STH2         ( x y x/y [y div] )
-    MUL2 SUB2         ( x%y     [y div] )
-    STH2r LIT2r 0100  ( x%y y   [0100 div] )
+    DIV2k DUP2 #007f GTH2 ?&o   ( x* y* x/y* )
+    STH2k LITr 80 SFT2r         ( x* y* x/y* [div=(x/y)<<8*] )
+    OVR2 STH2                   ( x* y* x/y* [div* y*] )
+    MUL2 SUB2                   ( x%y* [div* y*] )
+    STH2r LIT2r 0100            ( x%y* y* [div* 0100*] )
+
+    ( We know rem < y, so start left-shifting rem. )
+    &ploop                       ( rem* y* [div* s*] )
+        STH2kr #0000 EQU2 ?&done ( rem* y* [div* s*] ; done when s=0 )
+        OVR2 #7fff GTH2 ?&loop   ( rem* y* [div* s*] ; rem too big, start shifting y )
+        SWP2 #10 SFT2 SWP2       ( rem<<1* y* [div* s*] )
+        LITr 01 SFT2r            ( rem<<1* y* [div* s>>1*] )
+        LTH2k ?&ploop            ( rem<<1* y* [div* s>>1*] ; rem too small )
+        SWP2 OVR2 SUB2 SWP2      ( rem<<1-y* y* [div* s>>1*] )
+        DUP2r ROT2r ADD2r SWP2r  ( rem<<1-y* y* [div+s>>1* s>>1*] )
+        !&ploop                  ( rem<<1-y* y* [div+s>>1* s>>1*] )
+
+    ( We know rem < y, so start right-shifting y. )
+    &loop                        ( rem* y* [div* s*] )
+        STH2kr #0000 EQU2 ?&done ( rem* y* [div* s*] ; done when s=0 )
+        #01 SFT2 LITr 01 SFT2r   ( rem* y>>1* [div* s>>1*] )
+        LTH2k ?&loop             ( rem* y>>1* [div* s>>1**] ; rem too small )
+        SWP2 OVR2 SUB2 SWP2      ( rem-y>>1* y>>1* [div* s>>1*] )
+        DUP2r ROT2r ADD2r SWP2r  ( rem-y>>1* y>>1* [div+s>>1* s>>1*] )
+        !&loop                   ( rem-y>>1* y>>1* [div+s>>1* s>>1*] )

-    ( We know x%y < y, so start right-shifting y. )
-    &loop DUP2 #0000 EQU2 ?&done
-        #01 SFT2 LITr 01 SFT2r  ( rem yi    [shifti div] )
-        LTH2k ,&loop JCN        ( rem yi    [shifti div] )
-        SWP2 OVR2 SUB2 SWP2     ( rem-yi yi [shifti div] )
-        DUP2r ROT2r ADD2r SWP2r ( rem-yi yi [shifti div+shifti] )
-        !&loop                  ( rem-yi yi [shifti div+shifti] )
    &done
-
-    POP2 POP2         ( [shiftk div] )
-    POP2r STH2r JMP2r ( div )
+    POP2 POP2 POP2r STH2r        ( div* )
+    DUP2 #7fff GTH2 ?&oo JMP2r   ( div* )
+    &o POP2 POP2 &oo POP2 #7fff JMP2r ( 7fff ; saturate on overflow )

@x16-signed-op ( x* y* f* -> f(x,y)* )
    STH2 LIT2r 0001
@ -245,7 +273,11 @@
    JMP2r

@x16-quotient ( x* y* -> x//y* )
-    DIV2 #80 SFT2 JMP2r
+    ;x16-quot-unsigned !x16-signed-op
+
+@x16-quot-unsigned ( x* y* -> x//y* )
+    DIV2 DUP2 #007f GTH2 ?{ #80 SFT2 JMP2r }
+    POP2 #7fff JMP2r

@x16-remainder ( x* y* -> x%y* )
    DIV2k MUL2 SUB2 JMP2r
@ -311,16 +343,20 @@
        POP2r POP2r NIP2 JMP2r  ( s1* )

@x16-cos ( x* -> cos[x]* )
-    x16-pi/2 ADD2 ( fall-thru )
+    x16-pi/2 ,&x16-sin-cos-helper/offset STR2 !sin-cos-helper

@x16-sin ( x* -> sin[x]* )
+    #0000 ,&x16-sin-cos-helper/offset STR2 ( fall-through )
+
+@x16-sin-cos-helper
    DUP2 #8000 LTH2 ?&non-negative
    x16-negate x16-sin/non-negative !x16-negate
    &non-negative
    x16-pi*2 STH2            ( x       [2pi] )
    DUP2 STH2kr x16-quotient ( x x/2pi [2pi] )
-    STH2r x16-mul SUB2       ( x' ; 0 <= x' < 2pi )
-
+    DUP2 #1400 DIV2 STH2 SWP2r ( x x/2pi [adj* 2pi*] )
+    STH2r x16-mul STH2r ADD2 SUB2       ( x' ; 0 <= x' < 2pi )
+    LIT2 [ &offset $2 ] ADD2
        DUP2 x16-3pi/2 LTH2 ?&c1
    ( -sin(2pi - x) ) x16-pi*2 SWP2 SUB2 x16-sin-q !x16-negate
    &c1 DUP2 x16-pi LTH2 ?&c2
@ -365,7 +401,8 @@
@x16-tan ( x* -> tan[x]* )
    x16-pi*2 STH2            ( x       [2pi] )
    DUP2 STH2kr x16-quotient ( x x/2pi [2pi] )
-    STH2r x16-mul SUB2       ( x' ; 0 <= x' < 2pi )
+    DUP2 #1400 DIV2 STH2 SWP2r ( x x/2pi [adj* 2pi*] )
+    STH2r x16-mul STH2r ADD2 SUB2       ( x' ; 0 <= x' < 2pi )

    ( tan(pi/2) = tan(3pi/2) = error )
    DUP2 x16-3pi/2 EQU2 ?error
--- a/test-fix16.py
+++ b/test-fix16.py
@ -4,15 +4,16 @@ from math import ceil, copysign, cos, floor, log, sin, sqrt, tan
 from os import environ
 from random import randint
 from subprocess import Popen, PIPE, run
+from sys import argv

 def tosigned(x):
    return x if x < 32768 else x - 65536

 u8  = {'sz': 1 << 8,  'fmt': b'%02x'}
 u16 = {'sz': 1 << 16, 'fmt': b'%04x'}
-z16 = {'sz': 1 << 16, 'fmt': b'%04x'}
-p16 = {'sz': 1 << 16, 'fmt': b'%04x'}
-t16 = {'sz': 1 << 16, 'fmt': b'%04x'}
+z16 = {'sz': 1 << 16, 'fmt': b'%04x'} # non-zero
+p16 = {'sz': 1 << 16, 'fmt': b'%04x'} # positive
+t16 = {'sz': 1 << 16, 'fmt': b'%04x'} # tangent, must not be pi/2

 def eq(got, expected):
    return got == expected
@ -25,7 +26,7 @@ def releq(got0, expected0):
    else:
        error = abs(got - expected) / (abs(expected) + 0.001)
        return error < 0.01
-def abseq(got0, expected0):
+def sineq(got0, expected0):
    got, expected = tosigned(got0), tosigned(expected0)
    if (expected - 10) <= got and got <= (expected + 10):
        return True
@ -79,6 +80,24 @@ def test(p, trials, sym, args, out, f, eq=eq):
    else:
        print('%s failed %d/%d trials (%r)' % (name, fails, trials, cases))

+def fromfix(n):
+    assert 0 <= n and n <= 65535
+    if n >= 32768:
+        res = (n - 65536) / 256
+    else:
+        res = n / 256
+    return res
+
+bound = 32767 / 256
+
+def tofix(x):
+    y = min(max(x, -bound), bound)
+    if y < 0:
+        res = int(ceil(65536 + y * 256))
+    else:
+        res = int(y * 256)
+    return res % 65536
+
 def pipe():
    return Popen(['uxncli', 'run.rom'], stdin=PIPE, stdout=PIPE)

@ -87,11 +106,17 @@ def x16_add(x, y):
 def x16_sub(x, y):
    return (x - y) % 65536
 def x16_mul(x, y):
-    return ((x * y) // 256) % 65536
+    return tofix(fromfix(x) * fromfix(y))
 def x16_div(x, y):
-    return ((x * 256) // y) % 65536
+    return tofix(fromfix(x) / fromfix(y))
 def x16_quot(x, y):
-    return x16_div(x, y) & 0xff00
+    n = x16_div(x, y)
+    if n < 0x7fff:
+        return n & 0xff00
+    elif n > 0x8001:
+        return (n + 255) & 0xff00
+    else:
+        return n
 def x16_rem(x, y):
    return x % y
 def x16_is_whole(x):
@ -147,13 +172,13 @@ def x16_trunc8(x):
        return floor(x / 256)

 def main():
-    trials = 1000
+    trials = int(argv[1]) if argv[1:] else 100
    run(['uxnasm', 'test-fix16.tal', 'run.rom'])
    p = pipe()
    test(p, trials, b'+', [('x', u16), ('y', u16)], u16, x16_add)
    test(p, trials, b'-', [('x', u16), ('y', u16)], u16, x16_sub)
    test(p, trials, b'*', [('x', u16), ('y', u16)], u16, x16_mul)
-    test(p, trials, b'/', [('x', u16), ('y', z16)], u16, x16_div, eq=releq)
+    test(p, trials, b'/', [('x', u16), ('y', z16)], u16, x16_div)
    test(p, trials, b'\\', [('x', u16), ('y', z16)], u16, x16_quot)
    test(p, trials, b'%', [('x', u16), ('y', z16)], u16, x16_rem)
    test(p, trials, b'w', [('x', u16)], u8, x16_is_whole, eq=booleq)
@ -164,16 +189,18 @@ def main():
    test(p, trials, b'>', [('x', u16), ('y', u16)], u8, x16_gt)
    test(p, trials, b'{', [('x', u16), ('y', u16)], u8, x16_lteq)
    test(p, trials, b'}', [('x', u16), ('y', u16)], u8, x16_gteq)
-    test(p, trials, b'r', [('x', p16)], u16, x16_sqrt, eq=releq)
-    test(p, trials, b's', [('x', p16)], u16, x16_sin, eq=abseq)
-    test(p, trials, b'c', [('x', p16)], u16, x16_cos, eq=abseq)
-    test(p, trials, b't', [('x', t16)], u16, x16_tan, eq=taneq)
-    test(p, trials, b'l', [('x', p16)], u16, x16_log, eq=releq)
    test(p, trials, b'F', [('x', u16)], u16, x16_floor)
    test(p, trials, b'C', [('x', u16)], u16, x16_ceil)
    test(p, trials, b'R', [('x', u16)], u16, x16_round)
    test(p, trials, b'8', [('x', u16)], u16, x16_trunc8)
    test(p, trials, b'T', [('x', u16)], u16, x16_trunc16)
+    # the next five are known to be somewhat inaccurate and use
+    # a "relaxed" equality predicate for testing purposes.
+    test(p, trials, b'r', [('x', p16)], u16, x16_sqrt, eq=releq)
+    test(p, trials, b's', [('x', p16)], u16, x16_sin, eq=sineq)
+    test(p, trials, b'c', [('x', p16)], u16, x16_cos, eq=sineq)
+    test(p, trials, b't', [('x', t16)], u16, x16_tan, eq=taneq)
+    test(p, trials, b'l', [('x', p16)], u16, x16_log, eq=releq)
    p.stdin.write(b'\n\n')
    p.stdin.flush()
    p.stdin.close()