Cube Root on Tenstorrent

This is Algorithm 5 from the paper, modified to compute cube root instead of inverse cube root (note that Algorithm 6 in the paper contains an error):

float third = 0x1.555556p-2f;
float k0 =  0x1.c09806p0f;
float k1 = -0x1.403e6cp0f;
float k2 =  0x1.04cdb2p-1f;
uint32_t i = *(uint32_t *)&x;
i = 0x548c2b4b - i/3;
float y = *(float *)&i;
float c = x*y*y*y;
y = y * fmaf(c, fmaf(k2, c, k1), k0);

float d = x * (y * y);
c = fmaf(d, y, -1.0f);
float t = fmaf(c, -third, 1.0f);
y = d * (t * t);

Modifications for Tenstorrent

float f = (float)i;  // i < 2^31
f = f * (0.333333333333f/128.0f) + 8388608.0f;
i = *(int *)&f << 7; // multiply by 2^7; now high 2 bits 10, and low 7 bits all zero
i = 0xd48c2b4b - i;  // modified magic constant with MSB=1

vConstFloatPrgm0 =  0x1.c09806p0f;
vConstFloatPrgm1 = -0x1.403e6cp0f;
vConstFloatPrgm2 =  0x1.04cdb2p-1f;

vInt magic = 0xd48c2b4b;
vFloat third_128 = 0x1.555556p-9f;
vFloat rounding_bias = 8388608.0f;

for (int idx = 0; idx < 8; ++idx) {
    vFloat a = dst_reg[0];
    vFloat x = abs(a);

    vInt i = reinterpret<vInt>(x);
    vFloat f = int32_to_float(i, 0) * third_128 + rounding_bias;

    i = reinterpret<vInt>(f);
    i = magic - (i << 7);

    vFloat y = reinterpret<vFloat>(i);

    vFloat c = (x * y) * (y * y);
    y = y * (c * (vConstFloatPrgm2 * c + vConstFloatPrgm1) + vConstFloatPrgm0);

    vFloat d = x * (y * y);
    c = d * y + vConstNeg1;
    vFloat third = addexp(third_128, 7); // hide sfpnop
    vFloat t = c * third + vConstNeg1;
    d = setsgn(d, a); // hide sfpnop
    y = d * (t * t);

    dst_reg[0] = y;
    dst_reg++;
}

Note that we invert t = -c*(1.0/3.0) + 1.0 due to the lack of NEGATE_VA on Wormhole; since t gets squared anyway, we can use t = c*(1.0/3.0) + -1.0 instead.

Chains of 2-cycle dependent SFPMAD instructions leave plenty of SFPNOP slots that we can potentially hide with other instructions.

Ignoring constant initialisation, this compiles to the following 32-cycle sequence:

; L1 = 0x1.555556p-9f, L3 = 0xd48c2b4b, L4 = 8388608.0
sfpload L2,0,0,3
sfpabs L5,L2,1
sfpcast L0,L5,0
sfpmad L0,L0,L1,L4,0
sfpnop
sfpshft L0,L0,0x007,1
sfpiadd L0,L3,0x000,6
sfpmul L6,L5,L0,L9,0
sfpmul L7,L0,L0,L9,0
sfpnop
sfpmul L6,L6,L7,L9,0
sfpnop
sfpmad L7,L14,L6,L13,0
sfpnop
sfpmad L6,L6,L7,L12,0
sfpnop
sfpmul L0,L0,L6,L9,0
sfpnop
sfpmul L6,L0,L0,L9,0
sfpnop
sfpmul L5,L5,L6,L9,0
sfpnop
sfpmad L0,L5,L0,L11,0
sfpdivp2 L6,L1,0x007,1
sfpmad L0,L0,L6,L11,0
sfpsetsgn L2,L5,0x000,0
sfpmul L0,L0,L0,L9,0
sfpnop
sfpmul L2,L2,L0,L9,0
sfpnop
sfpstore 0,L2,0,3
ttincrwc 0,2,0,0

bf16

According to the paper, one iteration of the algorithm yields 15.18 correct bits, and so we obtain the following code for bf16, with the addition of a rounding step for the final value:

vConstFloatPrgm0 =  0x1.c09806p0f;
vConstFloatPrgm1 = -0x1.403e6cp0f;
vConstFloatPrgm2 =  0x1.04cdb2p-1f;

vInt magic = 0xd48c2b4b;
vFloat third_128 = 0x1.555556p-9f;
vFloat rounding_bias = 8388608.0f;

for (int idx = 0; idx < 8; ++idx) {
    vFloat a = dst_reg[0];
    vFloat x = abs(a);

    vInt i = reinterpret<vInt>(x);
    vFloat f = int32_to_float(i, 0) * third_128 + rounding_bias;

    i = reinterpret<vInt>(f);
    i = magic - (i << 7);

    vFloat y = reinterpret<vFloat>(i);

    vFloat d = x * (y * y);
    vFloat c = d * y;
    vFloat t = c * (vConstFloatPrgm2 * c + vConstFloatPrgm1) + vConstFloatPrgm0;
    d = setsgn(d, a);
    y = d * (t * t);

    dst_reg[0] = reinterpret<vFloat>(float_to_fp16b(y, 0));
    dst_reg++;
}

; L3 = 0x1.555556p-9f, L1 = 0xd48c2b4b, L5 = 8388608.0
sfpload L2,0,0,3
sfpabs L4,L2,1
sfpcast L0,L4,0
sfpmad L0,L0,L3,L5,0
sfpnop
sfpshft L0,L0,0x007,1
sfpiadd L0,L1,0x000,6
sfpmul L6,L0,L0,L9,0
sfpnop
sfpmul L4,L4,L6,L9,0
sfpnop
sfpmul L0,L4,L0,L9,0
sfpnop
sfpmad L6,L14,L0,L13,0
sfpnop
sfpmad L0,L0,L6,L12,0
sfpsetsgn L2,L4,0x000,0
sfpmul L0,L0,L0,L9,0
sfpnop
sfpmul L2,L2,L0,L9,0
sfpnop
sfpstochrnd L2,L0,L2,1,0,0
sfpstore 0,L2,0,3
ttincrwc 0,2,0,0

Cube Root on Tenstorrent

Introduction

Modifications for Tenstorrent

bf16

Acknowledgements