Cube Root on Tenstorrent

21st February, 2026

Introduction

This is a version of Fast Calculation of Cube and Inverse Cube Roots Using a Magic Constant and Its Implementation on Microcontrollers by Moroz et al. modified to run on Tenstorrent AI accelerators.

This is Algorithm 5 from the paper, modified to compute cube root instead of inverse cube root (note that Algorithm 6 in the paper contains an error):

float third = 0x1.555556p-2f;
float k0 =  0x1.c09806p0f;
float k1 = -0x1.403e6cp0f;
float k2 =  0x1.04cdb2p-1f;
uint32_t i = *(uint32_t *)&x;
i = 0x548c2b4b - i/3;
float y = *(float *)&i;
float c = x*y*y*y;
y = y * fmaf(c, fmaf(k2, c, k1), k0);

float d = x * (y * y);
c = fmaf(d, y, -1.0f);
float t = fmaf(c, -third, 1.0f);
y = d * (t * t);

Modifications for Tenstorrent

We use the following strategy to compute i/3 on Tenstorrent:

Note:

  1. x is positive, hence the MSB of i will always be zero, and i < 2^31.
  2. i/3 loses another high bit: i/3 < 2^30.
  3. It would be efficient to use the rounding bias trick to convert fp32 to int23, where adding 2^23 in floating point shifts the integer part into the mantissa bits if value < 2^23. Since i/3 < 2^30, we simply need to divide by 2^7 to obtain value < 2^23.
float f = (float)i;  // i < 2^31
f = f * (0.333333333333f/128.0f) + 8388608.0f;
i = *(int *)&f << 7; // multiply by 2^7; now high 2 bits 10, and low 7 bits all zero
i = 0xd48c2b4b - i;  // modified magic constant with MSB=1

Porting this to SFPI yields:

vConstFloatPrgm0 =  0x1.c09806p0f;
vConstFloatPrgm1 = -0x1.403e6cp0f;
vConstFloatPrgm2 =  0x1.04cdb2p-1f;

vInt magic = 0xd48c2b4b;
vFloat third_128 = 0x1.555556p-9f;
vFloat rounding_bias = 8388608.0f;

for (int idx = 0; idx < 8; ++idx) {
    vFloat a = dst_reg[0];
    vFloat x = abs(a);

    vInt i = reinterpret<vInt>(x);
    vFloat f = int32_to_float(i, 0) * third_128 + rounding_bias;

    i = reinterpret<vInt>(f);
    i = magic - (i << 7);

    vFloat y = reinterpret<vFloat>(i);

    vFloat c = (x * y) * (y * y);
    y = y * (c * (vConstFloatPrgm2 * c + vConstFloatPrgm1) + vConstFloatPrgm0);

    vFloat d = x * (y * y);
    c = d * y + vConstNeg1;
    vFloat third = addexp(third_128, 7); // hide sfpnop
    vFloat t = c * third + vConstNeg1;
    d = setsgn(d, a); // hide sfpnop
    y = d * (t * t);

    dst_reg[0] = y;
    dst_reg++;
}

Note that we invert t = -c*(1.0/3.0) + 1.0 due to the lack of NEGATE_VA on Wormhole; since t gets squared anyway, we can use t = c*(1.0/3.0) + -1.0 instead.

Chains of 2-cycle dependent SFPMAD instructions leave plenty of SFPNOP slots that we can potentially hide with other instructions.

We exploit this in two ways:

  1. Use SFPDIVP2 (SFPI calls this addexp) to adjust the exponent of (1.0/3.0)/128.0 by 7 to obtain 1.0/3.0.
  2. Use SFPSETSGN to copy the sign of the original input to d rather than leaving this to the end, hiding a SFPNOP slot.

Ignoring constant initialisation, this compiles to the following 32-cycle sequence:

; L1 = 0x1.555556p-9f, L3 = 0xd48c2b4b, L4 = 8388608.0
sfpload L2,0,0,3
sfpabs L5,L2,1
sfpcast L0,L5,0
sfpmad L0,L0,L1,L4,0
sfpnop
sfpshft L0,L0,0x007,1
sfpiadd L0,L3,0x000,6
sfpmul L6,L5,L0,L9,0
sfpmul L7,L0,L0,L9,0
sfpnop
sfpmul L6,L6,L7,L9,0
sfpnop
sfpmad L7,L14,L6,L13,0
sfpnop
sfpmad L6,L6,L7,L12,0
sfpnop
sfpmul L0,L0,L6,L9,0
sfpnop
sfpmul L6,L0,L0,L9,0
sfpnop
sfpmul L5,L5,L6,L9,0
sfpnop
sfpmad L0,L5,L0,L11,0
sfpdivp2 L6,L1,0x007,1
sfpmad L0,L0,L6,L11,0
sfpsetsgn L2,L5,0x000,0
sfpmul L0,L0,L0,L9,0
sfpnop
sfpmul L2,L2,L0,L9,0
sfpnop
sfpstore 0,L2,0,3
ttincrwc 0,2,0,0

ULP error is reasonable with a maximum of ~2.5 ULPs.

bf16

According to the paper, one iteration of the algorithm yields 15.18 correct bits, and so we obtain the following code for bf16, with the addition of a rounding step for the final value:

vConstFloatPrgm0 =  0x1.c09806p0f;
vConstFloatPrgm1 = -0x1.403e6cp0f;
vConstFloatPrgm2 =  0x1.04cdb2p-1f;

vInt magic = 0xd48c2b4b;
vFloat third_128 = 0x1.555556p-9f;
vFloat rounding_bias = 8388608.0f;

for (int idx = 0; idx < 8; ++idx) {
    vFloat a = dst_reg[0];
    vFloat x = abs(a);

    vInt i = reinterpret<vInt>(x);
    vFloat f = int32_to_float(i, 0) * third_128 + rounding_bias;

    i = reinterpret<vInt>(f);
    i = magic - (i << 7);

    vFloat y = reinterpret<vFloat>(i);

    vFloat d = x * (y * y);
    vFloat c = d * y;
    vFloat t = c * (vConstFloatPrgm2 * c + vConstFloatPrgm1) + vConstFloatPrgm0;
    d = setsgn(d, a);
    y = d * (t * t);

    dst_reg[0] = reinterpret<vFloat>(float_to_fp16b(y, 0));
    dst_reg++;
}

This compiles to the following 24-cycle sequence:

; L3 = 0x1.555556p-9f, L1 = 0xd48c2b4b, L5 = 8388608.0
sfpload L2,0,0,3
sfpabs L4,L2,1
sfpcast L0,L4,0
sfpmad L0,L0,L3,L5,0
sfpnop
sfpshft L0,L0,0x007,1
sfpiadd L0,L1,0x000,6
sfpmul L6,L0,L0,L9,0
sfpnop
sfpmul L4,L4,L6,L9,0
sfpnop
sfpmul L0,L4,L0,L9,0
sfpnop
sfpmad L6,L14,L0,L13,0
sfpnop
sfpmad L0,L0,L6,L12,0
sfpsetsgn L2,L4,0x000,0
sfpmul L0,L0,L0,L9,0
sfpnop
sfpmul L2,L2,L0,L9,0
sfpnop
sfpstochrnd L2,L0,L2,1,0,0
sfpstore 0,L2,0,3
ttincrwc 0,2,0,0

Acknowledgements

Thanks to Tenstorrent for sponsoring this work.