21st February, 2026
This is a version of Fast Calculation of Cube and Inverse Cube Roots Using a Magic Constant and Its Implementation on Microcontrollers by Moroz et al. modified to run on Tenstorrent AI accelerators.
This is Algorithm 5 from the paper, modified to compute cube root instead of inverse cube root (note that Algorithm 6 in the paper contains an error):
float third = 0x1.555556p-2f;
float k0 = 0x1.c09806p0f;
float k1 = -0x1.403e6cp0f;
float k2 = 0x1.04cdb2p-1f;
uint32_t i = *(uint32_t *)&x;
i = 0x548c2b4b - i/3;
float y = *(float *)&i;
float c = x*y*y*y;
y = y * fmaf(c, fmaf(k2, c, k1), k0);
float d = x * (y * y);
c = fmaf(d, y, -1.0f);
float t = fmaf(c, -third, 1.0f);
y = d * (t * t);We use the following strategy to compute i/3 on
Tenstorrent:
Note:
x is positive, hence the MSB of i will
always be zero, and i < 2^31.i/3 loses another high bit:
i/3 < 2^30.2^23 in floating point shifts the
integer part into the mantissa bits if value < 2^23.
Since i/3 < 2^30, we simply need to divide by
2^7 to obtain value < 2^23.float f = (float)i; // i < 2^31
f = f * (0.333333333333f/128.0f) + 8388608.0f;
i = *(int *)&f << 7; // multiply by 2^7; now high 2 bits 10, and low 7 bits all zero
i = 0xd48c2b4b - i; // modified magic constant with MSB=1Porting this to SFPI yields:
vConstFloatPrgm0 = 0x1.c09806p0f;
vConstFloatPrgm1 = -0x1.403e6cp0f;
vConstFloatPrgm2 = 0x1.04cdb2p-1f;
vInt magic = 0xd48c2b4b;
vFloat third_128 = 0x1.555556p-9f;
vFloat rounding_bias = 8388608.0f;
for (int idx = 0; idx < 8; ++idx) {
vFloat a = dst_reg[0];
vFloat x = abs(a);
vInt i = reinterpret<vInt>(x);
vFloat f = int32_to_float(i, 0) * third_128 + rounding_bias;
i = reinterpret<vInt>(f);
i = magic - (i << 7);
vFloat y = reinterpret<vFloat>(i);
vFloat c = (x * y) * (y * y);
y = y * (c * (vConstFloatPrgm2 * c + vConstFloatPrgm1) + vConstFloatPrgm0);
vFloat d = x * (y * y);
c = d * y + vConstNeg1;
vFloat third = addexp(third_128, 7); // hide sfpnop
vFloat t = c * third + vConstNeg1;
d = setsgn(d, a); // hide sfpnop
y = d * (t * t);
dst_reg[0] = y;
dst_reg++;
}Note that we invert t = -c*(1.0/3.0) + 1.0 due to the
lack of NEGATE_VA on Wormhole; since
t gets squared anyway, we can use
t = c*(1.0/3.0) + -1.0 instead.
Chains of 2-cycle dependent SFPMAD instructions leave plenty of SFPNOP slots that we can potentially hide with other instructions.
We exploit this in two ways:
addexp) to adjust the exponent of
(1.0/3.0)/128.0 by 7 to obtain 1.0/3.0.d rather than
leaving this to the end, hiding a SFPNOP slot.Ignoring constant initialisation, this compiles to the following 32-cycle sequence:
; L1 = 0x1.555556p-9f, L3 = 0xd48c2b4b, L4 = 8388608.0
sfpload L2,0,0,3
sfpabs L5,L2,1
sfpcast L0,L5,0
sfpmad L0,L0,L1,L4,0
sfpnop
sfpshft L0,L0,0x007,1
sfpiadd L0,L3,0x000,6
sfpmul L6,L5,L0,L9,0
sfpmul L7,L0,L0,L9,0
sfpnop
sfpmul L6,L6,L7,L9,0
sfpnop
sfpmad L7,L14,L6,L13,0
sfpnop
sfpmad L6,L6,L7,L12,0
sfpnop
sfpmul L0,L0,L6,L9,0
sfpnop
sfpmul L6,L0,L0,L9,0
sfpnop
sfpmul L5,L5,L6,L9,0
sfpnop
sfpmad L0,L5,L0,L11,0
sfpdivp2 L6,L1,0x007,1
sfpmad L0,L0,L6,L11,0
sfpsetsgn L2,L5,0x000,0
sfpmul L0,L0,L0,L9,0
sfpnop
sfpmul L2,L2,L0,L9,0
sfpnop
sfpstore 0,L2,0,3
ttincrwc 0,2,0,0ULP error is reasonable with a maximum of ~2.5 ULPs.

According to the paper, one iteration of the algorithm yields 15.18 correct bits, and so we obtain the following code for bf16, with the addition of a rounding step for the final value:
vConstFloatPrgm0 = 0x1.c09806p0f;
vConstFloatPrgm1 = -0x1.403e6cp0f;
vConstFloatPrgm2 = 0x1.04cdb2p-1f;
vInt magic = 0xd48c2b4b;
vFloat third_128 = 0x1.555556p-9f;
vFloat rounding_bias = 8388608.0f;
for (int idx = 0; idx < 8; ++idx) {
vFloat a = dst_reg[0];
vFloat x = abs(a);
vInt i = reinterpret<vInt>(x);
vFloat f = int32_to_float(i, 0) * third_128 + rounding_bias;
i = reinterpret<vInt>(f);
i = magic - (i << 7);
vFloat y = reinterpret<vFloat>(i);
vFloat d = x * (y * y);
vFloat c = d * y;
vFloat t = c * (vConstFloatPrgm2 * c + vConstFloatPrgm1) + vConstFloatPrgm0;
d = setsgn(d, a);
y = d * (t * t);
dst_reg[0] = reinterpret<vFloat>(float_to_fp16b(y, 0));
dst_reg++;
}This compiles to the following 24-cycle sequence:
; L3 = 0x1.555556p-9f, L1 = 0xd48c2b4b, L5 = 8388608.0
sfpload L2,0,0,3
sfpabs L4,L2,1
sfpcast L0,L4,0
sfpmad L0,L0,L3,L5,0
sfpnop
sfpshft L0,L0,0x007,1
sfpiadd L0,L1,0x000,6
sfpmul L6,L0,L0,L9,0
sfpnop
sfpmul L4,L4,L6,L9,0
sfpnop
sfpmul L0,L4,L0,L9,0
sfpnop
sfpmad L6,L14,L0,L13,0
sfpnop
sfpmad L0,L0,L6,L12,0
sfpsetsgn L2,L4,0x000,0
sfpmul L0,L0,L0,L9,0
sfpnop
sfpmul L2,L2,L0,L9,0
sfpnop
sfpstochrnd L2,L0,L2,1,0,0
sfpstore 0,L2,0,3
ttincrwc 0,2,0,0Thanks to Tenstorrent for sponsoring this work.