21st February, 2026
This is a version of Fast Calculation of Cube and Inverse Cube Roots Using a Magic Constant and Its Implementation on Microcontrollers by Moroz et al. modified to run on Tenstorrent AI accelerators.
This is Algorithm 5 from the paper, modified to compute cube root instead of inverse cube root (note that Algorithm 6 in the paper contains an error):
float third = 0x1.555556p-2f;
float k0 = 0x1.c09806p0f;
float k1 = -0x1.403e6cp0f;
float k2 = 0x1.04cdb2p-1f;
uint32_t i = *(uint32_t *)&x;
i = 0x548c2b4b - i/3;
float y = *(float *)&i;
float c = x*y*y*y;
y = y * fmaf(c, fmaf(k2, c, k1), k0);
float d = x * (y * y);
c = fmaf(d, y, -1.0f);
float t = fmaf(c, -third, 1.0f);
y = d * (t * t);On Tenstorrent, it’s more efficient to compute i/3 in
floating-point.
SFPCAST.(0x548c2b4b - i/3) in fp32, but we
also want to add 2^23 to shift the integer part of the
result into the mantissa bits. This only works if
(0x548c2b4b - i/3)*k < 2^23, so k = 2^-8.
Rearranged, this gives us
(0x548c2b4b/256.0 + 2^23) + i * (-1.0/3.0/256.0), which can
be computed with a single SFPMAD.8 via SFPSHFT
to restore the integer to its correct position.This means that the low 8 bits will be zero, but this doesn’t affect the overall precision since the initial approximation has 15.18 correct bits according to the paper.
Strictly speaking, we want
floor(0x548c2b4b/256.0) + 2^23 since we want to avoid
rounding up; but the exact value is 13929515.29296875,
which will round down to 13929515.0.
float f = (float)i; // i < 2^31
f = f * (-0.333333333333f/256.0f) + (1418472267.0f/256.0f + 8388608.0f);
i = *(int *)&f << 8; // restore integerPorting this to SFPI yields:
vConstFloatPrgm0 = 0x1.c09806p0f;
vConstFloatPrgm1 = -0x1.403e6cp0f;
vConstFloatPrgm2 = 0x1.04cdb2p-1f;
vFloat negative_third_256 = -0x1.555556p-10f;
vFloat magic = 1418472267.0f / 256.0f + 8388608.0f;
for (int idx = 0; idx < 8; ++idx) {
vFloat a = dst_reg[0];
vFloat x = abs(a);
vFloat f = int32_to_float(reinterpret<vInt>(x), 0);
f = f * negative_third_256 + magic;
vFloat y = reinterpret<vFloat>(reinterpret<vInt>(f) << 8);
vFloat c = (x * y) * (y * y);
y = y * (c * (vConstFloatPrgm2 * c + vConstFloatPrgm1) + vConstFloatPrgm0);
vFloat d = x * (y * y);
c = d * y + vConstNeg1;
vFloat third = addexp(negative_third_256, 8); // hide sfpnop
vFloat t = c * third + vConst1;
d = setsgn(d, a); // hide sfpnop
y = d * (t * t);
dst_reg[0] = y;
dst_reg++;
}Chains of 2-cycle dependent SFPMAD
instructions leave plenty of SFPNOP
slots that we can potentially hide with other instructions.
We exploit this in two ways:
SFPDIVP2
(SFPI calls this addexp) to adjust the exponent of
(1.0/3.0)/256.0 by 8 to obtain 1.0/3.0.SFPSETSGN
to copy the sign of the original input to d rather than
leaving this to the end, hiding a SFPNOP slot.Ignoring constant initialisation, this compiles to the following 31-cycle sequence:
; L0 = -0x1.555556p-10; L3 = 13929515.0
sfpload L2,0,0,3
sfpabs L5,L2,1
sfpcast L1,L5,0
sfpmad L1,L1,L0,L3,0
sfpnop
sfpshft L1,0,0x008,1
sfpmul L4,L5,L1,L9,0
sfpmul L6,L1,L1,L9,0
sfpnop
sfpmul L4,L4,L6,L9,0
sfpnop
sfpmad L6,L14,L4,L13,0
sfpnop
sfpmad L4,L4,L6,L12,0
sfpnop
sfpmul L1,L1,L4,L9,0
sfpnop
sfpmul L4,L1,L1,L9,0
sfpnop
sfpmul L4,L5,L4,L9,0
sfpnop
sfpmad L1,L4,L1,L11,0
sfpdivp2 L5,L0,0x008,1
sfpmad L1,L1,L5,L10,0
sfpsetsgn L2,L4,0x000,0
sfpmul L1,L1,L1,L9,0
sfpnop
sfpmul L2,L2,L1,L9,0
sfpnop
sfpstore 0,L2,0,3
ttincrwc 0,2,0,0ULP error is reasonable with a maximum of ~2.5 ULPs.
According to the paper, one iteration of the algorithm yields 15.18 correct bits, and so we obtain the following code for bf16, with the addition of a rounding step for the final value:
vConstFloatPrgm0 = 0x1.c09806p0f;
vConstFloatPrgm1 = -0x1.403e6cp0f;
vConstFloatPrgm2 = 0x1.04cdb2p-1f;
vFloat negative_third_256 = -0x1.555556p-10f;
vFloat magic = 1418472267.0f / 256.0f + 8388608.0f;
for (int idx = 0; idx < 8; ++idx) {
vFloat a = dst_reg[0];
vFloat x = abs(a);
vFloat f = int32_to_float(reinterpret<vInt>(x), 0);
f = f * negative_third_256 + magic;
vFloat y = reinterpret<vFloat>(reinterpret<vInt>(f) << 8);
vFloat d = x * (y * y);
vFloat c = d * y;
vFloat t = c * (vConstFloatPrgm2 * c + vConstFloatPrgm1) + vConstFloatPrgm0;
d = setsgn(d, a);
y = d * (t * t);
dst_reg[0] = reinterpret<vFloat>(float_to_fp16b(y, 0));
dst_reg++;
}This compiles to the following 23-cycle sequence:
; L1 = -0x1.555556p-10f, L2 = 13929515.0
sfpload L3,0,0,3
sfpabs L4,L3,1
sfpcast L0,L4,0
sfpmad L0,L0,L1,L2,0
sfpnop
sfpshft L0,0,0x008,1
sfpmul L5,L0,L0,L9,0
sfpnop
sfpmul L4,L4,L5,L9,0
sfpnop
sfpmul L0,L4,L0,L9,0
sfpnop
sfpmad L5,L14,L0,L13,0
sfpnop
sfpmad L0,L0,L5,L12,0
sfpsetsgn L3,L4,0x000,0
sfpmul L0,L0,L0,L9,0
sfpnop
sfpmul L3,L3,L0,L9,0
sfpnop
sfpstochrnd L3,L0,L3,1,0,0
sfpstore 0,L3,0,3
ttincrwc 0,2,0,0