Use a minimax polynomial approximation instead of a Pade rational

function approximation for the second step.  The polynomial has degree
2 for cbrtf() and 4 for cbrt().  These degrees are minimal for the final
accuracy to be essentially the same as before (slightly smaller).
Adjust the rounding between steps 2 and 3 to match.  Unfortunately,
for cbrt(), this breaks the claimed accuracy slightly although incorrect
rounding doesn't.  Claim less accuracy since its not worth pessimizing
the polynomial or relying on exhaustive testing to get insignificantly
more accuracy.

This saves about 30 cycles on Athlons (mainly by avoiding 2 divisions)
so it gives an overall optimization in the 10-25% range (a larger
percentage for float precision, especially in 32-bit mode, since other
overheads are more dominant for double precision, surprisingly more
in 32-bit mode).
This commit is contained in:
Bruce Evans 2005-12-19 00:22:03 +00:00
parent f94b154580
commit c5964538b7
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=153520
2 changed files with 30 additions and 37 deletions

View file

@ -26,12 +26,13 @@ static const u_int32_t
B1 = 715094163, /* B1 = (1023-1023/3-0.03306235651)*2**20 */
B2 = 696219795; /* B2 = (1023-1023/3-54/3-0.03306235651)*2**20 */
/* |1/cbrt(x) - p(x)| < 2**-23.5 (~[-7.93e-8, 7.929e-8]). */
static const double
C = 5.42857142857142815906e-01, /* 19/35 = 0x3FE15F15, 0xF15F15F1 */
D = -7.05306122448979611050e-01, /* -864/1225 = 0xBFE691DE, 0x2532C834 */
E = 1.41428571428571436819e+00, /* 99/70 = 0x3FF6A0EA, 0x0EA0EA0F */
F = 1.60714285714285720630e+00, /* 45/28 = 0x3FF9B6DB, 0x6DB6DB6E */
G = 3.57142857142857150787e-01; /* 5/14 = 0x3FD6DB6D, 0xB6DB6DB7 */
P0 = 1.87595182427177009643, /* 0x3ffe03e6, 0x0f61e692 */
P1 = -1.88497979543377169875, /* 0xbffe28e0, 0x92f02420 */
P2 = 1.621429720105354466140, /* 0x3ff9f160, 0x4a49d6c2 */
P3 = -0.758397934778766047437, /* 0xbfe844cb, 0xbee751d9 */
P4 = 0.145996192886612446982; /* 0x3fc2b000, 0xd4e4edd7 */
double
cbrt(double x)
@ -78,36 +79,30 @@ cbrt(double x)
SET_HIGH_WORD(t,sign|(hx/3+B1));
/*
* New cbrt to 25 bits:
* cbrt(x) = t*cbrt(x/t**3) ~= t*R(x/t**3)
* where R(r) = (14*r**2 + 35*r + 5)/(5*r**2 + 35*r + 14) is the
* (2,2) Pade approximation to cbrt(r) at r = 1. We replace
* r = x/t**3 by 1/r = t**3/x since the latter can be evaluated
* more efficiently, and rearrange the expression for R(r) to use
* 4 additions and 2 divisions instead of the 4 additions, 4
* multiplications and 1 division that would be required using
* Horner's rule on the numerator and denominator. t being good
* to 32 bits means that |t/cbrt(x)-1| < 1/32, so |x/t**3-1| < 0.1
* and for R(r) we can use any approximation to cbrt(r) that is good
* to 20 bits on [0.9, 1.1]. The (2,2) Pade approximation is not an
* especially good choice.
* New cbrt to 23 bits:
* cbrt(x) = t*cbrt(x/t**3) ~= t*P(t**3/x)
* where P(r) is a polynomial of degree 4 that approximates 1/cbrt(r)
* to within 2**-23.5 when |r - 1| < 1/10. The rough approximation
* has produced t such than |t/cbrt(x) - 1| ~< 1/32, and cubing this
* gives us bounds for r = t**3/x.
*
* Try to optimize for parallel evaluation as in k_tanf.c.
*/
r=t*t/x;
s=C+r*t;
t*=G+F/(s+E+D/s);
r=(t*t)*(t/x);
t=t*((P0+r*(P1+r*P2))+((r*r)*r)*(P3+r*P4));
/*
* Round t away from zero to 25 bits (sloppily except for ensuring that
* Round t away from zero to 23 bits (sloppily except for ensuring that
* the result is larger in magnitude than cbrt(x) but not much more than
* 2 25-bit ulps larger). With rounding towards zero, the error bound
* would be ~5/6 instead of ~4/6. With a maximum error of 1 25-bit ulps
* 2 23-bit ulps larger). With rounding towards zero, the error bound
* would be ~5/6 instead of ~4/6. With a maximum error of 2 23-bit ulps
* in the rounded t, the infinite-precision error in the Newton
* approximation barely affects third digit in the the final error
* 0.667; the error in the rounded t can be up to about 12 25-bit ulps
* 0.667; the error in the rounded t can be up to about 3 23-bit ulps
* before the final error is larger than 0.667 ulps.
*/
u.value=t;
u.bits=(u.bits+0x20000000)&0xfffffffff0000000ULL;
u.bits=(u.bits+0x80000000)&0xffffffffc0000000ULL;
t=u.value;
/* one step Newton iteration to 53 bits with error < 0.667 ulps */

View file

@ -28,12 +28,11 @@ static const unsigned
B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
/* |1/cbrt(x) - p(x)| < 2**-14.5 (~[-4.37e-4, 4.366e-5]). */
static const float
C = 5.4285717010e-01, /* 19/35 = 0x3f0af8b0 */
D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
E = 1.4142856598e+00, /* 99/70 = 0x3fb50750 */
F = 1.6071428061e+00, /* 45/28 = 0x3fcdb6db */
G = 3.5714286566e-01; /* 5/14 = 0x3eb6db6e */
P0 = 1.5586718321, /* 0x3fc7828f */
P1 = -0.78271341324, /* -0xbf485fe8 */
P2 = 0.22403796017; /* 0x3e656a35 */
float
cbrtf(float x)
@ -59,10 +58,9 @@ cbrtf(float x)
} else
SET_FLOAT_WORD(t,sign|(hx/3+B1));
/* new cbrt to 23 bits */
r=t*t/x;
s=C+r*t;
t*=G+F/(s+E+D/s);
/* new cbrt to 14 bits */
r=(t*t)*(t/x);
t=t*((P0+r*P1)+(r*r)*P2);
/*
* Round t away from zero to 12 bits (sloppily except for ensuring that
@ -73,9 +71,9 @@ cbrtf(float x)
* the second digit instead of the third digit of 4/6 = 0.666..., etc.
*/
GET_FLOAT_WORD(high,t);
SET_FLOAT_WORD(t,(high+0x1002)&0xfffff000);
SET_FLOAT_WORD(t,(high+0x1800)&0xfffff000);
/* one step Newton iteration to 24 bits with error < 0.667 ulps */
/* one step Newton iteration to 24 bits with error < 0.669 ulps */
s=t*t; /* t*t is exact */
r=x/s; /* error <= 0.5 ulps; |r| < |t| */
w=t+t; /* t+t is exact */