mirror of
https://github.com/freebsd/freebsd-src
synced 2024-10-05 08:00:30 +00:00
Rearranged the polynomial evaluation to reduce dependencies, as in
k_tanf.c but with different details. The polynomial is odd with degree 13 for tanf() and odd with degree 9 for sinf(), so the details are not very different for sinf() -- the term with the x**11 and x**13 coefficients goes away, and (mysteriously) it helps to do the evaluation of w = z*z early although moving it later was a key optimization for tanf(). The details are different but simpler for cosf() because the polynomial is even and of lower degree. On Athlons, for uniformly distributed args in [-2pi, 2pi], this gives an optimization of about 4 cycles (10%) in most cases (13% for sinf() on AXP, but 0% for cosf() with gcc-3.3 -O1 on AXP). The best case (sinf() with gcc-3.4 -O1 -fcaller-saves on A64) now takes 33-39 cycles (was 37-45 cycles). Hardware sinf takes 74-129 cycles. Despite being fine-tuned for Athlons, the optimization is even larger on some other arches (about 15% on ia64 (pluto2) and 20% on alpha (beast) with gcc -O2 -fomit-frame-pointer).
This commit is contained in:
parent
1f4ae9be57
commit
f4b01a9edf
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=152951
|
@ -37,9 +37,11 @@ extern inline
|
|||
float
|
||||
__kernel_cosdf(double x)
|
||||
{
|
||||
double z,r;
|
||||
double r, w, z;
|
||||
|
||||
z = x*x;
|
||||
r = z*(C1+z*(C2+z*C3));
|
||||
return (one+z*C0) + z*r;
|
||||
/* Try to optimize for parallel evaluation as in k_tanf.c. */
|
||||
z = x*x;
|
||||
w = z*z;
|
||||
r = C2+z*C3;
|
||||
return ((one+z*C0) + w*C1) + (w*z)*r;
|
||||
}
|
||||
|
|
|
@ -36,10 +36,12 @@ extern inline
|
|||
float
|
||||
__kernel_sindf(double x)
|
||||
{
|
||||
double z,r,v;
|
||||
double r, s, w, z;
|
||||
|
||||
z = x*x;
|
||||
v = z*x;
|
||||
r = S2+z*(S3+z*S4);
|
||||
return x+v*(S1+z*r);
|
||||
/* Try to optimize for parallel evaluation as in k_tanf.c. */
|
||||
z = x*x;
|
||||
w = z*z;
|
||||
r = S3+z*S4;
|
||||
s = z*x;
|
||||
return (x + s*(S1+z*S2)) + s*w*r;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue