Optimize by not doing excessive conversions for handling the sign bit.

This gives an optimization of between 9 and 22% on Athlons (largest
for cbrt() on amd64 -- from 205 to 159 cycles).

We extracted the sign bit and worked with |x|, and restored the sign
bit as the last step.  We avoided branches to a fault by using accesses
to FP values as bits to clear and restore the sign bit.  Avoiding
branches is usually good, but the bit access macros are not so good
(especially for setting FP values), and here they always caused pipeline
stalls on Athlons.  Even using branches would be faster except on args
that give perfect branch misprediction, since only mispredicted branches
cause stalls, but it possible to avoid touching the sign bit in FP
values at all (except to preserve it in conversions from bits to FP
not related to the sign bit).  Do this.  The results are identical
except in 2 of the 3 unsupported rounding modes, since all the
approximations use odd rational functions so they work right on strictly
negative values, and the special case of -0 doesn't use an approximation.
This commit is contained in:
Bruce Evans 2005-12-13 20:17:23 +00:00
parent 984922d761
commit ec761d7501
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=153386
2 changed files with 9 additions and 15 deletions

View file

@ -8,6 +8,8 @@
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*
* Optimized by Bruce D. Evans.
*/
#ifndef lint
@ -47,7 +49,6 @@ cbrt(double x)
if((hx|low)==0)
return(x); /* cbrt(0) is itself */
SET_HIGH_WORD(x,hx); /* x <- |x| */
/*
* Rough cbrt to 5 bits:
* cbrt(2**e*(1+m) ~= 2**(e/3)*(1+(e%3+m)/3)
@ -67,16 +68,16 @@ cbrt(double x)
SET_HIGH_WORD(t,0x43500000); /* set t= 2**54 */
t*=x;
GET_HIGH_WORD(high,t);
SET_HIGH_WORD(t,high/3+B2);
SET_HIGH_WORD(t,sign|((high&0x7fffffff)/3+B2));
} else
SET_HIGH_WORD(t,hx/3+B1);
SET_HIGH_WORD(t,sign|(hx/3+B1));
/* new cbrt to 23 bits; may be implemented in single precision */
r=t*t/x;
s=C+r*t;
t*=G+F/(s+E+D/s);
/* chop t to 20 bits and make it larger than cbrt(x) */
/* chop t to 20 bits and make it larger in magnitude than cbrt(x) */
GET_HIGH_WORD(high,t);
INSERT_WORDS(t,high+0x00000001,0);
@ -87,8 +88,5 @@ cbrt(double x)
r=(r-t)/(w+r); /* r-t is exact */
t=t+t*r;
/* restore the sign bit */
GET_HIGH_WORD(high,t);
SET_HIGH_WORD(t,high|sign);
return(t);
}

View file

@ -1,6 +1,6 @@
/* s_cbrtf.c -- float version of s_cbrt.c.
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
* Debugged by Bruce D. Evans.
* Debugged and optimized by Bruce D. Evans.
*/
/*
@ -50,22 +50,21 @@ cbrtf(float x)
if(hx==0)
return(x); /* cbrt(0) is itself */
SET_FLOAT_WORD(x,hx); /* x <- |x| */
/* rough cbrt to 5 bits */
if(hx<0x00800000) { /* subnormal number */
SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
t*=x;
GET_FLOAT_WORD(high,t);
SET_FLOAT_WORD(t,high/3+B2);
SET_FLOAT_WORD(t,sign|((high&0x7fffffff)/3+B2));
} else
SET_FLOAT_WORD(t,hx/3+B1);
SET_FLOAT_WORD(t,sign|(hx/3+B1));
/* new cbrt to 23 bits */
r=t*t/x;
s=C+r*t;
t*=G+F/(s+E+D/s);
/* chop t to 12 bits and make it larger than cbrt(x) */
/* chop t to 12 bits and make it larger in magnitude than cbrt(x) */
GET_FLOAT_WORD(high,t);
SET_FLOAT_WORD(t,(high&0xfffff000)+0x00001000);
@ -76,8 +75,5 @@ cbrtf(float x)
r=(r-t)/(w+r); /* r-t is exact */
t=t+t*r;
/* restore the sign bit */
GET_FLOAT_WORD(high,t);
SET_FLOAT_WORD(t,high|sign);
return(t);
}