diff --git a/contrib/arm-optimized-routines/README b/contrib/arm-optimized-routines/README index a2143a28488a..651ebdc84bc8 100644 --- a/contrib/arm-optimized-routines/README +++ b/contrib/arm-optimized-routines/README @@ -12,7 +12,7 @@ contribution requirements are documented in README.contributors of the appropriate subdirectory. Regular quarterly releases are tagged as vYY.MM, the latest -release is v23.01. +release is v24.01. Source code layout: diff --git a/contrib/arm-optimized-routines/config.mk.dist b/contrib/arm-optimized-routines/config.mk.dist index 7a8497507a81..03fb54db52fa 100644 --- a/contrib/arm-optimized-routines/config.mk.dist +++ b/contrib/arm-optimized-routines/config.mk.dist @@ -1,6 +1,6 @@ # Example config.mk # -# Copyright (c) 2018-2022, Arm Limited. +# Copyright (c) 2018-2023, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception # Subprojects to build @@ -59,13 +59,14 @@ math-cflags += -ffp-contract=fast -fno-math-errno # Use with clang. #math-cflags += -ffp-contract=fast -# Disable vector math code -#math-cflags += -DWANT_VMATH=0 - -# Disable/enable SVE vector math code and tests +# Disable/enable SVE vector math code and tests. +# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE +# routines only so that SVE code does not leak into scalar +# routines. It is also necessary to add it for tools (e.g. ulp, +# mathbench) WANT_SVE_MATH = 0 ifeq ($(WANT_SVE_MATH), 1) - math-cflags += -march=armv8.2-a+sve + math-sve-cflags = -march=armv8-a+sve endif math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) diff --git a/contrib/arm-optimized-routines/math/Dir.mk b/contrib/arm-optimized-routines/math/Dir.mk index 2a9cad10d96a..5e9494a7bd3c 100644 --- a/contrib/arm-optimized-routines/math/Dir.mk +++ b/contrib/arm-optimized-routines/math/Dir.mk @@ -1,12 +1,14 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2022, Arm Limited. +# Copyright (c) 2019-2023, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/math B := build/math math-lib-srcs := $(wildcard $(S)/*.[cS]) +math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS]) + math-test-srcs := \ $(S)/test/mathtest.c \ $(S)/test/mathbench.c \ @@ -65,6 +67,8 @@ build/lib/libmathlib.a: $(math-lib-objs) $(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc $(math-tools): LDLIBS += $(math-ldlibs) -lm +# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled +$(math-tools): CFLAGS_ALL += $(math-sve-cflags) build/bin/rtest: $(math-host-objs) $(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cos.c b/contrib/arm-optimized-routines/math/aarch64/v_cos.c new file mode 100644 index 000000000000..9a73575bce89 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_cos.c @@ -0,0 +1,87 @@ +/* + * Double-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; +} data = { + /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. 
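+     The polynomial approximates sin(r) on the reduced interval;
+     cos(x) = sin(x + pi/2) is recovered via the n - 0.5 quadrant
+     shift below, with the sign carried in odd.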
*/ + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .half_pi = V2 (0x1.921fb54442d18p+0), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), + .range_val = V2 (0x1p23) +}; + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cos, x, y, cmp); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f64 (x); + cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), + vreinterpretq_u64_f64 (d->range_val)); + if (unlikely (v_any_u64 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f64 (cmp, v_f64 (1.0), r); +#else + cmp = vcageq_f64 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + n = vsubq_f64 (n, v_f64 (0.5)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cosf.c b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c new file mode 100644 index 000000000000..b9890b2998ad --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .half_pi = V4 (0x1.921fb6p0f), + .range_val = V4 (0x1p20f) +}; + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. 
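+     The quadrant sign in odd is applied first, then scalar cosf
+     replaces the lanes flagged in cmp.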
*/ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (cosf, x, y, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, r3, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f32 (x); + cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r), + vreinterpretq_u32_f32 (d->range_val)); + if (unlikely (v_any_u32 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, v_f32 (1.0f), r); +#else + cmp = vcageq_f32 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); + n = vsubq_f32 (n, d->shift); + n = vsubq_f32 (n, v_f32 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r). */ + r2 = vmulq_f32 (r, r); + r3 = vmulq_f32 (r2, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, y, r3); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp.c b/contrib/arm-optimized-routines/math/aarch64/v_exp.c new file mode 100644 index 000000000000..bc5609faf4fc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp.c @@ -0,0 +1,125 @@ +/* + * Double-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +const static volatile struct +{ + float64x2_t poly[3]; + float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.88 +0.5 ulp + rel error: 1.4337*2^-53 + abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ + .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3), + V2 (0x1.55555da646206p-5) }, +#if !WANT_SIMD_EXCEPT + .scale_thresh = V2 (163840.0), /* 1280.0 * N. */ + .special_bound = V2 (704.0), +#endif + .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */ + .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */ + .ln2_lo = V2 (0x1.abc9e3b39803f3p-63), + .shift = V2 (0x1.8p+52) +}; + +#define C(i) data.poly[i] +#define Tab __v_exp_data + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ +# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */ +# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */ + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f64 (exp, x, y, cmp); +} + +#else + +# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. 
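+   Note that SpecialBias1 - SpecialBias2 = asuint64 (1.0).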
*/ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) +{ + float64x2_t n, r, r2, s, y, z; + uint64x2_t cmp, u, e; + +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + float64x2_t xm = x; + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound); + if (unlikely (v_any_u64 (cmp))) + x = vbslq_f64 (cmp, v_f64 (1), x); +#else + cmp = vcagtq_f64 (x, data.special_bound); +#endif + + /* n = round(x/(ln2/N)). */ + z = vfmaq_f64 (data.shift, x, data.inv_ln2); + u = vreinterpretq_u64_f64 (z); + n = vsubq_f64 (z, data.shift); + + /* r = x - n*ln2/N. */ + r = x; + r = vfmsq_f64 (r, data.ln2_hi, n); + r = vfmsq_f64 (r, data.ln2_lo, n); + + e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (C (0), C (1), r); + y = vfmaq_f64 (y, C (2), r2); + y = vfmaq_f64 (r, y, r2); + + /* s = 2^(n/N). */ + u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] }; + s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f64 (s, y, s), cmp); +#else + return special_case (s, y, n); +#endif + + return vfmaq_f64 (s, y, s); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c new file mode 100644 index 000000000000..e402205e98e6 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.962 ulp. */ + .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), + V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. 
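+     Only the lanes flagged in cmp are recomputed with scalar exp2f;
+     the remaining lanes keep the vector result y.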
*/ + return v_call_f32 (exp2f, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + n = vrndaq_f32 (x); + r = vsubq_f32 (x, n); + e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c new file mode 100644 index 000000000000..ba6b02fbb4bc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c @@ -0,0 +1,72 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const float Poly[] = { + /* maxerr: 0.878 ulp. */ + 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) +#define C5 v_f32 (Poly[5]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +{ + /* 2^n may overflow, break it up into s1*s2. 
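+     Each factor stays representable: r1 = s1*s1 gives the
+     overflow/underflow result for huge |n|, while r0 = poly*s1*s2
+     covers the remaining special lanes.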
*/ + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_exp2f_1u (float32x4_t x) +{ + float32x4_t n, r, scale, poly, absn; + uint32x4_t cmp, e; + + /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ +#if 0 + float32x4_t z; + z = x + Shift; + n = z - Shift; + r = x - n; + e = vreinterpretq_u32_f32 (z) << 23; +#else + n = vrndaq_f32 (x); + r = x - n; + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; +#endif + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (C5, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn); + return scale * poly; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c b/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c new file mode 100644 index 000000000000..45f0848cac5b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c @@ -0,0 +1,146 @@ +/* + * Lookup table for double-precision e^x vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +# define N (1 << V_EXP_TABLE_BITS) + +/* 2^(j/N), j=0..N. 
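+   v_exp.c indexes this table with the low V_EXP_TABLE_BITS bits of the
+   rounded scaled argument and adds the remaining bits into the exponent
+   field to reconstruct the scale s = 2^(n/N).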
*/ +const uint64_t __v_exp_data[] = { +# if N == 128 + 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, + 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, + 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, + 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, + 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, + 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, + 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, + 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, + 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, + 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, + 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, + 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, + 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, + 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, + 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, + 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, + 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, + 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, + 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, + 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, + 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, + 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, + 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, + 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, + 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, + 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, + 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, + 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, + 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, + 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, + 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, + 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, + 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, + 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, + 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, + 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, + 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c, + 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, + 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, + 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, + 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, + 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, + 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, +# elif N == 256 + 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 
0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 
0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9, +# endif +}; diff --git a/contrib/arm-optimized-routines/math/aarch64/v_expf.c b/contrib/arm-optimized-routines/math/aarch64/v_expf.c new file mode 100644 index 000000000000..34e8b6081bcd --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_expf.c @@ -0,0 +1,122 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.45358 +0.5 ulp. */ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, + .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .ln2_hi = V4 (0x1.62e4p-1f), + .ln2_lo = V4 (0x1.7f7d1cp-20f), + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (expf, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. 
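+     cmp1 selects the split evaluation only for lanes that need it;
+     cmp2 further overrides the result with s1*s1 for the largest |n|.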
*/ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly, z; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ + cmp = vcgeq_u32 ( + vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), + TinyBound), + SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + z = vfmaq_f32 (d->shift, x, d->inv_ln2); + n = vsubq_f32 (z, d->shift); + r = vfmsq_f32 (x, n, d->ln2_hi); + r = vfmsq_f32 (r, n, d->ln2_lo); + e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c new file mode 100644 index 000000000000..43d03fa34efa --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c @@ -0,0 +1,77 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const float Poly[] = { + /* maxerr: 0.36565 +0.5 ulp. */ + 0x1.6a6000p-10f, + 0x1.12718ep-7f, + 0x1.555af0p-5f, + 0x1.555430p-3f, + 0x1.fffff4p-2f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_expf_1u (float32x4_t x) +{ + float32x4_t n, r, scale, poly, absn, z; + uint32x4_t cmp, e; + + /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
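+     For example, x = 1 gives n = 1 and r = 1 - ln2 ~= 0.30685, so the
+     result is 2^1 * poly(0.30685) ~= 2.71828 = e.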
*/ +#if 1 + z = vfmaq_f32 (Shift, x, InvLn2); + n = z - Shift; + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_f32 (z) << 23; +#else + z = x * InvLn2; + n = vrndaq_f32 (z); + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23; +#endif + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn); + return scale * poly; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_log.c b/contrib/arm-optimized-routines/math/aarch64/v_log.c new file mode 100644 index 000000000000..1d1c1fa62c04 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_log.c @@ -0,0 +1,100 @@ +/* + * Double-precision vector log(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + uint64x2_t min_norm; + uint32x4_t special_bound; + float64x2_t poly[5]; + float64x2_t ln2; + uint64x2_t sign_exp_mask; +} data = { + /* Worst-case error: 1.17 + 0.5 ulp. + Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), + V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), + V2 (-0x1.554e550bd501ep-3) }, + .ln2 = V2 (0x1.62e42fefa39efp-1), + .min_norm = V2 (0x0010000000000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ + .sign_exp_mask = V2 (0xfff0000000000000) +}; + +#define A(i) d->poly[i] +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) +#define Off v_u64 (0x3fe6900900000000) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, + uint32x2_t cmp) +{ + return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t z, r, r2, p, y, kd, hi; + uint64x2_t ix, iz, tmp; + uint32x2_t cmp; + int64x2_t k; + struct entry e; + + ix = vreinterpretq_u64_f64 (x); + cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), + vget_low_u32 (d->special_bound)); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = vsubq_u64 (ix, Off); + k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ + iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + z = vreinterpretq_f64_u64 (iz); + e = lookup (tmp); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. 
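+     r = z/c - 1 is computed with a single fma (z*invc - 1), log1p(r)
+     is evaluated as r + r2*poly(r), and hi collects the dominant terms
+     r + log(c) + k*ln2.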
*/ + r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (A (2), A (3), r); + p = vfmaq_f64 (A (0), A (1), r); + y = vfmaq_f64 (y, A (4), r2); + y = vfmaq_f64 (p, y, r2); + + if (unlikely (v_any_u32h (cmp))) + return special_case (x, y, hi, r2, cmp); + return vfmaq_f64 (hi, y, r2); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_log_data.c b/contrib/arm-optimized-routines/math/aarch64/v_log_data.c new file mode 100644 index 000000000000..82351bb14766 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_log_data.c @@ -0,0 +1,156 @@ +/* + * Lookup table for double-precision log(x) vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#define N (1 << V_LOG_TABLE_BITS) + +const struct v_log_data __v_log_data = { + /* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + poly(z/c - 1) + + where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, + N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables: + + table[i].invc = 1/c + table[i].logc = (double)log(c) + + where c is near the center of the subinterval and is chosen by trying several + floating point invc candidates around 1/center and selecting one for which + the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval + that contains 1 and the previous one got tweaked to avoid cancellation. */ + .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, + { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, + { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, + { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, + { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, + { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, + { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, + { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, + { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, + { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, + { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, + { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, + { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, + { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, + { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, + { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, + { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, + { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, + { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, + { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, + { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, + { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, + { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, + { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, + { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, + { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, + { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, + { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, + 
{ 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, + { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, + { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, + { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, + { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, + { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, + { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, + { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, + { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, + { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, + { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, + { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, + { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, + { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, + { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, + { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, + { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, + { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, + { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, + { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, + { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, + { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, + { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, + { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, + { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, + { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, + { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, + { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 }, + { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, + { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, + { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, + { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, + { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, + { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, + { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, + { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, + { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, + { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, + { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, + { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, + { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, + { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, + { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, + { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, + { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, + { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, + { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, + { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, + 
{ 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, + { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, + { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, + { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, + { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, + { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, + { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, + { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 }, + { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, + { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, + { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, + { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } +}; diff --git a/contrib/arm-optimized-routines/math/aarch64/v_logf.c b/contrib/arm-optimized-routines/math/aarch64/v_logf.c new file mode 100644 index 000000000000..66ebbbcd2b5a --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_logf.c @@ -0,0 +1,74 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + uint32x4_t min_norm; + uint16x8_t special_bound; + float32x4_t poly[7]; + float32x4_t ln2, tiny_bound; + uint32x4_t off, mantissa_mask; +} data = { + /* 3.34 ulp error. */ + .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), + V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), + V4 (-0x1.ffffc8p-2f) }, + .ln2 = V4 (0x1.62e43p-1f), + .tiny_bound = V4 (0x1p-126), + .min_norm = V4 (0x00800000), + .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff) +}; + +#define P(i) d->poly[7 - i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, + uint16x4_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, p, q, r, r2, y; + uint32x4_t u; + uint16x4_t cmp; + + u = vreinterpretq_u32_f32 (x); + cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), + vget_low_u16 (d->special_bound)); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = vsubq_u32 (u, d->off); + n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vandq_u32 (u, d->mantissa_mask); + u = vaddq_u32 (u, d->off); + r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ + r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). 
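+     The three pairs below are evaluated in parallel and recombined
+     with powers of r2 (Estrin-style), shortening the fma dependency
+     chain relative to plain Horner evaluation.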
*/ + p = vfmaq_f32 (P (5), P (6), r); + q = vfmaq_f32 (P (3), P (4), r); + y = vfmaq_f32 (P (1), P (2), r); + p = vfmaq_f32 (p, P (7), r2); + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + + if (unlikely (v_any_u16h (cmp))) + return special_case (x, y, r2, p, cmp); + return vfmaq_f32 (p, y, r2); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_math.h b/contrib/arm-optimized-routines/math/aarch64/v_math.h new file mode 100644 index 000000000000..1dc9916c6fb0 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_math.h @@ -0,0 +1,135 @@ +/* + * Vector math abstractions. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _V_MATH_H +#define _V_MATH_H + +#if !__aarch64__ +# error "Cannot build without AArch64" +#endif + +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) + +#define V_NAME_F1(fun) _ZGVnN4v_##fun##f +#define V_NAME_D1(fun) _ZGVnN2v_##fun +#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f +#define V_NAME_D2(fun) _ZGVnN2vv_##fun + +#include +#include "../math_config.h" +#include + +/* Shorthand helpers for declaring constants. */ +# define V2(X) { X, X } +# define V4(X) { X, X, X, X } +# define V8(X) { X, X, X, X, X, X, X, X } + +static inline int +v_any_u16h (uint16x4_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; +} + +static inline int +v_lanes32 (void) +{ + return 4; +} + +static inline float32x4_t +v_f32 (float x) +{ + return (float32x4_t) V4 (x); +} +static inline uint32x4_t +v_u32 (uint32_t x) +{ + return (uint32x4_t) V4 (x); +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (uint32x4_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +static inline int +v_any_u32h (uint32x2_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; +} +static inline float32x4_t +v_lookup_f32 (const float *tab, uint32x4_t idx) +{ + return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline uint32x4_t +v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) +{ + return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline float32x4_t +v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline float32x4_t +v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, + float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], + p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline float64x2_t +v_f64 (double x) +{ + return (float64x2_t) V2 (x); +} +static inline uint64x2_t +v_u64 (uint64_t x) +{ + return (uint64x2_t) V2 (x); +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (uint64x2_t x) +{ + /* assume elements in x are either 0 or -1u. 
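+     The 64-bit pairwise add across the two lanes is then non-zero
+     iff at least one lane is set.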
*/ + return vpaddd_u64 (x) != 0; +} +static inline float64x2_t +v_lookup_f64 (const double *tab, uint64x2_t idx) +{ + return (float64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline uint64x2_t +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) +{ + return (uint64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline float64x2_t +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) +{ + double p1 = p[1]; + double x1 = x[1]; + if (likely (p[0])) + y[0] = f (x[0]); + if (likely (p1)) + y[1] = f (x1); + return y; +} + +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/v_pow.c b/contrib/arm-optimized-routines/math/aarch64/v_pow.c new file mode 100644 index 000000000000..734f1663a283 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_pow.c @@ -0,0 +1,22 @@ +/* + * Double-precision vector pow function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +{ + float64x2_t z; + for (int lane = 0; lane < v_lanes64 (); lane++) + { + double sx = x[lane]; + double sy = y[lane]; + double sz = pow (sx, sy); + z[lane] = sz; + } + return z; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_powf.c b/contrib/arm-optimized-routines/math/aarch64/v_powf.c new file mode 100644 index 000000000000..3a4163ab0558 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_powf.c @@ -0,0 +1,148 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Thresh v_u32 (0x7f000000) /* Max - Min. */ +#define MantissaMask v_u32 (0x007fffff) + +#define A data.log2_poly +#define C data.exp2f_poly + +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). 
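+   That is, roughly the final rounding (0.5 ulp) plus the intermediate
+   relative errors magnified by the largest scaled exponent and
+   converted to single-precision ulps by the 2^24 factor.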
*/ +#define Off v_u32 (0x3f35d000) + +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_EXP2F_TABLE_BITS 5 +#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1) +#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) + +static const struct +{ + struct + { + double invc, logc; + } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; + double log2_poly[4]; + uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; + double exp2f_poly[3]; +} data = { + .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, + {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, + {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, + {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, + {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, + {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, + {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, + {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, + {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, + {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, + {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, + {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, + {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, + {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, + {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, + {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, + {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, + {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, + {0x1p+0, 0x0p+0 * Scale}, + {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, + {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, + {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, + {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, + {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, + {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, + {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, + {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, + {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, + {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, + {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, + {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, + {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, + .log2_poly = { /* rel err: 1.5 * 2^-30. */ + -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale, + -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,}, + .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, + .exp2f_poly = { /* rel err: 1.69 * 2^-34. 
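+   The coefficients are pre-divided by powers of Scale so the
+   polynomial can be evaluated directly in r = ylogx - kd, i.e. in
+   table-step units of 1/N of an exponent, without rescaling.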
*/ + 0x1.c6af84b912394p-5 / Scale / Scale / Scale, + 0x1.ebfce50fac4f3p-3 / Scale / Scale, + 0x1.62e42ff0c52d6p-1 / Scale}}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) +{ + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); + uint32x4_t tmp = vsubq_u32 (u, Off); + uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + Log2IdxMask); + uint32x4_t top = vbicq_u32 (tmp, MantissaMask); + uint32x4_t iz = vsubq_u32 (u, top); + int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), + 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ + + float32x4_t ret; + for (int lane = 0; lane < 4; lane++) + { + /* Use double precision for each lane. */ + double invc = data.log2_tab[i[lane]].invc; + double logc = data.log2_tab[i[lane]].logc; + double z = (double) asfloat (iz[lane]); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + double r = __builtin_fma (z, invc, -1.0); + double y0 = logc + (double) k[lane]; + + /* Polynomial to approximate log1p(r)/ln2. */ + double logx = A[0]; + logx = r * logx + A[1]; + logx = r * logx + A[2]; + logx = r * logx + A[3]; + logx = r * logx + y0; + double ylogx = y[lane] * logx; + cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff) + >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47 + ? 1 + : cmp[lane]; + + /* N*x = k + r with r in [-1/2, 1/2]. */ + double kd = round (ylogx); + uint64_t ki = lround (ylogx); + r = ylogx - kd; + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)]; + t += ki << (52 - V_EXP2F_TABLE_BITS); + double s = asdouble (t); + double p = C[0]; + p = __builtin_fma (p, r, C[1]); + p = __builtin_fma (p, r, C[2]); + p = __builtin_fma (p, s * r, s); + + ret[lane] = p; + } + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, ret, cmp); + return ret; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_sin.c b/contrib/arm-optimized-routines/math/aarch64/v_sin.c new file mode 100644 index 000000000000..04129c31133d --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_sin.c @@ -0,0 +1,97 @@ +/* + * Double-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + + .range_val = V2 (0x1p23), + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ +# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. 
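+   The unsigned compare |x| - TinyBound >= Thresh below flags both the
+   tiny and the out-of-range lanes at once, since the subtraction wraps
+   for |x| < TinyBound.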
*/ +#endif + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (sin, x, y, cmp); +} + +/* Vector (AdvSIMD) sin approximation. + Maximum observed error in [-pi/2, pi/2], where argument is not reduced, + is 2.87 ULP: + _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 + want 0x1.fffffffa7dc05p-1 + Maximum observed error in the entire non-special domain ([-2^23, 2^23]) + is 3.22 ULP: + _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 + want 0x1.ffdcd125c84f8p-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, y, t1, t2, t3; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be + triggered correctly, set any special lanes to 1 (which is neutral w.r.t. + fenv). These lanes will be fixed by special-case handler later. */ + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); + r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); +#else + r = x; + cmp = vcageq_f64 (x, d->range_val); +#endif + + /* n = rint(|x|/pi). */ + n = vfmaq_f64 (d->shift, d->inv_pi, r); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_sinf.c b/contrib/arm-optimized-routines/math/aarch64/v_sinf.c new file mode 100644 index 000000000000..336879844459 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_sinf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .range_val = V4 (0x1p20f) +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ +# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ +#endif + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. 
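+     The quadrant sign is folded into y first so that lanes not
+     selected by cmp keep the fast-path result; v_call_f32 (defined
+     in v_math.h, behaviour assumed here) then recomputes only the
+     flagged lanes, roughly:
+
+       for (int i = 0; i < 4; i++)
+         if (cmp[i])
+           y[i] = sinf (x[i]);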
*/ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (sinf, x, y, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); +#else + r = x; + cmp = vcageq_f32 (x, d->range_val); +#endif + + /* n = rint(|x|/pi) */ + n = vfmaq_f32 (d->shift, d->inv_pi, r); + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); + n = vsubq_f32 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r) */ + r2 = vmulq_f32 (r, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, vmulq_f32 (y, r2), r); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/exp10.c b/contrib/arm-optimized-routines/math/exp10.c new file mode 100644 index 000000000000..0fbec4c694ca --- /dev/null +++ b/contrib/arm-optimized-routines/math/exp10.c @@ -0,0 +1,129 @@ +/* + * Double-precision 10^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) +#define IndexMask (N - 1) +#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */ +#define UFlowBound -0x1.5ep+8 /* -350. */ +#define SmallTop 0x3c6 /* top12(0x1p-57). */ +#define BigTop 0x407 /* top12(0x1p8). */ +#define Thresh 0x41 /* BigTop - SmallTop. */ +#define Shift __exp_data.shift +#define C(i) __exp_data.exp10_poly[i] + +static double +special_case (uint64_t sbits, double_t tmp, uint64_t ki) +{ + double_t scale, y; + + if (ki - (1ull << 16) < 0x80000000) + { + /* The exponent of scale might have overflowed by 1. */ + sbits -= 1ull << 52; + scale = asdouble (sbits); + y = 2 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + + /* n < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble (sbits); + y = scale + scale * tmp; + + if (y < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t lo = scale - y + scale * tmp; + double_t hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double (hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + + return check_uflow (y); +} + +/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. 
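+   Reduction sketch: with N = 2^EXP_TABLE_BITS, write
+   10^x = 2^(k/N) * 10^r, where k = round (x * N / log10(2)) and
+   r = x - k * log10(2) / N, so |r| <= log10(2) / (2N). 2^(k/N) is
+   read from the shared exp table and 10^r is approximated below as
+   1 + r * Poly(r), whose leading coefficient is ln(10).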
*/ +double +exp10 (double x) +{ + uint64_t ix = asuint64 (x); + uint32_t abstop = (ix >> 52) & 0x7ff; + + if (unlikely (abstop - SmallTop >= Thresh)) + { + if (abstop - SmallTop >= 0x80000000) + /* Avoid spurious underflow for tiny x. + Note: 0 is common input. */ + return x + 1; + if (abstop == 0x7ff) + return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0; + if (x >= OFlowBound) + return __math_oflow (0); + if (x < UFlowBound) + return __math_uflow (0); + + /* Large x is special-cased below. */ + abstop = 0; + } + + /* Reduce x: z = x * N / log10(2), k = round(z). */ + double_t z = __exp_data.invlog10_2N * x; + double_t kd; + int64_t ki; +#if TOINT_INTRINSICS + kd = roundtoint (z); + ki = converttoint (z); +#else + kd = eval_as_double (z + Shift); + kd -= Shift; + ki = kd; +#endif + + /* r = x - k * log10(2), r in [-0.5, 0.5]. */ + double_t r = x; + r = __exp_data.neglog10_2hiN * kd + r; + r = __exp_data.neglog10_2loN * kd + r; + + /* exp10(x) = 2^(k/N) * 2^(r/N). + Approximate the two components separately. */ + + /* s = 2^(k/N), using lookup table. */ + uint64_t e = ki << (52 - EXP_TABLE_BITS); + uint64_t i = (ki & IndexMask) * 2; + uint64_t u = __exp_data.tab[i + 1]; + uint64_t sbits = u + e; + + double_t tail = asdouble (__exp_data.tab[i]); + + /* 2^(r/N) ~= 1 + r * Poly(r). */ + double_t r2 = r * r; + double_t p = C (0) + r * C (1); + double_t y = C (2) + r * C (3); + y = y + r2 * C (4); + y = p + r2 * y; + y = tail + y * r; + + if (unlikely (abstop == 0)) + return special_case (sbits, y, ki); + + /* Assemble components: + y = 2^(r/N) * 2^(k/N) + ~= (y + 1) * s. */ + double_t s = asdouble (sbits); + return eval_as_double (s * y + s); +} diff --git a/contrib/arm-optimized-routines/math/exp_data.c b/contrib/arm-optimized-routines/math/exp_data.c index 714c845709aa..c20b1b2d3e06 100644 --- a/contrib/arm-optimized-routines/math/exp_data.c +++ b/contrib/arm-optimized-routines/math/exp_data.c @@ -1,7 +1,7 @@ /* * Shared data between exp, exp2 and pow. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -12,6 +12,7 @@ const struct exp_data __exp_data = { // N/ln2 .invln2N = 0x1.71547652b82fep0 * N, +.invlog10_2N = 0x1.a934f0979a371p1 * N, // -ln2/N #if N == 64 .negln2hiN = -0x1.62e42fefa0000p-7, @@ -26,6 +27,8 @@ const struct exp_data __exp_data = { .negln2hiN = -0x1.62e42fef80000p-10, .negln2loN = -0x1.1cf79abc9e3b4p-45, #endif +.neglog10_2hiN = -0x1.3441350ap-2 / N, +.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N, // Used for rounding when !TOINT_INTRINSICS #if EXP_USE_TOINT_NARROW .shift = 0x1800000000.8p0, @@ -147,6 +150,24 @@ const struct exp_data __exp_data = { 0x1.3b2ab786ee1dap-7, #endif }, +.exp10_poly = { +#if EXP10_POLY_WIDE +/* Range is wider if using shift-based reduction: coeffs generated + using Remez in [-log10(2)/128, log10(2)/128 ]. */ +0x1.26bb1bbb55515p1, +0x1.53524c73cd32bp1, +0x1.0470591e1a108p1, +0x1.2bd77b12fe9a8p0, +0x1.14289fef24b78p-1 +#else +/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. 
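+   The coefficients approximate (10^r - 1) / r by a degree-4
+   polynomial, so the leading term below is ln(10) rounded to double;
+   exp10.c evaluates 10^r ~= 1 + r * Poly(r) with them.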
*/ +0x1.26bb1bbb55516p1, +0x1.53524c73ce9fep1, +0x1.0470591ce4b26p1, +0x1.2bd76577fe684p0, +0x1.1446eeccd0efbp-1 +#endif +}, // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) // tab[2*k] = asuint64(T[k]) // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N diff --git a/contrib/arm-optimized-routines/math/include/mathlib.h b/contrib/arm-optimized-routines/math/include/mathlib.h index c520c3772f7f..64cbb9c1f850 100644 --- a/contrib/arm-optimized-routines/math/include/mathlib.h +++ b/contrib/arm-optimized-routines/math/include/mathlib.h @@ -1,7 +1,7 @@ /* * Public API. * - * Copyright (c) 2015-2020, Arm Limited. + * Copyright (c) 2015-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -18,74 +18,33 @@ float cosf (float); void sincosf (float, float*, float*); double exp (double); +double exp10 (double); double exp2 (double); double log (double); double log2 (double); double pow (double, double); -/* Scalar functions using the vector algorithm with identical result. */ -float __s_sinf (float); -float __s_cosf (float); -float __s_expf (float); -float __s_expf_1u (float); -float __s_exp2f (float); -float __s_exp2f_1u (float); -float __s_logf (float); -float __s_powf (float, float); -double __s_sin (double); -double __s_cos (double); -double __s_exp (double); -double __s_log (double); -double __s_pow (double, double); - #if __aarch64__ -#if __GNUC__ >= 5 +# if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; -#elif __clang_major__*100+__clang_minor__ >= 305 +# elif __clang_major__*100+__clang_minor__ >= 305 typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -#else -#error Unsupported compiler -#endif +# else +# error Unsupported compiler +# endif -/* Vector functions following the base PCS. */ -__f32x4_t __v_sinf (__f32x4_t); -__f32x4_t __v_cosf (__f32x4_t); -__f32x4_t __v_expf (__f32x4_t); -__f32x4_t __v_expf_1u (__f32x4_t); -__f32x4_t __v_exp2f (__f32x4_t); -__f32x4_t __v_exp2f_1u (__f32x4_t); -__f32x4_t __v_logf (__f32x4_t); -__f32x4_t __v_powf (__f32x4_t, __f32x4_t); -__f64x2_t __v_sin (__f64x2_t); -__f64x2_t __v_cos (__f64x2_t); -__f64x2_t __v_exp (__f64x2_t); -__f64x2_t __v_log (__f64x2_t); -__f64x2_t __v_pow (__f64x2_t, __f64x2_t); - -#if __GNUC__ >= 9 || __clang_major__ >= 8 -#define __vpcs __attribute__((__aarch64_vector_pcs__)) - -/* Vector functions following the vector PCS. */ -__vpcs __f32x4_t __vn_sinf (__f32x4_t); -__vpcs __f32x4_t __vn_cosf (__f32x4_t); -__vpcs __f32x4_t __vn_expf (__f32x4_t); -__vpcs __f32x4_t __vn_expf_1u (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t); -__vpcs __f32x4_t __vn_logf (__f32x4_t); -__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t); -__vpcs __f64x2_t __vn_sin (__f64x2_t); -__vpcs __f64x2_t __vn_cos (__f64x2_t); -__vpcs __f64x2_t __vn_exp (__f64x2_t); -__vpcs __f64x2_t __vn_log (__f64x2_t); -__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); +# if __GNUC__ >= 9 || __clang_major__ >= 8 +# undef __vpcs +# define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS using ABI names. 
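   The names follow the AArch64 vector function ABI: _ZGV, then 'n'
   for AdvSIMD, 'N' for an unmasked variant, the lane count, and one
   'v' per vector argument; e.g. _ZGVnN4v_sinf is the 4-lane sinf and
   _ZGVnN2vv_pow the 2-lane pow taking two vector arguments.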
*/ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); @@ -94,7 +53,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); -#endif +# endif #endif #endif diff --git a/contrib/arm-optimized-routines/math/math_config.h b/contrib/arm-optimized-routines/math/math_config.h index 7ffc0cd2796a..faf77b31fc99 100644 --- a/contrib/arm-optimized-routines/math/math_config.h +++ b/contrib/arm-optimized-routines/math/math_config.h @@ -1,7 +1,7 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2020, Arm Limited. + * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -92,6 +92,46 @@ # define unlikely(x) (x) #endif +/* Return ptr but hide its value from the compiler so accesses through it + cannot be optimized based on the contents. */ +#define ptr_barrier(ptr) \ + ({ \ + __typeof (ptr) __ptr = (ptr); \ + __asm("" : "+r"(__ptr)); \ + __ptr; \ + }) + +/* Symbol renames to avoid libc conflicts. */ +#define __math_oflowf arm_math_oflowf +#define __math_uflowf arm_math_uflowf +#define __math_may_uflowf arm_math_may_uflowf +#define __math_divzerof arm_math_divzerof +#define __math_oflow arm_math_oflow +#define __math_uflow arm_math_uflow +#define __math_may_uflow arm_math_may_uflow +#define __math_divzero arm_math_divzero +#define __math_invalidf arm_math_invalidf +#define __math_invalid arm_math_invalid +#define __math_check_oflow arm_math_check_oflow +#define __math_check_uflow arm_math_check_uflow +#define __math_check_oflowf arm_math_check_oflowf +#define __math_check_uflowf arm_math_check_uflowf + +#define __sincosf_table arm_math_sincosf_table +#define __inv_pio4 arm_math_inv_pio4 +#define __exp2f_data arm_math_exp2f_data +#define __logf_data arm_math_logf_data +#define __log2f_data arm_math_log2f_data +#define __powf_log2_data arm_math_powf_log2_data +#define __exp_data arm_math_exp_data +#define __log_data arm_math_log_data +#define __log2_data arm_math_log2_data +#define __pow_log_data arm_math_pow_log_data +#define __erff_data arm_math_erff_data +#define __erf_data arm_math_erf_data +#define __v_exp_data arm_math_v_exp_data +#define __v_log_data arm_math_v_log_data + #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ @@ -381,15 +421,22 @@ extern const struct powf_log2_data #define EXP_USE_TOINT_NARROW 0 #define EXP2_POLY_ORDER 5 #define EXP2_POLY_WIDE 0 +/* Wider exp10 polynomial necessary for good precision in non-nearest rounding + and !TOINT_INTRINSICS. */ +#define EXP10_POLY_WIDE 0 extern const struct exp_data { double invln2N; + double invlog10_2N; double shift; double negln2hiN; double negln2loN; + double neglog10_2hiN; + double neglog10_2loN; double poly[4]; /* Last four coefficients. 
*/ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; + double exp10_poly[5]; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; @@ -459,4 +506,16 @@ extern const struct erf_data double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; } __erf_data HIDDEN; +#define V_EXP_TABLE_BITS 7 +extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; + +#define V_LOG_TABLE_BITS 7 +extern const struct v_log_data +{ + struct + { + double invc, logc; + } table[1 << V_LOG_TABLE_BITS]; +} __v_log_data HIDDEN; + #endif diff --git a/contrib/arm-optimized-routines/math/s_cos.c b/contrib/arm-optimized-routines/math/s_cos.c deleted file mode 100644 index e66d563d15b5..000000000000 --- a/contrib/arm-optimized-routines/math/s_cos.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cos.c" diff --git a/contrib/arm-optimized-routines/math/s_cosf.c b/contrib/arm-optimized-routines/math/s_cosf.c deleted file mode 100644 index f615d260b39b..000000000000 --- a/contrib/arm-optimized-routines/math/s_cosf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cosf.c" diff --git a/contrib/arm-optimized-routines/math/s_exp.c b/contrib/arm-optimized-routines/math/s_exp.c deleted file mode 100644 index 5da0099e3c65..000000000000 --- a/contrib/arm-optimized-routines/math/s_exp.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_exp.c" diff --git a/contrib/arm-optimized-routines/math/s_exp2f.c b/contrib/arm-optimized-routines/math/s_exp2f.c deleted file mode 100644 index dcbfea9e1e79..000000000000 --- a/contrib/arm-optimized-routines/math/s_exp2f.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_exp2f.c" diff --git a/contrib/arm-optimized-routines/math/s_exp2f_1u.c b/contrib/arm-optimized-routines/math/s_exp2f_1u.c deleted file mode 100644 index bf387e44cfb2..000000000000 --- a/contrib/arm-optimized-routines/math/s_exp2f_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_exp2f_1u.c" diff --git a/contrib/arm-optimized-routines/math/s_expf.c b/contrib/arm-optimized-routines/math/s_expf.c deleted file mode 100644 index dacda7fb4fd5..000000000000 --- a/contrib/arm-optimized-routines/math/s_expf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expf.c" diff --git a/contrib/arm-optimized-routines/math/s_expf_1u.c b/contrib/arm-optimized-routines/math/s_expf_1u.c deleted file mode 100644 index 00096449f7a5..000000000000 --- a/contrib/arm-optimized-routines/math/s_expf_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expf_1u.c" diff --git a/contrib/arm-optimized-routines/math/s_log.c b/contrib/arm-optimized-routines/math/s_log.c deleted file mode 100644 index 27d2eb290f56..000000000000 --- a/contrib/arm-optimized-routines/math/s_log.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log.c" diff --git a/contrib/arm-optimized-routines/math/s_logf.c b/contrib/arm-optimized-routines/math/s_logf.c deleted file mode 100644 index 7d98b2ba15c4..000000000000 --- a/contrib/arm-optimized-routines/math/s_logf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_logf.c" diff --git a/contrib/arm-optimized-routines/math/s_pow.c b/contrib/arm-optimized-routines/math/s_pow.c deleted file mode 100644 index 6eca2b2b17f1..000000000000 --- a/contrib/arm-optimized-routines/math/s_pow.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_pow.c" diff --git a/contrib/arm-optimized-routines/math/s_powf.c b/contrib/arm-optimized-routines/math/s_powf.c deleted file mode 100644 index 1d55d90df7b2..000000000000 --- a/contrib/arm-optimized-routines/math/s_powf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_powf.c" diff --git a/contrib/arm-optimized-routines/math/s_sin.c b/contrib/arm-optimized-routines/math/s_sin.c deleted file mode 100644 index 0c6171259c0c..000000000000 --- a/contrib/arm-optimized-routines/math/s_sin.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_sin.c" diff --git a/contrib/arm-optimized-routines/math/s_sinf.c b/contrib/arm-optimized-routines/math/s_sinf.c deleted file mode 100644 index 3aae61149618..000000000000 --- a/contrib/arm-optimized-routines/math/s_sinf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_sinf.c" diff --git a/contrib/arm-optimized-routines/math/test/mathbench.c b/contrib/arm-optimized-routines/math/test/mathbench.c index 6e18e36fbcb2..ed7e89bb7710 100644 --- a/contrib/arm-optimized-routines/math/test/mathbench.c +++ b/contrib/arm-optimized-routines/math/test/mathbench.c @@ -1,7 +1,7 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2022, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -15,11 +15,6 @@ #include #include "mathlib.h" -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - /* Number of measurements, best result is reported. */ #define MEASURE 60 /* Array size. 
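   Each runner walks an array of N precomputed inputs: the "rthruput"
   runners issue independent calls across the array, while the
   "latency" runners feed each result back into the next argument so
   the calls serialize. In the vector runners below, that dependency
   is created by a bit-select against an all-zero mask read from a
   volatile, which leaves the inputs unchanged but cannot be
   optimized away.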
*/ @@ -34,8 +29,9 @@ static float Af[N]; static long measurecount = MEASURE; static long itercount = ITER; -#if __aarch64__ && WANT_VMATH -typedef __f64x2_t v_double; +#ifdef __vpcs +#include +typedef float64x2_t v_double; #define v_double_len() 2 @@ -51,7 +47,7 @@ v_double_dup (double x) return (v_double){x, x}; } -typedef __f32x4_t v_float; +typedef float32x4_t v_float; #define v_float_len() 4 @@ -66,6 +62,19 @@ v_float_dup (float x) { return (v_float){x, x, x, x}; } +#else +/* dummy definitions to make things compile. */ +typedef double v_double; +typedef float v_float; +#define v_double_len(x) 1 +#define v_double_load(x) (x)[0] +#define v_double_dup(x) (x) +#define v_float_len(x) 1 +#define v_float_load(x) (x)[0] +#define v_float_dup(x) (x) + +#endif + #if WANT_SVE_MATH #include typedef svbool_t sv_bool; @@ -102,17 +111,10 @@ sv_float_dup (float x) { return svdup_n_f32(x); } -#endif #else /* dummy definitions to make things compile. */ -typedef double v_double; -typedef float v_float; -#define v_double_len(x) 1 -#define v_double_load(x) (x)[0] -#define v_double_dup(x) (x) -#define v_float_len(x) 1 -#define v_float_load(x) (x)[0] -#define v_float_dup(x) (x) +#define sv_double_len(x) 1 +#define sv_float_len(x) 1 #endif static double @@ -126,20 +128,6 @@ dummyf (float x) { return x; } -#if WANT_VMATH -#if __aarch64__ -static v_double -__v_dummy (v_double x) -{ - return x; -} - -static v_float -__v_dummyf (v_float x) -{ - return x; -} - #ifdef __vpcs __vpcs static v_double __vn_dummy (v_double x) @@ -166,8 +154,6 @@ __sv_dummyf (sv_float x, sv_bool pg) return x; } -#endif -#endif #endif #include "test/mathbench_wrappers.h" @@ -183,8 +169,6 @@ static const struct fun { double (*d) (double); float (*f) (float); - v_double (*vd) (v_double); - v_float (*vf) (v_float); #ifdef __vpcs __vpcs v_double (*vnd) (v_double); __vpcs v_float (*vnf) (v_float); @@ -197,18 +181,12 @@ static const struct fun } funtab[] = { #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, -#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, -#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) F (dummyf, 1.0, 2.0) -#if WANT_VMATH -#if __aarch64__ -VD (__v_dummy, 1.0, 2.0) -VF (__v_dummyf, 1.0, 2.0) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) VNF (__vn_dummyf, 1.0, 2.0) @@ -217,14 +195,10 @@ VNF (__vn_dummyf, 1.0, 2.0) SVD (__sv_dummy, 1.0, 2.0) SVF (__sv_dummyf, 1.0, 2.0) #endif -#endif -#endif #include "test/mathbench_funcs.h" {0}, #undef F #undef D -#undef VF -#undef VD #undef VNF #undef VND #undef SVF @@ -327,38 +301,6 @@ runf_latency (float f (float)) prev = f (Af[i] + prev * z); } -static void -run_v_thruput (v_double f (v_double)) -{ - for (int i = 0; i < N; i += v_double_len ()) - f (v_double_load (A+i)); -} - -static void -runf_v_thruput (v_float f (v_float)) -{ - for (int i = 0; i < N; i += v_float_len ()) - f (v_float_load (Af+i)); -} - -static void -run_v_latency (v_double f (v_double)) -{ - v_double z = v_double_dup (zero); - v_double prev = z; - for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); -} - -static void -runf_v_latency (v_float f (v_float)) -{ - v_float z = v_float_dup (zero); - 
v_float prev = z; - for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); -} - #ifdef __vpcs static void run_vn_thruput (__vpcs v_double f (v_double)) @@ -377,19 +319,21 @@ runf_vn_thruput (__vpcs v_float f (v_float)) static void run_vn_latency (__vpcs v_double f (v_double)) { - v_double z = v_double_dup (zero); - v_double prev = z; + volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; + uint64x2_t sel = vsel; + v_double prev = v_double_dup (0); for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + prev = f (vbslq_f64 (sel, prev, v_double_load (A+i))); } static void runf_vn_latency (__vpcs v_float f (v_float)) { - v_float z = v_float_dup (zero); - v_float prev = z; + volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; + uint32x4_t sel = vsel; + v_float prev = v_float_dup (0); for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i))); } #endif @@ -411,19 +355,21 @@ runf_sv_thruput (sv_float f (sv_float, sv_bool)) static void run_sv_latency (sv_double f (sv_double, sv_bool)) { - sv_double z = sv_double_dup (zero); - sv_double prev = z; + volatile sv_bool vsel = svptrue_b64 (); + sv_bool sel = vsel; + sv_double prev = sv_double_dup (0); for (int i = 0; i < N; i += sv_double_len ()) - prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ()); + prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ()); } static void runf_sv_latency (sv_float f (sv_float, sv_bool)) { - sv_float z = sv_float_dup (zero); - sv_float prev = z; + volatile sv_bool vsel = svptrue_b32 (); + sv_bool sel = vsel; + sv_float prev = sv_float_dup (0); for (int i = 0; i < N; i += sv_float_len ()) - prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ()); + prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ()); } #endif @@ -458,10 +404,10 @@ bench1 (const struct fun *f, int type, double lo, double hi) const char *s = type == 't' ? "rthruput" : "latency"; int vlen = 1; - if (f->vec && f->prec == 'd') - vlen = v_double_len(); - else if (f->vec && f->prec == 'f') - vlen = v_float_len(); + if (f->vec == 'n') + vlen = f->prec == 'd' ? v_double_len() : v_float_len(); + else if (f->vec == 's') + vlen = f->prec == 'd' ? 
sv_double_len() : sv_float_len(); if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); @@ -471,14 +417,6 @@ bench1 (const struct fun *f, int type, double lo, double hi) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); - else if (f->prec == 'd' && type == 't' && f->vec == 'v') - TIMEIT (run_v_thruput, f->fun.vd); - else if (f->prec == 'd' && type == 'l' && f->vec == 'v') - TIMEIT (run_v_latency, f->fun.vd); - else if (f->prec == 'f' && type == 't' && f->vec == 'v') - TIMEIT (runf_v_thruput, f->fun.vf); - else if (f->prec == 'f' && type == 'l' && f->vec == 'v') - TIMEIT (runf_v_latency, f->fun.vf); #ifdef __vpcs else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); @@ -503,16 +441,18 @@ bench1 (const struct fun *f, int type, double lo, double hi) if (type == 't') { ns100 = (100 * dt + itercount * N / 2) / (itercount * N); - printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } else if (type == 'l') { ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); - printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } fflush (stdout); } diff --git a/contrib/arm-optimized-routines/math/test/mathbench_funcs.h b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h index ad6dd2a2313d..84c4e68650ac 100644 --- a/contrib/arm-optimized-routines/math/test/mathbench_funcs.h +++ b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h @@ -1,11 +1,13 @@ /* * Function entries for mathbench. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +/* clang-format off */ D (exp, -9.9, 9.9) D (exp, 0.5, 1.0) +D (exp10, -9.9, 9.9) D (exp2, -9.9, 9.9) D (log, 0.01, 11.1) D (log, 0.999, 1.001) @@ -42,59 +44,19 @@ F (cosf, 3.3, 33.3) F (cosf, 100, 1000) F (cosf, 1e6, 1e32) F (erff, -4.0, 4.0) -#if WANT_VMATH -D (__s_sin, -3.1, 3.1) -D (__s_cos, -3.1, 3.1) -D (__s_exp, -9.9, 9.9) -D (__s_log, 0.01, 11.1) -{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, -F (__s_expf, -9.9, 9.9) -F (__s_expf_1u, -9.9, 9.9) -F (__s_exp2f, -9.9, 9.9) -F (__s_exp2f_1u, -9.9, 9.9) -F (__s_logf, 0.01, 11.1) -{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, -F (__s_sinf, -3.1, 3.1) -F (__s_cosf, -3.1, 3.1) -#if __aarch64__ -VD (__v_sin, -3.1, 3.1) -VD (__v_cos, -3.1, 3.1) -VD (__v_exp, -9.9, 9.9) -VD (__v_log, 0.01, 11.1) -{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, -VF (__v_expf, -9.9, 9.9) -VF (__v_expf_1u, -9.9, 9.9) -VF (__v_exp2f, -9.9, 9.9) -VF (__v_exp2f_1u, -9.9, 9.9) -VF (__v_logf, 0.01, 11.1) -{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, -VF (__v_sinf, -3.1, 3.1) -VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs -VND (__vn_exp, -9.9, 9.9) VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (__vn_log, 0.01, 11.1) VND (_ZGVnN2v_log, 0.01, 11.1) -{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, {"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (__vn_sin, -3.1, 3.1) VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (__vn_cos, -3.1, 3.1) VND (_ZGVnN2v_cos, -3.1, 3.1) -VNF (__vn_expf, -9.9, 9.9) VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (__vn_expf_1u, -9.9, 9.9) -VNF (__vn_exp2f, -9.9, 9.9) +VNF (_ZGVnN4v_expf_1u, -9.9, 9.9) VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (__vn_exp2f_1u, -9.9, 9.9) -VNF (__vn_logf, 0.01, 11.1) +VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9) VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, {"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (__vn_sinf, -3.1, 3.1) VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (__vn_cosf, -3.1, 3.1) VNF (_ZGVnN4v_cosf, -3.1, 3.1) #endif -#endif -#endif + /* clang-format on */ diff --git a/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h index 8311f0f4e173..062b9db56de5 100644 --- a/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h +++ b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h @@ -1,18 +1,11 @@ /* * Function wrappers for mathbench. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#if WANT_VMATH -#if __aarch64__ #ifdef __vpcs -__vpcs static v_float -xy__vn_powf (v_float x) -{ - return __vn_powf (x, x); -} __vpcs static v_float xy_Z_powf (v_float x) @@ -20,44 +13,13 @@ xy_Z_powf (v_float x) return _ZGVnN4vv_powf (x, x); } -__vpcs static v_double -xy__vn_pow (v_double x) -{ - return __vn_pow (x, x); -} - __vpcs static v_double xy_Z_pow (v_double x) { return _ZGVnN2vv_pow (x, x); } -#endif // __vpcs -static v_float -xy__v_powf (v_float x) -{ - return __v_powf (x, x); -} - -static v_double -xy__v_pow (v_double x) -{ - return __v_pow (x, x); -} -#endif // __aarch64__ - -static float -xy__s_powf (float x) -{ - return __s_powf (x, x); -} - -static double -xy__s_pow (double x) -{ - return __s_pow (x, x); -} -#endif // WANT_VMATH +#endif static double xypow (double x) diff --git a/contrib/arm-optimized-routines/math/test/mathtest.c b/contrib/arm-optimized-routines/math/test/mathtest.c index 3168da43b01d..834233fdde9d 100644 --- a/contrib/arm-optimized-routines/math/test/mathtest.c +++ b/contrib/arm-optimized-routines/math/test/mathtest.c @@ -1,7 +1,7 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2022, Arm Limited. + * Copyright (c) 1998-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -254,6 +254,7 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4), TFUNC(at_s,rt_s, expm1f, ULPUNIT), + TFUNC(at_d,rt_d, exp10, ULPUNIT), /* power */ TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4), @@ -1021,6 +1022,7 @@ int runtest(testdetail t) { DO_DOP(d_arg1,op1r); DO_DOP(d_arg2,op2r); s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0]; + s_res.i = 0; /* * Detect NaNs, infinities and denormals on input, and set a @@ -1155,22 +1157,25 @@ int runtest(testdetail t) { tresultr[0] = t.resultr[0]; tresultr[1] = t.resultr[1]; resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd]; + resulti[0] = resulti[1] = 0; wres = 2; break; case rt_i: tresultr[0] = t.resultr[0]; resultr[0] = intres; + resulti[0] = 0; wres = 1; break; case rt_s: case rt_s2: tresultr[0] = t.resultr[0]; resultr[0] = s_res.i; + resulti[0] = 0; wres = 1; break; default: puts("unhandled rettype in runtest"); - wres = 0; + abort (); } if(t.resultc != rc_none) { int err = 0; diff --git a/contrib/arm-optimized-routines/math/test/runulp.sh b/contrib/arm-optimized-routines/math/test/runulp.sh index b4000f6ea01b..e2e03e3ae761 100755 --- a/contrib/arm-optimized-routines/math/test/runulp.sh +++ b/contrib/arm-optimized-routines/math/test/runulp.sh @@ -2,7 +2,7 @@ # ULP error check script. # -# Copyright (c) 2019-2022, Arm Limited. +# Copyright (c) 2019-2023, Arm Limited. 
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x @@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 +L=0.02 +t exp10 0 0x1p-47 5000 +t exp10 -0 -0x1p-47 5000 +t exp10 0x1p-47 1 50000 +t exp10 -0x1p-47 -1 50000 +t exp10 1 0x1.34413509f79ffp8 50000 +t exp10 -1 -0x1.434e6420f4374p8 50000 +t exp10 0x1.34413509f79ffp8 inf 5000 +t exp10 -0x1.434e6420f4374p8 -inf 5000 + L=1.0 Ldir=0.9 t erf 0 0xffff000000000000 10000 @@ -143,15 +153,10 @@ Ldir=0.5 done # vector functions + Ldir=0.5 r='n' flags="${ULPFLAGS:--q}" -runs= -check __s_exp 1 && runs=1 -runv= -check __v_exp 1 && runv=1 -runvn= -check __vn_exp 1 && runvn=1 range_exp=' 0 0xffff000000000000 10000 @@ -177,9 +182,10 @@ range_pow=' ' range_sin=' - 0 0xffff000000000000 10000 - 0x1p-4 0x1p4 400000 - -0x1p-23 0x1p23 400000 + 0 0x1p23 500000 + -0 -0x1p23 500000 + 0x1p23 inf 10000 + -0x1p23 -inf 10000 ' range_cos="$range_sin" @@ -199,9 +205,10 @@ range_logf=' ' range_sinf=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 300000 --0x1p-9 -0x1p9 300000 + 0 0x1p20 500000 + -0 -0x1p20 500000 + 0x1p20 inf 10000 + -0x1p20 -inf 10000 ' range_cosf="$range_sinf" @@ -229,9 +236,8 @@ L_sinf=1.4 L_cosf=1.4 L_powf=2.1 -while read G F R D +while read G F D do - [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" eval L="\${L_$G}" @@ -251,71 +257,23 @@ do t $D $disable_fenv $F $X done << EOF $range + EOF done << EOF # group symbol run -exp __s_exp $runs -exp __v_exp $runv -exp __vn_exp $runvn -exp _ZGVnN2v_exp $runvn - -log __s_log $runs -log __v_log $runv -log __vn_log $runvn -log _ZGVnN2v_log $runvn - -pow __s_pow $runs -f -pow __v_pow $runv -f -pow __vn_pow $runvn -f -pow _ZGVnN2vv_pow $runvn -f - -sin __s_sin $runs -sin __v_sin $runv -sin __vn_sin $runvn -sin _ZGVnN2v_sin $runvn - -cos __s_cos $runs -cos __v_cos $runv -cos __vn_cos $runvn -cos _ZGVnN2v_cos $runvn - -expf __s_expf $runs -expf __v_expf $runv -expf __vn_expf $runvn -expf _ZGVnN4v_expf $runvn - -expf_1u __s_expf_1u $runs -f -expf_1u __v_expf_1u $runv -f -expf_1u __vn_expf_1u $runvn -f - -exp2f __s_exp2f $runs -exp2f __v_exp2f $runv -exp2f __vn_exp2f $runvn -exp2f _ZGVnN4v_exp2f $runvn - -exp2f_1u __s_exp2f_1u $runs -f -exp2f_1u __v_exp2f_1u $runv -f -exp2f_1u __vn_exp2f_1u $runvn -f - -logf __s_logf $runs -logf __v_logf $runv -logf __vn_logf $runvn -logf _ZGVnN4v_logf $runvn - -sinf __s_sinf $runs -sinf __v_sinf $runv -sinf __vn_sinf $runvn -sinf _ZGVnN4v_sinf $runvn - -cosf __s_cosf $runs -cosf __v_cosf $runv -cosf __vn_cosf $runvn -cosf _ZGVnN4v_cosf $runvn - -powf __s_powf $runs -f -powf __v_powf $runv -f -powf __vn_powf $runvn -f -powf _ZGVnN4vv_powf $runvn -f +exp _ZGVnN2v_exp +log _ZGVnN2v_log +pow _ZGVnN2vv_pow -f +sin _ZGVnN2v_sin -z +cos _ZGVnN2v_cos +expf _ZGVnN4v_expf +expf_1u _ZGVnN4v_expf_1u -f +exp2f _ZGVnN4v_exp2f +exp2f_1u _ZGVnN4v_exp2f_1u -f +logf _ZGVnN4v_logf +sinf _ZGVnN4v_sinf -z +cosf _ZGVnN4v_cosf +powf _ZGVnN4vv_powf -f EOF [ 0 -eq $FAIL ] || { diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst new file mode 100644 index 000000000000..2cf4273bd1d7 --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst @@ -0,0 +1,15 @@ +; Directed test cases for exp10 +; +; Copyright (c) 2023, Arm Limited. 
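+; Each line gives the input and expected result as the two hex words
+; of an IEEE double, plus the expected errno and fenv status flags
+; (apparently i/x/o/u for invalid/inexact/overflow/underflow); the
+; NaN cases at the top check quiet and signalling NaN propagation.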
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0 +func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux +func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/ulp.c b/contrib/arm-optimized-routines/math/test/ulp.c index bb8c3ad69900..5ff29972e50e 100644 --- a/contrib/arm-optimized-routines/math/test/ulp.c +++ b/contrib/arm-optimized-routines/math/test/ulp.c @@ -1,10 +1,11 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#define _GNU_SOURCE #include #include #include @@ -23,11 +24,6 @@ # include #endif -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - static inline uint64_t asuint64 (double f) { @@ -212,6 +208,7 @@ struct conf unsigned long long n; double softlim; double errlim; + int ignore_zero_sign; }; /* A bit of a hack: call vector functions twice with the same @@ -220,7 +217,7 @@ struct conf static int secondcall; /* Wrappers for vector functions. */ -#if __aarch64__ && WANT_VMATH +#ifdef __vpcs typedef __f32x4_t v_float; typedef __f64x2_t v_double; /* First element of fv and dv may be changed by -c argument. */ @@ -264,40 +261,8 @@ static inline double svretd(sv_double vec) { #endif #endif -#if WANT_SVE_MATH -long double -dummyl (long double x) -{ - return x; -} - -double -dummy (double x) -{ - return x; -} - -static sv_double -__sv_dummy (sv_double x) -{ - return x; -} - -static sv_float -__sv_dummyf (sv_float x) -{ - return x; -} -#endif - #include "test/ulp_wrappers.h" -/* Wrappers for SVE functions. */ -#if WANT_SVE_MATH -static double sv_dummy (double x) { return svretd (__sv_dummy (svargd (x))); } -static float sv_dummyf (float x) { return svretf (__sv_dummyf (svargf (x))); } -#endif - struct fun { const char *name; @@ -358,10 +323,6 @@ static const struct fun fun[] = { #define ZVNF2(x) VNF2 (x) ZVF2 (x) #define ZVND1(x) VND1 (x) ZVD1 (x) #define ZVND2(x) VND2 (x) ZVD2 (x) -#define SF1(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 1, 1, f1, 0) -#define SF2(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 2, 1, f2, 0) -#define SD1(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 1, 0, d1, 0) -#define SD2(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 2, 0, d2, 0) /* SVE routines. 
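+   Judging by the deleted scalar macros above, the F(...) fields are:
+   symbol name, wrapper to call, long double reference, MPFR
+   reference, argument count, single-precision flag, case tag, and a
+   vector flag.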
*/ #define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0) #define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0) @@ -374,11 +335,6 @@ static const struct fun fun[] = { #include "test/ulp_funcs.h" -#if WANT_SVE_MATH - SVD1 (dummy) - SVF1 (dummy) -#endif - #undef F #undef F1 #undef F2 @@ -628,17 +584,18 @@ call_mpfr_d2 (mpfr_t y, const struct fun *f, struct args_d2 a, mpfr_rnd_t r) static void usage (void) { - puts ("./ulp [-q] [-m] [-f] [-r nudz] [-l soft-ulplimit] [-e ulplimit] func " + puts ("./ulp [-q] [-m] [-f] [-r {n|u|d|z}] [-l soft-ulplimit] [-e ulplimit] func " "lo [hi [x lo2 hi2] [count]]"); puts ("Compares func against a higher precision implementation in [lo; hi]."); puts ("-q: quiet."); puts ("-m: use mpfr even if faster method is available."); - puts ("-f: disable fenv testing (rounding modes and exceptions)."); -#if __aarch64__ && WANT_VMATH + puts ("-f: disable fenv exceptions testing."); +#ifdef ___vpcs puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n" " This should be different from tested input in other lanes, and non-special \n" " (i.e. should not trigger fenv exceptions). Default is 1."); #endif + puts ("-z: ignore sign of 0."); puts ("Supported func:"); for (const struct fun *f = fun; f->name; f++) printf ("\t%s\n", f->name); @@ -762,6 +719,7 @@ main (int argc, char *argv[]) conf.fenv = 1; conf.softlim = 0; conf.errlim = INFINITY; + conf.ignore_zero_sign = 0; for (;;) { argc--; @@ -801,12 +759,15 @@ main (int argc, char *argv[]) { argc--; argv++; - if (argc < 1) + if (argc < 1 || argv[0][1] != '\0') usage (); conf.rc = argv[0][0]; } break; -#if __aarch64__ && WANT_VMATH + case 'z': + conf.ignore_zero_sign = 1; + break; +#ifdef __vpcs case 'c': argc--; argv++; @@ -839,7 +800,19 @@ main (int argc, char *argv[]) if (strcmp (argv[0], f->name) == 0) break; if (!f->name) - usage (); + { +#ifndef __vpcs + /* Ignore vector math functions if vector math is not supported. */ + if (strncmp (argv[0], "_ZGVnN", 6) == 0) + exit (0); +#endif +#if !WANT_SVE_MATH + if (strncmp (argv[0], "_ZGVsMxv", 8) == 0) + exit (0); +#endif + printf ("math function %s not supported\n", argv[0]); + exit (1); + } if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG) conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */ if (!USE_MPFR && conf.mpfr) diff --git a/contrib/arm-optimized-routines/math/test/ulp.h b/contrib/arm-optimized-routines/math/test/ulp.h index 327b4bd0fd06..b0bc59aeef8d 100644 --- a/contrib/arm-optimized-routines/math/test/ulp.h +++ b/contrib/arm-optimized-routines/math/test/ulp.h @@ -1,7 +1,7 @@ /* * Generic functions for ULP error estimation. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -37,7 +37,8 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t) /* Difference between exact result and closest real number that gets rounded to got, i.e. error before rounding, for a correctly rounded result the difference is 0. */ -static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) +static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, + int ignore_zero_sign) { RT(float) want = p->y; RT(float) d; @@ -45,10 +46,18 @@ static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) if (RT(asuint) (got) == RT(asuint) (want)) return 0.0; + if (isnan (got) && isnan (want)) + /* Ignore sign of NaN. */ + return RT (issignaling) (got) == RT (issignaling) (want) ? 
0 : INFINITY; if (signbit (got) != signbit (want)) - /* May have false positives with NaN. */ - //return isnan(got) && isnan(want) ? 0 : INFINITY; - return INFINITY; + { + /* Fall through to ULP calculation if ignoring sign of zero and at + exactly one of want and got is non-zero. */ + if (ignore_zero_sign && want == got) + return 0.0; + if (!ignore_zero_sign || (want != 0 && got != 0)) + return INFINITY; + } if (!isfinite (want) || !isfinite (got)) { if (isnan (got) != isnan (want)) @@ -114,8 +123,12 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r, static inline void T(call_nofenv) (const struct fun *f, struct T(args) a, int r, RT(float) * y, int *ex) { + if (r != FE_TONEAREST) + fesetround (r); *y = T(call) (f, a); *ex = 0; + if (r != FE_TONEAREST) + fesetround (FE_TONEAREST); } static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a, @@ -155,8 +168,12 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a, int r, struct RT(ret) * p, RT(float) ygot, int exgot) { + if (r != FE_TONEAREST) + fesetround (r); RT(double) yl = T(call_long) (f, a); p->y = (RT(float)) yl; + if (r != FE_TONEAREST) + fesetround (FE_TONEAREST); if (RT(isok_nofenv) (ygot, p->y)) return 1; p->ulpexp = RT(ulpscale) (p->y); @@ -288,7 +305,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen, if (!ok) { int print = 0; - double err = RT(ulperr) (ygot, &want, r); + double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign); double abserr = fabs (err); // TODO: count errors below accuracy limit. if (abserr > 0) diff --git a/contrib/arm-optimized-routines/math/test/ulp_funcs.h b/contrib/arm-optimized-routines/math/test/ulp_funcs.h index f5cea4d6d14c..84f7927d3935 100644 --- a/contrib/arm-optimized-routines/math/test/ulp_funcs.h +++ b/contrib/arm-optimized-routines/math/test/ulp_funcs.h @@ -1,9 +1,10 @@ /* * Function entries for ulp. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +/* clang-format off */ F1 (sin) F1 (cos) F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) @@ -15,56 +16,18 @@ F2 (pow) F1 (erf) D1 (exp) + D1 (exp10) D1 (exp2) D1 (log) D1 (log2) D2 (pow) D1 (erf) -#if WANT_VMATH - F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0) - F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0) - F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0) - F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0) - F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0) - F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0) - F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0) - F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0) - F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0) - F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0) - F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) - F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0) - F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0) -#if __aarch64__ - F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1) - F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1) - F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1) #ifdef __vpcs - F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1) - F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1) - F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1) F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) @@ -74,5 +37,4 @@ F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) #endif -#endif -#endif +/* clang-format on */ diff --git a/contrib/arm-optimized-routines/math/test/ulp_wrappers.h b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h index fd9e00c0310f..60dc3d6dd652 100644 --- a/contrib/arm-optimized-routines/math/test/ulp_wrappers.h +++ b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h @@ -1,10 +1,12 @@ /* * Function wrappers for ulp. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +/* clang-format off */ + /* Wrappers for sincos. */ static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} @@ -16,37 +18,12 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, #endif /* Wrappers for vector functions. */ -#if __aarch64__ && WANT_VMATH -static float v_sinf(float x) { return __v_sinf(argf(x))[0]; } -static float v_cosf(float x) { return __v_cosf(argf(x))[0]; } -static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } -static float v_expf(float x) { return __v_expf(argf(x))[0]; } -static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; } -static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; } -static float v_logf(float x) { return __v_logf(argf(x))[0]; } -static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; } -static double v_sin(double x) { return __v_sin(argd(x))[0]; } -static double v_cos(double x) { return __v_cos(argd(x))[0]; } -static double v_exp(double x) { return __v_exp(argd(x))[0]; } -static double v_log(double x) { return __v_log(argd(x))[0]; } -static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } #ifdef __vpcs -static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } -static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } -static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } -static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } -static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; } -static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; } -static float vn_logf(float x) { return __vn_logf(argf(x))[0]; } -static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; } -static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } -static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } -static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } -static double vn_log(double x) { return __vn_log(argd(x))[0]; } -static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } +static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; } static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } +static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; } static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } @@ -56,4 +33,5 @@ static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } #endif -#endif + +/* clang-format on */ diff --git a/contrib/arm-optimized-routines/math/tgamma128.c b/contrib/arm-optimized-routines/math/tgamma128.c new file mode 100644 index 000000000000..65deacc49d99 --- /dev/null +++ b/contrib/arm-optimized-routines/math/tgamma128.c @@ -0,0 +1,356 @@ +/* + * Implementation of the true gamma function (as opposed to lgamma) + * for 128-bit long double. + * + * Copyright (c) 2006-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* + * This module implements the float128 gamma function under the name + * tgamma128. It's expected to be suitable for integration into system + * maths libraries under the standard name tgammal, if long double is + * 128-bit. Such a library will probably want to check the error + * handling and optimize the initial process of extracting the + * exponent, which is done here by simple and portable (but + * potentially slower) methods. + */ + +#include +#include +#include +#include + +/* Only binary128 format is supported. */ +#if LDBL_MANT_DIG == 113 + +#include "tgamma128.h" + +#define lenof(x) (sizeof(x)/sizeof(*(x))) + +/* + * Helper routine to evaluate a polynomial via Horner's rule + */ +static long double poly(const long double *coeffs, size_t n, long double x) +{ + long double result = coeffs[--n]; + + while (n > 0) + result = (result * x) + coeffs[--n]; + + return result; +} + +/* + * Compute sin(pi*x) / pi, for use in the reflection formula that + * relates gamma(-x) and gamma(x). + */ +static long double sin_pi_x_over_pi(long double x) +{ + int quo; + long double fracpart = remquol(x, 0.5L, &quo); + + long double sign = 1.0L; + if (quo & 2) + sign = -sign; + quo &= 1; + + if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) { + /* For numbers this size, sin(pi*x) is so close to pi*x that + * sin(pi*x)/pi is indistinguishable from x in float128 */ + return sign * fracpart; + } + + if (quo == 0) { + return sign * sinl(pi*fracpart) / pi; + } else { + return sign * cosl(pi*fracpart) / pi; + } +} + +/* Return tgamma(x) on the assumption that x >= 8. */ +static long double tgamma_large(long double x, + bool negative, long double negadjust) +{ + /* + * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K, + * where K is a correction factor computed as a polynomial in 1/x. + * + * (Vaguely inspired by the form of the Lanczos approximation, but + * I tried the Lanczos approximation itself and it suffers badly + * from big cancellation leading to loss of significance.) + */ + long double t = 1/x; + long double p = poly(coeffs_large, lenof(coeffs_large), t); + + /* + * To avoid overflow in cases where x^(x-0.5) does overflow + * but gamma(x) does not, we split x^(x-0.5) in half and + * multiply back up _after_ multiplying the shrinking factor + * of exp(-(x-0.5)). + * + * Note that computing x-0.5 and (x-0.5)/2 is exact for the + * relevant range of x, so the only sources of error are pow + * and exp themselves, plus the multiplications. + */ + long double powhalf = powl(x, (x-0.5L)/2.0L); + long double expret = expl(-(x-0.5L)); + + if (!negative) { + return (expret * powhalf) * powhalf * p; + } else { + /* + * Apply the reflection formula as commented below, but + * carefully: negadjust has magnitude less than 1, so it can + * turn a case where gamma(+x) would overflow into a case + * where gamma(-x) doesn't underflow. Not only that, but the + * FP format has greater range in the tiny domain due to + * denormals. For both reasons, it's not good enough to + * compute the positive result and then adjust it. + */ + long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p); + return ret / powhalf; + } +} + +/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */ +static long double tgamma_tiny(long double x, + bool negative, long double negadjust) +{ + /* + * For x near zero, we use a polynomial approximation to + * g = 1/(x*gamma(x)), and then return 1/(g*x). 
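+ * Since x*gamma(x) = gamma(1+x), g is simply 1/gamma(1+x), which is
+ * analytic at 0 with g(0) = 1 and g'(0) equal to the Euler-Mascheroni
+ * constant 0.57721..., so a plain polynomial in x fits it well. For
+ * negative inputs, the overall answer 1/(gamma(x)*x*negadjust)
+ * derived below in tgamma128 collapses to g/negadjust.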
+     */
+    long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x);
+    if (!negative)
+        return 1.0L / (g*x);
+    else
+        return g / negadjust;
+}
+
+/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */
+static long double tgamma_ultratiny(long double x, bool negative,
+                                    long double negadjust)
+{
+    /* On this interval, gamma can't even be distinguished from 1/x,
+     * so we skip the polynomial evaluation in tgamma_tiny, partly to
+     * save time and partly to avoid the tiny intermediate values
+     * setting the underflow exception flag. */
+    if (!negative)
+        return 1.0L / x;
+    else
+        return 1.0L / negadjust;
+}
+
+/* Return tgamma(x) on the assumption that 1 <= x <= 2. */
+static long double tgamma_central(long double x)
+{
+    /*
+     * In this central interval, our strategy is to find the
+     * difference between x and the point where gamma has a minimum,
+     * and approximate based on that.
+     */
+
+    /* The difference between the input x and the minimum x. The first
+     * subtraction is expected to be exact, since x and min_hi have
+     * the same exponent (unless x=2, in which case it will still be
+     * exact). */
+    long double t = (x - min_x_hi) - min_x_lo;
+
+    /*
+     * Now use two different polynomials for the intervals [1,m] and
+     * [m,2].
+     */
+    long double p;
+    if (t < 0)
+        p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t);
+    else
+        p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t);
+
+    return (min_y_lo + p * (t*t)) + min_y_hi;
+}
+
+long double tgamma128(long double x)
+{
+    /*
+     * Start by extracting the number's sign and exponent, and ruling
+     * out cases of non-normalized numbers.
+     *
+     * For an implementation integrated into a system libm, it would
+     * almost certainly be quicker to do this by direct bitwise access
+     * to the input float128 value, using whatever is the local idiom
+     * for knowing its endianness.
+     *
+     * Integration into a system libc may also need to worry about
+     * setting errno, if that's the locally preferred way to report
+     * math.h errors.
+     */
+    int sign = signbit(x);
+    int exponent;
+    switch (fpclassify(x)) {
+      case FP_NAN:
+        return x+x; /* propagate QNaN, make SNaN throw an exception */
+      case FP_ZERO:
+        return 1/x; /* divide by zero on purpose to indicate a pole */
+      case FP_INFINITE:
+        if (sign) {
+            return x-x; /* gamma(-inf) has indeterminate sign, so provoke an
+                         * IEEE invalid operation exception to indicate that */
+        }
+        return x; /* but gamma(+inf) is just +inf with no error */
+      case FP_SUBNORMAL:
+        exponent = -16384;
+        break;
+      default:
+        frexpl(x, &exponent);
+        exponent--;
+        break;
+    }
+
+    bool negative = false;
+    long double negadjust = 0.0L;
+
+    if (sign) {
+        /*
+         * Euler's reflection formula is
+         *
+         *    gamma(1-x) gamma(x) = pi/sin(pi*x)
+         *
+         *                            pi
+         *    => gamma(x) = --------------------
+         *                  gamma(1-x) sin(pi*x)
+         *
+         * But computing 1-x is going to lose a lot of accuracy when x
+         * is very small, so instead we transform using the recurrence
+         * gamma(t+1)=t gamma(t). Setting t=-x, this gives us
+         * gamma(1-x) = -x gamma(-x), so we now have
+         *
+         *                             pi
+         *    gamma(x) = ----------------------
+         *               -x gamma(-x) sin(pi*x)
+         *
+         * which relates gamma(x) to gamma(-x), which is much nicer,
+         * since x can be turned into -x without rounding.
+         */
+        negadjust = sin_pi_x_over_pi(x);
+        negative = true;
+        x = -x;
+
+        /*
+         * Now the ultimate answer we want is
+         *
+         *    1 / (gamma(x) * x * negadjust)
+         *
+         * where x is the positive value we've just turned it into.
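+         *
+         * (A quick worked example: for input -2.5, negadjust =
+         * sin(-2.5*pi)/pi = -1/pi and x becomes 2.5; gamma(2.5) ~=
+         * 1.3293, so 1/(1.3293 * 2.5 * (-1/pi)) ~= -0.9453, which is
+         * indeed gamma(-2.5).)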
+ * + * For some of the cases below, we'll compute gamma(x) + * normally and then compute this adjusted value afterwards. + * But for others, we can implement the reciprocal operation + * in this formula by _avoiding_ an inversion that the + * sub-case was going to do anyway. + */ + + if (negadjust == 0) { + /* + * Special case for negative integers. Applying the + * reflection formula would cause division by zero, but + * standards would prefer we treat this error case as an + * invalid operation and return NaN instead. (Possibly + * because otherwise you'd have to decide which sign of + * infinity to return, and unlike the x=0 case, there's no + * sign of zero available to disambiguate.) + */ + return negadjust / negadjust; + } + } + + /* + * Split the positive domain into various cases. For cases where + * we do the negative-number adjustment the usual way, we'll leave + * the answer in 'g' and drop out of the if statement. + */ + long double g; + + if (exponent >= 11) { + /* + * gamma of any positive value this large overflows, and gamma + * of any negative value underflows. + */ + if (!negative) { + long double huge = 0x1p+12288L; + return huge * huge; /* provoke an overflow */ + } else { + long double tiny = 0x1p-12288L; + return tiny * tiny * negadjust; /* underflow, of the right sign */ + } + } else if (exponent >= 3) { + /* Negative-number adjustment happens inside here */ + return tgamma_large(x, negative, negadjust); + } else if (exponent < -113) { + /* Negative-number adjustment happens inside here */ + return tgamma_ultratiny(x, negative, negadjust); + } else if (exponent < -5) { + /* Negative-number adjustment happens inside here */ + return tgamma_tiny(x, negative, negadjust); + } else if (exponent == 0) { + g = tgamma_central(x); + } else if (exponent < 0) { + /* + * For x in [1/32,1) we range-reduce upwards to the interval + * [1,2), using the inverse of the normal recurrence formula: + * gamma(x) = gamma(x+1)/x. + */ + g = tgamma_central(1+x) / x; + } else { + /* + * For x in [2,8) we range-reduce downwards to the interval + * [1,2) by repeated application of the recurrence formula. + * + * Actually multiplying (x-1) by (x-2) by (x-3) and so on + * would introduce multiple ULPs of rounding error. We can get + * better accuracy by writing x = (k+1/2) + t, where k is an + * integer and |t|<1/2, and expanding out the obvious factor + * (x-1)(x-2)...(x-k+1) as a polynomial in t. + */ + long double mult; + int i = x; + if (i == 2) { /* x in [2,3) */ + mult = (x-1); + } else { + long double t = x - (i + 0.5L); + switch (i) { + /* E.g. 
for x=3.5+t, we want + * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */ + case 3: + mult = 3.75L+t*(4.0L+t); + break; + case 4: + mult = 13.125L+t*(17.75L+t*(7.5L+t)); + break; + case 5: + mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t))); + break; + case 6: + mult = 324.84375L+t*(570.5625L+t*(376.250L+t*( + 117.5L+t*(17.5L+t)))); + break; + case 7: + mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*( + 1140.0L+t*(231.25L+t*(24.0L+t))))); + break; + } + } + + g = tgamma_central(x - (i-1)) * mult; + } + + if (!negative) { + /* Positive domain: return g unmodified */ + return g; + } else { + /* Negative domain: apply the reflection formula as commented above */ + return 1.0L / (g * x * negadjust); + } +} + +#endif diff --git a/contrib/arm-optimized-routines/math/tgamma128.h b/contrib/arm-optimized-routines/math/tgamma128.h new file mode 100644 index 000000000000..90875a22dce4 --- /dev/null +++ b/contrib/arm-optimized-routines/math/tgamma128.h @@ -0,0 +1,141 @@ +/* + * Polynomial coefficients and other constants for tgamma128.c. + * + * Copyright (c) 2006-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* The largest positive value for which 128-bit tgamma does not overflow. */ +static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L; + +/* Coefficients of the polynomial used in the tgamma_large() subroutine */ +static const long double coeffs_large[] = { + 0x1.8535745aa79569579b9eec0f3bbcp+0L, + 0x1.0378f83c6fb8f0e51269f2b4a973p-3L, + 0x1.59f6a05094f69686c3380f4e2783p-8L, + -0x1.0b291dee952a82764a4859b081a6p-8L, + -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L, + 0x1.387a8b5f38dd77e7f139b1021e86p-10L, + 0x1.bca46637f65b13750c728cc29e40p-14L, + -0x1.d80401c00aef998c9e303151a51cp-11L, + -0x1.49cb6bb09f935a2053ccc2cf3711p-14L, + 0x1.4e950204437dcaf2be77f73a6f45p-10L, + 0x1.cb711a2d65f188bf60110934d6bep-14L, + -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L, + -0x1.0305ab9760cddb0d833e73766836p-12L, + 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L, + 0x1.bb4144740ad9290123fdcea684aap-11L, + -0x1.72ab4e88272a229bfafd192450f0p-5L, + 0x1.80c70ac6eb3b7a698983d25a62b8p-12L, + 0x1.e222791c6743ce3e3cae220fb236p-3L, + 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L, + -0x1.9d204fa235a42cd901b123d2ad47p+1L, + 0x1.55b56d1158f77ddb1c95fc44ab02p+0L, + 0x1.37f900a11dbd892abd7dde533e2dp+5L, + -0x1.2da49f4188dd89cb958369ef2401p+7L, + 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L, + -0x1.61433cebe649098c9611c4c7774ap+7L, +}; + +/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ +static const long double coeffs_tiny[] = { + 0x1.0000000000000000000000000000p+0L, + 0x1.2788cfc6fb618f49a37c7f0201fep-1L, + -0x1.4fcf4026afa2dceb8490ade22796p-1L, + -0x1.5815e8fa27047c8f42b5d9217244p-5L, + 0x1.5512320b43fbe5dfa771333518f7p-3L, + -0x1.59af103c340927bffdd44f954bfcp-5L, + -0x1.3b4af28483e210479657e5543366p-7L, + 0x1.d919c527f6070bfce9b29c2ace9cp-8L, + -0x1.317112ce35337def3556a18aa178p-10L, + -0x1.c364fe77a6f27677b985b1fa2e1dp-13L, + 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L, + -0x1.51cf9f090b5dc398ba86305e3634p-16L, + -0x1.4e80f64c04a339740de06ca9fa4ap-20L, + 0x1.241ddc2aef2ec20e58b08f2fda17p-20L, +}; + +/* The location within the interval [1,2] where gamma has a minimum. + * Specified as the sum of two 128-bit values, for extra precision. */ +static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L; +static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L; + +/* The actual minimum value that gamma takes at that location. 
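+ * (Numerically, the minimum lies at x ~= 1.4616321449683623, where
+ * gamma takes the value ~= 0.8856031944108887, matching the hex
+ * values below.)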
+ * Again specified as the sum of two 128-bit values. */ +static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L; +static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +static const long double coeffs_central_neg[] = { + 0x1.b6c53f7377b83839c8a292e43b69p-2L, + 0x1.0bae9f40c7d09ed76e732045850ap-3L, + 0x1.4981175e14d04c3530e51d01c5fep-3L, + 0x1.79f77aaf032c948af3a9edbd2061p-4L, + 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L, + 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L, + 0x1.0b44c2f92982f887b55ec36dfdb0p-5L, + 0x1.6df1de1e178ef72ca7bd63d40870p-6L, + 0x1.f63f502bde27e81c0f5e13479b43p-7L, + 0x1.57fd67d901f40ea011353ad89a0ap-7L, + 0x1.d7151376eed187eb753e2273cafcp-8L, + 0x1.427162b5c6ff1d904c71ef53e37cp-8L, + 0x1.b954b8c3a56cf93e49ef6538928ap-9L, + 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L, + 0x1.9d35250d9b9378d9b59df734537ap-10L, + 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L, + 0x1.7e0db39bb99cdb52b028d9359380p-11L, + 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L, + 0x1.27521cf5fd24dcdf43524e6add11p-13L, + 0x1.06461d62243bf9a826b42349672fp-10L, + -0x1.2b852abead28209b4e0c756dc46ep-9L, + 0x1.be673c11a72c826115ec6d286c14p-8L, + -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L, + 0x1.fa362bd2dc68f41abef2d8600acdp-6L, + -0x1.a21585b2f52f8b23855de8e452edp-5L, + 0x1.1f234431ed032052fc92e64e0493p-4L, + -0x1.40d332476ca0199c60cdae3f9132p-4L, + 0x1.1d45dc665d86012eba2eea199cefp-4L, + -0x1.8491016cdd08dc9be7ade9b5fef3p-5L, + 0x1.7e7e2fbc6d49ad484300d6add324p-6L, + -0x1.e63fe3f874a37276a8d7d8b705ecp-8L, + 0x1.30a2a73944f8c84998314d69c23fp-10L, +}; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +static const long double coeffs_central_pos[] = { + 0x1.b6c53f7377b83839c8a292e22aa2p-2L, + -0x1.0bae9f40c7d09ed76e72e1c955dep-3L, + 0x1.4981175e14d04c3530ee5e1ecebcp-3L, + -0x1.79f77aaf032c948ac983d77f3e07p-4L, + 0x1.1e97bd10821095ab7dc94936cc11p-4L, + -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L, + 0x1.0b44c2f929837fafef7b5d9e80f1p-5L, + -0x1.6df1de1e175fe2a51faa25cddbb4p-6L, + 0x1.f63f502be57d11aed2cfe90843ffp-7L, + -0x1.57fd67d852f230015b9f64770273p-7L, + 0x1.d715138adc07e5fce81077070357p-8L, + -0x1.4271618e9fda8992a667adb15f4fp-8L, + 0x1.b954d15d9eb772e80fdd760672d7p-9L, + -0x1.2dfe391241d3cb79c8c15182843dp-9L, + 0x1.9d44396fcd48451c3ba924cee814p-10L, + -0x1.1ac195fb99739e341589e39803e6p-10L, + 0x1.82e46127b68f002770826e25f146p-11L, + -0x1.089dacd90d9f41493119ac178359p-11L, + 0x1.6993c007b20394a057d21f3d37f8p-12L, + -0x1.ec43a709f4446560c099dec8e31bp-13L, + 0x1.4ba36322f4074e9add9450f003cap-13L, + -0x1.b3f83a977965ca1b7937bf5b34cap-14L, + 0x1.10af346abc09cb25a6d9fe810b6ep-14L, + -0x1.38d8ea1188f242f50203edc395bdp-15L, + 0x1.39add987a948ec56f62b721a4475p-16L, + -0x1.02a4e141f286c8a967e2df9bc9adp-17L, + 0x1.433b50af22425f546e87113062d7p-19L, + -0x1.0c7b73cb0013f00aafc103e8e382p-21L, + 0x1.b852de313ec38da2297f6deaa6b4p-25L, +}; + +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L; diff --git a/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl b/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl new file mode 100644 index 000000000000..ecec174110ea --- /dev/null +++ b/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl @@ -0,0 +1,212 @@ +# -*- julia -*- +# +# Generate tgamma128.h, containing 
polynomials and constants used by
+# tgamma128.c.
+#
+# Copyright (c) 2006-2023, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+# This Julia program depends on the 'Remez' and 'SpecialFunctions'
+# library packages. To install them, run this at the interactive Julia
+# prompt:
+#
+#   import Pkg; Pkg.add(["Remez", "SpecialFunctions"])
+#
+# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04).
+
+import Printf
+import Remez
+import SpecialFunctions
+
+# Round a BigFloat to 128-bit long double and format it as a C99 hex
+# float literal.
+function quadhex(x)
+    sign = " "
+    if x < 0
+        sign = "-"
+        x = -x
+    end
+
+    exponent = BigInt(floor(log2(x)))
+    exponent = max(exponent, -16382)
+    @assert(exponent <= 16383) # else overflow
+
+    x /= BigFloat(2)^exponent
+    @assert(1 <= x < 2)
+    x *= BigFloat(2)^112
+    mantissa = BigInt(round(x))
+
+    mantstr = string(mantissa, base=16, pad=29)
+    return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end],
+                           exponent)
+end
+
+# Round a BigFloat to 128-bit long double and return it still as a
+# BigFloat.
+function quadval(x, round=0)
+    sign = +1
+    if x.sign < 0
+        sign = -1
+        x = -x
+    end
+
+    exponent = BigInt(floor(log2(x)))
+    exponent = max(exponent, -16382)
+    @assert(exponent <= 16383) # else overflow
+
+    x /= BigFloat(2)^exponent
+    @assert(1 <= x < 2)
+    x *= BigFloat(2)^112
+    if round < 0
+        mantissa = floor(x)
+    elseif round > 0
+        mantissa = ceil(x)
+    else
+        # Qualify with Base: the `round` parameter shadows the function.
+        mantissa = Base.round(x)
+    end
+
+    return sign * mantissa * BigFloat(2)^(exponent - 112)
+end
+
+# Output an array of BigFloats as a C array declaration.
+function dumparray(a, name)
+    println("static const long double ", name, "[] = {")
+    for x in a
+        println("    ", quadhex(x), ",")
+    end
+    println("};")
+end
+
+print("/*
+ * Polynomial coefficients and other constants for tgamma128.c.
+ *
+ * Copyright (c) 2006,2009,2023 Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+")
+
+Base.MPFR.setprecision(512)
+
+e = exp(BigFloat(1))
+
+print("
+/* The largest positive value for which 128-bit tgamma does not overflow. */
+")
+lo = BigFloat("1000")
+hi = BigFloat("2000")
+while true
+    global lo
+    global hi
+    global max_x
+
+    mid = (lo + hi) / 2
+    if mid == lo || mid == hi
+        max_x = mid
+        break
+    end
+    if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2))
+        lo = mid
+    else
+        hi = mid
+    end
+end
+max_x = quadval(max_x, -1)
+println("static const long double max_x = ", quadhex(max_x), ";")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_large() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+    x -> x==0 ? sqrt(BigFloat(2)*pi/e) :
+         exp(SpecialFunctions.logabsgamma(1/x)[1] +
+             (1/x-0.5)*(1+log(x))),
+    (0, 1/BigFloat(8)),
+    24, 0,
+    (x, y) -> 1/y
+)
+dumparray(N, "coeffs_large")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+    x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)),
+    (0, 1/BigFloat(32)),
+    13, 0,
+)
+dumparray(N, "coeffs_tiny")
+
+print("
+/* The location within the interval [1,2] where gamma has a minimum.
+ * Specified as the sum of two 128-bit values, for extra precision.
*/ +") +lo = BigFloat("1.4") +hi = BigFloat("1.5") +while true + global lo + global hi + global min_x + + mid = (lo + hi) / 2 + if mid == lo || mid == hi + min_x = mid + break + end + if SpecialFunctions.digamma(mid) < 0 + lo = mid + else + hi = mid + end +end +min_x_hi = quadval(min_x, -1) +println("static const long double min_x_hi = ", quadhex(min_x_hi), ";") +println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";") + +print(" +/* The actual minimum value that gamma takes at that location. + * Again specified as the sum of two 128-bit values. */ +") +min_y = SpecialFunctions.gamma(min_x) +min_y_hi = quadval(min_y, -1) +println("static const long double min_y_hi = ", quadhex(min_y_hi), ";") +println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";") + +function taylor_bodge(x) + # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2. + # Used in the Remez calls below for x values very near the origin, to avoid + # significance loss problems when trying to compute it directly via that + # formula (even in MPFR's extra precision). + return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506")))) +end + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) : + (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x), + (0, min_x - 1), + 31, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_neg") + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) : + (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x), + (0, 2 - min_x), + 28, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_pos") + +print(" +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +") +println("static const long double pi = ", quadhex(BigFloat(pi)), ";") diff --git a/contrib/arm-optimized-routines/math/v_cos.c b/contrib/arm-optimized-routines/math/v_cos.c deleted file mode 100644 index 4c8787e66c41..000000000000 --- a/contrib/arm-optimized-routines/math/v_cos.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Double-precision vector cos function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const double Poly[] = { -/* worst-case error is 3.5 ulp. - abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. 
*/ --0x1.9f4a9c8b21dc9p-41, - 0x1.60e88a10163f2p-33, --0x1.ae6361b7254e7p-26, - 0x1.71de382e8d62bp-19, --0x1.a01a019aeb4ffp-13, - 0x1.111111110b25ep-7, --0x1.55555555554c3p-3, -}; - -#define C7 v_f64 (Poly[0]) -#define C6 v_f64 (Poly[1]) -#define C5 v_f64 (Poly[2]) -#define C4 v_f64 (Poly[3]) -#define C3 v_f64 (Poly[4]) -#define C2 v_f64 (Poly[5]) -#define C1 v_f64 (Poly[6]) - -#define InvPi v_f64 (0x1.45f306dc9c883p-2) -#define HalfPi v_f64 (0x1.921fb54442d18p+0) -#define Pi1 v_f64 (0x1.921fb54442d18p+1) -#define Pi2 v_f64 (0x1.1a62633145c06p-53) -#define Pi3 v_f64 (0x1.c1cd129024e09p-106) -#define Shift v_f64 (0x1.8p52) -#define RangeVal v_f64 (0x1p23) -#define AbsMask v_u64 (0x7fffffffffffffff) - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (cos, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(cos) (v_f64_t x) -{ - v_f64_t n, r, r2, y; - v_u64_t odd, cmp; - - r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); - cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); - -#if WANT_SIMD_EXCEPT - if (unlikely (v_any_u64 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - specialcase later. */ - r = v_sel_f64 (cmp, v_f64 (1.0), r); -#endif - - /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = v_fma_f64 (InvPi, r + HalfPi, Shift); - odd = v_as_u64_f64 (n) << 63; - n -= Shift; - n -= v_f64 (0.5); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = v_fma_f64 (-Pi1, n, r); - r = v_fma_f64 (-Pi2, n, r); - r = v_fma_f64 (-Pi3, n, r); - - /* sin(r) poly approx. */ - r2 = r * r; - y = v_fma_f64 (C7, r2, C6); - y = v_fma_f64 (y, r2, C5); - y = v_fma_f64 (y, r2, C4); - y = v_fma_f64 (y, r2, C3); - y = v_fma_f64 (y, r2, C2); - y = v_fma_f64 (y, r2, C1); - y = v_fma_f64 (y * r2, r, r); - - /* sign. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_cosf.c b/contrib/arm-optimized-routines/math/v_cosf.c deleted file mode 100644 index bd677c3ae173..000000000000 --- a/contrib/arm-optimized-routines/math/v_cosf.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Single-precision vector cos function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 1.886 ulp error */ - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, -}; -#define Pi1 v_f32 (0x1.921fb6p+1f) -#define Pi2 v_f32 (-0x1.777a5cp-24f) -#define Pi3 v_f32 (-0x1.ee59dap-49f) -#define A3 v_f32 (Poly[3]) -#define A5 v_f32 (Poly[2]) -#define A7 v_f32 (Poly[1]) -#define A9 v_f32 (Poly[0]) -#define RangeVal v_f32 (0x1p20f) -#define InvPi v_f32 (0x1.45f306p-2f) -#define Shift v_f32 (0x1.8p+23f) -#define AbsMask v_u32 (0x7fffffff) -#define HalfPi v_f32 (0x1.921fb6p0f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. 
*/ - return v_call_f32 (cosf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(cosf) (v_f32_t x) -{ - v_f32_t n, r, r2, y; - v_u32_t odd, cmp; - - r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); - cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); - -#if WANT_SIMD_EXCEPT - if (unlikely (v_any_u32 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - specialcase later. */ - r = v_sel_f32 (cmp, v_f32 (1.0f), r); -#endif - - /* n = rint((|x|+pi/2)/pi) - 0.5 */ - n = v_fma_f32 (InvPi, r + HalfPi, Shift); - odd = v_as_u32_f32 (n) << 31; - n -= Shift; - n -= v_f32 (0.5f); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ - r = v_fma_f32 (-Pi1, n, r); - r = v_fma_f32 (-Pi2, n, r); - r = v_fma_f32 (-Pi3, n, r); - - /* y = sin(r) */ - r2 = r * r; - y = v_fma_f32 (A9, r2, A7); - y = v_fma_f32 (y, r2, A5); - y = v_fma_f32 (y, r2, A3); - y = v_fma_f32 (y * r2, r, r); - - /* sign fix */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp.c b/contrib/arm-optimized-routines/math/v_exp.c deleted file mode 100644 index da23fd1c5f46..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Double-precision vector e^x function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED -#include "v_exp.h" - -#if V_EXP_TABLE_BITS == 7 -/* maxerr: 1.88 +0.5 ulp - rel error: 1.4337*2^-53 - abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ -#define C1 v_f64 (0x1.ffffffffffd43p-2) -#define C2 v_f64 (0x1.55555c75adbb2p-3) -#define C3 v_f64 (0x1.55555da646206p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */ -#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */ -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63) -#elif V_EXP_TABLE_BITS == 8 -/* maxerr: 0.54 +0.5 ulp - rel error: 1.4318*2^-58 - abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */ -#define C1 v_f64 (0x1.fffffffffffd4p-2) -#define C2 v_f64 (0x1.5555571d6b68cp-3) -#define C3 v_f64 (0x1.5555576a59599p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep8) -#define Ln2hi v_f64 (0x1.62e42fefa39efp-9) -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64) -#endif - -#define N (1 << V_EXP_TABLE_BITS) -#define Tab __v_exp_data -#define IndexMask v_u64 (N - 1) -#define Shift v_f64 (0x1.8p+52) - -#if WANT_SIMD_EXCEPT - -#define TinyBound 0x200 /* top12 (asuint64 (0x1p-511)). */ -#define BigBound 0x408 /* top12 (asuint64 (0x1p9)). */ - -VPCS_ATTR static NOINLINE v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f64 (exp, x, y, cmp); -} - -#else - -#define Thres v_f64 (704.0) - -VPCS_ATTR -static v_f64_t -specialcase (v_f64_t s, v_f64_t y, v_f64_t n) -{ - v_f64_t absn = v_abs_f64 (n); - - /* 2^(n/N) may overflow, break it up into s1*s2. 
*/ - v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); - v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); - v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); - v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); - v_f64_t r1 = s1 * s1; - v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; - return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); -} - -#endif - -VPCS_ATTR -v_f64_t -V_NAME(exp) (v_f64_t x) -{ - v_f64_t n, r, r2, s, y, z; - v_u64_t cmp, u, e, i; - -#if WANT_SIMD_EXCEPT - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - specialcase to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - v_f64_t xm = x; - cmp = v_cond_u64 ((v_as_u64_f64 (v_abs_f64 (x)) >> 52) - TinyBound - >= BigBound - TinyBound); - if (unlikely (v_any_u64 (cmp))) - x = v_sel_f64 (cmp, v_f64 (1), x); -#else - cmp = v_cond_u64 (v_abs_f64 (x) > Thres); -#endif - - /* n = round(x/(ln2/N)). */ - z = v_fma_f64 (x, InvLn2, Shift); - u = v_as_u64_f64 (z); - n = z - Shift; - - /* r = x - n*ln2/N. */ - r = x; - r = v_fma_f64 (-Ln2hi, n, r); - r = v_fma_f64 (-Ln2lo, n, r); - - e = u << (52 - V_EXP_TABLE_BITS); - i = u & IndexMask; - - /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - r2 = r * r; - y = v_fma_f64 (C2, r, C1); - y = v_fma_f64 (C3, r2, y); - y = v_fma_f64 (y, r2, r); - - /* s = 2^(n/N). */ - u = v_lookup_u64 (Tab, i); - s = v_as_f64_u64 (u + e); - - if (unlikely (v_any_u64 (cmp))) -#if WANT_SIMD_EXCEPT - return specialcase (xm, v_fma_f64 (y, s, s), cmp); -#else - return specialcase (s, y, n); -#endif - - return v_fma_f64 (y, s, s); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp.h b/contrib/arm-optimized-routines/math/v_exp.h deleted file mode 100644 index 1e7f7f3b927d..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Declarations for double-precision e^x vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_EXP_TABLE_BITS 7 - -extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp2f.c b/contrib/arm-optimized-routines/math/v_exp2f.c deleted file mode 100644 index 7f40dbaa6679..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp2f.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.962 ulp. */ - 0x1.59977ap-10f, - 0x1.3ce9e4p-7f, - 0x1.c6bd32p-5f, - 0x1.ebf9bcp-3f, - 0x1.62e422p-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) - -#if WANT_SIMD_EXCEPT - -#define TinyBound 0x20000000 /* asuint (0x1p-63). */ -#define BigBound 0x42800000 /* asuint (0x1p6). */ - -VPCS_ATTR -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. 
*/ - return v_call_f32 (exp2f, x, y, cmp); -} - -#else - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -#endif - -VPCS_ATTR -v_f32_t -V_NAME(exp2f) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly; - v_u32_t cmp, e; - -#if WANT_SIMD_EXCEPT - cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound - >= BigBound - TinyBound); - v_f32_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - specialcase to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - if (unlikely (v_any_u32 (cmp))) - x = v_sel_f32 (cmp, v_f32 (1), x); -#endif - - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - v_f32_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = v_as_u32_f32 (z) << 23; -#else - n = v_round_f32 (x); - r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - -#if !WANT_SIMD_EXCEPT - v_f32_t absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); -#endif - - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - - if (unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp); -#else - return specialcase (poly, n, e, absn, cmp, scale); -#endif - - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/v_exp2f_1u.c deleted file mode 100644 index de1a32d54139..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp2f_1u.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 0.878 ulp. */ - 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) -#define C5 v_f32 (Poly[5]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. 
*/ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); -} - -VPCS_ATTR -v_f32_t -V_NAME(exp2f_1u) (v_f32_t x) -{ - v_f32_t n, r, scale, poly, absn; - v_u32_t cmp, e; - - /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - v_f32_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = v_as_u32_f32 (z) << 23; -#else - n = v_round_f32 (x); - r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, C5); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} -#endif diff --git a/contrib/arm-optimized-routines/math/v_expf.c b/contrib/arm-optimized-routines/math/v_expf.c deleted file mode 100644 index ade23b2416aa..000000000000 --- a/contrib/arm-optimized-routines/math/v_expf.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.45358 +0.5 ulp. */ - 0x1.0e4020p-7f, - 0x1.573e2ep-5f, - 0x1.555e66p-3f, - 0x1.fffdb6p-2f, - 0x1.ffffecp-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -#if WANT_SIMD_EXCEPT - -#define TinyBound 0x20000000 /* asuint (0x1p-63). */ -#define BigBound 0x42800000 /* asuint (0x1p6). */ - -VPCS_ATTR -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f32 (expf, x, y, cmp); -} - -#else - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. 
*/ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -#endif - -VPCS_ATTR -v_f32_t -V_NAME(expf) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, z; - v_u32_t cmp, e; - -#if WANT_SIMD_EXCEPT - cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound - >= BigBound - TinyBound); - v_f32_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - specialcase to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - if (unlikely (v_any_u32 (cmp))) - x = v_sel_f32 (cmp, v_f32 (1), x); -#endif - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - -#if !WANT_SIMD_EXCEPT - v_f32_t absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); -#endif - - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - - if (unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp); -#else - return specialcase (poly, n, e, absn, cmp, scale); -#endif - - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_expf_1u.c b/contrib/arm-optimized-routines/math/v_expf_1u.c deleted file mode 100644 index 8f0ae91c582a..000000000000 --- a/contrib/arm-optimized-routines/math/v_expf_1u.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 0.36565 +0.5 ulp. */ - 0x1.6a6000p-10f, - 0x1.12718ep-7f, - 0x1.555af0p-5f, - 0x1.555430p-3f, - 0x1.fffff4p-2f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); -} - -VPCS_ATTR -v_f32_t -V_NAME(expf_1u) (v_f32_t x) -{ - v_f32_t n, r, scale, poly, absn, z; - v_u32_t cmp, e; - - /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} -#endif diff --git a/contrib/arm-optimized-routines/math/v_log.c b/contrib/arm-optimized-routines/math/v_log.c deleted file mode 100644 index 47a829119b3c..000000000000 --- a/contrib/arm-optimized-routines/math/v_log.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Double-precision vector log(x) function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#include "v_log.h" -#if V_SUPPORTED - -/* Worst-case error: 1.17 + 0.5 ulp. */ - -static const f64_t Poly[] = { - /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - -0x1.ffffffffffff7p-2, - 0x1.55555555170d4p-2, - -0x1.0000000399c27p-2, - 0x1.999b2e90e94cap-3, - -0x1.554e550bd501ep-3, -}; - -#define A0 v_f64 (Poly[0]) -#define A1 v_f64 (Poly[1]) -#define A2 v_f64 (Poly[2]) -#define A3 v_f64 (Poly[3]) -#define A4 v_f64 (Poly[4]) -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) -#define N (1 << V_LOG_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) - -struct entry -{ - v_f64_t invc; - v_f64_t logc; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - e.invc = __v_log_data[i].invc; - e.logc = __v_log_data[i].logc; -#else - e.invc[0] = __v_log_data[i[0]].invc; - e.logc[0] = __v_log_data[i[0]].logc; - e.invc[1] = __v_log_data[i[1]].invc; - e.logc[1] = __v_log_data[i[1]].logc; -#endif - return e; -} - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (log, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(log) (v_f64_t x) -{ - v_f64_t z, r, r2, p, y, kd, hi; - v_u64_t ix, iz, tmp, top, i, cmp; - v_s64_t k; - struct entry e; - - ix = v_as_u64_f64 (x); - top = ix >> 48; - cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N; - k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */ - iz = ix - (tmp & v_u64 (0xfffULL << 52)); - z = v_as_f64_u64 (iz); - e = lookup (i); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - kd = v_to_f64_s64 (k); - - /* hi = r + log(c) + k*Ln2. */ - hi = v_fma_f64 (kd, Ln2, e.logc + r); - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ - r2 = r * r; - y = v_fma_f64 (A3, r, A2); - p = v_fma_f64 (A1, r, A0); - y = v_fma_f64 (A4, r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_log.h b/contrib/arm-optimized-routines/math/v_log.h deleted file mode 100644 index a37bbc2bd6b6..000000000000 --- a/contrib/arm-optimized-routines/math/v_log.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Declarations for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_LOG_TABLE_BITS 7 - -extern const struct v_log_data -{ - f64_t invc; - f64_t logc; -} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN; -#endif diff --git a/contrib/arm-optimized-routines/math/v_log_data.c b/contrib/arm-optimized-routines/math/v_log_data.c deleted file mode 100644 index ec1c8e5e16b2..000000000000 --- a/contrib/arm-optimized-routines/math/v_log_data.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Lookup table for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_log.h" -#if WANT_VMATH - -#define N (1 << V_LOG_TABLE_BITS) - -/* Algorithm: - - x = 2^k z - log(x) = k ln2 + log(c) + poly(z/c - 1) - -where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) -and log(c) and 1/c for the ith subinterval comes from a lookup table: - - tab[i].invc = 1/c - tab[i].logc = (double)log(c) - -where c is near the center of the subinterval and is chosen by trying several -floating point invc candidates around 1/center and selecting one for which -the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval -that contains 1 and the previous one got tweaked to avoid cancellation. 
*/ -const struct v_log_data __v_log_data[N] = { -{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2}, -{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2}, -{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2}, -{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2}, -{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2}, -{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2}, -{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2}, -{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2}, -{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2}, -{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2}, -{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2}, -{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2}, -{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2}, -{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2}, -{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2}, -{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2}, -{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2}, -{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2}, -{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2}, -{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3}, -{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3}, -{0x1.446f12b278001p+0, -0x1.e52e160484698p-3}, -{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3}, -{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3}, -{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3}, -{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3}, -{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3}, -{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3}, -{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3}, -{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3}, -{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3}, -{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3}, -{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3}, -{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3}, -{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3}, -{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3}, -{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3}, -{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3}, -{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3}, -{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3}, -{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3}, -{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3}, -{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3}, -{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3}, -{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3}, -{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4}, -{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4}, -{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4}, -{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4}, -{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4}, -{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4}, -{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4}, -{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4}, -{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4}, -{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4}, -{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4}, -{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4}, -{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4}, -{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4}, -{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4}, -{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5}, -{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5}, -{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5}, -{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5}, -{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5}, -{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5}, -{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5}, -{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5}, -{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6}, -{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6}, -{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6}, -{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6}, -{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7}, 
-{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7}, -{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9}, -{1.0, 0.0}, -{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8}, -{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7}, -{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6}, -{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6}, -{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5}, -{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5}, -{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5}, -{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5}, -{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4}, -{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4}, -{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4}, -{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4}, -{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4}, -{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4}, -{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4}, -{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4}, -{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4}, -{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3}, -{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3}, -{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3}, -{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3}, -{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3}, -{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3}, -{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3}, -{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3}, -{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3}, -{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3}, -{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3}, -{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3}, -{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3}, -{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3}, -{0x1.9998e1480b618p-1, 0x1.c903161240163p-3}, -{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3}, -{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3}, -{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3}, -{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3}, -{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2}, -{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2}, -{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2}, -{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2}, -{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2}, -{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2}, -{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2}, -{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2}, -{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2}, -{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2}, -{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2}, -{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2}, -{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2}, -{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2}, -{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2}, -{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2}, -}; -#endif diff --git a/contrib/arm-optimized-routines/math/v_logf.c b/contrib/arm-optimized-routines/math/v_logf.c deleted file mode 100644 index 93a53758bff7..000000000000 --- a/contrib/arm-optimized-routines/math/v_logf.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Single-precision vector log function. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 3.34 ulp error */ - -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, - -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, -}; -#define P7 v_f32 (Poly[0]) -#define P6 v_f32 (Poly[1]) -#define P5 v_f32 (Poly[2]) -#define P4 v_f32 (Poly[3]) -#define P3 v_f32 (Poly[4]) -#define P2 v_f32 (Poly[5]) -#define P1 v_f32 (Poly[6]) - -#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Mask v_u32 (0x007fffff) -#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ - -VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (logf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(logf) (v_f32_t x) -{ - v_f32_t n, p, q, r, r2, y; - v_u32_t u, cmp; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */ - u -= Off; - n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */ - u &= Mask; - u += Off; - r = v_as_f32_u32 (u) - v_f32 (1.0f); - - /* y = log(1+r) + n*ln2. */ - r2 = r * r; - /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = v_fma_f32 (P6, r, P5); - q = v_fma_f32 (P4, r, P3); - y = v_fma_f32 (P2, r, P1); - p = v_fma_f32 (P7, r2, p); - q = v_fma_f32 (p, r2, q); - y = v_fma_f32 (q, r2, y); - p = v_fma_f32 (Ln2, n, r); - y = v_fma_f32 (y, r2, p); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_math.h b/contrib/arm-optimized-routines/math/v_math.h deleted file mode 100644 index 3289916187d2..000000000000 --- a/contrib/arm-optimized-routines/math/v_math.h +++ /dev/null @@ -1,661 +0,0 @@ -/* - * Vector math abstractions. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _V_MATH_H -#define _V_MATH_H - -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif -#if WANT_VMATH - -/* The goal of this header is to allow vector and scalar - build of the same algorithm, the provided intrinsic - wrappers are also vector length agnostic so they can - be implemented for SVE too (or other simd architectures) - and then the code should work on those targets too. */ - -#if SCALAR -#define V_NAME(x) __s_##x -#elif VPCS && __aarch64__ -#define V_NAME(x) __vn_##x -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) -#else -#define V_NAME(x) __v_##x -#endif - -#ifndef VPCS_ATTR -#define VPCS_ATTR -#endif -#ifndef VPCS_ALIAS -#define VPCS_ALIAS -#endif - -#include -#include "math_config.h" - -typedef float f32_t; -typedef uint32_t u32_t; -typedef int32_t s32_t; -typedef double f64_t; -typedef uint64_t u64_t; -typedef int64_t s64_t; - -/* reinterpret as type1 from type2. 
*/ -static inline u32_t -as_u32_f32 (f32_t x) -{ - union { f32_t f; u32_t u; } r = {x}; - return r.u; -} -static inline f32_t -as_f32_u32 (u32_t x) -{ - union { u32_t u; f32_t f; } r = {x}; - return r.f; -} -static inline s32_t -as_s32_u32 (u32_t x) -{ - union { u32_t u; s32_t i; } r = {x}; - return r.i; -} -static inline u32_t -as_u32_s32 (s32_t x) -{ - union { s32_t i; u32_t u; } r = {x}; - return r.u; -} -static inline u64_t -as_u64_f64 (f64_t x) -{ - union { f64_t f; u64_t u; } r = {x}; - return r.u; -} -static inline f64_t -as_f64_u64 (u64_t x) -{ - union { u64_t u; f64_t f; } r = {x}; - return r.f; -} -static inline s64_t -as_s64_u64 (u64_t x) -{ - union { u64_t u; s64_t i; } r = {x}; - return r.i; -} -static inline u64_t -as_u64_s64 (s64_t x) -{ - union { s64_t i; u64_t u; } r = {x}; - return r.u; -} - -#if SCALAR -#define V_SUPPORTED 1 -typedef f32_t v_f32_t; -typedef u32_t v_u32_t; -typedef s32_t v_s32_t; -typedef f64_t v_f64_t; -typedef u64_t v_u64_t; -typedef s64_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 1; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return x; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return x; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return x; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - *x = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - *x = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - *x = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x ? -1 : 0; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return __builtin_fabsf (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return __builtin_fmaf (x, y, z); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return __builtin_roundf (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return __builtin_lroundf (x); /* relies on -fno-math-errno. */ -} -static inline v_f32_t -v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) -{ - return p ? x : y; -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return x; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return f (x); -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return f (x1, x2); -} - -static inline int -v_lanes64 (void) -{ - return 1; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return x; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return x; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return x; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - *x = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x ? -1 : 0; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return __builtin_fabs (x); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return __builtin_fma (x, y, z); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return __builtin_round (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return __builtin_lround (x); /* relies on -fno-math-errno. */ -} -static inline v_f64_t -v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) -{ - return p ? x : y; -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return x; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return f (x); -} - -#elif __aarch64__ -#define V_SUPPORTED 1 -#include <arm_neon.h> -typedef float32x4_t v_f32_t; -typedef uint32x4_t v_u32_t; -typedef int32x4_t v_s32_t; -typedef float64x2_t v_f64_t; -typedef uint64x2_t v_u64_t; -typedef int64x2_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return (v_f32_t){x, x, x, x}; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return (v_u32_t){x, x, x, x}; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return (v_s32_t){x, x, x, x}; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x[i]; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x[i]; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x[i]; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - (*x)[i] = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return vabsq_f32 (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return vfmaq_f32 (z, x, y); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return vrndaq_f32 (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return vcvtaq_s32_f32 (x); -} -static inline v_f32_t -v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) -{ - return vbslq_f32 (p, x, y); -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -/* reinterpret as type1 from type2.
*/ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return ( - v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; -} - -static inline int -v_lanes64 (void) -{ - return 2; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return (v_f64_t){x, x}; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return (v_u64_t){x, x}; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return (v_s64_t){x, x}; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x[i]; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - (*x)[i] = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (x) != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return vabsq_f64 (x); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return vfmaq_f64 (z, x, y); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return vrndaq_f64 (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return vcvtaq_s64_f64 (x); -} -static inline v_f64_t -v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) -{ - return vbslq_f64 (p, x, y); -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -/* reinterpret as type1 from type2. */ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return (v_f64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return (v_u64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? 
f (x[1]) : y[1]}; -} -#endif - -#endif -#endif diff --git a/contrib/arm-optimized-routines/math/v_pow.c b/contrib/arm-optimized-routines/math/v_pow.c deleted file mode 100644 index 05a83aaa8c0a..000000000000 --- a/contrib/arm-optimized-routines/math/v_pow.c +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Double-precision vector pow function. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -VPCS_ATTR -v_f64_t -V_NAME(pow) (v_f64_t x, v_f64_t y) -{ - v_f64_t z; - for (int lane = 0; lane < v_lanes64 (); lane++) - { - f64_t sx = v_get_f64 (x, lane); - f64_t sy = v_get_f64 (y, lane); - f64_t sz = pow (sx, sy); - v_set_f64 (&z, lane, sz); - } - return z; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_powf.c b/contrib/arm-optimized-routines/math/v_powf.c deleted file mode 100644 index ad8ab8d4f00d..000000000000 --- a/contrib/arm-optimized-routines/math/v_powf.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Single-precision vector powf function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define SBITS 5 -#define Tlog v__powf_log2_data.tab -#define Texp v__exp2f_data.tab -#define A v__powf_log2_data.poly -#define C v__exp2f_data.poly -#define LOGDEG 4 - -#if LOGDEG == 5 -/* 1.01 ulp */ -#define OFF v_u32 (0x3f330000) -#define TBITS 4 -#elif LOGDEG == 4 -/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */ -#define OFF v_u32 (0x3f35d000) -#define TBITS 5 -#endif - -#define V_EXP2F_TABLE_BITS SBITS -#define V_EXP2F_POLY_ORDER 3 -struct v_exp2f_data -{ - uint64_t tab[1 << V_EXP2F_TABLE_BITS]; - double poly[V_EXP2F_POLY_ORDER]; -}; - -#define V_POWF_LOG2_TABLE_BITS TBITS -#define V_POWF_LOG2_POLY_ORDER LOGDEG -#define SCALE ((double) (1 << SBITS)) -struct v_powf_log2_data -{ - struct - { - double invc, logc; - } tab[1 << V_POWF_LOG2_TABLE_BITS]; - double poly[V_POWF_LOG2_POLY_ORDER]; -}; - -static const struct v_powf_log2_data v__powf_log2_data = { -#if LOGDEG == 5 - .tab = { -{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE }, -{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE }, -{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE }, -{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE }, -{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE }, -{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE }, -{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE }, -{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE }, -{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE }, -{ 0x1p+0, 0x0p+0 * SCALE }, -{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE }, -{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE }, -{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE }, -{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE }, -{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE }, -{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE }, - }, -/* rel err: 1.46 * 2^-32 */ - .poly = { -0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE, -0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE, -0x1.71547652ab82bp0 * SCALE, - } -#elif LOGDEG == 4 - .tab = { -{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE}, -{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE}, -{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE}, 
-{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE}, -{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE}, -{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE}, -{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE}, -{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE}, -{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE}, -{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE}, -{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE}, -{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE}, -{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE}, -{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE}, -{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE}, -{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE}, -{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE}, -{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE}, -{0x1p+0, 0x0p+0 * SCALE}, -{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE}, -{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE}, -{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE}, -{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE}, -{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE}, -{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE}, -{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE}, -{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE}, -{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE}, -{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE}, -{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE}, -{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE}, -{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE}, - }, -/* rel err: 1.5 * 2^-30 */ - .poly = { - -0x1.6ff5daa3b3d7cp-2 * SCALE, - 0x1.ec81d03c01aebp-2 * SCALE, - -0x1.71547bb43f101p-1 * SCALE, - 0x1.7154764a815cbp0 * SCALE, - } -#endif -}; - -static const struct v_exp2f_data v__exp2f_data = { - .tab = { -0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, -0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, -0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, -0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, -0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, -0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, -0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, -0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, - }, -/* rel err: 1.69 * 2^-34 */ - .poly = { -0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE - }, -}; - -VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp) -{ - return v_call2_f32 (powf, x, y, ret, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(powf) (v_f32_t x, v_f32_t y) -{ - v_u32_t u, tmp, cmp, i, top, iz; - v_s32_t k; - v_f32_t ret; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); - tmp = u - OFF; - i = (tmp >> (23 - TBITS)) % (1 << TBITS); - top = tmp & 0xff800000; - iz = u - top; - k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */ - - for (int lane = 0; lane < v_lanes32 (); lane++) - { - uint32_t si, siz; - int32_t sk; - float sy; - - /* Use double precision for each lane. 
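A stand-alone view of what the per-lane loop here computes: each float lane is widened to double, powf is decomposed as 2^(y * log2(x)), and the result is narrowed back. This sketch substitutes plain libm calls for the table-driven Tlog/Texp polynomials, so it shows the skeleton only; the function name is illustrative.

#include <math.h>

/* Illustrative skeleton: the removed routine replaces log2/exp2 with the
   Tlog/Texp tables and the A[]/C[] polynomials, and handles special cases
   separately.  Valid for finite x > 0.  */
static float
powf_lane_sketch (float x, float y)
{
  double logx = log2 ((double) x);  /* table + polynomial in the real code */
  double ylogx = (double) y * logx;
  return (float) exp2 (ylogx);      /* table + polynomial in the real code */
}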
*/ - double invc, logc, z, r, p, y0, logx, ylogx, kd, s; - uint64_t ki, t; - - si = v_get_u32 (i, lane); - siz = v_get_u32 (iz, lane); - sk = v_get_s32 (k, lane); - sy = v_get_f32 (y, lane); - - invc = Tlog[si].invc; - logc = Tlog[si].logc; - z = (double) as_f32_u32 (siz); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ - r = __builtin_fma (z, invc, -1.0); - y0 = logc + (double) sk; - - /* Polynomial to approximate log1p(r)/ln2. */ -#if LOGDEG == 5 - logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + A[4]; - logx = r * logx + y0; -#elif LOGDEG == 4 - logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + y0; -#endif - ylogx = sy * logx; - v_set_u32 (&cmp, lane, - (as_u64_f64 (ylogx) >> 47 & 0xffff) - >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47 - ? 1 - : v_get_u32 (cmp, lane)); - - /* N*x = k + r with r in [-1/2, 1/2] */ -#if TOINT_INTRINSICS - kd = roundtoint (ylogx); /* k */ - ki = converttoint (ylogx); -#else -# define SHIFT 0x1.8p52 - kd = eval_as_double (ylogx + SHIFT); - ki = asuint64 (kd); - kd -= SHIFT; -#endif - r = ylogx - kd; - - /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ - t = Texp[ki % (1 << SBITS)]; - t += ki << (52 - SBITS); - s = as_f64_u64 (t); - p = C[0]; - p = __builtin_fma (p, r, C[1]); - p = __builtin_fma (p, r, C[2]); - p = __builtin_fma (p, s * r, s); - - v_set_f32 (&ret, lane, p); - } - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, ret, cmp); - return ret; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_sin.c b/contrib/arm-optimized-routines/math/v_sin.c deleted file mode 100644 index 9dbb9dec04de..000000000000 --- a/contrib/arm-optimized-routines/math/v_sin.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Double-precision vector sin function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const double Poly[] = { -/* worst-case error is 3.5 ulp. - abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ --0x1.9f4a9c8b21dc9p-41, - 0x1.60e88a10163f2p-33, --0x1.ae6361b7254e7p-26, - 0x1.71de382e8d62bp-19, --0x1.a01a019aeb4ffp-13, - 0x1.111111110b25ep-7, --0x1.55555555554c3p-3, -}; - -#define C7 v_f64 (Poly[0]) -#define C6 v_f64 (Poly[1]) -#define C5 v_f64 (Poly[2]) -#define C4 v_f64 (Poly[3]) -#define C3 v_f64 (Poly[4]) -#define C2 v_f64 (Poly[5]) -#define C1 v_f64 (Poly[6]) - -#define InvPi v_f64 (0x1.45f306dc9c883p-2) -#define Pi1 v_f64 (0x1.921fb54442d18p+1) -#define Pi2 v_f64 (0x1.1a62633145c06p-53) -#define Pi3 v_f64 (0x1.c1cd129024e09p-106) -#define Shift v_f64 (0x1.8p52) -#define AbsMask v_u64 (0x7fffffffffffffff) - -#if WANT_SIMD_EXCEPT -#define TinyBound 0x202 /* top12 (asuint64 (0x1p-509)). */ -#define Thresh 0x214 /* top12 (asuint64 (RangeVal)) - TinyBound. */ -#else -#define RangeVal v_f64 (0x1p23) -#endif - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (sin, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(sin) (v_f64_t x) -{ - v_f64_t n, r, r2, y; - v_u64_t sign, odd, cmp, ir; - - ir = v_as_u64_f64 (x) & AbsMask; - r = v_as_f64_u64 (ir); - sign = v_as_u64_f64 (x) & ~AbsMask; - -#if WANT_SIMD_EXCEPT - /* Detect |x| <= 0x1p-509 or |x| >= RangeVal. If fenv exceptions are to be - triggered correctly, set any special lanes to 1 (which is neutral w.r.t. - fenv). 
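The lane-masking trick described here can be isolated as follows (array form, illustrative names): lanes flagged as special are overwritten with 1.0, which every polynomial path can evaluate without raising floating-point exceptions, and the scalar fallback recomputes them afterwards.

/* Sketch of fenv-safe lane neutralization; a nonzero entry in special[]
   marks a lane for the scalar fallback.  */
static void
neutralize_special_lanes (double *r, const unsigned long *special, int lanes)
{
  for (int i = 0; i < lanes; i++)
    if (special[i])
      r[i] = 1.0; /* neutral w.r.t. fenv; overwritten by the fallback */
}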
These lanes will be fixed by specialcase later. */ - cmp = v_cond_u64 ((ir >> 52) - TinyBound >= Thresh); - if (unlikely (v_any_u64 (cmp))) - r = v_sel_f64 (cmp, v_f64 (1), r); -#else - cmp = v_cond_u64 (ir >= v_as_u64_f64 (RangeVal)); -#endif - - /* n = rint(|x|/pi). */ - n = v_fma_f64 (InvPi, r, Shift); - odd = v_as_u64_f64 (n) << 63; - n -= Shift; - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = v_fma_f64 (-Pi1, n, r); - r = v_fma_f64 (-Pi2, n, r); - r = v_fma_f64 (-Pi3, n, r); - - /* sin(r) poly approx. */ - r2 = r * r; - y = v_fma_f64 (C7, r2, C6); - y = v_fma_f64 (y, r2, C5); - y = v_fma_f64 (y, r2, C4); - y = v_fma_f64 (y, r2, C3); - y = v_fma_f64 (y, r2, C2); - y = v_fma_f64 (y, r2, C1); - y = v_fma_f64 (y * r2, r, r); - - /* sign. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_sinf.c b/contrib/arm-optimized-routines/math/v_sinf.c deleted file mode 100644 index ce35dacc65cf..000000000000 --- a/contrib/arm-optimized-routines/math/v_sinf.c +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Single-precision vector sin function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 1.886 ulp error */ - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, -}; -#define Pi1 v_f32 (0x1.921fb6p+1f) -#define Pi2 v_f32 (-0x1.777a5cp-24f) -#define Pi3 v_f32 (-0x1.ee59dap-49f) -#define A3 v_f32 (Poly[3]) -#define A5 v_f32 (Poly[2]) -#define A7 v_f32 (Poly[1]) -#define A9 v_f32 (Poly[0]) -#define RangeVal v_f32 (0x1p20f) -#define TinyBound v_f32 (0x1p-61f) -#define InvPi v_f32 (0x1.45f306p-2f) -#define Shift v_f32 (0x1.8p+23f) -#define AbsMask v_u32 (0x7fffffff) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (sinf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(sinf) (v_f32_t x) -{ - v_f32_t n, r, r2, y; - v_u32_t sign, odd, cmp, ir; - - ir = v_as_u32_f32 (x) & AbsMask; - r = v_as_f32_u32 (ir); - sign = v_as_u32_f32 (x) & ~AbsMask; - -#if WANT_SIMD_EXCEPT - cmp = v_cond_u32 ((ir - v_as_u32_f32 (TinyBound) - >= v_as_u32_f32 (RangeVal) - v_as_u32_f32 (TinyBound))); - if (unlikely (v_any_u32 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - specialcase later. */ - r = v_sel_f32 (cmp, v_f32 (1), r); -#else - cmp = v_cond_u32 (ir >= v_as_u32_f32 (RangeVal)); -#endif - - /* n = rint(|x|/pi) */ - n = v_fma_f32 (InvPi, r, Shift); - odd = v_as_u32_f32 (n) << 31; - n -= Shift; - - /* r = |x| - n*pi (range reduction into -pi/2 .. 
pi/2) */ - r = v_fma_f32 (-Pi1, n, r); - r = v_fma_f32 (-Pi2, n, r); - r = v_fma_f32 (-Pi3, n, r); - - /* y = sin(r) */ - r2 = r * r; - y = v_fma_f32 (A9, r2, A7); - y = v_fma_f32 (y, r2, A5); - y = v_fma_f32 (y, r2, A3); - y = v_fma_f32 (y * r2, r, r); - - /* sign fix */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/vn_cos.c b/contrib/arm-optimized-routines/math/vn_cos.c deleted file mode 100644 index 4b5b23718a8b..000000000000 --- a/contrib/arm-optimized-routines/math/vn_cos.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cos. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos) -#include "v_cos.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_cosf.c b/contrib/arm-optimized-routines/math/vn_cosf.c deleted file mode 100644 index 86dd26ecb3e7..000000000000 --- a/contrib/arm-optimized-routines/math/vn_cosf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cosf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf) -#include "v_cosf.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_exp.c b/contrib/arm-optimized-routines/math/vn_exp.c deleted file mode 100644 index 0d85b17de05a..000000000000 --- a/contrib/arm-optimized-routines/math/vn_exp.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp) -#include "v_exp.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_exp2f.c b/contrib/arm-optimized-routines/math/vn_exp2f.c deleted file mode 100644 index da3bb40ae93f..000000000000 --- a/contrib/arm-optimized-routines/math/vn_exp2f.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp2f. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f) -#include "v_exp2f.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c b/contrib/arm-optimized-routines/math/vn_exp2f_1u.c deleted file mode 100644 index 3e3a24705614..000000000000 --- a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp2f_1u. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_exp2f_1u.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_expf.c b/contrib/arm-optimized-routines/math/vn_expf.c deleted file mode 100644 index 6e91a940bbf4..000000000000 --- a/contrib/arm-optimized-routines/math/vn_expf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf. - * - * Copyright (c) 2019, Arm Limited. 
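In the sin/sinf kernels above, pi is carried as three non-overlapping parts (Pi1, Pi2, Pi3) so that r = |x| - n*pi can be formed with three fused multiply-subtracts to well beyond working precision. The same reduction in scalar double form, with the constants copied from v_sin.c (a sketch, not part of the library):

#include <math.h>

/* Cody-Waite style reduction: accurate while |x| stays below the RangeVal
   cutoff, so that n * pi_1 incurs no rounding.  */
static double
reduce_into_half_pi (double ax, double n)
{
  const double pi_1 = 0x1.921fb54442d18p+1;
  const double pi_2 = 0x1.1a62633145c06p-53;
  const double pi_3 = 0x1.c1cd129024e09p-106;
  double r = ax;
  r = fma (-pi_1, n, r);
  r = fma (-pi_2, n, r);
  r = fma (-pi_3, n, r);
  return r; /* |x| - n*pi, in [-pi/2, pi/2] */
}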
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) -#include "v_expf.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_expf_1u.c b/contrib/arm-optimized-routines/math/vn_expf_1u.c deleted file mode 100644 index 57ae6a315b9b..000000000000 --- a/contrib/arm-optimized-routines/math/vn_expf_1u.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf_1u. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_expf_1u.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_log.c b/contrib/arm-optimized-routines/math/vn_log.c deleted file mode 100644 index 902bff1fcd4e..000000000000 --- a/contrib/arm-optimized-routines/math/vn_log.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log) -#include "v_log.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_logf.c b/contrib/arm-optimized-routines/math/vn_logf.c deleted file mode 100644 index 07e493685b4d..000000000000 --- a/contrib/arm-optimized-routines/math/vn_logf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_logf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf) -#include "v_logf.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_pow.c b/contrib/arm-optimized-routines/math/vn_pow.c deleted file mode 100644 index 1a980ff6bf2f..000000000000 --- a/contrib/arm-optimized-routines/math/vn_pow.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_pow. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow) -#include "v_pow.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_powf.c b/contrib/arm-optimized-routines/math/vn_powf.c deleted file mode 100644 index a42ade371adc..000000000000 --- a/contrib/arm-optimized-routines/math/vn_powf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_powf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf) -#include "v_powf.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_sin.c b/contrib/arm-optimized-routines/math/vn_sin.c deleted file mode 100644 index 64b05c8ca0eb..000000000000 --- a/contrib/arm-optimized-routines/math/vn_sin.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sin. - * - * Copyright (c) 2019, Arm Limited. 
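These one-file wrappers publish each __v_* routine under its AArch64 vector-function-ABI name: _ZGVnN2v_sin decodes as AdvSIMD ('n'), two lanes ('N2'), one vector argument ('v'), which is why the two-argument pow uses _ZGVnN2vv_pow and the 4-lane float routines use _ZGVnN4v_. A common way strong_alias is spelled in GNU toolchains is shown below; the library's actual definition lives in its own headers, so this is illustrative only.

/* Illustrative definition: declares aliasname as an alias of name with the
   same type, using the GNU alias attribute.  */
#define strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));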
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin) -#include "v_sin.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_sinf.c b/contrib/arm-optimized-routines/math/vn_sinf.c deleted file mode 100644 index 6e880c60dc39..000000000000 --- a/contrib/arm-optimized-routines/math/vn_sinf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sinf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf) -#include "v_sinf.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/Dir.mk b/contrib/arm-optimized-routines/pl/math/Dir.mk index be65344572a8..94b26cf3309c 100644 --- a/contrib/arm-optimized-routines/pl/math/Dir.mk +++ b/contrib/arm-optimized-routines/pl/math/Dir.mk @@ -1,13 +1,18 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2023, Arm Limited. +# Copyright (c) 2019-2024, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception PLM := $(srcdir)/pl/math AOR := $(srcdir)/math B := build/pl/math -math-lib-srcs := $(wildcard $(PLM)/*.[cS]) +pl-lib-srcs := $(wildcard $(PLM)/*.[cS]) + +ifeq ($(WANT_SVE_MATH), 0) +pl-lib-srcs := $(filter-out $(PLM)/sv_%, $(pl-lib-srcs)) +endif + math-test-srcs := \ $(AOR)/test/mathtest.c \ $(AOR)/test/mathbench.c \ @@ -15,10 +20,10 @@ math-test-srcs := \ math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS]) -math-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h)) -math-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h)) +pl-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h)) +pl-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h)) -math-libs := \ +pl-libs := \ build/pl/lib/libmathlib.so \ build/pl/lib/libmathlib.a \ @@ -32,37 +37,39 @@ math-tools := \ math-host-tools := \ build/pl/bin/rtest \ -math-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(math-lib-srcs))) +pl-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(pl-lib-srcs))) math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs))) math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs))) -math-target-objs := $(math-lib-objs) $(math-test-objs) -math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs) +pl-target-objs := $(pl-lib-objs) $(math-test-objs) +pl-objs := $(pl-target-objs) $(pl-target-objs:%.o=%.os) $(math-host-objs) pl/math-files := \ - $(math-objs) \ - $(math-libs) \ + $(pl-objs) \ + $(pl-libs) \ $(math-tools) \ $(math-host-tools) \ - $(math-includes) \ - $(math-test-includes) \ + $(pl-includes) \ + $(pl-test-includes) \ -all-pl/math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) +all-pl/math: $(pl-libs) $(math-tools) $(pl-includes) $(pl-test-includes) -$(math-objs): $(math-includes) $(math-test-includes) -$(math-objs): CFLAGS_PL += $(math-cflags) +$(pl-objs): $(pl-includes) $(pl-test-includes) +$(pl-objs): CFLAGS_PL += $(math-cflags) $(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno $(math-host-objs): CC = $(HOST_CC) $(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS) -build/pl/include/test/ulp_funcs_gen.h: $(math-lib-srcs) +$(B)/sv_%: CFLAGS_PL += $(math-sve-cflags) + +build/pl/include/test/ulp_funcs_gen.h: 
$(pl-lib-srcs) # Replace PL_SIG cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@ -build/pl/include/test/mathbench_funcs_gen.h: $(math-lib-srcs) +build/pl/include/test/mathbench_funcs_gen.h: $(pl-lib-srcs) # Replace PL_SIG macros with mathbench func entries cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@ -build/pl/include/test/ulp_wrappers_gen.h: $(math-lib-srcs) +build/pl/include/test/ulp_wrappers_gen.h: $(pl-lib-srcs) # Replace PL_SIG macros with ULP wrapper declarations cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@ @@ -72,16 +79,18 @@ $(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test $(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h $(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test -build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os) +build/pl/lib/libmathlib.so: $(pl-lib-objs:%.o=%.os) $(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^ -build/pl/lib/libmathlib.a: $(math-lib-objs) +build/pl/lib/libmathlib.a: $(pl-lib-objs) rm -f $@ $(AR) rc $@ $^ $(RANLIB) $@ $(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc $(math-tools): LDLIBS += $(math-ldlibs) -lm +# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled +$(math-tools): CFLAGS_PL += $(math-sve-cflags) # Some targets to build pl/math/test from math/test sources build/pl/math/test/%.o: $(srcdir)/math/test/%.S @@ -145,12 +154,11 @@ check-pl/math-rtest: $(math-host-tools) $(math-tools) ulp-input-dir=$(B)/test/inputs -math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs))) -math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs))) -math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs))) -math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(math-lib-srcs))) +math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(pl-lib-srcs))) +math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(pl-lib-srcs))) +math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(pl-lib-srcs))) -ulp-inputs = $(math-lib-lims) $(math-lib-aliases) $(math-lib-fenvs) $(math-lib-itvs) +ulp-inputs = $(math-lib-lims) $(math-lib-fenvs) $(math-lib-itvs) $(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) @@ -158,10 +166,6 @@ $(ulp-input-dir)/%.ulp: $(PLM)/%.c mkdir -p $(@D) $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@ -$(ulp-input-dir)/%.alias: $(PLM)/%.c - mkdir -p $(@D) - $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@ - $(ulp-input-dir)/%.fenv: $(PLM)/%.c mkdir -p $(@D) $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@ @@ -174,38 +178,21 @@ ulp-lims := $(ulp-input-dir)/limits $(ulp-lims): $(math-lib-lims) cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@ -ulp-aliases := $(ulp-input-dir)/aliases -$(ulp-aliases): $(math-lib-aliases) - cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@ - fenv-exps := $(ulp-input-dir)/fenv $(fenv-exps): $(math-lib-fenvs) cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@ -ulp-itvs-noalias := $(ulp-input-dir)/itvs_noalias -$(ulp-itvs-noalias): $(math-lib-itvs) - cat $^ > $@ - -rename-aliases := $(ulp-input-dir)/rename_alias.sed -$(rename-aliases): $(ulp-aliases) - # Build sed script for 
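The generation rules above all rely on the same preprocessor trick: every routine carries a PL_SIG annotation, and re-preprocessing the sources with a command-line definition of PL_SIG rewrites each annotation into whatever the harness needs. A worked expansion, using the asin annotation that appears later in this patch:

/* Source annotation (from asin_3u.c):
     PL_SIG (S, D, 1, asin, -1.0, 1.0)
   Preprocessed with "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" this becomes
     _ZSD1 (asin)
   and with the mathbench variant "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)"
     _ZSD1 (asin, -1.0, 1.0)
   The harness that includes the generated header defines _ZSD1 and friends
   to expand into its own function tables (mechanism inferred from the rules
   above; the macro bodies live in the test sources).  */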
replacing aliases from generated alias file - cat $< | awk '{ print "s/ " $$1 " / " $$2 " /g" }' > $@ - -ulp-itvs-alias := $(ulp-input-dir)/itvs_alias -$(ulp-itvs-alias): $(ulp-itvs-noalias) $(rename-aliases) - cat $< | sed -f $(rename-aliases) > $@ - ulp-itvs := $(ulp-input-dir)/intervals -$(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias) +$(ulp-itvs): $(math-lib-itvs) cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@ -check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs) +check-pl/math-ulp: $(math-tools) $(ulp-lims) $(fenv-exps) $(ulp-itvs) WANT_SVE_MATH=$(WANT_SVE_MATH) \ ULPFLAGS="$(math-ulpflags)" \ LIMITS=../../../$(ulp-lims) \ - ALIASES=../../../$(ulp-aliases) \ INTERVALS=../../../$(ulp-itvs) \ FENV=../../../$(fenv-exps) \ + FUNC=$(func) \ build/pl/bin/runulp.sh $(EMULATOR) check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp @@ -220,8 +207,8 @@ $(DESTDIR)$(includedir)/pl/%: build/pl/include/% $(INSTALL) -m 644 -D $< $@ install-pl/math: \ - $(math-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \ - $(math-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%) + $(pl-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \ + $(pl-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%) clean-pl/math: rm -f $(pl/math-files) diff --git a/contrib/arm-optimized-routines/pl/math/acos_2u.c b/contrib/arm-optimized-routines/pl/math/acos_2u.c new file mode 100644 index 000000000000..9ec6894f1d81 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/acos_2u.c @@ -0,0 +1,100 @@ +/* + * Double-precision acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "poly_scalar_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask (0x7fffffffffffffff) +#define Half (0x3fe0000000000000) +#define One (0x3ff0000000000000) +#define PiOver2 (0x1.921fb54442d18p+0) +#define Pi (0x1.921fb54442d18p+1) +#define Small (0x3c90000000000000) /* 2^-53. */ +#define Small16 (0x3c90) +#define QNaN (0x7ff8) + +/* Fast implementation of double-precision acos(x) based on polynomial + approximation of double-precision asin(x). + + For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct + rounding. + + For |x| in [Small, 0.5], use the trigonometric identity + + acos(x) = pi/2 - asin(x) + + and use an order 11 polynomial P such that the final approximation of asin is + an odd polynomial: asin(x) ~ x + x^3 * P(x^2). + + The largest observed error in this region is 1.18 ulps, + acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 + want 0x1.0d54d1985c069p+0. + + For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1 + + acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)) + + where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the + approximation of asin near 0. + + The largest observed error in this region is 1.52 ulps, + acos(0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 + want 0x1.edbbedf8a7d6cp-1. + + For x in [-1.0, -0.5], use this other identity to deduce the negative inputs + from their absolute value: acos(x) = pi - acos(-x). */ +double +acos (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint64_t ia16 = ia >> 48; + double ax = asdouble (ia); + uint64_t sign = ix & ~AbsMask; + + /* Special values and invalid range. 
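The three regimes described above rest on two exact identities that can be checked directly against libm (a hedged sketch, assuming |x| <= 1 and omitting the NaN/invalid handling that follows; this is not the library's implementation):

#include <math.h>

/* acos(x) = pi/2 - asin(x) on the central interval, and for |x| >= 0.5,
   acos(|x|) = 2 * asin (sqrt ((1 - |x|) / 2)), with acos(x) = pi - acos(-x)
   recovering negative inputs.  M_PI/M_PI_2 are the usual POSIX constants.  */
static double
acos_by_identities (double x)
{
  double ax = fabs (x);
  if (ax < 0.5)
    return M_PI_2 - asin (x);
  double q = 2.0 * asin (sqrt ((1.0 - ax) / 2.0));
  return x < 0.0 ? M_PI - q : q;
}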
*/ + if (unlikely (ia16 == QNaN)) + return x; + if (ia > One) + return __math_invalid (x); + if (ia16 < Small16) + return PiOver2 - x; + + /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5); + double z = ax < 0.5 ? ax : sqrt (z2); + + /* Use a single polynomial approximation P for both intervals. */ + double z4 = z2 * z2; + double z8 = z4 * z4; + double z16 = z8 * z8; + double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = fma (z * z2, p, z); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = pi - 2 Q(|x|), for -1.0 < x <= -0.5 + = 2 Q(|x|) , for -0.5 < x < 0.0. */ + if (ax < 0.5) + return PiOver2 - asdouble (asuint64 (p) | sign); + + return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p; +} + +PL_SIG (S, D, 1, acos, -1.0, 1.0) +PL_TEST_ULP (acos, 1.02) +PL_TEST_INTERVAL (acos, 0, Small, 5000) +PL_TEST_INTERVAL (acos, Small, 0.5, 50000) +PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000) +PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000) +PL_TEST_INTERVAL (acos, -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/acosf_1u4.c b/contrib/arm-optimized-routines/pl/math/acosf_1u4.c new file mode 100644 index 000000000000..6dde422ef85a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/acosf_1u4.c @@ -0,0 +1,99 @@ +/* + * Single-precision acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "poly_scalar_f32.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask (0x7fffffff) +#define Half (0x3f000000) +#define One (0x3f800000) +#define PiOver2f (0x1.921fb6p+0f) +#define Pif (0x1.921fb6p+1f) +#define Small (0x32800000) /* 2^-26. */ +#define Small12 (0x328) +#define QNaN (0x7fc) + +/* Fast implementation of single-precision acos(x) based on polynomial + approximation of single-precision asin(x). + + For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct + rounding. + + For |x| in [Small, 0.5], use the trigonometric identity + + acos(x) = pi/2 - asin(x) + + and use an order 4 polynomial P such that the final approximation of asin is + an odd polynomial: asin(x) ~ x + x^3 * P(x^2). + + The largest observed error in this region is 1.16 ulps, + acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0 want 0x1.0c27f6p+0. + + For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1 + + acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)) + + where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the + approximation of asin near 0. + + The largest observed error in this region is 1.32 ulps, + acosf(0x1.15ba56p-1) got 0x1.feb33p-1 want 0x1.feb32ep-1. + + For x in [-1.0, -0.5], use this other identity to deduce the negative inputs + from their absolute value. + + acos(x) = pi - acos(-x) + + The largest observed error in this region is 1.28 ulps, + acosf(-0x1.002072p-1) got 0x1.0c1e84p+1 want 0x1.0c1e82p+1. */ +float +acosf (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + float ax = asfloat (ia); + uint32_t sign = ix & ~AbsMask; + + /* Special values and invalid range. 
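The classification that follows compares only the top 12 bits of the representation with the sign cleared: for a float that is the 8 exponent bits plus the top 3 mantissa bits, so a single integer compare buckets |x| against a power-of-two bound, and 0x7fc matches the canonical quiet-NaN pattern. In stand-alone form (a sketch; the real code uses the asuint helper from math_config.h):

#include <stdint.h>
#include <string.h>

/* Top 12 bits of |x|'s representation; e.g. a result below 0x328 means
   |x| < 0x1p-26, and 0x7f8 or above means |x| is infinite or NaN.  */
static uint32_t
top12_abs (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u); /* asuint (x) */
  return (u & 0x7fffffff) >> 20;
}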
*/ + if (unlikely (ia12 == QNaN)) + return x; + if (ia > One) + return __math_invalidf (x); + if (ia12 < Small12) + return PiOver2f - x; + + /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f); + float z = ax < 0.5 ? ax : sqrtf (z2); + + /* Use a single polynomial approximation P for both intervals. */ + float p = horner_4_f32 (z2, __asinf_poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = fmaf (z * z2, p, z); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = pi - 2 Q(|x|), for -1.0 < x <= -0.5 + = 2 Q(|x|) , for -0.5 < x < 0.0. */ + if (ax < 0.5) + return PiOver2f - asfloat (asuint (p) | sign); + + return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p; +} + +PL_SIG (S, F, 1, acos, -1.0, 1.0) +PL_TEST_ULP (acosf, 0.82) +PL_TEST_INTERVAL (acosf, 0, Small, 5000) +PL_TEST_INTERVAL (acosf, Small, 0.5, 50000) +PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000) +PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000) +PL_TEST_INTERVAL (acosf, -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/asin_3u.c b/contrib/arm-optimized-routines/pl/math/asin_3u.c new file mode 100644 index 000000000000..0b50995449ce --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asin_3u.c @@ -0,0 +1,106 @@ +/* + * Double-precision asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "poly_scalar_f64.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask (0x7fffffffffffffff) +#define Half (0x3fe0000000000000) +#define One (0x3ff0000000000000) +#define PiOver2 (0x1.921fb54442d18p+0) +#define Small (0x3e50000000000000) /* 2^-26. */ +#define Small16 (0x3e50) +#define QNaN (0x7ff8) + +/* Fast implementation of double-precision asin(x) based on polynomial + approximation. + + For x < Small, approximate asin(x) by x. Small = 2^-26 for correct rounding. + + For x in [Small, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 1.01 ulps, + asin(0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2 + want 0x1.ed78525a927eep-2. + + No cheap approximation can be obtained near x = 1, since the function is not + continuously differentiable at x = 1. + + For x in [0.5, 1.0], we use a method based on a trigonometric identity + + asin(x) = pi/2 - acos(x) + + and a generalized power series expansion of acos(y) near y=1, which reads as + + acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1) + + The Taylor series of asin(z) near z = 0 reads as + + asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...). + + Therefore, (1) can be written in terms of P(y/2) or even asin(y/2) + + acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2)) + + Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and + + asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)). + + The largest observed error in this region is 2.69 ulps, + asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 + want 0x1.110d7e85fdd53p-1. */ +double +asin (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint64_t ia16 = ia >> 48; + double ax = asdouble (ia); + uint64_t sign = ix & ~AbsMask; + + /* Special values and invalid range.
*/ + if (unlikely (ia16 == QNaN)) + return x; + if (ia > One) + return __math_invalid (x); + if (ia16 < Small16) + return x; + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5); + double z = ax < 0.5 ? ax : sqrt (z2); + + /* Use a single polynomial approximation P for both intervals. */ + double z4 = z2 * z2; + double z8 = z4 * z4; + double z16 = z8 * z8; + double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = fma (z * z2, p, z); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + double y = ax < 0.5 ? p : fma (-2.0, p, PiOver2); + + /* Copy sign. */ + return asdouble (asuint64 (y) | sign); +} + +PL_SIG (S, D, 1, asin, -1.0, 1.0) +PL_TEST_ULP (asin, 2.19) +PL_TEST_INTERVAL (asin, 0, Small, 5000) +PL_TEST_INTERVAL (asin, Small, 0.5, 50000) +PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000) +PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000) +PL_TEST_INTERVAL (asin, -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/asin_data.c b/contrib/arm-optimized-routines/pl/math/asin_data.c new file mode 100644 index 000000000000..b5517731c7f4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asin_data.c @@ -0,0 +1,19 @@ +/* + * Coefficients for double-precision asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Approximate asin(x) directly in [0x1p-106, 0.25]. See tools/asin.sollya + for how these coefficients were generated. */ +const double __asin_poly[] = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5, + 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, + 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8, + 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, +};
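The estrin_11_f64 call above evaluates the degree-11 polynomial with Estrin's scheme: the precomputed even powers (z4, z8, z16) let independent fused multiply-adds execute in parallel instead of forming one serial Horner chain. The shape of the idea at degree 3 (a sketch, not the library's helper):

#include <math.h>

/* p[0] + p[1]*x + p[2]*x^2 + p[3]*x^3 as two independent FMA pairs joined
   by x2 = x * x; dependency chain of ~2 FMAs instead of 3.  */
static double
estrin_3_sketch (double x, double x2, const double *p)
{
  return fma (x2, fma (x, p[3], p[2]), fma (x, p[1], p[0]));
}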
diff --git a/contrib/arm-optimized-routines/pl/math/asinf_2u5.c b/contrib/arm-optimized-routines/pl/math/asinf_2u5.c new file mode 100644 index 000000000000..ec608146ff66 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinf_2u5.c @@ -0,0 +1,100 @@ +/* + * Single-precision asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "poly_scalar_f32.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask (0x7fffffff) +#define Half (0x3f000000) +#define One (0x3f800000) +#define PiOver2f (0x1.921fb6p+0f) +#define Small (0x39800000) /* 2^-12. */ +#define Small12 (0x398) +#define QNaN (0x7fc) + +/* Fast implementation of single-precision asin(x) based on polynomial + approximation. + + For x < Small, approximate asin(x) by x. Small = 2^-12 for correct rounding. + + For x in [Small, 0.5], use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, + asinf(0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2. + + No cheap approximation can be obtained near x = 1, since the function is not + continuously differentiable at x = 1. + + For x in [0.5, 1.0], we use a method based on a trigonometric identity + + asin(x) = pi/2 - acos(x) + + and a generalized power series expansion of acos(y) near y=1, which reads as + + acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1) + + The Taylor series of asin(z) near z = 0 reads as + + asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...). + + Therefore, (1) can be written in terms of P(y/2) or even asin(y/2) + + acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2)) + + Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and + + asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)). + + The largest observed error in this region is 2.41 ulps, + asinf(0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */ +float +asinf (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + float ax = asfloat (ia); + uint32_t sign = ix & ~AbsMask; + + /* Special values and invalid range. */ + if (unlikely (ia12 == QNaN)) + return x; + if (ia > One) + return __math_invalidf (x); + if (ia12 < Small12) + return x; + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f); + float z = ax < 0.5 ? ax : sqrtf (z2); + + /* Use a single polynomial approximation P for both intervals. */ + float p = horner_4_f32 (z2, __asinf_poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = fmaf (z * z2, p, z); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float y = ax < 0.5 ? p : fmaf (-2.0f, p, PiOver2f); + + /* Copy sign. */ + return asfloat (asuint (y) | sign); +} + +PL_SIG (S, F, 1, asin, -1.0, 1.0) +PL_TEST_ULP (asinf, 1.91) +PL_TEST_INTERVAL (asinf, 0, Small, 5000) +PL_TEST_INTERVAL (asinf, Small, 0.5, 50000) +PL_TEST_INTERVAL (asinf, 0.5, 1.0, 50000) +PL_TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (asinf, 0x1p11, inf, 20000) +PL_TEST_INTERVAL (asinf, -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/asinf_data.c b/contrib/arm-optimized-routines/pl/math/asinf_data.c new file mode 100644 index 000000000000..1652025e2920 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinf_data.c @@ -0,0 +1,16 @@ +/* + * Coefficients for single-precision asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Approximate asinf(x) directly in [0x1p-24, 0.25]. See tools/asinf.sollya + for how these coefficients were generated. */ +const float __asinf_poly[] = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29. */ + 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, 0x1.3af7d8p-5, +}; diff --git a/contrib/arm-optimized-routines/pl/math/asinh_2u5.c b/contrib/arm-optimized-routines/pl/math/asinh_2u5.c index f1679556d5f8..b7fc81a2b94f 100644 --- a/contrib/arm-optimized-routines/pl/math/asinh_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/asinh_2u5.c @@ -4,7 +4,7 @@ * Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrin.h" +#include "poly_scalar_f64.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" @@ -60,8 +60,7 @@ asinh (double x) double z2 = x2 * x2; double z4 = z2 * z2; double z8 = z4 * z4; -#define C(i) __asinh_data.poly[i] - double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); + double p = estrin_17_f64 (x2, z2, z4, z8, z8 * z8, __asinh_data.poly); double y = fma (p, x2 * ax, ax); return asdouble (asuint64 (y) | sign); } diff --git a/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c b/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c index 2b2c55db56dc..ec26b80ec2ec 100644 --- a/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrinf.h" +#include "poly_scalar_f32.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" @@ -16,8 +16,6 @@ #define One (0x3f8) #define ExpM12 (0x398) -#define C(i) __asinhf_data.coeffs[i] - float optr_aor_log_f32 (float); @@ -57,7 +55,7 @@ asinhf (float x) if (ia12 < One) { float x2 = ax * ax; - float p = ESTRIN_7 (ax, x2, x2 * x2, C); + float p = estrin_7_f32 (ax, x2, x2 * x2, __asinhf_data.coeffs); float y = fmaf (x2, p, ax); return asfloat (asuint (y) | sign); } diff --git a/contrib/arm-optimized-routines/pl/math/atan_common.h b/contrib/arm-optimized-routines/pl/math/atan_common.h index da0da6436854..798cc22cc40a 100644 --- a/contrib/arm-optimized-routines/pl/math/atan_common.h +++ b/contrib/arm-optimized-routines/pl/math/atan_common.h @@ -1,49 +1,33 @@ /* - * Double-precision polynomial evaluation function for scalar and vector atan(x) - * and atan2(y,x). + * Double-precision polynomial evaluation function for scalar + * atan(x) and atan2(y,x). * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "estrin.h" - -#if V_SUPPORTED - -#include "v_math.h" - -#define DBL_T v_f64_t -#define P(i) v_f64 (__atan_poly_data.poly[i]) - -#else - -#define DBL_T double -#define P(i) __atan_poly_data.poly[i] - -#endif +#include "poly_scalar_f64.h" /* Polynomial used in fast atan(x) and atan2(y,x) implementations The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ -static inline DBL_T -eval_poly (DBL_T z, DBL_T az, DBL_T shift) +static inline double +eval_poly (double z, double az, double shift) { /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of full scheme to avoid underflow in x^16. */ - DBL_T z2 = z * z; - DBL_T x2 = z2 * z2; - DBL_T x4 = x2 * x2; - DBL_T x8 = x4 * x4; - DBL_T y - = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P)); + double z2 = z * z; + double x2 = z2 * z2; + double x4 = x2 * x2; + double x8 = x4 * x4; + double y = fma (estrin_11_f64 (z2, x2, x4, x8, __atan_poly_data.poly + 8), + x8, estrin_7_f64 (z2, x2, x4, __atan_poly_data.poly)); /* Finalize. y = shift + z + z^3 * P(z^2). 
*/ - y = FMA (y, z2 * az, az); + y = fma (y, z2 * az, az); y = y + shift; return y; } -#undef DBL_T -#undef FMA #undef P diff --git a/contrib/arm-optimized-routines/pl/math/atanf_2u9.c b/contrib/arm-optimized-routines/pl/math/atanf_2u9.c index 9d17f252b8b9..ba6f68089de1 100644 --- a/contrib/arm-optimized-routines/pl/math/atanf_2u9.c +++ b/contrib/arm-optimized-routines/pl/math/atanf_2u9.c @@ -66,11 +66,7 @@ atanf (float x) PL_SIG (S, F, 1, atan, -10.0, 10.0) PL_TEST_ULP (atanf, 2.38) -PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000) -PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000) -PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000) -PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000) -PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000) -PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000) -PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000) -PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000) +PL_TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000) +PL_TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000) +PL_TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000) +PL_TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/atanf_common.h b/contrib/arm-optimized-routines/pl/math/atanf_common.h index 37ca76dee2f7..8952e7e0078b 100644 --- a/contrib/arm-optimized-routines/pl/math/atanf_common.h +++ b/contrib/arm-optimized-routines/pl/math/atanf_common.h @@ -1,5 +1,5 @@ /* - * Single-precision polynomial evaluation function for scalar and vector + * Single-precision polynomial evaluation function for scalar * atan(x) and atan2(y,x). * * Copyright (c) 2021-2023, Arm Limited. @@ -10,26 +10,12 @@ #define PL_MATH_ATANF_COMMON_H #include "math_config.h" -#include "estrinf.h" - -#if V_SUPPORTED - -#include "v_math.h" - -#define FLT_T v_f32_t -#define P(i) v_f32 (__atanf_poly_data.poly[i]) - -#else - -#define FLT_T float -#define P(i) __atanf_poly_data.poly[i] - -#endif +#include "poly_scalar_f32.h" /* Polynomial used in fast atanf(x) and atan2f(y,x) implementations The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ -static inline FLT_T -eval_poly (FLT_T z, FLT_T az, FLT_T shift) +static inline float +eval_poly (float z, float az, float shift) { /* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, a standard implementation using z8 creates spurious underflow in the very last fma. Therefore, we split the last fma into a mul and an fma. Horner and single-level Estrin have higher errors that exceed the threshold. */ - FLT_T z2 = z * z; - FLT_T z4 = z2 * z2; + float z2 = z * z; + float z4 = z2 * z2; /* Then assemble polynomial. */ - FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P)); - + float y = fmaf ( + z4, z4 * pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly + 4), + pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly)); /* Finalize: y = shift + z * P(z^2). */ - return FMA (y, z2 * az, az) + shift; + return fmaf (y, z2 * az, az) + shift; } #endif // PL_MATH_ATANF_COMMON_H diff --git a/contrib/arm-optimized-routines/pl/math/atanh_3u.c b/contrib/arm-optimized-routines/pl/math/atanh_3u.c index a168cd555ff6..dcfbe8192a22 100644 --- a/contrib/arm-optimized-routines/pl/math/atanh_3u.c +++ b/contrib/arm-optimized-routines/pl/math/atanh_3u.c @@ -6,7 +6,7 @@ */ #include "math_config.h" -#include "estrin.h" +#include "poly_scalar_f64.h" #include "pl_sig.h" #include "pl_test.h" @@ -20,7 +20,6 @@ #define OneTop12 0x3ff #define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)).
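The split evaluation in the eval_poly variants above exists to avoid materializing the largest power of the argument: the high coefficients are evaluated separately and scaled by the biggest precomputed power once, so the tiny intermediate that could underflow spuriously (z^16 in the double routine; z^8 on its own in the float one) is never formed. The same shape at degree 11 in z2, under illustrative names:

#include <math.h>

/* Low half of degree 7 plus x8 times a high half of degree 3, with
   x2 = z2^2, x4 = z2^4, x8 = z2^8 precomputed; no power beyond x8 is
   ever materialized.  */
static double
split_estrin_11 (double z2, double x2, double x4, double x8, const double *p)
{
  double lo = fma (x4, fma (x2, fma (z2, p[7], p[6]), fma (z2, p[5], p[4])),
                   fma (x2, fma (z2, p[3], p[2]), fma (z2, p[1], p[0])));
  double hi = fma (x2, fma (z2, p[11], p[10]), fma (z2, p[9], p[8]));
  return fma (hi, x8, lo);
}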
*/ #define BottomMask 0xffffffff -#define C(i) __log1p_data.coeffs[i] static inline double log1p_inline (double x) @@ -46,7 +45,8 @@ log1p_inline (double x) double f2 = f * f; double f4 = f2 * f2; double f8 = f4 * f4; - double p = fma (f, ESTRIN_18 (f, f2, f4, f8, f8 * f8, C) * f, f); + double p = fma ( + f, estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs) * f, f); /* Recombine log1p(x) = k*log2 + log1p(f) + c/m. */ double kd = k; @@ -78,9 +78,6 @@ atanh (double x) PL_SIG (S, D, 1, atanh, -1.0, 1.0) PL_TEST_ULP (atanh, 3.00) -PL_TEST_INTERVAL (atanh, 0, 0x1p-23, 10000) -PL_TEST_INTERVAL (atanh, -0, -0x1p-23, 10000) -PL_TEST_INTERVAL (atanh, 0x1p-23, 1, 90000) -PL_TEST_INTERVAL (atanh, -0x1p-23, -1, 90000) -PL_TEST_INTERVAL (atanh, 1, inf, 100) -PL_TEST_INTERVAL (atanh, -1, -inf, 100) +PL_TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000) +PL_TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000) +PL_TEST_SYM_INTERVAL (atanh, 1, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c index fb90aa29c7a3..e99d5a9900a9 100644 --- a/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c +++ b/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c @@ -15,7 +15,8 @@ #define One 0x3f800000 #define Four 0x40800000 #define Ln2 0x1.62e43p-1f -#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ +/* asuint(0x1p-12), below which atanhf(x) rounds to x. */ +#define TinyBound 0x39800000 #define C(i) __log1pf_data.coeffs[i] @@ -80,9 +81,6 @@ atanhf (float x) PL_SIG (S, F, 1, atanh, -1.0, 1.0) PL_TEST_ULP (atanhf, 2.59) -PL_TEST_INTERVAL (atanhf, 0, 0x1p-12, 500) -PL_TEST_INTERVAL (atanhf, 0x1p-12, 1, 200000) -PL_TEST_INTERVAL (atanhf, 1, inf, 1000) -PL_TEST_INTERVAL (atanhf, -0, -0x1p-12, 500) -PL_TEST_INTERVAL (atanhf, -0x1p-12, -1, 200000) -PL_TEST_INTERVAL (atanhf, -1, -inf, 1000) +PL_TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500) +PL_TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000) +PL_TEST_SYM_INTERVAL (atanhf, 1, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/cbrt_2u.c index 83715dd18a3e..80be83c4470c 100644 --- a/contrib/arm-optimized-routines/pl/math/cbrt_2u.c +++ b/contrib/arm-optimized-routines/pl/math/cbrt_2u.c @@ -31,7 +31,7 @@ cbrt (double x) uint64_t iax = ix & AbsMask; uint64_t sign = ix & ~AbsMask; - if (unlikely (iax == 0 || iax == 0x7f80000000000000)) + if (unlikely (iax == 0 || iax == 0x7ff0000000000000)) return x; /* |x| = m * 2^e, where m is in [0.5, 1.0]. @@ -66,5 +66,4 @@ cbrt (double x) } PL_TEST_ULP (cbrt, 1.30) -PL_TEST_INTERVAL (cbrt, 0, inf, 1000000) -PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000) +PL_TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c index adc591786a6a..88fcb7162ef6 100644 --- a/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c +++ b/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrinf.h" +#include "poly_scalar_f32.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" @@ -14,7 +14,6 @@ #define SignMask 0x80000000 #define TwoThirds 0x1.555556p-1f -#define C(i) __cbrtf_data.poly[i] #define T(i) __cbrtf_data.table[i] /* Approximation for single-precision cbrt(x), using low-order polynomial and @@ -41,7 +40,8 @@ cbrtf (float x) /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. 
The better this is, the less accurate the next stage of the algorithm needs to be. An order-4 polynomial is enough for one Newton iteration. */ - float p = ESTRIN_3 (m, m * m, C); + float p = pairwise_poly_3_f32 (m, m * m, __cbrtf_data.poly); + /* One iteration of Newton's method for iteratively approximating cbrt. */ float m_by_3 = m / 3; float a = fmaf (TwoThirds, p, m_by_3 / (p * p)); @@ -63,5 +63,4 @@ cbrtf (float x) PL_SIG (S, F, 1, cbrt, -10.0, 10.0) PL_TEST_ULP (cbrtf, 1.03) -PL_TEST_INTERVAL (cbrtf, 0, inf, 1000000) -PL_TEST_INTERVAL (cbrtf, -0, -inf, 1000000) +PL_TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/cosh_2u.c b/contrib/arm-optimized-routines/pl/math/cosh_2u.c index 5d1df0717453..2240a9c56f15 100644 --- a/contrib/arm-optimized-routines/pl/math/cosh_2u.c +++ b/contrib/arm-optimized-routines/pl/math/cosh_2u.c @@ -58,9 +58,6 @@ cosh (double x) PL_SIG (S, D, 1, cosh, -10.0, 10.0) PL_TEST_ULP (cosh, 1.43) -PL_TEST_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) -PL_TEST_INTERVAL (cosh, -0, -0x1.61da04cbafe44p+9, 100000) -PL_TEST_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) -PL_TEST_INTERVAL (cosh, -0x1.61da04cbafe44p+9, -0x1p10, 1000) -PL_TEST_INTERVAL (cosh, 0x1p10, inf, 100) -PL_TEST_INTERVAL (cosh, -0x1p10, -inf, 100) +PL_TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) +PL_TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) +PL_TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/coshf_1u9.c b/contrib/arm-optimized-routines/pl/math/coshf_1u9.c index c125c929aa77..cf737840e0d6 100644 --- a/contrib/arm-optimized-routines/pl/math/coshf_1u9.c +++ b/contrib/arm-optimized-routines/pl/math/coshf_1u9.c @@ -63,9 +63,6 @@ coshf (float x) PL_SIG (S, F, 1, cosh, -10.0, 10.0) PL_TEST_ULP (coshf, 1.89) -PL_TEST_INTERVAL (coshf, 0, 0x1p-63, 100) -PL_TEST_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) -PL_TEST_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) -PL_TEST_INTERVAL (coshf, -0, -0x1p-63, 100) -PL_TEST_INTERVAL (coshf, -0, -0x1.5a92d8p+6, 80000) -PL_TEST_INTERVAL (coshf, -0x1.5a92d8p+6, -inf, 2000) +PL_TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100) +PL_TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) +PL_TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) diff --git a/contrib/arm-optimized-routines/pl/math/cospi_3u1.c b/contrib/arm-optimized-routines/pl/math/cospi_3u1.c new file mode 100644 index 000000000000..4a688a076829 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cospi_3u1.c @@ -0,0 +1,89 @@ +/* + * Double-precision scalar cospi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_scalar_f64.h" + +/* Taylor series coefficients for sin(pi * x). + C2 coefficient (originally ~=5.16771278) has been split into two parts: + C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278) + This change in magnitude reduces floating point rounding errors. + C2_hi is then reintroduced after the polynomial approximation. */ +static const double poly[] + = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21, + -0x1.012a9870eeb7dp-25 }; + +#define Shift 0x1.8p+52 + +/* Approximation for scalar double-precision cospi(x).
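+ The input is reduced to r in [-1/2, 1/2] using cospi(x) = sinpi(0.5 - |x|), + and sin(pi * r) is then evaluated with the polynomial above.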
+ Maximum error: 3.13 ULP: + cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1 + want 0x1.fffffffffd16ep-1. */ +double +cospi (double x) +{ + if (isinf (x)) + return __math_invalid (x); + + double ax = asdouble (asuint64 (x) & ~0x8000000000000000); + + /* Edge cases for when cospi should be exactly 1. (Integers) + 0x1p53 is the limit for double precision to store any decimal places. */ + if (ax >= 0x1p53) + return 1; + + /* If x is an integer, return +- 1, based upon whether x is odd. */ + uint64_t m = (uint64_t) ax; + if (m == ax) + return (m & 1) ? -1 : 1; + + /* For very small inputs, squaring r causes underflow. + Values below this threshold can be approximated via + cospi(x) ~= 1. */ + if (ax < 0x1p-63) + return 1; + + /* Any non-integer values >= 0x1p51 will be int + 0.5. + These values should return exactly 0. */ + if (ax >= 0x1p51) + return 0; + + /* n = rint(|x|). */ + double n = ax + Shift; + uint64_t sign = asuint64 (n) << 63; + n = n - Shift; + + /* We know that cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + r = 0.5 - |x - rint(x)|. */ + double r = 0.5 - fabs (ax - n); + + /* y = sin(pi * r). */ + double r2 = r * r; + double y = horner_9_f64 (r2, poly); + y = y * r; + + /* Reintroduce C2_hi. */ + y = fma (-4 * r2, r, y); + + /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will always + be positive, therefore the sign must be introduced based upon whether x + rounds to odd or even. */ + return asdouble (asuint64 (y) ^ sign); +} + +PL_SIG (S, D, 1, cospi, -0.9, 0.9) +PL_TEST_ULP (cospi, 2.63) +PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000) +PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000) +PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000) +PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/cospif_2u6.c b/contrib/arm-optimized-routines/pl/math/cospif_2u6.c new file mode 100644 index 000000000000..d78a98ed8b2d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cospif_2u6.c @@ -0,0 +1,84 @@ +/* + * Single-precision scalar cospi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Taylor series coefficients for sin(pi * x). */ +#define C0 0x1.921fb6p1f +#define C1 -0x1.4abbcep2f +#define C2 0x1.466bc6p1f +#define C3 -0x1.32d2ccp-1f +#define C4 0x1.50783p-4f +#define C5 -0x1.e30750p-8f + +#define Shift 0x1.0p+23f + +/* Approximation for scalar single-precision cospi(x) (cospif). + Maximum error: 2.64 ULP: + cospif(0x1.37e844p-4) got 0x1.f16b3p-1 + want 0x1.f16b2ap-1. */ +float +cospif (float x) +{ + if (isinf (x)) + return __math_invalidf (x); + + float ax = asfloat (asuint (x) & ~0x80000000); + + /* Edge cases for when cospif should be exactly +/- 1. (Integers) + 0x1p23 is the limit for single precision to store any decimal places. */ + if (ax >= 0x1p24f) + return 1; + + uint32_t m = roundf (ax); + if (m == ax) + return (m & 1) ? -1 : 1; + + /* Any non-integer values >= 0x1p22f will be int + 0.5. + These values should return exactly 0. */ + if (ax >= 0x1p22f) + return 0; + + /* For very small inputs, squaring r causes underflow. + Values below this threshold can be approximated via cospi(x) ~= 1 - + (pi*x). */ + if (ax < 0x1p-31f) + return 1 - (C0 * x); + + /* n = rint(|x|).
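+ Adding and subtracting Shift (0x1.0p23f) rounds |x| to the nearest integer + in the default rounding mode; the low bit of the biased sum, moved into the + sign position below, records whether n is odd.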
*/ + float n = ax + Shift; + uint32_t sign = asuint (n) << 31; + n = n - Shift; + + /* We know that cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + r = 0.5 - |x - rint(x)|. */ + float r = 0.5f - fabs (ax - n); + + /* y = sin(pi * r). */ + float r2 = r * r; + float y = fmaf (C5, r2, C4); + y = fmaf (y, r2, C3); + y = fmaf (y, r2, C2); + y = fmaf (y, r2, C1); + y = fmaf (y, r2, C0); + + /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will always + be positive, therefore the sign must be introduced based upon whether x + rounds to odd or even. */ + return asfloat (asuint (y * r) ^ sign); +} + +PL_SIG (S, F, 1, cospi, -0.9, 0.9) +PL_TEST_ULP (cospif, 2.15) +PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000) +PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000) +PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/erf_2u5.c b/contrib/arm-optimized-routines/pl/math/erf_2u5.c new file mode 100644 index 000000000000..3ca2a1332c1f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erf_2u5.c @@ -0,0 +1,102 @@ +/* + * Double-precision erf(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 +#define Shift 0x1p45 + +/* Polynomial coefficients. */ +#define OneThird 0x1.5555555555555p-2 +#define TwoThird 0x1.5555555555555p-1 + +#define TwoOverFifteen 0x1.1111111111111p-3 +#define TwoOverFive 0x1.999999999999ap-2 +#define Tenth 0x1.999999999999ap-4 + +#define TwoOverNine 0x1.c71c71c71c71cp-3 +#define TwoOverFortyFive 0x1.6c16c16c16c17p-5 +#define Sixth 0x1.555555555555p-3 + +/* Fast erf approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + + scale * d * [ + + 1 + - r d + + 1/3 (2 r^2 - 1) d^2 + - 1/6 (r (2 r^2 - 3)) d^3 + + 1/30 (4 r^4 - 12 r^2 + 3) d^4 + - 1/90 (4 r^4 - 20 r^2 + 15) d^5 + ] + + Maximum measured error: 2.29 ULP + erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8 + want -0x1.20dd59132ebafp-8. */ +double +erf (double x) +{ + /* Get absolute value and sign. */ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & 0x7fffffffffffffff; + uint64_t sign = ix & ~0x7fffffffffffffff; + + /* |x| < 0x1p-508. Triggers exceptions. */ + if (unlikely (ia < 0x2030000000000000)) + return fma (TwoOverSqrtPiMinusOne, x, x); + + if (ia < 0x4017f80000000000) /* |x| < 6 - 1 / 128 = 5.9921875. */ + { + /* Set r to multiple of 1/128 nearest to |x|. */ + double a = asdouble (ia); + double z = a + Shift; + uint64_t i = asuint64 (z) - asuint64 (Shift); + double r = z - Shift; + /* Lookup erf(r) and scale(r) in table. + Set erf(r) to 0 and scale to 2/sqrt(pi) for |x| <= 0x1.cp-9. */ + double erfr = __erf_data.tab[i].erf; + double scale = __erf_data.tab[i].scale; + + /* erf(x) ~ erf(r) + scale * d * poly (d, r). */ + double d = a - r; + double r2 = r * r; + double d2 = d * d; + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5.
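+ The pi(r) here are the bracketed coefficients from the header comment; the + sum in d is evaluated below with a pairwise Horner scheme to shorten the + fma dependency chain.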
*/ + double p1 = -r; + double p2 = fma (TwoThird, r2, -OneThird); + double p3 = -r * fma (OneThird, r2, -0.5); + double p4 = fma (fma (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth); + double p5 + = -r * fma (fma (TwoOverFortyFive, r2, -TwoOverNine), r2, Sixth); + + double p34 = fma (p4, d, p3); + double p12 = fma (p2, d, p1); + double y = fma (p5, d2, p34); + y = fma (y, d2, p12); + + y = fma (fma (y, d2, d), scale, erfr); + return asdouble (asuint64 (y) | sign); + } + + /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */ + if (unlikely (ia >= 0x7ff0000000000000)) + return (1.0 - (double) (sign >> 62)) + 1.0 / x; + + /* Boring domain (|x| >= 6.0). */ + return asdouble (sign | asuint64 (1.0)); +} + +PL_SIG (S, D, 1, erf, -6.0, 6.0) +PL_TEST_ULP (erf, 1.79) +PL_TEST_SYM_INTERVAL (erf, 0, 5.9921875, 40000) +PL_TEST_SYM_INTERVAL (erf, 5.9921875, inf, 40000) +PL_TEST_SYM_INTERVAL (erf, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erf_data.c b/contrib/arm-optimized-routines/pl/math/erf_data.c new file mode 100644 index 000000000000..138e03578e77 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erf_data.c @@ -0,0 +1,788 @@ +/* + * Data for approximation of erf. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Lookup table used in erf. + For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = 6.0 (769 values): + - the first entry __erff_data.tab.erf contains the values of erf(r), + - the second entry __erff_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the + algorithm, since lookup is performed only for x >= 1/64-1/512. */ +const struct erf_data __erf_data = { + .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 }, + { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 }, + { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 }, + { 0x1.b137e0cf584dcp-6, 0x1.20b4d8bac36c1p+0 }, + { 0x1.20c5645dd2538p-5, 0x1.209546ad13ccfp+0 }, + { 0x1.68e5d3bbc9526p-5, 0x1.206cb4897b148p+0 }, + { 0x1.b0fafef135745p-5, 0x1.203b261cd0052p+0 }, + { 0x1.f902a77bd3821p-5, 0x1.2000a00ae3804p+0 }, + { 0x1.207d480e90658p-4, 0x1.1fbd27cdc72d3p+0 }, + { 0x1.44703e87e8593p-4, 0x1.1f70c3b4f2cc7p+0 }, + { 0x1.68591a1e83b5dp-4, 0x1.1f1b7ae44867fp+0 }, + { 0x1.8c36beb8a8d23p-4, 0x1.1ebd5552f795bp+0 }, + { 0x1.b0081148a873ap-4, 0x1.1e565bca400d4p+0 }, + { 0x1.d3cbf7e70a4b3p-4, 0x1.1de697e413d28p+0 }, + { 0x1.f78159ec8bb50p-4, 0x1.1d6e14099944ap+0 }, + { 0x1.0d939005f65e5p-3, 0x1.1cecdb718d61cp+0 }, + { 0x1.1f5e1a35c3b89p-3, 0x1.1c62fa1e869b6p+0 }, + { 0x1.311fc15f56d14p-3, 0x1.1bd07cdd189acp+0 }, + { 0x1.42d7fc2f64959p-3, 0x1.1b357141d95d5p+0 }, + { 0x1.548642321d7c6p-3, 0x1.1a91e5a748165p+0 }, + { 0x1.662a0bdf7a89fp-3, 0x1.19e5e92b964abp+0 }, + { 0x1.77c2d2a765f9ep-3, 0x1.19318bae53a04p+0 }, + { 0x1.895010fdbdbfdp-3, 0x1.1874ddcdfce24p+0 }, + { 0x1.9ad142662e14dp-3, 0x1.17aff0e56ec10p+0 }, + { 0x1.ac45e37fe2526p-3, 0x1.16e2d7093cd8cp+0 }, + { 0x1.bdad72110a648p-3, 0x1.160da304ed92fp+0 }, + { 0x1.cf076d1233237p-3, 0x1.153068581b781p+0 }, + { 0x1.e05354b96ff36p-3, 0x1.144b3b337c90cp+0 }, + { 0x1.f190aa85540e2p-3, 0x1.135e3075d076bp+0 }, + { 0x1.015f78a3dcf3dp-2, 0x1.12695da8b5bdep+0 }, + { 0x1.09eed6982b948p-2, 0x1.116cd8fd67618p+0 }, + { 0x1.127631eb8de32p-2, 0x1.1068b94962e5ep+0 }, + { 0x1.1af54e232d609p-2, 0x1.0f5d1602f7e41p+0 }, + { 0x1.236bef825d9a2p-2, 0x1.0e4a073dc1b91p+0 }, + { 0x1.2bd9db0f7827fp-2, 
0x1.0d2fa5a70c168p+0 }, + { 0x1.343ed6989b7d9p-2, 0x1.0c0e0a8223359p+0 }, + { 0x1.3c9aa8b84bedap-2, 0x1.0ae54fa490722p+0 }, + { 0x1.44ed18d9f6462p-2, 0x1.09b58f724416bp+0 }, + { 0x1.4d35ef3e5372ep-2, 0x1.087ee4d9ad247p+0 }, + { 0x1.5574f4ffac98ep-2, 0x1.07416b4fbfe7cp+0 }, + { 0x1.5da9f415ff23fp-2, 0x1.05fd3ecbec297p+0 }, + { 0x1.65d4b75b00471p-2, 0x1.04b27bc403d30p+0 }, + { 0x1.6df50a8dff772p-2, 0x1.03613f2812dafp+0 }, + { 0x1.760aba57a76bfp-2, 0x1.0209a65e29545p+0 }, + { 0x1.7e15944d9d3e4p-2, 0x1.00abcf3e187a9p+0 }, + { 0x1.861566f5fd3c0p-2, 0x1.fe8fb01a47307p-1 }, + { 0x1.8e0a01cab516bp-2, 0x1.fbbbbef34b4b2p-1 }, + { 0x1.95f3353cbb146p-2, 0x1.f8dc092d58ff8p-1 }, + { 0x1.9dd0d2b721f39p-2, 0x1.f5f0cdaf15313p-1 }, + { 0x1.a5a2aca209394p-2, 0x1.f2fa4c16c0019p-1 }, + { 0x1.ad68966569a87p-2, 0x1.eff8c4b1375dbp-1 }, + { 0x1.b522646bbda68p-2, 0x1.ecec7870ebca7p-1 }, + { 0x1.bccfec24855b8p-2, 0x1.e9d5a8e4c934ep-1 }, + { 0x1.c4710406a65fcp-2, 0x1.e6b4982f158b9p-1 }, + { 0x1.cc058392a6d2dp-2, 0x1.e38988fc46e72p-1 }, + { 0x1.d38d4354c3bd0p-2, 0x1.e054be79d3042p-1 }, + { 0x1.db081ce6e2a48p-2, 0x1.dd167c4cf9d2ap-1 }, + { 0x1.e275eaf25e458p-2, 0x1.d9cf06898cdafp-1 }, + { 0x1.e9d68931ae650p-2, 0x1.d67ea1a8b5368p-1 }, + { 0x1.f129d471eabb1p-2, 0x1.d325927fb9d89p-1 }, + { 0x1.f86faa9428f9dp-2, 0x1.cfc41e36c7df9p-1 }, + { 0x1.ffa7ea8eb5fd0p-2, 0x1.cc5a8a3fbea40p-1 }, + { 0x1.03693a371519cp-1, 0x1.c8e91c4d01368p-1 }, + { 0x1.06f794ab2cae7p-1, 0x1.c5701a484ef9dp-1 }, + { 0x1.0a7ef5c18edd2p-1, 0x1.c1efca49a5011p-1 }, + { 0x1.0dff4f247f6c6p-1, 0x1.be68728e29d5dp-1 }, + { 0x1.1178930ada115p-1, 0x1.bada596f25436p-1 }, + { 0x1.14eab43841b55p-1, 0x1.b745c55905bf8p-1 }, + { 0x1.1855a5fd3dd50p-1, 0x1.b3aafcc27502ep-1 }, + { 0x1.1bb95c3746199p-1, 0x1.b00a46237d5bep-1 }, + { 0x1.1f15cb50bc4dep-1, 0x1.ac63e7ecc1411p-1 }, + { 0x1.226ae840d4d70p-1, 0x1.a8b8287ec6a09p-1 }, + { 0x1.25b8a88b6dd7fp-1, 0x1.a5074e2157620p-1 }, + { 0x1.28ff0240d52cdp-1, 0x1.a1519efaf889ep-1 }, + { 0x1.2c3debfd7d6c1p-1, 0x1.9d97610879642p-1 }, + { 0x1.2f755ce9a21f4p-1, 0x1.99d8da149c13fp-1 }, + { 0x1.32a54cb8db67bp-1, 0x1.96164fafd8de3p-1 }, + { 0x1.35cdb3a9a144dp-1, 0x1.925007283d7aap-1 }, + { 0x1.38ee8a84beb71p-1, 0x1.8e86458169af8p-1 }, + { 0x1.3c07ca9cb4f9ep-1, 0x1.8ab94f6caa71dp-1 }, + { 0x1.3f196dcd0f135p-1, 0x1.86e9694134b9ep-1 }, + { 0x1.42236e79a5fa6p-1, 0x1.8316d6f48133dp-1 }, + { 0x1.4525c78dd5966p-1, 0x1.7f41dc12c9e89p-1 }, + { 0x1.4820747ba2dc2p-1, 0x1.7b6abbb7aaf19p-1 }, + { 0x1.4b13713ad3513p-1, 0x1.7791b886e7403p-1 }, + { 0x1.4dfeba47f63ccp-1, 0x1.73b714a552763p-1 }, + { 0x1.50e24ca35fd2cp-1, 0x1.6fdb11b1e0c34p-1 }, + { 0x1.53be25d016a4fp-1, 0x1.6bfdf0beddaf5p-1 }, + { 0x1.569243d2b3a9bp-1, 0x1.681ff24b4ab04p-1 }, + { 0x1.595ea53035283p-1, 0x1.6441563c665d4p-1 }, + { 0x1.5c2348ecc4dc3p-1, 0x1.60625bd75d07bp-1 }, + { 0x1.5ee02e8a71a53p-1, 0x1.5c8341bb23767p-1 }, + { 0x1.61955607dd15dp-1, 0x1.58a445da7c74cp-1 }, + { 0x1.6442bfdedd397p-1, 0x1.54c5a57629db0p-1 }, + { 0x1.66e86d0312e82p-1, 0x1.50e79d1749ac9p-1 }, + { 0x1.69865ee075011p-1, 0x1.4d0a6889dfd9fp-1 }, + { 0x1.6c1c9759d0e5fp-1, 0x1.492e42d78d2c5p-1 }, + { 0x1.6eab18c74091bp-1, 0x1.4553664273d24p-1 }, + { 0x1.7131e5f496a5ap-1, 0x1.417a0c4049fd0p-1 }, + { 0x1.73b1021fc0cb8p-1, 0x1.3da26d759aef5p-1 }, + { 0x1.762870f720c6fp-1, 0x1.39ccc1b136d5ap-1 }, + { 0x1.78983697dc96fp-1, 0x1.35f93fe7d1b3dp-1 }, + { 0x1.7b00578c26037p-1, 0x1.32281e2fd1a92p-1 }, + { 0x1.7d60d8c979f7bp-1, 0x1.2e5991bd4cbfcp-1 }, + { 0x1.7fb9bfaed8078p-1, 0x1.2a8dcede3673bp-1 }, + { 0x1.820b1202f27fbp-1, 
0x1.26c508f6bd0ffp-1 }, + { 0x1.8454d5f25760dp-1, 0x1.22ff727dd6f7bp-1 }, + { 0x1.8697120d92a4ap-1, 0x1.1f3d3cf9ffe5ap-1 }, + { 0x1.88d1cd474a2e0p-1, 0x1.1b7e98fe26217p-1 }, + { 0x1.8b050ef253c37p-1, 0x1.17c3b626c7a11p-1 }, + { 0x1.8d30debfc572ep-1, 0x1.140cc3173f007p-1 }, + { 0x1.8f5544bd00c04p-1, 0x1.1059ed7740313p-1 }, + { 0x1.91724951b8fc6p-1, 0x1.0cab61f084b93p-1 }, + { 0x1.9387f53df5238p-1, 0x1.09014c2ca74dap-1 }, + { 0x1.959651980da31p-1, 0x1.055bd6d32e8d7p-1 }, + { 0x1.979d67caa6631p-1, 0x1.01bb2b87c6968p-1 }, + { 0x1.999d4192a5715p-1, 0x1.fc3ee5d1524b0p-2 }, + { 0x1.9b95e8fd26abap-1, 0x1.f511a91a67d2ap-2 }, + { 0x1.9d8768656cc42p-1, 0x1.edeeee0959518p-2 }, + { 0x1.9f71ca72cffb6p-1, 0x1.e6d6ffaa65a25p-2 }, + { 0x1.a1551a16aaeafp-1, 0x1.dfca26f5bbf88p-2 }, + { 0x1.a331628a45b92p-1, 0x1.d8c8aace11e63p-2 }, + { 0x1.a506af4cc00f4p-1, 0x1.d1d2cfff91594p-2 }, + { 0x1.a6d50c20fa293p-1, 0x1.cae8d93f1d7b6p-2 }, + { 0x1.a89c850b7d54dp-1, 0x1.c40b0729ed547p-2 }, + { 0x1.aa5d265064366p-1, 0x1.bd3998457afdap-2 }, + { 0x1.ac16fc7143263p-1, 0x1.b674c8ffc6283p-2 }, + { 0x1.adca142b10f98p-1, 0x1.afbcd3afe8ab6p-2 }, + { 0x1.af767a741088bp-1, 0x1.a911f096fbc26p-2 }, + { 0x1.b11c3c79bb424p-1, 0x1.a27455e14c93cp-2 }, + { 0x1.b2bb679ead19cp-1, 0x1.9be437a7de946p-2 }, + { 0x1.b4540978921eep-1, 0x1.9561c7f23a47bp-2 }, + { 0x1.b5e62fce16095p-1, 0x1.8eed36b886d93p-2 }, + { 0x1.b771e894d602ep-1, 0x1.8886b1e5ecfd1p-2 }, + { 0x1.b8f741ef54f83p-1, 0x1.822e655b417e6p-2 }, + { 0x1.ba764a2af2b78p-1, 0x1.7be47af1f5d89p-2 }, + { 0x1.bbef0fbde6221p-1, 0x1.75a91a7f4d2edp-2 }, + { 0x1.bd61a1453ab44p-1, 0x1.6f7c69d7d3ef8p-2 }, + { 0x1.bece0d82d1a5cp-1, 0x1.695e8cd31867ep-2 }, + { 0x1.c034635b66e23p-1, 0x1.634fa54fa285fp-2 }, + { 0x1.c194b1d49a184p-1, 0x1.5d4fd33729015p-2 }, + { 0x1.c2ef0812fc1bdp-1, 0x1.575f3483021c3p-2 }, + { 0x1.c443755820d64p-1, 0x1.517de540ce2a3p-2 }, + { 0x1.c5920900b5fd1p-1, 0x1.4babff975a04cp-2 }, + { 0x1.c6dad2829ec62p-1, 0x1.45e99bcbb7915p-2 }, + { 0x1.c81de16b14cefp-1, 0x1.4036d0468a7a2p-2 }, + { 0x1.c95b455cce69dp-1, 0x1.3a93b1998736cp-2 }, + { 0x1.ca930e0e2a825p-1, 0x1.35005285227f1p-2 }, + { 0x1.cbc54b476248dp-1, 0x1.2f7cc3fe6f423p-2 }, + { 0x1.ccf20ce0c0d27p-1, 0x1.2a09153529381p-2 }, + { 0x1.ce1962c0e0d8bp-1, 0x1.24a55399ea239p-2 }, + { 0x1.cf3b5cdaf0c39p-1, 0x1.1f518ae487dc8p-2 }, + { 0x1.d0580b2cfd249p-1, 0x1.1a0dc51a9934dp-2 }, + { 0x1.d16f7dbe41ca0p-1, 0x1.14da0a961fd14p-2 }, + { 0x1.d281c49d818d0p-1, 0x1.0fb6620c550afp-2 }, + { 0x1.d38eefdf64fddp-1, 0x1.0aa2d09497f2bp-2 }, + { 0x1.d4970f9ce00d9p-1, 0x1.059f59af7a906p-2 }, + { 0x1.d59a33f19ed42p-1, 0x1.00abff4dec7a3p-2 }, + { 0x1.d6986cfa798e7p-1, 0x1.f79183b101c5bp-3 }, + { 0x1.d791cad3eff01p-1, 0x1.edeb406d9c824p-3 }, + { 0x1.d8865d98abe01p-1, 0x1.e4652fadcb6b2p-3 }, + { 0x1.d97635600bb89p-1, 0x1.daff4969c0b04p-3 }, + { 0x1.da61623cb41e0p-1, 0x1.d1b982c501370p-3 }, + { 0x1.db47f43b2980dp-1, 0x1.c893ce1dcbef7p-3 }, + { 0x1.dc29fb60715afp-1, 0x1.bf8e1b1ca2279p-3 }, + { 0x1.dd0787a8bb39dp-1, 0x1.b6a856c3ed54fp-3 }, + { 0x1.dde0a90611a0dp-1, 0x1.ade26b7fbed95p-3 }, + { 0x1.deb56f5f12d28p-1, 0x1.a53c4135a6526p-3 }, + { 0x1.df85ea8db188ep-1, 0x1.9cb5bd549b111p-3 }, + { 0x1.e0522a5dfda73p-1, 0x1.944ec2e4f5630p-3 }, + { 0x1.e11a3e8cf4eb8p-1, 0x1.8c07329874652p-3 }, + { 0x1.e1de36c75ba58p-1, 0x1.83deeada4d25ap-3 }, + { 0x1.e29e22a89d766p-1, 0x1.7bd5c7df3fe9cp-3 }, + { 0x1.e35a11b9b61cep-1, 0x1.73eba3b5b07b7p-3 }, + { 0x1.e4121370224ccp-1, 0x1.6c205655be71fp-3 }, + { 0x1.e4c6372cd8927p-1, 0x1.6473b5b15a7a1p-3 }, + { 0x1.e5768c3b4a3fcp-1, 
0x1.5ce595c455b0ap-3 }, + { 0x1.e62321d06c5e0p-1, 0x1.5575c8a468361p-3 }, + { 0x1.e6cc0709c8a0dp-1, 0x1.4e241e912c305p-3 }, + { 0x1.e7714aec96534p-1, 0x1.46f066040a832p-3 }, + { 0x1.e812fc64db369p-1, 0x1.3fda6bc016994p-3 }, + { 0x1.e8b12a44944a8p-1, 0x1.38e1fae1d6a9dp-3 }, + { 0x1.e94be342e6743p-1, 0x1.3206dceef5f87p-3 }, + { 0x1.e9e335fb56f87p-1, 0x1.2b48d9e5dea1cp-3 }, + { 0x1.ea7730ed0bbb9p-1, 0x1.24a7b84d38971p-3 }, + { 0x1.eb07e27a133aap-1, 0x1.1e233d434b813p-3 }, + { 0x1.eb9558e6b42cep-1, 0x1.17bb2c8d41535p-3 }, + { 0x1.ec1fa258c4beap-1, 0x1.116f48a6476ccp-3 }, + { 0x1.eca6ccd709544p-1, 0x1.0b3f52ce8c383p-3 }, + { 0x1.ed2ae6489ac1ep-1, 0x1.052b0b1a174eap-3 }, + { 0x1.edabfc7453e63p-1, 0x1.fe6460fef4680p-4 }, + { 0x1.ee2a1d004692cp-1, 0x1.f2a901ccafb37p-4 }, + { 0x1.eea5557137ae0p-1, 0x1.e723726b824a9p-4 }, + { 0x1.ef1db32a2277cp-1, 0x1.dbd32ac4c99b0p-4 }, + { 0x1.ef93436bc2daap-1, 0x1.d0b7a0f921e7cp-4 }, + { 0x1.f006135426b26p-1, 0x1.c5d0497c09e74p-4 }, + { 0x1.f0762fde45ee6p-1, 0x1.bb1c972f23e50p-4 }, + { 0x1.f0e3a5e1a1788p-1, 0x1.b09bfb7d11a83p-4 }, + { 0x1.f14e8211e8c55p-1, 0x1.a64de673e8837p-4 }, + { 0x1.f1b6d0fea5f4dp-1, 0x1.9c31c6df3b1b8p-4 }, + { 0x1.f21c9f12f0677p-1, 0x1.92470a61b6965p-4 }, + { 0x1.f27ff89525acfp-1, 0x1.888d1d8e510a3p-4 }, + { 0x1.f2e0e9a6a8b09p-1, 0x1.7f036c0107294p-4 }, + { 0x1.f33f7e43a706bp-1, 0x1.75a96077274bap-4 }, + { 0x1.f39bc242e43e6p-1, 0x1.6c7e64e7281cbp-4 }, + { 0x1.f3f5c1558b19ep-1, 0x1.6381e2980956bp-4 }, + { 0x1.f44d870704911p-1, 0x1.5ab342383d177p-4 }, + { 0x1.f4a31ebcd47dfp-1, 0x1.5211ebf41880bp-4 }, + { 0x1.f4f693b67bd77p-1, 0x1.499d478bca735p-4 }, + { 0x1.f547f10d60597p-1, 0x1.4154bc68d75c3p-4 }, + { 0x1.f59741b4b97cfp-1, 0x1.3937b1b319259p-4 }, + { 0x1.f5e4907982a07p-1, 0x1.31458e6542847p-4 }, + { 0x1.f62fe80272419p-1, 0x1.297db960e4f63p-4 }, + { 0x1.f67952cff6282p-1, 0x1.21df9981f8e53p-4 }, + { 0x1.f6c0db3c34641p-1, 0x1.1a6a95b1e786fp-4 }, + { 0x1.f7068b7b10fd9p-1, 0x1.131e14fa1625dp-4 }, + { 0x1.f74a6d9a38383p-1, 0x1.0bf97e95f2a64p-4 }, + { 0x1.f78c8b812d498p-1, 0x1.04fc3a0481321p-4 }, + { 0x1.f7cceef15d631p-1, 0x1.fc4b5e32d6259p-5 }, + { 0x1.f80ba18636f07p-1, 0x1.eeea8c1b1db93p-5 }, + { 0x1.f848acb544e95p-1, 0x1.e1d4cf1e2450ap-5 }, + { 0x1.f88419ce4e184p-1, 0x1.d508f9a1ea64ep-5 }, + { 0x1.f8bdf1fb78370p-1, 0x1.c885df3451a07p-5 }, + { 0x1.f8f63e416ebffp-1, 0x1.bc4a54a84e834p-5 }, + { 0x1.f92d077f8d56dp-1, 0x1.b055303221015p-5 }, + { 0x1.f96256700da8ep-1, 0x1.a4a549829587ep-5 }, + { 0x1.f99633a838a57p-1, 0x1.993979e14fffdp-5 }, + { 0x1.f9c8a7989af0dp-1, 0x1.8e109c4622913p-5 }, + { 0x1.f9f9ba8d3c733p-1, 0x1.83298d717210ep-5 }, + { 0x1.fa2974addae45p-1, 0x1.78832c03aa2b1p-5 }, + { 0x1.fa57ddfe27376p-1, 0x1.6e1c5893c380bp-5 }, + { 0x1.fa84fe5e05c8dp-1, 0x1.63f3f5c4de13bp-5 }, + { 0x1.fab0dd89d1309p-1, 0x1.5a08e85af27e0p-5 }, + { 0x1.fadb831a9f9c3p-1, 0x1.505a174e9c929p-5 }, + { 0x1.fb04f6868a944p-1, 0x1.46e66be002240p-5 }, + { 0x1.fb2d3f20f9101p-1, 0x1.3dacd1a8d8ccdp-5 }, + { 0x1.fb54641aebbc9p-1, 0x1.34ac36ad8dafep-5 }, + { 0x1.fb7a6c834b5a2p-1, 0x1.2be38b6d92415p-5 }, + { 0x1.fb9f5f4739170p-1, 0x1.2351c2f2d1449p-5 }, + { 0x1.fbc3433260ca5p-1, 0x1.1af5d2e04f3f6p-5 }, + { 0x1.fbe61eef4cf6ap-1, 0x1.12ceb37ff9bc3p-5 }, + { 0x1.fc07f907bc794p-1, 0x1.0adb5fcfa8c75p-5 }, + { 0x1.fc28d7e4f9cd0p-1, 0x1.031ad58d56279p-5 }, + { 0x1.fc48c1d033c7ap-1, 0x1.f7182a851bca2p-6 }, + { 0x1.fc67bcf2d7b8fp-1, 0x1.e85c449e377f2p-6 }, + { 0x1.fc85cf56ecd38p-1, 0x1.da0005e5f28dfp-6 }, + { 0x1.fca2fee770c79p-1, 0x1.cc0180af00a8bp-6 }, + { 0x1.fcbf5170b578bp-1, 
0x1.be5ecd2fcb5f9p-6 }, + { 0x1.fcdacca0bfb73p-1, 0x1.b1160991ff737p-6 }, + { 0x1.fcf57607a6e7cp-1, 0x1.a4255a00b9f03p-6 }, + { 0x1.fd0f5317f582fp-1, 0x1.978ae8b55ce1bp-6 }, + { 0x1.fd2869270a56fp-1, 0x1.8b44e6031383ep-6 }, + { 0x1.fd40bd6d7a785p-1, 0x1.7f5188610ddc8p-6 }, + { 0x1.fd58550773cb5p-1, 0x1.73af0c737bb45p-6 }, + { 0x1.fd6f34f52013ap-1, 0x1.685bb5134ef13p-6 }, + { 0x1.fd85621b0876dp-1, 0x1.5d55cb54cd53ap-6 }, + { 0x1.fd9ae142795e3p-1, 0x1.529b9e8cf9a1ep-6 }, + { 0x1.fdafb719e6a69p-1, 0x1.482b8455dc491p-6 }, + { 0x1.fdc3e835500b3p-1, 0x1.3e03d891b37dep-6 }, + { 0x1.fdd7790ea5bc0p-1, 0x1.3422fd6d12e2bp-6 }, + { 0x1.fdea6e062d0c9p-1, 0x1.2a875b5ffab56p-6 }, + { 0x1.fdfccb62e52d3p-1, 0x1.212f612dee7fbp-6 }, + { 0x1.fe0e9552ebdd6p-1, 0x1.181983e5133ddp-6 }, + { 0x1.fe1fcfebe2083p-1, 0x1.0f443edc5ce49p-6 }, + { 0x1.fe307f2b503d0p-1, 0x1.06ae13b0d3255p-6 }, + { 0x1.fe40a6f70af4bp-1, 0x1.fcab1483ea7fcp-7 }, + { 0x1.fe504b1d9696cp-1, 0x1.ec72615a894c4p-7 }, + { 0x1.fe5f6f568b301p-1, 0x1.dcaf3691fc448p-7 }, + { 0x1.fe6e1742f7cf6p-1, 0x1.cd5ec93c12431p-7 }, + { 0x1.fe7c466dc57a1p-1, 0x1.be7e5ac24963bp-7 }, + { 0x1.fe8a004c19ae6p-1, 0x1.b00b38d6b3575p-7 }, + { 0x1.fe97483db8670p-1, 0x1.a202bd6372dcep-7 }, + { 0x1.fea4218d6594ap-1, 0x1.94624e78e0fafp-7 }, + { 0x1.feb08f7146046p-1, 0x1.87275e3a6869dp-7 }, + { 0x1.febc950b3fa75p-1, 0x1.7a4f6aca256cbp-7 }, + { 0x1.fec835695932ep-1, 0x1.6dd7fe3358230p-7 }, + { 0x1.fed37386190fbp-1, 0x1.61beae53b72b7p-7 }, + { 0x1.fede5248e38f4p-1, 0x1.56011cc3b036dp-7 }, + { 0x1.fee8d486585eep-1, 0x1.4a9cf6bda3f4cp-7 }, + { 0x1.fef2fd00af31ap-1, 0x1.3f8ff5042a88ep-7 }, + { 0x1.fefcce6813974p-1, 0x1.34d7dbc76d7e5p-7 }, + { 0x1.ff064b5afffbep-1, 0x1.2a727a89a3f14p-7 }, + { 0x1.ff0f766697c76p-1, 0x1.205dac02bd6b9p-7 }, + { 0x1.ff18520700971p-1, 0x1.1697560347b25p-7 }, + { 0x1.ff20e0a7ba8c2p-1, 0x1.0d1d69569b82dp-7 }, + { 0x1.ff2924a3f7a83p-1, 0x1.03ede1a45bfeep-7 }, + { 0x1.ff312046f2339p-1, 0x1.f60d8aa2a88f2p-8 }, + { 0x1.ff38d5cc4227fp-1, 0x1.e4cc4abf7d065p-8 }, + { 0x1.ff404760319b4p-1, 0x1.d4143a9dfe965p-8 }, + { 0x1.ff47772010262p-1, 0x1.c3e1a5f5c077cp-8 }, + { 0x1.ff4e671a85425p-1, 0x1.b430ecf4a83a8p-8 }, + { 0x1.ff55194fe19dfp-1, 0x1.a4fe83fb9db25p-8 }, + { 0x1.ff5b8fb26f5f6p-1, 0x1.9646f35a76623p-8 }, + { 0x1.ff61cc26c1578p-1, 0x1.8806d70b2fc36p-8 }, + { 0x1.ff67d08401202p-1, 0x1.7a3ade6c8b3e4p-8 }, + { 0x1.ff6d9e943c231p-1, 0x1.6cdfcbfc1e263p-8 }, + { 0x1.ff733814af88cp-1, 0x1.5ff2750fe7820p-8 }, + { 0x1.ff789eb6130c9p-1, 0x1.536fc18f7ce5cp-8 }, + { 0x1.ff7dd41ce2b4dp-1, 0x1.4754abacdf1dcp-8 }, + { 0x1.ff82d9e1a76d8p-1, 0x1.3b9e3f9d06e3fp-8 }, + { 0x1.ff87b1913e853p-1, 0x1.30499b503957fp-8 }, + { 0x1.ff8c5cad200a5p-1, 0x1.2553ee2a336bfp-8 }, + { 0x1.ff90dcaba4096p-1, 0x1.1aba78ba3af89p-8 }, + { 0x1.ff9532f846ab0p-1, 0x1.107a8c7323a6ep-8 }, + { 0x1.ff9960f3eb327p-1, 0x1.06918b6355624p-8 }, + { 0x1.ff9d67f51ddbap-1, 0x1.f9f9cfd9c3035p-9 }, + { 0x1.ffa14948549a7p-1, 0x1.e77448fb66bb9p-9 }, + { 0x1.ffa506302ebaep-1, 0x1.d58da68fd1170p-9 }, + { 0x1.ffa89fe5b3625p-1, 0x1.c4412bf4b8f0bp-9 }, + { 0x1.ffac17988ef4bp-1, 0x1.b38a3af2e55b4p-9 }, + { 0x1.ffaf6e6f4f5c0p-1, 0x1.a3645330550ffp-9 }, + { 0x1.ffb2a5879f35ep-1, 0x1.93cb11a30d765p-9 }, + { 0x1.ffb5bdf67fe6fp-1, 0x1.84ba3004a50d0p-9 }, + { 0x1.ffb8b8c88295fp-1, 0x1.762d84469c18fp-9 }, + { 0x1.ffbb970200110p-1, 0x1.6821000795a03p-9 }, + { 0x1.ffbe599f4f9d9p-1, 0x1.5a90b00981d93p-9 }, + { 0x1.ffc10194fcb64p-1, 0x1.4d78bba8ca5fdp-9 }, + { 0x1.ffc38fcffbb7cp-1, 0x1.40d564548fad7p-9 }, + { 0x1.ffc60535dd7f5p-1, 
0x1.34a305080681fp-9 }, + { 0x1.ffc862a501fd7p-1, 0x1.28de11c5031ebp-9 }, + { 0x1.ffcaa8f4c9beap-1, 0x1.1d83170fbf6fbp-9 }, + { 0x1.ffccd8f5c66d1p-1, 0x1.128eb96be8798p-9 }, + { 0x1.ffcef371ea4d7p-1, 0x1.07fdb4dafea5fp-9 }, + { 0x1.ffd0f92cb6ba7p-1, 0x1.fb99b8b8279e1p-10 }, + { 0x1.ffd2eae369a07p-1, 0x1.e7f232d9e2630p-10 }, + { 0x1.ffd4c94d29fdbp-1, 0x1.d4fed7195d7e8p-10 }, + { 0x1.ffd6951b33686p-1, 0x1.c2b9cf7f893bfp-10 }, + { 0x1.ffd84ef9009eep-1, 0x1.b11d702b3deb1p-10 }, + { 0x1.ffd9f78c7524ap-1, 0x1.a024365f771bdp-10 }, + { 0x1.ffdb8f7605ee7p-1, 0x1.8fc8c794b03b5p-10 }, + { 0x1.ffdd1750e1220p-1, 0x1.8005f08d6f1efp-10 }, + { 0x1.ffde8fb314ebfp-1, 0x1.70d6a46e07ddap-10 }, + { 0x1.ffdff92db56e5p-1, 0x1.6235fbd7a4345p-10 }, + { 0x1.ffe1544d01ccbp-1, 0x1.541f340697987p-10 }, + { 0x1.ffe2a1988857cp-1, 0x1.468dadf4080abp-10 }, + { 0x1.ffe3e19349dc7p-1, 0x1.397ced7af2b15p-10 }, + { 0x1.ffe514bbdc197p-1, 0x1.2ce898809244ep-10 }, + { 0x1.ffe63b8c8b5f7p-1, 0x1.20cc76202c5fap-10 }, + { 0x1.ffe7567b7b5e1p-1, 0x1.15246dda49d47p-10 }, + { 0x1.ffe865fac722bp-1, 0x1.09ec86c75d497p-10 }, + { 0x1.ffe96a78a04a9p-1, 0x1.fe41cd9bb4eeep-11 }, + { 0x1.ffea645f6d6dap-1, 0x1.e97ba3b77f306p-11 }, + { 0x1.ffeb5415e7c44p-1, 0x1.d57f524723822p-11 }, + { 0x1.ffec39ff380b9p-1, 0x1.c245d4b998479p-11 }, + { 0x1.ffed167b12ac2p-1, 0x1.afc85e0f82e12p-11 }, + { 0x1.ffede9e5d3262p-1, 0x1.9e005769dbc1dp-11 }, + { 0x1.ffeeb49896c6dp-1, 0x1.8ce75e9f6f8a0p-11 }, + { 0x1.ffef76e956a9fp-1, 0x1.7c7744d9378f7p-11 }, + { 0x1.fff0312b010b5p-1, 0x1.6caa0d3582fe9p-11 }, + { 0x1.fff0e3ad91ec2p-1, 0x1.5d79eb71e893bp-11 }, + { 0x1.fff18ebe2b0e1p-1, 0x1.4ee1429bf7cc0p-11 }, + { 0x1.fff232a72b48ep-1, 0x1.40daa3c89f5b6p-11 }, + { 0x1.fff2cfb0453d9p-1, 0x1.3360ccd23db3ap-11 }, + { 0x1.fff3661e9569dp-1, 0x1.266ea71d4f71ap-11 }, + { 0x1.fff3f634b79f9p-1, 0x1.19ff4663ae9dfp-11 }, + { 0x1.fff48032dbe40p-1, 0x1.0e0de78654d1ep-11 }, + { 0x1.fff50456dab8cp-1, 0x1.0295ef6591848p-11 }, + { 0x1.fff582dc48d30p-1, 0x1.ef25d37f49fe1p-12 }, + { 0x1.fff5fbfc8a439p-1, 0x1.da01102b5f851p-12 }, + { 0x1.fff66feee5129p-1, 0x1.c5b5412dcafadp-12 }, + { 0x1.fff6dee89352ep-1, 0x1.b23a5a23e4210p-12 }, + { 0x1.fff7491cd4af6p-1, 0x1.9f8893d8fd1c1p-12 }, + { 0x1.fff7aebcff755p-1, 0x1.8d986a4187285p-12 }, + { 0x1.fff80ff8911fdp-1, 0x1.7c629a822bc9ep-12 }, + { 0x1.fff86cfd3e657p-1, 0x1.6be02102b3520p-12 }, + { 0x1.fff8c5f702ccfp-1, 0x1.5c0a378c90bcap-12 }, + { 0x1.fff91b102fca8p-1, 0x1.4cda5374ea275p-12 }, + { 0x1.fff96c717b695p-1, 0x1.3e4a23d1f4702p-12 }, + { 0x1.fff9ba420e834p-1, 0x1.30538fbb77ecdp-12 }, + { 0x1.fffa04a7928b1p-1, 0x1.22f0b496539bdp-12 }, + { 0x1.fffa4bc63ee9ap-1, 0x1.161be46ad3b50p-12 }, + { 0x1.fffa8fc0e5f33p-1, 0x1.09cfa445b00ffp-12 }, + { 0x1.fffad0b901755p-1, 0x1.fc0d55470cf51p-13 }, + { 0x1.fffb0ecebee1bp-1, 0x1.e577bbcd49935p-13 }, + { 0x1.fffb4a210b172p-1, 0x1.cfd4a5adec5bfp-13 }, + { 0x1.fffb82cd9dcbfp-1, 0x1.bb1a9657ce465p-13 }, + { 0x1.fffbb8f1049c6p-1, 0x1.a740684026555p-13 }, + { 0x1.fffbeca6adbe9p-1, 0x1.943d4a1d1ed39p-13 }, + { 0x1.fffc1e08f25f5p-1, 0x1.8208bc334a6a5p-13 }, + { 0x1.fffc4d3120aa1p-1, 0x1.709a8db59f25cp-13 }, + { 0x1.fffc7a37857d2p-1, 0x1.5feada379d8b7p-13 }, + { 0x1.fffca53375ce3p-1, 0x1.4ff207314a102p-13 }, + { 0x1.fffcce3b57bffp-1, 0x1.40a8c1949f75ep-13 }, + { 0x1.fffcf564ab6b7p-1, 0x1.3207fb7420eb9p-13 }, + { 0x1.fffd1ac4135f9p-1, 0x1.2408e9ba3327fp-13 }, + { 0x1.fffd3e6d5cd87p-1, 0x1.16a501f0e42cap-13 }, + { 0x1.fffd607387b07p-1, 0x1.09d5f819c9e29p-13 }, + { 0x1.fffd80e8ce0dap-1, 0x1.fb2b792b40a22p-14 }, + { 
0x1.fffd9fdeabccep-1, 0x1.e3bcf436a1a95p-14 }, + { 0x1.fffdbd65e5ad0p-1, 0x1.cd55277c18d05p-14 }, + { 0x1.fffdd98e903b2p-1, 0x1.b7e94604479dcp-14 }, + { 0x1.fffdf46816833p-1, 0x1.a36eec00926ddp-14 }, + { 0x1.fffe0e0140857p-1, 0x1.8fdc1b2dcf7b9p-14 }, + { 0x1.fffe26683972ap-1, 0x1.7d2737527c3f9p-14 }, + { 0x1.fffe3daa95b18p-1, 0x1.6b4702d7d5849p-14 }, + { 0x1.fffe53d558ae9p-1, 0x1.5a329b7d30748p-14 }, + { 0x1.fffe68f4fa777p-1, 0x1.49e17724f4d41p-14 }, + { 0x1.fffe7d156d244p-1, 0x1.3a4b60ba9aa4dp-14 }, + { 0x1.fffe904222101p-1, 0x1.2b6875310f785p-14 }, + { 0x1.fffea2860ee1ep-1, 0x1.1d312098e9dbap-14 }, + { 0x1.fffeb3ebb267bp-1, 0x1.0f9e1b4dd36dfp-14 }, + { 0x1.fffec47d19457p-1, 0x1.02a8673a94691p-14 }, + { 0x1.fffed443e2787p-1, 0x1.ec929a665b449p-15 }, + { 0x1.fffee34943b15p-1, 0x1.d4f4b4c8e09edp-15 }, + { 0x1.fffef1960d85dp-1, 0x1.be6abbb10a5aap-15 }, + { 0x1.fffeff32af7afp-1, 0x1.a8e8cc1fadef6p-15 }, + { 0x1.ffff0c273bea2p-1, 0x1.94637d5bacfdbp-15 }, + { 0x1.ffff187b6bc0ep-1, 0x1.80cfdc72220cfp-15 }, + { 0x1.ffff2436a21dcp-1, 0x1.6e2367dc27f95p-15 }, + { 0x1.ffff2f5fefcaap-1, 0x1.5c540b4936fd2p-15 }, + { 0x1.ffff39fe16963p-1, 0x1.4b581b8d170fcp-15 }, + { 0x1.ffff44178c8d2p-1, 0x1.3b2652b06c2b2p-15 }, + { 0x1.ffff4db27f146p-1, 0x1.2bb5cc22e5db6p-15 }, + { 0x1.ffff56d4d5e5ep-1, 0x1.1cfe010e2052dp-15 }, + { 0x1.ffff5f8435efcp-1, 0x1.0ef6c4c84a0fep-15 }, + { 0x1.ffff67c604180p-1, 0x1.01984165a5f36p-15 }, + { 0x1.ffff6f9f67e55p-1, 0x1.e9b5e8d00ce76p-16 }, + { 0x1.ffff77154e0d6p-1, 0x1.d16f5716c6c1ap-16 }, + { 0x1.ffff7e2c6aea2p-1, 0x1.ba4f035d60e02p-16 }, + { 0x1.ffff84e93cd75p-1, 0x1.a447b7b03f045p-16 }, + { 0x1.ffff8b500e77cp-1, 0x1.8f4ccca7fc90dp-16 }, + { 0x1.ffff9164f8e46p-1, 0x1.7b5223dac7336p-16 }, + { 0x1.ffff972be5c59p-1, 0x1.684c227fcacefp-16 }, + { 0x1.ffff9ca891572p-1, 0x1.562fac4329b48p-16 }, + { 0x1.ffffa1de8c582p-1, 0x1.44f21e49054f2p-16 }, + { 0x1.ffffa6d13de73p-1, 0x1.34894a5e24657p-16 }, + { 0x1.ffffab83e54b8p-1, 0x1.24eb7254ccf83p-16 }, + { 0x1.ffffaff99bac4p-1, 0x1.160f438c70913p-16 }, + { 0x1.ffffb43555b5fp-1, 0x1.07ebd2a2d2844p-16 }, + { 0x1.ffffb839e52f3p-1, 0x1.f4f12e9ab070ap-17 }, + { 0x1.ffffbc09fa7cdp-1, 0x1.db5ad0b27805cp-17 }, + { 0x1.ffffbfa82616bp-1, 0x1.c304efa2c6f4ep-17 }, + { 0x1.ffffc316d9ed0p-1, 0x1.abe09e9144b5ep-17 }, + { 0x1.ffffc6586abf6p-1, 0x1.95df988e76644p-17 }, + { 0x1.ffffc96f1165ep-1, 0x1.80f439b4ee04bp-17 }, + { 0x1.ffffcc5cec0c1p-1, 0x1.6d11788a69c64p-17 }, + { 0x1.ffffcf23ff5fcp-1, 0x1.5a2adfa0b4bc4p-17 }, + { 0x1.ffffd1c637b2bp-1, 0x1.4834877429b8fp-17 }, + { 0x1.ffffd4456a10dp-1, 0x1.37231085c7d9ap-17 }, + { 0x1.ffffd6a3554a1p-1, 0x1.26eb9daed6f7ep-17 }, + { 0x1.ffffd8e1a2f22p-1, 0x1.1783ceac28910p-17 }, + { 0x1.ffffdb01e8546p-1, 0x1.08e1badf0fcedp-17 }, + { 0x1.ffffdd05a75eap-1, 0x1.f5f7d88472604p-18 }, + { 0x1.ffffdeee4f810p-1, 0x1.db92b5212fb8dp-18 }, + { 0x1.ffffe0bd3e852p-1, 0x1.c282cd3957edap-18 }, + { 0x1.ffffe273c15b7p-1, 0x1.aab7abace48dcp-18 }, + { 0x1.ffffe41314e06p-1, 0x1.94219bfcb4928p-18 }, + { 0x1.ffffe59c6698bp-1, 0x1.7eb1a2075864dp-18 }, + { 0x1.ffffe710d565ep-1, 0x1.6a597219a93d9p-18 }, + { 0x1.ffffe8717232dp-1, 0x1.570b69502f313p-18 }, + { 0x1.ffffe9bf4098cp-1, 0x1.44ba864670882p-18 }, + { 0x1.ffffeafb377d5p-1, 0x1.335a62115bce2p-18 }, + { 0x1.ffffec2641a9ep-1, 0x1.22df298214423p-18 }, + { 0x1.ffffed413e5b7p-1, 0x1.133d96ae7e0ddp-18 }, + { 0x1.ffffee4d01cd6p-1, 0x1.046aeabcfcdecp-18 }, + { 0x1.ffffef4a55bd4p-1, 0x1.ecb9cfe1d8642p-19 }, + { 0x1.fffff039f9e8fp-1, 0x1.d21397ead99cbp-19 }, + { 0x1.fffff11ca4876p-1, 
0x1.b8d094c86d374p-19 }, + { 0x1.fffff1f302bc1p-1, 0x1.a0df0f0c626dcp-19 }, + { 0x1.fffff2bdb904dp-1, 0x1.8a2e269750a39p-19 }, + { 0x1.fffff37d63a36p-1, 0x1.74adc8f4064d3p-19 }, + { 0x1.fffff43297019p-1, 0x1.604ea819f007cp-19 }, + { 0x1.fffff4dde0118p-1, 0x1.4d0231928c6f9p-19 }, + { 0x1.fffff57fc4a95p-1, 0x1.3aba85fe22e1fp-19 }, + { 0x1.fffff618c3da6p-1, 0x1.296a70f414053p-19 }, + { 0x1.fffff6a956450p-1, 0x1.1905613b3abf2p-19 }, + { 0x1.fffff731ee681p-1, 0x1.097f6156f32c5p-19 }, + { 0x1.fffff7b2f8ed6p-1, 0x1.f59a20caf6695p-20 }, + { 0x1.fffff82cdcf1bp-1, 0x1.d9c73698fb1dcp-20 }, + { 0x1.fffff89ffc4aap-1, 0x1.bf716c6168baep-20 }, + { 0x1.fffff90cb3c81p-1, 0x1.a6852c6b58392p-20 }, + { 0x1.fffff9735b73bp-1, 0x1.8eefd70594a88p-20 }, + { 0x1.fffff9d446cccp-1, 0x1.789fb715aae95p-20 }, + { 0x1.fffffa2fc5015p-1, 0x1.6383f726a8e04p-20 }, + { 0x1.fffffa8621251p-1, 0x1.4f8c96f26a26ap-20 }, + { 0x1.fffffad7a2652p-1, 0x1.3caa61607f920p-20 }, + { 0x1.fffffb248c39dp-1, 0x1.2acee2f5ecdb8p-20 }, + { 0x1.fffffb6d1e95dp-1, 0x1.19ec60b1242edp-20 }, + { 0x1.fffffbb196132p-1, 0x1.09f5cf4dd2877p-20 }, + { 0x1.fffffbf22c1e2p-1, 0x1.f5bd95d8730d8p-21 }, + { 0x1.fffffc2f171e3p-1, 0x1.d9371e2ff7c35p-21 }, + { 0x1.fffffc688a9cfp-1, 0x1.be41de54d155ap-21 }, + { 0x1.fffffc9eb76acp-1, 0x1.a4c89e08ef4f3p-21 }, + { 0x1.fffffcd1cbc28p-1, 0x1.8cb738399b12cp-21 }, + { 0x1.fffffd01f36afp-1, 0x1.75fa8dbc84becp-21 }, + { 0x1.fffffd2f57d68p-1, 0x1.608078a70dcbcp-21 }, + { 0x1.fffffd5a2041fp-1, 0x1.4c37c0394d094p-21 }, + { 0x1.fffffd8271d12p-1, 0x1.39100d5687bfep-21 }, + { 0x1.fffffda86faa9p-1, 0x1.26f9df8519bd6p-21 }, + { 0x1.fffffdcc3b117p-1, 0x1.15e6827001f18p-21 }, + { 0x1.fffffdedf37edp-1, 0x1.05c803e4831c1p-21 }, + { 0x1.fffffe0db6b91p-1, 0x1.ed22548cffd35p-22 }, + { 0x1.fffffe2ba0ea5p-1, 0x1.d06ad6ecdf971p-22 }, + { 0x1.fffffe47ccb60p-1, 0x1.b551c847fbc96p-22 }, + { 0x1.fffffe62534d4p-1, 0x1.9bc09f112b494p-22 }, + { 0x1.fffffe7b4c81ep-1, 0x1.83a1ff0aa239dp-22 }, + { 0x1.fffffe92ced93p-1, 0x1.6ce1aa3fd7bddp-22 }, + { 0x1.fffffea8ef9cfp-1, 0x1.576c72b514859p-22 }, + { 0x1.fffffebdc2ec6p-1, 0x1.43302cc4a0da8p-22 }, + { 0x1.fffffed15bcbap-1, 0x1.301ba221dc9bbp-22 }, + { 0x1.fffffee3cc32cp-1, 0x1.1e1e857adc568p-22 }, + { 0x1.fffffef5251c2p-1, 0x1.0d2966b1746f7p-22 }, + { 0x1.ffffff0576917p-1, 0x1.fa5b4f49cc6b2p-23 }, + { 0x1.ffffff14cfb92p-1, 0x1.dc3ae30b55c16p-23 }, + { 0x1.ffffff233ee1dp-1, 0x1.bfd7555a3bd68p-23 }, + { 0x1.ffffff30d18e8p-1, 0x1.a517d9e61628ap-23 }, + { 0x1.ffffff3d9480fp-1, 0x1.8be4f8f6c951fp-23 }, + { 0x1.ffffff4993c46p-1, 0x1.74287ded49339p-23 }, + { 0x1.ffffff54dab72p-1, 0x1.5dcd669f2cd34p-23 }, + { 0x1.ffffff5f74141p-1, 0x1.48bfd38302870p-23 }, + { 0x1.ffffff6969fb8p-1, 0x1.34ecf8a3c124ap-23 }, + { 0x1.ffffff72c5fb6p-1, 0x1.22430f521cbcfp-23 }, + { 0x1.ffffff7b91176p-1, 0x1.10b1488aeb235p-23 }, + { 0x1.ffffff83d3d07p-1, 0x1.0027c00a263a6p-23 }, + { 0x1.ffffff8b962bep-1, 0x1.e12ee004efc37p-24 }, + { 0x1.ffffff92dfba2p-1, 0x1.c3e44ae32b16bp-24 }, + { 0x1.ffffff99b79d2p-1, 0x1.a854ea14102a8p-24 }, + { 0x1.ffffffa0248e8p-1, 0x1.8e6761569f45dp-24 }, + { 0x1.ffffffa62ce54p-1, 0x1.7603bac345f65p-24 }, + { 0x1.ffffffabd69b4p-1, 0x1.5f1353cdad001p-24 }, + { 0x1.ffffffb127525p-1, 0x1.4980cb3c80949p-24 }, + { 0x1.ffffffb624592p-1, 0x1.3537f00b6ad4dp-24 }, + { 0x1.ffffffbad2affp-1, 0x1.2225b12bffc68p-24 }, + { 0x1.ffffffbf370cdp-1, 0x1.10380e1adb7e9p-24 }, + { 0x1.ffffffc355dfdp-1, 0x1.febc107d5efaap-25 }, + { 0x1.ffffffc733572p-1, 0x1.df0f2a0ee6946p-25 }, + { 0x1.ffffffcad3626p-1, 0x1.c14b2188bcee4p-25 }, + { 
0x1.ffffffce39b67p-1, 0x1.a553644f7f07dp-25 }, + { 0x1.ffffffd169d0cp-1, 0x1.8b0cfce0579dfp-25 }, + { 0x1.ffffffd466fa5p-1, 0x1.725e7c5dd20f7p-25 }, + { 0x1.ffffffd7344aap-1, 0x1.5b2fe547a1340p-25 }, + { 0x1.ffffffd9d4aabp-1, 0x1.456a974e92e93p-25 }, + { 0x1.ffffffdc4ad7ap-1, 0x1.30f93c3699078p-25 }, + { 0x1.ffffffde9964ep-1, 0x1.1dc7b5b978cf8p-25 }, + { 0x1.ffffffe0c2bf0p-1, 0x1.0bc30c5d52f15p-25 }, + { 0x1.ffffffe2c92dbp-1, 0x1.f5b2be65a0c7fp-26 }, + { 0x1.ffffffe4aed5ep-1, 0x1.d5f3a8dea7357p-26 }, + { 0x1.ffffffe675bbdp-1, 0x1.b82915b03515bp-26 }, + { 0x1.ffffffe81fc4ep-1, 0x1.9c3517e789488p-26 }, + { 0x1.ffffffe9aeb97p-1, 0x1.81fb7df06136ep-26 }, + { 0x1.ffffffeb24467p-1, 0x1.6961b8d641d06p-26 }, + { 0x1.ffffffec81ff2p-1, 0x1.524ec4d916caep-26 }, + { 0x1.ffffffedc95e7p-1, 0x1.3cab1343d18d1p-26 }, + { 0x1.ffffffeefbc85p-1, 0x1.2860757487a01p-26 }, + { 0x1.fffffff01a8b6p-1, 0x1.155a09065d4f7p-26 }, + { 0x1.fffffff126e1ep-1, 0x1.0384250e4c9fcp-26 }, + { 0x1.fffffff221f30p-1, 0x1.e59890b926c78p-27 }, + { 0x1.fffffff30cd3fp-1, 0x1.c642116a8a9e3p-27 }, + { 0x1.fffffff3e8892p-1, 0x1.a8e405e651ab6p-27 }, + { 0x1.fffffff4b606fp-1, 0x1.8d5f98114f872p-27 }, + { 0x1.fffffff57632dp-1, 0x1.7397c5a66e307p-27 }, + { 0x1.fffffff629e44p-1, 0x1.5b71456c5a4c4p-27 }, + { 0x1.fffffff6d1e56p-1, 0x1.44d26de513197p-27 }, + { 0x1.fffffff76ef3fp-1, 0x1.2fa31d6371537p-27 }, + { 0x1.fffffff801c1fp-1, 0x1.1bcca373b7b43p-27 }, + { 0x1.fffffff88af67p-1, 0x1.0939ab853339fp-27 }, + { 0x1.fffffff90b2e3p-1, 0x1.efac5187b2863p-28 }, + { 0x1.fffffff982fc1p-1, 0x1.cf1e86235d0e6p-28 }, + { 0x1.fffffff9f2e9fp-1, 0x1.b0a68a2128babp-28 }, + { 0x1.fffffffa5b790p-1, 0x1.9423165bc4444p-28 }, + { 0x1.fffffffabd229p-1, 0x1.7974e743dea3cp-28 }, + { 0x1.fffffffb18582p-1, 0x1.607e9eacd1050p-28 }, + { 0x1.fffffffb6d844p-1, 0x1.4924a74dec728p-28 }, + { 0x1.fffffffbbd0aap-1, 0x1.334d19e0c2160p-28 }, + { 0x1.fffffffc0748fp-1, 0x1.1edfa3c5f5ccap-28 }, + { 0x1.fffffffc4c96cp-1, 0x1.0bc56f1b54701p-28 }, + { 0x1.fffffffc8d462p-1, 0x1.f3d2185e047d9p-29 }, + { 0x1.fffffffcc9a41p-1, 0x1.d26cb87945e87p-29 }, + { 0x1.fffffffd01f89p-1, 0x1.b334fac4b9f99p-29 }, + { 0x1.fffffffd36871p-1, 0x1.96076f7918d1cp-29 }, + { 0x1.fffffffd678edp-1, 0x1.7ac2d72fc2c63p-29 }, + { 0x1.fffffffd954aep-1, 0x1.614801550319ep-29 }, + { 0x1.fffffffdbff2ap-1, 0x1.4979ac8b28926p-29 }, + { 0x1.fffffffde7ba0p-1, 0x1.333c68e2d0548p-29 }, + { 0x1.fffffffe0cd16p-1, 0x1.1e767bce37dd7p-29 }, + { 0x1.fffffffe2f664p-1, 0x1.0b0fc5b6d05a0p-29 }, + { 0x1.fffffffe4fa30p-1, 0x1.f1e3523b41d7dp-30 }, + { 0x1.fffffffe6daf7p-1, 0x1.d00de6608effep-30 }, + { 0x1.fffffffe89b0cp-1, 0x1.b0778b7b3301ap-30 }, + { 0x1.fffffffea3c9ap-1, 0x1.92fb04ec0f6cfp-30 }, + { 0x1.fffffffebc1a9p-1, 0x1.77756ec9f78fap-30 }, + { 0x1.fffffffed2c21p-1, 0x1.5dc61922d5a06p-30 }, + { 0x1.fffffffee7dc8p-1, 0x1.45ce65699ff6dp-30 }, + { 0x1.fffffffefb847p-1, 0x1.2f71a5f159970p-30 }, + { 0x1.ffffffff0dd2bp-1, 0x1.1a94ff571654fp-30 }, + { 0x1.ffffffff1ede9p-1, 0x1.071f4bbea09ecp-30 }, + { 0x1.ffffffff2ebdap-1, 0x1.e9f1ff8ddd774p-31 }, + { 0x1.ffffffff3d843p-1, 0x1.c818223a202c7p-31 }, + { 0x1.ffffffff4b453p-1, 0x1.a887bd2b4404dp-31 }, + { 0x1.ffffffff58126p-1, 0x1.8b1a336c5eb6bp-31 }, + { 0x1.ffffffff63fc3p-1, 0x1.6fab63324088ap-31 }, + { 0x1.ffffffff6f121p-1, 0x1.56197e30205bap-31 }, + { 0x1.ffffffff79626p-1, 0x1.3e44e45301b92p-31 }, + { 0x1.ffffffff82fabp-1, 0x1.281000bfe4c3fp-31 }, + { 0x1.ffffffff8be77p-1, 0x1.135f28f2d50b4p-31 }, + { 0x1.ffffffff94346p-1, 0x1.00187dded5975p-31 }, + { 0x1.ffffffff9bec8p-1, 
0x1.dc479de0ef001p-32 }, + { 0x1.ffffffffa319fp-1, 0x1.bad4fdad3caa1p-32 }, + { 0x1.ffffffffa9c63p-1, 0x1.9baed3ed27ab8p-32 }, + { 0x1.ffffffffaffa4p-1, 0x1.7ead9ce4285bbp-32 }, + { 0x1.ffffffffb5be5p-1, 0x1.63ac6b4edc88ep-32 }, + { 0x1.ffffffffbb1a2p-1, 0x1.4a88be2a6390cp-32 }, + { 0x1.ffffffffc014ep-1, 0x1.332259185f1a0p-32 }, + { 0x1.ffffffffc4b56p-1, 0x1.1d5b1f3793044p-32 }, + { 0x1.ffffffffc901cp-1, 0x1.0916f04b6e18bp-32 }, + { 0x1.ffffffffccfffp-1, 0x1.ec77101de6926p-33 }, + { 0x1.ffffffffd0b56p-1, 0x1.c960bf23153e0p-33 }, + { 0x1.ffffffffd4271p-1, 0x1.a8bd20fc65ef7p-33 }, + { 0x1.ffffffffd759dp-1, 0x1.8a61745ec7d1dp-33 }, + { 0x1.ffffffffda520p-1, 0x1.6e25d0e756261p-33 }, + { 0x1.ffffffffdd13cp-1, 0x1.53e4f7d1666cbp-33 }, + { 0x1.ffffffffdfa2dp-1, 0x1.3b7c27a7ddb0ep-33 }, + { 0x1.ffffffffe202dp-1, 0x1.24caf2c32af14p-33 }, + { 0x1.ffffffffe4371p-1, 0x1.0fb3186804d0fp-33 }, + { 0x1.ffffffffe642ap-1, 0x1.f830c0bb41fd7p-34 }, + { 0x1.ffffffffe8286p-1, 0x1.d3c0f1a91c846p-34 }, + { 0x1.ffffffffe9eb0p-1, 0x1.b1e5acf351d87p-34 }, + { 0x1.ffffffffeb8d0p-1, 0x1.92712d259ce66p-34 }, + { 0x1.ffffffffed10ap-1, 0x1.7538c60a04476p-34 }, + { 0x1.ffffffffee782p-1, 0x1.5a14b04b47879p-34 }, + { 0x1.ffffffffefc57p-1, 0x1.40dfd87456f4cp-34 }, + { 0x1.fffffffff0fa7p-1, 0x1.2977b1172b9d5p-34 }, + { 0x1.fffffffff218fp-1, 0x1.13bc07e891491p-34 }, + { 0x1.fffffffff3227p-1, 0x1.ff1dbb4300811p-35 }, + { 0x1.fffffffff4188p-1, 0x1.d9a880f306bd8p-35 }, + { 0x1.fffffffff4fc9p-1, 0x1.b6e45220b55e0p-35 }, + { 0x1.fffffffff5cfdp-1, 0x1.96a0b33f2c4dap-35 }, + { 0x1.fffffffff6939p-1, 0x1.78b07e9e924acp-35 }, + { 0x1.fffffffff748ep-1, 0x1.5ce9ab1670dd2p-35 }, + { 0x1.fffffffff7f0dp-1, 0x1.4325167006bb0p-35 }, + { 0x1.fffffffff88c5p-1, 0x1.2b3e53538ff3fp-35 }, + { 0x1.fffffffff91c6p-1, 0x1.15137a7f44864p-35 }, + { 0x1.fffffffff9a1bp-1, 0x1.0084ff125639dp-35 }, + { 0x1.fffffffffa1d2p-1, 0x1.daeb0b7311ec7p-36 }, + { 0x1.fffffffffa8f6p-1, 0x1.b7937d1c40c52p-36 }, + { 0x1.fffffffffaf92p-1, 0x1.96d082f59ab06p-36 }, + { 0x1.fffffffffb5b0p-1, 0x1.7872d9fa10aadp-36 }, + { 0x1.fffffffffbb58p-1, 0x1.5c4e8e37bc7d0p-36 }, + { 0x1.fffffffffc095p-1, 0x1.423ac0df49a40p-36 }, + { 0x1.fffffffffc56dp-1, 0x1.2a117230ad284p-36 }, + { 0x1.fffffffffc9e8p-1, 0x1.13af4f04f9998p-36 }, + { 0x1.fffffffffce0dp-1, 0x1.fde703724e560p-37 }, + { 0x1.fffffffffd1e1p-1, 0x1.d77f0c82e7641p-37 }, + { 0x1.fffffffffd56cp-1, 0x1.b3ee02611d7ddp-37 }, + { 0x1.fffffffffd8b3p-1, 0x1.92ff33023d5bdp-37 }, + { 0x1.fffffffffdbbap-1, 0x1.7481a9e69f53fp-37 }, + { 0x1.fffffffffde86p-1, 0x1.5847eda620959p-37 }, + { 0x1.fffffffffe11dp-1, 0x1.3e27c1fcc74bdp-37 }, + { 0x1.fffffffffe380p-1, 0x1.25f9ee0b923dcp-37 }, + { 0x1.fffffffffe5b6p-1, 0x1.0f9a0686531ffp-37 }, + { 0x1.fffffffffe7c0p-1, 0x1.f5cc7718082afp-38 }, + { 0x1.fffffffffe9a2p-1, 0x1.cf7e53d6a2ca5p-38 }, + { 0x1.fffffffffeb60p-1, 0x1.ac0f5f3229372p-38 }, + { 0x1.fffffffffecfbp-1, 0x1.8b498644847eap-38 }, + { 0x1.fffffffffee77p-1, 0x1.6cfa9bcca59dcp-38 }, + { 0x1.fffffffffefd6p-1, 0x1.50f411d4fd2cdp-38 }, + { 0x1.ffffffffff11ap-1, 0x1.370ab8327af5ep-38 }, + { 0x1.ffffffffff245p-1, 0x1.1f167f88c6b6ep-38 }, + { 0x1.ffffffffff359p-1, 0x1.08f24085d4597p-38 }, + { 0x1.ffffffffff457p-1, 0x1.e8f70e181d619p-39 }, + { 0x1.ffffffffff542p-1, 0x1.c324c20e337dcp-39 }, + { 0x1.ffffffffff61bp-1, 0x1.a03261574b54ep-39 }, + { 0x1.ffffffffff6e3p-1, 0x1.7fe903cdf5855p-39 }, + { 0x1.ffffffffff79bp-1, 0x1.6215c58da3450p-39 }, + { 0x1.ffffffffff845p-1, 0x1.46897d4b69fc6p-39 }, + { 0x1.ffffffffff8e2p-1, 0x1.2d1877d731b7bp-39 }, + { 
0x1.ffffffffff973p-1, 0x1.159a386b11517p-39 }, + { 0x1.ffffffffff9f8p-1, 0x1.ffd27ae9393cep-40 }, + { 0x1.ffffffffffa73p-1, 0x1.d7c593130dd0bp-40 }, + { 0x1.ffffffffffae4p-1, 0x1.b2cd607c79bcfp-40 }, + { 0x1.ffffffffffb4cp-1, 0x1.90ae4d3405651p-40 }, + { 0x1.ffffffffffbadp-1, 0x1.71312dd1759e2p-40 }, + { 0x1.ffffffffffc05p-1, 0x1.5422ef5d8949dp-40 }, + { 0x1.ffffffffffc57p-1, 0x1.39544b0ecc957p-40 }, + { 0x1.ffffffffffca2p-1, 0x1.20997f73e73ddp-40 }, + { 0x1.ffffffffffce7p-1, 0x1.09ca0eaacd277p-40 }, + { 0x1.ffffffffffd27p-1, 0x1.e9810295890ecp-41 }, + { 0x1.ffffffffffd62p-1, 0x1.c2b45b5aa4a1dp-41 }, + { 0x1.ffffffffffd98p-1, 0x1.9eee068fa7596p-41 }, + { 0x1.ffffffffffdcap-1, 0x1.7df2b399c10a8p-41 }, + { 0x1.ffffffffffdf8p-1, 0x1.5f8b87a31bd85p-41 }, + { 0x1.ffffffffffe22p-1, 0x1.4385c96e9a2d9p-41 }, + { 0x1.ffffffffffe49p-1, 0x1.29b2933ef4cbcp-41 }, + { 0x1.ffffffffffe6cp-1, 0x1.11e68a6378f8ap-41 }, + { 0x1.ffffffffffe8dp-1, 0x1.f7f338086a86bp-42 }, + { 0x1.ffffffffffeabp-1, 0x1.cf8d7d9ce040ap-42 }, + { 0x1.ffffffffffec7p-1, 0x1.aa577251ae484p-42 }, + { 0x1.ffffffffffee1p-1, 0x1.8811d739efb5ep-42 }, + { 0x1.ffffffffffef8p-1, 0x1.68823e52970bep-42 }, + { 0x1.fffffffffff0ep-1, 0x1.4b72ae68e8b4cp-42 }, + { 0x1.fffffffffff22p-1, 0x1.30b14dbe876bcp-42 }, + { 0x1.fffffffffff34p-1, 0x1.181012ef86610p-42 }, + { 0x1.fffffffffff45p-1, 0x1.01647ba798744p-42 }, + { 0x1.fffffffffff54p-1, 0x1.d90e917701675p-43 }, + { 0x1.fffffffffff62p-1, 0x1.b2a87e86d0c8ap-43 }, + { 0x1.fffffffffff6fp-1, 0x1.8f53dcb377293p-43 }, + { 0x1.fffffffffff7bp-1, 0x1.6ed2f2515e933p-43 }, + { 0x1.fffffffffff86p-1, 0x1.50ecc9ed47f19p-43 }, + { 0x1.fffffffffff90p-1, 0x1.356cd5ce7799ep-43 }, + { 0x1.fffffffffff9ap-1, 0x1.1c229a587ab78p-43 }, + { 0x1.fffffffffffa2p-1, 0x1.04e15ecc7f3f6p-43 }, + { 0x1.fffffffffffaap-1, 0x1.deffc7e6a6017p-44 }, + { 0x1.fffffffffffb1p-1, 0x1.b7b040832f310p-44 }, + { 0x1.fffffffffffb8p-1, 0x1.938e021f36d76p-44 }, + { 0x1.fffffffffffbep-1, 0x1.7258610b3b233p-44 }, + { 0x1.fffffffffffc3p-1, 0x1.53d3bfc82a909p-44 }, + { 0x1.fffffffffffc8p-1, 0x1.37c92babdc2fdp-44 }, + { 0x1.fffffffffffcdp-1, 0x1.1e06010120f6ap-44 }, + { 0x1.fffffffffffd1p-1, 0x1.065b9616170d4p-44 }, + { 0x1.fffffffffffd5p-1, 0x1.e13dd96b3753ap-45 }, + { 0x1.fffffffffffd9p-1, 0x1.b950d32467392p-45 }, + { 0x1.fffffffffffdcp-1, 0x1.94a72263259a5p-45 }, + { 0x1.fffffffffffdfp-1, 0x1.72fd93e036cdcp-45 }, + { 0x1.fffffffffffe2p-1, 0x1.54164576929abp-45 }, + { 0x1.fffffffffffe4p-1, 0x1.37b83c521fe96p-45 }, + { 0x1.fffffffffffe7p-1, 0x1.1daf033182e96p-45 }, + { 0x1.fffffffffffe9p-1, 0x1.05ca50205d26ap-45 }, + { 0x1.fffffffffffebp-1, 0x1.dfbb6235639fap-46 }, + { 0x1.fffffffffffedp-1, 0x1.b7807e294781fp-46 }, + { 0x1.fffffffffffeep-1, 0x1.9298add70a734p-46 }, + { 0x1.ffffffffffff0p-1, 0x1.70beaf9c7ffb6p-46 }, + { 0x1.ffffffffffff1p-1, 0x1.51b2cd6709222p-46 }, + { 0x1.ffffffffffff3p-1, 0x1.353a6cf7f7fffp-46 }, + { 0x1.ffffffffffff4p-1, 0x1.1b1fa8cbe84a7p-46 }, + { 0x1.ffffffffffff5p-1, 0x1.0330f0fd69921p-46 }, + { 0x1.ffffffffffff6p-1, 0x1.da81670f96f9bp-47 }, + { 0x1.ffffffffffff7p-1, 0x1.b24a16b4d09aap-47 }, + { 0x1.ffffffffffff7p-1, 0x1.8d6eeb6efdbd6p-47 }, + { 0x1.ffffffffffff8p-1, 0x1.6ba91ac734785p-47 }, + { 0x1.ffffffffffff9p-1, 0x1.4cb7966770ab5p-47 }, + { 0x1.ffffffffffff9p-1, 0x1.305e9721d0981p-47 }, + { 0x1.ffffffffffffap-1, 0x1.1667311fff70ap-47 }, + { 0x1.ffffffffffffbp-1, 0x1.fd3de10d62855p-48 }, + { 0x1.ffffffffffffbp-1, 0x1.d1aefbcd48d0cp-48 }, + { 0x1.ffffffffffffbp-1, 0x1.a9cc93c25aca9p-48 }, + { 0x1.ffffffffffffcp-1, 
0x1.85487ee3ea735p-48 }, + { 0x1.ffffffffffffcp-1, 0x1.63daf8b4b1e0cp-48 }, + { 0x1.ffffffffffffdp-1, 0x1.45421e69a6ca1p-48 }, + { 0x1.ffffffffffffdp-1, 0x1.294175802d99ap-48 }, + { 0x1.ffffffffffffdp-1, 0x1.0fa17bf41068fp-48 }, + { 0x1.ffffffffffffdp-1, 0x1.f05e82aae2bb9p-49 }, + { 0x1.ffffffffffffep-1, 0x1.c578101b29058p-49 }, + { 0x1.ffffffffffffep-1, 0x1.9e39dc5dd2f7cp-49 }, + { 0x1.ffffffffffffep-1, 0x1.7a553a728bbf2p-49 }, + { 0x1.ffffffffffffep-1, 0x1.5982008db1304p-49 }, + { 0x1.ffffffffffffep-1, 0x1.3b7e00422e51bp-49 }, + { 0x1.ffffffffffffep-1, 0x1.200c898d9ee3ep-49 }, + { 0x1.fffffffffffffp-1, 0x1.06f5f7eb65a56p-49 }, + { 0x1.fffffffffffffp-1, 0x1.e00e9148a1d25p-50 }, + { 0x1.fffffffffffffp-1, 0x1.b623734024e92p-50 }, + { 0x1.fffffffffffffp-1, 0x1.8fd4e01891bf8p-50 }, + { 0x1.fffffffffffffp-1, 0x1.6cd44c7470d89p-50 }, + { 0x1.fffffffffffffp-1, 0x1.4cd9c04158cd7p-50 }, + { 0x1.fffffffffffffp-1, 0x1.2fa34bf5c8344p-50 }, + { 0x1.fffffffffffffp-1, 0x1.14f4890ff2461p-50 }, + { 0x1.fffffffffffffp-1, 0x1.f92c49dfa4df5p-51 }, + { 0x1.fffffffffffffp-1, 0x1.ccaaea71ab0dfp-51 }, + { 0x1.fffffffffffffp-1, 0x1.a40829f001197p-51 }, + { 0x1.0000000000000p+0, 0x1.7eef13b59e96cp-51 }, + { 0x1.0000000000000p+0, 0x1.5d11e1a252bf5p-51 }, + { 0x1.0000000000000p+0, 0x1.3e296303b2297p-51 }, + { 0x1.0000000000000p+0, 0x1.21f47009f43cep-51 }, + { 0x1.0000000000000p+0, 0x1.083768c5e4541p-51 }, + { 0x1.0000000000000p+0, 0x1.e1777d831265ep-52 }, + { 0x1.0000000000000p+0, 0x1.b69f10b0191b5p-52 }, + { 0x1.0000000000000p+0, 0x1.8f8a3a05b5b52p-52 }, + { 0x1.0000000000000p+0, 0x1.6be573c40c8e7p-52 }, + { 0x1.0000000000000p+0, 0x1.4b645ba991fdbp-52 }, + { 0x1.0000000000000p+0, 0x1.2dc119095729fp-52 }, + }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/erfc_1u8.c b/contrib/arm-optimized-routines/pl/math/erfc_1u8.c new file mode 100644 index 000000000000..7f2004e9335d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfc_1u8.c @@ -0,0 +1,153 @@ +/* + * Double-precision erfc(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Shift 0x1p45 +#define P20 0x1.5555555555555p-2 /* 1/3. */ +#define P21 0x1.5555555555555p-1 /* 2/3. */ + +#define P40 0x1.999999999999ap-4 /* 1/10. */ +#define P41 0x1.999999999999ap-2 /* 2/5. */ +#define P42 0x1.11111111111111p-3 /* 2/15. */ + +#define P50 0x1.5555555555555p-3 /* 1/6. */ +#define P51 0x1.c71c71c71c71cp-3 /* 2/9. */ +#define P52 0x1.6c16c16c16c17p-5 /* 2/45. */ + +/* Qi = (i+1) / i. */ +#define Q5 0x1.3333333333333p0 +#define Q6 0x1.2aaaaaaaaaaabp0 +#define Q7 0x1.2492492492492p0 +#define Q8 0x1.2p0 +#define Q9 0x1.1c71c71c71c72p0 + +/* Ri = -2 * i / ((i+1)*(i+2)). */ +#define R5 -0x1.e79e79e79e79ep-3 +#define R6 -0x1.b6db6db6db6dbp-3 +#define R7 -0x1.8e38e38e38e39p-3 +#define R8 -0x1.6c16c16c16c17p-3 +#define R9 -0x1.4f2094f2094f2p-3 + +/* Fast erfc approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5 + + p6(r) d^6 + ... + p10(r) d^10 + + Polynomials p6(r) to p10(r) are computed using recurrence relation + + 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0, + with p0 = 1, and p1(r) = -r. 
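+ + Rearranged, p_{i+2} = (p_i + (i+2)/(i+1) * r * p_{i+1}) * (-2(i+1)/((i+2)(i+3))); + for i = 4 this gives p6 = (p4 + 6/5 r p5) * (-10/42), matching the Q5 and R5 + constants defined above.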
+ + Values of erfc(r) and scale(r) are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum measured error: 1.71 ULP + erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608 + want 0x1.e15fcbea3e7adp-608. */ +double +erfc (double x) +{ + /* Get top words and sign. */ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & 0x7fffffffffffffff; + double a = asdouble (ia); + uint64_t sign = ix & ~0x7fffffffffffffff; + + /* erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2. */ + if (unlikely (ia >= 0x7ff0000000000000)) + return asdouble (sign >> 1) + 1.0 / x; /* Special cases. */ + + /* Return early for large enough negative values. */ + if (x < -6.0) + return 2.0; + + /* For |x| < 3487.0/128.0, the following approximation holds. */ + if (likely (ia < 0x403b3e0000000000)) + { + /* |x| < 0x1p-511 => accurate to 0.5 ULP. */ + if (unlikely (ia < asuint64 (0x1p-511))) + return 1.0 - x; + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale + to 2/sqrt(pi), when x is reduced to r = 0. */ + double z = a + Shift; + uint64_t i = asuint64 (z); + double r = z - Shift; + /* These values are scaled by 2^128. */ + double erfcr = __erfc_data.tab[i].erfc; + double scale = __erfc_data.tab[i].scale; + + /* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */ + double d = a - r; + double d2 = d * d; + double r2 = r * r; + /* Compute p_i as a regular (low-order) polynomial. */ + double p1 = -r; + double p2 = fma (P21, r2, -P20); + double p3 = -r * fma (P20, r2, -0.5); + double p4 = fma (fma (P42, r2, -P41), r2, P40); + double p5 = -r * fma (fma (P52, r2, -P51), r2, P50); + /* Compute p_i using the recurrence relation: + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ + double p6 = fma (Q5 * r, p5, p4) * R5; + double p7 = fma (Q6 * r, p6, p5) * R6; + double p8 = fma (Q7 * r, p7, p6) * R7; + double p9 = fma (Q8 * r, p8, p7) * R8; + double p10 = fma (Q9 * r, p9, p8) * R9; + /* Compute polynomial in d using pairwise Horner scheme. */ + double p90 = fma (p10, d, p9); + double p78 = fma (p8, d, p7); + double p56 = fma (p6, d, p5); + double p34 = fma (p4, d, p3); + double p12 = fma (p2, d, p1); + double y = fma (p90, d2, p78); + y = fma (y, d2, p56); + y = fma (y, d2, p34); + y = fma (y, d2, p12); + + y = fma (-fma (y, d2, d), scale, erfcr); + + /* Handle sign and scale back in a single fma. */ + double off = asdouble (sign >> 1); + double fac = asdouble (asuint64 (0x1p-128) | sign); + y = fma (y, fac, off); + + if (unlikely (x > 26.0)) + { + /* The underflow exception needs to be signaled explicitly when the + result gets into the subnormal range. */ + if (unlikely (y < 0x1p-1022)) + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + /* Set errno to ERANGE if result rounds to 0. */ + return __math_check_uflow (y); + } + + return y; + } + /* Above the threshold (x > 3487.0/128.0) erfc is constant and needs to raise + the underflow exception for positive x.
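+ (at x = 3487/128, erfc(x) ~ 1e-324, already below half of the smallest + subnormal double, so the correctly rounded result is 0).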
diff --git a/contrib/arm-optimized-routines/pl/math/erfc_4u5.c b/contrib/arm-optimized-routines/pl/math/erfc_4u5.c
deleted file mode 100644
index e9af9d3bcdb4..000000000000
--- a/contrib/arm-optimized-routines/pl/math/erfc_4u5.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Double-precision erfc(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-#include "pairwise_horner.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define AbsMask (0x7fffffffffffffff)
-
-#define xint __erfc_data.interval_bounds
-#define PX __erfc_data.poly
-
-/* Accurate exponential from optimized routines. */
-double
-__exp_dd (double x, double xtail);
-
-static inline double
-eval_poly_horner (double z, int i)
-{
-  double z2 = z * z;
-#define C(j) PX[i][j]
-  return PAIRWISE_HORNER_12 (z, z2, C);
-}
-
-/* Accurate evaluation of exp(x^2)
-   using compensated product (x^2 ~ x*x + e2)
-   and the __exp_dd(y,d) routine, that is the
-   computation of exp(y+d) with a small correction d<<y. */
-
-/* Approximation of erfc for |x| > 6.0. */
-static inline double
-approx_erfc_hi (double x, int i)
-{
-  double a = fabs (x);
-  double z = a - xint[i];
-  double p = eval_poly_horner (z, i);
-  double e_mx2 = eval_accurate_gaussian (a);
-  return p * e_mx2;
-}
-
-static inline int
-get_itv_idx (double x)
-{
-  /* Interval bounds are a logarithmic scale, i.e. interval n has
-     lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain
-     the interval index. */
-  double a = asdouble (asuint64 (x) & AbsMask);
-  double z = a + 1.0;
-  z = z * z;
-  z = z * z;
-  return (asuint64 (z) >> 52) - 1023;
-}
-
-/* Approximation of erfc for |x| < 6.0. */
-static inline double
-approx_erfc_lo (double x, uint32_t sign, int i)
-{
-  double a = fabs (x);
-  double z = a - xint[i];
-  double p = eval_poly_horner (z, i);
-  double e_mx2 = eval_accurate_gaussian (a);
-  if (sign)
-    return fma (-p, e_mx2, 2.0);
-  else
-    return p * e_mx2;
-}
-
-/* Top 12 bits of a double (sign and exponent bits). */
-static inline uint32_t
-abstop12 (double x)
-{
-  return (asuint64 (x) >> 52) & 0x7ff;
-}
-
-/* Top 32 bits of a double. */
-static inline uint32_t
-top32 (double x)
-{
-  return asuint64 (x) >> 32;
-}
-
-/* Fast erfc implementation.
-   The approximation uses polynomial approximation of
-   exp(x^2) * erfc(x) with fixed orders on 20 intervals.
-   Maximum measured error is 4.05 ULPs:
-   erfc(0x1.e8ebf6a2b0801p-2) got 0x1.ff84036f8f0b3p-2
-                              want 0x1.ff84036f8f0b7p-2. */
-double
-erfc (double x)
-{
-  /* Get top words. */
-  uint32_t ix = top32 (x); /* We need to compare at most 32 bits. */
-  uint32_t ia = ix & 0x7fffffff;
-  uint32_t sign = ix >> 31;
-
-  /* Handle special cases and small values with a single comparison:
-       abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small)
-     Special cases erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2
-     Errno EDOM does not have to be set in case of erfc(nan).
-     Only ERANGE may be set in case of underflow.
-     Small values (|x|<small)
-       |x|<0x1.0p-56 => accurate up to 0.5 ULP (top12(0x1p-56) = 0x3c7)
-       |x|<0x1.0p-50 => accurate up to 1.0 ULP (top12(0x1p-50) = 0x3cd). */
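/* Editorial note, not part of the patch: the single unsigned comparison
   below folds three range checks into one by wraparound. With
   abstop12(INFINITY) = 0x7ff and top12(0x1p-50) = 0x3cd:
     - NaN/Inf inputs give abstop12(x) - 0x3cd >= 0x7ff - 0x3cd = 0x432;
     - tiny inputs underflow the subtraction, e.g. top12(0x1p-60) = 0x3c3,
       and 0x3c3 - 0x3cd wraps to 0xfffffff6, which also exceeds 0x432;
   so one branch catches both, and the abstop12(x) >= 0x7ff test inside it
   separates the special cases from the 1.0 - x small-value path.  */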
-  if (unlikely (abstop12 (x) - 0x3cd >= (abstop12 (INFINITY) & 0x7ff) - 0x3cd))
-    {
-      if (abstop12 (x) >= 0x7ff)
-        return (double) (sign << 1) + 1.0 / x; /* special cases. */
-      else
-        return 1.0 - x; /* small case. */
-    }
-  else if (ia < 0x40180000)
-    { /* |x| < 6.0. */
-      return approx_erfc_lo (x, sign, get_itv_idx (x));
-    }
-  else if (sign)
-    { /* x <= -6.0. */
-      return 2.0;
-    }
-  else if (ia < 0x403c0000)
-    { /* 6.0 <= x < 28. */
-      return approx_erfc_hi (x, get_itv_idx (x));
-    }
-  else
-    { /* x > 28. */
-      return __math_uflow (0);
-    }
-}
-
-PL_SIG (S, D, 1, erfc, -6.0, 28.0)
-PL_TEST_ULP (erfc, 3.56)
-PL_TEST_INTERVAL (erfc, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (erfc, 0x1p-1022, 0x1p-26, 40000)
-PL_TEST_INTERVAL (erfc, -0x1p-1022, -0x1p-26, 40000)
-PL_TEST_INTERVAL (erfc, 0x1p-26, 0x1p5, 40000)
-PL_TEST_INTERVAL (erfc, -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (erfc, 0, inf, 40000)
diff --git a/contrib/arm-optimized-routines/pl/math/erfc_data.c b/contrib/arm-optimized-routines/pl/math/erfc_data.c
index fa7184fcc871..40f72a4d6d5b 100644
--- a/contrib/arm-optimized-routines/pl/math/erfc_data.c
+++ b/contrib/arm-optimized-routines/pl/math/erfc_data.c
@@ -7,139 +7,3501 @@
 #include "math_config.h"
-/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double
-   precision. Generated using the Remez algorithm on each interval separately
-   (see erfc.sollya for more detail). */
+/* Lookup table used in erfc.
+   For each possible rounded input r (multiples of 1/128), between
+   r = 0.0 and r = ~27.0 (3488 values):
+   - the first entry __erfc_data.tab.erfc contains the values of erfc(r),
+   - the second entry __erfc_data.tab.scale contains the values of
+   2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
+   they are scaled by a large enough value 2^128 (fits in 8bit). */
 const struct erfc_data __erfc_data = {
-
-/* Bounds for 20 intervals spanning [0x1.0p-50., 31.]. Interval bounds are a
-   logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the
-   exception of the first interval. */
-.interval_bounds = {
-  0x1.0p-50,            /* Tiny boundary. */
-  0x1.837f05c490126p-3, /* 0.189. */
-  0x1.a827997709f7ap-2, /* 0.414. */
-  0x1.5d13f326fe9c8p-1, /* 0.682. */
-  0x1.0p0,              /* 1.000. */
-  0x1.60dfc14636e2ap0,  /* 1.378. */
-  0x1.d413cccfe779ap0,  /* 1.828. */
-  0x1.2e89f995ad3adp1,  /* 2.364. */
-  0x1.8p1,              /* 3.000. */
-  0x1.e0dfc14636e2ap1,  /* 3.757. */
-  0x1.2a09e667f3bcdp2,  /* 4.657. */
-  0x1.6e89f995ad3adp2,  /* 5.727. */
-  0x1.cp2,              /* 7.000. */
-  0x1.106fe0a31b715p3,  /* 8.514. */
-  0x1.4a09e667f3bcdp3,  /* 10.31. */
-  0x1.8e89f995ad3adp3,  /* 12.45. */
-  0x1.ep3,              /* 15.00. */
-  0x1.206fe0a31b715p4,  /* 18.03. */
-  0x1.5a09e667f3bcdp4,  /* 21.63. */
-  0x1.9e89f995ad3adp4,  /* 25.91. */
-  0x1.fp4               /* 31.00.
*/ + .tab = { { 0x1p128, 0x1.20dd750429b6dp128 }, + { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 }, + { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 }, + { 0x1.f27640f9853d9p127, 0x1.20b4d8bac36c1p128 }, + { 0x1.edf3a9ba22dadp127, 0x1.209546ad13ccfp128 }, + { 0x1.e971a2c4436aep127, 0x1.206cb4897b148p128 }, + { 0x1.e4f05010eca8cp127, 0x1.203b261cd0053p128 }, + { 0x1.e06fd58842c7ep127, 0x1.2000a00ae3804p128 }, + { 0x1.dbf056fe2df35p127, 0x1.1fbd27cdc72d3p128 }, + { 0x1.d771f82f02f4ep127, 0x1.1f70c3b4f2cc8p128 }, + { 0x1.d2f4dcbc2f894p127, 0x1.1f1b7ae44867fp128 }, + { 0x1.ce792828eae5cp127, 0x1.1ebd5552f795bp128 }, + { 0x1.c9fefdd6eaf19p127, 0x1.1e565bca400d4p128 }, + { 0x1.c58681031eb6ap127, 0x1.1de697e413d29p128 }, + { 0x1.c10fd4c26e896p127, 0x1.1d6e14099944ap128 }, + { 0x1.bc9b1bfe82687p127, 0x1.1cecdb718d61cp128 }, + { 0x1.b82879728f11ep127, 0x1.1c62fa1e869b6p128 }, + { 0x1.b3b80fa82a4bbp127, 0x1.1bd07cdd189acp128 }, + { 0x1.af4a00f426daap127, 0x1.1b357141d95d5p128 }, + { 0x1.aade6f7378a0ep127, 0x1.1a91e5a748165p128 }, + { 0x1.a6757d08215d8p127, 0x1.19e5e92b964abp128 }, + { 0x1.a20f4b5626818p127, 0x1.19318bae53a04p128 }, + { 0x1.9dabfbc090901p127, 0x1.1874ddcdfce24p128 }, + { 0x1.994baf66747adp127, 0x1.17aff0e56ec1p128 }, + { 0x1.94ee8720076b6p127, 0x1.16e2d7093cd8cp128 }, + { 0x1.9094a37bbd66ep127, 0x1.160da304ed92fp128 }, + { 0x1.8c3e24bb73372p127, 0x1.153068581b781p128 }, + { 0x1.87eb2ad1a4032p127, 0x1.144b3b337c90cp128 }, + { 0x1.839bd55eaafc8p127, 0x1.135e3075d076bp128 }, + { 0x1.7f5043ae11862p127, 0x1.12695da8b5bdep128 }, + { 0x1.7b0894b3ea35cp127, 0x1.116cd8fd67618p128 }, + { 0x1.76c4e70a390e7p127, 0x1.1068b94962e5ep128 }, + { 0x1.728558ee694fcp127, 0x1.0f5d1602f7e41p128 }, + { 0x1.6e4a083ed132fp127, 0x1.0e4a073dc1b91p128 }, + { 0x1.6a13127843ec1p127, 0x1.0d2fa5a70c168p128 }, + { 0x1.65e094b3b2413p127, 0x1.0c0e0a8223359p128 }, + { 0x1.61b2aba3da093p127, 0x1.0ae54fa490723p128 }, + { 0x1.5d89739304dcfp127, 0x1.09b58f724416bp128 }, + { 0x1.59650860d6469p127, 0x1.087ee4d9ad247p128 }, + { 0x1.5545858029b39p127, 0x1.07416b4fbfe7cp128 }, + { 0x1.512b05f5006e1p127, 0x1.05fd3ecbec298p128 }, + { 0x1.4d15a4527fdc7p127, 0x1.04b27bc403d3p128 }, + { 0x1.49057ab900447p127, 0x1.03613f2812dafp128 }, + { 0x1.44faa2d42c4ap127, 0x1.0209a65e29545p128 }, + { 0x1.40f535d93160ep127, 0x1.00abcf3e187a9p128 }, + { 0x1.3cf54c850162p127, 0x1.fe8fb01a47307p127 }, + { 0x1.38faff1aa574ap127, 0x1.fbbbbef34b4b2p127 }, + { 0x1.35066561a275dp127, 0x1.f8dc092d58ff8p127 }, + { 0x1.311796a46f064p127, 0x1.f5f0cdaf15313p127 }, + { 0x1.2d2ea9aefb636p127, 0x1.f2fa4c16c0019p127 }, + { 0x1.294bb4cd4b2bdp127, 0x1.eff8c4b1375dbp127 }, + { 0x1.256ecdca212ccp127, 0x1.ecec7870ebca8p127 }, + { 0x1.219809edbd524p127, 0x1.e9d5a8e4c934ep127 }, + { 0x1.1dc77dfcacd02p127, 0x1.e6b4982f158b9p127 }, + { 0x1.19fd3e36ac96ap127, 0x1.e38988fc46e72p127 }, + { 0x1.16395e559e218p127, 0x1.e054be79d3042p127 }, + { 0x1.127bf18c8eadcp127, 0x1.dd167c4cf9d2ap127 }, + { 0x1.0ec50a86d0dd4p127, 0x1.d9cf06898cdafp127 }, + { 0x1.0b14bb6728cd8p127, 0x1.d67ea1a8b5368p127 }, + { 0x1.076b15c70aa28p127, 0x1.d325927fb9d89p127 }, + { 0x1.03c82ab5eb831p127, 0x1.cfc41e36c7df9p127 }, + { 0x1.002c0ab8a5018p127, 0x1.cc5a8a3fbea4p127 }, + { 0x1.f92d8b91d5cc7p126, 0x1.c8e91c4d01368p127 }, + { 0x1.f210d6a9a6a31p126, 0x1.c5701a484ef9dp127 }, + { 0x1.eb02147ce245cp126, 0x1.c1efca49a5011p127 }, + { 0x1.e40161b701275p126, 0x1.be68728e29d5ep127 }, + { 0x1.dd0ed9ea4bdd6p126, 0x1.bada596f25436p127 }, + { 0x1.d62a978f7c957p126, 0x1.b745c55905bf8p127 }, + { 
0x1.cf54b4058455fp126, 0x1.b3aafcc27502ep127 }, + { 0x1.c88d479173ccep126, 0x1.b00a46237d5bep127 }, + { 0x1.c1d4695e87644p126, 0x1.ac63e7ecc1411p127 }, + { 0x1.bb2a2f7e5652p126, 0x1.a8b8287ec6a09p127 }, + { 0x1.b48eaee924501p126, 0x1.a5074e215762p127 }, + { 0x1.ae01fb7e55a66p126, 0x1.a1519efaf889ep127 }, + { 0x1.a78428050527ep126, 0x1.9d97610879642p127 }, + { 0x1.a115462cbbc17p126, 0x1.99d8da149c13fp127 }, + { 0x1.9ab5668e4930ap126, 0x1.96164fafd8de3p127 }, + { 0x1.946498acbd766p126, 0x1.925007283d7aap127 }, + { 0x1.8e22eaf68291ep126, 0x1.8e86458169af8p127 }, + { 0x1.87f06ac6960c4p126, 0x1.8ab94f6caa71dp127 }, + { 0x1.81cd2465e1d96p126, 0x1.86e9694134b9ep127 }, + { 0x1.7bb9230cb40b4p126, 0x1.8316d6f48133dp127 }, + { 0x1.75b470e454d35p126, 0x1.7f41dc12c9e89p127 }, + { 0x1.6fbf1708ba47cp126, 0x1.7b6abbb7aaf19p127 }, + { 0x1.69d91d8a595dap126, 0x1.7791b886e7403p127 }, + { 0x1.64028b7013867p126, 0x1.73b714a552763p127 }, + { 0x1.5e3b66b9405a9p126, 0x1.6fdb11b1e0c34p127 }, + { 0x1.5883b45fd2b63p126, 0x1.6bfdf0beddaf5p127 }, + { 0x1.52db785a98acap126, 0x1.681ff24b4ab04p127 }, + { 0x1.4d42b59f95afap126, 0x1.6441563c665d4p127 }, + { 0x1.47b96e267647ap126, 0x1.60625bd75d07bp127 }, + { 0x1.423fa2eb1cb59p126, 0x1.5c8341bb23767p127 }, + { 0x1.3cd553f045d45p126, 0x1.58a445da7c74cp127 }, + { 0x1.377a8042458d1p126, 0x1.54c5a57629dbp127 }, + { 0x1.322f25f9da2fdp126, 0x1.50e79d1749ac9p127 }, + { 0x1.2cf3423f15fdfp126, 0x1.4d0a6889dfd9fp127 }, + { 0x1.27c6d14c5e341p126, 0x1.492e42d78d2c5p127 }, + { 0x1.22a9ce717edcbp126, 0x1.4553664273d24p127 }, + { 0x1.1d9c3416d2b4bp126, 0x1.417a0c4049fdp127 }, + { 0x1.189dfbc07e69p126, 0x1.3da26d759aef5p127 }, + { 0x1.13af1e11be721p126, 0x1.39ccc1b136d5ap127 }, + { 0x1.0ecf92d046d22p126, 0x1.35f93fe7d1b3dp127 }, + { 0x1.09ff50e7b3f93p126, 0x1.32281e2fd1a92p127 }, + { 0x1.053e4e6d0c10bp126, 0x1.2e5991bd4cbfcp127 }, + { 0x1.008c80a24ff1p126, 0x1.2a8dcede3673bp127 }, + { 0x1.f7d3b7f436013p125, 0x1.26c508f6bd0ffp127 }, + { 0x1.eeaca836a27ccp125, 0x1.22ff727dd6f7bp127 }, + { 0x1.e5a3b7c9b56dap125, 0x1.1f3d3cf9ffe5ap127 }, + { 0x1.dcb8cae2d747fp125, 0x1.1b7e98fe26217p127 }, + { 0x1.d3ebc436b0f26p125, 0x1.17c3b626c7a12p127 }, + { 0x1.cb3c8500ea349p125, 0x1.140cc3173f007p127 }, + { 0x1.c2aaed0bfcfeep125, 0x1.1059ed7740313p127 }, + { 0x1.ba36dab91c0e9p125, 0x1.0cab61f084b93p127 }, + { 0x1.b1e02b082b72p125, 0x1.09014c2ca74dap127 }, + { 0x1.a9a6b99fc973bp125, 0x1.055bd6d32e8d7p127 }, + { 0x1.a18a60d56673ep125, 0x1.01bb2b87c6968p127 }, + { 0x1.998af9b56a3aep125, 0x1.fc3ee5d1524bp126 }, + { 0x1.91a85c0b65519p125, 0x1.f511a91a67d2ap126 }, + { 0x1.89e25e6a4cef9p125, 0x1.edeeee0959518p126 }, + { 0x1.8238d634c0127p125, 0x1.e6d6ffaa65a25p126 }, + { 0x1.7aab97a554544p125, 0x1.dfca26f5bbf88p126 }, + { 0x1.733a75d6e91b8p125, 0x1.d8c8aace11e63p126 }, + { 0x1.6be542ccffc2fp125, 0x1.d1d2cfff91594p126 }, + { 0x1.64abcf7c175b4p125, 0x1.cae8d93f1d7b7p126 }, + { 0x1.5d8debd20aacep125, 0x1.c40b0729ed548p126 }, + { 0x1.568b66be6f268p125, 0x1.bd3998457afdbp126 }, + { 0x1.4fa40e3af3674p125, 0x1.b674c8ffc6283p126 }, + { 0x1.48d7af53bc19fp125, 0x1.afbcd3afe8ab6p126 }, + { 0x1.4226162fbddd5p125, 0x1.a911f096fbc26p126 }, + { 0x1.3b8f0e1912f7p125, 0x1.a27455e14c93cp126 }, + { 0x1.351261854b991p125, 0x1.9be437a7de946p126 }, + { 0x1.2eafda1db784ap125, 0x1.9561c7f23a47bp126 }, + { 0x1.286740c7a7dabp125, 0x1.8eed36b886d93p126 }, + { 0x1.22385daca7f47p125, 0x1.8886b1e5ecfd1p126 }, + { 0x1.1c22f842ac1f2p125, 0x1.822e655b417e7p126 }, + { 0x1.1626d7543522p125, 0x1.7be47af1f5d89p126 }, + { 0x1.1043c1086777dp125, 
0x1.75a91a7f4d2edp126 }, + { 0x1.0a797aeb152f2p125, 0x1.6f7c69d7d3ef8p126 }, + { 0x1.04c7c9f4b969p125, 0x1.695e8cd31867ep126 }, + { 0x1.fe5ce524c8ee5p124, 0x1.634fa54fa285fp126 }, + { 0x1.f35a715b2f3e1p124, 0x1.5d4fd33729015p126 }, + { 0x1.e887bf681f218p124, 0x1.575f3483021c3p126 }, + { 0x1.dde4553ef94dep124, 0x1.517de540ce2a3p126 }, + { 0x1.d36fb7fa50177p124, 0x1.4babff975a04cp126 }, + { 0x1.c9296beb09cf1p124, 0x1.45e99bcbb7915p126 }, + { 0x1.bf10f4a759889p124, 0x1.4036d0468a7a2p126 }, + { 0x1.b525d5198cb1cp124, 0x1.3a93b1998736cp126 }, + { 0x1.ab678f8eabedbp124, 0x1.35005285227f1p126 }, + { 0x1.a1d5a5c4edb96p124, 0x1.2f7cc3fe6f423p126 }, + { 0x1.986f98f9f96c8p124, 0x1.2a09153529381p126 }, + { 0x1.8f34e9f8f93a6p124, 0x1.24a55399ea239p126 }, + { 0x1.8625192879e39p124, 0x1.1f518ae487dc8p126 }, + { 0x1.7d3fa69816db5p124, 0x1.1a0dc51a9934dp126 }, + { 0x1.7484120df1b01p124, 0x1.14da0a961fd14p126 }, + { 0x1.6bf1db13f3983p124, 0x1.0fb6620c550afp126 }, + { 0x1.63888104d811ap124, 0x1.0aa2d09497f2bp126 }, + { 0x1.5b478318ff939p124, 0x1.059f59af7a906p126 }, + { 0x1.532e6073095f2p124, 0x1.00abff4dec7a3p126 }, + { 0x1.4b3c982c338c7p124, 0x1.f79183b101c5bp125 }, + { 0x1.4371a960807f8p124, 0x1.edeb406d9c825p125 }, + { 0x1.3bcd133aa0ffcp124, 0x1.e4652fadcb6b2p125 }, + { 0x1.344e54ffa23b9p124, 0x1.daff4969c0b04p125 }, + { 0x1.2cf4ee1a5f0fcp124, 0x1.d1b982c50137p125 }, + { 0x1.25c05e26b3f99p124, 0x1.c893ce1dcbef7p125 }, + { 0x1.1eb024fc75285p124, 0x1.bf8e1b1ca2279p125 }, + { 0x1.17c3c2ba26319p124, 0x1.b6a856c3ed54fp125 }, + { 0x1.10fab7cf72f94p124, 0x1.ade26b7fbed95p125 }, + { 0x1.0a548507696cp124, 0x1.a53c4135a6526p125 }, + { 0x1.03d0ab9273b94p124, 0x1.9cb5bd549b111p125 }, + { 0x1.fadd5a20258d3p123, 0x1.944ec2e4f563p125 }, + { 0x1.ee5c1730b147cp123, 0x1.8c07329874652p125 }, + { 0x1.e21c938a45a83p123, 0x1.83deeada4d25ap125 }, + { 0x1.d61dd57628999p123, 0x1.7bd5c7df3fe9cp125 }, + { 0x1.ca5ee4649e31fp123, 0x1.73eba3b5b07b7p125 }, + { 0x1.bedec8fddb34p123, 0x1.6c205655be72p125 }, + { 0x1.b39c8d3276d8ap123, 0x1.6473b5b15a7a1p125 }, + { 0x1.a8973c4b5c03ep123, 0x1.5ce595c455b0ap125 }, + { 0x1.9dcde2f93a207p123, 0x1.5575c8a468362p125 }, + { 0x1.933f8f6375f2cp123, 0x1.4e241e912c305p125 }, + { 0x1.88eb51369acb9p123, 0x1.46f066040a832p125 }, + { 0x1.7ed039b24c96bp123, 0x1.3fda6bc016994p125 }, + { 0x1.74ed5bb6bb581p123, 0x1.38e1fae1d6a9dp125 }, + { 0x1.6b41cbd198bc8p123, 0x1.3206dceef5f87p125 }, + { 0x1.61cca04a90795p123, 0x1.2b48d9e5dea1cp125 }, + { 0x1.588cf12f4446bp123, 0x1.24a7b84d38971p125 }, + { 0x1.4f81d85ecc55bp123, 0x1.1e233d434b813p125 }, + { 0x1.46aa7194bd324p123, 0x1.17bb2c8d41535p125 }, + { 0x1.3e05da73b4159p123, 0x1.116f48a6476ccp125 }, + { 0x1.3593328f6abbep123, 0x1.0b3f52ce8c383p125 }, + { 0x1.2d519b7653e1ep123, 0x1.052b0b1a174eap125 }, + { 0x1.254038bac19d6p123, 0x1.fe6460fef468p124 }, + { 0x1.1d5e2ffb96d4p123, 0x1.f2a901ccafb37p124 }, + { 0x1.15aaa8ec85205p123, 0x1.e723726b824a9p124 }, + { 0x1.0e24cd5dd8846p123, 0x1.dbd32ac4c99bp124 }, + { 0x1.06cbc943d255ap123, 0x1.d0b7a0f921e7cp124 }, + { 0x1.ff3d957b29b39p122, 0x1.c5d0497c09e74p124 }, + { 0x1.f13a043742333p122, 0x1.bb1c972f23e5p124 }, + { 0x1.e38b43cbd0f0fp122, 0x1.b09bfb7d11a84p124 }, + { 0x1.d62fbdc2e756bp122, 0x1.a64de673e8837p124 }, + { 0x1.c925e02b41668p122, 0x1.9c31c6df3b1b8p124 }, + { 0x1.bc6c1da1f3121p122, 0x1.92470a61b6965p124 }, + { 0x1.b000ed5b4a626p122, 0x1.888d1d8e510a3p124 }, + { 0x1.a3e2cb2ae9edbp122, 0x1.7f036c0107294p124 }, + { 0x1.9810378b1f299p122, 0x1.75a96077274bap124 }, + { 0x1.8c87b7a37834fp122, 0x1.6c7e64e7281cbp124 }, + { 
0x1.8147d54e9cc33p122, 0x1.6381e2980956bp124 }, + { 0x1.764f1f1f6ddeap122, 0x1.5ab342383d178p124 }, + { 0x1.6b9c28657041ap122, 0x1.5211ebf41880bp124 }, + { 0x1.612d893085125p122, 0x1.499d478bca735p124 }, + { 0x1.5701de53f4d2ep122, 0x1.4154bc68d75c3p124 }, + { 0x1.4d17c968d062bp122, 0x1.3937b1b31925ap124 }, + { 0x1.436df0cfabf1dp122, 0x1.31458e6542847p124 }, + { 0x1.3a02ffb1b7ceep122, 0x1.297db960e4f63p124 }, + { 0x1.30d5a6013afc5p122, 0x1.21df9981f8e53p124 }, + { 0x1.27e49879737d3p122, 0x1.1a6a95b1e786fp124 }, + { 0x1.1f2e909de04d2p122, 0x1.131e14fa1625dp124 }, + { 0x1.16b24cb8f8f92p122, 0x1.0bf97e95f2a64p124 }, + { 0x1.0e6e8fda56cf7p122, 0x1.04fc3a0481321p124 }, + { 0x1.066221d4539d8p122, 0x1.fc4b5e32d6259p123 }, + { 0x1.fd179e7243e3cp121, 0x1.eeea8c1b1db94p123 }, + { 0x1.edd4d2aec5adbp121, 0x1.e1d4cf1e2450ap123 }, + { 0x1.def98c6c79efap121, 0x1.d508f9a1ea64fp123 }, + { 0x1.d0838121f2418p121, 0x1.c885df3451a07p123 }, + { 0x1.c2706fa45005ep121, 0x1.bc4a54a84e834p123 }, + { 0x1.b4be201caa4b4p121, 0x1.b055303221015p123 }, + { 0x1.a76a63fc95c79p121, 0x1.a4a549829587ep123 }, + { 0x1.9a7315f1d6a55p121, 0x1.993979e14fffep123 }, + { 0x1.8dd619d943ca1p121, 0x1.8e109c4622913p123 }, + { 0x1.81915cb0e3323p121, 0x1.83298d717210ep123 }, + { 0x1.75a2d48946eb1p121, 0x1.78832c03aa2b1p123 }, + { 0x1.6a08807632262p121, 0x1.6e1c5893c380bp123 }, + { 0x1.5ec0687e8dcb2p121, 0x1.63f3f5c4de13bp123 }, + { 0x1.53c89d8bb3ddbp121, 0x1.5a08e85af27ep123 }, + { 0x1.491f395818f54p121, 0x1.505a174e9c929p123 }, + { 0x1.3ec25e5d5af12p121, 0x1.46e66be00224p123 }, + { 0x1.34b037c1bbfc5p121, 0x1.3dacd1a8d8ccep123 }, + { 0x1.2ae6f94510dd8p121, 0x1.34ac36ad8dafep123 }, + { 0x1.2164df2d29765p121, 0x1.2be38b6d92415p123 }, + { 0x1.18282e31ba3e8p121, 0x1.2351c2f2d1449p123 }, + { 0x1.0f2f3367cd6aap121, 0x1.1af5d2e04f3f6p123 }, + { 0x1.0678442cc256fp121, 0x1.12ceb37ff9bc3p123 }, + { 0x1.fc037c21c3622p120, 0x1.0adb5fcfa8c75p123 }, + { 0x1.eb940d8319831p120, 0x1.031ad58d56279p123 }, + { 0x1.db9f17e61c31p120, 0x1.f7182a851bca2p122 }, + { 0x1.cc218694238a2p120, 0x1.e85c449e377f3p122 }, + { 0x1.bd18548996419p120, 0x1.da0005e5f28dfp122 }, + { 0x1.ae808c479c371p120, 0x1.cc0180af00a8bp122 }, + { 0x1.a05747a543aa7p120, 0x1.be5ecd2fcb5f9p122 }, + { 0x1.9299afa0246a6p120, 0x1.b1160991ff737p122 }, + { 0x1.8544fc2c8c1dap120, 0x1.a4255a00b9f03p122 }, + { 0x1.785674053e8b9p120, 0x1.978ae8b55ce1bp122 }, + { 0x1.6bcb6c7ad4854p120, 0x1.8b44e6031383ep122 }, + { 0x1.5fa14942c3d54p120, 0x1.7f5188610ddc8p122 }, + { 0x1.53d57c461a5a7p120, 0x1.73af0c737bb45p122 }, + { 0x1.4865856ff632ap120, 0x1.685bb5134ef13p122 }, + { 0x1.3d4ef27bc49a6p120, 0x1.5d55cb54cd53ap122 }, + { 0x1.328f5ec350e67p120, 0x1.529b9e8cf9a1ep122 }, + { 0x1.2824730cacbb4p120, 0x1.482b8455dc491p122 }, + { 0x1.1e0be557fa673p120, 0x1.3e03d891b37dep122 }, + { 0x1.144378ad22027p120, 0x1.3422fd6d12e2bp122 }, + { 0x1.0ac8fce979b96p120, 0x1.2a875b5ffab56p122 }, + { 0x1.019a4e8d69649p120, 0x1.212f612dee7fbp122 }, + { 0x1.f16aad1422a55p119, 0x1.181983e5133ddp122 }, + { 0x1.e030141df7d25p119, 0x1.0f443edc5ce49p122 }, + { 0x1.cf80d4afc3019p119, 0x1.06ae13b0d3255p122 }, + { 0x1.bf5908f50b4ap119, 0x1.fcab1483ea7fcp121 }, + { 0x1.afb4e269693dfp119, 0x1.ec72615a894c4p121 }, + { 0x1.a090a974cfebep119, 0x1.dcaf3691fc448p121 }, + { 0x1.91e8bd0830a74p119, 0x1.cd5ec93c12432p121 }, + { 0x1.83b9923a85f7bp119, 0x1.be7e5ac24963bp121 }, + { 0x1.75ffb3e6519ap119, 0x1.b00b38d6b3575p121 }, + { 0x1.68b7c2479902dp119, 0x1.a202bd6372dcep121 }, + { 0x1.5bde729a6b60fp119, 0x1.94624e78e0fafp121 }, + { 0x1.4f708eb9fba63p119, 
0x1.87275e3a6869ep121 }, + { 0x1.436af4c058acbp119, 0x1.7a4f6aca256cbp121 }, + { 0x1.37ca96a6cd1d4p119, 0x1.6dd7fe335823p121 }, + { 0x1.2c8c79e6f04a3p119, 0x1.61beae53b72b7p121 }, + { 0x1.21adb71c70c75p119, 0x1.56011cc3b036dp121 }, + { 0x1.172b79a7a1181p119, 0x1.4a9cf6bda3f4cp121 }, + { 0x1.0d02ff50ce651p119, 0x1.3f8ff5042a88ep121 }, + { 0x1.033197ec68c0ep119, 0x1.34d7dbc76d7e5p121 }, + { 0x1.f3694a0008381p118, 0x1.2a727a89a3f14p121 }, + { 0x1.e11332d0714c5p118, 0x1.205dac02bd6b9p121 }, + { 0x1.cf5bf1fed1e7p118, 0x1.1697560347b26p121 }, + { 0x1.be3eb08ae7c2p118, 0x1.0d1d69569b82dp121 }, + { 0x1.adb6b810af9e2p118, 0x1.03ede1a45bfeep121 }, + { 0x1.9dbf721b98dfap118, 0x1.f60d8aa2a88f2p120 }, + { 0x1.8e54677bb0151p118, 0x1.e4cc4abf7d065p120 }, + { 0x1.7f713f9cc9784p118, 0x1.d4143a9dfe965p120 }, + { 0x1.7111bfdfb3cep118, 0x1.c3e1a5f5c077cp120 }, + { 0x1.6331caf57b5dbp118, 0x1.b430ecf4a83a8p120 }, + { 0x1.55cd603cc415p118, 0x1.a4fe83fb9db25p120 }, + { 0x1.48e09b21414bfp118, 0x1.9646f35a76624p120 }, + { 0x1.3c67b27d50fe7p118, 0x1.8806d70b2fc36p120 }, + { 0x1.305ef7fdbfb95p118, 0x1.7a3ade6c8b3e5p120 }, + { 0x1.24c2d787b9e37p118, 0x1.6cdfcbfc1e263p120 }, + { 0x1.198fd6a0ee7bdp118, 0x1.5ff2750fe782p120 }, + { 0x1.0ec293d9e6d85p118, 0x1.536fc18f7ce5cp120 }, + { 0x1.0457c63a9669p118, 0x1.4754abacdf1dcp120 }, + { 0x1.f49879624a021p117, 0x1.3b9e3f9d06e3fp120 }, + { 0x1.e139bb05eb49ep117, 0x1.30499b503957fp120 }, + { 0x1.ce8d4b7fd6c7p117, 0x1.2553ee2a336bfp120 }, + { 0x1.bc8d516fda8bap117, 0x1.1aba78ba3af89p120 }, + { 0x1.ab341ee553e25p117, 0x1.107a8c7323a6ep120 }, + { 0x1.9a7c305336484p117, 0x1.06918b6355624p120 }, + { 0x1.8a602b88919cp117, 0x1.f9f9cfd9c3035p119 }, + { 0x1.7adadead962edp117, 0x1.e77448fb66bb9p119 }, + { 0x1.6be73f45149fbp117, 0x1.d58da68fd117p119 }, + { 0x1.5d80693276a6dp117, 0x1.c4412bf4b8f0bp119 }, + { 0x1.4fa19dc42d409p117, 0x1.b38a3af2e55b4p119 }, + { 0x1.424642c28ff75p117, 0x1.a3645330550ffp119 }, + { 0x1.3569e18328604p117, 0x1.93cb11a30d765p119 }, + { 0x1.29082600643fdp117, 0x1.84ba3004a50dp119 }, + { 0x1.1d1cddf5a82dep117, 0x1.762d84469c18fp119 }, + { 0x1.11a3f7ffbbfeap117, 0x1.6821000795a03p119 }, + { 0x1.069982c189a9ep117, 0x1.5a90b00981d93p119 }, + { 0x1.f7f3581a4dc2cp116, 0x1.4d78bba8ca5fdp119 }, + { 0x1.e381802242163p116, 0x1.40d564548fad7p119 }, + { 0x1.cfd6511405b2dp116, 0x1.34a305080681fp119 }, + { 0x1.bcead7f01492fp116, 0x1.28de11c5031ebp119 }, + { 0x1.aab859b20ac9ep116, 0x1.1d83170fbf6fbp119 }, + { 0x1.993851cc9779ap116, 0x1.128eb96be8798p119 }, + { 0x1.886470ad946a7p116, 0x1.07fdb4dafea5fp119 }, + { 0x1.78369a4a2cbd6p116, 0x1.fb99b8b8279e1p118 }, + { 0x1.68a8e4b2fc8c2p116, 0x1.e7f232d9e263p118 }, + { 0x1.59b596b012aaap116, 0x1.d4fed7195d7e8p118 }, + { 0x1.4b572664bd2dcp116, 0x1.c2b9cf7f893bfp118 }, + { 0x1.3d8837fb08d1dp116, 0x1.b11d702b3deb2p118 }, + { 0x1.30439c56dadf6p116, 0x1.a024365f771bdp118 }, + { 0x1.23844fd08cb93p116, 0x1.8fc8c794b03b5p118 }, + { 0x1.174578f6efd5dp116, 0x1.8005f08d6f1efp118 }, + { 0x1.0b826758a086bp116, 0x1.70d6a46e07ddap118 }, + { 0x1.003692548d98bp116, 0x1.6235fbd7a4345p118 }, + { 0x1.eabb2fe335196p115, 0x1.541f340697987p118 }, + { 0x1.d5e6777a83c2ap115, 0x1.468dadf4080abp118 }, + { 0x1.c1e6cb6239574p115, 0x1.397ced7af2b15p118 }, + { 0x1.aeb4423e690e7p115, 0x1.2ce898809244ep118 }, + { 0x1.9c47374a0974ep115, 0x1.20cc76202c5fbp118 }, + { 0x1.8a98484a1e8d3p115, 0x1.15246dda49d47p118 }, + { 0x1.79a0538dd4fc7p115, 0x1.09ec86c75d497p118 }, + { 0x1.695875fb574ap115, 0x1.fe41cd9bb4eeep117 }, + { 0x1.59ba0929261c5p115, 0x1.e97ba3b77f306p117 }, + { 
0x1.4abea183bc47p115, 0x1.d57f524723822p117 }, + { 0x1.3c600c7f477c5p115, 0x1.c245d4b99847ap117 }, + { 0x1.2e984ed53e777p115, 0x1.afc85e0f82e12p117 }, + { 0x1.2161a2cd9d894p115, 0x1.9e005769dbc1dp117 }, + { 0x1.14b67693928cfp115, 0x1.8ce75e9f6f8ap117 }, + { 0x1.08916a956172p115, 0x1.7c7744d9378f7p117 }, + { 0x1.f9da9fde95755p114, 0x1.6caa0d3582fe9p117 }, + { 0x1.e38a4dc27b11bp114, 0x1.5d79eb71e893bp117 }, + { 0x1.ce283a9e3e33p114, 0x1.4ee1429bf7ccp117 }, + { 0x1.b9ab1a96e3b3ep114, 0x1.40daa3c89f5b6p117 }, + { 0x1.a609f7584d32bp114, 0x1.3360ccd23db3ap117 }, + { 0x1.933c2d52c56c9p114, 0x1.266ea71d4f71ap117 }, + { 0x1.8139690c0d187p114, 0x1.19ff4663ae9dfp117 }, + { 0x1.6ff9a4837fa43p114, 0x1.0e0de78654d1ep117 }, + { 0x1.5f7524a8e81a2p114, 0x1.0295ef6591848p117 }, + { 0x1.4fa476e59f668p114, 0x1.ef25d37f49fe1p116 }, + { 0x1.40806eb78e353p114, 0x1.da01102b5f851p116 }, + { 0x1.3202235dada5p114, 0x1.c5b5412dcafadp116 }, + { 0x1.2422ed95a3235p114, 0x1.b23a5a23e421p116 }, + { 0x1.16dc656a14df6p114, 0x1.9f8893d8fd1c1p116 }, + { 0x1.0a2860115569cp114, 0x1.8d986a4187285p116 }, + { 0x1.fc01dbb80c841p113, 0x1.7c629a822bc9ep116 }, + { 0x1.e4c0b066a497p113, 0x1.6be02102b352p116 }, + { 0x1.ce823f4cc4badp113, 0x1.5c0a378c90bcap116 }, + { 0x1.b93bf40d5eccbp113, 0x1.4cda5374ea275p116 }, + { 0x1.a4e3a125adc76p113, 0x1.3e4a23d1f4703p116 }, + { 0x1.916f7c5f2f764p113, 0x1.30538fbb77ecdp116 }, + { 0x1.7ed61b5d3db0ap113, 0x1.22f0b496539bep116 }, + { 0x1.6d0e7045988cbp113, 0x1.161be46ad3b5p116 }, + { 0x1.5c0fc68335b0cp113, 0x1.09cfa445b00ffp116 }, + { 0x1.4bd1bfa2aba3dp113, 0x1.fc0d55470cf51p115 }, + { 0x1.3c4c504792bf8p113, 0x1.e577bbcd49935p115 }, + { 0x1.2d77bd3a382bcp113, 0x1.cfd4a5adec5cp115 }, + { 0x1.1f4c988d02149p113, 0x1.bb1a9657ce465p115 }, + { 0x1.11c3bed8e716ap113, 0x1.a740684026555p115 }, + { 0x1.04d654905dadp113, 0x1.943d4a1d1ed39p115 }, + { 0x1.f0fb86d056745p112, 0x1.8208bc334a6a5p115 }, + { 0x1.d9676faafa27fp112, 0x1.709a8db59f25cp115 }, + { 0x1.c2e43d417197bp112, 0x1.5feada379d8b7p115 }, + { 0x1.ad664518e771bp112, 0x1.4ff207314a102p115 }, + { 0x1.98e25420092dap112, 0x1.40a8c1949f75ep115 }, + { 0x1.854daa4a49b0fp112, 0x1.3207fb7420eb9p115 }, + { 0x1.729df6503422ap112, 0x1.2408e9ba3327fp115 }, + { 0x1.60c95193c542dp112, 0x1.16a501f0e42cap115 }, + { 0x1.4fc63c27c71aep112, 0x1.09d5f819c9e29p115 }, + { 0x1.3f8b98f93052ap112, 0x1.fb2b792b40a22p114 }, + { 0x1.3010aa198de78p112, 0x1.e3bcf436a1a95p114 }, + { 0x1.214d0d298365p112, 0x1.cd55277c18d05p114 }, + { 0x1.1338b7e273194p112, 0x1.b7e94604479dcp114 }, + { 0x1.05cbf4be650abp112, 0x1.a36eec00926ddp114 }, + { 0x1.f1febf7a916aap111, 0x1.8fdc1b2dcf7b9p114 }, + { 0x1.d997c68d65936p111, 0x1.7d2737527c3f9p114 }, + { 0x1.c2556a4e7a90fp111, 0x1.6b4702d7d5849p114 }, + { 0x1.ac2aa7516ade4p111, 0x1.5a329b7d30748p114 }, + { 0x1.970b05888fda2p111, 0x1.49e17724f4d41p114 }, + { 0x1.82ea92dbc1a27p111, 0x1.3a4b60ba9aa4ep114 }, + { 0x1.6fbdddeff308fp111, 0x1.2b6875310f785p114 }, + { 0x1.5d79f11e27f6bp111, 0x1.1d312098e9dbap114 }, + { 0x1.4c144d984e1b8p111, 0x1.0f9e1b4dd36dfp114 }, + { 0x1.3b82e6ba892a4p111, 0x1.02a8673a94692p114 }, + { 0x1.2bbc1d878d272p111, 0x1.ec929a665b449p113 }, + { 0x1.1cb6bc4eaa678p111, 0x1.d4f4b4c8e09edp113 }, + { 0x1.0e69f27a37df3p111, 0x1.be6abbb10a5aap113 }, + { 0x1.00cd508511266p111, 0x1.a8e8cc1fadef6p113 }, + { 0x1.e7b1882bccac5p110, 0x1.94637d5bacfdbp113 }, + { 0x1.cf09287e48bb9p110, 0x1.80cfdc72220cfp113 }, + { 0x1.b792bbc489b04p110, 0x1.6e2367dc27f95p113 }, + { 0x1.a140206ab945p110, 0x1.5c540b4936fd2p113 }, + { 0x1.8c03d2d39119bp110, 
0x1.4b581b8d170fcp113 }, + { 0x1.77d0e6e5bed21p110, 0x1.3b2652b06c2b2p113 }, + { 0x1.649b01d73110ap110, 0x1.2bb5cc22e5db6p113 }, + { 0x1.525654343aad2p110, 0x1.1cfe010e2052dp113 }, + { 0x1.40f79420887c7p110, 0x1.0ef6c4c84a0fep113 }, + { 0x1.3073f7cff4a85p110, 0x1.01984165a5f36p113 }, + { 0x1.20c1303550f0ep110, 0x1.e9b5e8d00ce77p112 }, + { 0x1.11d563e54f40ep110, 0x1.d16f5716c6c1ap112 }, + { 0x1.03a72a2bbdc06p110, 0x1.ba4f035d60e03p112 }, + { 0x1.ec5b0ca2b20f5p109, 0x1.a447b7b03f045p112 }, + { 0x1.d2bfc6210880ap109, 0x1.8f4ccca7fc90dp112 }, + { 0x1.ba6c1c6e87c4p109, 0x1.7b5223dac7336p112 }, + { 0x1.a35068e9c89cfp109, 0x1.684c227fcacefp112 }, + { 0x1.8d5dbaa383b98p109, 0x1.562fac4329b48p112 }, + { 0x1.7885ce9f67cdbp109, 0x1.44f21e49054f2p112 }, + { 0x1.64bb0863504ddp109, 0x1.34894a5e24657p112 }, + { 0x1.51f06ad20e4c3p109, 0x1.24eb7254ccf83p112 }, + { 0x1.4019914f0b53ap109, 0x1.160f438c70913p112 }, + { 0x1.2f2aa92823e8p109, 0x1.07ebd2a2d2844p112 }, + { 0x1.1f186b432c98bp109, 0x1.f4f12e9ab070ap111 }, + { 0x1.0fd8160ca94ap109, 0x1.db5ad0b27805cp111 }, + { 0x1.015f67a552924p109, 0x1.c304efa2c6f4ep111 }, + { 0x1.e749309831666p108, 0x1.abe09e9144b5ep111 }, + { 0x1.cd3caa04cdd1bp108, 0x1.95df988e76644p111 }, + { 0x1.b48774d0f8e45p108, 0x1.80f439b4ee04bp111 }, + { 0x1.9d189f9f85cbfp108, 0x1.6d11788a69c64p111 }, + { 0x1.86e0050236315p108, 0x1.5a2adfa0b4bc4p111 }, + { 0x1.71ce426a561d3p108, 0x1.4834877429b8fp111 }, + { 0x1.5dd4af79906a9p108, 0x1.37231085c7d9ap111 }, + { 0x1.4ae555af52cdfp108, 0x1.26eb9daed6f7ep111 }, + { 0x1.38f2e86f38216p108, 0x1.1783ceac2891p111 }, + { 0x1.27f0bd5d0e6b1p108, 0x1.08e1badf0fcedp111 }, + { 0x1.17d2c50b2bfafp108, 0x1.f5f7d88472604p110 }, + { 0x1.088d83f7e4069p108, 0x1.db92b5212fb8dp110 }, + { 0x1.f42c17ae0ebf6p107, 0x1.c282cd3957edap110 }, + { 0x1.d8c3ea48f2889p107, 0x1.aab7abace48dcp110 }, + { 0x1.beceb1f9f5b3dp107, 0x1.94219bfcb4928p110 }, + { 0x1.a6399674d366bp107, 0x1.7eb1a2075864ep110 }, + { 0x1.8ef2a9a18d857p107, 0x1.6a597219a93dap110 }, + { 0x1.78e8dcd2e6bfdp107, 0x1.570b69502f313p110 }, + { 0x1.640bf6745325ep107, 0x1.44ba864670882p110 }, + { 0x1.504c882a97424p107, 0x1.335a62115bce2p110 }, + { 0x1.3d9be56279ee9p107, 0x1.22df298214423p110 }, + { 0x1.2bec1a4917edbp107, 0x1.133d96ae7e0ddp110 }, + { 0x1.1b2fe32991d5cp107, 0x1.046aeabcfcdecp110 }, + { 0x1.0b5aa42bf5054p107, 0x1.ecb9cfe1d8642p109 }, + { 0x1.f8c0c2e2ce8dep106, 0x1.d21397ead99cbp109 }, + { 0x1.dc6b6f1384e18p106, 0x1.b8d094c86d374p109 }, + { 0x1.c19fa87de37fbp106, 0x1.a0df0f0c626dcp109 }, + { 0x1.a848df650bea7p106, 0x1.8a2e269750a39p109 }, + { 0x1.90538b942ea7cp106, 0x1.74adc8f4064d3p109 }, + { 0x1.79ad1fce5b3d8p106, 0x1.604ea819f007cp109 }, + { 0x1.6443fdcf0c327p106, 0x1.4d0231928c6f9p109 }, + { 0x1.50076ad55cc39p106, 0x1.3aba85fe22e2p109 }, + { 0x1.3ce784b411931p106, 0x1.296a70f414053p109 }, + { 0x1.2ad53760d7287p106, 0x1.1905613b3abf2p109 }, + { 0x1.19c232fd50b88p106, 0x1.097f6156f32c5p109 }, + { 0x1.09a0e254c75ep106, 0x1.f59a20caf6695p108 }, + { 0x1.f4c8c392fb944p105, 0x1.d9c73698fb1dcp108 }, + { 0x1.d800ed59bd026p105, 0x1.bf716c6168baep108 }, + { 0x1.bcd30dfbd611bp105, 0x1.a6852c6b58392p108 }, + { 0x1.a32923130213fp105, 0x1.8eefd70594a89p108 }, + { 0x1.8aee4cd06ec1bp105, 0x1.789fb715aae95p108 }, + { 0x1.740ebfab80eb4p105, 0x1.6383f726a8e04p108 }, + { 0x1.5e77b6bbd2127p105, 0x1.4f8c96f26a26ap108 }, + { 0x1.4a1766b6e5e8ap105, 0x1.3caa61607f92p108 }, + { 0x1.36dcf18a6465cp105, 0x1.2acee2f5ecdb8p108 }, + { 0x1.24b85a8bf0124p105, 0x1.19ec60b1242edp108 }, + { 0x1.139a7b37f8475p105, 0x1.09f5cf4dd2877p108 }, 
+ { 0x1.0374f8792ca97p105, 0x1.f5bd95d8730d8p107 }, + { 0x1.e87470e4f4246p104, 0x1.d9371e2ff7c35p107 }, + { 0x1.cbbab18b73217p104, 0x1.be41de54d155ap107 }, + { 0x1.b0a44aa2f067ep104, 0x1.a4c89e08ef4f3p107 }, + { 0x1.971a1ec0f40c7p104, 0x1.8cb738399b12cp107 }, + { 0x1.7f064a8ba8323p104, 0x1.75fa8dbc84becp107 }, + { 0x1.685414c16188ep104, 0x1.608078a70dcbcp107 }, + { 0x1.52efdf060cd2p104, 0x1.4c37c0394d094p107 }, + { 0x1.3ec7176d784b5p104, 0x1.39100d5687bfep107 }, + { 0x1.2bc82ab9d2302p104, 0x1.26f9df8519bd7p107 }, + { 0x1.19e277461404p104, 0x1.15e6827001f18p107 }, + { 0x1.090640946d2d5p104, 0x1.05c803e4831c1p107 }, + { 0x1.f24946f22d5aep103, 0x1.ed22548cffd35p106 }, + { 0x1.d45f15b49b35ep103, 0x1.d06ad6ecdf971p106 }, + { 0x1.b83349fd05191p103, 0x1.b551c847fbc96p106 }, + { 0x1.9dacb2c432ef4p103, 0x1.9bc09f112b494p106 }, + { 0x1.84b37e1cbf8ebp103, 0x1.83a1ff0aa239dp106 }, + { 0x1.6d3126d74b6ccp103, 0x1.6ce1aa3fd7bddp106 }, + { 0x1.5710631158bffp103, 0x1.576c72b514859p106 }, + { 0x1.423d13a3b73e1p103, 0x1.43302cc4a0da8p106 }, + { 0x1.2ea43465e3995p103, 0x1.301ba221dc9bbp106 }, + { 0x1.1c33cd3c37addp103, 0x1.1e1e857adc568p106 }, + { 0x1.0adae3e73c2b5p103, 0x1.0d2966b1746f7p106 }, + { 0x1.f512dd15b73b7p102, 0x1.fa5b4f49cc6b2p105 }, + { 0x1.d6608dc942687p102, 0x1.dc3ae30b55c16p105 }, + { 0x1.b9823c51276e1p102, 0x1.bfd7555a3bd68p105 }, + { 0x1.9e5ce2f93dd76p102, 0x1.a517d9e61628ap105 }, + { 0x1.84d6fe15b6b93p102, 0x1.8be4f8f6c951fp105 }, + { 0x1.6cd87746bc76bp102, 0x1.74287ded49339p105 }, + { 0x1.564a91cd221fp102, 0x1.5dcd669f2cd34p105 }, + { 0x1.4117d7e2c667dp102, 0x1.48bfd38302871p105 }, + { 0x1.2d2c0909ebeb9p102, 0x1.34ecf8a3c124ap105 }, + { 0x1.1a7409475f2f9p102, 0x1.22430f521cbcfp105 }, + { 0x1.08ddd13bd35e7p102, 0x1.10b1488aeb235p105 }, + { 0x1.f0b0be22d18e8p101, 0x1.0027c00a263a6p105 }, + { 0x1.d1a75065a8c74p101, 0x1.e12ee004efc37p104 }, + { 0x1.b48117843c1c7p101, 0x1.c3e44ae32b16bp104 }, + { 0x1.99218b8ac7f8ep101, 0x1.a854ea14102a8p104 }, + { 0x1.7f6dc6010b4adp101, 0x1.8e6761569f45dp104 }, + { 0x1.674c6ae60d852p101, 0x1.7603bac345f65p104 }, + { 0x1.50a592e3c968ep101, 0x1.5f1353cdad001p104 }, + { 0x1.3b62b6aafb0c8p101, 0x1.4980cb3c80949p104 }, + { 0x1.276e9b681072fp101, 0x1.3537f00b6ad4dp104 }, + { 0x1.14b54042f445bp101, 0x1.2225b12bffc68p104 }, + { 0x1.0323ccdc1a3dcp101, 0x1.10380e1adb7e9p104 }, + { 0x1.e5510173b9a5p100, 0x1.febc107d5efaap103 }, + { 0x1.c6654733b86adp100, 0x1.df0f2a0ee6947p103 }, + { 0x1.a964ed354f984p100, 0x1.c14b2188bcee4p103 }, + { 0x1.8e324c651b064p100, 0x1.a553644f7f07dp103 }, + { 0x1.74b179d1eba81p100, 0x1.8b0cfce0579ep103 }, + { 0x1.5cc82d9070d95p100, 0x1.725e7c5dd20f7p103 }, + { 0x1.465daafca8b1dp100, 0x1.5b2fe547a134p103 }, + { 0x1.315aaa46df48ep100, 0x1.456a974e92e93p103 }, + { 0x1.1da9433aebbcfp100, 0x1.30f93c3699078p103 }, + { 0x1.0b34d93135fcp100, 0x1.1dc7b5b978cf8p103 }, + { 0x1.f3d41033c44ccp99, 0x1.0bc30c5d52f15p103 }, + { 0x1.d36d25268cd2bp99, 0x1.f5b2be65a0c7fp102 }, + { 0x1.b512a1fb1d8fcp99, 0x1.d5f3a8dea7357p102 }, + { 0x1.98a442fc4fc15p99, 0x1.b82915b03515bp102 }, + { 0x1.7e03b1cc6d738p99, 0x1.9c3517e789488p102 }, + { 0x1.651468e010b8ap99, 0x1.81fb7df06136ep102 }, + { 0x1.4dbb989001d84p99, 0x1.6961b8d641d06p102 }, + { 0x1.37e00dac4e8b5p99, 0x1.524ec4d916caep102 }, + { 0x1.236a197bf0b9ap99, 0x1.3cab1343d18d1p102 }, + { 0x1.10437b1569d7ep99, 0x1.2860757487a01p102 }, + { 0x1.fcae93fb7323cp98, 0x1.155a09065d4f7p102 }, + { 0x1.db23c3f816f92p98, 0x1.0384250e4c9fcp102 }, + { 0x1.bbc1a022c14d4p98, 0x1.e59890b926c78p101 }, + { 0x1.9e658108af2ep98, 
0x1.c642116a8a9e3p101 }, + { 0x1.82eedbe410407p98, 0x1.a8e405e651ab6p101 }, + { 0x1.693f22ab61ce9p98, 0x1.8d5f98114f872p101 }, + { 0x1.5139a5f3661fbp98, 0x1.7397c5a66e307p101 }, + { 0x1.3ac3788a1b429p98, 0x1.5b71456c5a4c4p101 }, + { 0x1.25c354b26cb4ep98, 0x1.44d26de513197p101 }, + { 0x1.122182e9a270fp98, 0x1.2fa31d6371537p101 }, + { 0x1.ff8f84418d51p97, 0x1.1bcca373b7b43p101 }, + { 0x1.dd4262aac53e8p97, 0x1.0939ab853339fp101 }, + { 0x1.bd3474ec16ca5p97, 0x1.efac5187b2863p100 }, + { 0x1.9f40fd0082b72p97, 0x1.cf1e86235d0e7p100 }, + { 0x1.8345858c4438dp97, 0x1.b0a68a2128babp100 }, + { 0x1.6921be96b86b1p97, 0x1.9423165bc4444p100 }, + { 0x1.50b75c536f927p97, 0x1.7974e743dea3dp100 }, + { 0x1.39e9f7dcbe479p97, 0x1.607e9eacd105p100 }, + { 0x1.249ef1c3be817p97, 0x1.4924a74dec729p100 }, + { 0x1.10bd565b35393p97, 0x1.334d19e0c216p100 }, + { 0x1.fc5b8748842b2p96, 0x1.1edfa3c5f5ccap100 }, + { 0x1.d9b4a18a38642p96, 0x1.0bc56f1b54701p100 }, + { 0x1.b95cede6d524bp96, 0x1.f3d2185e047d9p99 }, + { 0x1.9b2df77a02225p96, 0x1.d26cb87945e87p99 }, + { 0x1.7f03b935e8e3ap96, 0x1.b334fac4b9f99p99 }, + { 0x1.64bc777824f0ep96, 0x1.96076f7918d1cp99 }, + { 0x1.4c389be9acb83p96, 0x1.7ac2d72fc2c63p99 }, + { 0x1.355a9387de78cp96, 0x1.614801550319ep99 }, + { 0x1.2006aeb6bc768p96, 0x1.4979ac8b28927p99 }, + { 0x1.0c23033e2a376p96, 0x1.333c68e2d0548p99 }, + { 0x1.f32ea02b55d23p95, 0x1.1e767bce37dd7p99 }, + { 0x1.d099c5c770f5ap95, 0x1.0b0fc5b6d05ap99 }, + { 0x1.b05cfe2e99435p95, 0x1.f1e3523b41d7dp98 }, + { 0x1.92508d0743fc9p95, 0x1.d00de6608effep98 }, + { 0x1.764f46cf19f9cp95, 0x1.b0778b7b3301bp98 }, + { 0x1.5c36679625a01p95, 0x1.92fb04ec0f6cfp98 }, + { 0x1.43e56c3e340a7p95, 0x1.77756ec9f78fap98 }, + { 0x1.2d3dee1869201p95, 0x1.5dc61922d5a06p98 }, + { 0x1.182380bd2f494p95, 0x1.45ce65699ff6dp98 }, + { 0x1.047b91fcb6491p95, 0x1.2f71a5f15997p98 }, + { 0x1.e45a9790460c1p94, 0x1.1a94ff571654fp98 }, + { 0x1.c242efeaca76p94, 0x1.071f4bbea09ecp98 }, + { 0x1.a284cb82c31cep94, 0x1.e9f1ff8ddd774p97 }, + { 0x1.84f7a1eb7f7f3p94, 0x1.c818223a202c7p97 }, + { 0x1.697595326d7dcp94, 0x1.a887bd2b4404dp97 }, + { 0x1.4fdb462549af1p94, 0x1.8b1a336c5eb6bp97 }, + { 0x1.3807ab51436a8p94, 0x1.6fab63324088ap97 }, + { 0x1.21dbea9108398p94, 0x1.56197e30205bap97 }, + { 0x1.0d3b35021d695p94, 0x1.3e44e45301b92p97 }, + { 0x1.f4154a787cc1bp93, 0x1.281000bfe4c3fp97 }, + { 0x1.d0623f4f4a28fp93, 0x1.135f28f2d50b4p97 }, + { 0x1.af2e69a26261p93, 0x1.00187dded5975p97 }, + { 0x1.904e0b3aa82a3p93, 0x1.dc479de0ef001p96 }, + { 0x1.73985278fa30ep93, 0x1.bad4fdad3caa1p96 }, + { 0x1.58e7298af87d9p93, 0x1.9baed3ed27ab8p96 }, + { 0x1.401708b7e64c6p93, 0x1.7ead9ce4285bbp96 }, + { 0x1.2906cb94eb40dp93, 0x1.63ac6b4edc88ep96 }, + { 0x1.139788f2dd663p93, 0x1.4a88be2a6390cp96 }, + { 0x1.ff58dab4f2a79p92, 0x1.332259185f1ap96 }, + { 0x1.da552fdd03043p92, 0x1.1d5b1f3793044p96 }, + { 0x1.b7f1f31b571b6p92, 0x1.0916f04b6e18bp96 }, + { 0x1.98006c2117e39p92, 0x1.ec77101de6926p95 }, + { 0x1.7a550f03b145bp92, 0x1.c960bf23153ep95 }, + { 0x1.5ec74662c5961p92, 0x1.a8bd20fc65ef7p95 }, + { 0x1.453141082302ap92, 0x1.8a61745ec7d1dp95 }, + { 0x1.2d6fc2c9e8bcp92, 0x1.6e25d0e756261p95 }, + { 0x1.1761f87a6dc3dp92, 0x1.53e4f7d1666cbp95 }, + { 0x1.02e94eb4ac8a5p92, 0x1.3b7c27a7ddb0ep95 }, + { 0x1.dfd296adef82ap91, 0x1.24caf2c32af14p95 }, + { 0x1.bc8ed301215ebp91, 0x1.0fb3186804d0fp95 }, + { 0x1.9bd5efd2c0f15p91, 0x1.f830c0bb41fd7p94 }, + { 0x1.7d79f2db2d4a5p91, 0x1.d3c0f1a91c846p94 }, + { 0x1.61500f5293f06p91, 0x1.b1e5acf351d87p94 }, + { 0x1.47306f04df3d6p91, 0x1.92712d259ce66p94 }, + { 
0x1.2ef5ff0323b28p91, 0x1.7538c60a04476p94 }, + { 0x1.187e3fb74914dp91, 0x1.5a14b04b47879p94 }, + { 0x1.03a918225a966p91, 0x1.40dfd87456f4cp94 }, + { 0x1.e0b15822be4ep90, 0x1.2977b1172b9d5p94 }, + { 0x1.bce26a2fb7176p90, 0x1.13bc07e891491p94 }, + { 0x1.9bb1bc445c3c6p90, 0x1.ff1dbb4300811p93 }, + { 0x1.7cef42e9a617dp90, 0x1.d9a880f306bd8p93 }, + { 0x1.606e51e0a4963p90, 0x1.b6e45220b55ep93 }, + { 0x1.460560e841d79p90, 0x1.96a0b33f2c4dap93 }, + { 0x1.2d8dd47a40ad8p90, 0x1.78b07e9e924acp93 }, + { 0x1.16e3ca3d4393fp90, 0x1.5ce9ab1670dd2p93 }, + { 0x1.01e5e8edda47bp90, 0x1.4325167006bbp93 }, + { 0x1.dcea670907819p89, 0x1.2b3e53538ff3fp93 }, + { 0x1.b8e9bec48816dp89, 0x1.15137a7f44864p93 }, + { 0x1.97945aa1c9c35p89, 0x1.0084ff125639dp93 }, + { 0x1.78b88a4e7107bp89, 0x1.daeb0b7311ec7p92 }, + { 0x1.5c2827c986b62p89, 0x1.b7937d1c40c53p92 }, + { 0x1.41b858361b0fep89, 0x1.96d082f59ab06p92 }, + { 0x1.294150fb19119p89, 0x1.7872d9fa10aadp92 }, + { 0x1.129e20e732adcp89, 0x1.5c4e8e37bc7dp92 }, + { 0x1.fb58fa290d436p88, 0x1.423ac0df49a4p92 }, + { 0x1.d499229819bc6p88, 0x1.2a117230ad284p92 }, + { 0x1.b0c1a759f7739p88, 0x1.13af4f04f9998p92 }, + { 0x1.8f9bb6c075486p88, 0x1.fde703724e56p91 }, + { 0x1.70f4744735c2bp88, 0x1.d77f0c82e7641p91 }, + { 0x1.549cb0f7ef8e2p88, 0x1.b3ee02611d7ddp91 }, + { 0x1.3a68a8c1234e1p88, 0x1.92ff33023d5bdp91 }, + { 0x1.222fc469e8b8cp88, 0x1.7481a9e69f53fp91 }, + { 0x1.0bcc5fd30f1ddp88, 0x1.5847eda620959p91 }, + { 0x1.ee3728761897bp87, 0x1.3e27c1fcc74bdp91 }, + { 0x1.c7fa0c7e3bac7p87, 0x1.25f9ee0b923dcp91 }, + { 0x1.a4a56eb132a54p87, 0x1.0f9a0686532p91 }, + { 0x1.8401b5336a8ap87, 0x1.f5cc7718082bp90 }, + { 0x1.65db58e2358c1p87, 0x1.cf7e53d6a2ca5p90 }, + { 0x1.4a029a7ea7cd1p87, 0x1.ac0f5f3229372p90 }, + { 0x1.304b3d1961171p87, 0x1.8b498644847eap90 }, + { 0x1.188c45630dc53p87, 0x1.6cfa9bcca59dcp90 }, + { 0x1.029fbd8b92835p87, 0x1.50f411d4fd2cdp90 }, + { 0x1.dcc4fabf32f1cp86, 0x1.370ab8327af5ep90 }, + { 0x1.b767ecb334a7ep86, 0x1.1f167f88c6b6ep90 }, + { 0x1.94ec06c0ff29fp86, 0x1.08f24085d4597p90 }, + { 0x1.751977e5803d3p86, 0x1.e8f70e181d61ap89 }, + { 0x1.57bc950253825p86, 0x1.c324c20e337dcp89 }, + { 0x1.3ca58b816a87fp86, 0x1.a03261574b54ep89 }, + { 0x1.23a8197d2607ep86, 0x1.7fe903cdf5855p89 }, + { 0x1.0c9b4b0a6a16fp86, 0x1.6215c58da345p89 }, + { 0x1.eeb27891d2bb3p85, 0x1.46897d4b69fc6p89 }, + { 0x1.c77dbfc848866p85, 0x1.2d1877d731b7bp89 }, + { 0x1.a357936adf17bp85, 0x1.159a386b11517p89 }, + { 0x1.8203fa7992554p85, 0x1.ffd27ae9393cep88 }, + { 0x1.634b7f56b0a5cp85, 0x1.d7c593130dd0bp88 }, + { 0x1.46fada7e6a5fep85, 0x1.b2cd607c79bcfp88 }, + { 0x1.2ce2a3690576bp85, 0x1.90ae4d3405651p88 }, + { 0x1.14d707280e6cfp85, 0x1.71312dd1759e2p88 }, + { 0x1.fd5f08ad2b29ap84, 0x1.5422ef5d8949dp88 }, + { 0x1.d48d57f7718b7p84, 0x1.39544b0ecc957p88 }, + { 0x1.aef3ce0add578p84, 0x1.20997f73e73ddp88 }, + { 0x1.8c52800f939c8p84, 0x1.09ca0eaacd277p88 }, + { 0x1.6c6e61e57bf9bp84, 0x1.e9810295890ecp87 }, + { 0x1.4f10e8ebc44a9p84, 0x1.c2b45b5aa4a1dp87 }, + { 0x1.3407b59d72a5bp84, 0x1.9eee068fa7596p87 }, + { 0x1.1b2443858c0a1p84, 0x1.7df2b399c10a8p87 }, + { 0x1.043b9f1621ff3p84, 0x1.5f8b87a31bd85p87 }, + { 0x1.de4c41eb96b45p83, 0x1.4385c96e9a2d9p87 }, + { 0x1.b77e5cbd5d147p83, 0x1.29b2933ef4cbcp87 }, + { 0x1.93c9fc62bfb11p83, 0x1.11e68a6378f8ap87 }, + { 0x1.72f0c4c8e9bffp83, 0x1.f7f338086a86bp86 }, + { 0x1.54b92affb11afp83, 0x1.cf8d7d9ce040ap86 }, + { 0x1.38ee17b150182p83, 0x1.aa577251ae485p86 }, + { 0x1.1f5e908f70e0cp83, 0x1.8811d739efb5fp86 }, + { 0x1.07dd6833bb38p83, 0x1.68823e52970bep86 }, + { 
0x1.e481e7f6ac4bcp82, 0x1.4b72ae68e8b4cp86 }, + { 0x1.bcc58edad5559p82, 0x1.30b14dbe876bcp86 }, + { 0x1.983ee9896d582p82, 0x1.181012ef8661p86 }, + { 0x1.76aca47764427p82, 0x1.01647ba798745p86 }, + { 0x1.57d287836bd3dp82, 0x1.d90e917701675p85 }, + { 0x1.3b79118c097a1p82, 0x1.b2a87e86d0c8ap85 }, + { 0x1.216d1b97279a9p82, 0x1.8f53dcb377293p85 }, + { 0x1.097f82fc04025p82, 0x1.6ed2f2515e933p85 }, + { 0x1.e709b415656dp81, 0x1.50ecc9ed47f19p85 }, + { 0x1.beaa3d6c15504p81, 0x1.356cd5ce7799ep85 }, + { 0x1.9996ed9b83967p81, 0x1.1c229a587ab78p85 }, + { 0x1.778be2bd9795bp81, 0x1.04e15ecc7f3f6p85 }, + { 0x1.584a99af8a842p81, 0x1.deffc7e6a6017p84 }, + { 0x1.3b99832cbefddp81, 0x1.b7b040832f31p84 }, + { 0x1.2143a112d0466p81, 0x1.938e021f36d76p84 }, + { 0x1.09182b326b229p81, 0x1.7258610b3b233p84 }, + { 0x1.e5d47637f5db5p80, 0x1.53d3bfc82a909p84 }, + { 0x1.bd20fcc3b76d7p80, 0x1.37c92babdc2fdp84 }, + { 0x1.97c9dda748fc7p80, 0x1.1e06010120f6ap84 }, + { 0x1.7589207e91ad1p80, 0x1.065b9616170d4p84 }, + { 0x1.561e669aa7fdbp80, 0x1.e13dd96b3753bp83 }, + { 0x1.394e7a2ac9fc7p80, 0x1.b950d32467392p83 }, + { 0x1.1ee2e61eccc99p80, 0x1.94a72263259a5p83 }, + { 0x1.06a996198f06fp80, 0x1.72fd93e036cdcp83 }, + { 0x1.e0e8fbad2703ep79, 0x1.54164576929abp83 }, + { 0x1.b8328ee330ae9p79, 0x1.37b83c521fe96p83 }, + { 0x1.92e21013a767p79, 0x1.1daf033182e96p83 }, + { 0x1.70aff489136ebp79, 0x1.05ca50205d26ap83 }, + { 0x1.515a7c77fab48p79, 0x1.dfbb6235639fap82 }, + { 0x1.34a53ce0bbb6fp79, 0x1.b7807e294781fp82 }, + { 0x1.1a58b2b09fdcbp79, 0x1.9298add70a734p82 }, + { 0x1.0241de6c31e5bp79, 0x1.70beaf9c7ffb6p82 }, + { 0x1.d863cf753825cp78, 0x1.51b2cd6709222p82 }, + { 0x1.affb906d0ae09p78, 0x1.353a6cf7f7fffp82 }, + { 0x1.8afbf9e9520c2p78, 0x1.1b1fa8cbe84a7p82 }, + { 0x1.691c7c768becep78, 0x1.0330f0fd69921p82 }, + { 0x1.4a1a79df39cdep78, 0x1.da81670f96f9bp81 }, + { 0x1.2db8ca9009091p78, 0x1.b24a16b4d09aap81 }, + { 0x1.13bf4cb384e4ap78, 0x1.8d6eeb6efdbd6p81 }, + { 0x1.f7f4f88751db4p77, 0x1.6ba91ac734786p81 }, + { 0x1.cc7626bced452p77, 0x1.4cb7966770ab5p81 }, + { 0x1.a4ab6470c1c5cp77, 0x1.305e9721d0981p81 }, + { 0x1.80451c2811052p77, 0x1.1667311fff70ap81 }, + { 0x1.5efa4d64f59f6p77, 0x1.fd3de10d62855p80 }, + { 0x1.40880373ed74p77, 0x1.d1aefbcd48d0cp80 }, + { 0x1.24b0d7368076ep77, 0x1.a9cc93c25aca9p80 }, + { 0x1.0b3c7b0d960fp77, 0x1.85487ee3ea735p80 }, + { 0x1.e7eea02e4ed88p76, 0x1.63daf8b4b1e0cp80 }, + { 0x1.bd6408059b696p76, 0x1.45421e69a6ca1p80 }, + { 0x1.96826d9e90341p76, 0x1.294175802d99ap80 }, + { 0x1.72fa4fa12d516p76, 0x1.0fa17bf41068fp80 }, + { 0x1.5282d2d5803fep76, 0x1.f05e82aae2bb9p79 }, + { 0x1.34d935f1be064p76, 0x1.c578101b29058p79 }, + { 0x1.19c050c56d0d7p76, 0x1.9e39dc5dd2f7cp79 }, + { 0x1.01001dd9c7ccep76, 0x1.7a553a728bbf2p79 }, + { 0x1.d4ca9b634ecbap75, 0x1.5982008db1304p79 }, + { 0x1.ab81c5c80cf39p75, 0x1.3b7e00422e51bp79 }, + { 0x1.85cfacb7477f2p75, 0x1.200c898d9ee3ep79 }, + { 0x1.6365862923eb9p75, 0x1.06f5f7eb65a56p79 }, + { 0x1.43fb317b5dc37p75, 0x1.e00e9148a1d25p78 }, + { 0x1.274ea96044bd7p75, 0x1.b623734024e92p78 }, + { 0x1.0d23817479c67p75, 0x1.8fd4e01891bf8p78 }, + { 0x1.ea84dd159259p74, 0x1.6cd44c7470d89p78 }, + { 0x1.bef1b1a12823ep74, 0x1.4cd9c04158cd7p78 }, + { 0x1.9730edfda64acp74, 0x1.2fa34bf5c8344p78 }, + { 0x1.72ede3b7eaa25p74, 0x1.14f4890ff2461p78 }, + { 0x1.51db1ec3a3087p74, 0x1.f92c49dfa4df5p77 }, + { 0x1.33b1c9d1576ecp74, 0x1.ccaaea71ab0dfp77 }, + { 0x1.18311f8a03acap74, 0x1.a40829f001197p77 }, + { 0x1.fe3bcf4629feap73, 0x1.7eef13b59e96cp77 }, + { 0x1.d083fda665164p73, 0x1.5d11e1a252bf5p77 }, + { 
0x1.a6d7d18831888p73, 0x1.3e296303b2297p77 }, + { 0x1.80dcd6603df1bp73, 0x1.21f47009f43cep77 }, + { 0x1.5e4062d5b6a4ep73, 0x1.083768c5e4542p77 }, + { 0x1.3eb6ef47c2758p73, 0x1.e1777d831265fp76 }, + { 0x1.21fb7a81c5444p73, 0x1.b69f10b0191b5p76 }, + { 0x1.07cefb734d68bp73, 0x1.8f8a3a05b5b53p76 }, + { 0x1.dfefbdb19ac7ep72, 0x1.6be573c40c8e7p76 }, + { 0x1.b4831fb12344p72, 0x1.4b645ba991fdbp76 }, + { 0x1.8cf81557d20b6p72, 0x1.2dc119095729fp76 }, + { 0x1.68f6f0feb4755p72, 0x1.12bbcfa4d62dep76 }, + { 0x1.482fa78c40635p72, 0x1.f4343c7d504b9p75 }, + { 0x1.2a59289a484fbp72, 0x1.c74d4fe1e0e8bp75 }, + { 0x1.0f30c4d0be5cp72, 0x1.9e614ecbf4af6p75 }, + { 0x1.ecf3428c48d4fp71, 0x1.791716475420cp75 }, + { 0x1.bff86d9ec8499p71, 0x1.571d34563050ap75 }, + { 0x1.970bb87f4ae14p71, 0x1.3829407a207d8p75 }, + { 0x1.71d0b55b79b86p71, 0x1.1bf74244aed5ap75 }, + { 0x1.4ff315d036fbdp71, 0x1.024924c7520d1p75 }, + { 0x1.3125f6a3d257p71, 0x1.d5cc6ba567f29p74 }, + { 0x1.15233ae8815f2p71, 0x1.ab3560167ccaap74 }, + { 0x1.f755ea760487dp70, 0x1.846e9dda7a163p74 }, + { 0x1.c905bbd9ab5a6p70, 0x1.6121d7db32bddp74 }, + { 0x1.9eebaa0589b4ep70, 0x1.410047ead6894p74 }, + { 0x1.78a6de0f41b89p70, 0x1.23c2090cdde78p74 }, + { 0x1.55df1790f2f61p70, 0x1.09257fca001cp74 }, + { 0x1.3643ec463a3cfp70, 0x1.e1dd9ec677783p73 }, + { 0x1.198c18435598dp70, 0x1.b5ceb5a13221bp73 }, + { 0x1.fee9bab9f4e14p69, 0x1.8dbaa11de2037p73 }, + { 0x1.cf82e0eb6196bp69, 0x1.694680a9a3ee6p73 }, + { 0x1.a474e7029a919p69, 0x1.481f73b3778e8p73 }, + { 0x1.7d5af6513e2bep69, 0x1.29f9e7d8fd094p73 }, + { 0x1.59d93e1d8f57dp69, 0x1.0e90f64b5b103p73 }, + { 0x1.399c279e4699ap69, 0x1.eb4b9e47b58c9p72 }, + { 0x1.1c579bbca6885p69, 0x1.bdfe62f60dd7p72 }, + { 0x1.01c659160612dp69, 0x1.94d1de5c4576fp72 }, + { 0x1.d352b1ae2694p68, 0x1.6f66f6ab90c3cp72 }, + { 0x1.a78e8252c204dp68, 0x1.4d67050b31c2ap72 }, + { 0x1.7fd7c80f3410ep68, 0x1.2e8318008cf89p72 }, + { 0x1.5bcf92cc55d86p68, 0x1.1273463a1589bp72 }, + { 0x1.3b1f876b10da7p68, 0x1.f1ec20afad0e2p71 }, + { 0x1.1d791bb1324a1p68, 0x1.c39fa0d4a5a2bp71 }, + { 0x1.0294e37abcee8p68, 0x1.99946bf7e02a1p71 }, + { 0x1.d463db5fa3c13p67, 0x1.73679b24aeb9bp71 }, + { 0x1.a82a5f4047a5bp67, 0x1.50bf2558ab78fp71 }, + { 0x1.8011fb05fe09p67, 0x1.314916abfa1eap71 }, + { 0x1.5bb91decf8a58p67, 0x1.14bad9006f53bp71 }, + { 0x1.3ac71ce35c1d3p67, 0x1.f5a1196b5bb2ep70 }, + { 0x1.1ceb656955c59p67, 0x1.c698e001f6d3p70 }, + { 0x1.01dcc2acf7755p67, 0x1.9beca74b0f147p70 }, + { 0x1.d2b166911c178p66, 0x1.753637caac6d9p70 }, + { 0x1.a6459c5b11342p66, 0x1.5218993857afcp70 }, + { 0x1.7e086accc805dp66, 0x1.323f3f19cff3ep70 }, + { 0x1.59962aef547b3p66, 0x1.155d47fdb9c94p70 }, + { 0x1.3894608650edep66, 0x1.f6599b70323cap69 }, + { 0x1.1ab0e4d284f44p66, 0x1.c6dc8a4bb3ba6p69 }, + { 0x1.ff4248ebb8299p65, 0x1.9bcfd83a431e9p69 }, + { 0x1.ce42dd8e4fa23p65, 0x1.74ca889bbacd5p69 }, + { 0x1.a1e8aa1400997p65, 0x1.516d33e26c04p69 }, + { 0x1.79c430435a7fcp65, 0x1.31612a7ef535fp69 }, + { 0x1.557046eb39249p65, 0x1.1457ab75c2489p69 }, + { 0x1.349127b59b217p65, 0x1.f41259c9550cp68 }, + { 0x1.16d392dff5104p65, 0x1.c46969ca99a2ep68 }, + { 0x1.f7d80dc993f2fp64, 0x1.993e82b76e726p68 }, + { 0x1.c72c149cb214bp64, 0x1.72267ac1b25ap68 }, + { 0x1.9b270c24cc8fap64, 0x1.4ec0062aeeb78p68 }, + { 0x1.73585df7b6643p64, 0x1.2eb2d18a2081bp68 }, + { 0x1.4f59f9910367ep64, 0x1.11aeb0b11d1a1p68 }, + { 0x1.2ecf5b7f6abe3p64, 0x1.eed5c0bbf1061p67 }, + { 0x1.1164ab45aa235p64, 0x1.bf4ab21b4f3fp67 }, + { 0x1.ed9bdbc6f1b0ap63, 0x1.944462d4d5991p67 }, + { 0x1.bd8c96533b39bp63, 0x1.6d561de54f6a1p67 }, + { 
0x1.921ec84d5860ep63, 0x1.4a1d472804fc8p67 }, + { 0x1.6ae172414cebap63, 0x1.2a406e25fcb44p67 }, + { 0x1.476e3b661be8cp63, 0x1.0d6e7662dda9dp67 }, + { 0x1.276873924f0b4p63, 0x1.e6bba6770e22dp66 }, + { 0x1.0a7c2c9322f59p63, 0x1.b797ab2ba22d2p66 }, + { 0x1.e0bad18c4e37dp62, 0x1.8cf813910fdcdp66 }, + { 0x1.b18eba0be4d24p62, 0x1.666f488db6e0ap66 }, + { 0x1.86f7884e1caadp62, 0x1.4399f7770045fp66 }, + { 0x1.608484d592328p62, 0x1.241e1ebbbf4ecp66 }, + { 0x1.3dcfaee52a8f5p62, 0x1.07aa30ce6a5ap66 }, + { 0x1.1e7cbac093f27p62, 0x1.dbe8969a24c6fp65 }, + { 0x1.023827dc88ed9p62, 0x1.ad7301258d788p65 }, + { 0x1.d16cd999791c3p61, 0x1.837a640fa9d3dp65 }, + { 0x1.a3666de0788bp61, 0x1.5d90f358d61f6p65 }, + { 0x1.79e17816df1e8p61, 0x1.3b5342f7be9cp65 }, + { 0x1.546e385224d1p61, 0x1.1c674ecd152d3p65 }, + { 0x1.32a7a483e977bp61, 0x1.007b997a0b531p65 }, + { 0x1.1432649c86c4dp61, 0x1.ce8cc007a6432p64 }, + { 0x1.f177ce0bd5836p60, 0x1.a109c0bccbc39p64 }, + { 0x1.bff3166bc36eep60, 0x1.77f5624913c3ap64 }, + { 0x1.934fc0975fb3p60, 0x1.52e251d5d3b1fp64 }, + { 0x1.6b13ebb9a5ad4p60, 0x1.316da780bc4d9p64 }, + { 0x1.46d17a80cc174p60, 0x1.133deb1d3526p64 }, + { 0x1.2624f3a0a887p60, 0x1.f00460b24acf8p63 }, + { 0x1.08b47d7733cb6p60, 0x1.bee2903d584f9p63 }, + { 0x1.dc5de496b181p59, 0x1.92920a7c80e26p63 }, + { 0x1.ac9615b3c9fd7p59, 0x1.6a9b25345c773p63 }, + { 0x1.818d3a356669ep59, 0x1.4691b26b9c82fp63 }, + { 0x1.5acbdab2ed713p59, 0x1.2613e9610f6d1p63 }, + { 0x1.37e61fd4c0fep59, 0x1.08c969adf0beap63 }, + { 0x1.187ab3d71db11p59, 0x1.dcc4ac4f59be5p62 }, + { 0x1.f8637ea4e52acp58, 0x1.ad2d0a9a18288p62 }, + { 0x1.c577fd709b099p58, 0x1.82498a7cc94b9p62 }, + { 0x1.97a3dc62119c8p58, 0x1.5ba462dee8a02p62 }, + { 0x1.6e66137bb7ccap58, 0x1.38d330d8806ap62 }, + { 0x1.494a3f6a9a70ep58, 0x1.1975e0627306cp62 }, + { 0x1.27e767bb79ea2p58, 0x1.fa6b5ee8f3088p61 }, + { 0x1.09dee32687729p58, 0x1.c78892308bd9p61 }, + { 0x1.ddb6ae2f39381p57, 0x1.99b5ec6741cb3p61 }, + { 0x1.ad1f9fba4b2abp57, 0x1.7073c400e10dcp61 }, + { 0x1.816dde4c11ca3p57, 0x1.4b4ee0b3a84d6p61 }, + { 0x1.5a245d5e5289cp57, 0x1.29df4862ac231p61 }, + { 0x1.36d26a686daafp57, 0x1.0bc7294e0cbafp61 }, + { 0x1.171277cbbce9cp57, 0x1.e163bd8df864p60 }, + { 0x1.f5120b45c00e6p56, 0x1.b0a61bce91993p60 }, + { 0x1.c1c74b30d0bbp56, 0x1.84cbb00f925fp60 }, + { 0x1.93b02e5cf0324p56, 0x1.5d5841ce6cb73p60 }, + { 0x1.6a46f43f3118cp56, 0x1.39dbcd485dd07p60 }, + { 0x1.45132973bb79bp56, 0x1.19f153b38a108p60 }, + { 0x1.23a85891dc72bp56, 0x1.fa7b9159fc471p59 }, + { 0x1.05a4dba466c4ep56, 0x1.c6de3429e31fap59 }, + { 0x1.d561964307dc4p55, 0x1.98769faac8a1bp59 }, + { 0x1.a4fa0f13737e8p55, 0x1.6ebf82977acfp59 }, + { 0x1.7984b636ad1bep55, 0x1.4940bc89fa5aap59 }, + { 0x1.5281628cb373ap55, 0x1.278e135bcf0a4p59 }, + { 0x1.2f7cc38bc628dp55, 0x1.0946088b6f8edp59 }, + { 0x1.100f1aef8eaf5p55, 0x1.dc21972b9e9f4p58 }, + { 0x1.e7b62ce66acdep54, 0x1.ab3e8cfada51ap58 }, + { 0x1.b5198cf325114p54, 0x1.7f5483f729c27p58 }, + { 0x1.87b15da6677afp54, 0x1.57e33e2b1c6dap58 }, + { 0x1.5ef5de2e68985p54, 0x1.3477480d89e25p58 }, + { 0x1.3a6d00852a688p54, 0x1.14a8b54629fb2p58 }, + { 0x1.19a90b14f53afp54, 0x1.f033fa073d52p57 }, + { 0x1.f88eba04114cbp53, 0x1.bcede5acc0d4p57 }, + { 0x1.c3dea36b87937p53, 0x1.8ee7b29d0b081p57 }, + { 0x1.94a28136fa731p53, 0x1.659917bbb6632p57 }, + { 0x1.6a4b2c9663fa1p53, 0x1.40877b79cd868p57 }, + { 0x1.44580945b8452p53, 0x1.1f44979177348p57 }, + { 0x1.22558f1aa9f03p53, 0x1.016d3f035816p57 }, + { 0x1.03dbf8db89298p53, 0x1.cd508600d0ba8p56 }, + { 0x1.d11c2965639f6p52, 0x1.9d4ae77a21604p56 }, + { 0x1.a03065db54a4bp52, 
0x1.723974e9529d8p56 }, + { 0x1.745e6013d8cf3p52, 0x1.4b9a944f57915p56 }, + { 0x1.4d1f2eb8531p52, 0x1.28f9c9b769ee3p56 }, + { 0x1.29f9b7c4f56dfp52, 0x1.09ee66b6e99e9p56 }, + { 0x1.0a814a1dfc5edp52, 0x1.dc34b6999ff72p55 }, + { 0x1.dca8b63e38fa9p51, 0x1.aa5249b4cca57p55 }, + { 0x1.aa36c9242f8bcp51, 0x1.7d9db080918bap55 }, + { 0x1.7d0fbfa6c3c19p51, 0x1.558e88e8945efp55 }, + { 0x1.54a6b679dd96fp51, 0x1.31aa564e92066p55 }, + { 0x1.307d4e71272d7p51, 0x1.11831a9c3763dp55 }, + { 0x1.1022313b11381p51, 0x1.e96c265c21fbfp54 }, + { 0x1.e65f78e13edcdp50, 0x1.b5d52c19374fep54 }, + { 0x1.b2959e487c93fp50, 0x1.87a2188252d5fp54 }, + { 0x1.84436cf62b6f8p50, 0x1.5e440cc8caaf9p54 }, + { 0x1.5ad66c67f3f63p50, 0x1.393ad199301dep54 }, + { 0x1.35cb549c616ebp50, 0x1.18135a0647102p54 }, + { 0x1.14ac7e9322a1ap50, 0x1.f4ccd98eab06bp53 }, + { 0x1.ee20fae75a2c5p49, 0x1.bfaedff2748c1p53 }, + { 0x1.b931b883c77f2p49, 0x1.9026a7e3c9538p53 }, + { 0x1.89e1f8e1d4be6p49, 0x1.659f3419269eep53 }, + { 0x1.5f9a24050e89fp49, 0x1.3f92e9472ca4cp53 }, + { 0x1.39d2746cbe57fp49, 0x1.1d89fb6602df9p53 }, + { 0x1.18115431b6c4ap49, 0x1.fe32077e095c4p52 }, + { 0x1.f3d3ca19edf64p48, 0x1.c7bf775863df5p52 }, + { 0x1.bdf55dd9bdcep48, 0x1.970fb0b5580dcp52 }, + { 0x1.8dd8e25d2255dp48, 0x1.6b88087e4af9fp52 }, + { 0x1.62e225ebca19p48, 0x1.449de67f2c6b2p52 }, + { 0x1.3c855ef212badp48, 0x1.21d51dc348d4dp52 }, + { 0x1.1a4576cd5cddcp48, 0x1.02be7023a443ep52 }, + { 0x1.f765035c713d8p47, 0x1.cdec7155697e1p51 }, + { 0x1.c0d0bdeb46ae2p47, 0x1.9c4671c1a6e3cp51 }, + { 0x1.901afbd3819bep47, 0x1.6feb0af26f865p51 }, + { 0x1.64a386137b955p47, 0x1.484b1e63b3be4p51 }, + { 0x1.3ddb15521ce49p47, 0x1.24e68a1458bd7p51 }, + { 0x1.1b418ba2217c6p47, 0x1.054a9a7c2f05ap51 }, + { 0x1.f8c8bad8e2a2p46, 0x1.d2214ad33ca5ep50 }, + { 0x1.c1ba4950b8f4fp46, 0x1.9fb9933adac68p50 }, + { 0x1.90a0b40dd690cp46, 0x1.72b99eccc462ep50 }, + { 0x1.64d860502b279p46, 0x1.4a8e4dbe3539cp50 }, + { 0x1.3dcf1aadc099dp46, 0x1.26b4018ef81f7p50 }, + { 0x1.1b02414a73357p46, 0x1.06b4fe82cc6aep50 }, + { 0x1.f7fa3e4bec2aep45, 0x1.d44feffb34893p49 }, + { 0x1.c0aee6d6b1406p45, 0x1.a15d86bb23572p49 }, + { 0x1.8f684065398bfp45, 0x1.73ea5ac0d71a9p49 }, + { 0x1.637ff9397e989p45, 0x1.4b5fdd0f567fap49 }, + { 0x1.3c618d3c706ebp45, 0x1.2737769828878p49 }, + { 0x1.1988625955723p45, 0x1.06f8da87263cep49 }, + { 0x1.f4fc2f6d50e41p44, 0x1.d4710a9e149edp48 }, + { 0x1.bdb204ff1cda3p44, 0x1.a12cc7b1bf616p48 }, + { 0x1.8c75a6fa17116p44, 0x1.73793d6253bd7p48 }, + { 0x1.609ec277b8703p44, 0x1.4abd0af44c7f8p48 }, + { 0x1.399725d96eb63p44, 0x1.266f2e981ccfbp48 }, + { 0x1.16d8d1241b86bp44, 0x1.06154a07d21a2p48 }, + { 0x1.efd875a51d28dp43, 0x1.d2842b40e25fp47 }, + { 0x1.b8cd873c4de72p43, 0x1.9f27fa465d061p47 }, + { 0x1.87d2a89e5ac65p43, 0x1.7167c3937ded9p47 }, + { 0x1.5c3e42539c769p43, 0x1.48a7fb96552cap47 }, + { 0x1.35791e04cd29fp43, 0x1.245dcbaa25b1bp47 }, + { 0x1.12fc6cdafd10dp43, 0x1.040d4ab2de626p47 }, + { 0x1.e8a0077a1ed47p42, 0x1.ce8fcb8dadc2cp46 }, + { 0x1.b2118f75a4eb7p42, 0x1.9b55e7c11d9e6p46 }, + { 0x1.818e8b1c2616fp42, 0x1.6dbce02ec5c77p46 }, + { 0x1.566cdf4525ebp42, 0x1.4527acab6dfebp46 }, + { 0x1.3014fd204bc71p42, 0x1.210a3ddcb4706p46 }, + { 0x1.0dffe0bfc0c74p42, 0x1.00e7aba6527c9p46 }, + { 0x1.df6a8d5e14f11p41, 0x1.c8a12a152d814p45 }, + { 0x1.a9942579915cdp41, 0x1.95c35893651c9p45 }, + { 0x1.79bdc576e403ap41, 0x1.6884d52cc9914p45 }, + { 0x1.4f3d9114d799bp41, 0x1.4047ce663f641p45 }, + { 0x1.297c4e6eb62fcp41, 0x1.1c7f9c74f3e7cp45 }, + { 0x1.07f35ef1a4fcp41, 0x1.f95dcee779f74p44 }, + { 0x1.d455e0a3b0d94p40, 
0x1.c0cc007cc808ep44 }, + { 0x1.9f70bf04a77cep40, 0x1.8e82cd2a6133cp44 }, + { 0x1.707990a8defefp40, 0x1.61d0ef76712e4p44 }, + { 0x1.46c779ebb14aep40, 0x1.3a1882865d26ep44 }, + { 0x1.21c4420bc9879p40, 0x1.16cce86450b2p44 }, + { 0x1.00ea48df1e7fbp40, 0x1.eee1d41e1e516p43 }, + { 0x1.c7856a7693627p39, 0x1.b72a1658393d4p43 }, + { 0x1.93c7abef59a2cp39, 0x1.85ac17b553c4fp43 }, + { 0x1.65df602b1e0ffp39, 0x1.59b72775450f3p43 }, + { 0x1.3d256a5ee461dp39, 0x1.32ae03812fcp43 }, + { 0x1.19053bac5f645p39, 0x1.1004b9cd4bae6p43 }, + { 0x1.f1f58fe66e142p38, 0x1.e27d88d5289bfp42 }, + { 0x1.b9216793da422p38, 0x1.abdab3fb224cep42 }, + { 0x1.86bd6adace04ep38, 0x1.7b5bd9f52a89ep42 }, + { 0x1.5a104640aeb74p38, 0x1.5051a941eb13p42 }, + { 0x1.32755417b50ddp38, 0x1.2a20366f6a0dep42 }, + { 0x1.0f5a5274f5c45p38, 0x1.083cdb1163405p42 }, + { 0x1.e07ab300dc4b9p37, 0x1.d458a013d18b4p41 }, + { 0x1.a956163a49613p37, 0x1.9f01f97b2e043p41 }, + { 0x1.7879eb52380edp37, 0x1.6fb2eaf7d8102p41 }, + { 0x1.4d30488394e18p37, 0x1.45be480207b14p41 }, + { 0x1.26d7af2869fc5p37, 0x1.208a2b041836ep41 }, + { 0x1.04e0c593552f5p37, 0x1.ff1ba8cbc9c8dp40 }, + { 0x1.cd98a274acae3p36, 0x1.c49f8a8ec4aebp40 }, + { 0x1.9852d44d7528bp36, 0x1.90c81ede57558p40 }, + { 0x1.6927c2c3e497p36, 0x1.62d5a948b6358p40 }, + { 0x1.3f65a98c177c9p36, 0x1.3a1de0952fd2bp40 }, + { 0x1.1a6ed66936eeap36, 0x1.16098d4b94692p40 }, + { 0x1.f36ed3084aa81p35, 0x1.ec24d6a8bc072p39 }, + { 0x1.b986ab7ebdd54p35, 0x1.b3828ebcc128bp39 }, + { 0x1.864933f3c0573p35, 0x1.8158a3038115ep39 }, + { 0x1.58f359f0c4e8fp35, 0x1.54eb3e9a3e72bp39 }, + { 0x1.30d82cb8a968cp35, 0x1.2d93b0174f61ap39 }, + { 0x1.0d5e5f59de7c1p35, 0x1.0abe0d45fd5c2p39 }, + { 0x1.dbfc240ab5f81p34, 0x1.d7ce33a39bd89p38 }, + { 0x1.a47db588b15cfp34, 0x1.a134d30d655e4p38 }, + { 0x1.736c0d0a31187p34, 0x1.70e16f315ef4p38 }, + { 0x1.480a1879e8f57p34, 0x1.461cda38e2783p38 }, + { 0x1.21b0591ce1cfdp34, 0x1.2044a2faebb7bp38 }, + { 0x1.ff94e3fca1752p33, 0x1.fd91813f8cc8cp37 }, + { 0x1.c3a9f9558ffap33, 0x1.c2530177987fep37 }, + { 0x1.8eb738c76b2f2p33, 0x1.8deb61106f334p37 }, + { 0x1.5fee91a43fef1p33, 0x1.5f91f55e86346p37 }, + { 0x1.3699940a6a811p33, 0x1.3694e7b13691bp37 }, + { 0x1.1216c07263dep33, 0x1.1256a18de488bp37 }, + { 0x1.e3ae49fef5535p32, 0x1.e49705a5ebd5fp36 }, + { 0x1.aab87fb8e4441p32, 0x1.abefb3186e784p36 }, + { 0x1.786c3dca158c4p32, 0x1.79dc285401b7dp36 }, + { 0x1.4c036b7451223p32, 0x1.4d9a4f359ba1ep36 }, + { 0x1.24cec8453db03p32, 0x1.267e46fd85893p36 }, + { 0x1.02334e92993b9p32, 0x1.03efdea0a0506p36 }, + { 0x1.c74fc41217dfbp31, 0x1.cad0afbb569b1p35 }, + { 0x1.9166837399532p31, 0x1.94e0d5e7a8744p35 }, + { 0x1.61d46c11dd916p31, 0x1.653d077d9eefp35 }, + { 0x1.37dbe7711fcd4p31, 0x1.3b2a639494566p35 }, + { 0x1.12d55c1e73c65p31, 0x1.16038b4af0a0ep35 }, + { 0x1.e4594b115943bp30, 0x1.ea6c598920c48p34 }, + { 0x1.aabdabdb93484p30, 0x1.b081aaf25ade1p34 }, + { 0x1.77f073eb945dfp30, 0x1.7d62079a4e4a6p34 }, + { 0x1.4b252d0bc8bebp30, 0x1.5042e1a8664edp34 }, + { 0x1.23a7345c57ccap30, 0x1.287117d29a9e6p34 }, + { 0x1.00d6f8a57f06ep30, 0x1.054e44f8ee735p34 }, + { 0x1.c44f136cf3bd8p29, 0x1.cc9cbc5fe04a8p33 }, + { 0x1.8e38df2790b7ap29, 0x1.95eb2cb828067p33 }, + { 0x1.5e8f828661e21p29, 0x1.65acfefcd0029p33 }, + { 0x1.3490e7e2bc31cp29, 0x1.3b20c56ad84f5p33 }, + { 0x1.0f91b7ff9bb2ap29, 0x1.159b917beb87ap33 }, + { 0x1.ddf56913a541ep28, 0x1.e90cb5cac7057p32 }, + { 0x1.a48cc1b8a7bc7p28, 0x1.aeb7659e5f7efp32 }, + { 0x1.71fde01e2ca8cp28, 0x1.7b4b752e86e5fp32 }, + { 0x1.4578e0b906b32p28, 0x1.4df8ace15322ep32 }, + { 0x1.1e4659a2a2156p28, 
0x1.26072a17961ap32 }, + { 0x1.f788fc218597bp27, 0x1.02d48c75e7d9bp32 }, + { 0x1.bac92daac0b9dp27, 0x1.c7a2ecd5f05ap31 }, + { 0x1.85518c3484796p27, 0x1.90feaede7f2aep31 }, + { 0x1.56441b55bfff1p27, 0x1.60dcef1cedc3ap31 }, + { 0x1.2cdd203ab43a1p27, 0x1.36787980e7387p31 }, + { 0x1.08700c199ad4fp27, 0x1.112346e13dd7ep31 }, + { 0x1.d0c9857c390f3p26, 0x1.e087915129a98p30 }, + { 0x1.986a650394095p26, 0x1.a6a5096da5b7dp30 }, + { 0x1.66d6688315ad6p26, 0x1.73aff07c7874ep30 }, + { 0x1.3b3d55ebd8547p26, 0x1.46d572e10e216p30 }, + { 0x1.14e7b714e7093p26, 0x1.1f5ba17e5a90bp30 }, + { 0x1.e667d9a8bcd9ep25, 0x1.f93d0d186fbcdp29 }, + { 0x1.ab2733e383ad8p25, 0x1.bc1b22cec72bp29 }, + { 0x1.7712b76c8c7f6p25, 0x1.86529e9df069cp29 }, + { 0x1.494d8e1d4fc61p25, 0x1.5702d052bf73ap29 }, + { 0x1.2115447c6627dp25, 0x1.2d65aee08874cp29 }, + { 0x1.fb7d503fc65c8p24, 0x1.08ccb49580d43p29 }, + { 0x1.bd660913b938cp24, 0x1.d13c32a98512bp28 }, + { 0x1.86db66e158524p24, 0x1.98a4bfd5a5fadp28 }, + { 0x1.56f3ed5aa4222p24, 0x1.66e459a7794f4p28 }, + { 0x1.2ce2265a96befp24, 0x1.3b28bbce3c1c6p28 }, + { 0x1.07f14a8d0c116p24, 0x1.14b8b6b67144ep28 }, + { 0x1.cf049ebedf60dp23, 0x1.e5e26dbef0e28p27 }, + { 0x1.96129ca292f7ep23, 0x1.aa854b5c4f131p27 }, + { 0x1.6416763f6b3bcp23, 0x1.765d329106241p27 }, + { 0x1.3837bf030f4a8p23, 0x1.488b9479ee1c4p27 }, + { 0x1.11b82880134f9p23, 0x1.204c8d940530bp27 }, + { 0x1.dfe0c1b8af1f3p22, 0x1.f9e77238e0031p26 }, + { 0x1.a49aa1651cfcap22, 0x1.bbd2c8fd7e193p26 }, + { 0x1.709b5a3a79128p22, 0x1.85502f16a0f8dp26 }, + { 0x1.42ffa7e9ace3fp22, 0x1.5574ceffe3945p26 }, + { 0x1.1affd2eccd616p22, 0x1.2b72182c97af5p26 }, + { 0x1.efd8be43ac9a9p21, 0x1.06925da53a0fcp26 }, + { 0x1.b2564005de7e5p21, 0x1.cc6bb6d71090dp25 }, + { 0x1.7c694cd2b4ffdp21, 0x1.93a02d0c97221p25 }, + { 0x1.4d23fa69bd814p21, 0x1.61cb1a027e057p25 }, + { 0x1.23b556e6e918ep21, 0x1.361358dd1f243p25 }, + { 0x1.fecbcf04dca9p20, 0x1.0fba0d2660d89p25 }, + { 0x1.bf29264dcdc82p20, 0x1.dc2ef387bd0ep24 }, + { 0x1.8767d7fc43eb6p20, 0x1.a130711aadcdap24 }, + { 0x1.568f9937abc79p20, 0x1.6d758e1ac9659p24 }, + { 0x1.2bc67d8c20136p20, 0x1.401abca024479p24 }, + { 0x1.064d4616b0094p20, 0x1.185819a7f8c6ap24 }, + { 0x1.caf8458ad2a12p19, 0x1.eafc2b00a99b1p23 }, + { 0x1.917faff93e54p19, 0x1.ade505ba61e89p23 }, + { 0x1.5f2e79283b1cap19, 0x1.785c00b5cb27ep23 }, + { 0x1.33220b1da4f59p19, 0x1.4973634932c1ap23 }, + { 0x1.0c93ac678b0ccp19, 0x1.205a7d78be568p23 }, + { 0x1.d5aa313452daep18, 0x1.f8b4440d68221p22 }, + { 0x1.9a9b05368c88bp18, 0x1.b9a31a7b9868cp22 }, + { 0x1.66ede7f0c2d55p18, 0x1.826756e1a42e2p22 }, + { 0x1.39b7fc18e5891p18, 0x1.5209676e4b424p22 }, + { 0x1.122b662569616p18, 0x1.27b019965e362p22 }, + { 0x1.df2779ceabfc8p17, 0x1.029ce648133fdp22 }, + { 0x1.a2a5d2945d2b7p17, 0x1.c45161cd95fe8p21 }, + { 0x1.6dbccf848794ap17, 0x1.8b81d680cdfc5p21 }, + { 0x1.3f79bf21caa96p17, 0x1.59ca24a7521ddp21 }, + { 0x1.17080ae674896p17, 0x1.2e48f266999cfp21 }, + { 0x1.e75b024885f54p16, 0x1.0838b13324d03p21 }, + { 0x1.a98e26924c6c8p16, 0x1.cdd86b83e679dp20 }, + { 0x1.738bf4bc8d296p16, 0x1.93977456406ddp20 }, + { 0x1.445a6a9a273c6p16, 0x1.60a47aca18e96p20 }, + { 0x1.1b1eabeffc3a5p16, 0x1.341669953fe1cp20 }, + { 0x1.ee324e1fde417p15, 0x1.0d210b765b3d6p20 }, + { 0x1.af4465e9c5668p15, 0x1.d622fa53c02cep19 }, + { 0x1.784e3008fb46bp15, 0x1.9a961d6383ef7p19 }, + { 0x1.484eecd2f1383p15, 0x1.66890cd0bf55fp19 }, + { 0x1.1e65fd1ef2701p15, 0x1.390b73f2a4fbp19 }, + { 0x1.f39dc6baaccd7p14, 0x1.114ae59581395p19 }, + { 0x1.b3bb863d26278p14, 0x1.dd1e5296953a3p18 }, + { 0x1.7bf89f052b591p14, 
0x1.a06dfa21b6c59p18 }, + { 0x1.4b4e35dbe0cddp14, 0x1.6b6a7a27c9005p18 }, + { 0x1.20d6781986167p14, 0x1.3d1cca3d4f6d8p18 }, + { 0x1.f790f6877f51ep13, 0x1.14acc164c64fep18 }, + { 0x1.b6e93fa7299b3p13, 0x1.e2ba80b9c3a1bp17 }, + { 0x1.7e82cde922833p13, 0x1.a511aa3827999p17 }, + { 0x1.4d515a14a6132p13, 0x1.6f3d9139319edp17 }, + { 0x1.226a790f97768p13, 0x1.404113d7d18e6p17 }, + { 0x1.fa02b8ac73416p12, 0x1.173ed60fcd6fap17 }, + { 0x1.b8c634233722p12, 0x1.e6ea95e92c624p16 }, + { 0x1.7fe6d7fbcef2cp12, 0x1.a8767775dd309p16 }, + { 0x1.4e53acc7531b1p12, 0x1.71f97a2983044p16 }, + { 0x1.231e547065724p12, 0x1.42710a88aab19p16 }, + { 0x1.faed5c4559717p11, 0x1.18fb2ded8ebb1p16 }, + { 0x1.b94e0bfb59934p11, 0x1.e9a4d9b21386ep15 }, + { 0x1.80217e57d8a3fp11, 0x1.aa947efe69879p15 }, + { 0x1.4e52d23cf50bp11, 0x1.7397d8e2bd385p15 }, + { 0x1.22f0652094ae6p11, 0x1.43a79684f6ef6p15 }, + { 0x1.fa4eba730bf6p10, 0x1.19ddbd8138a9p15 }, + { 0x1.b87f86a26fad7p10, 0x1.eae2ef93df996p14 }, + { 0x1.7f323487ff94ap10, 0x1.ab66cfccafb75p14 }, + { 0x1.4d4ec8ea8ee67p10, 0x1.7414e5b5ca43cp14 }, + { 0x1.21e112e39bf18p10, 0x1.43e1e22ebfdb4p14 }, + { 0x1.f8283ec45f117p9, 0x1.19e4732be2ffp14 }, + { 0x1.b65c7f9f1fbedp9, 0x1.eaa1efb3b003ep13 }, + { 0x1.7d1b22b6810f6p9, 0x1.aaeb7de6855e2p13 }, + { 0x1.4b49e984886ep9, 0x1.736f7c0d13f06p13 }, + { 0x1.1ff2d0d5a2649p9, 0x1.431f651be2ff4p13 }, + { 0x1.f47ee1cab73ddp8, 0x1.190f3f39e9af4p13 }, + { 0x1.b2e9e76c8d9f9p8, 0x1.e8e2722ca46cfp12 }, + { 0x1.79e11d635b9a7p8, 0x1.a923a9d8d5019p12 }, + { 0x1.4848ddf7dfffep8, 0x1.71a91ee04e82cp12 }, + { 0x1.1d2a13fdd2709p8, 0x1.4161e6298ed3ap12 }, + { 0x1.ef5b15f73200ap7, 0x1.176014201ab17p12 }, + { 0x1.ae2fb07705cc3p7, 0x1.e5a88cbf394e4p11 }, + { 0x1.758b92cdfdc64p7, 0x1.a6137c537bf6dp11 }, + { 0x1.44528f79b1b51p7, 0x1.6ec5f2d1367f4p11 }, + { 0x1.198d422be3f8cp7, 0x1.3ead7491061afp11 }, + { 0x1.e8c8a7276c93p6, 0x1.14dadee76975ap11 }, + { 0x1.a838b09afcf62p6, 0x1.e0fbc2ec572b9p10 }, + { 0x1.70246e766d2f3p6, 0x1.a1c215fcd0beap10 }, + { 0x1.3f700c0d99876p6, 0x1.6accae115453ep10 }, + { 0x1.1524997d01ap6, 0x1.3b08582357e32p10 }, + { 0x1.e0d68d9047f7ap5, 0x1.118577f06b2f2p10 }, + { 0x1.a11277ca2bd3fp5, 0x1.dae6e8d292a1ep9 }, + { 0x1.69b7f34ec048ep5, 0x1.9c3973d4c9b08p9 }, + { 0x1.39ac6410ceb63p5, 0x1.65c67e684d1e6p9 }, + { 0x1.0ffa110b113fp5, 0x1.367af901b137p9 }, + { 0x1.d796b4f7aaf7fp4, 0x1.0d678c614f535p9 }, + { 0x1.98cd1cb38dccp4, 0x1.d377f96b9fd62p8 }, + { 0x1.62548d6675835p4, 0x1.958648bd6035p8 }, + { 0x1.331480815e7cdp4, 0x1.5fbee5e7590f4p8 }, + { 0x1.0a19336cc73a1p4, 0x1.310fbf558eca2p8 }, + { 0x1.cd1db96a6c6efp3, 0x1.088a80b837328p8 }, + { 0x1.8f7b007e1de49p3, 0x1.cabfe10b3371ap7 }, + { 0x1.5a0a9c047e3c7p3, 0x1.8db7ccf7600f4p7 }, + { 0x1.2bb6f2dd8e254p3, 0x1.58c38f07b7c3bp7 }, + { 0x1.038ef3cbdc1c7p3, 0x1.2ad2ebb6268bdp7 }, + { 0x1.c1829acfb62b3p2, 0x1.02f94d1fb1ba4p7 }, + { 0x1.85308ad209551p2, 0x1.c0d23d3daadadp6 }, + { 0x1.50ec3549a202dp2, 0x1.84df8496cc3aep6 }, + { 0x1.23a3bf963c1ebp2, 0x1.50e4191e1b76cp6 }, + { 0x1.f8d2fce0ebb41p1, 0x1.23d2690dc7344p6 }, + { 0x1.b4de68e608347p1, 0x1.f980a88588961p5 }, + { 0x1.7a03df8f9f479p1, 0x1.b5c5135a44acbp5 }, + { 0x1.470ce4924af72p1, 0x1.7b10fe1f0aeaap5 }, + { 0x1.1aec242758b4fp1, 0x1.4831de32e25bdp5 }, + { 0x1.e9700b697ec96p0, 0x1.1c1d98f1b1f71p5 }, + { 0x1.a74be9568f922p0, 0x1.ebda6af103d07p4 }, + { 0x1.6e0c8fadbb05p0, 0x1.a9b07f491a273p4 }, + { 0x1.3c8164e42f29cp0, 0x1.70618a9c019dap4 }, + { 0x1.11a259faba91ep0, 0x1.3ebfb36da371bp4 }, + { 0x1.d91518c2acaf6p-1, 0x1.13c51b7852ecp4 }, + { 
0x1.98e739a118b5ep-1, 0x1.dd1d36683753bp3 }, + { 0x1.616346ca3be0ep-1, 0x1.9cae5c1f5de61p3 }, + { 0x1.315f58c13df9cp-1, 0x1.64e7f0a95542fp3 }, + { 0x1.07d957435b8c4p-1, 0x1.34a1a5595e9cbp3 }, + { 0x1.c7e35cf4db634p-2, 0x1.0ada93ac2688ep3 }, + { 0x1.89cd6ead31b71p-2, 0x1.cd680d6a376d2p2 }, + { 0x1.542176fe1c2b2p-2, 0x1.8ed9e84be9bacp2 }, + { 0x1.25bd00bd97eddp-2, 0x1.58bc1beb8e117p2 }, + { 0x1.fb491e02b7c15p-3, 0x1.29ecb15514182p2 }, + { 0x1.b5fcd30c7e1f6p-3, 0x1.017069c4b54cfp2 }, + { 0x1.7a1c33cc1922bp-3, 0x1.bcdb33f7b88f9p1 }, + { 0x1.46610483f2395p-3, 0x1.804f671a7a35cp1 }, + { 0x1.19b0f23241b88p-3, 0x1.4bf6ca87a4707p1 }, + { 0x1.e62f62b4555dcp-4, 0x1.1eb67d8a75351p1 }, + { 0x1.a383ca9f98a0fp-4, 0x1.ef3318a5788dep0 }, + { 0x1.69f16aeb3677p-4, 0x1.ab97c2106c4d2p0 }, + { 0x1.383bf2b37a037p-4, 0x1.712bc1550fb6ap0 }, + { 0x1.0d51cf5a16254p-4, 0x1.3eb13a24821e2p0 }, + { 0x1.d08cdac87dce6p-5, 0x1.131510c1da6adp0 }, + { 0x1.909a7c3ac6f99p-5, 0x1.dad26311e9efp-1 }, + { 0x1.596acfa0bcc8fp-5, 0x1.99bf36c7ef068p-1 }, + { 0x1.29cc13bfd53ap-5, 0x1.618c26c1169a6p-1 }, + { 0x1.00b60212cf113p-5, 0x1.3104d5f799552p-1 }, + { 0x1.ba886ae6e40ep-6, 0x1.071e8b6003b16p-1 }, + { 0x1.7d62a282a4851p-6, 0x1.c5e5338097f6bp-2 }, + { 0x1.48a59e9cb1eb1p-6, 0x1.87730de08c821p-2 }, + { 0x1.1b2abc895a771p-6, 0x1.518db221cf8bap-2 }, + { 0x1.e7e6f4c33ededp-7, 0x1.230ae74a714aap-2 }, + { 0x1.a4480db60fe17p-7, 0x1.f5d1c58fdc6acp-3 }, + { 0x1.69fd19aacb90ap-7, 0x1.b091a88a72f08p-3 }, + { 0x1.37be42e1159e7p-7, 0x1.74d459ba38afep-3 }, + { 0x1.0c707db025298p-7, 0x1.414d114bdcde1p-3 }, + { 0x1.ce3ee3757dbe5p-8, 0x1.14dc49cbc0c3p-3 }, + { 0x1.8df06bfb34f6dp-8, 0x1.dd13408401cdcp-4 }, + { 0x1.568986affafc5p-8, 0x1.9afd0eca1593dp-4 }, + { 0x1.26d009f5af049p-8, 0x1.6203633a6814ap-4 }, + { 0x1.fb69c5d6b524ep-9, 0x1.30e632b0008c9p-4 }, + { 0x1.b49c67cd1611fp-9, 0x1.069124dc6eaefp-4 }, + { 0x1.77a47ec4e9fa1p-9, 0x1.c42b48d5cfe42p-5 }, + { 0x1.43260788f0a1fp-9, 0x1.854b792c33d4ap-5 }, + { 0x1.15f4e018a09eep-9, 0x1.4f1f511f7b2d7p-5 }, + { 0x1.de1c72f739a49p-10, 0x1.2073f996519cp-5 }, + { 0x1.9b25dc6d6642ep-10, 0x1.f08155c194aadp-6 }, + { 0x1.61853cc8eddacp-10, 0x1.ab41e011814e5p-6 }, + { 0x1.2feeed430b87bp-10, 0x1.6f9f62ec4193ap-6 }, + { 0x1.05451535e8102p-10, 0x1.3c45d7f9e2fbp-6 }, + { 0x1.c122bcbda7f8ep-11, 0x1.100ffa10ff0f3p-6 }, + { 0x1.81ff0b26f3b6ap-11, 0x1.d401bee3a7787p-7 }, + { 0x1.4bb153d2d0728p-11, 0x1.927ce5fbbe352p-7 }, + { 0x1.1cfe80beb05a4p-11, 0x1.5a195c6e2a08ep-7 }, + { 0x1.e9ae566e02486p-12, 0x1.2992f3c7d2ce7p-7 }, + { 0x1.a4a3297375461p-12, 0x1.ffa47aef63bd2p-8 }, + { 0x1.6948e77b6c537p-12, 0x1.b7ccca35ce88ep-8 }, + { 0x1.3644eed5b1126p-12, 0x1.79ffc3cd6bc92p-8 }, + { 0x1.0a6cd27d913d7p-12, 0x1.44d7c3dca9cc8p-8 }, + { 0x1.c97f5c053e775p-13, 0x1.1720abf01aa9bp-8 }, + { 0x1.88c0c973b68fcp-13, 0x1.dfa22008cf2c8p-9 }, + { 0x1.512157ee1d8bep-13, 0x1.9c08a63df00dcp-9 }, + { 0x1.215988e86b086p-13, 0x1.61eb258af5a93p-9 }, + { 0x1.f09f2b684fb31p-14, 0x1.2ff68a28f7dc4p-9 }, + { 0x1.aa222a98ba953p-14, 0x1.0506e21782262p-9 }, + { 0x1.6d9b06046eb66p-14, 0x1.c041afe3a1ad2p-10 }, + { 0x1.39a30e3030664p-14, 0x1.80d8271e40929p-10 }, + { 0x1.0d05cd2b64652p-14, 0x1.4a5cc1e67b046p-10 }, + { 0x1.cd740d2318d4dp-15, 0x1.1b8f04bdfa1bfp-10 }, + { 0x1.8bb7603d9828p-15, 0x1.e6b65816f0ff1p-11 }, + { 0x1.534d810db5377p-15, 0x1.a1a7ec86c94fbp-11 }, + { 0x1.22e56de90dc1ap-15, 0x1.665a9398034f1p-11 }, + { 0x1.f2bb06a7069e2p-16, 0x1.336f30c8d3345p-11 }, + { 0x1.ab79b6edb04e1p-16, 0x1.07b7cbf13abf4p-11 }, + { 0x1.6e5b33b150249p-16, 
0x1.c461717dacbd8p-12 }, + { 0x1.39f005226a7dbp-16, 0x1.83f56253c12f1p-12 }, + { 0x1.0cfc8192e69bdp-16, 0x1.4cab82baddd6cp-12 }, + { 0x1.cce310b024fd4p-17, 0x1.1d39d04e50424p-12 }, + { 0x1.8acc81455f971p-17, 0x1.e9094beff3587p-13 }, + { 0x1.522570529739fp-17, 0x1.a3308036822dbp-13 }, + { 0x1.219685023e1bep-17, 0x1.67464f8a36affp-13 }, + { 0x1.eff1f945e7f7bp-18, 0x1.33e2c9c277148p-13 }, + { 0x1.a89fa515a2b44p-18, 0x1.07d0b7bb52fc7p-13 }, + { 0x1.6b83bb4ee4348p-18, 0x1.c40cfbd11fd1p-14 }, + { 0x1.372982e2fde1dp-18, 0x1.833ffa698fa8bp-14 }, + { 0x1.0a51297b20ab7p-18, 0x1.4bb29dadf3acp-14 }, + { 0x1.c7d093fb7e463p-19, 0x1.1c147957723bdp-14 }, + { 0x1.8607006600009p-19, 0x1.e6896f5762306p-15 }, + { 0x1.4db1c7b733812p-19, 0x1.a096cc3260668p-15 }, + { 0x1.1d76959a6b622p-19, 0x1.64a7647d3f88ap-15 }, + { 0x1.e858d8b3acc8p-20, 0x1.314deba7bab37p-15 }, + { 0x1.a1a94b14e3d7fp-20, 0x1.0550e92636252p-15 }, + { 0x1.6529df3d1cf1cp-20, 0x1.bf46cd0f972c3p-16 }, + { 0x1.316449a955429p-20, 0x1.7ebd49fbb30eep-16 }, + { 0x1.0517b9e1f89dep-20, 0x1.47796af08285bp-16 }, + { 0x1.be627dddb55d7p-21, 0x1.1827a73755ec7p-16 }, + { 0x1.7d8a7f2a8a2dp-21, 0x1.df49a10ccc568p-17 }, + { 0x1.4613bf000c71dp-21, 0x1.99ee7037b652bp-17 }, + { 0x1.16a45fcb7b882p-21, 0x1.5e9197017791dp-17 }, + { 0x1.dc283bcbe780fp-22, 0x1.2bc40c543e36bp-17 }, + { 0x1.96ca751cac37fp-22, 0x1.004b34180a4a9p-17 }, + { 0x1.5b7cd13179ddep-22, 0x1.b632d58444fadp-18 }, + { 0x1.28cb2cb8b4015p-22, 0x1.768f3e13d3bdcp-18 }, + { 0x1.faedd62dabd96p-23, 0x1.401fa7657909ep-18 }, + { 0x1.b0de982dbf111p-23, 0x1.1190d162109abp-18 }, + { 0x1.7195b2becea19p-23, 0x1.d3803e22a78e4p-19 }, + { 0x1.3b8387eea3f9dp-23, 0x1.8f694ad8ac632p-19 }, + { 0x1.0d521f8291cd6p-23, 0x1.55326d6aac6fap-19 }, + { 0x1.cbb9be9cbac1ep-24, 0x1.236e8d3a9e0e7p-19 }, + { 0x1.8852e54d26542p-24, 0x1.f1ca221c0b98bp-20 }, + { 0x1.4ec36b8fdf428p-24, 0x1.a914b62872bc3p-20 }, + { 0x1.1d9d0055d11dp-24, 0x1.6af2ae42db58p-20 }, + { 0x1.e74cb7ebdea0ap-25, 0x1.35dbe86ed95c7p-20 }, + { 0x1.9fa735b03463ap-25, 0x1.0880cfe68041ep-20 }, + { 0x1.627f6220ca6a9p-25, 0x1.c3847cbf78a3bp-21 }, + { 0x1.2e4d9d8b5b22fp-25, 0x1.81550cf271bfdp-21 }, + { 0x1.01c325e8bb3cp-25, 0x1.48cefa0aac509p-21 }, + { 0x1.b783bc148fcefp-26, 0x1.188ab9ce5fdddp-21 }, + { 0x1.76aa8791eba33p-26, 0x1.dea9996bf1c0fp-22 }, + { 0x1.3f58d390caeecp-26, 0x1.984c7bb9c53ffp-22 }, + { 0x1.10299f255a2cap-26, 0x1.5c3c6ce5f2f75p-22 }, + { 0x1.cfd7e08a13b2p-27, 0x1.28f8faa7c3202p-22 }, + { 0x1.8b368e0429dacp-27, 0x1.fa7304087353p-23 }, + { 0x1.50b2501707be6p-27, 0x1.afca3c464e1d5p-23 }, + { 0x1.1ecf2c897b782p-27, 0x1.701780b38d71ap-23 }, + { 0x1.e891642306feep-28, 0x1.39c08dab159ep-23 }, + { 0x1.a013c6709bdd5p-28, 0x1.0b66dac93672bp-23 }, + { 0x1.624c9a2f2f8fcp-28, 0x1.c7bde43ebd873p-24 }, + { 0x1.2da83d59392f5p-28, 0x1.84520ec5eb55ap-24 }, + { 0x1.00ce3767b77a8p-28, 0x1.4ad54236cf6b4p-24 }, + { 0x1.b5312d520a3f4p-29, 0x1.19d258cf47194p-24 }, + { 0x1.74191dcab90bcp-29, 0x1.e015665e4efbdp-25 }, + { 0x1.3ca855a30dad5p-29, 0x1.98dc92b26aeap-25 }, + { 0x1.0d71d1069e44fp-29, 0x1.5c29c3e79c162p-25 }, + { 0x1.ca7c7b61a5357p-30, 0x1.28708aaed4d7p-25 }, + { 0x1.86083aaabaf73p-30, 0x1.f8bd2046619b5p-26 }, + { 0x1.4bc21b880f9dep-30, 0x1.ada636f165959p-26 }, + { 0x1.1a28183b0e32p-30, 0x1.6dafa60f704a1p-26 }, + { 0x1.dfe23a6ad4f8bp-31, 0x1.37351629c53cp-26 }, + { 0x1.980956bea8ccp-31, 0x1.08cff68f5874cp-26 }, + { 0x1.5ae767663002ep-31, 0x1.c29ce58c1fc1p-27 }, + { 0x1.26e4fd1165b76p-31, 0x1.7f5772973d16cp-27 }, + { 0x1.f54dde2ba8f56p-32, 0x1.4612c5674eed9p-27 }, + { 
0x1.aa0af3e698b26p-32, 0x1.15539e864d70fp-27 }, + { 0x1.6a0956d7d1b63p-32, 0x1.d7ad5cdc3741ep-28 }, + { 0x1.339bd6e517d44p-32, 0x1.9110bc4b50f8cp-28 }, + { 0x1.0554f0943ba8cp-32, 0x1.54fb970dbe54ep-28 }, + { 0x1.bbfac9007ec07p-33, 0x1.21dd98bc7de87p-28 }, + { 0x1.791862715d02fp-33, 0x1.ecc34851c9763p-29 }, + { 0x1.403f77382e654p-33, 0x1.a2ca34863bfcbp-29 }, + { 0x1.0feff2a4fc49p-33, 0x1.63e0d12d4d288p-29 }, + { 0x1.cdc5de1ae8c09p-34, 0x1.2e615f0543e41p-29 }, + { 0x1.8804761a993c4p-34, 0x1.00e4ae934cb56p-29 }, + { 0x1.4cc23eb3b5ffap-34, 0x1.b471c42165f4ap-30 }, + { 0x1.1a6c6c06ea18bp-34, 0x1.72b316e47cc93p-30 }, + { 0x1.df58ab9ae4fcbp-35, 0x1.3ad1e7143aa75p-30 }, + { 0x1.96bd0bd6c9a31p-35, 0x1.0b54bd6a9e23fp-30 }, + { 0x1.59163428fb3a6p-35, 0x1.c5f4a785a88d1p-31 }, + { 0x1.24be8d0138113p-35, 0x1.8162809b8dff6p-31 }, + { 0x1.f09f3c1618809p-36, 0x1.4721b76389525p-31 }, + { 0x1.a53148c3fc482p-36, 0x1.15a6678e0082cp-31 }, + { 0x1.652d1d62b45e1p-36, 0x1.d73f8da963966p-32 }, + { 0x1.2eda549c16ee8p-36, 0x1.8fdeb6a9e8ebcp-32 }, + { 0x1.00c2a84aed164p-36, 0x1.5342fe16e83a5p-32 }, + { 0x1.b3501c0fdbbcfp-37, 0x1.1fcdfea216d16p-32 }, + { 0x1.70f8998ccf075p-37, 0x1.e83eb9bce31c4p-33 }, + { 0x1.38b3a7222dd33p-37, 0x1.9e170e2dbff8cp-33 }, + { 0x1.08fb437656229p-37, 0x1.5f27a9aa5f66p-33 }, + { 0x1.c1085f96d9feep-38, 0x1.29bfa42bc7b76p-33 }, + { 0x1.7c6a3cf1c9dcfp-38, 0x1.f8de2739c95a9p-34 }, + { 0x1.423e65b2a3a8cp-38, 0x1.abfaa7d4233fap-34 }, + { 0x1.10ef40de709bcp-38, 0x1.6ac1833360c58p-34 }, + { 0x1.ce48f9d9e5928p-39, 0x1.336f5ff042b88p-34 }, + { 0x1.8773adc5703cep-39, 0x1.0484d7ff5f6bdp-34 }, + { 0x1.4b6e86a5aa9d8p-39, 0x1.b978904649f57p-35 }, + { 0x1.189488e2e9743p-39, 0x1.760249f31a968p-35 }, + { 0x1.db0100ef385d3p-40, 0x1.3cd13761f1731p-35 }, + { 0x1.9206c1ae9fb29p-40, 0x1.0c569a0b1627cp-35 }, + { 0x1.54382e8081943p-40, 0x1.c67fe1e83e91p-36 }, + { 0x1.1fe13002859cap-40, 0x1.80dbcff1d72cfp-36 }, + { 0x1.e71fde0c5e218p-41, 0x1.45d945dc4844dp-36 }, + { 0x1.9c159bbc9900ap-41, 0x1.13da615eb6c5fp-36 }, + { 0x1.5c8fc931c6d94p-41, 0x1.d2ffe78d87996p-37 }, + { 0x1.26cb8c1920344p-41, 0x1.8b4017551e03bp-37 }, + { 0x1.f295714275bc3p-42, 0x1.4e7bd56b77338p-37 }, + { 0x1.a592ca70605e5p-42, 0x1.1b06621cfb60ep-37 }, + { 0x1.646a234bddd88p-42, 0x1.dee83fc205fc8p-38 }, + { 0x1.2d4a498c21371p-42, 0x1.9521701d324dap-38 }, + { 0x1.fd5235020e009p-43, 0x1.56ad77d8efe38p-38 }, + { 0x1.ae71657ff542ep-43, 0x1.21d11201bfbcfp-38 }, + { 0x1.6bbc82f12468ap-43, 0x1.ea290040397f4p-39 }, + { 0x1.3354802504d9ep-43, 0x1.9e7295f29cf91p-39 }, + { 0x1.03a3b07cf84bp-43, 0x1.5e631fb2a96dbp-39 }, + { 0x1.b6a52af7c7202p-44, 0x1.28313d62cbf4fp-39 }, + { 0x1.727cc024d462ap-44, 0x1.f4b2d92a8da6ap-40 }, + { 0x1.38e1c7590edafp-44, 0x1.a726cda9c5fc4p-40 }, + { 0x1.083385f1e344cp-44, 0x1.6592390114765p-40 }, + { 0x1.be229b5ed10ebp-45, 0x1.2e1e1bdc1cff3p-40 }, + { 0x1.78a15c33bf0d1p-45, 0x1.fe77379b5869ap-41 }, + { 0x1.3dea49bdca04dp-45, 0x1.af3202215009fp-41 }, + { 0x1.0c5225e967ce3p-45, 0x1.6c30c15ee186bp-41 }, + { 0x1.c4df14833b32ep-46, 0x1.338f646703f05p-41 }, + { 0x1.7e2197e99732ep-46, 0x1.03b4338f71d3bp-41 }, + { 0x1.4266d76b7e9efp-46, 0x1.b688e02001605p-42 }, + { 0x1.0ff9aa4df55cbp-46, 0x1.72355f261c90fp-42 }, + { 0x1.cad0ea9847218p-47, 0x1.387d609c076c8p-42 }, + { 0x1.82f5884a3c4ffp-47, 0x1.07bcd8d61f54dp-42 }, + { 0x1.4650f71159187p-47, 0x1.bd20f0d88c869p-43 }, + { 0x1.1324c9f973607p-47, 0x1.77977767b819cp-43 }, + { 0x1.cfef7f529f1bfp-48, 0x1.3ce0fee10ae91p-43 }, + { 0x1.8716298a66d68p-48, 0x1.0b4fbeda58aa9p-43 }, + { 0x1.49a2f582864b8p-48, 
0x1.c2f0b2bc85943p-44 }, + { 0x1.15cee56fb8f8p-48, 0x1.7c4f426570458p-44 }, + { 0x1.d43356b5d1bc3p-49, 0x1.40b3e347db73ap-44 }, + { 0x1.8a7d700826ce3p-49, 0x1.0e67b4f33d066p-44 }, + { 0x1.4c57f38808af9p-49, 0x1.c7efb04c36011p-45 }, + { 0x1.17f41219f6e6ep-49, 0x1.8055de49eb405p-45 }, + { 0x1.d796294cc09e7p-50, 0x1.43f076e4dac86p-45 }, + { 0x1.8d265709c8b81p-50, 0x1.11003322f9f2ap-45 }, + { 0x1.4e6bf1c869176p-50, 0x1.cc169496c493bp-46 }, + { 0x1.199123dce7f7cp-50, 0x1.83a55fe01c77fp-46 }, + { 0x1.da12f38ef6065p-51, 0x1.4691f56a0b9d1p-46 }, + { 0x1.8f0ced10d0db4p-51, 0x1.131565242338p-46 }, + { 0x1.4fdbda9c9106cp-51, 0x1.cf5f3d25346p-47 }, + { 0x1.1aa3b4e8f3caap-51, 0x1.8638e1112031dp-47 }, + { 0x1.dba6023e1257ap-52, 0x1.489478d82c425p-47 }, + { 0x1.902e5d96b5dc7p-52, 0x1.14a433d21a4e2p-47 }, + { 0x1.50a589affacc9p-52, 0x1.d1c4c912f9acbp-48 }, + { 0x1.1b2a2ba958505p-52, 0x1.880c8cf6ecf16p-48 }, + { 0x1.dc4cfb90a7ce5p-53, 0x1.49f5031dc194p-48 }, + { 0x1.9088f811b7254p-53, 0x1.15aa4ccc2f79bp-48 }, + { 0x1.50c7d151d73dp-53, 0x1.d343a5202c7c4p-49 }, + { 0x1.1b23bebdcda6dp-53, 0x1.891da95a3a6f5p-49 }, + { 0x1.dc06e50abd949p-54, 0x1.4ab18582d9df2p-49 }, + { 0x1.901c34297491p-54, 0x1.1626283914e64p-49 }, + { 0x1.50427d64b1c7dp-54, 0x1.d3d994938f3adp-50 }, + { 0x1.1a9076f0d2e24p-54, 0x1.896a9d7ab89b1p-50 }, + { 0x1.dad425efa38efp-55, 0x1.4ac8e5c7c8723p-50 }, + { 0x1.8ee8b30ca2586p-55, 0x1.16170c969f828p-50 }, + { 0x1.4f1653e256f41p-55, 0x1.d385b6cd88b32p-51 }, + { 0x1.19712f23cae3dp-55, 0x1.88f2f609fe4d3p-51 }, + { 0x1.d8b686448b5afp-56, 0x1.4a3b00e506616p-51 }, + { 0x1.8cf03de32b406p-56, 0x1.157d10888e2f3p-51 }, + { 0x1.4d4512f22a65dp-56, 0x1.d2488978a2f74p-52 }, + { 0x1.17c7923127a39p-56, 0x1.87b7664b4e00cp-52 }, + { 0x1.d5b12a674c804p-57, 0x1.4908ab62a09acp-52 }, + { 0x1.8a35c1621f2ccp-57, 0x1.14591aa0080cap-52 }, + { 0x1.4ad16c988b007p-57, 0x1.d023e74fea7e1p-53 }, + { 0x1.159616cbf8a0cp-57, 0x1.85b9c65443c51p-53 }, + { 0x1.d1c88b489c5c3p-58, 0x1.4733af4601fe1p-53 }, + { 0x1.86bd4690c0845p-58, 0x1.12acdf1c9738cp-53 }, + { 0x1.47bf000e37ae9p-58, 0x1.cd1b037f7490bp-54 }, + { 0x1.12dff96b26d81p-58, 0x1.82fd0e7486194p-54 }, + { 0x1.cd026b64a0ca8p-59, 0x1.44bec79d5416cp-54 }, + { 0x1.828be8d7b2e74p-59, 0x1.107adbae7661dp-54 }, + { 0x1.441250d6b8cc7p-59, 0x1.c93261af2cd0dp-55 }, + { 0x1.0fa934555eb5ap-59, 0x1.7f854fd47e7d3p-55 }, + { 0x1.c765c89feb632p-60, 0x1.41ad99b7fc9ebp-55 }, + { 0x1.7da7c97c8ea4bp-60, 0x1.0dc65148f57fcp-55 }, + { 0x1.3fd0bbb47d67cp-60, 0x1.c46fcad39a071p-56 }, + { 0x1.0bf675e9015a3p-60, 0x1.7b57aa64c1e42p-56 }, + { 0x1.c0facb396944ap-61, 0x1.3e04ac23c3f11p-56 }, + { 0x1.781800b4c5862p-61, 0x1.0a933c1a65e31p-56 }, + { 0x1.3b0069a07f02dp-61, 0x1.beda3eeb5f0a2p-57 }, + { 0x1.07cd15415698ap-61, 0x1.767a404101f5ap-57 }, + { 0x1.b9cab20b7b4acp-62, 0x1.39c95b8dcd835p-57 }, + { 0x1.71e48c82b190ap-62, 0x1.06e649c54a11dp-57 }, + { 0x1.35a840f1bb9bfp-62, 0x1.b879e3daa485dp-58 }, + { 0x1.0333055f872d1p-62, 0x1.70f426b1f5c67p-58 }, + { 0x1.b1dfbc5f13465p-63, 0x1.3501cdad9df5bp-58 }, + { 0x1.6b163d96b3dd9p-63, 0x1.02c4cdfc5722cp-58 }, + { 0x1.2fcfd4e6913cap-63, 0x1.b157f19f267eap-59 }, + { 0x1.fc5d8e0519af3p-64, 0x1.6acd55017e4e2p-59 }, + { 0x1.a945119b38a65p-64, 0x1.2fb4e266d3e9fp-59 }, + { 0x1.63b6a2745bde1p-64, 0x1.fc696b5025168p-60 }, + { 0x1.297f53c6e927fp-64, 0x1.a97e9c202c067p-60 }, + { 0x1.f18eb2ba6357fp-65, 0x1.640e915b3f3eap-60 }, + { 0x1.a006a7219c6a4p-65, 0x1.29ea2353deb28p-60 }, + { 0x1.5bcff1208eb99p-65, 0x1.f278f182d5ccep-61 }, + { 0x1.22bf73da1838dp-65, 0x1.a0f8fae51588p-61 }, + 
{ 0x1.e60853b8b4b65p-66, 0x1.5cc15bf9dbbbbp-61 }, + { 0x1.963124add21cp-66, 0x1.23a9b1f0c9515p-61 }, + { 0x1.536cefa1810b4p-66, 0x1.e7c6162103b4ep-62 }, + { 0x1.1b995f6e584afp-66, 0x1.97d2ef035140ap-62 }, + { 0x1.d9da06644bc9dp-67, 0x1.54efd8e5e8a15p-62 }, + { 0x1.8bd1c79049ec2p-67, 0x1.1cfc34a10ee47p-62 }, + { 0x1.4a98db9bff0e8p-67, 0x1.dc5f9803d5324p-63 }, + { 0x1.1416a031bacf2p-67, 0x1.8e1907994f8d3p-63 }, + { 0x1.cd13f7b7c3414p-68, 0x1.4ca4b88f6234cp-63 }, + { 0x1.80f645203dff7p-68, 0x1.15eac2ce52257p-63 }, + { 0x1.415f515af2672p-68, 0x1.d054eb8db2ad5p-64 }, + { 0x1.0c410a1d6b3cap-68, 0x1.83d8652f7235cp-64 }, + { 0x1.bfc6c8b2d1c95p-69, 0x1.43eb1f8cfdcf1p-64 }, + { 0x1.75acacc068ebep-69, 0x1.0e7ed05fb3af3p-64 }, + { 0x1.37cc328e513e5p-69, 0x1.c3b617ec3cfd6p-65 }, + { 0x1.0422a6340a512p-69, 0x1.791e9c59e2b42p-65 }, + { 0x1.b2036a988beadp-70, 0x1.3ace8dce03fbdp-65 }, + { 0x1.6a0349d192d1ap-70, 0x1.06c218ca5f25ap-65 }, + { 0x1.2deb8d0dae905p-70, 0x1.b69393c895b87p-66 }, + { 0x1.f78b3aa5bebbep-71, 0x1.6df997f6bab1bp-66 }, + { 0x1.a3dafb67a96cfp-71, 0x1.315ac58b7d6b7p-66 }, + { 0x1.5e0885ebd9cc3p-71, 0x1.fd7d13f78002dp-67 }, + { 0x1.23c981e88b022p-71, 0x1.a8fe21d205ebp-67 }, + { 0x1.e66846a73c925p-72, 0x1.62777b62fde0cp-67 }, + { 0x1.955ea2f392221p-72, 0x1.279bb2446baf4p-67 }, + { 0x1.51cacbb42476ep-72, 0x1.ecfc5eb955129p-68 }, + { 0x1.19722d0b598a4p-72, 0x1.9b06ad8cbcafbp-68 }, + { 0x1.d4f0c5733dbc9p-73, 0x1.56a684fe99fcap-68 }, + { 0x1.869f70ffc1fcbp-73, 0x1.1d9d500e92622p-68 }, + { 0x1.45586a9e82938p-73, 0x1.dc163a555fefbp-69 }, + { 0x1.0ef18dbc017ffp-73, 0x1.8cbe28ca7c426p-69 }, + { 0x1.c338d2435fb4bp-74, 0x1.4a94f1540c9eap-69 }, + { 0x1.77ae3cb88b469p-74, 0x1.136b93820fc76p-69 }, + { 0x1.38bf7be87e681p-74, 0x1.cadeb8c3bba05p-70 }, + { 0x1.0453702b9a5bbp-74, 0x1.7e356a2db5e15p-70 }, + { 0x1.b154294e891dap-75, 0x1.3e50df3387f95p-70 }, + { 0x1.689b85dc875b1p-75, 0x1.09125281c373ap-70 }, + { 0x1.2c0dc90fab5bap-75, 0x1.b969aedac7779p-71 }, + { 0x1.f346b0aa94647p-76, 0x1.6f7d0d10edd84p-71 }, + { 0x1.9f5604d9610bp-76, 0x1.31e8350b95daep-71 }, + { 0x1.597757e14e4e8p-76, 0x1.fd3a5c3ac18bbp-72 }, + { 0x1.1f50b401397f7p-76, 0x1.a7ca8fa24018p-72 }, + { 0x1.ddd8dcb76e388p-77, 0x1.60a5532471804p-72 }, + { 0x1.8d50fcdd2a012p-77, 0x1.256887c26e498p-72 }, + { 0x1.4a512f5483d32p-77, 0x1.e82efb884fa7p-73 }, + { 0x1.129521372a709p-77, 0x1.961449f1f5f93p-73 }, + { 0x1.c872d91eff745p-78, 0x1.51be080b9d49dp-73 }, + { 0x1.7b56e9895b756p-78, 0x1.18df034ba2c47p-73 }, + { 0x1.3b37e1b01d1bdp-78, 0x1.d31877f1753bap-74 }, + { 0x1.05e763ef1c6e1p-78, 0x1.845928aac023dp-74 }, + { 0x1.b3291e83a6ddap-79, 0x1.42d6673958cf7p-74 }, + { 0x1.6978c8d7d61b8p-79, 0x1.0c58552d896bdp-74 }, + { 0x1.2c3987ce2b431p-79, 0x1.be0be95f0126ep-75 }, + { 0x1.f2a6593b4ee39p-80, 0x1.72aab5cc51918p-75 }, + { 0x1.9e0f0cfd57ab4p-80, 0x1.33fd04413c4e8p-75 }, + { 0x1.57c6a75ebbd36p-80, 0x1.ffc132424c87ap-76 }, + { 0x1.1d636b1da2b46p-80, 0x1.a91d6af35687bp-76 }, + { 0x1.d9c6f3705063cp-81, 0x1.6119a09e14fe5p-76 }, + { 0x1.8936d384f421ap-81, 0x1.253fb5c838ba6p-76 }, + { 0x1.464f8c7e074fcp-81, 0x1.e7068fdcaeb4ep-77 }, + { 0x1.0ec1f5aebc21fp-81, 0x1.945fff2eb1b17p-77 }, + { 0x1.c14515cb6f8fp-82, 0x1.4fb5a7146299ap-77 }, + { 0x1.74b15b6eeceb1p-82, 0x1.16ab8334ccb0ap-77 }, + { 0x1.352169fa33216p-82, 0x1.ce965139dad89p-78 }, + { 0x1.0060a522d6818p-82, 0x1.7fe578074e0c8p-78 }, + { 0x1.a933ad3e37ea3p-83, 0x1.3e8d828e807b4p-78 }, + { 0x1.608e37fe916b7p-83, 0x1.084c9533fea9dp-78 }, + { 0x1.24490f08ca22dp-83, 0x1.b68488148e38cp-79 }, + { 0x1.e4940102c0a26p-84, 
0x1.6bbe630bdc58cp-79 }, + { 0x1.91a40479b1837p-84, 0x1.2daed7fd23569p-79 }, + { 0x1.4cdb9a0d20ef7p-84, 0x1.f45c523b5ec4ep-80 }, + { 0x1.13d21ec7ce7a5p-84, 0x1.9ee3b5d440d2p-80 }, + { 0x1.c90f21d2d475fp-85, 0x1.57f9f997e1f52p-80 }, + { 0x1.7aa5b8d4b4359p-85, 0x1.1d262b74c69e4p-80 }, + { 0x1.39a647b21bed6p-85, 0x1.d8b50e711660ap-81 }, + { 0x1.03c70a0dadb1dp-85, 0x1.87c4bc616ed3dp-81 }, + { 0x1.ae43ba1c85bb1p-86, 0x1.44a615135e868p-81 }, + { 0x1.6446b3db12c58p-86, 0x1.0cfed72363bb7p-81 }, + { 0x1.26f997cdc041dp-86, 0x1.bdb5f7a82d0f4p-82 }, + { 0x1.e86218ea3e6acp-87, 0x1.7136d3b897e11p-82 }, + { 0x1.9440cec9f5e3ap-87, 0x1.31cf2729ac24dp-82 }, + { 0x1.4e93295651e9bp-87, 0x1.fa860b2bf75f8p-83 }, + { 0x1.14df714b2cc27p-87, 0x1.a36fa64c5b19fp-83 }, + { 0x1.ca3058fde005fp-88, 0x1.5b478418ed951p-83 }, + { 0x1.7b135dc219792p-88, 0x1.1f8035d726d41p-83 }, + { 0x1.3995999427ba7p-88, 0x1.dbf75e60682c2p-84 }, + { 0x1.03604de581436p-88, 0x1.89f0afa1deecap-84 }, + { 0x1.ad067d36fa2c8p-89, 0x1.4602a49df0a52p-84 }, + { 0x1.62c6642f5d4b9p-89, 0x1.0dc2db21eaf21p-84 }, + { 0x1.2556d7a42568ap-89, 0x1.be61355e30a98p-85 }, + { 0x1.e5068065139bep-90, 0x1.7145a7dd1cf8cp-85 }, + { 0x1.90efd5cd13c3p-90, 0x1.31725e0702649p-85 }, + { 0x1.4b62e9374c452p-90, 0x1.f93e90900fd6bp-86 }, + { 0x1.11de133cc6916p-90, 0x1.a1d0c10ff74dfp-86 }, + { 0x1.c49bf95c5f745p-91, 0x1.597928f3e0c7p-86 }, + { 0x1.75f56ab48bd89p-91, 0x1.1d9f316556fccp-86 }, + { 0x1.34f00cbd8ea42p-91, 0x1.d8389849eaf01p-87 }, + { 0x1.fe61cbe17950dp-92, 0x1.8650e1db268ebp-87 }, + { 0x1.a589caf82618cp-92, 0x1.4293ddcb013c1p-87 }, + { 0x1.5c1e107375834p-92, 0x1.0a90025fd130cp-87 }, + { 0x1.1f7319c565581p-92, 0x1.b87eb911fc5efp-88 }, + { 0x1.daa6c6af5c17fp-93, 0x1.6bea387f6b0ap-88 }, + { 0x1.87d63120a742cp-93, 0x1.2c9c915a28ddap-88 }, + { 0x1.436e80df031fp-93, 0x1.f094496a5e827p-89 }, + { 0x1.0aef9bffa708dp-93, 0x1.9a19446f657ccp-89 }, + { 0x1.b890579385cdcp-94, 0x1.52a33b4b8094cp-89 }, + { 0x1.6b84ffdb5d885p-94, 0x1.179841589cdp-89 }, + { 0x1.2be9773700384p-94, 0x1.cda2d93f291abp-90 }, + { 0x1.eecef0206652cp-95, 0x1.7d0e0e7cac5bp-90 }, + { 0x1.9821029662ccfp-95, 0x1.3a804f20fd2f4p-90 }, + { 0x1.5097c74b3d08ep-95, 0x1.038a34010e13fp-90 }, + { 0x1.158fcf12f6c8ep-95, 0x1.ac508371be502p-91 }, + { 0x1.c9b60c296975dp-96, 0x1.61608ea10db83p-91 }, + { 0x1.7958bc88e6006p-96, 0x1.2383e3bce375p-91 }, + { 0x1.370dfa8e149d1p-96, 0x1.e0e820ef7463p-92 }, + { 0x1.0060a594f59c7p-96, 0x1.8c9f67fa9c048p-92 }, + { 0x1.a6925bee98d74p-97, 0x1.471203b047e85p-92 }, + { 0x1.5c351b499632p-97, 0x1.0dae92b93887p-92 }, + { 0x1.1ee518d278c58p-97, 0x1.bcabf2ba981bfp-93 }, + { 0x1.d8b2f8b0b2924p-98, 0x1.6e8f25135d13fp-93 }, + { 0x1.855f0a34582a6p-98, 0x1.2e219acb023aep-93 }, + { 0x1.40b1881e58e3p-98, 0x1.f1fe817902cebp-94 }, + { 0x1.0818d80634105p-98, 0x1.9a5d5233d8e13p-94 }, + { 0x1.b2ecbb2e8d76cp-99, 0x1.521d0766f8b85p-94 }, + { 0x1.6614d9da549fbp-99, 0x1.168c985c93c95p-94 }, + { 0x1.26c7736a63e7fp-99, 0x1.cae6809d7d445p-95 }, + { 0x1.e546a107b57d5p-100, 0x1.79f71edd3cb51p-95 }, + { 0x1.8f64020effd9cp-100, 0x1.37443c37e4835p-95 }, + { 0x1.48aa64075b15p-100, 0x1.004e8297ce819p-95 }, + { 0x1.0e6e891142764p-100, 0x1.a60ceba01346ap-96 }, + { 0x1.bcfa525d16889p-101, 0x1.5b71dfbe662f9p-96 }, + { 0x1.6e0be1ed4e4ccp-101, 0x1.1dfe04c5b884ap-96 }, + { 0x1.2d14568fa3103p-101, 0x1.d6c299b6b03dep-97 }, + { 0x1.ef39c9c67da7p-102, 0x1.8366f8264d161p-97 }, + { 0x1.973b86e9a718fp-102, 0x1.3ec401194be5fp-97 }, + { 0x1.4ed55e6d4d5dfp-102, 0x1.0641ea45be131p-97 }, + { 0x1.1345b1de4a541p-102, 0x1.af7b06dd7c2fap-98 
}, + { 0x1.c48e8cf8e20edp-103, 0x1.62e7924beab28p-98 }, + { 0x1.73f6cd7db5a56p-103, 0x1.23e2123cac1dcp-98 }, + { 0x1.31afb2e91937bp-103, 0x1.e00be39adba8fp-99 }, + { 0x1.f6600b76754fcp-104, 0x1.8ab4ee2717624p-99 }, + { 0x1.9cc2881babafp-104, 0x1.447fa5b4e25fep-99 }, + { 0x1.5316d5b010b17p-104, 0x1.0abf02c055867p-99 }, + { 0x1.1688993cfebe3p-104, 0x1.b67d9f35f4de8p-100 }, + { 0x1.c98758b0a4ebap-105, 0x1.685ccfe1e2ab5p-100 }, + { 0x1.77baf72da4868p-105, 0x1.281e65593d67p-100 }, + { 0x1.3484c1e2418cbp-105, 0x1.e698bd1000fd2p-101 }, + { 0x1.fa991c211034p-106, 0x1.8fc0326c87b11p-101 }, + { 0x1.9fe006460b912p-106, 0x1.485d5ed97243ep-101 }, + { 0x1.555b844a27ecdp-106, 0x1.0db191585c5a2p-101 }, + { 0x1.182875c9f3984p-106, 0x1.baf50ff65044dp-102 }, + { 0x1.cbce2423a80acp-107, 0x1.6bb8ebe73c54ap-102 }, + { 0x1.794741d4d28c6p-107, 0x1.2a9fd1221e357p-102 }, + { 0x1.3586a18110b0ep-107, 0x1.ea4b746dbeae3p-103 }, + { 0x1.fbd1c1dcb3991p-108, 0x1.9271dfe5687e7p-103 }, + { 0x1.a085cf5d6c87ep-108, 0x1.4a4b9ae2c857dp-103 }, + { 0x1.559911f8b7812p-108, 0x1.0f0c2d578f06ap-103 }, + { 0x1.181ddd71c27fbp-108, 0x1.bccd0201398bap-104 }, + { 0x1.cb5889458c00ep-109, 0x1.6cec95dfef21ap-104 }, + { 0x1.789499da6bff1p-109, 0x1.2b5ae7721763fp-104 }, + { 0x1.34b0b5ddf82c6p-109, 0x1.eb1327842cc63p-105 }, + { 0x1.fa04646636ebep-110, 0x1.92bda7bca05b7p-105 }, + { 0x1.9eb0ea42d451ep-110, 0x1.4a4186866270ap-105 }, + { 0x1.53ce6234f7db7p-110, 0x1.0ec8a57831ec5p-105 }, + { 0x1.1668fdbb007d5p-110, 0x1.bbfd05e1b64f3p-106 }, + { 0x1.c8289c5fd0187p-111, 0x1.6bf24d893426cp-106 }, + { 0x1.75a62b0407aefp-111, 0x1.2a4c4fb42b862p-106 }, + { 0x1.3206cc37b0e4ap-111, 0x1.e8ec43d273fbap-107 }, + { 0x1.f53937c26236ep-112, 0x1.90a22ee0d506ep-107 }, + { 0x1.9a69ad7793258p-112, 0x1.483f4fee6553cp-107 }, + { 0x1.50039cbf56e41p-112, 0x1.0ce82f0139653p-107 }, + { 0x1.13119a81ee824p-112, 0x1.b888d3fea2a71p-108 }, + { 0x1.c24cdc6a6909bp-113, 0x1.68ce8cbb7eaebp-108 }, + { 0x1.7089487e1182ep-113, 0x1.2778e05f0f826p-108 }, + { 0x1.2d94fe2dcd5a4p-113, 0x1.e3e0a1bcb7b9p-109 }, + { 0x1.ed85fe218f015p-114, 0x1.8c29185861611p-109 }, + { 0x1.93c37ffa2be3p-114, 0x1.444e2559eb861p-109 }, + { 0x1.4a49efe08b764p-114, 0x1.09735c9244f77p-109 }, + { 0x1.0e26d33274acdp-114, 0x1.b28030446d467p-110 }, + { 0x1.b9dfc560135fp-115, 0x1.638fa554a9791p-110 }, + { 0x1.6955081ac80b2p-115, 0x1.22ed7a20d2031p-110 }, + { 0x1.276f565251c73p-115, 0x1.dc07399fb9ebdp-111 }, + { 0x1.e30d639687648p-116, 0x1.8566bbf3afdccp-111 }, + { 0x1.8adc46e842374p-116, 0x1.3e7fef514c8f7p-111 }, + { 0x1.42bb0eedd3fb2p-116, 0x1.0479dd0162987p-111 }, + { 0x1.07beb0edff1b8p-116, 0x1.a9fe7272a642bp-112 }, + { 0x1.af070915be74ep-117, 0x1.5c4d5495043b3p-112 }, + { 0x1.602994f04daa5p-117, 0x1.1cbea64272b5fp-112 }, + { 0x1.1fb139d7ad13p-117, 0x1.d18375dee0b86p-113 }, + { 0x1.d5fdfa65dd70dp-118, 0x1.7c798c690caf6p-113 }, + { 0x1.7fdb85ec65bd4p-118, 0x1.36eec953c25e3p-113 }, + { 0x1.39787263ebbcap-118, 0x1.fc2409fc1812ep-114 }, + { 0x1.ffeb0495cc103p-119, 0x1.9f29b80329143p-114 }, + { 0x1.a1f276c1aeb71p-119, 0x1.5328106ecc8f8p-114 }, + { 0x1.552f40714fe54p-119, 0x1.1507fc4d2f4bap-114 }, + { 0x1.167c9d827337cp-119, 0x1.c484291d11ffp-115 }, + { 0x1.c690e28b6a9bfp-120, 0x1.7189333483e3bp-115 }, + { 0x1.72f13b97db104p-120, 0x1.2dbc3e931f24dp-115 }, + { 0x1.2eaa616a9b21cp-120, 0x1.ecb050b3055ap-116 }, + { 0x1.edda16b7edc87p-121, 0x1.9231c8255bcdbp-116 }, + { 0x1.92da9c960076ap-121, 0x1.4848161f4e509p-116 }, + { 0x1.48955baf138afp-121, 0x1.0beb55467080ap-116 }, + { 0x1.0bf90e157d9dap-121, 0x1.b542338309321p-117 }, + 
{ 0x1.b5082a5d8de09p-122, 0x1.64c56b8fb3cecp-117 }, + { 0x1.6454856772fedp-122, 0x1.231052b5f7dd6p-117 }, + { 0x1.227ecea87251dp-122, 0x1.dadb937ed07ebp-118 }, + { 0x1.d99724acabf71p-123, 0x1.834eb55a1d18ep-118 }, + { 0x1.81ff31715569ap-123, 0x1.3bdc43dd8955fp-118 }, + { 0x1.3a90e48619574p-123, 0x1.018fd4cd15479p-118 }, + { 0x1.005296113b586p-123, 0x1.a3fee5158c03fp-119 }, + { 0x1.a1acf8c750894p-124, 0x1.5664a8518a142p-119 }, + { 0x1.54421936100c1p-124, 0x1.171860917e7c8p-119 }, + { 0x1.152813e135602p-124, 0x1.c6f152728fb8fp-120 }, + { 0x1.c375a4cba7b23p-125, 0x1.72bf4ab4db677p-120 }, + { 0x1.6fa5568fa20f3p-125, 0x1.2e18c95c4bfb1p-120 }, + { 0x1.2b5b13ef0805cp-125, 0x1.ec41a3d4cf576p-121 }, + { 0x1.e77117811a7d2p-126, 0x1.91022d83bf8f5p-121 }, + { 0x1.8ccd934db2cbp-126, 0x1.46a292659269ep-121 }, + { 0x1.42faa33070d2ap-126, 0x1.0a05da41d6048p-121 }, + { 0x1.06db98d7f6125p-126, 0x1.b14375f322de2p-122 }, + { 0x1.abcdbdfcc9f7cp-127, 0x1.60c75486158bp-122 }, + { 0x1.5c15c23fbb403p-127, 0x1.1f35bc35fb59fp-122 }, + { 0x1.1b2fdb7cab6dfp-127, 0x1.d39954e0a9d3dp-123 }, + { 0x1.ccb8a64624f6cp-128, 0x1.7c98ab66270f5p-123 }, + { 0x1.76bb52e82b59ap-128, 0x1.35be6eb898758p-123 }, + { 0x1.30c117f001ac3p-128, 0x1.f819edd38db9cp-124 }, + { 0x1.efa0e49e3feccp-129, 0x1.9a2821242ebdp-124 }, + { 0x1.92fa046d58d4ep-129, 0x1.4dadd528d6ea9p-124 }, + { 0x1.479ae4e865feep-129, 0x1.0f6d9e092345cp-124 }, + { 0x1.0a4c603089f16p-129, 0x1.b987187720ae4p-125 }, + { 0x1.b0e03e96a5485p-130, 0x1.6711ad9310ce1p-125 }, + { 0x1.5fc89a9e03199p-130, 0x1.23f97aea9f29fp-125 }, + { 0x1.1dd90a3522c75p-130, 0x1.dac6b554960ffp-126 }, + { 0x1.d07c0b8b30398p-131, 0x1.81f77dc55f2bdp-126 }, + { 0x1.795540ea5dda7p-131, 0x1.39bb36d1a51dap-126 }, + { 0x1.327f191dd6247p-131, 0x1.fdf7c425dfb89p-127 }, + { 0x1.f1db008e061d6p-132, 0x1.9e6c7f42ee3ap-127 }, + { 0x1.944b7c8850269p-132, 0x1.50bd38f4b0e14p-127 }, + { 0x1.4846e1e475567p-132, 0x1.11954fcd9d596p-127 }, + { 0x1.0a8512d6deebp-132, 0x1.bc7d8a23288e1p-128 }, + { 0x1.b0b57b848dfd5p-133, 0x1.69099571fea27p-128 }, + { 0x1.5f385601a1095p-133, 0x1.25378a982372p-128 }, + { 0x1.1d0aee3f21eaep-133, 0x1.dc36feecfa2bap-129 }, + { 0x1.ce9ce0f1b56b8p-134, 0x1.82a9fb7ad076bp-129 }, + { 0x1.775af322a6fb6p-134, 0x1.39ea243c7bf71p-129 }, + { 0x1.3084e2fb958e5p-134, 0x1.fda4af81b306ap-130 }, + { 0x1.ee0aaff5c7275p-135, 0x1.9da7a2c5ab52cp-130 }, + { 0x1.90b5b261712acp-135, 0x1.4fb44aa933f5cp-130 }, + { 0x1.44f853ca3d2a1p-135, 0x1.1068e39733d5fp-130 }, + { 0x1.07839b24e2329p-135, 0x1.ba0b385a9673fp-131 }, + { 0x1.ab4ef712ea53cp-136, 0x1.669cb88b98bb4p-131 }, + { 0x1.5a6a27edc2aafp-136, 0x1.22e458ff074e2p-131 }, + { 0x1.18ccfb2383c0dp-136, 0x1.d7dccacf16bdfp-132 }, + { 0x1.c72c7d427b5c7p-137, 0x1.7ea9a57d9c3fdp-132 }, + { 0x1.70debd3477d7cp-137, 0x1.364981b4fcaccp-132 }, + { 0x1.2ae4c8505c4dcp-137, 0x1.f723b60a4c45ap-133 }, + { 0x1.e45347f37826dp-138, 0x1.97e0b5db827a8p-133 }, + { 0x1.8859d9d834871p-138, 0x1.4a9cae44d02aap-133 }, + { 0x1.3dcdd6f53a761p-138, 0x1.0bf347561e06fp-133 }, + { 0x1.0163c7a1b8ce3p-138, 0x1.b246ea577dcd5p-134 }, + { 0x1.a0de9e4d0326ap-139, 0x1.5fe1a8f2ffd47p-134 }, + { 0x1.518a7407eb90ep-139, 0x1.1d15869af1a46p-134 }, + { 0x1.1146574533e59p-139, 0x1.cde08f63664fdp-135 }, + { 0x1.ba6f77161f191p-140, 0x1.761ba88bf6eedp-135 }, + { 0x1.661c59f17faep-140, 0x1.2efafc89163c3p-135 }, + { 0x1.21d2894bdd4c7p-140, 0x1.eab12c8aa7e5p-136 }, + { 0x1.d50e0eba3e44dp-141, 0x1.8d4d432dee077p-136 }, + { 0x1.7b84a5753cf1fp-141, 0x1.41a589d11cb19p-136 }, + { 0x1.33091416396dbp-141, 0x1.045db9ec2ba81p-136 }, 
+ { 0x1.f0bb3ff173143p-142, 0x1.a57861242277fp-137 }, + { 0x1.91c3cacc75aaap-142, 0x1.551681b8d361p-137 }, + { 0x1.44ea256a84bbp-142, 0x1.140098b38820cp-137 }, + { 0x1.06bb841410434p-142, 0x1.be9e2feb561ep-138 }, + { 0x1.a8d98b0d5771p-143, 0x1.694e9fdcb7be5p-138 }, + { 0x1.57755a2313bdfp-143, 0x1.24419d9ce37ffp-138 }, + { 0x1.15a03d39bca43p-143, 0x1.d8bf1578b3aacp-139 }, + { 0x1.c0c4e9f387792p-144, 0x1.7e4dfe2cee6a2p-139 }, + { 0x1.6aa9b63079411p-144, 0x1.3520b0bf08a51p-139 }, + { 0x1.250ad98a67e4fp-144, 0x1.f3daa3dd37f3ap-140 }, + { 0x1.d9842421f4af1p-145, 0x1.94140b3abb78ep-140 }, + { 0x1.7e859d0226582p-145, 0x1.469d2facc66f7p-140 }, + { 0x1.34f9e5d4c96d3p-145, 0x1.07f7c6b04c092p-140 }, + { 0x1.f314a5f5af6d7p-146, 0x1.aa9f80ec12e52p-141 }, + { 0x1.9306ca687d568p-146, 0x1.58b5e63278412p-141 }, + { 0x1.456b681315dafp-146, 0x1.167dcc97a0fd3p-141 }, + { 0x1.06b98180e66fp-146, 0x1.c1ee5bab4ede7p-142 }, + { 0x1.a82a4c036e3f3p-147, 0x1.6b69077bfc3c7p-142 }, + { 0x1.565cda5d05a6ap-147, 0x1.257dcc5bc2717p-142 }, + { 0x1.144d77262f022p-147, 0x1.d9fdd2296338fp-143 }, + { 0x1.bdec7b50a66cp-148, 0x1.7eb427b4ddd71p-143 }, + { 0x1.67cb265d8483ap-148, 0x1.34f5aee91217p-143 }, + { 0x1.224399b226996p-148, 0x1.f2ca4dc8ff69fp-144 }, + { 0x1.d448f86c23d12p-149, 0x1.92943634830d2p-144 }, + { 0x1.79b2a15ae0faap-149, 0x1.44e2d8e947442p-144 }, + { 0x1.3098d833c2dap-149, 0x1.0627b1e47c261p-144 }, + { 0x1.eb3aa595948f3p-150, 0x1.a705784809825p-145 }, + { 0x1.8c0f08dff4e68p-150, 0x1.554226cd542efp-145 }, + { 0x1.3f49a8880f6adp-150, 0x1.1343e7a202e9p-145 }, + { 0x1.015dd1c62a082p-150, 0x1.bc0384ab3550dp-146 }, + { 0x1.9edb80143a705p-151, 0x1.660fe966c4e28p-146 }, + { 0x1.4e52056f2dec4p-151, 0x1.20b6b60dae611p-146 }, + { 0x1.0d62a769875ep-151, 0x1.d1893fc15ba16p-147 }, + { 0x1.b2128dd015485p-152, 0x1.7747e31ddd25cp-147 }, + { 0x1.5dad6d3a16694p-152, 0x1.2e7c997078049p-147 }, + { 0x1.19a81ef58dfc6p-152, 0x1.e790d89e8e564p-148 }, + { 0x1.c5ae1b79c4ee8p-153, 0x1.88e545d12ba57p-148 }, + { 0x1.6d56e11abc8a7p-153, 0x1.3c919aea9787p-148 }, + { 0x1.262a204b39df1p-153, 0x1.fe13c6f07b6aep-149 }, + { 0x1.d9a774b67b183p-154, 0x1.9ae2b16a9550ap-149 }, + { 0x1.7d48e51f6d6edp-154, 0x1.4af14f857334ep-149 }, + { 0x1.32e43016e50e4p-154, 0x1.0a8564eab8ff5p-149 }, + { 0x1.edf747f9f14f1p-155, 0x1.ad3a33350402p-150 }, + { 0x1.8d7d80e14b91p-155, 0x1.5996d7e13f467p-150 }, + { 0x1.3fd1708b687cbp-155, 0x1.1636f3d76858ap-150 }, + { 0x1.014ad3fec9ec4p-155, 0x1.bfe545fce7a55p-151 }, + { 0x1.9dee40ecc2982p-156, 0x1.687ce08618977p-151 }, + { 0x1.4ceca2b27454p-156, 0x1.221a377d62eb4p-151 }, + { 0x1.0bbd071377b87p-156, 0x1.d2dcd30499eb7p-152 }, + { 0x1.ae9438e9a5c0bp-157, 0x1.779da2df7a30cp-152 }, + { 0x1.5a30285652adp-157, 0x1.2e2a7c1fe1c5fp-152 }, + { 0x1.164daef1c2b15p-157, 0x1.e61933d473856p-153 }, + { 0x1.bf6806876a635p-158, 0x1.86f2e6e7e582ap-153 }, + { 0x1.67960688424efp-158, 0x1.3a62b4892ce6ep-153 }, + { 0x1.20f7f47f404a7p-158, 0x1.f99234ed0089ep-154 }, + { 0x1.d061d530972c5p-159, 0x1.9676058974913p-154 }, + { 0x1.7517e8c57f622p-159, 0x1.46bd7c1e28efp-154 }, + { 0x1.2bb6ba79809edp-159, 0x1.069f8cb02119fp-154 }, + { 0x1.e17962871247p-160, 0x1.a61febb6d574dp-155 }, + { 0x1.82af24bbe81ddp-160, 0x1.53351984f5d61p-155 }, + { 0x1.3684a09debb18p-160, 0x1.108b4faaa8971p-155 }, + { 0x1.f2a603a977e7cp-161, 0x1.b5e91e3ee196dp-156 }, + { 0x1.9054beadf5a51p-161, 0x1.5fc381e001854p-156 }, + { 0x1.415c074fc9065p-161, 0x1.1a8782bc000bep-156 }, + { 0x1.01ef55a0092e3p-161, 0x1.c5c9be5ba37d4p-157 }, + { 0x1.9e016e74801cbp-162, 0x1.6c625c9dd5c05p-157 }, + { 
0x1.4c3713bae315dp-162, 0x1.248f08aa2a9f5p-157 }, + { 0x1.0a8cf82738469p-162, 0x1.d5b98efc2e8d5p-158 }, + { 0x1.abada51b7b47ep-163, 0x1.790b07dcc17ddp-158 }, + { 0x1.570fb47030aa8p-163, 0x1.2e9c8b4dec3dep-158 }, + { 0x1.13270ae279a57p-163, 0x1.e5affac730013p-159 }, + { 0x1.b951931589ad6p-164, 0x1.85b69d604d483p-159 }, + { 0x1.61dfa678e3296p-164, 0x1.38aa7fa8655e3p-159 }, + { 0x1.1bb88966006c4p-164, 0x1.f5a41ad29abd6p-160 }, + { 0x1.c6e52f00f28e6p-165, 0x1.925df815332e1p-160 }, + { 0x1.6ca07adb2cabep-165, 0x1.42b32a68b6433p-160 }, + { 0x1.243c4de072741p-165, 0x1.02c65f05a223cp-160 }, + { 0x1.d4603cf73627ep-166, 0x1.9ef9ba1f58105p-161 }, + { 0x1.774b9c8b0652p-166, 0x1.4cb0a4ddc2264p-161 }, + { 0x1.2cad15ed5f00dp-166, 0x1.0ab038a2ddd17p-161 }, + { 0x1.e1ba565f2f2dap-167, 0x1.ab82536c08c11p-162 }, + { 0x1.81da56c03901cp-167, 0x1.569ce24f30cadp-162 }, + { 0x1.350587b61e2e7p-167, 0x1.128ac3f80b9acp-162 }, + { 0x1.eeeaf2386ba73p-168, 0x1.b7f008c184953p-163 }, + { 0x1.8c45dba9ebaffp-168, 0x1.6071b5b7d5f0bp-163 }, + { 0x1.3d40375ab2fc9p-168, 0x1.1a5112ad78884p-163 }, + { 0x1.fbe96dd52dd2ap-169, 0x1.c43afb43abf3ap-164 }, + { 0x1.96874b77050b3p-169, 0x1.6a28d7dab475p-164 }, + { 0x1.4557ac9b8a4ffp-169, 0x1.21fe234726979p-164 }, + { 0x1.04568afbad70bp-169, 0x1.d05b30647f5b6p-165 }, + { 0x1.a097bba9c5bbap-170, 0x1.73bbedaae952fp-165 }, + { 0x1.4d4668bc3c638p-170, 0x1.298ce64edbc52p-165 }, + { 0x1.0a969821c25d4p-170, 0x1.dc489a35fd89p-166 }, + { 0x1.aa703eac27071p-171, 0x1.7d248efdebaf1p-166 }, + { 0x1.5506ec96ce1d8p-171, 0x1.30f843b6c62b7p-166 }, + { 0x1.10b0827e1c59fp-171, 0x1.e7fb2011e1175p-167 }, + { 0x1.b409eb99c2287p-172, 0x1.865c4d7ebd336p-167 }, + { 0x1.5c93bed6568e9p-172, 0x1.383b206d0bb99p-167 }, + { 0x1.169ff47b694c6p-172, 0x1.f36aa78ac249dp-168 }, + { 0x1.bd5de633517f7p-173, 0x1.8f5cbbd7e3bd9p-168 }, + { 0x1.63e7724f64774p-173, 0x1.3f5064180659dp-168 }, + { 0x1.1c60a3dd2224ep-173, 0x1.fe8f1d993bb19p-169 }, + { 0x1.c66566ef40333p-174, 0x1.981f750955121p-169 }, + { 0x1.6afcac6c09d1ap-174, 0x1.4632fef2669ecp-169 }, + { 0x1.21ee56dbc8c6ap-174, 0x1.04b03ffb7174ap-169 }, + { 0x1.cf19c31a391acp-175, 0x1.a09e23dee12dbp-170 }, + { 0x1.71ce2ba111a68p-175, 0x1.4cddefbe00daep-170 }, + { 0x1.2744e94597dfp-175, 0x1.09eb734c1a314p-170 }, + { 0x1.d77474fa3c96fp-176, 0x1.a8d28a7b21f9ep-171 }, + { 0x1.7856cde19858bp-176, 0x1.534c49c3a48ap-171 }, + { 0x1.2c60519b06073p-176, 0x1.0ef5469afe541p-171 }, + { 0x1.df6f23e67822ep-177, 0x1.b0b689ea896fp-172 }, + { 0x1.7e9197060941ap-177, 0x1.59793ad60d8abp-172 }, + { 0x1.313ca61e59763p-177, 0x1.13c9ee6b2a529p-172 }, + { 0x1.e703ac45eb1a5p-178, 0x1.b84429b1d33d8p-173 }, + { 0x1.8479b71b66ff2p-178, 0x1.5f60114dc317ap-173 }, + { 0x1.35d621cd7892fp-178, 0x1.1865baa279b03p-173 }, + { 0x1.ee2c2766d39aep-179, 0x1.bf759f4ae6481p-174 }, + { 0x1.8a0a908fbee34p-179, 0x1.64fc41f392bcdp-174 }, + { 0x1.3a29293d26666p-179, 0x1.1cc51b3533d1bp-174 }, + { 0x1.f4e2f320ed2f5p-180, 0x1.c645558315ad7p-175 }, + { 0x1.8f3fbe30bc1d8p-180, 0x1.6a496dcf4682p-175 }, + { 0x1.3e324f4cf0981p-180, 0x1.20e4a4b8e031ep-175 }, + { 0x1.fb22b934b993p-181, 0x1.ccadf3adb1afp-176 }, + { 0x1.941518f17ca26p-181, 0x1.6f4367d03dbd8p-176 }, + { 0x1.41ee59ab3f625p-181, 0x1.24c114d62226p-176 }, + { 0x1.00733b2d2d2a7p-181, 0x1.d2aa649df6e65p-177 }, + { 0x1.9886bd6d1085bp-182, 0x1.73e63a45afd4dp-177 }, + { 0x1.455a452136a6p-182, 0x1.285756918be22p-177 }, + { 0x1.0314c07978175p-182, 0x1.d835dd5ba6335p-178 }, + { 0x1.9c91111b6c15fp-183, 0x1.782e2c1c97a81p-178 }, + { 0x1.4873499e69a71p-183, 0x1.2ba486638ab1ep-178 }, + { 
0x1.0573c7a800f18p-183, 0x1.dd4be385e972p-179 }, + { 0x1.a030c72f0cf33p-184, 0x1.7c17c5d99552cp-179 }, + { 0x1.4b36ddfcc8743p-184, 0x1.2ea5f617d321fp-179 }, + { 0x1.078e5ec28bafdp-184, 0x1.e1e853589fe15p-180 }, + { 0x1.a362e51221b9fp-185, 0x1.7f9fd64579e1ap-180 }, + { 0x1.4da2bb75a5c65p-185, 0x1.3159306d0abdp-180 }, + { 0x1.0962c95c3eb5p-185, 0x1.e6076548c0765p-181 }, + { 0x1.a624c67aa97dfp-186, 0x1.82c376c3acddfp-181 }, + { 0x1.4fb4e0c13d49p-186, 0x1.33bbfc6dd55a6p-181 }, + { 0x1.0aef82f484486p-186, 0x1.e9a5b32d2ef52p-182 }, + { 0x1.a874210dbadcfp-187, 0x1.85800f4a2d262p-182 }, + { 0x1.516b94dabb86dp-187, 0x1.35cc607ce4fd8p-182 }, + { 0x1.0c33410fd4c56p-187, 0x1.ecc03cea2935dp-183 }, + { 0x1.aa4f078af0321p-188, 0x1.87d359f39448ep-183 }, + { 0x1.52c5696370c9dp-188, 0x1.3788a50e33e44p-183 }, + { 0x1.0d2cf5025ba2dp-188, 0x1.ef546c9652b0ap-184 }, + { 0x1.abb3ec79d594dp-189, 0x1.89bb66243bfd5p-184 }, + { 0x1.53c13ca08d951p-189, 0x1.38ef570827673p-184 }, + { 0x1.0ddbcd68fc943p-189, 0x1.f1601a115b514p-185 }, + { 0x1.aca1a45423b35p-190, 0x1.8b369b3c6ec4fp-185 }, + { 0x1.545e3b0f8838ap-190, 0x1.39ff49c7fe5e8p-185 }, + { 0x1.0e3f374dd9d68p-190, 0x1.f2e18e05495b4p-186 }, + { 0x1.ad1767288e013p-191, 0x1.8c43bad265564p-186 }, + { 0x1.549be08e15927p-191, 0x1.3ab798c59d4c2p-186 }, + { 0x1.0e56def61fbc4p-191, 0x1.f3d7844c8a592p-187 }, + { 0x1.ad14d1b2f0b5fp-192, 0x1.8ce1e26fb8214p-187 }, + { 0x1.5479f9137160bp-192, 0x1.3b17a8d383f04p-187 }, + { 0x1.0e22b05782284p-192, 0x1.f4412db819edfp-188 }, + { 0x1.ac99e5e7b9269p-193, 0x1.8d108ccedcd75p-188 }, + { 0x1.53f8a0f98a8b8p-193, 0x1.3b1f28f8795cap-188 }, + { 0x1.0da2d734853ffp-193, 0x1.f41e3132440dap-189 }, + { 0x1.aba70af1767bp-194, 0x1.8ccf9296410aep-189 }, + { 0x1.531844d58365ep-194, 0x1.3ace12e143377p-189 }, + { 0x1.0cd7bedf59779p-194, 0x1.f36eac3bc78c2p-190 }, + { 0x1.aa3d0ca096eedp-195, 0x1.8c1f2a8f92477p-190 }, + { 0x1.51d9a0dfd2e93p-195, 0x1.3a24aae988ae7p-190 }, + { 0x1.0bc211a3c2859p-195, 0x1.f23332c263066p-191 }, + { 0x1.a85d1a4e6bedcp-196, 0x1.8affe95ac6f2ap-191 }, + { 0x1.503dbfed30324p-196, 0x1.39237fbbcfa18p-191 }, + { 0x1.0a62b7d92f095p-196, 0x1.f06cce511da3ep-192 }, + { 0x1.a608c535a2ba1p-197, 0x1.8972c09d7f45cp-192 }, + { 0x1.4e45f9fa4adffp-197, 0x1.37cb698950bdap-192 }, + { 0x1.08bad69ed20a4p-197, 0x1.ee1cfc9be3df9p-193 }, + { 0x1.a341fe436d2d7p-198, 0x1.8778fdb058321p-193 }, + { 0x1.4bf3f24d273a5p-198, 0x1.361d88db2b95bp-193 }, + { 0x1.06cbce44363ecp-198, 0x1.eb45ad695330ap-194 }, + { 0x1.a00b13659be7cp-199, 0x1.851447ccc879bp-194 }, + { 0x1.4949952fc2371p-199, 0x1.341b44ff4c3c6p-194 }, + { 0x1.0497386163a39p-199, 0x1.e7e93fdecaep-195 }, + { 0x1.9c66ac5ae65b3p-200, 0x1.82469dbf1833ep-195 }, + { 0x1.464915486577bp-200, 0x1.31c64a141680ep-195 }, + { 0x1.021ee5a248c7fp-200, 0x1.e40a7f340982ap-196 }, + { 0x1.9857c70b8b2bcp-201, 0x1.7f125320f1e94p-196 }, + { 0x1.42f4e894cc71ap-201, 0x1.2f2086b6a5cf4p-196 }, + { 0x1.fec9b69351b7p-202, 0x1.dfac9ed4c27cep-197 }, + { 0x1.93e1b371520a1p-202, 0x1.7b7a0d21f0262p-197 }, + { 0x1.3f4fc50de840ap-202, 0x1.2c2c295822108p-197 }, + { 0x1.f8d6a0e0a9508p-203, 0x1.dad335f7aacdbp-198 }, + { 0x1.8f080f16c57cp-203, 0x1.7780bee4609a1p-198 }, + { 0x1.3b5c9cfaada16p-203, 0x1.28eb9d3f5000ap-198 }, + { 0x1.f269560bdbf92p-204, 0x1.d5823ab37d92ep-199 }, + { 0x1.89cec0363502dp-204, 0x1.7329a5753ca24p-199 }, + { 0x1.371e9af8e6ccfp-204, 0x1.2561873c1cc7ap-199 }, + { 0x1.eb86f931c309dp-205, 0x1.cfbdfc9b64d6ep-200 }, + { 0x1.8439f081b525ap-205, 0x1.6e7843670c8d2p-200 }, + { 0x1.32991dc38028ep-205, 0x1.2190c2136fc76p-200 }, + { 
0x1.e434fdd743954p-206, 0x1.c98b1eed08258p-201 }, + { 0x1.7e4e079de1a2ep-206, 0x1.69705c180d6c1p-201 }, + { 0x1.2dcfb3be31ebdp-206, 0x1.1d7c5aaa0949p-201 }, + { 0x1.dc7920bafc5dcp-207, 0x1.c2ee925b3e3f6p-202 }, + { 0x1.780fa5599d558p-207, 0x1.6415eeac7f744p-202 }, + { 0x1.28c6164ec1235p-207, 0x1.19278bf59ff34p-202 }, + { 0x1.d459605b63623p-208, 0x1.bbed8e8100752p-203 }, + { 0x1.71839bad6a45bp-208, 0x1.5e6d30c67b96bp-203 }, + { 0x1.2380250c57526p-208, 0x1.1495babbc8d8ep-203 }, + { 0x1.cbdbf53eed588p-209, 0x1.b48d8b08c37b5p-204 }, + { 0x1.6aaee88d3a5e6p-209, 0x1.587a8905112ebp-204 }, + { 0x1.1e01e0cda0c0ep-209, 0x1.0fca71267dd26p-204 }, + { 0x1.c3074a0c1c67dp-210, 0x1.acd43894c1f06p-205 }, + { 0x1.6396af97c5f7fp-210, 0x1.52428954b7c2fp-205 }, + { 0x1.184f669e7e645p-210, 0x1.0ac95a364b406p-205 }, + { 0x1.b9e1f37f768c9p-211, 0x1.a4c779750fb77p-206 }, + { 0x1.5c4033ae88d94p-211, 0x1.4bc9e91b546a8p-206 }, + { 0x1.126ceaa621095p-211, 0x1.05963d1a5105bp-206 }, + { 0x1.b072a84d6770bp-212, 0x1.9c6d5a387a6d7p-207 }, + { 0x1.54b0d08180ac6p-212, 0x1.45157f4a2e598p-207 }, + { 0x1.0c5eb30658611p-212, 0x1.0034f87652744p-207 }, + { 0x1.a6c038fdf5aedp-213, 0x1.93cc0a254a9f5p-208 }, + { 0x1.4cedf419a9b38p-213, 0x1.3e2a3c60327aap-208 }, + { 0x1.062912bcc23f9p-213, 0x1.f552fb3e1c70bp-209 }, + { 0x1.9cd187cff951cp-214, 0x1.8ae9d3a6eb66fp-209 }, + { 0x1.44fd186d008c2p-214, 0x1.370d2466d3327p-209 }, + { 0x1.ffa0c91caab55p-215, 0x1.e9ef97aa04b46p-210 }, + { 0x1.92ad80b12a09bp-215, 0x1.81cd14bd535bbp-210 }, + { 0x1.3ce3bd0683046p-215, 0x1.2fc348f3a8121p-210 }, + { 0x1.f2b20c0b002abp-216, 0x1.de47d70b3398cp-211 }, + { 0x1.885b1157e885cp-216, 0x1.787c377ac34cdp-211 }, + { 0x1.34a760cc47acap-216, 0x1.2851c338b22e4p-211 }, + { 0x1.e58ea51580badp-217, 0x1.d263d33512bb6p-212 }, + { 0x1.7de1218b19542p-217, 0x1.6efdaa9c0e45ep-212 }, + { 0x1.2c4d7bed4d522p-217, 0x1.20bdae2cd61c6p-212 }, + { 0x1.d83f3d3e6d15p-218, 0x1.c64ba5bdb46dep-213 }, + { 0x1.73468ba3c29b8p-218, 0x1.6557da47246f7p-213 }, + { 0x1.23db7a001a935p-218, 0x1.190c20d5b5808p-213 }, + { 0x1.cacc668087b83p-219, 0x1.ba075f0192b6p-214 }, + { 0x1.689215536317fp-219, 0x1.5b9128fb09361p-214 }, + { 0x1.1b56b45aac06fp-219, 0x1.114228bb99133p-214 }, + { 0x1.bd3e92f58e3aep-220, 0x1.ad9efd6e7e35p-215 }, + { 0x1.5dca68b92a62fp-220, 0x1.51afe8bbb6b6cp-215 }, + { 0x1.12c46cab86e91p-220, 0x1.0964c48f92b05p-215 }, + { 0x1.af9e0c680145ap-221, 0x1.a11a652260dp-216 }, + { 0x1.52f60dcf5b39p-221, 0x1.47ba5483b6e8fp-216 }, + { 0x1.0a29c7db10f7p-221, 0x1.0178df0b67157p-216 }, + { 0x1.a1f2ec5b27de2p-222, 0x1.948157e97fbd7p-217 }, + { 0x1.481b643932becp-222, 0x1.3db68a0470a4fp-217 }, + { 0x1.018bc93b8e2e5p-222, 0x1.f306942454ae6p-218 }, + { 0x1.9445149305037p-223, 0x1.87db6da6dd3cap-218 }, + { 0x1.3d409d78b6819p-223, 0x1.33aa83bd4deabp-218 }, + { 0x1.f1de9c1ab95aap-224, 0x1.e311742f9561bp-219 }, + { 0x1.869c2824b4b6bp-224, 0x1.7b300d303ed2cp-219 }, + { 0x1.326bb792c8c5bp-224, 0x1.299c1370fc2d1p-219 }, + { 0x1.e0b212b870715p-225, 0x1.d31b83aa1a53bp-220 }, + { 0x1.78ff85165ac91p-225, 0x1.6e8665a634affp-220 }, + { 0x1.27a27826da7a5p-225, 0x1.1f90dcff1976ep-220 }, + { 0x1.cf9b0072f8176p-226, 0x1.c32d9c998168ap-221 }, + { 0x1.6b763e947db08p-226, 0x1.61e5684f4d137p-221 }, + { 0x1.1cea67fe8699cp-226, 0x1.158e51a7ac97ep-221 }, + { 0x1.bea20cad09b1fp-227, 0x1.b350464c51c99p-222 }, + { 0x1.5e0717c155a1cp-227, 0x1.5553c2fc66728p-222 }, + { 0x1.1248cf18568a2p-227, 0x1.0b99abbccdbb1p-222 }, + { 0x1.adcf760300963p-228, 0x1.a38baebfb68e4p-223 }, + { 0x1.50b87f214792dp-228, 0x1.48d7dafad7ffep-223 }, + 
{ 0x1.07c2b12fe4dbap-228, 0x1.01b7eac5ea688p-223 }, + { 0x1.9d2b0d0c4a0b1p-229, 0x1.93e7a4bb0743p-224 }, + { 0x1.43908aa677d25p-229, 0x1.3c77c897ed254p-224 }, + { 0x1.fab995891c153p-230, 0x1.efdba02e2ceffp-225 }, + { 0x1.8cbc2fe600108p-230, 0x1.846b92a47c343p-225 }, + { 0x1.3694f45c1b92fp-230, 0x1.30395337f89bbp-225 }, + { 0x1.e6371d3dc0233p-231, 0x1.dc7fb7bbca8adp-226 }, + { 0x1.7c89c6867890ep-231, 0x1.751e7a10e8264p-226 }, + { 0x1.29cb17b0f706bp-231, 0x1.2421ee0211f87p-226 }, + { 0x1.d20647a807a0cp-232, 0x1.c9649548abac7p-227 }, + { 0x1.6c9a3fd812077p-232, 0x1.6606f00ed6d5dp-227 }, + { 0x1.1d37ef5f490cdp-232, 0x1.1836b52067807p-227 }, + { 0x1.be2ec88ae1479p-233, 0x1.b6922692e74d4p-228 }, + { 0x1.5cf38f9818abfp-233, 0x1.572b1a2c0293ap-228 }, + { 0x1.10e013ef486f7p-233, 0x1.0c7c6b93f06a1p-228 }, + { 0x1.aab7b734b99f6p-234, 0x1.a40fcadcdd133p-229 }, + { 0x1.4d9b2cf546b09p-234, 0x1.4890ac32b69b5p-229 }, + { 0x1.04c7bad04b57cp-234, 0x1.00f779993bbc1p-229 }, + { 0x1.97a78d5f1c6dbp-235, 0x1.91e450ac30542p-230 }, + { 0x1.3e9611e8218p-235, 0x1.3a3ce69b6a143p-230 }, + { 0x1.f1e56c0773bb7p-236, 0x1.eb57d7362f984p-231 }, + { 0x1.850426f2df55dp-236, 0x1.8015f467ddd4p-231 }, + { 0x1.2fe8bb3e4f4d8p-236, 0x1.2c3495adab7d8p-231 }, + { 0x1.dac8e8a813f1fp-237, 0x1.d53ae35dbfa26p-232 }, + { 0x1.72d2c2a7422abp-237, 0x1.6eaa5fce4af3ap-232 }, + { 0x1.21972950f570dp-237, 0x1.1e7c114a57a33p-232 }, + { 0x1.c44004226dc17p-238, 0x1.bf9ebf2ac34cfp-233 }, + { 0x1.6118037139874p-238, 0x1.5da6aa3adb7a3p-233 }, + { 0x1.13a4e15d42467p-238, 0x1.11173d5813f4dp-233 }, + { 0x1.ae501496e23f2p-239, 0x1.aa895a750e0f6p-234 }, + { 0x1.4fd7f2b705e64p-239, 0x1.4d0f59b16ac32p-234 }, + { 0x1.0614ef7575b09p-239, 0x1.04098aca1b898p-234 }, + { 0x1.98fdb1084fd1cp-240, 0x1.95ffef5a788b3p-235 }, + { 0x1.3f16033b4da17p-240, 0x1.3ce864a4f75bbp-235 }, + { 0x1.f1d3d20014dd3p-241, 0x1.eeabf27142ccbp-236 }, + { 0x1.844cb59a101a9p-241, 0x1.82070510e6e91p-236 }, + { 0x1.2ed514b22b68bp-241, 0x1.2d35346de60f3p-236 }, + { 0x1.d84bdf7421499p-242, 0x1.d5fe3202b4d44p-237 }, + { 0x1.7040489842ad7p-242, 0x1.6ea2738b3dbebp-237 }, + { 0x1.1f1777f205012p-242, 0x1.1df8a8637ba9cp-237 }, + { 0x1.bf956a62adf73p-243, 0x1.be0e1bcc5bf2bp-238 }, + { 0x1.5cdae0381ff94p-243, 0x1.5bd567e120a1cp-238 }, + { 0x1.0fdef3b187063p-243, 0x1.0f35198b8b7f7p-238 }, + { 0x1.a7b2fd5556b6ap-244, 0x1.a6df243f2c6f4p-239 }, + { 0x1.4a1e48fd99b8ep-244, 0x1.49a26968a8fd1p-239 }, + { 0x1.012cc9c3d142ap-244, 0x1.00ec5ed2dbe3ep-239 }, + { 0x1.90a652d08b6ecp-245, 0x1.9073f3afbdfebp-240 }, + { 0x1.380bacb3471d9p-245, 0x1.380b5f70c487dp-240 }, + { 0x1.e603798765b0ap-246, 0x1.e63fa380d130bp-241 }, + { 0x1.7a705e88ab4c8p-246, 0x1.7ace6e086aab7p-241 }, + { 0x1.26a399e180e7cp-246, 0x1.2711978a97cf7p-241 }, + { 0x1.cabc2c3d98d7cp-247, 0x1.cba0a72ae9c08p-242 }, + { 0x1.651157275ac6fp-247, 0x1.65efbb20adf2dp-242 }, + { 0x1.15e60bb1a2bacp-247, 0x1.16b5cc5019368p-242 }, + { 0x1.b08358e30e1b1p-248, 0x1.b1fca598944c3p-243 }, + { 0x1.5088c08941b89p-248, 0x1.51d84fa353951p-243 }, + { 0x1.05d2722aa0abep-248, 0x1.06f82c9619b9p-243 }, + { 0x1.9757d44a0d5d1p-249, 0x1.9953a1cf16aadp-244 }, + { 0x1.3cd5765cc7b51p-249, 0x1.3e87f66d27bbp-244 }, + { 0x1.eccf7568ff3afp-250, 0x1.efb0c5f0312cdp-245 }, + { 0x1.7f37a88128933p-250, 0x1.81a4d1085cfd1p-245 }, + { 0x1.29f5b70afae6ep-250, 0x1.2bfdda4e2b20cp-245 }, + { 0x1.cf48b1a182cb9p-251, 0x1.d2ab3b59164a6p-246 }, + { 0x1.682022c0d8296p-251, 0x1.6aeea740e7e26p-246 }, + { 0x1.17e72ed48d1c2p-251, 0x1.1a389017ca93cp-246 }, + { 0x1.b30c9decefa86p-252, 0x1.b6dd2d215fccfp-247 
},
+ { 0x1.520de188c8ff4p-252, 0x1.552ee415230cdp-247 },
+ { 0x1.06a7030db71fbp-252, 0x1.093620e33d9f9p-247 },
+ { 0x1.98166f02e00aap-253, 0x1.9c4336b720df7p-248 },
+ { 0x1.3cfce2d301755p-253, 0x1.40629fd47fda6p-248 },
+ { 0x1.ec63bac9af50ap-254, 0x1.f1e828f7f1e6ep-249 },
+ { 0x1.7e609b497d4bfp-254, 0x1.82d92bd0fbc5bp-249 },
+ { 0x1.28e89244647b5p-254, 0x1.2c8658b1c7fabp-249 },
+ { 0x1.cd07ee41894f6p-255, 0x1.d2def7b6139fbp-250 },
+ { 0x1.65e4eca3c47cep-255, 0x1.6a9a29142865ap-250 },
+ { 0x1.15cbd7439af48p-255, 0x1.1995fff959855p-250 },
+ { 0x1.af324889fe32ep-256, 0x1.b549f742691f7p-251 },
+ { 0x1.4e9c920d5db05p-256, 0x1.5380a4af4c2e9p-251 },
+ { 0x1.03a122e1077b7p-256, 0x1.078d07375b0bp-251 },
+ { 0x1.92d9bd168c63p-257, 0x1.9921acfd99f39p-252 },
+ { 0x1.388030ea8589cp-257, 0x1.3d867ecfb60a5p-252 },
+ { 0x1.e4c4faf832008p-258, 0x1.ecccda72dba49p-253 },
+ { 0x1.77f4a046c515ep-258, 0x1.7e5deef2de87bp-253 },
+ { 0x1.2387f5f4b712ep-258, 0x1.28a511d87ce7dp-253 },
+ { 0x1.c413282821079p-259, 0x1.cc3995b1e2c4p-254 },
+ { 0x1.5e78bc56d0fbbp-259, 0x1.64f5f80200f46p-254 },
+ { 0x1.0faba5af01355p-259, 0x1.14d5424501d7ep-254 },
+ { 0x1.a51f8a6830159p-260, 0x1.ad54bef9112dp-255 },
+ { 0x1.465b65a83bdbbp-260, 0x1.4ce07b8d50856p-255 },
+ { 0x1.f9c5589e7201fp-261, 0x1.020f8e226943ep-255 },
+ { 0x1.87dc5ad8af9ecp-261, 0x1.90123a8271991p-256 },
+ { 0x1.2f918e4d3f95cp-261, 0x1.3613b89391a8fp-256 },
+ { 0x1.d6485a170413ap-262, 0x1.e098381b76cd3p-257 },
+ { 0x1.6c3b66970be3dp-262, 0x1.7465697a54c64p-257 },
+ { 0x1.1a0fd8c3a4e6fp-262, 0x1.20858c20a1795p-257 },
+ { 0x1.b4ce217bd5e55p-263, 0x1.bf05934cfa1ccp-258 },
+ { 0x1.522e259c7017ap-263, 0x1.5a41409f84e49p-258 },
+ { 0x1.05caa9cf257c4p-263, 0x1.0c2b83023243dp-258 },
+ { 0x1.954427a430b11p-264, 0x1.9f5672cf62a4fp-259 },
+ { 0x1.39a5d07601e71p-264, 0x1.41985de8f7a14p-259 },
+ { 0x1.e56c72cc01fccp-265, 0x1.f1f5d5615d783p-260 },
+ { 0x1.7797a6e64ddc9p-265, 0x1.8179bfb69c631p-260 },
+ { 0x1.229374c83806p-265, 0x1.2a5d1d1f1ae5cp-260 },
+ { 0x1.c18d454a503aep-266, 0x1.cdd1c2bddbb9ep-261 },
+ { 0x1.5bb5b3e414ad3p-266, 0x1.655e203c78adp-261 },
+ { 0x1.0ce808921de57p-266, 0x1.1481ab5a1469ap-261 },
+ { 0x1.9fdfe587f056ap-267, 0x1.abd4ca4bd8884p-262 },
+ { 0x1.418b54bd6a895p-267, 0x1.4af20f59f283dp-262 },
+ { 0x1.f128f851039d9p-268, 0x1.fff032b2dbde7p-263 },
+ { 0x1.804c6e03f60cbp-268, 0x1.8be8c488684b4p-263 },
+ { 0x1.290596a08a94fp-268, 0x1.3223f2e5be0fp-263 },
+ { 0x1.cb1395c8187f6p-269, 0x1.d964d959533d1p-264 },
+ { 0x1.62bb1316ec5fcp-269, 0x1.6df780d5ecc43p-264 },
+ { 0x1.1211a1b47d3aep-269, 0x1.1ae2302fd4bcdp-264 },
+ { 0x1.a772150026811p-270, 0x1.b5455f4e2ce45p-265 },
+ { 0x1.47143aa78b5fep-270, 0x1.51eade2a24279p-265 },
+ { 0x1.f93996ba5e93dp-271, 0x1.051b3f15282e5p-265 },
+ { 0x1.8626f2553e204p-271, 0x1.93760037df87ap-266 },
+ { 0x1.2d4091cd12adcp-271, 0x1.37ace1ccc1a8dp-266 },
+ { 0x1.d1294db79df79p-272, 0x1.e17b7713cf17fp-267 },
+ { 0x1.6715149108678p-272, 0x1.73db39c4b278bp-267 },
+ { 0x1.1529206516167p-272, 0x1.1f27cc2724f9p-267 },
+ { 0x1.abce28a1f17f2p-273, 0x1.bb70eb3792a1cp-268 },
+ { 0x1.4a1fe3e55f964p-273, 0x1.5659e4463ddd1p-268 },
+ { 0x1.fd6eb54be7326p-274, 0x1.08462ba9624dbp-268 },
+ { 0x1.89049c51b8388p-274, 0x1.97f4ffe1284a1p-269 },
+ { 0x1.2f2b5e6789756p-274, 0x1.3ad748e88c53fp-269 },
+ { 0x1.d3aa617478594p-275, 0x1.e5e5db98318a5p-270 },
+ { 0x1.68a9e9f7b2f9ap-275, 0x1.76e6798f53e9ap-270 },
+ { 0x1.161c2a1de488ep-275, 0x1.21393590da64bp-270 },
+ { 0x1.acda38e82463bp-276, 0x1.be32dc731f12cp-271 },
+ { 0x1.4a9c33e05809ap-276, 0x1.5824d30f3fce1p-271 },
+ { 0x1.fdaf4969fc45p-277, 0x1.09660e736b8bdp-271 },
+ { 0x1.88d45a53c41c5p-277, 0x1.994b0856743cbp-272 },
+ { 0x1.2eba8f55fe897p-277, 0x1.3b9051c5e7679p-272 },
+ { 0x1.d287e1e77c85ap-278, 0x1.e689bae600601p-273 },
+ { 0x1.6770239fc87e6p-278, 0x1.77071c1633b26p-273 },
+ { 0x1.14e513c1b20dcp-278, 0x1.210a174166fcdp-273 },
+ { 0x1.aa90041143186p-279, 0x1.bd7abebe480e6p-274 },
+ { 0x1.488642c71cfa6p-279, 0x1.5740f6d4ed277p-274 },
+ { 0x1.f9f9ce5a157bbp-280, 0x1.0874302ee34fdp-274 },
+ { 0x1.85974997b931fp-280, 0x1.97701e51a6bfep-275 },
+ { 0x1.2bf0c37efc00bp-280, 0x1.39d3aac239fe2p-275 },
+ { 0x1.cdc89092e43c3p-281, 0x1.e36341a88ea0cp-276 },
+ { 0x1.636f0e2785c54p-281, 0x1.743c5e4db43f9p-276 },
+ { 0x1.118b19def65f8p-281, 0x1.1e9b8ad36fd99p-276 },
+ { 0x1.a4fd2c459c71p-282, 0x1.b94cde5e4fc3p-277 },
+ { 0x1.43ea7a73d5cfp-282, 0x1.53b3a109a94aep-277 },
+ { 0x1.f26454740b953p-283, 0x1.057635a1ed1dfp-277 },
+ { 0x1.7f60ab495565cp-283, 0x1.926f55b776f91p-278 },
+ { 0x1.26de8be09d876p-283, 0x1.35abb1f1cadefp-278 },
+ { 0x1.c5889cb51dbb9p-284, 0x1.dc853b381e5ap-279 },
+ { 0x1.5cbe6a335189cp-284, 0x1.6e96e5d005f5dp-279 },
+ { 0x1.0c22190c33c65p-284, 0x1.19fc0dba0e848p-279 },
+ { 0x1.9c42b0a7816acp-285, 0x1.b1c21d6e11086p-280 },
+ { 0x1.3ce41b9a97542p-285, 0x1.4d91f3701143cp-280 },
+ { 0x1.e71ba6efe048bp-286, 0x1.007de792cfd6ep-280 },
+ { 0x1.76552635a3b27p-286, 0x1.8a6663a0ececbp-281 },
+ { 0x1.1fa1c7f04e719p-286, 0x1.2f310e41037d6p-281 },
+ { 0x1.b9f88d1e59fb3p-287, 0x1.d2185735c5ad9p-282 },
+ { 0x1.538582347c59ep-287, 0x1.66381bdd98a02p-282 },
+ { 0x1.04c9ca3c242adp-287, 0x1.1346f1ba5a69ap-282 },
+ { 0x1.9093a8968bba5p-288, 0x1.a706fd9470fb8p-283 },
+ { 0x1.339c31e0d51b7p-288, 0x1.45000f1eec014p-283 },
+ { 0x1.d8619415342d3p-289, 0x1.f3510620184eap-284 },
+ { 0x1.6aa95f63dd017p-289, 0x1.7f84791f6fdbbp-284 },
+ { 0x1.16648113f6ec6p-289, 0x1.2689bc620188bp-284 },
+ { 0x1.ab5b65b277be7p-290, 0x1.c45998d7521aep-285 },
+ { 0x1.47f9aad3382fep-290, 0x1.5b50e4b7d6356p-285 },
+ { 0x1.f7591b1b1c875p-291, 0x1.0aa3508d5dbp-285 },
+ { 0x1.82335294ba26p-291, 0x1.9959eb6f64db6p-286 },
+ { 0x1.2848053b7dfb1p-291, 0x1.3a2fb2a16d1ccp-286 },
+ { 0x1.c68a6f5a8ef62p-292, 0x1.e23b370697cbbp-287 },
+ { 0x1.5c9ffcce7e5fdp-292, 0x1.720876851d9fbp-287 },
+ { 0x1.0b5b54d487d35p-292, 0x1.1be79c992aff6p-287 },
+ { 0x1.9a0421e5c5d71p-293, 0x1.b3980569c43a5p-288 },
+ { 0x1.3a5c4268d4e27p-293, 0x1.4e1fc4f822568p-288 },
+ { 0x1.e1fba80d34a41p-294, 0x1.0042910b94342p-288 },
+ { 0x1.7172912ec21f8p-294, 0x1.8908e30f7a1b3p-289 },
+ { 0x1.1b271db151968p-294, 0x1.2d5e5a1b8288ep-289 },
+ { 0x1.b1f9ef2d6b135p-295, 0x1.ce1b3b9ea6267p-290 },
+ { 0x1.4c872d1af92bcp-295, 0x1.623e8fb994f23p-290 },
+ { 0x1.fd87064e02a6fp-296, 0x1.0f8695160ca38p-290 },
+ { 0x1.8652a61cdcd3bp-296, 0x1.a031b186be289p-291 },
+ { 0x1.2af84a660968dp-296, 0x1.3eee8e04dc3ap-291 },
+ { 0x1.c9f07af149226p-297, 0x1.e8bd23cc416fp-292 },
+ { 0x1.5eacf76fffc0cp-297, 0x1.766e8d5583265p-292 },
+ { 0x1.0c80f3efbbf3fp-297, 0x1.1ed2fab014c43p-292 },
+ { 0x1.9b1f8ffd8f3c8p-298, 0x1.b76010ebb6c6ap-293 },
+ { 0x1.3ab5d5023fe4ap-298, 0x1.507d813502ab7p-293 },
+ { 0x1.e1c174ea2aaa6p-299, 0x1.01aa61c90eaccp-293 },
+ { 0x1.70b05029068dap-299, 0x1.8a90544ab274dp-294 },
+ { 0x1.1a1fba21de5fp-299, 0x1.2e0fb0911dd84p-294 },
+ { 0x1.afb70654af059p-300, 0x1.ce6f24739f7c7p-295 },
+ { 0x1.4a458b53b2a84p-300, 0x1.61eefc532711fp-295 },
+ { 0x1.f944d95c81983p-301, 0x1.0edb77098a96p-295 },
+ { 0x1.8272ab43f7156p-301, 0x1.9e82e04d9025fp-296 },
+ { 0x1.278886c5a4d73p-301, 0x1.3d237a2e0f859p-296 },
+ { 0x1.c3f57b512a1f2p-302, 0x1.e5385c7d0efep-297 },
+ { 0x1.598c52c5d1746p-302, 0x1.73258d0b919ebp-297 },
+ { 0x1.0828ad1da0983p-302, 0x1.1bdb57d01ceccp-297 },
+ { 0x1.93d4935512f54p-303, 0x1.b223e5e67d24ap-298 },
+ { 0x1.34a3670d3cd59p-303, 0x1.4bf43098a2ef1p-298 },
+ { 0x1.d7b67cefff216p-304, 0x1.fb93db1e39a21p-299 },
+ { 0x1.686e7356020d2p-304, 0x1.8402d3eada60ap-299 },
+ { 0x1.135e695d6d4f8p-304, 0x1.2892e3159736p-299 },
+ { 0x1.a4b6028e1ae52p-305, 0x1.c5502f868f04bp-300 },
+ { 0x1.415808da66669p-305, 0x1.5a670a5d83e0ep-300 },
+ { 0x1.ead51e60a821dp-306, 0x1.08ac71830fd4ep-300 },
+ { 0x1.76cfe88ffbfa7p-306, 0x1.9467d9d3bce7dp-301 },
+ { 0x1.1e2e61d740a91p-306, 0x1.34ea92731d6fp-301 },
+ { 0x1.b4f6c22875415p-307, 0x1.d7e402cf49a21p-302 },
+ { 0x1.4d8e03e448998p-307, 0x1.6860e96265ba8p-302 },
+ { 0x1.fd2c6816f010bp-308, 0x1.132f279000564p-302 },
+ { 0x1.8494b75728df1p-308, 0x1.a4356bd52863ep-303 },
+ { 0x1.28836b62851b4p-308, 0x1.40cac092d16a6p-303 },
+ { 0x1.c476ceb4ce0a6p-309, 0x1.e9bb8c8c45eaap-304 },
+ { 0x1.592d26553a529p-309, 0x1.75c6ad9777c96p-304 },
+ { 0x1.074be65f60432p-309, 0x1.1d3d889242361p-304 },
+ { 0x1.91a14719373e5p-310, 0x1.b34c7bf3e0108p-305 },
+ { 0x1.3248b33f78dd9p-310, 0x1.4c1bf325b5886p-305 },
+ { 0x1.d316bfa6ecf07p-311, 0x1.fab351a6d7271p-306 },
+ { 0x1.641dc398561efp-311, 0x1.827d8b273a859p-306 },
+ { 0x1.0f79d08c027e2p-311, 0x1.26c35a8453a6ep-306 },
+ { 0x1.9ddabce45ff88p-312, 0x1.c18e854f7a653p-307 },
+ { 0x1.3b6a0443345f1p-312, 0x1.56c727238c10ep-307 },
+ { 0x1.e0b830517633fp-313, 0x1.05545196af9e3p-307 },
+ { 0x1.6e4903f595976p-313, 0x1.8e6b62ae03487p-308 },
+ { 0x1.170eca4e7a4cap-313, 0x1.2facf384d3a3bp-308 },
+ { 0x1.a92756c27d93ap-314, 0x1.ceddf1e753b81p-309 },
+ { 0x1.43d40bf74392dp-314, 0x1.60b61e0028436p-309 },
+ { 0x1.ed3e286c4c0dep-315, 0x1.0cbd09b1e5e1p-309 },
+ { 0x1.77993389df313p-315, 0x1.997719e8b73a8p-310 },
+ { 0x1.1dfa945eaae99p-315, 0x1.37e77cf85ca37p-310 },
+ { 0x1.b36ec5aa0588p-316, 0x1.db1e802a6c81fp-311 },
+ { 0x1.4b749e64b35f5p-316, 0x1.69d3aa6fccfd9p-311 },
+ { 0x1.f88d823260c9ep-317, 0x1.1383f4dd09079p-311 },
+ { 0x1.7ffa0f1fabb65p-317, 0x1.a388f33976b7bp-312 },
+ { 0x1.242e12375b352p-317, 0x1.3f613589599c6p-312 },
+ { 0x1.bc9a844ffd2b5p-318, 0x1.e635a66e3ebe7p-313 },
+ { 0x1.523af73f84783p-318, 0x1.720bfb4a981d7p-313 },
+ { 0x1.0146a610e0588p-318, 0x1.199a49bcc51p-313 },
+ { 0x1.87590d6d36008p-319, 0x1.ac8ae259e160cp-314 },
+ { 0x1.299b80ea6bb7fp-319, 0x1.4609b0c4183cap-314 },
+ { 0x1.c496292aa266bp-320, 0x1.f00af26520f9dp-315 },
+ { 0x1.5817f72c95e4cp-320, 0x1.794ce31e24c7bp-315 },
+ { 0x1.059392396d038p-320, 0x1.1ef2877dbfcadp-315 },
+ { 0x1.8da5a346cbb3fp-321, 0x1.b468dc95cb829p-316 },
+ { 0x1.2e36a9eb80d32p-321, 0x1.4bd213115ac94p-316 },
+ { 0x1.cb4fb203e18ap-322, 0x1.f88862b544527p-317 },
+ { 0x1.5cfe5be9615c7p-322, 0x1.7f861b04cbe3ap-317 },
+ { 0x1.0923c6394f695p-322, 0x1.2380a7a548a2fp-317 },
+ { 0x1.92d18166ccd51p-323, 0x1.bb1122f6e5762p-318 },
+ { 0x1.31f510cb3f507p-323, 0x1.50ad48dd9b3a6p-318 },
+ { 0x1.d0b7c794af438p-324, 0x1.ff9ab8e5d6631p-319 },
+ { 0x1.60e2f23228dedp-324, 0x1.84a97f6b3e853p-319 },
+ { 0x1.0bef1906dac58p-324, 0x1.273a4b16ba84fp-319 },
+ { 0x1.96d0ca88e4fcp-325, 0x1.c07484e1da469p-320 },
+ { 0x1.34ce1af3c1b6p-325, 0x1.549037ceef1fep-320 },
+ { 0x1.d4c1f7c67dd18p-326, 0x1.0298e0fc06037p-320 },
+ { 0x1.63bcc0600e3b1p-326, 0x1.88ab45875f419p-321 },
+ { 0x1.0def17046c37ep-326, 0x1.2a16e161fa35fp-321 },
+ { 0x1.999a40ba75f42p-327, 0x1.c48699c75f345p-322 },
+ { 0x1.36bb3093bcf7fp-327, 0x1.5771e906a9978p-322 },
+ { 0x1.d764e5657aa2p-328, 0x1.04a04a1699caap-322 },
+ { 0x1.658528dc53bd5p-328, 0x1.8b822865b44e6p-323 },
+ { 0x1.0f1f1acd583cp-328, 0x1.2c0fc98ac934cp-323 },
+ { 0x1.9b2768ee2e28p-329, 0x1.c73df0b6d4334p-324 },
+ { 0x1.37b7d60833afbp-329, 0x1.594bab8ddacb1p-324 },
+ { 0x1.d89a6c43f4c1p-330, 0x1.05dee05833b3cp-324 },
+ { 0x1.663803afd90e2p-330, 0x1.8d278c9cbfc58p-325 },
+ { 0x1.0f7c5f2e4265p-330, 0x1.2d206b997c2ccp-325 },
+ { 0x1.9b74a41343d69p-331, 0x1.c89434d36542fp-326 },
+ { 0x1.37c1bd3bb9cfep-331, 0x1.5a192e33cf627p-326 },
+ { 0x1.d85fb90bdf218p-332, 0x1.0651bc0c61b2p-326 },
+ { 0x1.65d3aea4b609ep-332, 0x1.8d9799e5f2521p-327 },
+ { 0x1.0f0609e7aa674p-332, 0x1.2d464a6b30dc2p-327 },
+ { 0x1.9a813d2878f74p-333, 0x1.c88645e6c88eep-328 },
+ { 0x1.36d8ce9d2217bp-333, 0x1.59d89052b0525p-328 },
+ { 0x1.d6b5543d3c94p-334, 0x1.05f7d07f3fb02p-328 },
+ { 0x1.645913a262a36p-334, 0x1.8cd14a1185c8dp-329 },
+ { 0x1.0dbd2f003b6a5p-334, 0x1.2c810d60e767ep-329 },
+ { 0x1.984f6bfe6778p-335, 0x1.c714448c370a6p-330 },
+ { 0x1.34ff297cd534dp-335, 0x1.588a691f2cd1fp-330 },
+ { 0x1.d39f201da2255p-336, 0x1.04d1f01416963p-330 },
+ { 0x1.61cba521cabb4p-336, 0x1.8ad66d03eba59p-331 },
+ { 0x1.0ba4cc94c45b3p-336, 0x1.2ad281b8cc2ap-331 },
+ { 0x1.94e44c9a075e7p-337, 0x1.c44191b160ec2p-332 },
+ { 0x1.32391bcecdc03p-337, 0x1.5631c55b5d22cp-332 },
+ { 0x1.cf2449a3fda4bp-338, 0x1.02e2c911c7929p-332 },
+ { 0x1.5e3150cc8eda4p-338, 0x1.87aba1a7120bfp-333 },
+ { 0x1.08c1bf3c985fap-338, 0x1.283e938a586f7p-333 },
+ { 0x1.9047cb663bb8cp-339, 0x1.c014c17012593p-334 },
+ { 0x1.2e8d117dfdd44p-339, 0x1.52d41b7968429p-334 },
+ { 0x1.c94f2cb2815a8p-340, 0x1.002edb3674f27p-334 },
+ { 0x1.599268900e7bcp-340, 0x1.835843f5f0b0cp-335 },
+ { 0x1.051aaf415041dp-340, 0x1.24cb3e8b7d756p-335 },
+ { 0x1.8a84869fc8267p-341, 0x1.ba9781881c8a9p-336 },
+ { 0x1.2a037bab743e1p-341, 0x1.4e79366e7a47p-336 },
+ { 0x1.c22d2c350e306p-342, 0x1.f978cc962d426p-337 },
+ { 0x1.53f982a03a248p-342, 0x1.7de65083f0e21p-337 },
+ { 0x1.00b7f70f68972p-342, 0x1.208076f18ea3p-337 },
+ { 0x1.83a7a5a0b9d4dp-343, 0x1.b3d6740403453p-338 },
+ { 0x1.24a6b05eb3edap-343, 0x1.492b17a8d9ad4p-338 },
+ { 0x1.b9ce7efad864cp-344, 0x1.f126a42ab2a64p-339 },
+ { 0x1.4d7351162fad8p-344, 0x1.77623e1a3ca2fp-339 },
+ { 0x1.f74706d1f613cp-345, 0x1.1b680aeae0c3cp-339 },
+ { 0x1.7bc0a6e57fbc5p-345, 0x1.abe0fed214bcap-340 },
+ { 0x1.1e82c35430e3dp-345, 0x1.42f5d0cb0afebp-340 },
+ { 0x1.b045f25c98b4bp-346, 0x1.e77a20528f8f5p-341 },
+ { 0x1.460e7202036c7p-346, 0x1.6fdace394b03cp-341 },
+ { 0x1.ebd15c07c2acdp-347, 0x1.158d7d54f1681p-341 },
+ { 0x1.72e125d540295p-347, 0x1.a2c9115542385p-342 },
+ { 0x1.17a558b9c184fp-347, 0x1.3be755f8b210cp-342 },
+ { 0x1.a5a8a3f3de092p-348, 0x1.dc88f077bd369p-343 },
+ { 0x1.3ddb38ecb5b52p-348, 0x1.6760d57bb9982p-343 },
+ { 0x1.df2826b036578p-349, 0x1.0efdda755dbb3p-343 },
+ { 0x1.691c997f37f0ep-349, 0x1.98a2e123c782ep-344 },
+ { 0x1.101d72c627ff7p-349, 0x1.340f49a72211p-344 },
+ { 0x1.9a0db3d2b8dacp-350, 0x1.d06b3f65f6fdp-345 },
+ { 0x1.34eb72e63e592p-350, 0x1.5e06fcff790f4p-345 },
+ { 0x1.d166c8f34fca4p-351, 0x1.07c787991a68p-345 },
+ { 0x1.5e880d9f1fe43p-351, 0x1.8d849f54265f7p-346 },
+ { 0x1.07fb3b2ff1602p-351, 0x1.2b7ec30262d2bp-346 },
+ { 0x1.8d8df0cbffd52p-352, 0x1.c33b5a8ad639fp-347 },
+ { 0x1.2b52265317648p-352, 0x1.53e17e1a8afadp-347 },
+ { 0x1.c2aa6bd34f17bp-353, 0x1.fff41d2913dabp-348 },
+ { 0x1.5339d751ff2a1p-353, 0x1.818627da2e9e4p-348 },
+ { 0x1.fe9f93308c405p-354, 0x1.2248100f21115p-348 },
+ { 0x1.80438073219dep-354, 0x1.b515531d535ebp-349 },
+ { 0x1.21234fbc4a127p-354, 0x1.4905d9b84e0cbp-349 },
+ { 0x1.b31198aa5f8abp-355, 0x1.ef4bcc5f71a72p-350 },
+ { 0x1.474946f304456p-355, 0x1.74c0ac8d03b2bp-350 },
+ { 0x1.ec59d00f3fe38p-356, 0x1.187e74c209a91p-350 },
+ { 0x1.7249848679fa9p-356, 0x1.a6169b09c4411p-351 },
+ { 0x1.16739cec78bd4p-356, 0x1.3d8a8ccb26cd9p-351 },
+ { 0x1.a2bbd0795adeep-357, 0x1.ddb87127c2076p-352 },
+ { 0x1.3ace589cd3352p-357, 0x1.674e5d7be735cp-352 },
+ { 0x1.d949ad392f075p-358, 0x1.0e35e84d33d3fp-352 },
+ { 0x1.63bbbf78651ccp-358, 0x1.965d9f895d99cp-353 },
+ { 0x1.0b5827a3ba382p-358, 0x1.3186c3440696p-353 },
+ { 0x1.91c922f9ee4cp-359, 0x1.cb5d51a48d7d4p-354 },
+ { 0x1.2de164c74e725p-359, 0x1.594a1039f0199p-354 },
+ { 0x1.c5941f108d9d1p-360, 0x1.0382d1e479246p-354 },
+ { 0x1.54b639c219649p-360, 0x1.8609634a384ccp-355 },
+ { 0x1.ffcc62473097ap-361, 0x1.25120afe02122p-355 },
+ { 0x1.8059c757355aep-361, 0x1.b85e31314f4b4p-356 },
+ { 0x1.209ad26ca18d9p-361, 0x1.4acee7c0fcbafp-356 },
+ { 0x1.b15e18d0d2d12p-362, 0x1.f0f38c6449ad9p-357 },
+ { 0x1.4554e9983b016p-362, 0x1.753919ff4b182p-357 },
+ { 0x1.e865bf893f8f4p-363, 0x1.1844080030d76p-357 },
+ { 0x1.6e8db855aac9ap-363, 0x1.a4dede3a3eb93p-358 },
+ { 0x1.1312cc0ae5d04p-363, 0x1.3bf7fe7aa33ap-358 },
+ { 0x1.9ccc1bfbf7ecbp-364, 0x1.da5e8d4d639edp-359 },
+ { 0x1.35b35e7d0088ep-364, 0x1.640bc7176cda7p-359 },
+ { 0x1.d0a5ff60b92cfp-365, 0x1.0b342b640cc13p-359 },
+ { 0x1.5c84558f35d95p-365, 0x1.9102c47629cb9p-360 },
+ { 0x1.0560f8bafb2c7p-365, 0x1.2ce013e375d0fp-360 },
+ { 0x1.8801ce509ea26p-366, 0x1.c36f07720a932p-361 },
+ { 0x1.25ec7207b3c64p-366, 0x1.529fe13854ed9p-361 },
+ { 0x1.b8b58f7c67c36p-367, 0x1.fbf2dc269c35dp-362 },
+ { 0x1.4a5c0b3b7424dp-367, 0x1.7cec854a40ddcp-362 },
+ { 0x1.ef3874e46141bp-368, 0x1.1da13f1aaaee6p-362 },
+ { 0x1.732197e24d857p-368, 0x1.ac4c46230c45cp-363 },
+ { 0x1.1619ff0ea7ec6p-368, 0x1.4112fbeff8a1fp-363 },
+ { 0x1.a0bb46a0a2c53p-369, 0x1.e15420dda8758p-364 },
+ { 0x1.383201c8ba71ap-369, 0x1.68bd97eb5b05dp-364 },
+ { 0x1.d3b4e4b894768p-370, 0x1.0e54a78756b6bp-364 },
+ { 0x1.5e4c4aaef013p-370, 0x1.951c14f527745p-365 },
+ { 0x1.0654a030d3e7p-370, 0x1.2f8178dd14a04p-365 },
+ { 0x1.88dc03d1ca801p-371, 0x1.c6b6bf9361ee4p-366 },
+ { 0x1.2621d65152a67p-371, 0x1.5495f2949c65ep-366 },
+ { 0x1.b860981f4834ap-372, 0x1.fe24891c8ca0cp-367 },
+ { 0x1.49a0d4c97c281p-372, 0x1.7e02609a87253p-367 },
+ { 0x1.ed66ed1143993p-373, 0x1.1e064158c947bp-367 },
+ { 0x1.713a5a10cc9bp-373, 0x1.ac4304f253262p-368 },
+ { 0x1.14455cbbff469p-373, 0x1.4093bdea6e36fp-368 },
+ { 0x1.9d62205df47a6p-374, 0x1.dfe14a435c3c2p-369 },
+ { 0x1.353bfdeb15aa4p-374, 0x1.6720e3d624fdcp-369 },
+ { 0x1.ce97f23783a55p-375, 0x1.0cba8970a9d66p-369 },
+ { 0x1.59f649793ea9ap-375, 0x1.921e961b81171p-370 },
+ { 0x1.02b46c188f22dp-375, 0x1.2cd3135c626d1p-370 },
+ { 0x1.82dcfdba2d59cp-376, 0x1.c2097f7f7c953p-371 },
+ { 0x1.213830f44d648p-376, 0x1.5096e15b063dbp-371 },
+ { 0x1.b0639acae41c7p-377, 0x1.f76b39886a20dp-372 },
+ { 0x1.432d063e4cc5ap-377, 0x1.786c2636e4e2ap-372 },
+ { 0x1.e3096b161ade1p-378, 0x1.196dc712e8651p-372 },
+ { 0x1.68f1646f450ccp-378, 0x1.a4c39680abb0bp-373 },
+ { 0x1.0dad51a121c5fp-378, 0x1.3a80eb1934625p-373 },
+ { 0x1.92ed52465cf13p-379, 0x1.d6196b3830612p-374 },
+ { 0x1.2cf8cdb32b26dp-379, 0x1.5f4b3b930a91ap-374 },
+ { 0x1.c1934bb7035c1p-380, 0x1.067b3db09279ep-374 },
+ { 0x1.4fbc11c19c0b7p-380, 0x1.8832413bcb6f5p-375 },
+ { 0x1.f5613cdc1ad52p-381, 0x1.24f8b72bbd6eep-375 },
+ { 0x1.76547ab0f816ap-381, 0x1.b5a5bcacf14ddp-376 },
+ { 0x1.1770c93ef3136p-381, 0x1.46d8046ba690cp-376 },
+ { 0x1.a128a30d837ebp-382, 0x1.e8209bd7c6d4dp-377 },
+ { 0x1.375630e92b79p-382, 0x1.6c744b66f6406p-377 },
+ { 0x1.d0a93cd8add1ep-383, 0x1.1015024fefc8dp-377 },
+ { 0x1.5ab4549d6cf15p-383, 0x1.9631ba1694964p-378 },
+ { 0x1.02a8fed4a1944p-383, 0x1.2f2b3b1ae197dp-378 },
+ { 0x1.81e6d5efc2ecep-384, 0x1.c47e5b8f9de0cp-379 },
+ { 0x1.1fd54f3e20bfcp-384, 0x1.51a481761d265p-379 },
+ { 0x1.ad523512d80aep-385, 0x1.f7d2ff106229cp-380 },
+ { 0x1.4023f854f9c86p-385, 0x1.77da522f79ec5p-380 },
+ { 0x1.dd649c8fad0d5p-386, 0x1.185a192bd02b4p-380 },
+ { 0x1.63e684c4d4572p-386, 0x1.a22ed5ef67f83p-381 },
+ { 0x1.094b5ecc6e29p-386, 0x1.37d9a85948033p-381 },
+ { 0x1.8b7643330549ep-387, 0x1.d10da89b8212ap-382 },
+ { 0x1.26b65f14cd4dap-387, 0x1.5ab7d4224f7e2p-382 },
+ { 0x1.b734f53e57228p-388, 0x1.0276587fa1c2p-382 },
+ { 0x1.473b9d1931175p-388, 0x1.814bdb918424dp-383 },
+ { 0x1.e78d8c6e84fddp-389, 0x1.1f2684f2af658p-383 },
+ { 0x1.6b2a2c93cd65ap-389, 0x1.abf540fb4e1a1p-384 },
+ { 0x1.0e7a7b055d281p-389, 0x1.3eddfeeed0dd2p-384 },
+ { 0x1.92d87cacce695p-390, 0x1.db1c82f79707dp-385 },
+ { 0x1.2bf57b6e0d98dp-390, 0x1.61ea0b7eb4c3cp-385 },
+ { 0x1.bea4f9488e121p-391, 0x1.0799f1fb897d8p-385 },
+ { 0x1.4c7d8bf7bdc41p-391, 0x1.889f21fdb1d69p-386 },
+ { 0x1.eef6b8bfa9225p-392, 0x1.245c20ba28a39p-386 },
+ { 0x1.705ed2bbfd521p-392, 0x1.b3598a0d5984p-387 },
+ { 0x1.121f1b69882ebp-392, 0x1.4418fde75923ep-387 },
+ { 0x1.97ec608197c79p-393, 0x1.e27e05b6c31f9p-388 },
+ { 0x1.2f7b0edc74f1cp-393, 0x1.671af7f5d8858p-388 },
+ { 0x1.c380c41f7503p-394, 0x1.0b3d4442eda68p-388 },
+ { 0x1.4fd20f15083b3p-394, 0x1.8db341e4d4306p-389 },
+ { 0x1.f37ea8d01e9c5p-395, 0x1.27e37e3bc73c9p-389 },
+ { 0x1.736cebb19a201p-395, 0x1.b83a639f29a8p-390 },
+ { 0x1.1428c012e2c57p-395, 0x1.47730acf38edcp-390 },
+ { 0x1.9a9ae80c06018p-396, 0x1.e710d5155d028p-391 },
+ { 0x1.31371c2b63b8p-396, 0x1.6a331ab64b688p-391 },
+ { 0x1.c5b240b14f4d6p-397, 0x1.0d4fd25f7f52ep-391 },
+ { 0x1.5129ffd17a136p-397, 0x1.90712f4e38e37p-392 },
+ { 0x1.f510ba62354a5p-398, 0x1.29ac951c1e60bp-392 },
+ { 0x1.74468acd1611cp-398, 0x1.ba819d5f14678p-393 },
+ { 0x1.148e1d96c299ep-398, 0x1.48dce2dc3ecd5p-393 },
+ { 0x1.9ad7d58aaba44p-399, 0x1.e8c0193d16d55p-394 },
+ { 0x1.3121b71d77179p-399, 0x1.6b2456938b866p-394 },
+ { 0x1.c52f68dd90e64p-400, 0x1.0dc826696c76cp-394 },
+ { 0x1.507f397188496p-400, 0x1.90cc63cdbf2a2p-395 },
+ { 0x1.f3a5bdf92c388p-401, 0x1.29af3c144f8cp-395 },
+ { 0x1.72e7cbdbb95dbp-401, 0x1.ba24cc0f4c8e2p-396 },
+ { 0x1.134d638b07143p-401, 0x1.48500e815d897p-396 },
+ { 0x1.98a2111174d79p-402, 0x1.e7841c45926dp-397 },
+ { 0x1.2f3b409e1b7b6p-402, 0x1.69ea5b1b71301p-397 },
+ { 0x1.c1fa91a869695p-403, 0x1.0ca4195cda6d3p-397 },
+ { 0x1.4dd4c7d7ec9fap-403, 0x1.8ec33daf13649p-398 },
+ { 0x1.ef442d8796795p-404, 0x1.27eb66fea5e85p-398 },
+ { 0x1.6f56f0c0f22b9p-404, 0x1.b72598c77c448p-399 },
+ { 0x1.106c4a594a047p-404, 0x1.45cf12a60cb9ap-399 },
+ { 0x1.9403b0e4bd1b9p-405, 0x1.e36284e81b5ffp-400 },
+ { 0x1.2b8c63e7468c1p-405, 0x1.668ac570f2fc8p-400 },
+ { 0x1.bc22598793379p-406, 0x1.09e8e37ef2488p-400 },
+ { 0x1.4936d06178106p-406, 0x1.8a5f0c63b5c24p-401 },
+ { 0x1.e7fffb3b16a7dp-407, 0x1.2469273320bdap-401 },
+ { 0x1.69a431ed205ap-407, 0x1.b191b44e70edfp-402 },
+ { 0x1.0bf7e7cce4d07p-407, 0x1.41655d7606103p-402 },
+ { 0x1.8d11ace4d8996p-408, 0x1.dc6e2b76185d5p-403 },
+ { 0x1.2625d4b960a47p-408, 0x1.6114f58eab906p-403 },
+ { 0x1.b3c139841a735p-409, 0x1.05a2f4a403a4dp-403 },
+ { 0x1.42ba35d81be5cp-409, 0x1.83b3c9af7ee45p-404 },
+ { 0x1.ddf9fa6fc513ap-410, 0x1.1f386e3013e68p-404 },
+ { 0x1.61e943a26f542p-410, 0x1.a9826f127d04dp-405 },
+ { 0x1.06044c28d2704p-410, 0x1.3b26ef9596f74p-405 },
+ { 0x1.83eb403668f94p-411, 0x1.d2c68adc24dd3p-406 },
+ { 0x1.1f1fd15ed30fep-411, 0x1.59a199b7c8167p-406 },
+ { 0x1.a8fcbdc7eab51p-412, 0x1.ffcb2bfa5b8dap-407 },
+ { 0x1.3a7bfb4be9962p-412, 0x1.7adf828472cfdp-407 },
+ { 0x1.d15ee90987618p-413, 0x1.1870951a86a79p-407 },
+ { 0x1.584895194492p-413, 0x1.9f1bfa110cbbap-408 },
+ { 0x1.fd57d7b45b3cap-414, 0x1.332fc55367264p-408 },
+ { 0x1.78b8ffae32bfp-414, 0x1.c696d39db75f3p-409 },
+ { 0x1.16996dab0cd1ep-414, 0x1.5051f4ea04fdfp-409 },
+ { 0x1.9c046dcaa75a4p-415, 0x1.f194b2a4cb97p-410 },
+ { 0x1.30a06c462f23ep-415, 0x1.700975cbb46aap-410 },
+ { 0x1.c2662350ce7fap-416, 0x1.102fae0ec7794p-410 },
+ { 0x1.4cec5169fb931p-416, 0x1.928c588cfb6d9p-411 },
+ { 0x1.ec1db7d8e44b5p-417, 0x1.29a3060c44f3ap-411 },
+ { 0x1.6babae8929706p-417, 0x1.b814aa869e0e4p-412 },
+ { 0x1.0cb7ae5506e7ep-417, 0x1.454ee7edd0063p-412 },
+ { 0x1.8d106f7f4047ep-418, 0x1.e0e0b72e6ef2ep-413 },
+ { 0x1.255213192c405p-418, 0x1.6360f251c2f1fp-413 },
+ { 0x1.b1500fc71b69ap-419, 0x1.0699a6631f93fp-413 },
+ { 0x1.40052c8ba04b4p-419, 0x1.840a0d97bb129p-414 },
+ { 0x1.d8a3d24511c07p-420, 0x1.1eaa023d58a69p-414 },
+ { 0x1.5cfadd7b9716p-420, 0x1.a77ea01d8b821p-415 },
+ { 0x1.01a47ddad3ea8p-420, 0x1.38c7c7057a652p-415 },
+ { 0x1.7c5ff3799c35bp-421, 0x1.cdf6c504a93e5p-416 },
+ { 0x1.18c087e86a1f3p-421, 0x1.551bff88c1175p-416 },
+ { 0x1.9e64530b957f4p-422, 0x1.f7ae8590bb8p-417 },
+ { 0x1.31c908986e1a8p-422, 0x1.73d293026bc2ap-417 },
+ { 0x1.c33b25da2082ep-423, 0x1.12730a9790f69p-417 },
+ { 0x1.4ce362055227ep-423, 0x1.951a7082f394ap-418 },
+ { 0x1.eb1b0ae0a386ap-424, 0x1.2af1081b22794p-418 },
+ { 0x1.6a3779e1ff3bp-424, 0x1.b925bc48353ep-419 },
+ { 0x1.0b1f245435eeap-424, 0x1.4575deb5305a2p-419 },
+ { 0x1.89efddb97fd18p-425, 0x1.e029ff0fc8645p-420 },
+ { 0x1.227180cb0a8cap-425, 0x1.6228a92a17423p-420 },
+ { 0x1.ac39e8a7de062p-426, 0x1.05302bb5e3a1ap-420 },
+ { 0x1.3ba5b5279aa24p-426, 0x1.81331d3a2cc81p-421 },
+ { 0x1.d145ea8ff6403p-427, 0x1.1c02d69097c72p-421 },
+ { 0x1.56df011e743b9p-427, 0x1.a2c1b0ae83a64p-422 },
+ { 0x1.f94750d0f9308p-428, 0x1.34ad734ae6135p-422 },
+ { 0x1.7442e7172840ap-428, 0x1.c703bfdc748cdp-423 },
+ { 0x1.123a683e9b9d5p-428, 0x1.4f5290291de6ep-423 },
+ { 0x1.93f94a8e393e5p-429, 0x1.ee2bb5a2a447p-424 },
+ { 0x1.298449094a08p-429, 0x1.6c16f34d9525ep-424 },
+ { 0x1.b62c8f87855a8p-430, 0x1.0c379a70923bcp-424 },
+ { 0x1.42a02f59d51efp-430, 0x1.8b21b8919710fp-425 },
+ { 0x1.db09bb0ffb21fp-431, 0x1.2303a1b68b2dep-425 },
+ { 0x1.5daee76f997a8p-431, 0x1.ac9c706a79cfcp-426 },
+ { 0x1.01604a662bf4cp-431, 0x1.3b983b3f72fb5p-426 },
+ { 0x1.7ad33d50dacdp-432, 0x1.d0b33fd9b6e85p-427 },
+ { 0x1.16c1e4c8c451ap-432, 0x1.5615904c6373ap-427 },
+ { 0x1.9a32159dea0d8p-433, 0x1.f7950165d693dp-428 },
+ { 0x1.2dc48781056c9p-433, 0x1.729dc070c926ap-428 },
+ { 0x1.bbf2871addffbp-434, 0x1.10b9b38c6e833p-428 },
+ { 0x1.4684a4152d4ep-434, 0x1.9154f9f73ee5fp-429 },
+ { 0x1.e03df4eb2c204p-435, 0x1.27418ebfd96bep-429 },
+ { 0x1.6120558a89b12p-435, 0x1.b26192fa2f36ep-430 },
+ { 0x1.03a014bcb5352p-435, 0x1.3f7df7d25b3e6p-430 },
+ { 0x1.7db773a6f6623p-436, 0x1.d5ec232ba3385p-431 },
+ { 0x1.1893b9023690dp-436, 0x1.598c75ff21ea4p-431 },
+ { 0x1.9c6ba6a49465ap-437, 0x1.fc1f9e46a53e2p-432 },
+ { 0x1.2f125d64e7642p-437, 0x1.758c452444076p-432 },
+ { 0x1.bd607b51aff83p-438, 0x1.1294b791c6529p-432 },
+ { 0x1.4735d5e25dd32p-438, 0x1.939e692035be7p-433 },
+ { 0x1.e0bb7795ebab2p-439, 0x1.289cc9b3b4107p-433 },
+ { 0x1.611962fb4b008p-439, 0x1.b3e5c199dc217p-434 },
+ { 0x1.035217aa6e0adp-439, 0x1.40415be2c6028p-434 },
+ { 0x1.7cd9c096da3b3p-440, 0x1.d6871e2c76342p-435 },
+ { 0x1.17a22cd2a508fp-440, 0x1.599d2a64857abp-435 },
+ { 0x1.9a95351e8c9f1p-441, 0x1.fba952efabe51p-436 },
+ { 0x1.2d63f329a8bcbp-441, 0x1.74cc660d4897ap-436 },
+ { 0x1.ba6ba0cb47e2bp-442, 0x1.11baa6a990cd8p-436 },
+ { 0x1.44ae89d144108p-442, 0x1.91ecc31adec4ep-437 },
+ { 0x1.dc7e8d1b8f556p-443, 0x1.270b14a1f9816p-437 },
+ { 0x1.5d9a42222275cp-443, 0x1.b11d883fd3ec1p-438 },
+ { 0x1.00789e350bd1ap-443, 0x1.3ddca348b8e79p-438 },
+ { 0x1.7840aaba80c98p-444, 0x1.d27f9dd765764p-439 },
+ { 0x1.13f45ccd8c935p-444, 0x1.56472f42babf3p-439 },
+ { 0x1.94bc9a9955f26p-445, 0x1.f6359d3980ea5p-440 },
+ { 0x1.28c5f3eaf8eddp-445, 0x1.7063ccd1b83c6p-440 },
+ { 0x1.b32a3c3e46a35p-446, 0x1.0e31f012ad2b3p-440 },
+ { 0x1.3f01c91fe7f47p-446, 0x1.8c4cd2c02ec2dp-441 },
+ { 0x1.d3a718c61d154p-447, 0x1.2298481c2ca0dp-441 },
+ { 0x1.56bd3dd5a05c1p-447, 0x1.aa1de55237abcp-442 },
+ { 0x1.f65222fadfcp-448, 0x1.3861db33230bp-442 },
+ { 0x1.700eb717cfb77p-448, 0x1.c9f401331dbf6p-443 },
+ { 0x1.0da5e12700c8dp-448, 0x1.4fa3a533642f6p-443 },
+ { 0x1.8b0da54d3c71fp-449, 0x1.ebed8656f1a7bp-444 },
+ { 0x1.215aeed941b43p-449, 0x1.6873a105b43c2p-444 },
+ { 0x1.a7d28bd609e5p-450, 0x1.081521636047p-444 },
+ { 0x1.3659f3261d19p-450, 0x1.82e8d038330cap-445 },
+ { 0x1.c6770887b13f6p-451, 0x1.1b65bea6b7e6ap-445 },
+ { 0x1.4cb570f463d9dp-451, 0x1.9f1b427ce89a2p-446 },
+ { 0x1.e715dafe5cd6p-452, 0x1.2ff9fffd4f5f9p-446 },
+ { 0x1.6480ba9b1723cp-452, 0x1.bd241d06b6757p-447 },
+ { 0x1.04e575dd6f2ebp-452, 0x1.45e411382662bp-447 },
+ { 0x1.7dcff6d521467p-453, 0x1.dd1da1bc7ec85p-448 },
+ { 0x1.1759a98201ff3p-453, 0x1.5d36e9f7af39cp-448 },
+ { 0x1.98b82586ccf2dp-454, 0x1.ff233639de02ap-449 },
+ { 0x1.2af6afc0ce651p-454, 0x1.7606528b3cf28p-449 },
+ { 0x1.b54f244df93dfp-455, 0x1.11a8b54a30c34p-449 },
+ { 0x1.3fcc4e4385b18p-455, 0x1.9066e8a3084adp-450 },
+ { 0x1.d3abb2d5b9282p-456, 0x1.24e2ffedd9f78p-450 },
+ { 0x1.55eaec016b2b5p-456, 0x1.ac6e23cde6ac9p-451 },
+ { 0x1.f3e576e5bfb2cp-457, 0x1.394ff72563c26p-451 },
+ { 0x1.6d6394041cb01p-457, 0x1.ca3259bb8013ep-452 },
+ { 0x1.0b0a8012d71fbp-457, 0x1.4effb58fcce2p-452 },
+ { 0x1.8647f7f3a91dep-458, 0x1.e9cac23b8427ep-453 },
+ { 0x1.1d29e5c60946bp-458, 0x1.6602f707600f3p-453 },
+ { 0x1.a0aa72640fd47p-459, 0x1.05a7bd790a4bcp-453 },
+ { 0x1.305e23384e58ap-459, 0x1.7e6b1b23c38f4p-454 },
+ { 0x1.bc9e08de1532fp-460, 0x1.176cc55ca9b8p-454 },
+ { 0x1.44b4e89c6a35fp-460, 0x1.984a277e8539ap-455 },
+ { 0x1.da366d9d2b975p-461, 0x1.2a417253e014bp-455 },
+ { 0x1.5a3c60cb2c6b1p-461, 0x1.b3b2c9b4277c6p-456 },
+ { 0x1.f98800fc076dbp-462, 0x1.3e333559670c8p-456 },
+ { 0x1.71033226bf0afp-462, 0x1.d0b8591b88278p-457 },
+ { 0x1.0d53e944a7e18p-462, 0x1.534ff7f271b4dp-457 },
+ { 0x1.89187f3d75a14p-463, 0x1.ef6ed82d51675p-458 },
+ { 0x1.1ed5d0deddfb7p-463, 0x1.69a61d0edc9d2p-458 },
+ { 0x1.a28be72757b85p-464, 0x1.07f57aca805f1p-458 },
+ { 0x1.3154ef266983dp-464, 0x1.814481a9f253cp-459 },
+ { 0x1.bd6d859990532p-465, 0x1.1921067277b5dp-459 },
+ { 0x1.44dcd404b4fcdp-465, 0x1.9a3a7d2712f82p-460 },
+ { 0x1.d9cdf2aadd6a6p-466, 0x1.2b45137355f77p-460 },
+ { 0x1.5979672b76b96p-466, 0x1.b497e1657b91bp-461 },
+ { 0x1.f7be424410479p-467, 0x1.3e6cfcc06ed27p-461 },
+ { 0x1.6f36e7903ba4fp-467, 0x1.d06cfa865bc4ep-462 },
+ { 0x1.0ba8019bd4e86p-467, 0x1.52a47395ed2aep-462 },
+ { 0x1.8621eaa755f34p-468, 0x1.edca8e605e67ap-463 },
+ { 0x1.1c4a9efdce654p-468, 0x1.67f77ef705254p-463 },
+ { 0x1.9e475b5aaea97p-469, 0x1.0660edcde1e02p-463 },
+ { 0x1.2dd03980220acp-469, 0x1.7e727aec99554p-464 },
+ { 0x1.b7b478b8fda1cp-470, 0x1.16b24c391593bp-464 },
+ { 0x1.40424c4fd21f7p-470, 0x1.96221780dfe95p-465 },
+ { 0x1.d276d459f43c7p-471, 0x1.27e2788696d86p-465 },
+ { 0x1.53aa8c500f5dp-471, 0x1.af1357749947cp-466 },
+ { 0x1.ee9c5073f397ep-472, 0x1.39fac2bf7a531p-466 },
+ { 0x1.6812e6a2e8fcp-472, 0x1.c9538eaa71fbp-467 },
+ { 0x1.06198ecffc0ep-472, 0x1.4d04b3a802aeep-467 },
+ { 0x1.7d857ef6fe55ap-473, 0x1.e4f0604536408p-468 },
+ { 0x1.15a4dc243cc5fp-473, 0x1.610a0b4ec8401p-468 },
+ { 0x1.940cad97ee071p-474, 0x1.00fbde3ac71c6p-468 },
+ { 0x1.25f772e00c70ap-474, 0x1.7614bf61d6bfap-469 },
+ { 0x1.abb2fd3f529efp-475, 0x1.103beefa0765p-469 },
+ { 0x1.3718d87e8a0afp-475, 0x1.8c2ef94786008p-470 },
+ { 0x1.c48328a4346ebp-476, 0x1.203fa39242793p-470 },
+ { 0x1.4910b37b4de72p-476, 0x1.a36313f8e64ecp-471 },
+ { 0x1.de8817c6f33b9p-477, 0x1.310e5f6fbfd44p-471 },
+ { 0x1.5be6c950a7e6fp-477, 0x1.bbbb999bb060ap-472 },
+ { 0x1.f9ccdcf7c94fep-478, 0x1.42afa66f9fdc1p-472 },
+ { 0x1.6fa2fc442a9d3p-478, 0x1.d54340d9c375dp-473 },
+ { 0x1.0b2e58cb15f5cp-478, 0x1.552b1ae6aeaa2p-473 },
+ { 0x1.844d490056942p-479, 0x1.f004e9f45a94bp-474 },
+ { 0x1.1a217943b9ac7p-479, 0x1.68887b7750462p-474 },
+ { 0x1.99edc3fa555f4p-480, 0x1.0605cdc8a1e5ep-474 },
+ { 0x1.29c58e31af831p-480, 0x1.7ccfa0b55e3f7p-475 },
+ { 0x1.b08c96a2d341cp-481, 0x1.14b13fa04509fp-475 },
+ { 0x1.3a2063aa9bfc9p-481, 0x1.92087a96ea8f4p-476 },
+ { 0x1.c831fc61280f7p-482, 0x1.240a6edc95f53p-476 },
+ { 0x1.4b37d15842e1dp-482, 0x1.a83b0db0fa5b6p-477 },
+ { 0x1.e0e63f582488bp-483, 0x1.34170d65d2fe5p-477 },
+ { 0x1.5d11b81c3fea7p-483, 0x1.bf6f703f6c8b1p-478 },
+ { 0x1.fab1b4f400c2ep-484, 0x1.44dcd884a52dcp-478 },
+ { 0x1.6fb3ff8ccf41cp-484, 0x1.d7adc6f76430fp-479 },
+ { 0x1.0ace5d20891a2p-484, 0x1.5661968fc8c68p-479 },
+ { 0x1.8324934a763f4p-485, 0x1.f0fe41a3b588bp-480 },
+ { 0x1.18d7d8058e531p-485, 0x1.68ab147365bffp-480 },
+ { 0x1.9769602e7d2c4p-486, 0x1.05b48bc57ed71p-480 },
+ { 0x1.27797b62a04a4p-486, 0x1.7bbf2311e9661p-481 },
+ { 0x1.ac8851524d431p-487, 0x1.137b41cf9c9a4p-481 },
+ { 0x1.36b7751d5da7fp-487, 0x1.8fa3947e525d9p-482 },
+ { 0x1.c2874cefea298p-488, 0x1.21d7603b6e2ccp-482 },
+ { 0x1.4695ee8470b66p-488, 0x1.a45e3910021acp-483 },
+ { 0x1.d96c311be3eb3p-489, 0x1.30cd0207d04edp-483 },
+ { 0x1.571909f179506p-489, 0x1.b9f4dc504a668p-484 },
+ { 0x1.f13cd05945d89p-490, 0x1.40603dadb780ap-484 },
+ { 0x1.6844e0504f766p-490, 0x1.d06d41c212c13p-485 },
+ { 0x1.04ff770417c7ep-490, 0x1.509522cc01f2fp-485 },
+ { 0x1.7a1d7e8c27e5p-491, 0x1.e7cd2184183ebp-486 },
+ { 0x1.11dc1d57f7df8p-491, 0x1.616fb7b910c11p-486 },
+ { 0x1.8ca6e2e342651p-492, 0x1.000d1267395e3p-486 },
+ { 0x1.1f372812d1e14p-492, 0x1.72f3f6faafe57p-487 },
+ { 0x1.9fe4fa21e8c98p-493, 0x1.0cacf12619fe1p-487 },
+ { 0x1.2d1356c845fd1p-493, 0x1.8525cca4f244dp-488 },
+ { 0x1.b3db9cc5a58f3p-494, 0x1.19c8ed29100e2p-488 },
+ { 0x1.3b7359a6b9391p-494, 0x1.980913a0c5f1ep-489 },
+ { 0x1.c88e8c09b9bb2p-495, 0x1.2763b979d57b5p-489 },
+ { 0x1.4a59cf5958098p-495, 0x1.aba192db244fdp-490 },
+ { 0x1.de016eddfacadp-496, 0x1.357ff9fbc97f4p-490 },
+ { 0x1.59c942db45eaep-496, 0x1.bff2fa5de1e9dp-491 },
+ { 0x1.f437cec9632b8p-497, 0x1.44204156d00fcp-491 },
+ { 0x1.69c4293cefa3fp-497, 0x1.d500e0534289dp-492 },
+ { 0x1.059a8a5ce0ce7p-497, 0x1.53470ed39dd97p-492 },
+ { 0x1.7a4cdf5c8de47p-498, 0x1.eacebdf5973c2p-493 },
+ { 0x1.117e42e10afc5p-498, 0x1.62f6cc2a62dbdp-493 },
+ { 0x1.8b65a792fe14p-499, 0x1.00aff63626acfp-493 },
+ { 0x1.1dc89fe4a5f8ap-499, 0x1.7331cb44dd6ecp-494 },
+ { 0x1.9d10a7562f377p-500, 0x1.0c5bd0cbfba3p-494 },
+ { 0x1.2a7b1b1593291p-500, 0x1.83fa43f4f73d5p-495 },
+ { 0x1.af4fe4d278bf9p-501, 0x1.186c76677c8f7p-495 },
+ { 0x1.37971726a776ep-501, 0x1.955251a12574cp-496 },
+ { 0x1.c225447c48b85p-502, 0x1.24e359c6528bbp-496 },
+ { 0x1.451dde15504ecp-502, 0x1.a73bf0e7dcf7bp-497 },
+ { 0x1.d592869bae136p-503, 0x1.31c1d70a5a26cp-497 },
+ { 0x1.53109f6b70a02p-503, 0x1.b9b8fd3b82acep-498 },
+ { 0x1.e99944d35a898p-504, 0x1.3f09320694d4p-498 },
+ { 0x1.61706e7ea0b42p-504, 0x1.cccb2e7856e93p-499 },
+ { 0x1.fe3aefa4cdaa2p-505, 0x1.4cba948866255p-499 },
+ { 0x1.703e40ae0b133p-505, 0x1.e0741675f15a5p-500 },
+ { 0x1.09bc65f9b8064p-505, 0x1.5ad70c9e433d4p-500 },
+ { 0x1.7f7aeba02f7efp-506, 0x1.f4b51e95f89d5p-501 },
+ { 0x1.14a9f8443d058p-506, 0x1.695f8add0a062p-501 },
+ { 0x1.8f272381e3222p-507, 0x1.04c7c2a8ead79p-501 },
+ { 0x1.1fe6a1ccca721p-507, 0x1.7854e0a5444cfp-502 },
+ { 0x1.9f437947f2743p-508, 0x1.0f822de49bc54p-502 },
+ { 0x1.2b72bc2a1bb29p-508, 0x1.87b7be69a8c26p-503 },
+ { 0x1.afd058f4d5cb9p-509, 0x1.1a8a41a9a734p-503 },
+ { 0x1.374e8637e822fp-509, 0x1.9788b1f83908ep-504 },
+ { 0x1.c0ce07e3f5247p-510, 0x1.25e0558a5c077p-504 },
+ { 0x1.437a22e46ffc9p-510, 0x1.a7c824c7683f1p-505 },
+ { 0x1.d23ca31c0220cp-511, 0x1.3184a6ce13b46p-505 },
+ { 0x1.4ff5980398e02p-511, 0x1.b8765a48c0cf1p-506 },
+ { 0x1.e41c1da9f8a5fp-512, 0x1.3d775743f06aep-506 },
+ { 0x1.5cc0cd28b81e5p-512, 0x1.c9936e428a9d9p-507 },
+ { 0x1.f66c3f065ea05p-513, 0x1.49b86c1b194cep-507 },
+ { 0x1.69db8a882e29p-513, 0x1.db1f5331fbe71p-508 },
+ { 0x1.049650c331274p-513, 0x1.5647ccc18e717p-508 },
+ { 0x1.774577e1faf4fp-514, 0x1.ed19d0b78718cp-509 },
+ { 0x1.0e2e586d3df5cp-514, 0x1.632541cab3acp-509 },
+ { 0x1.84fe1b767669bp-515, 0x1.ff82820edeaabp-510 },
+ { 0x1.17fdd44e1dc6cp-515, 0x1.705073deb552ap-510 },
+ { 0x1.9304d9065a4b9p-516, 0x1.092c6a4a26abfp-510 },
+ { 0x1.220449767742ap-516, 0x1.7dc8eab3ed87ap-511 },
+ { 0x1.a158f0df4c356p-517, 0x1.12ce032c827cep-511 },
+ { 0x1.2c4123936432bp-517, 0x1.8b8e0c1372c25p-512 },
+ { 0x1.aff97ef6163edp-518, 0x1.1ca5926404568p-512 },
+ { 0x1.36b3b4511d82bp-518, 0x1.999f1ae9f978bp-513 },
+ { 0x1.bee57a0fbbbdcp-519, 0x1.26b285aeabdbep-513 },
+ { 0x1.415b32c89327cp-519, 0x1.a7fb366632c72p-514 },
+ { 0x1.ce1bb2fa9523ep-520, 0x1.30f431387ee69p-514 },
+ { 0x1.4c36baf8c2285p-520, 0x1.b6a15925d0c25p-515 },
+ { 0x1.dd9ad3d89a4a5p-521, 0x1.3b69cf0bd5608p-515 },
+ { 0x1.57454d4c97f21p-521, 0x1.c590587256b75p-516 },
+ { 0x1.ed615f7bfd7d2p-522, 0x1.46127e8d37ba7p-516 },
+ { 0x1.6285ce2e2e29bp-522, 0x1.d4c6e38ed7f06p-517 },
+ { 0x1.fd6db0d73348ep-523, 0x1.50ed44039bd53p-517 },
+ { 0x1.6df705a8252f7p-523, 0x1.e4438317c2a1ep-518 },
+ { 0x1.06defd40bdb09p-523, 0x1.5bf9082dc8412p-518 },
+ { 0x1.79979f15ddb0dp-524, 0x1.f4049875ce63p-519 },
+ { 0x1.0f2823287afb6p-524, 0x1.673497e5a0d03p-519 },
+ { 0x1.856628e34ac2cp-525, 0x1.02042eb28efefp-519 },
+ { 0x1.17913a85a33a7p-525, 0x1.729ea3d219a53p-520 },
+ { 0x1.9161145d0e326p-526, 0x1.0a2671c8cdbeep-520 },
+ { 0x1.20191f16dc709p-526, 0x1.7e35c0288722ep-521 },
+ { 0x1.9d86b59187f4ep-527, 0x1.12680a24c58f5p-521 },
+ { 0x1.28be97e6e9065p-527, 0x1.89f8647df9662p-522 },
+ { 0x1.a9d5434377e7bp-528, 0x1.1ac7d823a316cp-522 },
+ { 0x1.31805749922c3p-528, 0x1.95e4eba9494cap-523 },
+ { 0x1.b64ad6eec66d3p-529, 0x1.2344a7c981006p-523 },
+ { 0x1.3a5cfae5998ecp-529, 0x1.a1f993b67371dp-524 },
+ { 0x1.c2e56cdffce02p-530, 0x1.2bdd30bebc795p-524 },
+ { 0x1.43530bcc0ee3ap-530, 0x1.ae347debd307p-525 },
+ { 0x1.cfa2e45eea63dp-531, 0x1.3490165a1de5p-525 },
+ { 0x1.4c60fe9d5cbc1p-531, 0x1.ba93aee1c301fp-526 },
+ { 0x1.dc80ffece4451p-532, 0x1.3d5be7b8309a9p-526 },
+ { 0x1.558533bc564e3p-532, 0x1.c7150ead1fd0ep-527 },
+ { 0x1.e97d659702f92p-533, 0x1.463f1fe01b7dap-527 },
+ { 0x1.5ebdf78f85a03p-533, 0x1.d3b6691d169e3p-528 },
+ { 0x1.f6959f5cadd73p-534, 0x1.4f3825f642bp-528 },
+ { 0x1.680982d0eea8ap-534, 0x1.e0756e0ca137bp-529 },
+ { 0x1.01e38dd55bfc7p-534, 0x1.58454d7cf072p-529 },
+ { 0x1.7165faec70a1p-535, 0x1.ed4fb1c7fef16p-530 },
+ { 0x1.088796f5a026p-535, 0x1.6164d6a338985p-530 },
+ { 0x1.7ad1726ce2f3cp-536, 0x1.fa42ad866b6p-531 },
+ { 0x1.0f3587953aeb5p-536, 0x1.6a94eea23ecd2p-531 },
+ { 0x1.8449e977fef01p-537, 0x1.03a5dffc21d0dp-531 },
+ { 0x1.15ebef6827c9dp-537, 0x1.73d3b028fc2cfp-532 },
+ { 0x1.8dcd4e591ac76p-538, 0x1.0a3416f4dd0f1p-532 },
+ { 0x1.1ca951b79a938p-538, 0x1.7d1f23d694b62p-533 },
+ { 0x1.97597e1aad586p-539, 0x1.10ca917d13a59p-533 },
+ { 0x1.236c25d3c18a2p-539, 0x1.867540c340902p-534 },
+ { 0x1.a0ec452e85047p-540, 0x1.1767d933fa0f7p-534 },
+ { 0x1.2a32d78fe110fp-540, 0x1.8fd3ed17c059fp-535 },
+ { 0x1.aa8360248e3edp-541, 0x1.1e0a6bf884441p-535 },
+ { 0x1.30fbc7c8ab284p-541, 0x1.9938feb3469d1p-536 },
+ { 0x1.b41c7c6ff8cc6p-542, 0x1.24b0bc63cac6bp-536 },
+ { 0x1.37c54cf4ab1fcp-542, 0x1.a2a23bdfb3241p-537 },
+ { 0x1.bdb5393a7ccd2p-543, 0x1.2b59324d7fd9bp-537 },
+ { 0x1.3e8db3be9418cp-543, 0x1.ac0d5c13ef72ap-538 },
+ { 0x1.c74b284572b4cp-544, 0x1.32022b5a4d882p-538 },
+ { 0x1.45533fa93710cp-544, 0x1.b57808c42df0bp-539 },
+ { 0x1.d0dbced86364cp-545, 0x1.38a9fb93eb86p-539 },
+ { 0x1.4c142bbcdb51bp-545, 0x1.bedfde3fbf9f1p-540 },
+ { 0x1.da64a6bca7adp-546, 0x1.3f4eee0ab230dp-540 },
+ { 0x1.52ceab3daa53bp-546, 0x1.c8426c9c266d4p-541 },
+ { 0x1.e3e31f45a0a96p-547, 0x1.45ef458066425p-541 },
+ { 0x1.5980ea6ad6692p-547, 0x1.d19d38acfc932p-542 },
+ { 0x1.ed549e6504cf2p-548, 0x1.4c893d1bef1fep-542 },
+ { 0x1.60290f4619f98p-548, 0x1.daedbd083bb8ep-543 },
+ { 0x1.f6b681cab013bp-549, 0x1.531b0925a021ep-543 },
+ { 0x1.66c53a6323b06p-549, 0x1.e4316b16614afp-544 },
+ { 0x1.00031007ac3e3p-549, 0x1.59a2d7cbb3c39p-544 },
+ { 0x1.6d5387be7adf6p-550, 0x1.ed65ac2de0264p-545 },
+ { 0x1.04a064f4bdd38p-550, 0x1.601ed1ee8e719p-545 },
+ { 0x1.73d20f9b5e73bp-551, 0x1.f687e2b942e41p-546 },
+ { 0x1.0931e5b5e6c43p-551, 0x1.668d1bf455ad8p-546 },
+ { 0x1.7a3ee7681856fp-552, 0x1.ff956b675583bp-547 },
+ { 0x1.0db636a632668p-552, 0x1.6cebd6a35f863p-547 },
+ { 0x1.809822a836e1fp-553, 0x1.0445cf3250898p-547 },
+ { 0x1.122bfb19eafe7p-553, 0x1.73392002f5fc2p-548 },
+ { 0x1.86dbd3e416493p-554, 0x1.08b3e84ebc2b9p-548 },
+ { 0x1.1691d609b1ec9p-554, 0x1.79731441e1e21p-549 },
+ { 0x1.8d080d9d1c96dp-555, 0x1.0d13aa83e4b01p-549 },
+ { 0x1.1ae66ac0b0b6ap-555, 0x1.7f97cea22928bp-550 },
+ { 0x1.931ae34603f62p-556, 0x1.1163bef9eebc1p-550 },
+ { 0x1.1f285d8d6c817p-556, 0x1.85a56a6965552p-551 },
+ { 0x1.99126a3e88ca5p-557, 0x1.15a2cf3193875p-551 },
+ { 0x1.23565474c154ep-557, 0x1.8b9a03d510324p-552 },
+ { 0x1.9eecbad1cb519p-558, 0x1.19cf85b21a11fp-552 },
+ { 0x1.276ef7e686addp-558, 0x1.9173b9121e9f7p-553 },
+ { 0x1.a4a7f136af77ep-559, 0x1.1de88eb969b39p-553 },
+ { 0x1.2b70f3735b79fp-559, 0x1.9730ab373bc61p-554 },
+ { 0x1.aa422e918100dp-560, 0x1.21ec98edb9593p-554 },
+ { 0x1.2f5af68314ac2p-560, 0x1.9cceff40f1fb1p-555 },
+ { 0x1.afb999f61e5d4p-561, 0x1.25da56105b758p-555 },
+ { 0x1.332bb50b471fbp-561, 0x1.a24cdf0f0a2e7p-556 },
+ { 0x1.b50c6169e961bp-562, 0x1.29b07bb123c75p-556 },
+ { 0x1.36e1e845638bbp-562, 0x1.a7a87a6267113p-557 },
+ { 0x1.ba38bae4baa67p-563, 0x1.2d6dc3e1e1b47p-557 },
+ { 0x1.3a7c4f63d9d53p-563, 0x1.ace007da9e0c8p-558 },
+ { 0x1.bf3ce55012ad1p-564, 0x1.3110ede9680cep-558 },
+ { 0x1.3df9b045b81fcp-564, 0x1.b1f1c5f28dcc9p-559 },
+ { 0x1.c4172983c2f7ep-565, 0x1.3498bef599a58p-559 },
+ { 0x1.4158d828399aep-565, 0x1.b6dbfbfb30836p-560 },
+ { 0x1.c8c5db3f49157p-566, 0x1.380402cbf1542p-560 },
+ { 0x1.44989c55b9312p-566, 0x1.bb9cfb13e7262p-561 },
+ { 0x1.cd475a1f163eep-567, 0x1.3b518c77fb7d2p-561 },
+ { 0x1.47b7dad17cf31p-567, 0x1.c0331f1f7ac71p-562 },
+ { 0x1.d19a128cff8a4p-568, 0x1.3e8036f737914p-562 },
+ { 0x1.4ab57affd05a9p-568, 0x1.c49ccfb511d2cp-563 },
+ { 0x1.d5bc7eab14dfbp-569, 0x1.418ee5e1d890ep-563 },
+ { 0x1.4d906e49e5535p-569, 0x1.c8d8810c585d4p-564 },
+ { 0x1.d9ad27381fd3dp-570, 0x1.447c860fdcf2cp-564 },
+ { 0x1.5047b0bcf6527p-570, 0x1.cce4b4e41cdcap-565 },
+ { 0x1.dd6aa46d0f45cp-571, 0x1.47480e39f8181p-565 },
+ { 0x1.52da49a426b16p-571, 0x1.d0bffb62a59f5p-566 },
+ { 0x1.e0f39ed2991f9p-572, 0x1.49f07f95c9d66p-566 },
+ { 0x1.55474c1ca1f2bp-572, 0x1.d468f3ef07049p-567 },
+ { 0x1.e446d00e60d84p-573, 0x1.4c74e66ce3841p-567 },
+ { 0x1.578dd7a37e92bp-573, 0x1.d7de4e02c6f6fp-568 },
+ { 0x1.e76303a6f7572p-574, 0x1.4ed45aae1d60cp-568 },
+ { 0x1.59ad189ced845p-574, 0x1.db1ec9f31f5e1p-569 },
+ { 0x1.ea4717be0f8c8p-575, 0x1.510e0078c325ep-569 },
+ { 0x1.5ba448d444792p-575, 0x1.de2939b1372f7p-570 },
+ { 0x1.ecf1fdc04a7dbp-576, 0x1.532108a122ff3p-570 },
+ { 0x1.5d72aff4768dap-576, 0x1.e0fc8180b06b8p-571 },
+ { 0x1.ef62bb0a0594ap-577, 0x1.550cb12e0f1dbp-571 },
+ { 0x1.5f17a3f894e1dp-577, 0x1.e39798a3f0a89p-572 },
+ { 0x1.f19869809eb8ap-578, 0x1.56d045cee7811p-572 },
+ { 0x1.60928993f7077p-578, 0x1.e5f989fd91cadp-573 },
+ { 0x1.f392381fab056p-579, 0x1.586b2049c7737p-573 },
+ { 0x1.61e2d491b1f68p-579, 0x1.e82174a67122fp-574 },
+ { 0x1.f54f6b79a6d5fp-580, 0x1.59dca8e17880fp-574 },
+ { 0x1.6308082b0b65cp-580, 0x1.ea0e8c77dc629p-575 },
+ { 0x1.f6cf5e2bb03dcp-581, 0x1.5b2456b2d3672p-575 },
+ { 0x1.6401b7549eebbp-581, 0x1.ebc01a8965943p-576 },
+ { 0x1.f8118143e7ebp-582, 0x1.5c41b0093e8e9p-576 },
+ { 0x1.64cf8501f223bp-582, 0x1.ed357da1f18bap-577 },
+ { 0x1.f9155c9a1fbd1p-583, 0x1.5d344aaa010f1p-577 },
+ { 0x1.6571245f3d39ap-583, 0x1.ee6e2a9b9efdp-578 },
+ { 0x1.f9da8f1a8a0ccp-584, 0x1.5dfbcc1628fd2p-578 },
+ { 0x1.65e6590135ap-584, 0x1.ef69acba2f951p-579 },
+ { 0x1.fa60cf0228aadp-585, 0x1.5e97e9c2cbc7fp-579 },
+ { 0x1.662ef70ab154bp-585, 0x1.f027a5f3a7f56p-580 },
+ { 0x1.faa7ea0cc6ecbp-586, 0x1.5f0869476fb64p-580 },
+ { 0x1.664ae34801e0ep-586, 0x1.f0a7cf2ae7563p-581 },
+ { 0x1.faafc59456a8cp-587, 0x1.5f4d2082760f5p-581 },
+ { 0x1.663a133fef35p-587, 0x1.f0e9f85c03b41p-582 },
+ { 0x1.fa785ea194bf2p-588, 0x1.5f65f5b366281p-582 },
+ { 0x1.65fc8d3a43882p-588, 0x1.f0ee08ba43cd5p-583 },
+ { 0x1.fa01c9ede6a16p-589, 0x1.5f52df8b025d3p-583 },
+ { 0x1.6592683be2829p-589, 0x1.f0b3febf9cbcdp-584 },
+ { 0x1.f94c33d66f35bp-590, 0x1.5f13e53118eaap-584 },
+ { 0x1.64fbcbf86f1abp-590, 0x1.f03bf02da5a7ap-585 },
+ { 0x1.f857e040665ap-591, 0x1.5ea91e400b8afp-585 },
+ { 0x1.6438f0b98cabp-591, 0x1.ef860a0000a7ap-586 },
+ { 0x1.f7252a6ecb2bbp-592, 0x1.5e12b2b611c72p-586 },
+ { 0x1.634a1f3bd0d7ep-592, 0x1.ee92905044d53p-587 },
+ { 0x1.f5b484c995f72p-593, 0x1.5d50dadc42d9dp-587 },
+ { 0x1.622fb08184d56p-593, 0x1.ed61de2b81fc4p-588 },
+ { 0x1.f40678969b4f4p-594, 0x1.5c63df237cf4dp-588 },
+ { 0x1.60ea0d9b5d711p-594, 0x1.ebf4655983167p-589 },
+ { 0x1.f21ba5a45e2afp-595, 0x1.5b4c17f7488b1p-589 },
+ { 0x1.5f79af6759efdp-595, 0x1.ea4aae160108ap-590 },
+ { 0x1.eff4c1e71b057p-596, 0x1.5a09ed86def16p-590 },
+ { 0x1.5ddf1e460242cp-596, 0x1.e86556bc034fep-591 },
+ { 0x1.ed92990861c73p-597, 0x1.589dd784842fp-591 },
+ { 0x1.5c1af1c6454bep-597, 0x1.e6451363b8311p-592 },
+ { 0x1.eaf60be99fa59p-598, 0x1.57085cdb6c23ep-592 },
+ { 0x1.5a2dd0483fd76p-598, 0x1.e3eaad7319948p-593 },
+ { 0x1.e820101a05296p-599, 0x1.554a135c6b3d2p-593 },
+ { 0x1.58186e973c8cbp-599, 0x1.e1570321beee3p-594 },
+ { 0x1.e511af403f0e1p-600, 0x1.53639f61bab8bp-594 },
+ { 0x1.55db8f7b445c6p-600, 0x1.de8b06f0475d8p-595 },
+ { 0x1.e1cc067882b19p-601, 0x1.5155b36a1ff17p-595 },
+ { 0x1.537803429dd3dp-601, 0x1.db87bf13d1856p-596 },
+ { 0x1.de5045a77840fp-602, 0x1.4f210fabcd4fep-596 },
+ { 0x1.50eea743a03bp-602, 0x1.d84e44d6006fdp-597 },
+ { 0x1.da9faec295ac1p-603, 0x1.4cc6819f5a3a9p-597 },
+ { 0x1.4e406557456e3p-603, 0x1.d4dfc3ea1615fp-598 },
+ { 0x1.d6bb950e85a76p-604, 0x1.4a46e38335bf7p-598 },
+ { 0x1.4b6e334ceafc3p-604, 0x1.d13d79b7b4d75p-599 },
+ { 0x1.d2a55c543d97bp-605, 0x1.47a31bd7fd98ap-599 },
+ { 0x1.48791257b832ep-605, 0x1.cd68b49be13bdp-600 },
+ { 0x1.ce5e780d6c294p-606, 0x1.44dc1cd628aecp-600 },
+ { 0x1.45620e7623619p-606, 0x1.c962d320e4c77p-601 },
+ { 0x1.c9e86a88f07ffp-607, 0x1.41f2e3dd79383p-601 },
+ { 0x1.422a3dd414b5ep-607, 0x1.c52d432db963cp-602 },
+ { 0x1.c544c4080f626p-608, 0x1.3ee878deaf1c1p-602 },
+ { 0x1.3ed2c02828af5p-608, 0x1.c0c9812daaed1p-603 },
+ { 0x1.c07521d52071ep-609, 0x1.3bbdedbff743p-603 },
+ { 0x1.3b5cbe0c97302p-609, 0x1.bc391730e1bf4p-604 },
+ { 0x1.bb7b2d547171ap-610, 0x1.38745dbc97fd1p-604 },
+ { 0x1.37c9685446b6bp-610, 0x1.b77d9c068db21p-605 },
+ { 0x1.b6589b1020c3ep-611, 0x1.350cecc05d9cfp-605 },
+ { 0x1.3419f75c953bcp-611, 0x1.b298b2516cc35p-606 },
+ { 0x1.b10f29bfb2a68p-612, 0x1.3188c6bf4cd49p-606 },
+ { 0x1.304faa5c619afp-612, 0x1.ad8c07976bbcp-607 },
+ { 0x1.aba0a14c264ccp-613, 0x1.2de91f0a22435p-607 },
+ { 0x1.2c6bc6b0e1424p-613, 0x1.a859534d21642p-608 },
+ { 0x1.a60ed1d150c44p-614, 0x1.2a2f2fa027fc3p-608 },
+ { 0x1.286f9728ce321p-614, 0x1.a30255dde65bep-609 },
+ { 0x1.a05b929d439abp-615, 0x1.265c387eea954p-609 },
+ { 0x1.245c6b4e79163p-615, 0x1.9d88d7b14c6d3p-610 },
+ { 0x1.9a88c12e847c2p-616, 0x1.22717ef05792fp-610 },
+ { 0x1.203396b14a77p-616, 0x1.97eea82eb8229p-611 },
+ { 0x1.94984031d9858p-617, 0x1.1e704cd7ceb7cp-611 },
+ { 0x1.1bf6702f3caf4p-617, 0x1.92359cbfdea74p-612 },
+ { 0x1.8e8bf6806bcabp-618, 0x1.1a59effeaeef1p-612 },
+ { 0x1.17a6513ed67fap-618, 0x1.8c5f8fd2e86f6p-613 },
+ { 0x1.8865ce1efe9b6p-619, 0x1.162fb960e6361p-613 },
+ { 0x1.1344953a2bc16p-619, 0x1.866e5fdcf6e5cp-614 },
+ { 0x1.8227b33ef66f4p-620, 0x1.11f2fc7a0a0a9p-614 },
+ { 0x1.0ed298ab66e97p-620, 0x1.8063ee5dc8676p-615 },
+ { 0x1.7bd39341e60d2p-621, 0x1.0da50e937b941p-615 },
+ { 0x1.0a51b89b5ac38p-621, 0x1.7a421ee53231bp-616 },
+ { 0x1.756b5bc0538cfp-622, 0x1.0947461417eb2p-616 },
+ { 0x1.05c351e298147p-622, 0x1.740ad61b23997p-617 },
+ { 0x1.6ef0f9946142ep-623, 0x1.04daf9d1f19dp-617 },
+ { 0x1.0128c07d7eac9p-623, 0x1.6dbff8cae0f32p-618 },
+ { 0x1.686657e900799p-624, 0x1.006180668cd93p-618 },
+ { 0x1.f906bdc779cfcp-625, 0x1.67636af21f0cbp-619 },
+ { 0x1.61cd5f4e4d33cp-625, 0x1.f7b85f0c272bbp-620 },
+ { 0x1.efa90ac757637p-626, 0x1.60f70ed4a200ep-620 },
+ { 0x1.5b27f4d3aafafp-626, 0x1.ee98b6b3e4f34p-621 },
+ { 0x1.e63b1303dfbfbp-627, 0x1.5a7cc414fb8aap-621 },
+ { 0x1.5477f92833195p-627, 0x1.e566abbe94f87p-622 },
+ { 0x1.dcbf7abb88524p-628, 0x1.53f666d2fde17p-622 },
+ { 0x1.4dbf47c1fc8ap-628, 0x1.dc24dc933bf6dp-623 },
+ { 0x1.d338de3492428p-629, 0x1.4d65ced070949p-623 },
+ { 0x1.46ffb60cbd76p-629, 0x1.d2d5e0d43505p-624 },
+ { 0x1.c9a9d09a6515fp-630, 0x1.46ccce9c8cdf5p-624 },
+ { 0x1.403b12a03d499p-630, 0x1.c97c4837b573ep-625 },
+ { 0x1.c014dae645fc3p-631, 0x1.402d32c6be96dp-625 },
+ { 0x1.3973247f05596p-631, 0x1.c01a996aebdb3p-626 },
+ { 0x1.b67c7ad400b86p-632, 0x1.3988c1191e211p-626 },
+ { 0x1.32a9aa5db4bb3p-632, 0x1.b6b3510058b7ap-627 },
+ { 0x1.ace321e309c7bp-633, 0x1.32e137db0ef23p-627 },
+ { 0x1.2be059f3526f7p-633, 0x1.ad48e069f2207p-628 },
+ { 0x1.a34b346493cc3p-634, 0x1.2c384d1c64d5bp-628 },
+ { 0x1.2518df52ef492p-634, 0x1.a3ddacff96f65p-629 },
+ { 0x1.99b70897047dcp-635, 0x1.258fae0968e74p-629 },
+ { 0x1.1e54dc4edf3a3p-635, 0x1.9a740f1248851p-630 },
+ { 0x1.9028e5cf277c7p-636, 0x1.1ee8fe480d92cp-630 },
+ { 0x1.1795e7e5c7ccap-636, 0x1.910e510c93fe1p-631 },
+ { 0x1.86a303af6f699p-637, 0x1.1845d75e974c6p-631 },
+ { 0x1.10dd8db9b7b2p-637, 0x1.87aeaea087811p-632 },
+ { 0x1.7d27896d87b8ep-638, 0x1.11a7c823f5ff5p-632 },
+ { 0x1.0a2d4d917179ap-638, 0x1.7e57540380a9p-633 },
+ { 0x1.73b88d266bc5ap-639, 0x1.0b10543a01766p-633 },
+ { 0x1.03869ae409b27p-639, 0x1.750a5d3814d59p-634 },
+ { 0x1.6a58134129f18p-640, 0x1.0480f391c14fcp-634 },
+ { 0x1.f9d5b8ddde221p-641, 0x1.6bc9d56645be6p-635 },
+ { 0x1.61080de06bfbp-641, 0x1.fbf623f3bedbap-636 },
+ { 0x1.ecb6d7acd34f7p-642, 0x1.6297b642274f2p-636 },
+ { 0x1.57ca5c62d05ddp-642, 0x1.ef001d6eb49dfp-637 },
+ { 0x1.dfb32aa129cc6p-643, 0x1.5975e7810e7p-637 },
+ { 0x1.4ea0caf213789p-643, 0x1.e222785106b16p-638 },
+ { 0x1.d2cd2eb59de4cp-644, 0x1.50663e5d53392p-638 },
+ { 0x1.458d1220fa79dp-644, 0x1.d55fbee497ep-639 },
+ { 0x1.c60744f31e198p-645, 0x1.476a7d28a437bp-639 },
+ { 0x1.3c90d697e5b5dp-645, 0x1.c8ba606fb6833p-640 },
+ { 0x1.b963b20518321p-646, 0x1.3e8452ecdbe84p-640 },
+ { 0x1.33ada8cfe418fp-646, 0x1.bc34b0b8bbc6p-641 },
+ { 0x1.ace49de2283aep-647, 0x1.35b55b1b3d652p-641 },
+ { 0x1.2ae504dc15f24p-647, 0x1.afd0e79df00ebp-642 },
+ { 0x1.a08c1388db34fp-648, 0x1.2cff1d49f192cp-642 },
+ { 0x1.223852412258p-648, 0x1.a39120c175c51p-643 },
+ { 0x1.945c00d028182p-649, 0x1.24630cff92d39p-643 },
+ { 0x1.19a8e3da77fbep-649, 0x1.97775b48ec1aap-644 },
+ { 0x1.8856364b336c5p-650, 0x1.1be2898c8a8a4p-644 },
+ { 0x1.1137f7cd08642p-650, 0x1.8b8579b06ca2cp-645 },
+ { 0x1.7c7c673fe436ep-651, 0x1.137eddf1f97aep-645 },
+ { 0x1.08e6b787233bap-651, 0x1.7fbd41b078795p-646 },
+ { 0x1.70d029afc4472p-652, 0x1.0b3940d5da6fcp-646 },
+ { 0x1.00b637cd0ec0bp-652, 0x1.74205c365c73ep-647 },
+ { 0x1.6552f6729a259p-653, 0x1.0312d48405757p-647 },
+ { 0x1.f14ef1a3e4ac2p-654, 0x1.68b0556e87723p-648 },
+ { 0x1.5a06296220023p-654, 0x1.f6194df7630e5p-649 },
+ { 0x1.e176ccb941b53p-655, 0x1.5d6e9ce0425a7p-649 },
+ { 0x1.4eeb0196310cdp-655, 0x1.e64f64121563ep-650 },
+ { 0x1.d1e5afef936dap-656, 0x1.525c859a2ea9ap-650 },
+ { 0x1.4402a1b0bd9dfp-656, 0x1.d6c9b6d4d6fc5p-651 },
+ { 0x1.c29d225a230e3p-657, 0x1.477b466ee6cc1p-651 },
+ { 0x1.394e1038ce88ep-657, 0x1.c789ea0183d02p-652 },
+ { 0x1.b39e83951bdaap-658, 0x1.3ccbfa4112a58p-652 },
+ { 0x1.2ece3803d8d68p-658, 0x1.b8917a154498bp-653 },
+ { 0x1.a4eb0c6436cf4p-659, 0x1.324fa05e3adc4p-653 },
+ { 0x1.2483e8ac9d061p-659, 0x1.a9e1bcd30af1fp-654 },
+ { 0x1.9683cf6400112p-660, 0x1.28071ce79e917p-654 },
+ { 0x1.1a6fd716c7c18p-660, 0x1.9b7be1e1550cbp-655 },
+ { 0x1.8869b9cc95345p-661, 0x1.1df33948493fap-655 },
+ { 0x1.10929dfe85b79p-661, 0x1.8d60f37a227b9p-656 },
+ { 0x1.7a9d9444b613ep-662, 0x1.1414a4b7a1729p-656 },
+ { 0x1.06ecbe9338febp-662, 0x1.7f91d72bfd333p-657 },
+ { 0x1.6d2003c3fdf54p-663, 0x1.0a6bf4c7a4f95p-657 },
+ { 0x1.fafd4238f8063p-664, 0x1.720f4eaaf4bbbp-658 },
+ { 0x1.5ff18a8317f0ap-664, 0x1.00f9a5fe04069p-658 },
+ { 0x1.e8912b5139031p-665, 0x1.64d9f8b065b73p-659 },
+ { 0x1.531288f8c01c7p-665, 0x1.ef7c38ee94e41p-660 },
+ { 0x1.d695a98770e4bp-666, 0x1.57f251e86550ep-660 },
+ { 0x1.46833ee262b1p-666, 0x1.dd73492689d2p-661 },
+ { 0x1.c50b006d4e015p-667, 0x1.4b58b5eba6cc7p-661 },
+ { 0x1.3a43cc572b3d3p-667, 0x1.cbd8e7539eac7p-662 },
+ { 0x1.b3f14799b1616p-668, 0x1.3f0d6044b145dp-662 },
+ { 0x1.2e5432e458097p-668, 0x1.baad518e7426ep-663 },
+ { 0x1.a3486c40b74f1p-669, 0x1.33106d7f3cac9p-663 },
+ { 0x1.22b456b1a8db7p-669, 0x1.a9f09adee91e3p-664 },
+ { 0x1.931032d667261p-670, 0x1.2761dc408f1efp-664 },
+ { 0x1.1763ffacc46acp-670, 0x1.99a2acce5bd7fp-665 },
+ { 0x1.834838ba6fe3dp-671, 0x1.1c018e67b6eaep-665 },
+ { 0x1.0c62daba74e7cp-671, 0x1.89c349043d67ep-666 },
+ { 0x1.73eff5eb5eca5p-672, 0x1.10ef4a3481a29p-666 },
+ { 0x1.01b07aeca1f42p-672, 0x1.7a520aeb63faep-667 },
+ { 0x1.6506bebfc67bdp-673, 0x1.062abb7415c63p-667 },
+ { 0x1.ee98b577ea7cap-674, 0x1.6b4e695e9099fp-668 },
+ { 0x1.568bc5a3d72eep-674, 0x1.f766e96435041p-669 },
+ { 0x1.da6bba883d22ap-675, 0x1.5cb7b85aa6067p-669 },
+ { 0x1.487e1cd9f3e43p-675, 0x1.e311e0dabf963p-670 },
+ { 0x1.c6d89f0368fc1p-676, 0x1.4e8d2ab5187d6p-670 },
+ { 0x1.3adcb83cdccc3p-676, 0x1.cf55249e0172ap-671 },
+ { 0x1.b3ddd3216f86ep-677, 0x1.40cdd3d52967cp-671 },
+ { 0x1.2da66f0214306p-677, 0x1.bc2f50c60488ep-672 },
+ { 0x1.a1799fd5925f4p-678, 0x1.3378a96e8e29ap-672 },
+ { 0x1.20d9fd7b31257p-678, 0x1.a99ed8a2f2e6bp-673 },
+ { 0x1.8faa294857a39p-679, 0x1.268c853c2e48dp-673 },
+ { 0x1.147606d4e1ee3p-679, 0x1.97a2092e9b19dp-674 },
+ { 0x1.7e6d714d6fce7p-680, 0x1.1a0826b9b2f1ep-674 },
+ { 0x1.087916d26f37cp-680, 0x1.86370b7b69b46p-675 },
+ { 0x1.6dc159d3dbce3p-681, 0x1.0dea34dab05c3p-675 },
+ { 0x1.f9c3470942341p-682, 0x1.755be71f29feap-676 },
+ { 0x1.5da3a74ec8bc7p-682, 0x1.02313fbe40a01p-676 },
+ { 0x1.e35c1df5edf07p-683, 0x1.650e8497f58cdp-677 },
+ { 0x1.4e120315adc06p-683, 0x1.edb784bbee452p-678 },
+ { 0x1.cdb951dc67cbfp-684, 0x1.554cafa9d0c34p-678 },
+ { 0x1.3f09fdba5037ep-684, 0x1.d7d0486e476ccp-679 },
+ { 0x1.b8d760c6a3faap-685, 0x1.461419b3892c2p-679 },
+ { 0x1.308911536a23dp-685, 0x1.c2a975dad9bep-680 },
+ { 0x1.a4b2aa8c000cap-686, 0x1.37625bf981bdbp-680 },
+ { 0x1.228ca3bac6e07p-686, 0x1.ae3f97cbb25cep-681 },
+ { 0x1.914773f3bbbacp-687, 0x1.2934f9e530badp-681 },
+ { 0x1.151208bdc254ep-687, 0x1.9a8f1bb2e0d78p-682 },
+ { 0x1.7e91e9c37a26bp-688, 0x1.1b8963382a86p-682 },
+ { 0x1.0816843f2edd8p-688, 0x1.879454bd5bf1ap-683 },
+ { 0x1.6c8e23b87885fp-689, 0x1.0e5cf631ac83bp-683 },
+ { 0x1.f72e98937c4f8p-690, 0x1.754b7ed21d736p-684 },
+ { 0x1.5b38276a48eap-690, 0x1.01ad01a5b2ddp-684 },
+ { 0x1.df23162441e8bp-691, 0x1.63b0c17c2afp-685 },
+ { 0x1.4a8beb16012edp-691, 0x1.eaed8e09770edp-686 },
+ { 0x1.c804c1d0522ebp-692, 0x1.52c032be62aabp-686 },
+ { 0x1.3a855850eeeeap-692, 0x1.d36ef8a6e08fap-687 },
+ { 0x1.b1cdcc2ca0214p-693, 0x1.4275d9d00481dp-687 },
+ { 0x1.2b204ea20186ep-693, 0x1.bcd89c2310d59p-688 },
+ { 0x1.9c78595e362cep-694, 0x1.32cdb1c10f0eep-688 },
+ { 0x1.1c58a6013aaeep-694, 0x1.a724c21e93002p-689 },
+ { 0x1.87fe848fd6bffp-695, 0x1.23c3ac05a8c19p-689 },
+ { 0x1.0e2a313c94bb5p-695, 0x1.924da8624908p-690 },
+ { 0x1.745a6341bd9d3p-696, 0x1.1553b2e7eba16p-690 },
+ { 0x1.0090c041eb55fp-696, 0x1.7e4d844204d5fp-691 },
+ { 0x1.61860872f36c7p-697, 0x1.0779abdf88654p-691 },
+ { 0x1.e710449b20327p-698, 0x1.6b1e85d9cfdc3p-692 },
+ { 0x1.4f7b87a3ccd22p-698, 0x1.f462f39da55f5p-693 },
+ { 0x1.ce184ffaa0275p-699, 0x1.58badb2559681p-693 },
+ { 0x1.3e34f7b15484dp-699, 0x1.daedfe49c8a9fp-694 },
+ { 0x1.b6314a8f93441p-700, 0x1.471cb2f12adecp-694 },
+ { 0x1.2dac75898461p-700, 0x1.c28c3fc94131bp-695 },
+ { 0x1.9f52e6b0168fbp-701, 0x1.363e3fa56683p-695 },
+ { 0x1.1ddc26b854422p-701, 0x1.ab358720f461fp-696 },
+ { 0x1.8974e49b18481p-702, 0x1.2619b9e9f9276p-696 },
+ { 0x1.0ebe3bcdc6652p-702, 0x1.94e1adf5ef17ap-697 },
+ { 0x1.748f15c14a99p-703, 0x1.16a96324493c1p-697 },
+ { 0x1.004cf29d383afp-703, 0x1.7f889bf8109c7p-698 },
+ { 0x1.60995fd7916b4p-704, 0x1.07e787ce8decbp-698 },
+ { 0x1.e50530acb7a2bp-705, 0x1.6b224a16aa4ep-699 },
+ { 0x1.4d8bbfb38c98p-705, 0x1.f39d03522ee6ep-700 },
+ { 0x1.cab316f0b29dep-706, 0x1.57a6c57f8fed2p-700 },
+ { 0x1.3b5e4bf3051bbp-706, 0x1.d8b1738bdcb74p-701 },
+ { 0x1.b1987b3f62cd2p-707, 0x1.450e32693ba8dp-701 },
+ { 0x1.2a09376f26716p-707, 0x1.bf0154de94403p-702 },
+ { 0x1.99aa6a5f22416p-708, 0x1.3350cea8cd61ap-702 },
+ { 0x1.1984d37c8d151p-708, 0x1.a681c1d2f0b94p-703 },
+ { 0x1.82de1daeb9c47p-709, 0x1.2266f414ce57bp-703 },
+ { 0x1.09c991f950457p-709, 0x1.8f27fe21c9591p-704 },
+ { 0x1.6d28fdea9871ap-710, 0x1.12491ab5c17d9p-704 },
+ { 0x1.f5a00e548f085p-711, 0x1.78e979aa0c9bep-705 },
+ { 0x1.5880a5ae03598p-711, 0x1.02efdac5a4ff4p-705 },
+ { 0x1.d921d6d1c821bp-712, 0x1.63bbd32217718p-706 },
+ { 0x1.44dae3b23367bp-712, 0x1.e8a7dcff4677cp-707 },
+ { 0x1.be0a394617721p-713, 0x1.4f94da865b2a3p-707 },
+ { 0x1.322dbccd73cabp-713, 0x1.ccdc67829105bp-708 },
+ { 0x1.a44b3f5ce9c8bp-714, 0x1.3c6a934743c05p-708 },
+ { 0x1.206f6db46b93p-714, 0x1.b26f5afd4ebc9p-709 },
+ { 0x1.8bd742e227a38p-715, 0x1.2a3336386b4d7p-709 },
+ { 0x1.0f966c7fd2396p-715, 0x1.99530a15ce61ap-710 },
+ { 0x1.74a0efc06d36ep-716, 0x1.18e533433f227p-710 },
+ { 0x1.ff32d3f1c0a49p-717, 0x1.817a166d90dbdp-711 },
+ { 0x1.5e9b45aff1bep-717, 0x1.087732df4f3abp-711 },
+ { 0x1.e0dea55db81c4p-718, 0x1.6ad7728d6db01p-712 },
+ { 0x1.49b9999981d6cp-718, 0x1.f1c02ea5235f3p-713 },
+ { 0x1.c41e9fb058b1ep-719, 0x1.555e63841a093p-713 },
+ { 0x1.35ef96b0fe655p-719, 0x1.d42dfb77e321ep-714 },
+ { 0x1.a8e19002cb47fp-720, 0x1.4102823a6a0a2p-714 },
+ { 0x1.23313f4adb099p-720, 0x1.b8267dd51660dp-715 },
+ { 0x1.8f16bf19917acp-721, 0x1.2db7bc80b123ep-715 },
+ { 0x1.1172ed701cd4p-721, 0x1.9d98e007ff597p-716 },
+ { 0x1.76adf2095d808p-722, 0x1.1b7255d8af1cep-716 },
+ { 0x1.00a953345bce4p-722, 0x1.8474c5f89cf1fp-717 },
+ { 0x1.5f976a86ba7a3p-723, 0x1.0a26e7ff7c8ap-717 },
+ { 0x1.e192f5a290a0dp-724, 0x1.6caa4dc34bcc6p-718 },
+ { 0x1.49c3e6e576cf8p-724, 0x1.f394c675d5da1p-719 },
+ { 0x1.c3918d16606afp-725, 0x1.562a0ffd36fefp-719 },
+ { 0x1.3524a1ccb90cep-725, 0x1.d4a41cdb95576p-720 },
+ { 0x1.a739e0c3f00b3p-726, 0x1.40e51faa74ee4p-720 },
+ { 0x1.21ab51a49a64p-726, 0x1.b7670ded07be7p-721 },
+ { 0x1.8c781323e2b8bp-727, 0x1.2ccd09eaa341p-721 },
+ { 0x1.0f4a27c210b83p-727, 0x1.9bc980b6cd88bp-722 },
+ { 0x1.7338f3cfd4b18p-728, 0x1.19d3d560c7458p-722 },
+ { 0x1.fbe79eabbab8bp-729, 0x1.81b807901b2ddp-723 },
+ { 0x1.5b69fdd784131p-729, 0x1.07ec015b26bbfp-723 },
+ { 0x1.db36d8463b3e1p-730, 0x1.691fdebe382bep-724 },
+ { 0x1.44f955c9776f6p-730, 0x1.ee11097f70374p-725 },
+ { 0x1.bc693203fe92cp-731, 0x1.51eeeac7320bep-725 },
+ { 0x1.2fd5c7756dd24p-731, 0x1.ce39998362bf9p-726 },
+ { 0x1.9f66cc65fb2cbp-732, 0x1.3c13b67a17ff2p-726 },
+ { 0x1.1beec36eb8502p-732, 0x1.b03976c943068p-727 },
+ { 0x1.8418af0dd65edp-733, 0x1.277d70b2ebc6fp-727 },
+ { 0x1.09345c546e7cdp-733, 0x1.93f94ba2c6b6ap-728 },
+ { 0x1.6a68c4bfd764bp-734, 0x1.141be9e049453p-728 },
+ { 0x1.ef2e87ca7b717p-735, 0x1.7962a50231832p-729 },
+ { 0x1.5241d71eb6e19p-735, 0x1.01df915097b64p-729 },
+ { 0x1.ce118fc8beeeap-736, 0x1.605fee84767fp-730 },
+ { 0x1.3b8f8a28fd848p-736, 0x1.e172e498cd2fcp-731 },
+ { 0x1.aef59daa19c93p-737, 0x1.48dc6e3757e71p-731 },
+ { 0x1.263e577f574dp-737, 0x1.c1366206ca036p-732 },
+ { 0x1.91bfa9231de5cp-738, 0x1.32c440230ef3ap-732 },
+ { 0x1.123b897af1af4p-738, 0x1.a2ee0ea25a216p-733 },
+ { 0x1.7655cd85a2773p-739, 0x1.1e04519eb8f87p-733 },
+ { 0x1.feea6c3554149p-740, 0x1.867f82bdccb8fp-734 },
+ { 0x1.5c9f427a491a4p-740, 0x1.0a8a5c7678dffp-734 },
+ { 0x1.dbb4739afff2ep-741, 0x1.6bd1744d1513ep-735 },
+ { 0x1.4484548d479a3p-741, 0x1.f089c3d3d8b6fp-736 },
+ { 0x1.bab46440d8e4bp-742, 0x1.52cbafb8bc99fp-736 },
+ { 0x1.2dee5d96e696ep-742, 0x1.ce464b1286c0dp-737 },
+ { 0x1.9bcaf0aad775cp-743, 0x1.3b571085ef9dbp-737 },
+ { 0x1.18c7bd07b007fp-743, 0x1.ae2a4fedee59cp-738 },
+ { 0x1.7eda37d26ae66p-744, 0x1.255d79dbe3905p-738 },
+ { 0x1.04fbd01fd3b9ap-744, 0x1.9017432798e26p-739 },
+ { 0x1.63c5ba199716fp-745, 0x1.10c9ceee61d28p-739 },
+ { 0x1.e4edd431a7a4p-746, 0x1.73effa34f57abp-740 },
+ { 0x1.4a724e2f6eadep-746, 0x1.fb0fd6a99ec28p-741 },
+ { 0x1.c24c9890314cdp-747, 0x1.5998a4600495bp-741 },
+ { 0x1.32c615eef6a3dp-747, 0x1.d70936a92f04ap-742 },
+ { 0x1.a1f03c81340fdp-748, 0x1.40f6bfdad1f14p-742 },
+ { 0x1.1ca87340e1c39p-748, 0x1.b55b284add8c1p-743 },
+ { 0x1.83b6cbf2ba29fp-749, 0x1.29f10ece9036ep-743 },
+ { 0x1.0801fd07f7284p-749, 0x1.95e2d86ae92c8p-744 },
+ { 0x1.677ffffc31b92p-750, 0x1.146f8c6e8dc57p-744 },
+ { 0x1.e978e83ebd95dp-751, 0x1.787f26e598ebbp-745 },
+ { 0x1.4d2d2f5dd4096p-751, 0x1.005b6216a17eap-745 },
+ { 0x1.c58570e2f641dp-752, 0x1.5d10973fbab06p-746 },
+ { 0x1.34a13f272cdfap-752, 0x1.db3db8f832a58p-747 },
+ { 0x1.a4017c5ace0dep-753, 0x1.4379416dfac63p-747 },
+ { 0x1.1dc0938cfb932p-753, 0x1.b84ac1ef46255p-748 },
+ { 0x1.84c7064147f81p-754, 0x1.2b9cc2c3d6738p-748 },
+ { 0x1.087100f5e6429p-754, 0x1.97b6c5dc3637ap-749 },
+ { 0x1.67b20873fc995p-755, 0x1.15602f1227af8p-749 },
+ { 0x1.e9337a8979dap-756, 0x1.795cb2bb480b6p-750 },
+ { 0x1.4ca0667456eb8p-756, 0x1.00aa01fc8a73ep-750 },
+ { 0x1.c446a2ccade1cp-757, 0x1.5d196927cdaccp-751 },
+ { 0x1.3371d92c55c69p-757, 0x1.dac421184af19p-752 },
+ { 0x1.a1ef1650d3562p-758, 0x1.42cba823b93cbp-752 },
+ { 0x1.1c07db1df4cf6p-758, 0x1.b6e2f60b615c1p-753 },
+ { 0x1.8202debc2593cp-759, 0x1.2a53f94211ba9p-753 },
+ { 0x1.064595037ce7bp-759, 0x1.95853e0fd75adp-754 },
+ { 0x1.645a58ac6913cp-760, 0x1.13949d3b2fbd2p-754 },
+ { 0x1.e41f95cc492cep-761, 0x1.768213ee2ba9cp-755 },
+ { 0x1.48d0194e5b153p-761, 0x1.fce2f1e195a7ap-756 },
+ { 0x1.be99935f38c42p-762, 0x1.59b2d772c1b04p-756 },
+ { 0x1.2f40d4a5d287p-762, 0x1.d5a005ce1b15dp-757 },
+ { 0x1.9bc8aa74c3805p-763, 0x1.3ef3138f8ae58p-757 },
+ { 0x1.178b448b82b16p-763, 0x1.b12e626e3c8a1p-758 },
+ { 0x1.7b7f2dc7fa066p-764, 0x1.2620652c3102cp-758 },
+ { 0x1.0190106456396p-764, 0x1.8f5ecffd9c995p-759 },
+ { 0x1.5d92194746ef2p-765, 0x1.0f1a62a97a48ep-759 },
+ { 0x1.da636b2add63ap-766, 0x1.7004d0a0dd3fcp-760 },
+ { 0x1.41d8f14e2d235p-766, 0x1.f38508375a815p-761 },
+ { 0x1.b4a8e16df3a2ep-767, 0x1.52f67f4a45dbdp-761 },
+ { 0x1.282da2ee06e9fp-767, 0x1.cbf8187da97p-762 },
+ { 0x1.91bc4f0e82a1p-768, 0x1.380c6fa6ddd1bp-762 },
+ { 0x1.106c65473611bp-768, 0x1.a757e44dde4fbp-763 },
+ { 0x1.716ca73d3a1dcp-769, 0x1.1f218f165083cp-763 },
+ { 0x1.f4e737e667fe6p-770, 0x1.8571975a9ba0cp-764 },
+ { 0x1.538bdbc88035p-770, 0x1.081306aee058bp-764 },
+ { 0x1.cc4774fe05a13p-771, 0x1.661571375ee31p-765 },
+ { 0x1.37eeb586702afp-771, 0x1.e5803c9b677cp-766 },
+ { 0x1.a6be51e94d2c3p-772, 0x1.49169d29f057fp-766 },
+ { 0x1.1e6cae3cc5ce4p-772, 0x1.be144165bfdadp-767 },
+ { 0x1.841452e30c6ecp-773, 0x1.2e4b0b7596d86p-767 },
+ { 0x1.06dfcc0330324p-773, 0x1.99a8814f82396p-768 },
+ { 0x1.64157d8dbcaa1p-774, 0x1.158b4c1d7aa61p-768 },
+ { 0x1.e248fc3725278p-775, 0x1.7806fe5adc0dep-769 },
+ { 0x1.4691284199248p-775, 0x1.fd64d63539ac4p-770 },
+ { 0x1.ba32f675bcca1p-776, 0x1.58fd2560c98e3p-770 },
+ { 0x1.2b59cb5fcd07p-776, 0x1.d33b9c01b8858p-771 },
+ { 0x1.953f4278d9771p-777, 0x1.3c5b9e7be019ep-771 },
+ { 0x1.1244d4a198783p-777, 0x1.ac5a261b57bd2p-772 },
+ { 0x1.7333ac721d353p-778, 0x1.21f61f6e6a3a5p-772 },
+ { 0x1.f654f8b2c9938p-779, 0x1.8883e334bf813p-773 },
+ { 0x1.53d9d5f4e3889p-779, 0x1.09a33ffab8174p-773 },
+ { 0x1.cbcb3935e8707p-780, 0x1.678037d69a88ap-774 },
+ { 0x1.36fefd85e37f7p-780, 0x1.e678a0474dd4dp-775 },
+ { 0x1.a4a7147e53789p-781, 0x1.491a44a8cc267p-775 },
+ { 0x1.1c73c8c2f3143p-781, 0x1.bd3a60953bab8p-776 },
+ { 0x1.80a7df6e9e4abp-782, 0x1.2d20af56e98e4p-776 },
+ { 0x1.040c111171b21p-782, 0x1.9748563f2a02cp-777 },
+ { 0x1.5f9153468350dp-783, 0x1.13656dff66048p-777 },
+ { 0x1.db3d65827b6f1p-784, 0x1.7463a2ae57157p-778 },
+ { 0x1.412b4a3b0b6bbp-784, 0x1.f77b2a384d071p-779 },
+ { 0x1.b20abd232bd72p-785, 0x1.5451ae34b02aep-779 },
+ { 0x1.25417f5fe18aap-785, 0x1.cc024fa52d21ep-780 },
+ { 0x1.8c38db09c3d68p-786, 0x1.36dbe645ba702p-780 },
+ { 0x1.0ba351c6b2c44p-786, 0x1.a415d531b6e85p-781 },
+ { 0x1.69856de02317p-787, 0x1.1bcf7eeeba2f5p-781 },
+ { 0x1.e847157246bfcp-788, 0x1.7f70703ac5558p-782 },
+ { 0x1.49b2d16422141p-788, 0x1.02fd377359b1p-782 },
+ { 0x1.bd304de355d85p-789, 0x1.5dd1b0bb84b26p-783 },
+ { 0x1.2c87c2ff697dcp-789, 0x1.d87243e77ecadp-784 },
+ { 0x1.95b4456f24a66p-790, 0x1.3efdb3b369292p-784 },
+ { 0x1.11cf1a60f1d84p-790, 0x1.aeb4dc01a4631p-785 },
+ { 0x1.718a9184a8678p-791, 0x1.22bcd99dbdb06p-785 },
+ { 0x1.f2af0be1fde49p-792, 0x1.88766c06b0833p-786 },
+ { 0x1.507007917e3d9p-792, 0x1.08db80d427d79p-786 },
+ { 0x1.c5e695f15072bp-793, 0x1.65709eb54bf5ep-787 },
+ { 0x1.32266540e08c2p-793, 0x1.e253876b38acep-788 },
+ { 0x1.9cf012acb820bp-794, 0x1.45623a2f6a451p-788 },
+ { 0x1.1673fda512b46p-794, 0x1.b6f674d703273p-789 },
+ { 0x1.777d05328bd26p-795, 0x1.280eca736b4b1p-789 },
+ { 0x1.fa46d62b8e57dp-796, 0x1.8f4d804e3ad6fp-790 },
+ { 0x1.5544c8bc23e1cp-796, 0x1.0d3e50a2eecdcp-790 },
+ { 0x1.cc068b1dc8ab2p-797, 0x1.6b0c7763ce52bp-791 },
+ { 0x1.36042b906571p-797, 0x1.e979edc5b3767p-792 },
+ { 0x1.a1cbbab815b4cp-798, 0x1.49ecd657d5dd6p-792 },
+ { 0x1.197d0fe71564cp-798, 0x1.bcb59141dc715p-793 },
+ { 0x1.7b41f3bcb1869p-799, 0x1.2bad65a82bb23p-793 },
+ { 0x1.feec24eca8006p-800, 0x1.93d6de18ac6bfp-794 },
+ { 0x1.581b387627669p-800, 0x1.1011dd6dfecf6p-794 },
+ { 0x1.cf746ccaba032p-801, 0x1.6e8be31f2fe24p-795 },
+ { 0x1.380f8b864e1acp-801, 0x1.edc51c8649aaap-796 },
+ { 0x1.a4312cc2f816ap-802, 0x1.4c88f43732a1p-796 },
+ { 0x1.1adc83c96accfp-802, 0x1.bfd81ed74f1cdp-797 },
+ { 0x1.7cc835281bbf3p-803, 0x1.2d883a292df3bp-797 },
+ { 0x1.0044e6f2b903fp-803, 0x1.95fde403b5724p-798 },
+ { 0x1.58e66674c0f82p-804, 0x1.11494966870b7p-798 },
+ { 0x1.d0209514d613dp-805, 0x1.6fdef1ca550b3p-799 },
+ { 0x1.383f2f4495aedp-805, 0x1.ef217eb67d36dp-800 },
+ { 0x1.a41575f0363d6p-806, 0x1.4d2aaa5b8e28ap-800 },
+ { 0x1.1a8c12a0cae91p-806, 0x1.c04fcbf1fddd8p-801 },
+ { 0x1.7c08d08f2ccbbp-807, 0x1.2d96cdd2a30b8p-801 },
+ { 0x1.ff186c5b90604p-808, 0x1.95b8ba50a2687p-802 },
+ { 0x1.57a2b0b1c4c86p-808, 0x1.10df03cd711e3p-802 },
+ { 0x1.ce07ef98af2aep-809, 0x1.6eff939f51c8fp-803 },
+ { 0x1.36923c5eb270bp-809, 0x1.ed88d96607fb4p-804 },
+ { 0x1.a1791489717bfp-810, 0x1.4bcf1445c1d61p-804 },
+ { 0x1.188d2c2d680a3p-810, 0x1.be1a747b458c8p-805 },
+ { 0x1.7907312c7e255p-811, 0x1.2bd8dde16ba8ap-805 },
+ { 0x1.fa9e995f4c414p-812, 0x1.93089dc23e417p-806 },
+ { 0x1.5455df149c7b5p-812, 0x1.0ed4f34d6e965p-806 },
+ { 0x1.c93410e8142f8p-813, 0x1.6bf1c754a3325p-807 },
+ { 0x1.33105a5b594f7p-813, 0x1.e9027b1c5a4abp-808 },
+ { 0x1.9c67f441e11b3p-814, 0x1.487c687197597p-808 },
+ { 0x1.14e8ebae7496ep-814, 0x1.b942323a72767p-809 },
+ { 0x1.73d10c597b774p-815, 0x1.285660efb3e9ap-809 },
+ { 0x1.f330b99c7f9e7p-816, 0x1.8df9d62fb9c5ep-810 },
+ { 0x1.4f0ef77c81a6fp-816, 0x1.0b34677fe9486p-810 },
+ { 0x1.c1baedb5f2e65p-817, 0x1.66c37bb05de1ep-811 },
+ { 0x1.2dc9788ad9864p-817, 0x1.e1a30436bcde5p-812 },
+ { 0x1.94f913add4907p-818, 0x1.4341c90c553e7p-812 },
+ { 0x1.0fafd2c40ba27p-818, 0x1.b1dd0ffc5d04bp-813 },
+ { 0x1.6c7df995241d1p-819, 0x1.231f4a6757469p-813 },
+ { 0x1.e8f062cc963cep-820, 0x1.86a35930ed5e1p-814 },
+ { 0x1.47e5cbff0d92ep-820, 0x1.060dd236f49a3p-814 },
+ { 0x1.b7be34be4e18dp-821, 0x1.5f8c25cd122d7p-815 },
+ { 0x1.26d5559b935e7p-821, 0x1.d78bca82e9f37p-816 },
+ { 0x1.8b4dd6af9c05dp-822, 0x1.3c36d15093021p-816 },
+ { 0x1.08f94cfc79158p-822, 0x1.a80c62c44a65bp-817 },
+ { 0x1.632ec0e0d009cp-823, 0x1.1c4b11ed6627ap-817 },
+ { 0x1.dc0b5f2e40ea4p-824, 0x1.7d261cc2edf72p-818 },
+ { 0x1.3efa480ea698bp-824, 0x1.fef096f5252fp-819 },
+ { 0x1.ab6a5245de9e5p-825, 0x1.566c107178d1fp-819 },
+ { 0x1.1e52cde409267p-825, 0x1.cae9de8f00c0bp-820 },
+ { 0x1.7f910d0084829p-826, 0x1.337ae444bd293p-820 },
+ { 0x1.00e3012bd4171p-826, 0x1.9bfbcfe9dc1e8p-821 },
+ { 0x1.580c66bfc7cf5p-827, 0x1.13f803c0631d9p-821 },
+ { 0x1.ccba595fe34b5p-828, 0x1.71ac2109d33c9p-822 },
+ { 0x1.347383dcf4a9bp-828, 0x1.ef21caa7d80c3p-823 },
+ { 0x1.9cf52785fcd1fp-829, 0x1.4b8b6bbdb7a4fp-823 },
+ { 0x1.1466f7a4ba4b3p-829, 0x1.bbf4bcf8ca0c3p-824 },
+ { 0x1.71f5b701cb667p-830, 0x1.2934441fdae8bp-824 },
+ { 0x1.ef1fef5338f87p-831, 0x1.8de00a5d4cff3p-825 },
+ { 0x1.4b46ffc2e70ccp-831, 0x1.0a4a61359d63ap-825 },
+ { 0x1.bb3f3e667d5e5p-832, 0x1.64673b39bdd54p-826 },
+ { 0x1.287ea78b8278fp-832, 0x1.dcf3acd0cc1f4p-827 },
+ { 0x1.8c9c8347a2863p-833, 0x1.3f1926f0c2aa4p-827 },
+ { 0x1.093c166d47d9p-833, 0x1.aaecb94ca24e1p-828 },
+ { 0x1.62b5957e6b822p-834, 0x1.1d8efbbc88d6cp-828 },
+ { 0x1.da4f3c5b8c56fp-835, 0x1.7df554174928cp-829 },
+ { 0x1.3d1457a1afdaep-835, 0x1.fed6b4a9440a8p-830 },
+ { 0x1.a7e3665ffae25p-836, 0x1.558fae0fed7aap-830 },
+ { 0x1.1b4da97b89113p-836, 0x1.c8b307e047613p-831 },
+ { 0x1.7aa46b2ec675cp-837, 0x1.3149a005e5984p-831 },
+ { 0x1.fa00e080e536p-838, 0x1.9819329634547p-832 },
+ { 0x1.520f92dcad4a2p-838, 0x1.10bba52994e8ep-832 },
+ { 0x1.c3a9666328faap-839, 0x1.6c7dd2d93c0f9p-833 },
+ { 0x1.2dae795ce73b6p-839, 0x1.e70fd5d6d806dp-834 },
+ { 0x1.92f5963d343cfp-840, 0x1.45629dffe1fa7p-834 },
+ { 0x1.0d15f439254bep-840, 0x1.b2b2e959996bp-835 },
+ { 0x1.675546ac2c967p-841, 0x1.2255364dfcfd7p-835 },
+ { 0x1.dfca1ff236f02p-842, 0x1.83c6a3841fccap-836 },
+ { 0x1.4046155930cfbp-842, 0x1.02ee197efc99dp-836 },
+ { 0x1.ab8846c89a496p-843, 0x1.59bfc8bdbfffep-837 },
+ { 0x1.1d5226b496f7ep-843, 0x1.cd9f4c973304p-838 },
+ { 0x1.7cc7edd2bedd1p-844, 0x1.3420703d360eap-838 },
+ { 0x1.fc1e021531b11p-845, 0x1.9b4a6e4580455p-839 },
+ { 0x1.52f9fd29afa7bp-845, 0x1.1276cde31355ep-839 },
+ { 0x1.c439018f9e7bp-846, 0x1.6e44a0da72dedp-840 },
+ { 0x1.2d9d4a3bfacfap-846, 0x1.e8b82d35e9882p-841 },
+ { 0x1.9247c7d6b7109p-847, 0x1.4603c1a2de688p-841 },
+ { 0x1.0c3d4d5746632p-847, 0x1.b2e6fa531d555p-842 },
+ { 0x1.65add59367765p-848, 0x1.220b241172407p-842 },
+ { 0x1.dce1e8301e6efp-849, 0x1.82d28ae825549p-843 },
+ { 0x1.3dde18cb97a8dp-849, 0x1.01ea51e3f541cp-843 },
+ { 0x1.a7b31ccb0b2f4p-850, 0x1.57e3d8e31e749p-844 },
+ { 0x1.1a59798dd7aa2p-850, 0x1.ca77ce984ce61p-845 },
+ { 0x1.7843a7981f8e3p-851, 0x1.3192c63185ef2p-845 },
+ { 0x1.f55b0f3ffe463p-852, 0x1.974911a73b1a7p-846 },
+ { 0x1.4df9fe655b0fbp-852, 0x1.0f64b579273f6p-846 },
+ { 0x1.bce68ce6bcfedp-853, 0x1.69a3e1bad13dap-847 },
+ { 0x1.284bfe1cdea24p-853, 0x1.e1d6859c11527p-848 },
+ { 0x1.8a9c29acbf47dp-854, 0x1.40f425a16dca3p-848 },
+ { 0x1.06bd70b72892bp-854, 0x1.ab8633790b1e2p-849 },
+ { 0x1.5dd55c1a48477p-855, 0x1.1cb4a43b9229fp-849 },
+ { 0x1.d1bd6b173b9f2p-856, 0x1.7b25cc6523c3bp-850 },
+ { 0x1.35fc8451ff49ep-856, 0x1.f8db2dc70232bp-851 },
+ { 0x1.9c9712232f548p-857, 0x1.5014bc06e7f91p-851 },
+ { 0x1.128b47439dcd5p-857, 0x1.bf66ba3b9066cp-852 },
+ { 0x1.6d53d2be0a0b6p-858, 0x1.29c2c1dc958dbp-852 },
+ { 0x1.e6122171333dfp-859, 0x1.8c4a9d76af90fp-853 },
+ { 0x1.435229d0cc681p-859, 0x1.07ae5a7347d0bp-853 },
+ { 0x1.ae1371b74ea2dp-860, 0x1.5ed9539dfd0c9p-854 },
+ { 0x1.1e01427183001p-860, 0x1.d2c69c7599edcp-855 },
+ { 0x1.7c589442700ecp-861, 0x1.3677341a98a13p-855 },
+ { 0x1.f9be9e1d7b4e4p-862, 0x1.9cf2c5625685ep-856 },
+ { 0x1.5033c96eb757p-862, 0x1.1298aebe8af0fp-856 },
+ { 0x1.bef014f36ffa9p-863, 0x1.6d2655c8560ebp-857 },
+ { 0x1.290979be09b3bp-863, 0x1.e58166789d0bcp-858 },
+ { 0x1.8ac6ba86dcc3cp-864, 0x1.42b9e90b536b6p-858 },
+ { 0x1.064e638fb2517p-864, 0x1.acfe7e64002b1p-859 },
+ { 0x1.5c884857d8adep-865, 0x1.1d179e12ade6ep-859 },
+ { 0x1.cf0beaeb1b319p-866, 0x1.7ae01eb0f55cbp-860 },
+ { 0x1.338e29511ffcdp-866, 0x1.f772a9e0423a1p-861 },
+ { 0x1.9881a23b2ff9bp-867, 0x1.4e72e15f0f016p-861 },
+ { 0x1.0f43798c4f845p-867, 0x1.bc4e2f5a8c9afp-862 },
+ { 0x1.6836e63bd7d88p-868, 0x1.27165d875ec78p-862 },
+ { 0x1.de466f9c32fdap-869, 0x1.87eb54ae1860dp-863 },
+ { 0x1.3d79f883687bfp-869, 0x1.043b38d103ec9p-863 },
+ { 0x1.a56d48500b8a3p-870, 0x1.598a7d65e3b67p-864 },
+ { 0x1.17ac327f9b5e5p-870, 0x1.cac2d1ee89db1p-865 },
+ { 0x1.73278f241bb95p-871, 0x1.308090afcd9f3p-865 },
+ { 0x1.ec801820c3f3dp-872, 0x1.942d41e7bf2a3p-866 },
+ { 0x1.46b841565ab3ep-872, 0x1.0c34dc595f4bfp-866 },
+ { 0x1.b16ea850bfa34p-873, 0x1.63e9cb83e74b2p-867 },
+ { 0x1.1f76e44abf0ecp-873, 0x1.d83e5a3ffd7adp-868 },
+ { 0x1.7d432d7dd0ca1p-874, 0x1.39428e0fd00c5p-868 },
+ { 0x1.f99abec00b682p-875, 0x1.9f8c2eadfb109p-869 },
+ { 0x1.4f35579392d4bp-875, 0x1.13957092e7741p-869 },
+ { 0x1.bc6c19eee10e8p-876, 0x1.6d7ad6ac744f9p-870 },
+ { 0x1.2692d6adc530fp-876, 0x1.e4a41e3c393c2p-871 },
+ { 0x1.8673fad41c337p-877, 0x1.4149a31665d1ep-871 },
+ { 0x1.02bd066e6e446p-877, 0x1.a9efbad7c9909p-872 },
+ { 0x1.56dece3f159c3p-878, 0x1.1a4d14ca40e6p-872 },
+ { 0x1.c64dabfd6babdp-879, 0x1.7628f37011dc7p-873 },
+ { 0x1.2cf07ed3ac7cap-879, 0x1.efd93aae49244p-874 },
+ { 0x1.8ea5cdb1b77f8p-880, 0x1.4884565714d83p-874 },
+ { 0x1.0801f05da3babp-880, 0x1.b341347ab9d2ep-875 },
+ { 0x1.5da3ba0723cbcp-881, 0x1.204d0f497ca7dp-875 },
+ { 0x1.cefd7b19fc691p-882, 0x1.7de10a24a9be3p-876 },
+ { 0x1.3281b7ca3d771p-882, 0x1.f9c4f419d97b9p-877 },
+ { 0x1.95c663259c5d8p-883, 0x1.4ee2a6bb63f1dp-877 },
+ { 0x1.0c90568fe453bp-883, 0x1.bb6bea4d790c6p-878 },
+ { 0x1.6374ef6370a23p-884, 0x1.258802fee3a1bp-878 },
+ { 0x1.d668024e6e773p-885, 0x1.8491dcb50d65p-879 },
+ { 0x1.3739f6c74a992p-885, 0x1.012888bcf5e1bp-879 },
+ { 0x1.9bc5a2748239p-886, 0x1.5456466d99824p-880 },
+ { 0x1.105de86fb726ep-886, 0x1.c25d7813e5a28p-881 },
+ { 0x1.68453b252f9afp-887, 0x1.29f220ff323bdp-881 },
+ { 0x1.dc7c640bf856fp-888, 0x1.8a2c46b36447dp-882 },
+ { 0x1.3b0e7a2d8004dp-888, 0x1.04b5178932d9ep-882 },
+ { 0x1.a095d99893beap-889, 0x1.58d2d04dcdef9p-883 },
+ { 0x1.1361f24d04a1ep-889, 0x1.c8060b8a624d8p-884 },
+ { 0x1.6c0994513d45bp-890, 0x1.2d8154e3020f5p-884 },
+ { 0x1.e12caa0268707p-891, 0x1.8ea37661d565fp-885 },
+ { 0x1.3df6725a60cf5p-891, 0x1.078003d294269p-885 },
+ { 0x1.a42bf15180a09p-892, 0x1.5c4df6da1a5fp-886 },
+ { 0x1.15957e82800c6p-892, 0x1.cc58a0676d26ep-887 },
+ { 0x1.6eb9463d29a0dp-893, 0x1.302d6b1661efp-887 },
+ { 0x1.e46dfa81a2018p-894, 0x1.91ed1d851d1ddp-888 },
+ { 0x1.3feb236502138p-894, 0x1.0982d94421652p-888 },
+ { 0x1.a67f97b02e026p-895, 0x1.5ebfab91b4a2bp-889 },
+ { 0x1.16f37032d6085p-895, 0x1.cf4b3235443f5p-890 },
+ { 0x1.704e120e656fdp-896, 0x1.31f0304f01ddbp-890 },
+ { 0x1.e638c247f445dp-897, 0x1.940198fd0e1c2p-891 },
+ { 0x1.40e7ff18c854cp-897, 0x1.0ab8eaa8fae67p-891 },
+ { 0x1.a78b6039c7039p-898, 0x1.60223e0067b2cp-892 },
+ { 0x1.1778970df4481p-898, 0x1.d0d6e2f89dd66p-893 },
+ { 0x1.70c446e7535ccp-899, 0x1.32c589802b4bap-893 },
+ { 0x1.e688d1dc06742p-900, 0x1.94dc0e4e3bd62p-894 },
+ { 0x1.40eab69ffb357p-900, 0x1.0b1f64079cf15p-894 },
+ { 0x1.a74cd8f49285bp-901, 0x1.607271cb1c23p-895 },
+ { 0x1.1723bbb37e71p-901, 0x1.d0f815d3e30e4p-896 },
+ { 0x1.701ad03f5aba2p-902, 0x1.32ab83cb1b9aap-896 },
+ { 0x1.e55d6dd34aeb5p-903, 0x1.947a7e7d08e62p-897 },
+ { 0x1.3ff3437e5e592p-903, 0x1.0ab555a059592p-897 },
+ { 0x1.a5c493ec4b75bp-904, 0x1.5faf8b45ee11cp-898 },
+ { 0x1.15f5a46f2a8c5p-904, 0x1.cfae7d166a387p-899 },
+ { 0x1.6e533a1804da5p-905, 0x1.31a25c153692fp-899 },
+ { 0x1.e2b951ac76b4bp-906, 0x1.92ddcdd3a585ap-900 },
+ { 0x1.3e03e7aaf4a23p-906, 0x1.097bb793410b5p-900 },
+ { 0x1.a2f624fa2da41p-907, 0x1.5ddb524f58124p-901 },
+ { 0x1.13f112353b2e2p-907, 0x1.ccfd1b6b2b0d1p-902 },
+ { 0x1.6b71aaf8395acp-908, 0x1.2fac7e1ac1a55p-902 },
+ { 0x1.dea2a52e6f8d6p-909, 0x1.9009c068a7447p-903 },
+ { 0x1.3b2124c85eb7dp-909, 0x1.077566199da13p-903 },
+ { 0x1.9ee813dcc82f4p-910, 0x1.5afa0b60e30adp-904 },
+ { 0x1.111ab5ef7d9cep-910, 0x1.c8ea38207b48cp-905 },
+ { 0x1.677cd3ce598a2p-911, 0x1.2cce7b0334e93p-905 },
+ { 0x1.d922e485849dfp-912, 0x1.8c04eb792831bp-906 },
+ { 0x1.3751aaab95803p-912, 0x1.04a716678c7d9p-906 },
+ { 0x1.99a3c2eb312dfp-913, 0x1.571266fb205e7p-907 },
+ { 0x1.0d791e54efc95p-913, 0x1.c37f46c8a36cep-908 },
+ { 0x1.627dd610c1f2fp-914, 0x1.290ef7aa6784ep-908 },
+ { 0x1.d246bba093dddp-915, 0x1.86d89be61c44fp-909 },
+ { 0x1.329e3d8fc35e5p-915, 0x1.011744722e8f8p-909 },
+ { 0x1.93354aecb0f91p-916, 0x1.522d67c700dd9p-910 },
+ { 0x1.09149eae599f4p-916, 0x1.bcc8c2b79e5e6p-911 },
+ { 0x1.5c8020a89d6a7p-917, 0x1.247692feaf7c7p-911 },
+ { 0x1.ca1dd59404578p-918, 0x1.8090b25f1fb1cp-912 },
+ { 0x1.2d1194826d1d9p-918, 0x1.f99c33fa36826p-913 },
+ { 0x1.8bab4cd7bc185p-919, 0x1.4c563ff8738edp-913 },
+ { 0x1.03f72f0fa181cp-919, 0x1.b4d5ff233ee8bp-914 },
+ { 0x1.559144638d7d2p-920, 0x1.1f0fc4fe41aefp-914 },
+ { 0x1.c0baa10766979p-921, 0x1.793b75fbd2367p-915 },
+ { 0x1.26b830bbc4f33p-921, 0x1.efaa9eeaa4992p-916 },
+ { 0x1.8316ba6f8ef74p-922, 0x1.459a26ac43fcfp-916 },
+ { 0x1.fc588d5eeb3p-923, 0x1.abb8ece685efep-917 },
+ { 0x1.4dc0c0d42f863p-923, 0x1.18e6b704952c1p-917 },
+
{ 0x1.b6320aea7077ap-924, 0x1.70e95e366ca95p-918 }, + { 0x1.1fa02ebad6485p-924, 0x1.e4700e7fab75ep-919 }, + { 0x1.798a96e59845bp-925, 0x1.3e0826243926dp-919 }, + { 0x1.ef81624855ca5p-926, 0x1.a185d71d9ae78p-920 }, + { 0x1.451fcaaed5e7p-926, 0x1.1209163a43d8ap-920 }, + { 0x1.aa9b30dd7b333p-927, 0x1.67acd56555624p-921 }, + { 0x1.17d9121b4ff43p-927, 0x1.d805487b20ec2p-922 }, + { 0x1.6f1bb0c9eff18p-928, 0x1.35b0e3e76f72ap-922 }, + { 0x1.e184bec96bcc5p-929, 0x1.965317fc3f8ebp-923 }, + { 0x1.3bc10ccdff1d7p-929, 0x1.0a85e11600392p-923 }, + { 0x1.9e0f0cdf83a76p-930, 0x1.5d99f4f4fa7a2p-924 }, + { 0x1.0f738d3253e75p-930, 0x1.ca8538b911cc2p-925 }, + { 0x1.63e056b37b486p-931, 0x1.2ca663e8f6c6ep-925 }, + { 0x1.d2806afda0512p-932, 0x1.8a38c763ae5p-926 }, + { 0x1.31b865207923bp-932, 0x1.026d30f31261ep-926 }, + { 0x1.90a81bef15367p-933, 0x1.52c63cbe5201dp-927 }, + { 0x1.068145905baddp-933, 0x1.bc0c903e2dd51p-928 }, + { 0x1.57f0081c7461bp-934, 0x1.22fbc7eb40c8ep-928 }, + { 0x1.c293abfeb81c1p-935, 0x1.7d5064d5d2e6ap-929 }, + { 0x1.271a9ed146425p-935, 0x1.f3a001a1da12ap-930 }, + { 0x1.8282015bfd093p-936, 0x1.474846e880b8p-930 }, + { 0x1.fa292d1f4b615p-937, 0x1.acb96019278e3p-931 }, + { 0x1.4b6323fa7fafcp-937, 0x1.18c50c637e437p-931 }, + { 0x1.b1ded81f6cf48p-938, 0x1.6fb47e7243b1p-932 }, + { 0x1.1bfd2aff12d23p-938, 0x1.e17fe4af1cdcdp-933 }, + { 0x1.73b9288cf980bp-939, 0x1.3b3779cd081bcp-933 }, + { 0x1.e680a6315c8f9p-940, 0x1.9caab20737c4bp-934 }, + { 0x1.3e52969a46a03p-940, 0x1.0e16c42489121p-934 }, + { 0x1.a082ea93d471fp-941, 0x1.618056ad2fa0dp-935 }, + { 0x1.1075d9566cab2p-941, 0x1.ce9e247afa7efp-936 }, + { 0x1.646a66f6fb197p-942, 0x1.2eabb9557e4c3p-936 }, + { 0x1.d22f0f82317a8p-943, 0x1.8c0020c90fd02p-937 }, + { 0x1.30d7883df3e07p-943, 0x1.0305d4157bdecp-937 }, + { 0x1.8ea1187daf8b3p-944, 0x1.52cf8a69cbdeep-938 }, + { 0x1.049a91d747c02p-944, 0x1.bb1f3a4ce848cp-939 }, + { 0x1.54b29ff375e83p-945, 0x1.21bd19407d3a8p-939 }, + { 0x1.bd5a7cbaf896dp-946, 0x1.7ad97206eb3e9p-940 }, + { 0x1.230b0dec754dap-946, 0x1.ef4e6059f1fe4p-941 }, + { 0x1.7c5a693980a4p-947, 0x1.43bdb9112e65bp-941 }, + { 0x1.f10221f87a1cap-948, 0x1.a7278c0b2c815p-942 }, + { 0x1.44ae6c097e3b8p-948, 0x1.148391a9b5b7p-942 }, + { 0x1.a8288818abb4p-949, 0x1.69563388e87eep-943 }, }, - -/* Coefficients for each order 12 polynomial on each of the 20 intervals. 
*/ -.poly = { - {0x1.ffffffffffff6p-1, -0x1.20dd750429b66p0, 0x1.fffffffffffdcp-1, - -0x1.812746b03713ap-1, 0x1.ffffffffbe94cp-2, -0x1.341f6bb6ec9a6p-2, - 0x1.555553a70ec2ep-3, -0x1.6023b4617a388p-4, 0x1.5550f0e40bfbap-5, - -0x1.38c290c0c8de8p-6, 0x1.0e84002c6274ep-7, -0x1.a599eb0ac5d04p-9, - 0x1.c9bfafa73899cp-11}, - {0x1.a2b43dbd503c8p-1, -0x1.a3495b7c9e6a4p-1, 0x1.535f3fb8cb92ap-1, - -0x1.d96ee9c714f44p-2, 0x1.26956676d2c64p-2, -0x1.4e2820da90c08p-3, - 0x1.5ea0cffac775ap-4, -0x1.57fb82ca373e8p-5, 0x1.3e0e8f48ba0f8p-6, - -0x1.16a695af1bbd4p-7, 0x1.cc836241a87d4p-9, -0x1.531de41264fdap-10, - 0x1.526a8a14e9bfcp-12}, - {0x1.532e75821ed48p-1, -0x1.28be350460782p-1, 0x1.b08873adbf108p-2, - -0x1.14377569249e2p-2, 0x1.3e1ece8cd10dap-3, -0x1.5087e2e6dc2e8p-4, - 0x1.4b3adb3bb335ap-5, -0x1.32342d711a4f4p-6, 0x1.0bc4f6ce2b656p-7, - -0x1.bcdaa331f2144p-9, 0x1.5c21c9e0ca954p-10, -0x1.dfdc9b3b5c402p-12, - 0x1.b451af7dd52fep-14}, - {0x1.10f9745a4f44ap-1, -0x1.9b03213e6963ap-2, 0x1.09b942bc8de66p-2, - -0x1.32755394481e4p-3, 0x1.42819b18af0e4p-4, -0x1.3a6d643aaa572p-5, - 0x1.1f17897603eaep-6, -0x1.eefb8d3f89d42p-8, 0x1.95559544f2fbp-9, - -0x1.3c2a67c33338p-10, 0x1.cffa784efe6cp-12, -0x1.282646774689cp-13, - 0x1.e654e67532b44p-16}, - {0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c04dp-2, 0x1.3c27283c328dbp-3, - -0x1.44837f88ea4bdp-4, 0x1.33cad0e887482p-5, -0x1.10fcf0bc8963cp-6, - 0x1.c8cb68153ec42p-8, -0x1.6aef9a9842c54p-9, 0x1.1334345d6467cp-10, - -0x1.8ebe8763a2a8cp-12, 0x1.0f457219dec0dp-13, -0x1.3d2501dcd2a0fp-15, - 0x1.d213a128a75c9p-18}, - {0x1.5ee444130b7dbp-2, -0x1.78396ab208478p-3, 0x1.6e617ec5c0cc3p-4, - -0x1.49e60f63656b5p-5, 0x1.16064fddbbcb9p-6, -0x1.ba80af6a31018p-8, - 0x1.4ec374269d4ecp-9, -0x1.e40be960703a4p-11, 0x1.4fb029f35a144p-12, - -0x1.be45fd71a60eap-14, 0x1.161235cd2a3e7p-15, -0x1.264890eb1b5ebp-17, - 0x1.7f90154bde15dp-20}, - {0x1.19a22c064d4eap-2, -0x1.f645498cae217p-4, 0x1.a0565950e3f08p-5, - -0x1.446605c21c178p-6, 0x1.df1231d75622fp-8, -0x1.515167553de25p-9, - 0x1.c72c1b4a2a57fp-11, -0x1.276ae9394ecf1p-12, 0x1.71d2696d6c8c3p-14, - -0x1.bd4152984ce1dp-16, 0x1.f5afd2b450df7p-18, -0x1.dafdaddc7f943p-20, - 0x1.1020f4741f79ep-22}, - {0x1.c57f0542a7637p-3, -0x1.4e5535c17afc8p-4, 0x1.d312725242824p-6, - -0x1.3727cbc12a4bbp-7, 0x1.8d6730fc45b6bp-9, -0x1.e8855055c9b53p-11, - 0x1.21f73b70cc792p-12, -0x1.4d4fe06f13831p-14, 0x1.73867a82f7484p-16, - -0x1.8fab204d1d75ep-18, 0x1.91d9ba10367f4p-20, -0x1.5077ce4b334ddp-22, - 0x1.501716d098f14p-25}, - {0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b135p-5, 0x1.043fe1a989f11p-6, - -0x1.259061b98cf96p-8, 0x1.409cc2b1c4fc2p-10, -0x1.53dec152f6abfp-12, - 0x1.5e72cb4cc919fp-14, -0x1.6018b68100642p-16, 0x1.58d859380fb24p-18, - -0x1.471723286dad5p-20, 0x1.21c1a0f7a6593p-22, -0x1.a872678d91154p-25, - 0x1.6eb74e2e99662p-28}, - {0x1.29a8a4e95063ep-3, -0x1.29a8a316d3318p-5, 0x1.21876b3fe4f84p-7, - -0x1.1276f2d8ee36cp-9, 0x1.fbff52181a454p-12, -0x1.cb9ce9bde195ep-14, - 0x1.9710786fa90c5p-16, -0x1.6145ad5b471dcp-18, 0x1.2c52fac57009cp-20, - -0x1.f02a8711f07cfp-23, 0x1.7eb574960398cp-25, -0x1.e58ce325343aap-28, - 0x1.68510d1c32842p-31}, - {0x1.e583024e2bc8p-4, -0x1.8fb458acb5b0fp-6, 0x1.42b9dffac2531p-8, - -0x1.ff9fe9a553dddp-11, 0x1.8e7e86883ba0bp-13, -0x1.313af0bb12375p-15, - 0x1.cc29ccb17372ep-18, -0x1.55895fbb1ae42p-20, 0x1.f2bd2d6c7fd07p-23, - -0x1.62ec031844613p-25, 0x1.d7d69ce7c1847p-28, -0x1.0106b95e4db03p-30, - 0x1.45aabbe505f6ap-34}, - {0x1.8d9cbafa30408p-4, -0x1.0dd14614ed20fp-6, 0x1.6943976ea9dcap-9, - -0x1.dd6f05f4d7ce8p-12, 0x1.37891334aa621p-14, 
-0x1.91a8207766e1ep-17, - 0x1.ffcb0c613d75cp-20, -0x1.425116a6c88dfp-22, 0x1.90cb7c902d428p-25, - -0x1.e70fc740c3b6dp-28, 0x1.14a09ae5851ep-30, -0x1.00f9e03eae993p-33, - 0x1.14989aac741c2p-37}, - {0x1.46dc6bf900f68p-4, -0x1.6e4b45246f8dp-7, 0x1.96a3de47cfdb5p-10, - -0x1.bf5070eb6823bp-13, 0x1.e7af6e4aa8ef8p-16, -0x1.078bf26142831p-18, - 0x1.1a6e547aa40bep-21, -0x1.2c1c68f62f614p-24, 0x1.3bb8b473dd9e7p-27, - -0x1.45576cacb45a1p-30, 0x1.39ab71899b44ep-33, -0x1.ee307d46e2866p-37, - 0x1.c21ba1b404f5ap-41}, - {0x1.0d9a17e032288p-4, -0x1.f3e942ff4e097p-8, 0x1.cc77f09db5af8p-11, - -0x1.a56e8bffaab5cp-14, 0x1.7f49e36974e03p-17, -0x1.5a73fc0025d2fp-20, - 0x1.3742ae06a8be6p-23, -0x1.15ecf5317789bp-26, 0x1.ec74dd2b109fp-30, - -0x1.ac28325f88dc1p-33, 0x1.5ca9e8d7841b2p-36, -0x1.cfef04667185fp-40, - 0x1.6487c50052867p-44}, - {0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cb33p-8, 0x1.0645980ec8568p-11, - -0x1.8f86f88695a8cp-15, 0x1.2ef80cb1dca7cp-18, -0x1.c97ff7c599a6dp-22, - 0x1.57f0ac907d436p-25, -0x1.016be8d812c69p-28, 0x1.7ef6d33c73b75p-32, - -0x1.17f9784eda0d4p-35, 0x1.7fd8662b486f1p-39, -0x1.ae21758156d89p-43, - 0x1.165732f1ae138p-47}, - {0x1.71eafbd9f5877p-5, -0x1.d83714d904525p-9, 0x1.2c74dbaccea28p-12, - -0x1.7d27f3cdea565p-16, 0x1.e20b13581fcf8p-20, -0x1.2fe336f089679p-23, - 0x1.7dfce36129db3p-27, -0x1.dea026ee03f14p-31, 0x1.2a6019f7c64b1p-34, - -0x1.6e0eeb9f98eeap-38, 0x1.a58b4ed07d741p-42, -0x1.8d12c77071e4cp-46, - 0x1.b0241c6d5b761p-51}, - {0x1.33714a024097ep-5, -0x1.467f441a50cbdp-9, 0x1.59fa2994d0e65p-13, - -0x1.6dd369d9306cap-17, 0x1.81fb2b2af9413p-21, -0x1.96604d3c1bb6ep-25, - 0x1.aaef2da14243p-29, -0x1.bf7f1b935d3ebp-33, 0x1.d3261ebcd2061p-37, - -0x1.e04c803bbd875p-41, 0x1.cff98a43bacdep-45, -0x1.6ef39a63cf675p-49, - 0x1.4f8abb4398a0dp-54}, - {0x1.fff97acd75487p-6, -0x1.c502e8e46ec0cp-10, 0x1.903b0650672eap-14, - -0x1.6110aa5fb096fp-18, 0x1.36fd4c3e4040cp-22, -0x1.118489fe28728p-26, - 0x1.e06601208ac47p-31, -0x1.a52b90c21650ap-35, 0x1.6ffc42c05429bp-39, - -0x1.3ce3322a6972ep-43, 0x1.009d8ef37ff8cp-47, -0x1.5498d2cc51c99p-52, - 0x1.058cd4ea9bf04p-57}, - {0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf97dp-10, 0x1.d0ddfb8593f4p-15, - -0x1.5673f4aa86542p-19, 0x1.f8048954325f6p-24, -0x1.72839959ab3e9p-28, - 0x1.101597113be2ap-32, -0x1.8f1cf0ff4adeep-37, 0x1.23dca407fd66p-41, - -0x1.a4f387e57a6a5p-46, 0x1.1dafd753f65e9p-50, -0x1.3e15343c973d6p-55, - 0x1.9a2af47d77e44p-61}, - {0x1.64839d636f92bp-6, -0x1.b7adf7536232dp-11, 0x1.0eec0b6357148p-15, - -0x1.4da09b7f2c52bp-20, 0x1.9a8b146de838ep-25, -0x1.f8d1f145e7b6fp-30, - 0x1.3624435b3ba11p-34, -0x1.7cba19b4af977p-39, 0x1.d2282481ba91ep-44, - -0x1.198c1e91f9564p-48, 0x1.4046224f8ccp-53, -0x1.2b1dc676c096fp-58, - 0x1.43d3358c64dafp-64} -} }; diff --git a/contrib/arm-optimized-routines/pl/math/erfcf.h b/contrib/arm-optimized-routines/pl/math/erfcf.h deleted file mode 100644 index 8f1e5f4226e3..000000000000 --- a/contrib/arm-optimized-routines/pl/math/erfcf.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Shared functions for scalar and vector single-precision erfc(x) functions. - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_ERFCF_H -#define PL_MATH_ERFCF_H - -#include "math_config.h" - -#define FMA fma -#include "estrin_wrap.h" - -/* Accurate exponential from optimized-routines. 
*/
-double
-__exp_dd (double x, double xtail);
-
-static inline double
-eval_poly (double z, const double *coeff)
-{
-  double z2 = z * z;
-  double z4 = z2 * z2;
-  double z8 = z4 * z4;
-#define C(i) coeff[i]
-  return ESTRIN_15 (z, z2, z4, z8, C);
-#undef C
-}
-
-static inline double
-eval_exp_mx2 (double x)
-{
-  return __exp_dd (-(x * x), 0.0);
-}
-
-#undef FMA
-#endif // PL_MATH_ERFCF_H
diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_1u7.c b/contrib/arm-optimized-routines/pl/math/erfcf_1u7.c
new file mode 100644
index 000000000000..c8ce95cca058
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/erfcf_1u7.c
@@ -0,0 +1,103 @@
+/*
+ * Single-precision erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Shift 0x1p17f
+#define OneThird 0x1.555556p-2f
+#define TwoThird 0x1.555556p-1f
+
+#define TwoOverFifteen 0x1.111112p-3f
+#define TwoOverFive 0x1.99999ap-2f
+#define Tenth 0x1.99999ap-4f
+
+#define SignMask 0x7fffffff
+
+/* Fast erfcf approximation based on series expansion near x rounded to
+   nearest multiple of 1/64.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+   erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+   poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+                + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+
+   Values of erfc(r) and scale are read from lookup tables. Stored values
+   are scaled to avoid hitting the subnormal range.
+
+   Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+   Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
+   erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
+                       want 0x1.f51216p-120. */
+float
+erfcf (float x)
+{
+  /* Get top words and sign. */
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & SignMask;
+  uint32_t sign = ix & ~SignMask;
+
+  /* |x| < 0x1.0p-26 => accurate to 0.5 ULP (top12(0x1p-26) = 0x328). */
+  if (unlikely (ia < 0x32800000))
+    return 1.0f - x; /* Small case. */
+
+  /* For |x| < 10.0625, the following approximation holds. */
+  if (likely (ia < 0x41210000))
+    {
+      /* Look up erfc(r) and scale(r) in the tables, e.g. erfc(r) is 1 and
+         scale is 2/sqrt(pi) when x is reduced to r = 0. */
+      float a = asfloat (ia);
+      float z = a + Shift;
+      uint32_t i = asuint (z) - asuint (Shift);
+      float r = z - Shift;
+
+      /* The stored values are scaled up by 2^47; fac below scales the
+         result back down. */
+      float erfcr = __erfcf_data.tab[i].erfc;
+      float scale = __erfcf_data.tab[i].scale;
+
+      /* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */
+      float d = a - r;
+      float d2 = d * d;
+      float r2 = r * r;
+      float p1 = -r;
+      float p2 = fmaf (TwoThird, r2, -OneThird);
+      float p3 = -r * fmaf (OneThird, r2, -0.5f);
+      float p4 = fmaf (fmaf (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth);
+      float y = fmaf (p4, d, p3);
+      y = fmaf (y, d, p2);
+      y = fmaf (y, d, p1);
+      y = fmaf (-fmaf (y, d2, d), scale, erfcr);
+      /* Handle sign and scale back in a single fma. */
+      float off = asfloat (sign >> 1);
+      float fac = asfloat (asuint (0x1p-47f) | sign);
+      y = fmaf (y, fac, off);
+      /* The underflow exception needs to be signaled explicitly when the
+         result gets into the subnormal range. */
+      if (x >= 0x1.2639cp+3f)
+        force_eval_float (opt_barrier_float (0x1p-123f) * 0x1p-123f);
+      return y;
+    }
+
+  /* erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2. */
+  if (unlikely (ia >= 0x7f800000))
+    return asfloat (sign >> 1) + 1.0f / x; /* Special cases. */
+
+  /* Above this threshold erfcf is constant and needs to raise the underflow
+     exception for positive x.
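+     For x just above the 10.0625 bound, erfc(x) ~ 6e-46 (the last table
+     entry stores erfc(10.0625) * 2^47 ~ 0x1.b0dep-104), already below the
+     smallest positive subnormal 0x1p-149 ~ 1.4e-45, so returning 0 with
+     underflow raised is the correctly rounded result.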
*/ + return sign ? 2.0f : __math_uflowf (0); +} + +PL_SIG (S, F, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (erfcf, 1.14) +PL_TEST_SYM_INTERVAL (erfcf, 0, 0x1p-26, 40000) +PL_TEST_INTERVAL (erfcf, 0x1p-26, 10.0625, 40000) +PL_TEST_INTERVAL (erfcf, -0x1p-26, -4.0, 40000) +PL_TEST_INTERVAL (erfcf, 10.0625, inf, 40000) +PL_TEST_INTERVAL (erfcf, -4.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_2u.c b/contrib/arm-optimized-routines/pl/math/erfcf_2u.c deleted file mode 100644 index 5a3f9b00aa5c..000000000000 --- a/contrib/arm-optimized-routines/pl/math/erfcf_2u.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Single-precision erfc(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "erfcf.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define P(i) __erfcf_poly_data.poly[i] - -/* Approximation of erfcf for |x| > 4.0. */ -static inline float -approx_erfcf_hi (float x, uint32_t sign, const double *coeff) -{ - if (sign) - { - return 2.0f; - } - - /* Polynomial contribution. */ - double z = (double) fabs (x); - float p = (float) eval_poly (z, coeff); - /* Gaussian contribution. */ - float e_mx2 = (float) eval_exp_mx2 (z); - - return p * e_mx2; -} - -/* Approximation of erfcf for |x| < 4.0. */ -static inline float -approx_erfcf_lo (float x, uint32_t sign, const double *coeff) -{ - /* Polynomial contribution. */ - double z = (double) fabs (x); - float p = (float) eval_poly (z, coeff); - /* Gaussian contribution. */ - float e_mx2 = (float) eval_exp_mx2 (z); - - if (sign) - return fmaf (-p, e_mx2, 2.0f); - else - return p * e_mx2; -} - -/* Top 12 bits of a float (sign and exponent bits). */ -static inline uint32_t -abstop12 (float x) -{ - return (asuint (x) >> 20) & 0x7ff; -} - -/* Top 12 bits of a float. */ -static inline uint32_t -top12 (float x) -{ - return asuint (x) >> 20; -} - -/* Fast erfcf approximation using polynomial approximation - multiplied by gaussian. - Most of the computation is carried out in double precision, - and is very sensitive to accuracy of polynomial and exp - evaluation. - Worst-case error is 1.968ulps, obtained for x = 2.0412941. - erfcf(0x1.05492p+1) got 0x1.fe10f6p-9 want 0x1.fe10f2p-9 ulp - err 1.46788. */ -float -erfcf (float x) -{ - /* Get top words and sign. */ - uint32_t ix = asuint (x); /* We need to compare at most 32 bits. */ - uint32_t sign = ix >> 31; - uint32_t ia12 = top12 (x) & 0x7ff; - - /* Handle special cases and small values with a single comparison: - abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small) - - Special cases - erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2 - - Errno - EDOM does not have to be set in case of erfcf(nan). - Only ERANGE may be set in case of underflow. - - Small values (|x| accurate to 0.5 ULP (top12(0x1p-26) = 0x328). */ - if (unlikely (abstop12 (x) - 0x328 >= (abstop12 (INFINITY) & 0x7f8) - 0x328)) - { - if (abstop12 (x) >= 0x7f8) - return (float) (sign << 1) + 1.0f / x; /* Special cases. */ - else - return 1.0f - x; /* Small case. */ - } - - /* Normalized numbers divided in 4 intervals - with bounds: 2.0, 4.0, 8.0 and 10.0. 10 was chosen as the upper bound for - the interesting region as it is the smallest value, representable as a - 12-bit integer, for which returning 0 gives <1.5 ULP. 
*/ - if (ia12 < 0x400) - { - return approx_erfcf_lo (x, sign, P (0)); - } - if (ia12 < 0x408) - { - return approx_erfcf_lo (x, sign, P (1)); - } - if (ia12 < 0x410) - { - return approx_erfcf_hi (x, sign, P (2)); - } - if (ia12 < 0x412) - { - return approx_erfcf_hi (x, sign, P (3)); - } - if (sign) - { - return 2.0f; - } - return __math_uflowf (0); -} - -PL_SIG (S, F, 1, erfc, -4.0, 10.0) -PL_TEST_ULP (erfcf, 1.5) -PL_TEST_INTERVAL (erfcf, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (erfcf, 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (erfcf, -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (erfcf, 0x1p-26, 0x1p5, 40000) -PL_TEST_INTERVAL (erfcf, -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (erfcf, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_data.c b/contrib/arm-optimized-routines/pl/math/erfcf_data.c index 2e018c8c6710..a54e11973819 100644 --- a/contrib/arm-optimized-routines/pl/math/erfcf_data.c +++ b/contrib/arm-optimized-routines/pl/math/erfcf_data.c @@ -1,57 +1,664 @@ /* * Data used in single-precision erfc(x) function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double - precision. Generated using the Remez algorithm on each interval separately - (see erfcf.sollya for more detail). */ -const struct erfcf_poly_data __erfcf_poly_data - = {.poly - = {{ -#if ERFCF_POLY_NCOEFFS == 16 - 0x1.ffffffffe7c59p-1, -0x1.20dd74f8cecc5p0, 0x1.fffffc67a0fbdp-1, - -0x1.81270c3ced2d6p-1, 0x1.fffc0c6606e45p-2, -0x1.340a779e8a8e3p-2, - 0x1.54c1663fc5a01p-3, -0x1.5d468c9269dafp-4, 0x1.4afe6b00df9d5p-5, - -0x1.1d22d2720cb91p-6, 0x1.afa399a5761b1p-8, -0x1.113851b5858adp-9, - 0x1.0f992e4d5c6a4p-11, -0x1.86534d558052ap-14, 0x1.63e537bfb7cd5p-17, - -0x1.32712a6275c4dp-21 -#endif +/* Lookup table used in erfcf. + For each possible rounded input r (multiples of 1/64), between + r = 0.0 and r = 10.0625 (645 values): + - the first entry __erfcf_data.tab.erfc contains the values of erfc(r), + - the second entry __erfcf_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore + they are scaled by a large enough value 2^47 (fits in 8 bits). 
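+   For instance, the first entry below stores erfc(0) * 2^47 = 0x1p47 and
+   scale(0) = 2/sqrt(pi) * 2^47 ~ 0x1.20dd76p47; the factor fac = 2^-47
+   applied in the final fma of erfcf scales results back down.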
*/ +const struct erfcf_data __erfcf_data = { + .tab = { { 0x1p47, 0x1.20dd76p47 }, + { 0x1.f6f944p46, 0x1.20cb68p47 }, + { 0x1.edf3aap46, 0x1.209546p47 }, + { 0x1.e4f05p46, 0x1.203b26p47 }, + { 0x1.dbf056p46, 0x1.1fbd28p47 }, + { 0x1.d2f4dcp46, 0x1.1f1b7ap47 }, + { 0x1.c9fefep46, 0x1.1e565cp47 }, + { 0x1.c10fd4p46, 0x1.1d6e14p47 }, + { 0x1.b8287ap46, 0x1.1c62fap47 }, + { 0x1.af4ap46, 0x1.1b3572p47 }, + { 0x1.a6757ep46, 0x1.19e5eap47 }, + { 0x1.9dabfcp46, 0x1.1874dep47 }, + { 0x1.94ee88p46, 0x1.16e2d8p47 }, + { 0x1.8c3e24p46, 0x1.153068p47 }, + { 0x1.839bd6p46, 0x1.135e3p47 }, + { 0x1.7b0894p46, 0x1.116cd8p47 }, + { 0x1.728558p46, 0x1.0f5d16p47 }, + { 0x1.6a1312p46, 0x1.0d2fa6p47 }, + { 0x1.61b2acp46, 0x1.0ae55p47 }, + { 0x1.596508p46, 0x1.087ee4p47 }, + { 0x1.512b06p46, 0x1.05fd3ep47 }, + { 0x1.49057ap46, 0x1.03614p47 }, + { 0x1.40f536p46, 0x1.00abdp47 }, + { 0x1.38fbp46, 0x1.fbbbbep46 }, + { 0x1.311796p46, 0x1.f5f0cep46 }, + { 0x1.294bb4p46, 0x1.eff8c4p46 }, + { 0x1.21980ap46, 0x1.e9d5a8p46 }, + { 0x1.19fd3ep46, 0x1.e38988p46 }, + { 0x1.127bf2p46, 0x1.dd167cp46 }, + { 0x1.0b14bcp46, 0x1.d67ea2p46 }, + { 0x1.03c82ap46, 0x1.cfc41ep46 }, + { 0x1.f92d8cp45, 0x1.c8e91cp46 }, + { 0x1.eb0214p45, 0x1.c1efcap46 }, + { 0x1.dd0edap45, 0x1.bada5ap46 }, + { 0x1.cf54b4p45, 0x1.b3aafcp46 }, + { 0x1.c1d46ap45, 0x1.ac63e8p46 }, + { 0x1.b48eaep45, 0x1.a5074ep46 }, + { 0x1.a78428p45, 0x1.9d9762p46 }, + { 0x1.9ab566p45, 0x1.96165p46 }, + { 0x1.8e22eap45, 0x1.8e8646p46 }, + { 0x1.81cd24p45, 0x1.86e96ap46 }, + { 0x1.75b47p45, 0x1.7f41dcp46 }, + { 0x1.69d91ep45, 0x1.7791b8p46 }, + { 0x1.5e3b66p45, 0x1.6fdb12p46 }, + { 0x1.52db78p45, 0x1.681ff2p46 }, + { 0x1.47b96ep45, 0x1.60625cp46 }, + { 0x1.3cd554p45, 0x1.58a446p46 }, + { 0x1.322f26p45, 0x1.50e79ep46 }, + { 0x1.27c6d2p45, 0x1.492e42p46 }, + { 0x1.1d9c34p45, 0x1.417a0cp46 }, + { 0x1.13af1ep45, 0x1.39ccc2p46 }, + { 0x1.09ff5p45, 0x1.32281ep46 }, + { 0x1.008c8p45, 0x1.2a8dcep46 }, + { 0x1.eeaca8p44, 0x1.22ff72p46 }, + { 0x1.dcb8cap44, 0x1.1b7e98p46 }, + { 0x1.cb3c86p44, 0x1.140cc4p46 }, + { 0x1.ba36dap44, 0x1.0cab62p46 }, + { 0x1.a9a6bap44, 0x1.055bd6p46 }, + { 0x1.998afap44, 0x1.fc3ee6p45 }, + { 0x1.89e25ep44, 0x1.edeeeep45 }, + { 0x1.7aab98p44, 0x1.dfca26p45 }, + { 0x1.6be542p44, 0x1.d1d2dp45 }, + { 0x1.5d8decp44, 0x1.c40b08p45 }, + { 0x1.4fa40ep44, 0x1.b674c8p45 }, + { 0x1.422616p44, 0x1.a911fp45 }, + { 0x1.351262p44, 0x1.9be438p45 }, + { 0x1.28674p44, 0x1.8eed36p45 }, + { 0x1.1c22f8p44, 0x1.822e66p45 }, + { 0x1.1043c2p44, 0x1.75a91ap45 }, + { 0x1.04c7cap44, 0x1.695e8cp45 }, + { 0x1.f35a72p43, 0x1.5d4fd4p45 }, + { 0x1.dde456p43, 0x1.517de6p45 }, + { 0x1.c9296cp43, 0x1.45e99cp45 }, + { 0x1.b525d6p43, 0x1.3a93b2p45 }, + { 0x1.a1d5a6p43, 0x1.2f7cc4p45 }, + { 0x1.8f34eap43, 0x1.24a554p45 }, + { 0x1.7d3fa6p43, 0x1.1a0dc6p45 }, + { 0x1.6bf1dcp43, 0x1.0fb662p45 }, + { 0x1.5b4784p43, 0x1.059f5ap45 }, + { 0x1.4b3c98p43, 0x1.f79184p44 }, + { 0x1.3bcd14p43, 0x1.e4653p44 }, + { 0x1.2cf4eep43, 0x1.d1b982p44 }, + { 0x1.1eb024p43, 0x1.bf8e1cp44 }, + { 0x1.10fab8p43, 0x1.ade26cp44 }, + { 0x1.03d0acp43, 0x1.9cb5bep44 }, + { 0x1.ee5c18p42, 0x1.8c0732p44 }, + { 0x1.d61dd6p42, 0x1.7bd5c8p44 }, + { 0x1.bedec8p42, 0x1.6c2056p44 }, + { 0x1.a8973cp42, 0x1.5ce596p44 }, + { 0x1.933f9p42, 0x1.4e241ep44 }, + { 0x1.7ed03ap42, 0x1.3fda6cp44 }, + { 0x1.6b41ccp42, 0x1.3206dcp44 }, + { 0x1.588cf2p42, 0x1.24a7b8p44 }, + { 0x1.46aa72p42, 0x1.17bb2cp44 }, + { 0x1.359332p42, 0x1.0b3f52p44 }, + { 0x1.254038p42, 0x1.fe646p43 }, + { 0x1.15aaa8p42, 0x1.e72372p43 }, + { 0x1.06cbcap42, 0x1.d0b7ap43 }, 
+ { 0x1.f13a04p41, 0x1.bb1c98p43 }, + { 0x1.d62fbep41, 0x1.a64de6p43 }, + { 0x1.bc6c1ep41, 0x1.92470ap43 }, + { 0x1.a3e2ccp41, 0x1.7f036cp43 }, + { 0x1.8c87b8p41, 0x1.6c7e64p43 }, + { 0x1.764f2p41, 0x1.5ab342p43 }, + { 0x1.612d8ap41, 0x1.499d48p43 }, + { 0x1.4d17cap41, 0x1.3937b2p43 }, + { 0x1.3a03p41, 0x1.297dbap43 }, + { 0x1.27e498p41, 0x1.1a6a96p43 }, + { 0x1.16b24cp41, 0x1.0bf97ep43 }, + { 0x1.066222p41, 0x1.fc4b5ep42 }, + { 0x1.edd4d2p40, 0x1.e1d4dp42 }, + { 0x1.d08382p40, 0x1.c885ep42 }, + { 0x1.b4be2p40, 0x1.b0553p42 }, + { 0x1.9a7316p40, 0x1.99397ap42 }, + { 0x1.81915cp40, 0x1.83298ep42 }, + { 0x1.6a088p40, 0x1.6e1c58p42 }, + { 0x1.53c89ep40, 0x1.5a08e8p42 }, + { 0x1.3ec25ep40, 0x1.46e66cp42 }, + { 0x1.2ae6fap40, 0x1.34ac36p42 }, + { 0x1.18282ep40, 0x1.2351c2p42 }, + { 0x1.067844p40, 0x1.12ceb4p42 }, + { 0x1.eb940ep39, 0x1.031ad6p42 }, + { 0x1.cc2186p39, 0x1.e85c44p41 }, + { 0x1.ae808cp39, 0x1.cc018p41 }, + { 0x1.9299bp39, 0x1.b1160ap41 }, + { 0x1.785674p39, 0x1.978ae8p41 }, + { 0x1.5fa14ap39, 0x1.7f5188p41 }, + { 0x1.486586p39, 0x1.685bb6p41 }, + { 0x1.328f5ep39, 0x1.529b9ep41 }, + { 0x1.1e0be6p39, 0x1.3e03d8p41 }, + { 0x1.0ac8fcp39, 0x1.2a875cp41 }, + { 0x1.f16aaep38, 0x1.181984p41 }, + { 0x1.cf80d4p38, 0x1.06ae14p41 }, + { 0x1.afb4e2p38, 0x1.ec7262p40 }, + { 0x1.91e8bep38, 0x1.cd5ecap40 }, + { 0x1.75ffb4p38, 0x1.b00b38p40 }, + { 0x1.5bde72p38, 0x1.94624ep40 }, + { 0x1.436af4p38, 0x1.7a4f6ap40 }, + { 0x1.2c8c7ap38, 0x1.61beaep40 }, + { 0x1.172b7ap38, 0x1.4a9cf6p40 }, + { 0x1.033198p38, 0x1.34d7dcp40 }, + { 0x1.e11332p37, 0x1.205dacp40 }, + { 0x1.be3ebp37, 0x1.0d1d6ap40 }, + { 0x1.9dbf72p37, 0x1.f60d8ap39 }, + { 0x1.7f714p37, 0x1.d4143ap39 }, + { 0x1.6331cap37, 0x1.b430ecp39 }, + { 0x1.48e09cp37, 0x1.9646f4p39 }, + { 0x1.305ef8p37, 0x1.7a3adep39 }, + { 0x1.198fd6p37, 0x1.5ff276p39 }, + { 0x1.0457c6p37, 0x1.4754acp39 }, + { 0x1.e139bcp36, 0x1.30499cp39 }, + { 0x1.bc8d52p36, 0x1.1aba78p39 }, + { 0x1.9a7c3p36, 0x1.06918cp39 }, + { 0x1.7adadep36, 0x1.e77448p38 }, + { 0x1.5d806ap36, 0x1.c4412cp38 }, + { 0x1.424642p36, 0x1.a36454p38 }, + { 0x1.290826p36, 0x1.84ba3p38 }, + { 0x1.11a3f8p36, 0x1.6821p38 }, + { 0x1.f7f358p35, 0x1.4d78bcp38 }, + { 0x1.cfd652p35, 0x1.34a306p38 }, + { 0x1.aab85ap35, 0x1.1d8318p38 }, + { 0x1.88647p35, 0x1.07fdb4p38 }, + { 0x1.68a8e4p35, 0x1.e7f232p37 }, + { 0x1.4b5726p35, 0x1.c2b9dp37 }, + { 0x1.30439cp35, 0x1.a02436p37 }, + { 0x1.174578p35, 0x1.8005fp37 }, + { 0x1.003692p35, 0x1.6235fcp37 }, + { 0x1.d5e678p34, 0x1.468daep37 }, + { 0x1.aeb442p34, 0x1.2ce898p37 }, + { 0x1.8a9848p34, 0x1.15246ep37 }, + { 0x1.695876p34, 0x1.fe41cep36 }, + { 0x1.4abea2p34, 0x1.d57f52p36 }, + { 0x1.2e984ep34, 0x1.afc85ep36 }, + { 0x1.14b676p34, 0x1.8ce75ep36 }, + { 0x1.f9daap33, 0x1.6caa0ep36 }, + { 0x1.ce283ap33, 0x1.4ee142p36 }, + { 0x1.a609f8p33, 0x1.3360ccp36 }, + { 0x1.81396ap33, 0x1.19ff46p36 }, + { 0x1.5f7524p33, 0x1.0295fp36 }, + { 0x1.40806ep33, 0x1.da011p35 }, + { 0x1.2422eep33, 0x1.b23a5ap35 }, + { 0x1.0a286p33, 0x1.8d986ap35 }, + { 0x1.e4c0bp32, 0x1.6be022p35 }, + { 0x1.b93bf4p32, 0x1.4cda54p35 }, + { 0x1.916f7cp32, 0x1.30539p35 }, + { 0x1.6d0e7p32, 0x1.161be4p35 }, + { 0x1.4bd1cp32, 0x1.fc0d56p34 }, + { 0x1.2d77bep32, 0x1.cfd4a6p34 }, + { 0x1.11c3bep32, 0x1.a74068p34 }, + { 0x1.f0fb86p31, 0x1.8208bcp34 }, + { 0x1.c2e43ep31, 0x1.5feadap34 }, + { 0x1.98e254p31, 0x1.40a8c2p34 }, + { 0x1.729df6p31, 0x1.2408eap34 }, + { 0x1.4fc63cp31, 0x1.09d5f8p34 }, + { 0x1.3010aap31, 0x1.e3bcf4p33 }, + { 0x1.1338b8p31, 0x1.b7e946p33 }, + { 0x1.f1fecp30, 0x1.8fdc1cp33 }, + { 0x1.c2556ap30, 
0x1.6b4702p33 }, + { 0x1.970b06p30, 0x1.49e178p33 }, + { 0x1.6fbddep30, 0x1.2b6876p33 }, + { 0x1.4c144ep30, 0x1.0f9e1cp33 }, + { 0x1.2bbc1ep30, 0x1.ec929ap32 }, + { 0x1.0e69f2p30, 0x1.be6abcp32 }, + { 0x1.e7b188p29, 0x1.94637ep32 }, + { 0x1.b792bcp29, 0x1.6e2368p32 }, + { 0x1.8c03d2p29, 0x1.4b581cp32 }, + { 0x1.649b02p29, 0x1.2bb5ccp32 }, + { 0x1.40f794p29, 0x1.0ef6c4p32 }, + { 0x1.20c13p29, 0x1.e9b5e8p31 }, + { 0x1.03a72ap29, 0x1.ba4f04p31 }, + { 0x1.d2bfc6p28, 0x1.8f4cccp31 }, + { 0x1.a35068p28, 0x1.684c22p31 }, + { 0x1.7885cep28, 0x1.44f21ep31 }, + { 0x1.51f06ap28, 0x1.24eb72p31 }, + { 0x1.2f2aaap28, 0x1.07ebd2p31 }, + { 0x1.0fd816p28, 0x1.db5adp30 }, + { 0x1.e7493p27, 0x1.abe09ep30 }, + { 0x1.b48774p27, 0x1.80f43ap30 }, + { 0x1.86e006p27, 0x1.5a2aep30 }, + { 0x1.5dd4bp27, 0x1.37231p30 }, + { 0x1.38f2e8p27, 0x1.1783cep30 }, + { 0x1.17d2c6p27, 0x1.f5f7d8p29 }, + { 0x1.f42c18p26, 0x1.c282cep29 }, + { 0x1.beceb2p26, 0x1.94219cp29 }, + { 0x1.8ef2aap26, 0x1.6a5972p29 }, + { 0x1.640bf6p26, 0x1.44ba86p29 }, + { 0x1.3d9be6p26, 0x1.22df2ap29 }, + { 0x1.1b2fe4p26, 0x1.046aeap29 }, + { 0x1.f8c0c2p25, 0x1.d21398p28 }, + { 0x1.c19fa8p25, 0x1.a0df1p28 }, + { 0x1.90538cp25, 0x1.74adc8p28 }, + { 0x1.6443fep25, 0x1.4d0232p28 }, + { 0x1.3ce784p25, 0x1.296a7p28 }, + { 0x1.19c232p25, 0x1.097f62p28 }, + { 0x1.f4c8c4p24, 0x1.d9c736p27 }, + { 0x1.bcd30ep24, 0x1.a6852cp27 }, + { 0x1.8aee4cp24, 0x1.789fb8p27 }, + { 0x1.5e77b6p24, 0x1.4f8c96p27 }, + { 0x1.36dcf2p24, 0x1.2acee2p27 }, + { 0x1.139a7cp24, 0x1.09f5dp27 }, + { 0x1.e8747p23, 0x1.d9371ep26 }, + { 0x1.b0a44ap23, 0x1.a4c89ep26 }, + { 0x1.7f064ap23, 0x1.75fa8ep26 }, + { 0x1.52efep23, 0x1.4c37cp26 }, + { 0x1.2bc82ap23, 0x1.26f9ep26 }, + { 0x1.09064p23, 0x1.05c804p26 }, + { 0x1.d45f16p22, 0x1.d06ad6p25 }, + { 0x1.9dacb2p22, 0x1.9bc0ap25 }, + { 0x1.6d3126p22, 0x1.6ce1aap25 }, + { 0x1.423d14p22, 0x1.43302cp25 }, + { 0x1.1c33cep22, 0x1.1e1e86p25 }, + { 0x1.f512dep21, 0x1.fa5b5p24 }, + { 0x1.b9823cp21, 0x1.bfd756p24 }, + { 0x1.84d6fep21, 0x1.8be4f8p24 }, + { 0x1.564a92p21, 0x1.5dcd66p24 }, + { 0x1.2d2c0ap21, 0x1.34ecf8p24 }, + { 0x1.08ddd2p21, 0x1.10b148p24 }, + { 0x1.d1a75p20, 0x1.e12eep23 }, + { 0x1.99218cp20, 0x1.a854eap23 }, + { 0x1.674c6ap20, 0x1.7603bap23 }, + { 0x1.3b62b6p20, 0x1.4980ccp23 }, + { 0x1.14b54p20, 0x1.2225b2p23 }, + { 0x1.e55102p19, 0x1.febc1p22 }, + { 0x1.a964eep19, 0x1.c14b22p22 }, + { 0x1.74b17ap19, 0x1.8b0cfcp22 }, + { 0x1.465daap19, 0x1.5b2fe6p22 }, + { 0x1.1da944p19, 0x1.30f93cp22 }, + { 0x1.f3d41p18, 0x1.0bc30cp22 }, + { 0x1.b512a2p18, 0x1.d5f3a8p21 }, + { 0x1.7e03b2p18, 0x1.9c3518p21 }, + { 0x1.4dbb98p18, 0x1.6961b8p21 }, + { 0x1.236a1ap18, 0x1.3cab14p21 }, + { 0x1.fcae94p17, 0x1.155a0ap21 }, + { 0x1.bbc1ap17, 0x1.e5989p20 }, + { 0x1.82eedcp17, 0x1.a8e406p20 }, + { 0x1.5139a6p17, 0x1.7397c6p20 }, + { 0x1.25c354p17, 0x1.44d26ep20 }, + { 0x1.ff8f84p16, 0x1.1bcca4p20 }, + { 0x1.bd3474p16, 0x1.efac52p19 }, + { 0x1.834586p16, 0x1.b0a68ap19 }, + { 0x1.50b75cp16, 0x1.7974e8p19 }, + { 0x1.249ef2p16, 0x1.4924a8p19 }, + { 0x1.fc5b88p15, 0x1.1edfa4p19 }, + { 0x1.b95ceep15, 0x1.f3d218p18 }, + { 0x1.7f03bap15, 0x1.b334fap18 }, + { 0x1.4c389cp15, 0x1.7ac2d8p18 }, + { 0x1.2006aep15, 0x1.4979acp18 }, + { 0x1.f32eap14, 0x1.1e767cp18 }, + { 0x1.b05cfep14, 0x1.f1e352p17 }, + { 0x1.764f46p14, 0x1.b0778cp17 }, + { 0x1.43e56cp14, 0x1.77756ep17 }, + { 0x1.18238p14, 0x1.45ce66p17 }, + { 0x1.e45a98p13, 0x1.1a95p17 }, + { 0x1.a284ccp13, 0x1.e9f2p16 }, + { 0x1.697596p13, 0x1.a887bep16 }, + { 0x1.3807acp13, 0x1.6fab64p16 }, + { 0x1.0d3b36p13, 0x1.3e44e4p16 }, + 
{ 0x1.d0624p12, 0x1.135f28p16 }, + { 0x1.904e0cp12, 0x1.dc479ep15 }, + { 0x1.58e72ap12, 0x1.9baed4p15 }, + { 0x1.2906ccp12, 0x1.63ac6cp15 }, + { 0x1.ff58dap11, 0x1.33225ap15 }, + { 0x1.b7f1f4p11, 0x1.0916fp15 }, + { 0x1.7a551p11, 0x1.c960cp14 }, + { 0x1.453142p11, 0x1.8a6174p14 }, + { 0x1.1761f8p11, 0x1.53e4f8p14 }, + { 0x1.dfd296p10, 0x1.24caf2p14 }, + { 0x1.9bd5fp10, 0x1.f830cp13 }, + { 0x1.61501p10, 0x1.b1e5acp13 }, + { 0x1.2ef6p10, 0x1.7538c6p13 }, + { 0x1.03a918p10, 0x1.40dfd8p13 }, + { 0x1.bce26ap9, 0x1.13bc08p13 }, + { 0x1.7cef42p9, 0x1.d9a88p12 }, + { 0x1.46056p9, 0x1.96a0b4p12 }, + { 0x1.16e3cap9, 0x1.5ce9acp12 }, + { 0x1.dcea68p8, 0x1.2b3e54p12 }, + { 0x1.97945ap8, 0x1.0085p12 }, + { 0x1.5c2828p8, 0x1.b7937ep11 }, + { 0x1.29415p8, 0x1.7872dap11 }, + { 0x1.fb58fap7, 0x1.423acp11 }, + { 0x1.b0c1a8p7, 0x1.13af5p11 }, + { 0x1.70f474p7, 0x1.d77f0cp10 }, + { 0x1.3a68a8p7, 0x1.92ff34p10 }, + { 0x1.0bcc6p7, 0x1.5847eep10 }, + { 0x1.c7fa0cp6, 0x1.25f9eep10 }, + { 0x1.8401b6p6, 0x1.f5cc78p9 }, + { 0x1.4a029ap6, 0x1.ac0f6p9 }, + { 0x1.188c46p6, 0x1.6cfa9cp9 }, + { 0x1.dcc4fap5, 0x1.370ab8p9 }, + { 0x1.94ec06p5, 0x1.08f24p9 }, + { 0x1.57bc96p5, 0x1.c324c2p8 }, + { 0x1.23a81ap5, 0x1.7fe904p8 }, + { 0x1.eeb278p4, 0x1.46897ep8 }, + { 0x1.a35794p4, 0x1.159a38p8 }, + { 0x1.634b8p4, 0x1.d7c594p7 }, + { 0x1.2ce2a4p4, 0x1.90ae4ep7 }, + { 0x1.fd5f08p3, 0x1.5422fp7 }, + { 0x1.aef3cep3, 0x1.20998p7 }, + { 0x1.6c6e62p3, 0x1.e98102p6 }, + { 0x1.3407b6p3, 0x1.9eee06p6 }, + { 0x1.043bap3, 0x1.5f8b88p6 }, + { 0x1.b77e5cp2, 0x1.29b294p6 }, + { 0x1.72f0c4p2, 0x1.f7f338p5 }, + { 0x1.38ee18p2, 0x1.aa5772p5 }, + { 0x1.07dd68p2, 0x1.68823ep5 }, + { 0x1.bcc58ep1, 0x1.30b14ep5 }, + { 0x1.76aca4p1, 0x1.01647cp5 }, + { 0x1.3b7912p1, 0x1.b2a87ep4 }, + { 0x1.097f82p1, 0x1.6ed2f2p4 }, + { 0x1.beaa3ep0, 0x1.356cd6p4 }, + { 0x1.778be2p0, 0x1.04e15ep4 }, + { 0x1.3b9984p0, 0x1.b7b04p3 }, + { 0x1.09182cp0, 0x1.725862p3 }, + { 0x1.bd20fcp-1, 0x1.37c92cp3 }, + { 0x1.75892p-1, 0x1.065b96p3 }, + { 0x1.394e7ap-1, 0x1.b950d4p2 }, + { 0x1.06a996p-1, 0x1.72fd94p2 }, + { 0x1.b8328ep-2, 0x1.37b83cp2 }, + { 0x1.70aff4p-2, 0x1.05ca5p2 }, + { 0x1.34a53cp-2, 0x1.b7807ep1 }, + { 0x1.0241dep-2, 0x1.70bebp1 }, + { 0x1.affb9p-3, 0x1.353a6cp1 }, + { 0x1.691c7cp-3, 0x1.0330fp1 }, + { 0x1.2db8cap-3, 0x1.b24a16p0 }, + { 0x1.f7f4f8p-4, 0x1.6ba91ap0 }, + { 0x1.a4ab64p-4, 0x1.305e98p0 }, + { 0x1.5efa4ep-4, 0x1.fd3de2p-1 }, + { 0x1.24b0d8p-4, 0x1.a9cc94p-1 }, + { 0x1.e7eeap-5, 0x1.63daf8p-1 }, + { 0x1.96826ep-5, 0x1.294176p-1 }, + { 0x1.5282d2p-5, 0x1.f05e82p-2 }, + { 0x1.19c05p-5, 0x1.9e39dcp-2 }, + { 0x1.d4ca9cp-6, 0x1.5982p-2 }, + { 0x1.85cfacp-6, 0x1.200c8ap-2 }, + { 0x1.43fb32p-6, 0x1.e00e92p-3 }, + { 0x1.0d2382p-6, 0x1.8fd4ep-3 }, + { 0x1.bef1b2p-7, 0x1.4cd9cp-3 }, + { 0x1.72ede4p-7, 0x1.14f48ap-3 }, + { 0x1.33b1cap-7, 0x1.ccaaeap-4 }, + { 0x1.fe3bdp-8, 0x1.7eef14p-4 }, + { 0x1.a6d7d2p-8, 0x1.3e2964p-4 }, + { 0x1.5e4062p-8, 0x1.083768p-4 }, + { 0x1.21fb7ap-8, 0x1.b69f1p-5 }, + { 0x1.dfefbep-9, 0x1.6be574p-5 }, + { 0x1.8cf816p-9, 0x1.2dc11ap-5 }, + { 0x1.482fa8p-9, 0x1.f4343cp-6 }, + { 0x1.0f30c4p-9, 0x1.9e614ep-6 }, + { 0x1.bff86ep-10, 0x1.571d34p-6 }, + { 0x1.71d0b6p-10, 0x1.1bf742p-6 }, + { 0x1.3125f6p-10, 0x1.d5cc6cp-7 }, + { 0x1.f755eap-11, 0x1.846e9ep-7 }, + { 0x1.9eebaap-11, 0x1.410048p-7 }, + { 0x1.55df18p-11, 0x1.09258p-7 }, + { 0x1.198c18p-11, 0x1.b5ceb6p-8 }, + { 0x1.cf82ep-12, 0x1.69468p-8 }, + { 0x1.7d5af6p-12, 0x1.29f9e8p-8 }, + { 0x1.399c28p-12, 0x1.eb4b9ep-9 }, + { 0x1.01c65ap-12, 0x1.94d1dep-9 }, + { 0x1.a78e82p-13, 
0x1.4d6706p-9 }, + { 0x1.5bcf92p-13, 0x1.127346p-9 }, + { 0x1.1d791cp-13, 0x1.c39fap-10 }, + { 0x1.d463dcp-14, 0x1.73679cp-10 }, + { 0x1.8011fcp-14, 0x1.314916p-10 }, + { 0x1.3ac71cp-14, 0x1.f5a11ap-11 }, + { 0x1.01dcc2p-14, 0x1.9beca8p-11 }, + { 0x1.a6459cp-15, 0x1.52189ap-11 }, + { 0x1.59962ap-15, 0x1.155d48p-11 }, + { 0x1.1ab0e4p-15, 0x1.c6dc8ap-12 }, + { 0x1.ce42dep-16, 0x1.74ca88p-12 }, + { 0x1.79c43p-16, 0x1.31612ap-12 }, + { 0x1.349128p-16, 0x1.f4125ap-13 }, + { 0x1.f7d80ep-17, 0x1.993e82p-13 }, + { 0x1.9b270cp-17, 0x1.4ec006p-13 }, + { 0x1.4f59fap-17, 0x1.11aebp-13 }, + { 0x1.1164acp-17, 0x1.bf4ab2p-14 }, + { 0x1.bd8c96p-18, 0x1.6d561ep-14 }, + { 0x1.6ae172p-18, 0x1.2a406ep-14 }, + { 0x1.276874p-18, 0x1.e6bba6p-15 }, + { 0x1.e0bad2p-19, 0x1.8cf814p-15 }, + { 0x1.86f788p-19, 0x1.4399f8p-15 }, + { 0x1.3dcfaep-19, 0x1.07aa3p-15 }, + { 0x1.023828p-19, 0x1.ad7302p-16 }, + { 0x1.a3666ep-20, 0x1.5d90f4p-16 }, + { 0x1.546e38p-20, 0x1.1c674ep-16 }, + { 0x1.143264p-20, 0x1.ce8ccp-17 }, + { 0x1.bff316p-21, 0x1.77f562p-17 }, + { 0x1.6b13ecp-21, 0x1.316da8p-17 }, + { 0x1.2624f4p-21, 0x1.f0046p-18 }, + { 0x1.dc5de4p-22, 0x1.92920ap-18 }, + { 0x1.818d3ap-22, 0x1.4691b2p-18 }, + { 0x1.37e62p-22, 0x1.08c96ap-18 }, + { 0x1.f8637ep-23, 0x1.ad2d0ap-19 }, + { 0x1.97a3dcp-23, 0x1.5ba462p-19 }, + { 0x1.494a4p-23, 0x1.1975ep-19 }, + { 0x1.09dee4p-23, 0x1.c78892p-20 }, + { 0x1.ad1fap-24, 0x1.7073c4p-20 }, + { 0x1.5a245ep-24, 0x1.29df48p-20 }, + { 0x1.171278p-24, 0x1.e163bep-21 }, + { 0x1.c1c74cp-25, 0x1.84cbbp-21 }, + { 0x1.6a46f4p-25, 0x1.39dbcep-21 }, + { 0x1.23a858p-25, 0x1.fa7b92p-22 }, + { 0x1.d56196p-26, 0x1.9876ap-22 }, + { 0x1.7984b6p-26, 0x1.4940bcp-22 }, + { 0x1.2f7cc4p-26, 0x1.094608p-22 }, + { 0x1.e7b62cp-27, 0x1.ab3e8cp-23 }, + { 0x1.87b15ep-27, 0x1.57e33ep-23 }, + { 0x1.3a6dp-27, 0x1.14a8b6p-23 }, + { 0x1.f88ebap-28, 0x1.bcede6p-24 }, + { 0x1.94a282p-28, 0x1.659918p-24 }, + { 0x1.44580ap-28, 0x1.1f4498p-24 }, + { 0x1.03dbf8p-28, 0x1.cd5086p-25 }, + { 0x1.a03066p-29, 0x1.723974p-25 }, + { 0x1.4d1f2ep-29, 0x1.28f9cap-25 }, + { 0x1.0a814ap-29, 0x1.dc34b6p-26 }, + { 0x1.aa36cap-30, 0x1.7d9dbp-26 }, + { 0x1.54a6b6p-30, 0x1.31aa56p-26 }, + { 0x1.102232p-30, 0x1.e96c26p-27 }, + { 0x1.b2959ep-31, 0x1.87a218p-27 }, + { 0x1.5ad66cp-31, 0x1.393ad2p-27 }, + { 0x1.14ac7ep-31, 0x1.f4ccdap-28 }, + { 0x1.b931b8p-32, 0x1.9026a8p-28 }, + { 0x1.5f9a24p-32, 0x1.3f92eap-28 }, + { 0x1.181154p-32, 0x1.fe3208p-29 }, + { 0x1.bdf55ep-33, 0x1.970fbp-29 }, + { 0x1.62e226p-33, 0x1.449de6p-29 }, + { 0x1.1a4576p-33, 0x1.02be7p-29 }, + { 0x1.c0d0bep-34, 0x1.9c4672p-30 }, + { 0x1.64a386p-34, 0x1.484b1ep-30 }, + { 0x1.1b418cp-34, 0x1.054a9ap-30 }, + { 0x1.c1ba4ap-35, 0x1.9fb994p-31 }, + { 0x1.64d86p-35, 0x1.4a8e4ep-31 }, + { 0x1.1b0242p-35, 0x1.06b4fep-31 }, + { 0x1.c0aee6p-36, 0x1.a15d86p-32 }, + { 0x1.637ffap-36, 0x1.4b5fdep-32 }, + { 0x1.198862p-36, 0x1.06f8dap-32 }, + { 0x1.bdb204p-37, 0x1.a12cc8p-33 }, + { 0x1.609ec2p-37, 0x1.4abd0ap-33 }, + { 0x1.16d8d2p-37, 0x1.06154ap-33 }, + { 0x1.b8cd88p-38, 0x1.9f27fap-34 }, + { 0x1.5c3e42p-38, 0x1.48a7fcp-34 }, + { 0x1.12fc6cp-38, 0x1.040d4ap-34 }, + { 0x1.b2119p-39, 0x1.9b55e8p-35 }, + { 0x1.566cep-39, 0x1.4527acp-35 }, + { 0x1.0dffep-39, 0x1.00e7acp-35 }, + { 0x1.a99426p-40, 0x1.95c358p-36 }, + { 0x1.4f3d92p-40, 0x1.4047cep-36 }, + { 0x1.07f35ep-40, 0x1.f95dcep-37 }, + { 0x1.9f70cp-41, 0x1.8e82cep-37 }, + { 0x1.46c77ap-41, 0x1.3a1882p-37 }, + { 0x1.00ea48p-41, 0x1.eee1d4p-38 }, + { 0x1.93c7acp-42, 0x1.85ac18p-38 }, + { 0x1.3d256ap-42, 0x1.32ae04p-38 }, + { 0x1.f1f59p-43, 
0x1.e27d88p-39 }, + { 0x1.86bd6ap-43, 0x1.7b5bdap-39 }, + { 0x1.327554p-43, 0x1.2a2036p-39 }, + { 0x1.e07ab4p-44, 0x1.d458ap-40 }, + { 0x1.7879ecp-44, 0x1.6fb2eap-40 }, + { 0x1.26d7bp-44, 0x1.208a2cp-40 }, + { 0x1.cd98a2p-45, 0x1.c49f8ap-41 }, + { 0x1.6927c2p-45, 0x1.62d5aap-41 }, + { 0x1.1a6ed6p-45, 0x1.16098ep-41 }, + { 0x1.b986acp-46, 0x1.b3828ep-42 }, + { 0x1.58f35ap-46, 0x1.54eb3ep-42 }, + { 0x1.0d5e6p-46, 0x1.0abe0ep-42 }, + { 0x1.a47db6p-47, 0x1.a134d4p-43 }, + { 0x1.480a18p-47, 0x1.461cdap-43 }, + { 0x1.ff94e4p-48, 0x1.fd9182p-44 }, + { 0x1.8eb738p-48, 0x1.8deb62p-44 }, + { 0x1.369994p-48, 0x1.3694e8p-44 }, + { 0x1.e3ae4ap-49, 0x1.e49706p-45 }, + { 0x1.786c3ep-49, 0x1.79dc28p-45 }, + { 0x1.24cec8p-49, 0x1.267e46p-45 }, + { 0x1.c74fc4p-50, 0x1.cad0bp-46 }, + { 0x1.61d46cp-50, 0x1.653d08p-46 }, + { 0x1.12d55cp-50, 0x1.16038cp-46 }, + { 0x1.aabdacp-51, 0x1.b081aap-47 }, + { 0x1.4b252ep-51, 0x1.5042e2p-47 }, + { 0x1.00d6f8p-51, 0x1.054e44p-47 }, + { 0x1.8e38ep-52, 0x1.95eb2cp-48 }, + { 0x1.3490e8p-52, 0x1.3b20c6p-48 }, + { 0x1.ddf56ap-53, 0x1.e90cb6p-49 }, + { 0x1.71fdep-53, 0x1.7b4b76p-49 }, + { 0x1.1e465ap-53, 0x1.26072ap-49 }, + { 0x1.bac92ep-54, 0x1.c7a2ecp-50 }, + { 0x1.56441cp-54, 0x1.60dcfp-50 }, + { 0x1.08700cp-54, 0x1.112346p-50 }, + { 0x1.986a66p-55, 0x1.a6a50ap-51 }, + { 0x1.3b3d56p-55, 0x1.46d572p-51 }, + { 0x1.e667dap-56, 0x1.f93d0ep-52 }, + { 0x1.7712b8p-56, 0x1.86529ep-52 }, + { 0x1.211544p-56, 0x1.2d65aep-52 }, + { 0x1.bd660ap-57, 0x1.d13c32p-53 }, + { 0x1.56f3eep-57, 0x1.66e45ap-53 }, + { 0x1.07f14ap-57, 0x1.14b8b6p-53 }, + { 0x1.96129cp-58, 0x1.aa854cp-54 }, + { 0x1.3837cp-58, 0x1.488b94p-54 }, + { 0x1.dfe0c2p-59, 0x1.f9e772p-55 }, + { 0x1.709b5ap-59, 0x1.85503p-55 }, + { 0x1.1affd2p-59, 0x1.2b7218p-55 }, + { 0x1.b2564p-60, 0x1.cc6bb6p-56 }, + { 0x1.4d23fap-60, 0x1.61cb1ap-56 }, + { 0x1.fecbdp-61, 0x1.0fba0ep-56 }, + { 0x1.8767d8p-61, 0x1.a13072p-57 }, + { 0x1.2bc67ep-61, 0x1.401abcp-57 }, + { 0x1.caf846p-62, 0x1.eafc2cp-58 }, + { 0x1.5f2e7ap-62, 0x1.785cp-58 }, + { 0x1.0c93acp-62, 0x1.205a7ep-58 }, + { 0x1.9a9b06p-63, 0x1.b9a31ap-59 }, + { 0x1.39b7fcp-63, 0x1.520968p-59 }, + { 0x1.df277ap-64, 0x1.029ce6p-59 }, + { 0x1.6dbcdp-64, 0x1.8b81d6p-60 }, + { 0x1.17080ap-64, 0x1.2e48f2p-60 }, + { 0x1.a98e26p-65, 0x1.cdd86cp-61 }, + { 0x1.445a6ap-65, 0x1.60a47ap-61 }, + { 0x1.ee324ep-66, 0x1.0d210cp-61 }, + { 0x1.784e3p-66, 0x1.9a961ep-62 }, + { 0x1.1e65fep-66, 0x1.390b74p-62 }, + { 0x1.b3bb86p-67, 0x1.dd1e52p-63 }, + { 0x1.4b4e36p-67, 0x1.6b6a7ap-63 }, + { 0x1.f790f6p-68, 0x1.14acc2p-63 }, + { 0x1.7e82cep-68, 0x1.a511aap-64 }, + { 0x1.226a7ap-68, 0x1.404114p-64 }, + { 0x1.b8c634p-69, 0x1.e6ea96p-65 }, + { 0x1.4e53acp-69, 0x1.71f97ap-65 }, + { 0x1.faed5cp-70, 0x1.18fb2ep-65 }, + { 0x1.80217ep-70, 0x1.aa947ep-66 }, + { 0x1.22f066p-70, 0x1.43a796p-66 }, + { 0x1.b87f86p-71, 0x1.eae2fp-67 }, + { 0x1.4d4ec8p-71, 0x1.7414e6p-67 }, + { 0x1.f8283ep-72, 0x1.19e474p-67 }, + { 0x1.7d1b22p-72, 0x1.aaeb7ep-68 }, + { 0x1.1ff2dp-72, 0x1.431f66p-68 }, + { 0x1.b2e9e8p-73, 0x1.e8e272p-69 }, + { 0x1.4848dep-73, 0x1.71a91ep-69 }, + { 0x1.ef5b16p-74, 0x1.176014p-69 }, + { 0x1.758b92p-74, 0x1.a6137cp-70 }, + { 0x1.198d42p-74, 0x1.3ead74p-70 }, + { 0x1.a838bp-75, 0x1.e0fbc2p-71 }, + { 0x1.3f700cp-75, 0x1.6accaep-71 }, + { 0x1.e0d68ep-76, 0x1.118578p-71 }, + { 0x1.69b7f4p-76, 0x1.9c3974p-72 }, + { 0x1.0ffa12p-76, 0x1.367afap-72 }, + { 0x1.98cd1cp-77, 0x1.d377fap-73 }, + { 0x1.33148p-77, 0x1.5fbee6p-73 }, + { 0x1.cd1dbap-78, 0x1.088a8p-73 }, + { 0x1.5a0a9cp-78, 0x1.8db7ccp-74 }, + { 0x1.038ef4p-78, 
0x1.2ad2ecp-74 }, + { 0x1.85308ap-79, 0x1.c0d23ep-75 }, + { 0x1.23a3cp-79, 0x1.50e41ap-75 }, + { 0x1.b4de68p-80, 0x1.f980a8p-76 }, + { 0x1.470ce4p-80, 0x1.7b10fep-76 }, + { 0x1.e9700cp-81, 0x1.1c1d98p-76 }, + { 0x1.6e0c9p-81, 0x1.a9b08p-77 }, + { 0x1.11a25ap-81, 0x1.3ebfb4p-77 }, + { 0x1.98e73ap-82, 0x1.dd1d36p-78 }, + { 0x1.315f58p-82, 0x1.64e7fp-78 }, + { 0x1.c7e35cp-83, 0x1.0ada94p-78 }, + { 0x1.542176p-83, 0x1.8ed9e8p-79 }, + { 0x1.fb491ep-84, 0x1.29ecb2p-79 }, + { 0x1.7a1c34p-84, 0x1.bcdb34p-80 }, + { 0x1.19b0f2p-84, 0x1.4bf6cap-80 }, + { 0x1.a383cap-85, 0x1.ef3318p-81 }, + { 0x1.383bf2p-85, 0x1.712bc2p-81 }, + { 0x1.d08cdap-86, 0x1.13151p-81 }, + { 0x1.596adp-86, 0x1.99bf36p-82 }, + { 0x1.00b602p-86, 0x1.3104d6p-82 }, + { 0x1.7d62a2p-87, 0x1.c5e534p-83 }, + { 0x1.1b2abcp-87, 0x1.518db2p-83 }, + { 0x1.a4480ep-88, 0x1.f5d1c6p-84 }, + { 0x1.37be42p-88, 0x1.74d45ap-84 }, + { 0x1.ce3ee4p-89, 0x1.14dc4ap-84 }, + { 0x1.568986p-89, 0x1.9afd0ep-85 }, + { 0x1.fb69c6p-90, 0x1.30e632p-85 }, + { 0x1.77a47ep-90, 0x1.c42b48p-86 }, + { 0x1.15f4ep-90, 0x1.4f1f52p-86 }, + { 0x1.9b25dcp-91, 0x1.f08156p-87 }, + { 0x1.2feeeep-91, 0x1.6f9f62p-87 }, + { 0x1.c122bcp-92, 0x1.100ffap-87 }, + { 0x1.4bb154p-92, 0x1.927ce6p-88 }, + { 0x1.e9ae56p-93, 0x1.2992f4p-88 }, + { 0x1.6948e8p-93, 0x1.b7cccap-89 }, + { 0x1.0a6cd2p-93, 0x1.44d7c4p-89 }, + { 0x1.88c0cap-94, 0x1.dfa22p-90 }, + { 0x1.215988p-94, 0x1.61eb26p-90 }, + { 0x1.aa222ap-95, 0x1.0506e2p-90 }, + { 0x1.39a30ep-95, 0x1.80d828p-91 }, + { 0x1.cd740ep-96, 0x1.1b8f04p-91 }, + { 0x1.534d82p-96, 0x1.a1a7ecp-92 }, + { 0x1.f2bb06p-97, 0x1.336f3p-92 }, + { 0x1.6e5b34p-97, 0x1.c46172p-93 }, + { 0x1.0cfc82p-97, 0x1.4cab82p-93 }, + { 0x1.8acc82p-98, 0x1.e9094cp-94 }, + { 0x1.219686p-98, 0x1.67465p-94 }, + { 0x1.a89fa6p-99, 0x1.07d0b8p-94 }, + { 0x1.372982p-99, 0x1.833ffap-95 }, + { 0x1.c7d094p-100, 0x1.1c147ap-95 }, + { 0x1.4db1c8p-100, 0x1.a096ccp-96 }, + { 0x1.e858d8p-101, 0x1.314decp-96 }, + { 0x1.6529ep-101, 0x1.bf46cep-97 }, + { 0x1.0517bap-101, 0x1.47796ap-97 }, + { 0x1.7d8a8p-102, 0x1.df49a2p-98 }, + { 0x1.16a46p-102, 0x1.5e9198p-98 }, + { 0x1.96ca76p-103, 0x1.004b34p-98 }, + { 0x1.28cb2cp-103, 0x1.768f3ep-99 }, + { 0x1.b0de98p-104, 0x1.1190d2p-99 }, }, - - { -#if ERFCF_POLY_NCOEFFS == 16 - 0x1.fea5663f75cd1p-1, -0x1.1cb5a82adf1c4p0, 0x1.e7c8da942d86fp-1, - -0x1.547ba0456bac7p-1, 0x1.8a6fc0f4421a4p-2, -0x1.7c14f9301ee58p-3, - 0x1.2f67c8351577p-4, -0x1.8e733f6d159d9p-6, 0x1.aa6a0ec249067p-8, - -0x1.6f4ec45b11f3fp-10, 0x1.f4c00c4b33ba8p-13, -0x1.0795faf7846d2p-15, - 0x1.9cef9031810ddp-19, -0x1.c4d60c3fecdb6p-23, 0x1.360547ec2229dp-27, - -0x1.8ec1581647f9fp-33 -#endif - }, - - { -#if ERFCF_POLY_NCOEFFS == 16 - 0x1.dae421147c591p-1, -0x1.c211957a0abfcp-1, 0x1.28a8d87aa1b12p-1, - -0x1.224d2a58cbef4p-2, 0x1.b3d45dcaef898p-4, -0x1.ff99d8b33e7a9p-6, - 0x1.dac66375b99f6p-8, -0x1.5e1786f0f91ap-10, 0x1.9a2588deaec4fp-13, - -0x1.7b886b183b235p-16, 0x1.1209e7da8ff82p-19, -0x1.2e5c870c6ed8p-23, - 0x1.ec6a89422928ep-28, -0x1.16e7d837b61bcp-32, 0x1.88868a73e4b43p-38, - -0x1.027034672f11cp-44 -#endif - }, - - { -#if ERFCF_POLY_NCOEFFS == 16 - 0x1.8ae320c1bad5ap-1, -0x1.1cdd6aa6929aap-1, 0x1.0e39a7b285f58p-2, - -0x1.6fb12a95e351dp-4, 0x1.77dd0649e352cp-6, -0x1.28a9e9560c461p-8, - 0x1.6f7d7778e9433p-11, -0x1.68363698afe4ap-14, 0x1.17e94cdf35d82p-17, - -0x1.5766a817bd3ffp-21, 0x1.48d892094a2c1p-25, -0x1.e1b6511ab6d0bp-30, - 0x1.04c7b8143f6a4p-34, -0x1.898831961065bp-40, 0x1.71ae8a56142a6p-46, - -0x1.45abac612344bp-53 -#endif - }}}; + }; diff --git 
a/contrib/arm-optimized-routines/pl/math/erff_1u5.c b/contrib/arm-optimized-routines/pl/math/erff_1u5.c deleted file mode 100644 index 1a69872c43e5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/erff_1u5.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Single-precision erf(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "estrinf.h" -#include "hornerf.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f -#define A __erff_data.erff_poly_A -#define B __erff_data.erff_poly_B - -/* Top 12 bits of a float. */ -static inline uint32_t -top12 (float x) -{ - return asuint (x) >> 20; -} - -/* Efficient implementation of erff using either a pure polynomial approximation - or the exponential of a polynomial. Worst-case error is 1.09ulps at - 0x1.c111acp-1. */ -float -erff (float x) -{ - float r, x2; - - /* Get top word. */ - uint32_t ix = asuint (x); - uint32_t sign = ix >> 31; - uint32_t ia12 = top12 (x) & 0x7ff; - - /* Limit of both intervals is 0.875 for performance reasons but coefficients - computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy - from 0.94 to 1.1ulps. */ - if (ia12 < 0x3f6) - { /* a = |x| < 0.875. */ - - /* Tiny and subnormal cases. */ - if (unlikely (ia12 < 0x318)) - { /* |x| < 2^(-28). */ - if (unlikely (ia12 < 0x040)) - { /* |x| < 2^(-119). */ - float y = fmaf (TwoOverSqrtPiMinusOne, x, x); - return check_uflowf (y); - } - return x + TwoOverSqrtPiMinusOne * x; - } - - x2 = x * x; - - /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2). - */ -#define C(i) A[i] - r = fmaf (HORNER_5 (x2, C), x, x); -#undef C - } - else if (ia12 < 0x408) - { /* |x| < 4.0 - Use a custom Estrin scheme. */ - - float a = fabsf (x); - /* Use Estrin scheme on high order (small magnitude) coefficients. */ -#define C(i) B[i] - r = ESTRIN_3_ (a, x * x, C, 3); -#undef C - /* Then switch to pure Horner scheme. */ - r = fmaf (r, a, B[2]); - r = fmaf (r, a, B[1]); - r = fmaf (r, a, B[0]); - r = fmaf (r, a, a); - /* Single precision exponential with ~0.5ulps ensures erff has maximum - relative error below 1ulp on [0.921875, 4.0] and below 1.1ulps on - [0.875, 4.0]. */ - r = expf (-r); - /* Explicit copysign (calling copysignf increases latency). */ - if (sign) - r = -1.0f + r; - else - r = 1.0f - r; - } - else - { /* |x| >= 4.0. */ - - /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ - if (unlikely (ia12 >= 0x7f8)) - return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; - - /* Explicit copysign (calling copysignf increases latency). */ - if (sign) - r = -1.0f; - else - r = 1.0f; - } - return r; -} - -PL_SIG (S, F, 1, erf, -4.0, 4.0) -PL_TEST_ULP (erff, 0.6) -PL_TEST_INTERVAL (erff, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (erff, -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (erff, 0x1p-26, 0x1p3, 40000) -PL_TEST_INTERVAL (erff, -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (erff, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erff_2u.c b/contrib/arm-optimized-routines/pl/math/erff_2u.c new file mode 100644 index 000000000000..f43e647072f8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erff_2u.c @@ -0,0 +1,82 @@ +/* + * Single-precision erf(x) function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
+#define Shift 0x1p16f
+#define OneThird 0x1.555556p-2f
+
+/* Fast erff approximation based on series expansion near x rounded to
+   nearest multiple of 1/128.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+   erf(x) ~ erf(r)
+       + scale * d * [
+       + 1
+       - r d
+       + 1/3 (2 r^2 - 1) d^2
+       - 1/6 (r (2 r^2 - 3)) d^3
+       + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+   ]
+
+   This single-precision implementation uses only the following terms:
+
+   erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
+
+   Values of erf(r) and scale are read from lookup tables.
+   For |x| >= 3.9296875, erf(|x|) rounds to 1.0f.
+
+   Maximum error: 1.93 ULP
+     erff(0x1.c373e6p-9) got 0x1.fd686cp-9
+                        want 0x1.fd6868p-9. */
+float
+erff (float x)
+{
+  /* Get absolute value and sign. */
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & 0x7fffffff;
+  uint32_t sign = ix & ~0x7fffffff;
+
+  /* |x| < 0x1p-62. The fmaf raises the expected inexact/underflow
+     exceptions for tiny and subnormal x. */
+  if (unlikely (ia < 0x20800000))
+    return fmaf (TwoOverSqrtPiMinusOne, x, x);
+
+  if (ia < 0x407b8000) /* |x| < 4 - 9 / 128 = 3.9296875. */
+    {
+      /* Look up erf(r) and scale(r) in the tables, e.g. erf(r) is 0 and
+         scale is 2/sqrt(pi) when x is reduced to r = 0. */
+      float a = asfloat (ia);
+      float z = a + Shift;
+      uint32_t i = asuint (z) - asuint (Shift);
+      float r = z - Shift;
+      float erfr = __erff_data.tab[i].erf;
+      float scale = __erff_data.tab[i].scale;
+
+      /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */
+      float d = a - r;
+      float d2 = d * d;
+      float y = -fmaf (OneThird, d, r);
+      y = fmaf (fmaf (y, d2, d), scale, erfr);
+      return asfloat (asuint (y) | sign);
+    }
+
+  /* Special cases: erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */
+  if (unlikely (ia >= 0x7f800000))
+    return (1.0f - (float) (sign >> 30)) + 1.0f / x;
+
+  /* Boring domain (|x| >= 3.9296875): erf(|x|) rounds to +/-1.0f. */
+  return asfloat (sign | asuint (1.0f));
+}
+
+PL_SIG (S, F, 1, erf, -4.0, 4.0)
+PL_TEST_ULP (erff, 1.43)
+PL_TEST_SYM_INTERVAL (erff, 0, 3.9375, 40000)
+PL_TEST_SYM_INTERVAL (erff, 3.9375, inf, 40000)
+PL_TEST_SYM_INTERVAL (erff, 0, inf, 40000)
diff --git a/contrib/arm-optimized-routines/pl/math/erff_data.c b/contrib/arm-optimized-routines/pl/math/erff_data.c
index 2352baefd35f..84c0d2e95463
--- a/contrib/arm-optimized-routines/pl/math/erff_data.c
+++ b/contrib/arm-optimized-routines/pl/math/erff_data.c
@@ -1,16 +1,532 @@
 /*
  * Data for approximation of erff.
  *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
 
-/* Minimax approximation of erff. */
-const struct erff_data __erff_data
-  = {.erff_poly_A = {0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f,
-                     -0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f},
-     .erff_poly_B
-     = {0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, -0x1.8d6300p-6f,
-        0x1.fd1336p-9f, -0x1.91d2ccp-12f, 0x1.222900p-16f}};
+/* Lookup table used in erff.
+   For each possible rounded input r (multiples of 1/128), between
+   r = 0.0 and r = 4.0 (513 values):
+   - the first entry __erff_data.tab.erf contains the values of erf(r),
+   - the second entry __erff_data.tab.scale contains the values of
+   2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
+   algorithm, since lookup is performed only for x >= 1/64-1/512.
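+   As a worked example of the lookup, for x ~ 0.3 erff computes
+   z = x + 0x1p16, which rounds to 65536 + 38/128 (the ulp of z is 1/128),
+   giving r = 0.296875, index i = 38 and d = x - r ~ 0.003125.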
*/ +const struct erff_data __erff_data = { + .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 }, + { 0x1.20dbf4p-7, 0x1.20d8f2p+0 }, + { 0x1.20d770p-6, 0x1.20cb68p+0 }, + { 0x1.b137e0p-6, 0x1.20b4d8p+0 }, + { 0x1.20c564p-5, 0x1.209546p+0 }, + { 0x1.68e5d4p-5, 0x1.206cb4p+0 }, + { 0x1.b0fafep-5, 0x1.203b26p+0 }, + { 0x1.f902a8p-5, 0x1.2000a0p+0 }, + { 0x1.207d48p-4, 0x1.1fbd28p+0 }, + { 0x1.44703ep-4, 0x1.1f70c4p+0 }, + { 0x1.68591ap-4, 0x1.1f1b7ap+0 }, + { 0x1.8c36bep-4, 0x1.1ebd56p+0 }, + { 0x1.b00812p-4, 0x1.1e565cp+0 }, + { 0x1.d3cbf8p-4, 0x1.1de698p+0 }, + { 0x1.f7815ap-4, 0x1.1d6e14p+0 }, + { 0x1.0d9390p-3, 0x1.1cecdcp+0 }, + { 0x1.1f5e1ap-3, 0x1.1c62fap+0 }, + { 0x1.311fc2p-3, 0x1.1bd07cp+0 }, + { 0x1.42d7fcp-3, 0x1.1b3572p+0 }, + { 0x1.548642p-3, 0x1.1a91e6p+0 }, + { 0x1.662a0cp-3, 0x1.19e5eap+0 }, + { 0x1.77c2d2p-3, 0x1.19318cp+0 }, + { 0x1.895010p-3, 0x1.1874dep+0 }, + { 0x1.9ad142p-3, 0x1.17aff0p+0 }, + { 0x1.ac45e4p-3, 0x1.16e2d8p+0 }, + { 0x1.bdad72p-3, 0x1.160da4p+0 }, + { 0x1.cf076ep-3, 0x1.153068p+0 }, + { 0x1.e05354p-3, 0x1.144b3cp+0 }, + { 0x1.f190aap-3, 0x1.135e30p+0 }, + { 0x1.015f78p-2, 0x1.12695ep+0 }, + { 0x1.09eed6p-2, 0x1.116cd8p+0 }, + { 0x1.127632p-2, 0x1.1068bap+0 }, + { 0x1.1af54ep-2, 0x1.0f5d16p+0 }, + { 0x1.236bf0p-2, 0x1.0e4a08p+0 }, + { 0x1.2bd9dcp-2, 0x1.0d2fa6p+0 }, + { 0x1.343ed6p-2, 0x1.0c0e0ap+0 }, + { 0x1.3c9aa8p-2, 0x1.0ae550p+0 }, + { 0x1.44ed18p-2, 0x1.09b590p+0 }, + { 0x1.4d35f0p-2, 0x1.087ee4p+0 }, + { 0x1.5574f4p-2, 0x1.07416cp+0 }, + { 0x1.5da9f4p-2, 0x1.05fd3ep+0 }, + { 0x1.65d4b8p-2, 0x1.04b27cp+0 }, + { 0x1.6df50ap-2, 0x1.036140p+0 }, + { 0x1.760abap-2, 0x1.0209a6p+0 }, + { 0x1.7e1594p-2, 0x1.00abd0p+0 }, + { 0x1.861566p-2, 0x1.fe8fb0p-1 }, + { 0x1.8e0a02p-2, 0x1.fbbbbep-1 }, + { 0x1.95f336p-2, 0x1.f8dc0ap-1 }, + { 0x1.9dd0d2p-2, 0x1.f5f0cep-1 }, + { 0x1.a5a2acp-2, 0x1.f2fa4cp-1 }, + { 0x1.ad6896p-2, 0x1.eff8c4p-1 }, + { 0x1.b52264p-2, 0x1.ecec78p-1 }, + { 0x1.bccfecp-2, 0x1.e9d5a8p-1 }, + { 0x1.c47104p-2, 0x1.e6b498p-1 }, + { 0x1.cc0584p-2, 0x1.e38988p-1 }, + { 0x1.d38d44p-2, 0x1.e054bep-1 }, + { 0x1.db081cp-2, 0x1.dd167cp-1 }, + { 0x1.e275eap-2, 0x1.d9cf06p-1 }, + { 0x1.e9d68ap-2, 0x1.d67ea2p-1 }, + { 0x1.f129d4p-2, 0x1.d32592p-1 }, + { 0x1.f86faap-2, 0x1.cfc41ep-1 }, + { 0x1.ffa7eap-2, 0x1.cc5a8ap-1 }, + { 0x1.03693ap-1, 0x1.c8e91cp-1 }, + { 0x1.06f794p-1, 0x1.c5701ap-1 }, + { 0x1.0a7ef6p-1, 0x1.c1efcap-1 }, + { 0x1.0dff50p-1, 0x1.be6872p-1 }, + { 0x1.117894p-1, 0x1.bada5ap-1 }, + { 0x1.14eab4p-1, 0x1.b745c6p-1 }, + { 0x1.1855a6p-1, 0x1.b3aafcp-1 }, + { 0x1.1bb95cp-1, 0x1.b00a46p-1 }, + { 0x1.1f15ccp-1, 0x1.ac63e8p-1 }, + { 0x1.226ae8p-1, 0x1.a8b828p-1 }, + { 0x1.25b8a8p-1, 0x1.a5074ep-1 }, + { 0x1.28ff02p-1, 0x1.a1519ep-1 }, + { 0x1.2c3decp-1, 0x1.9d9762p-1 }, + { 0x1.2f755cp-1, 0x1.99d8dap-1 }, + { 0x1.32a54cp-1, 0x1.961650p-1 }, + { 0x1.35cdb4p-1, 0x1.925008p-1 }, + { 0x1.38ee8ap-1, 0x1.8e8646p-1 }, + { 0x1.3c07cap-1, 0x1.8ab950p-1 }, + { 0x1.3f196ep-1, 0x1.86e96ap-1 }, + { 0x1.42236ep-1, 0x1.8316d6p-1 }, + { 0x1.4525c8p-1, 0x1.7f41dcp-1 }, + { 0x1.482074p-1, 0x1.7b6abcp-1 }, + { 0x1.4b1372p-1, 0x1.7791b8p-1 }, + { 0x1.4dfebap-1, 0x1.73b714p-1 }, + { 0x1.50e24cp-1, 0x1.6fdb12p-1 }, + { 0x1.53be26p-1, 0x1.6bfdf0p-1 }, + { 0x1.569244p-1, 0x1.681ff2p-1 }, + { 0x1.595ea6p-1, 0x1.644156p-1 }, + { 0x1.5c2348p-1, 0x1.60625cp-1 }, + { 0x1.5ee02ep-1, 0x1.5c8342p-1 }, + { 0x1.619556p-1, 0x1.58a446p-1 }, + { 0x1.6442c0p-1, 0x1.54c5a6p-1 }, + { 0x1.66e86ep-1, 0x1.50e79ep-1 }, + { 0x1.69865ep-1, 0x1.4d0a68p-1 }, + { 0x1.6c1c98p-1, 0x1.492e42p-1 }, + { 
0x1.6eab18p-1, 0x1.455366p-1 }, + { 0x1.7131e6p-1, 0x1.417a0cp-1 }, + { 0x1.73b102p-1, 0x1.3da26ep-1 }, + { 0x1.762870p-1, 0x1.39ccc2p-1 }, + { 0x1.789836p-1, 0x1.35f940p-1 }, + { 0x1.7b0058p-1, 0x1.32281ep-1 }, + { 0x1.7d60d8p-1, 0x1.2e5992p-1 }, + { 0x1.7fb9c0p-1, 0x1.2a8dcep-1 }, + { 0x1.820b12p-1, 0x1.26c508p-1 }, + { 0x1.8454d6p-1, 0x1.22ff72p-1 }, + { 0x1.869712p-1, 0x1.1f3d3cp-1 }, + { 0x1.88d1cep-1, 0x1.1b7e98p-1 }, + { 0x1.8b050ep-1, 0x1.17c3b6p-1 }, + { 0x1.8d30dep-1, 0x1.140cc4p-1 }, + { 0x1.8f5544p-1, 0x1.1059eep-1 }, + { 0x1.91724ap-1, 0x1.0cab62p-1 }, + { 0x1.9387f6p-1, 0x1.09014cp-1 }, + { 0x1.959652p-1, 0x1.055bd6p-1 }, + { 0x1.979d68p-1, 0x1.01bb2cp-1 }, + { 0x1.999d42p-1, 0x1.fc3ee6p-2 }, + { 0x1.9b95e8p-1, 0x1.f511aap-2 }, + { 0x1.9d8768p-1, 0x1.edeeeep-2 }, + { 0x1.9f71cap-1, 0x1.e6d700p-2 }, + { 0x1.a1551ap-1, 0x1.dfca26p-2 }, + { 0x1.a33162p-1, 0x1.d8c8aap-2 }, + { 0x1.a506b0p-1, 0x1.d1d2d0p-2 }, + { 0x1.a6d50cp-1, 0x1.cae8dap-2 }, + { 0x1.a89c86p-1, 0x1.c40b08p-2 }, + { 0x1.aa5d26p-1, 0x1.bd3998p-2 }, + { 0x1.ac16fcp-1, 0x1.b674c8p-2 }, + { 0x1.adca14p-1, 0x1.afbcd4p-2 }, + { 0x1.af767ap-1, 0x1.a911f0p-2 }, + { 0x1.b11c3cp-1, 0x1.a27456p-2 }, + { 0x1.b2bb68p-1, 0x1.9be438p-2 }, + { 0x1.b4540ap-1, 0x1.9561c8p-2 }, + { 0x1.b5e630p-1, 0x1.8eed36p-2 }, + { 0x1.b771e8p-1, 0x1.8886b2p-2 }, + { 0x1.b8f742p-1, 0x1.822e66p-2 }, + { 0x1.ba764ap-1, 0x1.7be47ap-2 }, + { 0x1.bbef10p-1, 0x1.75a91ap-2 }, + { 0x1.bd61a2p-1, 0x1.6f7c6ap-2 }, + { 0x1.bece0ep-1, 0x1.695e8cp-2 }, + { 0x1.c03464p-1, 0x1.634fa6p-2 }, + { 0x1.c194b2p-1, 0x1.5d4fd4p-2 }, + { 0x1.c2ef08p-1, 0x1.575f34p-2 }, + { 0x1.c44376p-1, 0x1.517de6p-2 }, + { 0x1.c5920ap-1, 0x1.4bac00p-2 }, + { 0x1.c6dad2p-1, 0x1.45e99cp-2 }, + { 0x1.c81de2p-1, 0x1.4036d0p-2 }, + { 0x1.c95b46p-1, 0x1.3a93b2p-2 }, + { 0x1.ca930ep-1, 0x1.350052p-2 }, + { 0x1.cbc54cp-1, 0x1.2f7cc4p-2 }, + { 0x1.ccf20cp-1, 0x1.2a0916p-2 }, + { 0x1.ce1962p-1, 0x1.24a554p-2 }, + { 0x1.cf3b5cp-1, 0x1.1f518ap-2 }, + { 0x1.d0580cp-1, 0x1.1a0dc6p-2 }, + { 0x1.d16f7ep-1, 0x1.14da0ap-2 }, + { 0x1.d281c4p-1, 0x1.0fb662p-2 }, + { 0x1.d38ef0p-1, 0x1.0aa2d0p-2 }, + { 0x1.d49710p-1, 0x1.059f5ap-2 }, + { 0x1.d59a34p-1, 0x1.00ac00p-2 }, + { 0x1.d6986cp-1, 0x1.f79184p-3 }, + { 0x1.d791cap-1, 0x1.edeb40p-3 }, + { 0x1.d8865ep-1, 0x1.e46530p-3 }, + { 0x1.d97636p-1, 0x1.daff4ap-3 }, + { 0x1.da6162p-1, 0x1.d1b982p-3 }, + { 0x1.db47f4p-1, 0x1.c893cep-3 }, + { 0x1.dc29fcp-1, 0x1.bf8e1cp-3 }, + { 0x1.dd0788p-1, 0x1.b6a856p-3 }, + { 0x1.dde0aap-1, 0x1.ade26cp-3 }, + { 0x1.deb570p-1, 0x1.a53c42p-3 }, + { 0x1.df85eap-1, 0x1.9cb5bep-3 }, + { 0x1.e0522ap-1, 0x1.944ec2p-3 }, + { 0x1.e11a3ep-1, 0x1.8c0732p-3 }, + { 0x1.e1de36p-1, 0x1.83deeap-3 }, + { 0x1.e29e22p-1, 0x1.7bd5c8p-3 }, + { 0x1.e35a12p-1, 0x1.73eba4p-3 }, + { 0x1.e41214p-1, 0x1.6c2056p-3 }, + { 0x1.e4c638p-1, 0x1.6473b6p-3 }, + { 0x1.e5768cp-1, 0x1.5ce596p-3 }, + { 0x1.e62322p-1, 0x1.5575c8p-3 }, + { 0x1.e6cc08p-1, 0x1.4e241ep-3 }, + { 0x1.e7714ap-1, 0x1.46f066p-3 }, + { 0x1.e812fcp-1, 0x1.3fda6cp-3 }, + { 0x1.e8b12ap-1, 0x1.38e1fap-3 }, + { 0x1.e94be4p-1, 0x1.3206dcp-3 }, + { 0x1.e9e336p-1, 0x1.2b48dap-3 }, + { 0x1.ea7730p-1, 0x1.24a7b8p-3 }, + { 0x1.eb07e2p-1, 0x1.1e233ep-3 }, + { 0x1.eb9558p-1, 0x1.17bb2cp-3 }, + { 0x1.ec1fa2p-1, 0x1.116f48p-3 }, + { 0x1.eca6ccp-1, 0x1.0b3f52p-3 }, + { 0x1.ed2ae6p-1, 0x1.052b0cp-3 }, + { 0x1.edabfcp-1, 0x1.fe6460p-4 }, + { 0x1.ee2a1ep-1, 0x1.f2a902p-4 }, + { 0x1.eea556p-1, 0x1.e72372p-4 }, + { 0x1.ef1db4p-1, 0x1.dbd32ap-4 }, + { 0x1.ef9344p-1, 0x1.d0b7a0p-4 }, + { 0x1.f00614p-1, 
0x1.c5d04ap-4 }, + { 0x1.f07630p-1, 0x1.bb1c98p-4 }, + { 0x1.f0e3a6p-1, 0x1.b09bfcp-4 }, + { 0x1.f14e82p-1, 0x1.a64de6p-4 }, + { 0x1.f1b6d0p-1, 0x1.9c31c6p-4 }, + { 0x1.f21ca0p-1, 0x1.92470ap-4 }, + { 0x1.f27ff8p-1, 0x1.888d1ep-4 }, + { 0x1.f2e0eap-1, 0x1.7f036cp-4 }, + { 0x1.f33f7ep-1, 0x1.75a960p-4 }, + { 0x1.f39bc2p-1, 0x1.6c7e64p-4 }, + { 0x1.f3f5c2p-1, 0x1.6381e2p-4 }, + { 0x1.f44d88p-1, 0x1.5ab342p-4 }, + { 0x1.f4a31ep-1, 0x1.5211ecp-4 }, + { 0x1.f4f694p-1, 0x1.499d48p-4 }, + { 0x1.f547f2p-1, 0x1.4154bcp-4 }, + { 0x1.f59742p-1, 0x1.3937b2p-4 }, + { 0x1.f5e490p-1, 0x1.31458ep-4 }, + { 0x1.f62fe8p-1, 0x1.297dbap-4 }, + { 0x1.f67952p-1, 0x1.21df9ap-4 }, + { 0x1.f6c0dcp-1, 0x1.1a6a96p-4 }, + { 0x1.f7068cp-1, 0x1.131e14p-4 }, + { 0x1.f74a6ep-1, 0x1.0bf97ep-4 }, + { 0x1.f78c8cp-1, 0x1.04fc3ap-4 }, + { 0x1.f7cceep-1, 0x1.fc4b5ep-5 }, + { 0x1.f80ba2p-1, 0x1.eeea8cp-5 }, + { 0x1.f848acp-1, 0x1.e1d4d0p-5 }, + { 0x1.f8841ap-1, 0x1.d508fap-5 }, + { 0x1.f8bdf2p-1, 0x1.c885e0p-5 }, + { 0x1.f8f63ep-1, 0x1.bc4a54p-5 }, + { 0x1.f92d08p-1, 0x1.b05530p-5 }, + { 0x1.f96256p-1, 0x1.a4a54ap-5 }, + { 0x1.f99634p-1, 0x1.99397ap-5 }, + { 0x1.f9c8a8p-1, 0x1.8e109cp-5 }, + { 0x1.f9f9bap-1, 0x1.83298ep-5 }, + { 0x1.fa2974p-1, 0x1.78832cp-5 }, + { 0x1.fa57dep-1, 0x1.6e1c58p-5 }, + { 0x1.fa84fep-1, 0x1.63f3f6p-5 }, + { 0x1.fab0dep-1, 0x1.5a08e8p-5 }, + { 0x1.fadb84p-1, 0x1.505a18p-5 }, + { 0x1.fb04f6p-1, 0x1.46e66cp-5 }, + { 0x1.fb2d40p-1, 0x1.3dacd2p-5 }, + { 0x1.fb5464p-1, 0x1.34ac36p-5 }, + { 0x1.fb7a6cp-1, 0x1.2be38cp-5 }, + { 0x1.fb9f60p-1, 0x1.2351c2p-5 }, + { 0x1.fbc344p-1, 0x1.1af5d2p-5 }, + { 0x1.fbe61ep-1, 0x1.12ceb4p-5 }, + { 0x1.fc07fap-1, 0x1.0adb60p-5 }, + { 0x1.fc28d8p-1, 0x1.031ad6p-5 }, + { 0x1.fc48c2p-1, 0x1.f7182ap-6 }, + { 0x1.fc67bcp-1, 0x1.e85c44p-6 }, + { 0x1.fc85d0p-1, 0x1.da0006p-6 }, + { 0x1.fca2fep-1, 0x1.cc0180p-6 }, + { 0x1.fcbf52p-1, 0x1.be5ecep-6 }, + { 0x1.fcdaccp-1, 0x1.b1160ap-6 }, + { 0x1.fcf576p-1, 0x1.a4255ap-6 }, + { 0x1.fd0f54p-1, 0x1.978ae8p-6 }, + { 0x1.fd286ap-1, 0x1.8b44e6p-6 }, + { 0x1.fd40bep-1, 0x1.7f5188p-6 }, + { 0x1.fd5856p-1, 0x1.73af0cp-6 }, + { 0x1.fd6f34p-1, 0x1.685bb6p-6 }, + { 0x1.fd8562p-1, 0x1.5d55ccp-6 }, + { 0x1.fd9ae2p-1, 0x1.529b9ep-6 }, + { 0x1.fdafb8p-1, 0x1.482b84p-6 }, + { 0x1.fdc3e8p-1, 0x1.3e03d8p-6 }, + { 0x1.fdd77ap-1, 0x1.3422fep-6 }, + { 0x1.fdea6ep-1, 0x1.2a875cp-6 }, + { 0x1.fdfcccp-1, 0x1.212f62p-6 }, + { 0x1.fe0e96p-1, 0x1.181984p-6 }, + { 0x1.fe1fd0p-1, 0x1.0f443ep-6 }, + { 0x1.fe3080p-1, 0x1.06ae14p-6 }, + { 0x1.fe40a6p-1, 0x1.fcab14p-7 }, + { 0x1.fe504cp-1, 0x1.ec7262p-7 }, + { 0x1.fe5f70p-1, 0x1.dcaf36p-7 }, + { 0x1.fe6e18p-1, 0x1.cd5ecap-7 }, + { 0x1.fe7c46p-1, 0x1.be7e5ap-7 }, + { 0x1.fe8a00p-1, 0x1.b00b38p-7 }, + { 0x1.fe9748p-1, 0x1.a202bep-7 }, + { 0x1.fea422p-1, 0x1.94624ep-7 }, + { 0x1.feb090p-1, 0x1.87275ep-7 }, + { 0x1.febc96p-1, 0x1.7a4f6ap-7 }, + { 0x1.fec836p-1, 0x1.6dd7fep-7 }, + { 0x1.fed374p-1, 0x1.61beaep-7 }, + { 0x1.fede52p-1, 0x1.56011cp-7 }, + { 0x1.fee8d4p-1, 0x1.4a9cf6p-7 }, + { 0x1.fef2fep-1, 0x1.3f8ff6p-7 }, + { 0x1.fefccep-1, 0x1.34d7dcp-7 }, + { 0x1.ff064cp-1, 0x1.2a727ap-7 }, + { 0x1.ff0f76p-1, 0x1.205dacp-7 }, + { 0x1.ff1852p-1, 0x1.169756p-7 }, + { 0x1.ff20e0p-1, 0x1.0d1d6ap-7 }, + { 0x1.ff2924p-1, 0x1.03ede2p-7 }, + { 0x1.ff3120p-1, 0x1.f60d8ap-8 }, + { 0x1.ff38d6p-1, 0x1.e4cc4ap-8 }, + { 0x1.ff4048p-1, 0x1.d4143ap-8 }, + { 0x1.ff4778p-1, 0x1.c3e1a6p-8 }, + { 0x1.ff4e68p-1, 0x1.b430ecp-8 }, + { 0x1.ff551ap-1, 0x1.a4fe84p-8 }, + { 0x1.ff5b90p-1, 0x1.9646f4p-8 }, + { 0x1.ff61ccp-1, 0x1.8806d8p-8 }, + { 
0x1.ff67d0p-1, 0x1.7a3adep-8 }, + { 0x1.ff6d9ep-1, 0x1.6cdfccp-8 }, + { 0x1.ff7338p-1, 0x1.5ff276p-8 }, + { 0x1.ff789ep-1, 0x1.536fc2p-8 }, + { 0x1.ff7dd4p-1, 0x1.4754acp-8 }, + { 0x1.ff82dap-1, 0x1.3b9e40p-8 }, + { 0x1.ff87b2p-1, 0x1.30499cp-8 }, + { 0x1.ff8c5cp-1, 0x1.2553eep-8 }, + { 0x1.ff90dcp-1, 0x1.1aba78p-8 }, + { 0x1.ff9532p-1, 0x1.107a8cp-8 }, + { 0x1.ff9960p-1, 0x1.06918cp-8 }, + { 0x1.ff9d68p-1, 0x1.f9f9d0p-9 }, + { 0x1.ffa14ap-1, 0x1.e77448p-9 }, + { 0x1.ffa506p-1, 0x1.d58da6p-9 }, + { 0x1.ffa8a0p-1, 0x1.c4412cp-9 }, + { 0x1.ffac18p-1, 0x1.b38a3ap-9 }, + { 0x1.ffaf6ep-1, 0x1.a36454p-9 }, + { 0x1.ffb2a6p-1, 0x1.93cb12p-9 }, + { 0x1.ffb5bep-1, 0x1.84ba30p-9 }, + { 0x1.ffb8b8p-1, 0x1.762d84p-9 }, + { 0x1.ffbb98p-1, 0x1.682100p-9 }, + { 0x1.ffbe5ap-1, 0x1.5a90b0p-9 }, + { 0x1.ffc102p-1, 0x1.4d78bcp-9 }, + { 0x1.ffc390p-1, 0x1.40d564p-9 }, + { 0x1.ffc606p-1, 0x1.34a306p-9 }, + { 0x1.ffc862p-1, 0x1.28de12p-9 }, + { 0x1.ffcaa8p-1, 0x1.1d8318p-9 }, + { 0x1.ffccd8p-1, 0x1.128ebap-9 }, + { 0x1.ffcef4p-1, 0x1.07fdb4p-9 }, + { 0x1.ffd0fap-1, 0x1.fb99b8p-10 }, + { 0x1.ffd2eap-1, 0x1.e7f232p-10 }, + { 0x1.ffd4cap-1, 0x1.d4fed8p-10 }, + { 0x1.ffd696p-1, 0x1.c2b9d0p-10 }, + { 0x1.ffd84ep-1, 0x1.b11d70p-10 }, + { 0x1.ffd9f8p-1, 0x1.a02436p-10 }, + { 0x1.ffdb90p-1, 0x1.8fc8c8p-10 }, + { 0x1.ffdd18p-1, 0x1.8005f0p-10 }, + { 0x1.ffde90p-1, 0x1.70d6a4p-10 }, + { 0x1.ffdffap-1, 0x1.6235fcp-10 }, + { 0x1.ffe154p-1, 0x1.541f34p-10 }, + { 0x1.ffe2a2p-1, 0x1.468daep-10 }, + { 0x1.ffe3e2p-1, 0x1.397ceep-10 }, + { 0x1.ffe514p-1, 0x1.2ce898p-10 }, + { 0x1.ffe63cp-1, 0x1.20cc76p-10 }, + { 0x1.ffe756p-1, 0x1.15246ep-10 }, + { 0x1.ffe866p-1, 0x1.09ec86p-10 }, + { 0x1.ffe96ap-1, 0x1.fe41cep-11 }, + { 0x1.ffea64p-1, 0x1.e97ba4p-11 }, + { 0x1.ffeb54p-1, 0x1.d57f52p-11 }, + { 0x1.ffec3ap-1, 0x1.c245d4p-11 }, + { 0x1.ffed16p-1, 0x1.afc85ep-11 }, + { 0x1.ffedeap-1, 0x1.9e0058p-11 }, + { 0x1.ffeeb4p-1, 0x1.8ce75ep-11 }, + { 0x1.ffef76p-1, 0x1.7c7744p-11 }, + { 0x1.fff032p-1, 0x1.6caa0ep-11 }, + { 0x1.fff0e4p-1, 0x1.5d79ecp-11 }, + { 0x1.fff18ep-1, 0x1.4ee142p-11 }, + { 0x1.fff232p-1, 0x1.40daa4p-11 }, + { 0x1.fff2d0p-1, 0x1.3360ccp-11 }, + { 0x1.fff366p-1, 0x1.266ea8p-11 }, + { 0x1.fff3f6p-1, 0x1.19ff46p-11 }, + { 0x1.fff480p-1, 0x1.0e0de8p-11 }, + { 0x1.fff504p-1, 0x1.0295f0p-11 }, + { 0x1.fff582p-1, 0x1.ef25d4p-12 }, + { 0x1.fff5fcp-1, 0x1.da0110p-12 }, + { 0x1.fff670p-1, 0x1.c5b542p-12 }, + { 0x1.fff6dep-1, 0x1.b23a5ap-12 }, + { 0x1.fff74ap-1, 0x1.9f8894p-12 }, + { 0x1.fff7aep-1, 0x1.8d986ap-12 }, + { 0x1.fff810p-1, 0x1.7c629ap-12 }, + { 0x1.fff86cp-1, 0x1.6be022p-12 }, + { 0x1.fff8c6p-1, 0x1.5c0a38p-12 }, + { 0x1.fff91cp-1, 0x1.4cda54p-12 }, + { 0x1.fff96cp-1, 0x1.3e4a24p-12 }, + { 0x1.fff9bap-1, 0x1.305390p-12 }, + { 0x1.fffa04p-1, 0x1.22f0b4p-12 }, + { 0x1.fffa4cp-1, 0x1.161be4p-12 }, + { 0x1.fffa90p-1, 0x1.09cfa4p-12 }, + { 0x1.fffad0p-1, 0x1.fc0d56p-13 }, + { 0x1.fffb0ep-1, 0x1.e577bcp-13 }, + { 0x1.fffb4ap-1, 0x1.cfd4a6p-13 }, + { 0x1.fffb82p-1, 0x1.bb1a96p-13 }, + { 0x1.fffbb8p-1, 0x1.a74068p-13 }, + { 0x1.fffbecp-1, 0x1.943d4ap-13 }, + { 0x1.fffc1ep-1, 0x1.8208bcp-13 }, + { 0x1.fffc4ep-1, 0x1.709a8ep-13 }, + { 0x1.fffc7ap-1, 0x1.5feadap-13 }, + { 0x1.fffca6p-1, 0x1.4ff208p-13 }, + { 0x1.fffccep-1, 0x1.40a8c2p-13 }, + { 0x1.fffcf6p-1, 0x1.3207fcp-13 }, + { 0x1.fffd1ap-1, 0x1.2408eap-13 }, + { 0x1.fffd3ep-1, 0x1.16a502p-13 }, + { 0x1.fffd60p-1, 0x1.09d5f8p-13 }, + { 0x1.fffd80p-1, 0x1.fb2b7ap-14 }, + { 0x1.fffda0p-1, 0x1.e3bcf4p-14 }, + { 0x1.fffdbep-1, 0x1.cd5528p-14 }, + { 0x1.fffddap-1, 0x1.b7e946p-14 
}, + { 0x1.fffdf4p-1, 0x1.a36eecp-14 }, + { 0x1.fffe0ep-1, 0x1.8fdc1cp-14 }, + { 0x1.fffe26p-1, 0x1.7d2738p-14 }, + { 0x1.fffe3ep-1, 0x1.6b4702p-14 }, + { 0x1.fffe54p-1, 0x1.5a329cp-14 }, + { 0x1.fffe68p-1, 0x1.49e178p-14 }, + { 0x1.fffe7ep-1, 0x1.3a4b60p-14 }, + { 0x1.fffe90p-1, 0x1.2b6876p-14 }, + { 0x1.fffea2p-1, 0x1.1d3120p-14 }, + { 0x1.fffeb4p-1, 0x1.0f9e1cp-14 }, + { 0x1.fffec4p-1, 0x1.02a868p-14 }, + { 0x1.fffed4p-1, 0x1.ec929ap-15 }, + { 0x1.fffee4p-1, 0x1.d4f4b4p-15 }, + { 0x1.fffef2p-1, 0x1.be6abcp-15 }, + { 0x1.ffff00p-1, 0x1.a8e8ccp-15 }, + { 0x1.ffff0cp-1, 0x1.94637ep-15 }, + { 0x1.ffff18p-1, 0x1.80cfdcp-15 }, + { 0x1.ffff24p-1, 0x1.6e2368p-15 }, + { 0x1.ffff30p-1, 0x1.5c540cp-15 }, + { 0x1.ffff3ap-1, 0x1.4b581cp-15 }, + { 0x1.ffff44p-1, 0x1.3b2652p-15 }, + { 0x1.ffff4ep-1, 0x1.2bb5ccp-15 }, + { 0x1.ffff56p-1, 0x1.1cfe02p-15 }, + { 0x1.ffff60p-1, 0x1.0ef6c4p-15 }, + { 0x1.ffff68p-1, 0x1.019842p-15 }, + { 0x1.ffff70p-1, 0x1.e9b5e8p-16 }, + { 0x1.ffff78p-1, 0x1.d16f58p-16 }, + { 0x1.ffff7ep-1, 0x1.ba4f04p-16 }, + { 0x1.ffff84p-1, 0x1.a447b8p-16 }, + { 0x1.ffff8cp-1, 0x1.8f4cccp-16 }, + { 0x1.ffff92p-1, 0x1.7b5224p-16 }, + { 0x1.ffff98p-1, 0x1.684c22p-16 }, + { 0x1.ffff9cp-1, 0x1.562facp-16 }, + { 0x1.ffffa2p-1, 0x1.44f21ep-16 }, + { 0x1.ffffa6p-1, 0x1.34894ap-16 }, + { 0x1.ffffacp-1, 0x1.24eb72p-16 }, + { 0x1.ffffb0p-1, 0x1.160f44p-16 }, + { 0x1.ffffb4p-1, 0x1.07ebd2p-16 }, + { 0x1.ffffb8p-1, 0x1.f4f12ep-17 }, + { 0x1.ffffbcp-1, 0x1.db5ad0p-17 }, + { 0x1.ffffc0p-1, 0x1.c304f0p-17 }, + { 0x1.ffffc4p-1, 0x1.abe09ep-17 }, + { 0x1.ffffc6p-1, 0x1.95df98p-17 }, + { 0x1.ffffcap-1, 0x1.80f43ap-17 }, + { 0x1.ffffccp-1, 0x1.6d1178p-17 }, + { 0x1.ffffd0p-1, 0x1.5a2ae0p-17 }, + { 0x1.ffffd2p-1, 0x1.483488p-17 }, + { 0x1.ffffd4p-1, 0x1.372310p-17 }, + { 0x1.ffffd6p-1, 0x1.26eb9ep-17 }, + { 0x1.ffffd8p-1, 0x1.1783cep-17 }, + { 0x1.ffffdcp-1, 0x1.08e1bap-17 }, + { 0x1.ffffdep-1, 0x1.f5f7d8p-18 }, + { 0x1.ffffdep-1, 0x1.db92b6p-18 }, + { 0x1.ffffe0p-1, 0x1.c282cep-18 }, + { 0x1.ffffe2p-1, 0x1.aab7acp-18 }, + { 0x1.ffffe4p-1, 0x1.94219cp-18 }, + { 0x1.ffffe6p-1, 0x1.7eb1a2p-18 }, + { 0x1.ffffe8p-1, 0x1.6a5972p-18 }, + { 0x1.ffffe8p-1, 0x1.570b6ap-18 }, + { 0x1.ffffeap-1, 0x1.44ba86p-18 }, + { 0x1.ffffeap-1, 0x1.335a62p-18 }, + { 0x1.ffffecp-1, 0x1.22df2ap-18 }, + { 0x1.ffffeep-1, 0x1.133d96p-18 }, + { 0x1.ffffeep-1, 0x1.046aeap-18 }, + { 0x1.fffff0p-1, 0x1.ecb9d0p-19 }, + { 0x1.fffff0p-1, 0x1.d21398p-19 }, + { 0x1.fffff2p-1, 0x1.b8d094p-19 }, + { 0x1.fffff2p-1, 0x1.a0df10p-19 }, + { 0x1.fffff2p-1, 0x1.8a2e26p-19 }, + { 0x1.fffff4p-1, 0x1.74adc8p-19 }, + { 0x1.fffff4p-1, 0x1.604ea8p-19 }, + { 0x1.fffff4p-1, 0x1.4d0232p-19 }, + { 0x1.fffff6p-1, 0x1.3aba86p-19 }, + { 0x1.fffff6p-1, 0x1.296a70p-19 }, + { 0x1.fffff6p-1, 0x1.190562p-19 }, + { 0x1.fffff8p-1, 0x1.097f62p-19 }, + { 0x1.fffff8p-1, 0x1.f59a20p-20 }, + { 0x1.fffff8p-1, 0x1.d9c736p-20 }, + { 0x1.fffff8p-1, 0x1.bf716cp-20 }, + { 0x1.fffffap-1, 0x1.a6852cp-20 }, + { 0x1.fffffap-1, 0x1.8eefd8p-20 }, + { 0x1.fffffap-1, 0x1.789fb8p-20 }, + { 0x1.fffffap-1, 0x1.6383f8p-20 }, + { 0x1.fffffap-1, 0x1.4f8c96p-20 }, + { 0x1.fffffap-1, 0x1.3caa62p-20 }, + { 0x1.fffffcp-1, 0x1.2acee2p-20 }, + { 0x1.fffffcp-1, 0x1.19ec60p-20 }, + { 0x1.fffffcp-1, 0x1.09f5d0p-20 }, + { 0x1.fffffcp-1, 0x1.f5bd96p-21 }, + { 0x1.fffffcp-1, 0x1.d9371ep-21 }, + { 0x1.fffffcp-1, 0x1.be41dep-21 }, + { 0x1.fffffcp-1, 0x1.a4c89ep-21 }, + { 0x1.fffffcp-1, 0x1.8cb738p-21 }, + { 0x1.fffffep-1, 0x1.75fa8ep-21 }, + { 0x1.fffffep-1, 0x1.608078p-21 }, + { 0x1.fffffep-1, 0x1.4c37c0p-21 }, 
+ { 0x1.fffffep-1, 0x1.39100ep-21 }, + { 0x1.fffffep-1, 0x1.26f9e0p-21 }, + { 0x1.fffffep-1, 0x1.15e682p-21 }, + { 0x1.fffffep-1, 0x1.05c804p-21 }, + { 0x1.fffffep-1, 0x1.ed2254p-22 }, + { 0x1.fffffep-1, 0x1.d06ad6p-22 }, + { 0x1.fffffep-1, 0x1.b551c8p-22 }, + { 0x1.fffffep-1, 0x1.9bc0a0p-22 }, + { 0x1.fffffep-1, 0x1.83a200p-22 }, + { 0x1.fffffep-1, 0x1.6ce1aap-22 }, + { 0x1.fffffep-1, 0x1.576c72p-22 }, + { 0x1.fffffep-1, 0x1.43302cp-22 }, + { 0x1.fffffep-1, 0x1.301ba2p-22 }, + { 0x1.fffffep-1, 0x1.1e1e86p-22 }, + { 0x1.fffffep-1, 0x1.0d2966p-22 }, + { 0x1.000000p+0, 0x1.fa5b50p-23 }, + { 0x1.000000p+0, 0x1.dc3ae4p-23 }, + { 0x1.000000p+0, 0x1.bfd756p-23 }, + { 0x1.000000p+0, 0x1.a517dap-23 }, + { 0x1.000000p+0, 0x1.8be4f8p-23 }, + { 0x1.000000p+0, 0x1.74287ep-23 }, + { 0x1.000000p+0, 0x1.5dcd66p-23 }, + { 0x1.000000p+0, 0x1.48bfd4p-23 }, + { 0x1.000000p+0, 0x1.34ecf8p-23 }, + { 0x1.000000p+0, 0x1.224310p-23 }, + { 0x1.000000p+0, 0x1.10b148p-23 }, + }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/erfinv_24u5.c b/contrib/arm-optimized-routines/pl/math/erfinv_24u5.c new file mode 100644 index 000000000000..20e1e361befc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfinv_24u5.c @@ -0,0 +1,81 @@ +/* + * Double-precision inverse error function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "poly_scalar_f64.h" +#include "pl_sig.h" +#define IGNORE_SCALAR_FENV +#include "pl_test.h" + +const static struct +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. */ + double P_17[7], Q_17[7], P_37[8], Q_37[8], P_57[9], Q_57[10]; +} data = { + .P_17 = { 0x1.007ce8f01b2e8p+4, -0x1.6b23cc5c6c6d7p+6, 0x1.74e5f6ceb3548p+7, + -0x1.5200bb15cc6bbp+7, 0x1.05d193233a849p+6, -0x1.148c5474ee5e1p+3, + 0x1.689181bbafd0cp-3 }, + .Q_17 = { 0x1.d8fb0f913bd7bp+3, -0x1.6d7f25a3f1c24p+6, 0x1.a450d8e7f4cbbp+7, + -0x1.bc3480485857p+7, 0x1.ae6b0c504ee02p+6, -0x1.499dfec1a7f5fp+4, + 0x1p+0 }, + .P_37 = { -0x1.f3596123109edp-7, 0x1.60b8fe375999ep-2, -0x1.779bb9bef7c0fp+1, + 0x1.786ea384470a2p+3, -0x1.6a7c1453c85d3p+4, 0x1.31f0fc5613142p+4, + -0x1.5ea6c007d4dbbp+2, 0x1.e66f265ce9e5p-3 }, + .Q_37 = { -0x1.636b2dcf4edbep-7, 0x1.0b5411e2acf29p-2, -0x1.3413109467a0bp+1, + 0x1.563e8136c554ap+3, -0x1.7b77aab1dcafbp+4, 0x1.8a3e174e05ddcp+4, + -0x1.4075c56404eecp+3, 0x1p+0 }, + .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2, + 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3, + 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 }, + .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2, + 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3, + 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2, + 0x1p+0 } +}; + +/* Inverse error function approximation, based on rational approximation as + described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7 + Largest observed error is 24.46 ULP, in the extreme tail: + erfinv(0x1.fd9504351b757p-1) got 0x1.ff72c1092917p+0 + want 0x1.ff72c10929158p+0. 
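+   As an illustrative sanity check (reference value, not one of the
+   tabulated test cases): erfinv (0.5) should return approximately
+   0.476936276204470, since erf of that value is 0.5; the input takes
+   the |x| <= 0.75 branch below with t = 0.25 - 0.5625 = -0.3125.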
 */
+double
+erfinv (double x)
+{
+  double a = fabs (x);
+
+  if (a <= 0.75)
+    {
+      /* Largest observed error in this region is 6.06 ULP:
+         erfinv(0x1.1884650fd2d41p-2) got 0x1.fb65998cbd3fep-3
+                                     want 0x1.fb65998cbd404p-3.  */
+      double t = x * x - 0.5625;
+      return x * horner_6_f64 (t, data.P_17) / horner_6_f64 (t, data.Q_17);
+    }
+
+  if (a <= 0.9375)
+    {
+      /* Largest observed error in this region is 6.95 ULP:
+         erfinv(0x1.a8d65b94d8c6p-1) got 0x1.f08325591b54p-1
+                                    want 0x1.f08325591b547p-1.  */
+      double t = x * x - 0.87890625;
+      return x * horner_7_f64 (t, data.P_37) / horner_7_f64 (t, data.Q_37);
+    }
+
+  double t = 1.0 / (sqrt (-log (1 - a)));
+  return horner_8_f64 (t, data.P_57)
+         / (copysign (t, x) * horner_9_f64 (t, data.Q_57));
+}
+
+PL_SIG (S, D, 1, erfinv, -0.99, 0.99)
+PL_TEST_ULP (erfinv, 24.0)
+PL_TEST_INTERVAL (erfinv, 0, 1, 40000)
+PL_TEST_INTERVAL (erfinv, -0x1p-1022, -1, 40000)
diff --git a/contrib/arm-optimized-routines/pl/math/erfinvf_4u7.c b/contrib/arm-optimized-routines/pl/math/erfinvf_4u7.c
new file mode 100644
index 000000000000..40736da08be8
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/erfinvf_4u7.c
@@ -0,0 +1,74 @@
+/*
+ * Single-precision inverse error function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "poly_scalar_f32.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+const static struct
+{
+  /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
+     coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
+     of the denominator.  */
+  float P_10[3], Q_10[4], P_29[4], Q_29[4], P_50[6], Q_50[3];
+} data = { .P_10 = { -0x1.a31268p+3, 0x1.ac9048p+4, -0x1.293ff6p+3 },
+           .Q_10 = { -0x1.8265eep+3, 0x1.ef5eaep+4, -0x1.12665p+4, 0x1p+0 },
+           .P_29
+           = { -0x1.fc0252p-4, 0x1.119d44p+0, -0x1.f59ee2p+0, 0x1.b13626p-2 },
+           .Q_29 = { -0x1.69952p-4, 0x1.c7b7d2p-1, -0x1.167d7p+1, 0x1p+0 },
+           .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1,
+                     -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 },
+           .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0, 0x1p+0 } };
+
+/* Inverse error function approximation, based on rational approximation as
+   described in
+   J. M. Blair, C. A. Edwards, and J. H. Johnson,
+   "Rational Chebyshev approximations for the inverse of the error function",
+   Math. Comp. 30, pp. 827--830 (1976).
+   https://doi.org/10.1090/S0025-5718-1976-0421040-7
+   Largest error is 4.71 ULP, in the tail region:
+   erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0
+                         want 0x1.b83274p+0.  */
+float
+erfinvf (float x)
+{
+  if (x == 1.0f)
+    return __math_oflowf (0);
+  if (x == -1.0f)
+    return __math_oflowf (1);
+
+  float a = fabsf (x);
+  if (a > 1.0f)
+    return __math_invalidf (x);
+
+  if (a <= 0.75f)
+    {
+      /* Greatest error in this region is 4.60 ULP:
+         erfinvf(0x1.0a98bap-5) got 0x1.d8a93ep-6
+                               want 0x1.d8a948p-6.  */
+      float t = x * x - 0.5625f;
+      return x * horner_2_f32 (t, data.P_10) / horner_3_f32 (t, data.Q_10);
+    }
+  if (a < 0.9375f)
+    {
+      /* Greatest error in this region is 3.79 ULP:
+         erfinvf(0x1.ac82d6p-1) got 0x1.f8fc54p-1
+                               want 0x1.f8fc5cp-1.  */
+      float t = x * x - 0.87890625f;
+      return x * horner_3_f32 (t, data.P_29) / horner_3_f32 (t, data.Q_29);
+    }
+
+  /* Tail region, where error is greatest (and sensitive to sqrt and log1p
+     implementations).
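+     For example, at the lower edge of the region a = 0.9375 gives
+     -log1pf (-a) = -logf (0.0625) = 4 ln 2 ~= 2.7726, hence t ~= 0.6006,
+     and t decreases towards 0 as a approaches 1 (illustrative
+     arithmetic, not a tabulated case).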
*/ + float t = 1.0 / sqrtf (-log1pf (-a)); + return horner_5_f32 (t, data.P_50) + / (copysignf (t, x) * horner_2_f32 (t, data.Q_50)); +} + +PL_SIG (S, F, 1, erfinv, -0.99, 0.99) +PL_TEST_ULP (erfinvf, 4.09) +PL_TEST_SYM_INTERVAL (erfinvf, 0, 1, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfinvl.c b/contrib/arm-optimized-routines/pl/math/erfinvl.c new file mode 100644 index 000000000000..ea4aadfccd00 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfinvl.c @@ -0,0 +1,114 @@ +/* + * Extended precision inverse error function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define _GNU_SOURCE +#include +#include +#include + +#include "math_config.h" +#include "poly_scalar_f64.h" + +#define SQRT_PIl 0x1.c5bf891b4ef6aa79c3b0520d5db9p0l +#define HF_SQRT_PIl 0x1.c5bf891b4ef6aa79c3b0520d5db9p-1l + +const static struct +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. */ + double P_17[7], Q_17[7], P_37[8], Q_37[8], P_57[9], Q_57[10]; +} data = { + .P_17 = { 0x1.007ce8f01b2e8p+4, -0x1.6b23cc5c6c6d7p+6, 0x1.74e5f6ceb3548p+7, + -0x1.5200bb15cc6bbp+7, 0x1.05d193233a849p+6, -0x1.148c5474ee5e1p+3, + 0x1.689181bbafd0cp-3 }, + .Q_17 = { 0x1.d8fb0f913bd7bp+3, -0x1.6d7f25a3f1c24p+6, 0x1.a450d8e7f4cbbp+7, + -0x1.bc3480485857p+7, 0x1.ae6b0c504ee02p+6, -0x1.499dfec1a7f5fp+4, + 0x1p+0 }, + .P_37 = { -0x1.f3596123109edp-7, 0x1.60b8fe375999ep-2, -0x1.779bb9bef7c0fp+1, + 0x1.786ea384470a2p+3, -0x1.6a7c1453c85d3p+4, 0x1.31f0fc5613142p+4, + -0x1.5ea6c007d4dbbp+2, 0x1.e66f265ce9e5p-3 }, + .Q_37 = { -0x1.636b2dcf4edbep-7, 0x1.0b5411e2acf29p-2, -0x1.3413109467a0bp+1, + 0x1.563e8136c554ap+3, -0x1.7b77aab1dcafbp+4, 0x1.8a3e174e05ddcp+4, + -0x1.4075c56404eecp+3, 0x1p+0 }, + .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2, + 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3, + 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 }, + .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2, + 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3, + 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2, + 0x1p+0 } +}; + +/* Inverse error function approximation, based on rational approximation as + described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7. */ +static inline double +__erfinv (double x) +{ + if (x == 1.0) + return __math_oflow (0); + if (x == -1.0) + return __math_oflow (1); + + double a = fabs (x); + if (a > 1) + return __math_invalid (x); + + if (a <= 0.75) + { + double t = x * x - 0.5625; + return x * horner_6_f64 (t, data.P_17) / horner_6_f64 (t, data.Q_17); + } + + if (a <= 0.9375) + { + double t = x * x - 0.87890625; + return x * horner_7_f64 (t, data.P_37) / horner_7_f64 (t, data.Q_37); + } + + double t = 1.0 / (sqrtl (-log1pl (-a))); + return horner_8_f64 (t, data.P_57) + / (copysign (t, x) * horner_9_f64 (t, data.Q_57)); +} + +/* Extended-precision variant, which uses the above (or asymptotic estimate) as + starting point for Newton refinement. This implementation is a port to C of + the version in the SpecialFunctions.jl Julia package, with relaxed stopping + criteria for the Newton refinement. 
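+   The update term used below follows from
+   d/dy erf(y) = (2/sqrt(pi)) * exp(-y^2): a Newton step for
+   erf(y) - x = 0 is dy = (sqrt(pi)/2) * exp(y^2) * (erf(y) - x),
+   which is exactly the HF_SQRT_PIl expression in the loop.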
*/ +long double +erfinvl (long double x) +{ + if (x == 0) + return 0; + + double yf = __erfinv (x); + long double y; + if (isfinite (yf)) + y = yf; + else + { + /* Double overflowed, use asymptotic estimate instead. */ + y = copysignl (sqrtl (-logl (1.0l - fabsl (x)) * SQRT_PIl), x); + if (!isfinite (y)) + return y; + } + + double eps = fabs (yf - nextafter (yf, 0)); + while (true) + { + long double dy = HF_SQRT_PIl * (erfl (y) - x) * exp (y * y); + y -= dy; + /* Stopping criterion is different to Julia implementation, but is enough + to ensure result is accurate when rounded to double-precision. */ + if (fabsl (dy) < eps) + break; + } + return y; +} diff --git a/contrib/arm-optimized-routines/pl/math/estrin.h b/contrib/arm-optimized-routines/pl/math/estrin.h deleted file mode 100644 index f967fb0475b0..000000000000 --- a/contrib/arm-optimized-routines/pl/math/estrin.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Helper macros for double-precision Estrin polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -#if V_SUPPORTED -#define FMA v_fma_f64 -#else -#define FMA fma -#endif - -#include "estrin_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/estrin_wrap.h b/contrib/arm-optimized-routines/pl/math/estrin_wrap.h deleted file mode 100644 index 2ae07001f2cf..000000000000 --- a/contrib/arm-optimized-routines/pl/math/estrin_wrap.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Helper macros for double-precision Estrin polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -// clang-format off -#define ESTRIN_1_(x, c, i) FMA(x, c(1 + i), c(i)) -#define ESTRIN_2_(x, x2, c, i) FMA(x2, c(2 + i), ESTRIN_1_(x, c, i)) -#define ESTRIN_3_(x, x2, c, i) FMA(x2, ESTRIN_1_(x, c, 2 + i), ESTRIN_1_(x, c, i)) -#define ESTRIN_4_(x, x2, x4, c, i) FMA(x4, c(4 + i), ESTRIN_3_(x, x2, c, i)) -#define ESTRIN_5_(x, x2, x4, c, i) FMA(x4, ESTRIN_1_(x, c, 4 + i), ESTRIN_3_(x, x2, c, i)) -#define ESTRIN_6_(x, x2, x4, c, i) FMA(x4, ESTRIN_2_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) -#define ESTRIN_7_(x, x2, x4, c, i) FMA(x4, ESTRIN_3_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) -#define ESTRIN_8_(x, x2, x4, x8, c, i) FMA(x8, c(8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_9_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_1_(x, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_10_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_2_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_11_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_3_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_12_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_4_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_13_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_5_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_14_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_6_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_15_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_7_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) -#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) -#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) -#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, 
c, i)) - -#define ESTRIN_1(x, c) ESTRIN_1_(x, c, 0) -#define ESTRIN_2(x, x2, c) ESTRIN_2_(x, x2, c, 0) -#define ESTRIN_3(x, x2, c) ESTRIN_3_(x, x2, c, 0) -#define ESTRIN_4(x, x2, x4, c) ESTRIN_4_(x, x2, x4, c, 0) -#define ESTRIN_5(x, x2, x4, c) ESTRIN_5_(x, x2, x4, c, 0) -#define ESTRIN_6(x, x2, x4, c) ESTRIN_6_(x, x2, x4, c, 0) -#define ESTRIN_7(x, x2, x4, c) ESTRIN_7_(x, x2, x4, c, 0) -#define ESTRIN_8(x, x2, x4, x8, c) ESTRIN_8_(x, x2, x4, x8, c, 0) -#define ESTRIN_9(x, x2, x4, x8, c) ESTRIN_9_(x, x2, x4, x8, c, 0) -#define ESTRIN_10(x, x2, x4, x8, c) ESTRIN_10_(x, x2, x4, x8, c, 0) -#define ESTRIN_11(x, x2, x4, x8, c) ESTRIN_11_(x, x2, x4, x8, c, 0) -#define ESTRIN_12(x, x2, x4, x8, c) ESTRIN_12_(x, x2, x4, x8, c, 0) -#define ESTRIN_13(x, x2, x4, x8, c) ESTRIN_13_(x, x2, x4, x8, c, 0) -#define ESTRIN_14(x, x2, x4, x8, c) ESTRIN_14_(x, x2, x4, x8, c, 0) -#define ESTRIN_15(x, x2, x4, x8, c) ESTRIN_15_(x, x2, x4, x8, c, 0) -#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0) -#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0) -#define ESTRIN_18(x, x2, x4, x8, x16, c) ESTRIN_18_(x, x2, x4, x8, x16, c, 0) -#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0) -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/estrinf.h b/contrib/arm-optimized-routines/pl/math/estrinf.h deleted file mode 100644 index 175233c6c799..000000000000 --- a/contrib/arm-optimized-routines/pl/math/estrinf.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for single-precision Estrin polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f32 -#else -#define FMA fmaf -#endif - -#include "estrin_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/expf.c b/contrib/arm-optimized-routines/pl/math/expf.c index c325e45d5cc6..cd3cfa925c64 100644 --- a/contrib/arm-optimized-routines/pl/math/expf.c +++ b/contrib/arm-optimized-routines/pl/math/expf.c @@ -59,8 +59,8 @@ optr_aor_exp_f32 (float x) /* Round and convert z to int, the result is in [-150*N, 128*N] and ideally nearest int is used, otherwise the magnitude of r can be bigger which gives larger approximation error. */ - kd = roundtoint (z); - ki = converttoint (z); + kd = round (z); + ki = lround (z); r = z - kd; /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ diff --git a/contrib/arm-optimized-routines/pl/math/expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/expm1_2u5.c index a3faff70cb62..f7d431198614 100644 --- a/contrib/arm-optimized-routines/pl/math/expm1_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/expm1_2u5.c @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrin.h" +#include "poly_scalar_f64.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" @@ -14,14 +14,14 @@ #define Ln2hi 0x1.62e42fefa39efp-1 #define Ln2lo 0x1.abc9e3b39803fp-56 #define Shift 0x1.8p52 -#define TinyBound \ - 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ -#define BigBound 0x1.63108c75a1937p+9 /* Above which expm1(x) overflows. */ -#define NegBound -0x1.740bf7c0d927dp+9 /* Below which expm1(x) rounds to 1. */ +/* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ +#define TinyBound 0x3cc0000000000000 +/* Above which expm1(x) overflows. */ +#define BigBound 0x1.63108c75a1937p+9 +/* Below which expm1(x) rounds to 1. 
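+   (More precisely, the rounding target is -1: expm1(x) tends to -1 as
+   x -> -inf, so below this bound the correctly rounded result is -1.)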
*/ +#define NegBound -0x1.740bf7c0d927dp+9 #define AbsMask 0x7fffffffffffffff -#define C(i) __expm1_poly[i] - /* Approximation for exp(x) - 1 using polynomial on a reduced interval. The maximum error observed error is 2.17 ULP: expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2 @@ -65,7 +65,7 @@ expm1 (double x) and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ double f2 = f * f; double f4 = f2 * f2; - double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + double p = fma (f2, estrin_10_f64 (f, f2, f4, f4 * f4, __expm1_poly), f); /* Assemble the result, using a slight rearrangement to achieve acceptable accuracy. @@ -78,8 +78,7 @@ expm1 (double x) PL_SIG (S, D, 1, expm1, -9.9, 9.9) PL_TEST_ULP (expm1, 1.68) -PL_TEST_INTERVAL (expm1, 0, 0x1p-51, 1000) -PL_TEST_INTERVAL (expm1, -0, -0x1p-51, 1000) +PL_TEST_SYM_INTERVAL (expm1, 0, 0x1p-51, 1000) PL_TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000) PL_TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) PL_TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c index 70b14e48519d..e12c9ba9a8a2 100644 --- a/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c +++ b/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "hornerf.h" +#include "poly_scalar_f32.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" @@ -20,8 +20,6 @@ #define NegLimit \ (-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to 1. */ -#define C(i) __expm1f_poly[i] - /* Approximation for exp(x) - 1 using polynomial on a reduced interval. The maximum error is 1.51 ULP: expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 @@ -62,7 +60,7 @@ expm1f (float x) x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - float p = fmaf (f * f, HORNER_4 (f, C), f); + float p = fmaf (f * f, horner_4_f32 (f, __expm1f_poly), f); /* Assemble the result, using a slight rearrangement to achieve acceptable accuracy. expm1(x) ~= 2^i * (p + 1) - 1 @@ -74,7 +72,8 @@ expm1f (float x) PL_SIG (S, F, 1, expm1, -9.9, 9.9) PL_TEST_ULP (expm1f, 1.02) -PL_TEST_INTERVAL (expm1f, 0, 0x1p-23, 1000) -PL_TEST_INTERVAL (expm1f, -0, -0x1p-23, 1000) +PL_TEST_SYM_INTERVAL (expm1f, 0, 0x1p-23, 1000) PL_TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000) +PL_TEST_INTERVAL (expm1f, 0x1.644716p6, inf, 1000) PL_TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000) +PL_TEST_INTERVAL (expm1f, -0x1.9bbabcp+6, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/finite_pow.h b/contrib/arm-optimized-routines/pl/math/finite_pow.h new file mode 100644 index 000000000000..8944d4fae625 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/finite_pow.h @@ -0,0 +1,365 @@ +/* + * Double-precision x^y function. + * + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Scalar version of pow used for fallbacks in vector implementations. */ + +/* Data is defined in v_pow_log_data.c. */ +#define N_LOG (1 << V_POW_LOG_TABLE_BITS) +#define Off 0x3fe6955500000000 +#define As __v_pow_log_data.poly + +/* Data is defined in v_pow_exp_data.c. */ +#define N_EXP (1 << V_POW_EXP_TABLE_BITS) +#define SignBias (0x800 << V_POW_EXP_TABLE_BITS) +#define SmallExp 0x3c9 /* top12(0x1p-54). 
*/ +#define BigExp 0x408 /* top12(512.0). */ +#define ThresExp 0x03f /* BigExp - SmallExp. */ +#define InvLn2N __v_pow_exp_data.n_over_ln2 +#define Ln2HiN __v_pow_exp_data.ln2_over_n_hi +#define Ln2LoN __v_pow_exp_data.ln2_over_n_lo +#define SBits __v_pow_exp_data.sbits +#define Cs __v_pow_exp_data.poly + +/* Constants associated with pow. */ +#define SmallPowX 0x001 /* top12(0x1p-126). */ +#define BigPowX 0x7ff /* top12(INFINITY). */ +#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ +#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */ +#define BigPowY 0x43e /* top12(0x1.749p62). */ +#define ThresPowY 0x080 /* BigPowY - SmallPowY. */ + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +top12 (double x) +{ + return asuint64 (x) >> 52; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline double +log_inline (uint64_t ix, double *tail) +{ + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + uint64_t tmp = ix - Off; + int i = (tmp >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1); + int k = (int64_t) tmp >> 52; /* arithmetic shift. */ + uint64_t iz = ix - (tmp & 0xfffULL << 52); + double z = asdouble (iz); + double kd = (double) k; + + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ + double invc = __v_pow_log_data.invc[i]; + double logc = __v_pow_log_data.logc[i]; + double logctail = __v_pow_log_data.logctail[i]; + + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + double r = fma (z, invc, -1.0); + + /* k*Ln2 + log(c) + r. */ + double t1 = kd * __v_pow_log_data.ln2_hi + logc; + double t2 = t1 + r; + double lo1 = kd * __v_pow_log_data.ln2_lo + logctail; + double lo2 = t1 - t2 + r; + + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double ar = As[0] * r; + double ar2 = r * ar; + double ar3 = r * ar2; + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + double hi = t2 + ar2; + double lo3 = fma (ar, r, -ar2); + double lo4 = t2 - hi + ar2; + /* p = log1p(r) - r - A[0]*r*r. */ + double p = (ar3 + * (As[1] + r * As[2] + + ar2 * (As[3] + r * As[4] + ar2 * (As[5] + r * As[6])))); + double lo = lo1 + lo2 + lo3 + lo4 + p; + double y = hi + lo; + *tail = hi - y + lo; + return y; +} + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double +special_case (double tmp, uint64_t sbits, uint64_t ki) +{ + double scale, y; + + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + y = 0x1p1009 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + /* Note: sbits is signed scale. 
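+     (that is, sbits still carries the sign bit of the final result via
+     sign_bias, so the scaling below preserves the sign)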
*/ + scale = asdouble (sbits); + y = scale + scale * tmp; +#if WANT_SIMD_EXCEPT + if (fabs (y) < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double hi, lo, one = 1.0; + if (y < 0.0) + one = -1.0; + lo = scale - y + scale * tmp; + hi = one + y; + lo = one - hi + y + lo; + y = eval_as_double (hi + lo) - one; + /* Fix the sign of 0. */ + if (y == 0.0) + y = asdouble (sbits & 0x8000000000000000); + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } +#endif + y = 0x1p-1022 * y; + return check_uflow (eval_as_double (y)); +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ +static inline double +exp_inline (double x, double xtail, uint32_t sign_bias) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - SmallExp >= ThresExp)) + { + if (abstop - SmallExp >= 0x80000000) + { + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return sign_bias ? -1.0 : 1.0; + } + if (abstop >= top12 (1024.0)) + { + /* Note: inf and nan are already handled. */ + /* Skip errno handling. */ +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? __math_uflow (sign_bias) + : __math_oflow (sign_bias); +#else + double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY; + return sign_bias ? -res_uoflow : res_uoflow; +#endif + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = (ki + sign_bias) << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return eval_as_double (scale + scale * tmp); +} + +/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + A version of exp_inline that is not inlined and for which sign_bias is + equal to 0. */ +static double NOINLINE +exp_nosignbias (double x, double xtail) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - SmallExp >= ThresExp)) + { + /* Avoid spurious underflow for tiny x. */ + if (abstop - SmallExp >= 0x80000000) + return 1.0; + /* Note: inf and nan are already handled. */ + if (abstop >= top12 (1024.0)) +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0); +#else + return asuint64 (x) >> 63 ? 0.0 : INFINITY; +#endif + /* Large x is special cased below. 
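+         (the sentinel value abstop = 0 routes evaluation through
+         special_case once the polynomial part below has been computed)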
*/ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = ki << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return eval_as_double (scale + scale * tmp); +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline int +checkint (uint64_t iy) +{ + int e = iy >> 52 & 0x7ff; + if (e < 0x3ff) + return 0; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + return 0; + if (iy & (1ULL << (0x3ff + 52 - e))) + return 1; + return 2; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline int +zeroinfnan (uint64_t i) +{ + return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1; +} + +static double NOINLINE +__pl_finite_pow (double x, double y) +{ + uint32_t sign_bias = 0; + uint64_t ix, iy; + uint32_t topx, topy; + + ix = asuint64 (x); + iy = asuint64 (y); + topx = top12 (x); + topy = top12 (y); + if (unlikely (topx - SmallPowX >= ThresPowX + || (topy & 0x7ff) - SmallPowY >= ThresPowY)) + { + /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0 + and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */ + /* Special cases: (x < 0x1p-126 or inf or nan) or + (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */ + if (unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignaling_inline (x) ? x + y : 1.0; + if (ix == asuint64 (1.0)) + return issignaling_inline (y) ? x + y : 1.0; + if (2 * ix > 2 * asuint64 (INFINITY) + || 2 * iy > 2 * asuint64 (INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64 (1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (unlikely (zeroinfnan (ix))) + { + double x2 = x * x; + if (ix >> 63 && checkint (iy) == 1) + { + x2 = -x2; + sign_bias = 1; + } +#if WANT_SIMD_EXCEPT + if (2 * ix == 0 && iy >> 63) + return __math_divzero (sign_bias); +#endif + /* Without the barrier some versions of clang hoist the 1/x2 and + thus division by zero exception can be signaled spuriously. */ + return iy >> 63 ? opt_barrier_double (1 / x2) : x2; + } + /* Here x and y are non-zero finite. */ + if (ix >> 63) + { + /* Finite x < 0. */ + int yint = checkint (iy); + if (yint == 0) +#if WANT_SIMD_EXCEPT + return __math_invalid (x); +#else + return __builtin_nan (""); +#endif + if (yint == 1) + sign_bias = SignBias; + ix &= 0x7fffffffffffffff; + topx &= 0x7ff; + } + if ((topy & 0x7ff) - SmallPowY >= ThresPowY) + { + /* Note: sign_bias == 0 here because y is not odd. */ + if (ix == asuint64 (1.0)) + return 1.0; + /* |y| < 2^-65, x^y ~= 1 + y*log(x). 
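+         Since |log(x)| < 1075 * ln2 ~= 745.1 for every finite positive x
+         on this path, |y * log(x)| < 2^-55, well below half an ULP of
+         1.0, so the result rounds to 1.0: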
*/ + if ((topy & 0x7ff) < SmallPowY) + return 1.0; +#if WANT_SIMD_EXCEPT + return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0) + : __math_uflow (0); +#else + return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0; +#endif + } + if (topx == 0) + { + /* Normalize subnormal x so exponent becomes negative. */ + /* Without the barrier some versions of clang evalutate the mul + unconditionally causing spurious overflow exceptions. */ + ix = asuint64 (opt_barrier_double (x) * 0x1p52); + ix &= 0x7fffffffffffffff; + ix -= 52ULL << 52; + } + } + + double lo; + double hi = log_inline (ix, &lo); + double ehi = y * hi; + double elo = y * lo + fma (y, hi, -ehi); + return exp_inline (ehi, elo, sign_bias); +} diff --git a/contrib/arm-optimized-routines/pl/math/horner.h b/contrib/arm-optimized-routines/pl/math/horner.h deleted file mode 100644 index f92ab6752110..000000000000 --- a/contrib/arm-optimized-routines/pl/math/horner.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for single-precision Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f64 -#else -#define FMA fma -#endif - -#include "horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/horner_wrap.h b/contrib/arm-optimized-routines/pl/math/horner_wrap.h deleted file mode 100644 index 6478968db913..000000000000 --- a/contrib/arm-optimized-routines/pl/math/horner_wrap.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Helper macros for Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -// clang-format off -#define HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i)) -#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i)) -#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i)) -#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i)) -#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i)) -#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i)) -#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i)) -#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i)) -#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i)) -#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i)) -#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i)) -#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i)) - -#define HORNER_1(x, c) HORNER_1_ (x, c, 0) -#define HORNER_2(x, c) HORNER_2_ (x, c, 0) -#define HORNER_3(x, c) HORNER_3_ (x, c, 0) -#define HORNER_4(x, c) HORNER_4_ (x, c, 0) -#define HORNER_5(x, c) HORNER_5_ (x, c, 0) -#define HORNER_6(x, c) HORNER_6_ (x, c, 0) -#define HORNER_7(x, c) HORNER_7_ (x, c, 0) -#define HORNER_8(x, c) HORNER_8_ (x, c, 0) -#define HORNER_9(x, c) HORNER_9_ (x, c, 0) -#define HORNER_10(x, c) HORNER_10_(x, c, 0) -#define HORNER_11(x, c) HORNER_11_(x, c, 0) -#define HORNER_12(x, c) HORNER_12_(x, c, 0) -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/hornerf.h b/contrib/arm-optimized-routines/pl/math/hornerf.h deleted file mode 100644 index 0703817b0fbb..000000000000 --- a/contrib/arm-optimized-routines/pl/math/hornerf.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for double-precision Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f32 -#else -#define FMA fmaf -#endif - -#include "horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/include/mathlib.h b/contrib/arm-optimized-routines/pl/math/include/mathlib.h index af5f9f9c6afb..f886e7f8c07a 100644 --- a/contrib/arm-optimized-routines/pl/math/include/mathlib.h +++ b/contrib/arm-optimized-routines/pl/math/include/mathlib.h @@ -1,4 +1,3 @@ -// clang-format off /* * Public API. * @@ -9,155 +8,84 @@ #ifndef _MATHLIB_H #define _MATHLIB_H +float acosf (float); float acoshf (float); +float asinf (float); float asinhf (float); float atan2f (float, float); float atanf (float); float atanhf (float); float cbrtf (float); float coshf (float); +float cospif (float); float erfcf (float); float erff (float); +float erfinvf (float); +float exp10f (float); float expm1f (float); float log10f (float); float log1pf (float); float sinhf (float); +float sinpif (float); float tanf (float); float tanhf (float); +double acos (double); double acosh (double); +double asin (double); double asinh (double); double atan (double); double atan2 (double, double); double atanh (double); double cbrt (double); double cosh (double); +double cospi (double); double erfc (double); +double erfinv (double); +double exp10 (double); double expm1 (double); double log10 (double); double log1p (double); double sinh (double); +double sinpi (double); double tanh (double); -float __s_acoshf (float); -float __s_asinhf (float); -float __s_atanf (float); -float __s_atan2f (float, float); -float __s_atanhf (float); -float __s_cbrtf (float); -float __s_coshf (float); -float __s_erfcf (float); -float __s_erff (float); -float __s_expm1f (float); -float __s_log10f (float); -float __s_log1pf (float); -float __s_log2f (float); -float __s_sinhf (float); -float __s_tanf (float); -float __s_tanhf (float); - -double __s_acosh (double); -double __s_asinh (double); -double __s_atan (double); -double __s_atan2 (double, double); -double __s_atanh (double); -double __s_cbrt (double); -double __s_cosh (double); -double __s_erf (double); -double __s_erfc (double); -double __s_expm1 (double); -double __s_log10 (double); -double __s_log1p (double); -double __s_log2 (double); -double __s_sinh (double); -double __s_tan (double); -double __s_tanh (double); +long double cospil (long double); +long double erfinvl (long double); +long double exp10l (long double); +long double sinpil (long double); #if __aarch64__ -#if __GNUC__ >= 5 +# if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; -#elif __clang_major__*100+__clang_minor__ >= 305 -typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; -typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -#else -#error Unsupported compiler -#endif +# elif __clang_major__ * 100 + __clang_minor__ >= 305 +typedef __attribute__ ((__neon_vector_type__ (4))) float __f32x4_t; +typedef __attribute__ ((__neon_vector_type__ (2))) double __f64x2_t; +# else +# error Unsupported compiler +# endif -/* Vector functions following the base PCS. 
*/ -__f32x4_t __v_acoshf (__f32x4_t); -__f64x2_t __v_acosh (__f64x2_t); -__f32x4_t __v_asinhf (__f32x4_t); -__f64x2_t __v_asinh (__f64x2_t); -__f32x4_t __v_atanf (__f32x4_t); -__f64x2_t __v_atan (__f64x2_t); -__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); -__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); -__f32x4_t __v_atanhf (__f32x4_t); -__f64x2_t __v_atanh (__f64x2_t); -__f32x4_t __v_cbrtf (__f32x4_t); -__f64x2_t __v_cbrt (__f64x2_t); -__f32x4_t __v_coshf (__f32x4_t); -__f64x2_t __v_cosh (__f64x2_t); -__f32x4_t __v_erff (__f32x4_t); -__f64x2_t __v_erf (__f64x2_t); -__f32x4_t __v_erfcf (__f32x4_t); -__f64x2_t __v_erfc (__f64x2_t); -__f32x4_t __v_expm1f (__f32x4_t); -__f64x2_t __v_expm1 (__f64x2_t); -__f32x4_t __v_log10f (__f32x4_t); -__f64x2_t __v_log10 (__f64x2_t); -__f32x4_t __v_log1pf (__f32x4_t); -__f64x2_t __v_log1p (__f64x2_t); -__f32x4_t __v_log2f (__f32x4_t); -__f64x2_t __v_log2 (__f64x2_t); -__f32x4_t __v_sinhf (__f32x4_t); -__f64x2_t __v_sinh (__f64x2_t); -__f32x4_t __v_tanf (__f32x4_t); -__f64x2_t __v_tan (__f64x2_t); -__f32x4_t __v_tanhf (__f32x4_t); -__f64x2_t __v_tanh (__f64x2_t); +# if __GNUC__ >= 9 || __clang_major__ >= 8 +# define __vpcs __attribute__ ((__aarch64_vector_pcs__)) -#if __GNUC__ >= 9 || __clang_major__ >= 8 -#define __vpcs __attribute__((__aarch64_vector_pcs__)) +typedef struct __f32x4x2_t +{ + __f32x4_t val[2]; +} __f32x4x2_t; -/* Vector functions following the vector PCS. */ -__vpcs __f32x4_t __vn_acoshf (__f32x4_t); -__vpcs __f64x2_t __vn_acosh (__f64x2_t); -__vpcs __f32x4_t __vn_asinhf (__f32x4_t); -__vpcs __f64x2_t __vn_asinh (__f64x2_t); -__vpcs __f32x4_t __vn_atanf (__f32x4_t); -__vpcs __f64x2_t __vn_atan (__f64x2_t); -__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); -__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); -__vpcs __f32x4_t __vn_atanhf (__f32x4_t); -__vpcs __f64x2_t __vn_atanh (__f64x2_t); -__vpcs __f32x4_t __vn_cbrtf (__f32x4_t); -__vpcs __f64x2_t __vn_cbrt (__f64x2_t); -__vpcs __f32x4_t __vn_coshf (__f32x4_t); -__vpcs __f64x2_t __vn_cosh (__f64x2_t); -__vpcs __f32x4_t __vn_erff (__f32x4_t); -__vpcs __f64x2_t __vn_erf (__f64x2_t); -__vpcs __f32x4_t __vn_erfcf (__f32x4_t); -__vpcs __f64x2_t __vn_erfc (__f64x2_t); -__vpcs __f32x4_t __vn_expm1f (__f32x4_t); -__vpcs __f64x2_t __vn_expm1 (__f64x2_t); -__vpcs __f32x4_t __vn_log10f (__f32x4_t); -__vpcs __f64x2_t __vn_log10 (__f64x2_t); -__vpcs __f32x4_t __vn_log1pf (__f32x4_t); -__vpcs __f64x2_t __vn_log1p (__f64x2_t); -__vpcs __f32x4_t __vn_log2f (__f32x4_t); -__vpcs __f64x2_t __vn_log2 (__f64x2_t); -__vpcs __f32x4_t __vn_sinhf (__f32x4_t); -__vpcs __f64x2_t __vn_sinh (__f64x2_t); -__vpcs __f32x4_t __vn_tanf (__f32x4_t); -__vpcs __f64x2_t __vn_tan (__f64x2_t); -__vpcs __f32x4_t __vn_tanhf (__f32x4_t); -__vpcs __f64x2_t __vn_tanh (__f64x2_t); +typedef struct __f64x2x2_t +{ + __f64x2_t val[2]; +} __f64x2x2_t; /* Vector functions following the vector PCS using ABI names. 
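+   The mangled names follow the AArch64 vector function ABI: in
+   _ZGVnN4v_expm1f, 'n' selects AdvSIMD ('s' below selects SVE), 'N'
+   means unmasked, '4' is the number of lanes and 'v' marks a vector
+   argument, all prefixed to the scalar name.  A minimal caller
+   (illustrative only, not part of this header):
+
+     #include <arm_neon.h>    /* for float32x4_t and vdupq_n_f32.  */
+     float32x4_t y = _ZGVnN4v_expm1f (vdupq_n_f32 (0.5f));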
*/ __vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); @@ -168,77 +96,111 @@ __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); +__vpcs __f32x4x2_t _ZGVnN4v_cexpif (__f32x4_t); +__vpcs __f64x2x2_t _ZGVnN2v_cexpi (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_cospif (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_cospi (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_erfinvf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_erfinv (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t); +__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_sinpif (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_sinpi (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t); +__vpcs void _ZGVnN4vl4l4_sincosf (__f32x4_t, __f32x4_t *, __f32x4_t *); +__vpcs void _ZGVnN2vl8l8_sincos (__f64x2_t, __f64x2_t *, __f64x2_t *); -#endif +# endif -#if WANT_SVE_MATH -#include -svfloat32_t __sv_atan2f_x (svfloat32_t, svfloat32_t, svbool_t); -svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t); -svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); -svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); -svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); -svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); -svfloat32_t __sv_erff_x (svfloat32_t, svbool_t); -svfloat64_t __sv_erf_x (svfloat64_t, svbool_t); -svfloat64_t __sv_erfc_x (svfloat64_t, svbool_t); -svfloat32_t __sv_expf_x (svfloat32_t, svbool_t); -svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); -svfloat64_t __sv_log_x (svfloat64_t, svbool_t); -svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); -svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); -svfloat32_t __sv_log2f_x (svfloat32_t, svbool_t); -svfloat64_t __sv_log2_x (svfloat64_t, svbool_t); -svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t); -svfloat64_t __sv_powi_x (svfloat64_t, svint64_t, svbool_t); -svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); -svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); -svfloat32_t __sv_tanf_x (svfloat32_t, svbool_t); -/* SVE ABI names. 
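
/* A minimal sketch of calling the vector-PCS ABI names above directly
   (assumptions: AArch64 toolchain with __vpcs support, library linked;
   demo_v4 is an illustrative name).  The sincos entry writes the sine and
   cosine vectors through its two pointer arguments, sine first by the
   usual sincos convention.  */
#include <arm_neon.h>
#include "mathlib.h"

float32x4_t
demo_v4 (float32x4_t x)
{
  float32x4_t y = _ZGVnN4v_expm1f (x); /* four lanes of expm1f */
  float32x4_t s, c;
  _ZGVnN4vl4l4_sincosf (x, &s, &c);    /* sinf and cosf in one pass */
  return vfmaq_f32 (y, s, c);          /* y + s*c, merely to consume both */
}
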
*/ +# if WANT_SVE_MATH +# include +svfloat32_t _ZGVsMxv_acoshf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_acosh (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_acosf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_acos (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_asinhf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_asinh (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_asinf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_asin (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_atanhf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_atanh (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_cbrtf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_cbrt (svfloat64_t, svbool_t); +svfloat32x2_t _ZGVsMxv_cexpif (svfloat32_t, svbool_t); +svfloat64x2_t _ZGVsMxv_cexpi (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_coshf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_cosh (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_cospif (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_cospi (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t); svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_erfcf (svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_exp (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_exp10f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_exp10 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_exp2f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_exp2 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_expm1f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_expm1 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxvv_hypotf (svfloat32_t, svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxvv_hypot (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_log1pf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log1p (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxvv_powi(svfloat32_t, svint32_t, svbool_t); -svfloat64_t _ZGVsMxvv_powk(svfloat64_t, svint64_t, svbool_t); +svfloat32_t _ZGVsMxvv_powi (svfloat32_t, svint32_t, svbool_t); +svfloat64_t _ZGVsMxvv_powk (svfloat64_t, svint64_t, svbool_t); +svfloat32_t _ZGVsMxvv_powf (svfloat32_t, svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxvv_pow (svfloat64_t, svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_sinhf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_sinh (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_sinpif (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_sinpi (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_tanhf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_tanh (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t); -#endif +svfloat64_t _ZGVsMxv_tan (svfloat64_t, svbool_t); +void _ZGVsMxvl4l4_sincosf (svfloat32_t, float *, float *, svbool_t); +void _ZGVsMxvl8l8_sincos (svfloat64_t, double *, double *, 
svbool_t); +# endif #endif #endif -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/include/pl_test.h b/contrib/arm-optimized-routines/pl/math/include/pl_test.h index 6a81360ba287..3a3407e337b8 100644 --- a/contrib/arm-optimized-routines/pl/math/include/pl_test.h +++ b/contrib/arm-optimized-routines/pl/math/include/pl_test.h @@ -10,11 +10,6 @@ /* Emit max ULP threshold - silenced for building the routine. */ #define PL_TEST_ULP(f, l) -/* Emit alias. The PL_TEST_ALIAS declaration is piggy-backed on top of - strong_alias. Use PL_ALIAS instead of strong_alias to make sure the alias is - also added to the test suite. */ -#define PL_ALIAS(a, b) strong_alias (a, b) - /* Emit routine name if e == 1 and f is expected to correctly trigger fenv exceptions. e allows declaration to be emitted conditionally upon certain build flags - defer expansion by one pass to allow those flags to be expanded @@ -23,4 +18,7 @@ #define PL_TEST_EXPECT_FENV_ALWAYS(f) #define PL_TEST_INTERVAL(f, lo, hi, n) +#define PL_TEST_SYM_INTERVAL(f, lo, hi, n) #define PL_TEST_INTERVAL_C(f, lo, hi, n, c) +#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c) +#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) diff --git a/contrib/arm-optimized-routines/pl/math/log1p_2u.c b/contrib/arm-optimized-routines/pl/math/log1p_2u.c index 23c8ed4a1914..f9491ce52b44 100644 --- a/contrib/arm-optimized-routines/pl/math/log1p_2u.c +++ b/contrib/arm-optimized-routines/pl/math/log1p_2u.c @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrin.h" +#include "poly_scalar_f64.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" @@ -21,7 +21,6 @@ #define Rt2MOne 0x3fda827999fcef32 #define AbsMask 0x7fffffffffffffff #define ExpM63 0x3c00 -#define C(i) __log1p_data.coeffs[i] static inline double eval_poly (double f) @@ -29,7 +28,7 @@ eval_poly (double f) double f2 = f * f; double f4 = f2 * f2; double f8 = f4 * f4; - return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); + return estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs); } /* log1p approximation using polynomial on reduced interval. Largest @@ -126,11 +125,7 @@ log1p (double x) PL_SIG (S, D, 1, log1p, -0.9, 10.0) PL_TEST_ULP (log1p, 1.26) -PL_TEST_INTERVAL (log1p, -10.0, 10.0, 10000) -PL_TEST_INTERVAL (log1p, 0.0, 0x1p-23, 50000) -PL_TEST_INTERVAL (log1p, 0x1p-23, 0.001, 50000) -PL_TEST_INTERVAL (log1p, 0.001, 1.0, 50000) -PL_TEST_INTERVAL (log1p, 0.0, -0x1p-23, 50000) -PL_TEST_INTERVAL (log1p, -0x1p-23, -0.001, 50000) -PL_TEST_INTERVAL (log1p, -0.001, -1.0, 50000) -PL_TEST_INTERVAL (log1p, -1.0, inf, 5000) +PL_TEST_SYM_INTERVAL (log1p, 0.0, 0x1p-23, 50000) +PL_TEST_SYM_INTERVAL (log1p, 0x1p-23, 0.001, 50000) +PL_TEST_SYM_INTERVAL (log1p, 0.001, 1.0, 50000) +PL_TEST_SYM_INTERVAL (log1p, 1.0, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c index fcfd05a6fcb7..e99174853720 100644 --- a/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c +++ b/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "hornerf.h" +#include "poly_scalar_f32.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" @@ -53,7 +53,7 @@ eval_poly (float m, uint32_t e) x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ... Hence approximation has the form m + m^2 * P(m) where P(x) = C1 + C2 * x + C3 * x^2 + ... . 
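
/* The symmetric-interval macro halves the boilerplate above: judging by the
   interval pairs it replaces in log1p's list, in the test harness
     PL_TEST_SYM_INTERVAL (f, lo, hi, n)
   stands for the pair
     PL_TEST_INTERVAL (f, lo, hi, n)
     PL_TEST_INTERVAL (f, -lo, -hi, n)
   while here in pl_test.h it is a no-op stub like the other PL_TEST_*
   macros, which are silenced when building the routines.  */
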
*/ - return fmaf (m, m * HORNER_8 (m, C), m); + return fmaf (m, m * horner_8_f32 (m, __log1pf_data.coeffs), m); #else #error No log1pf approximation exists with the requested precision. Options are 13 or 25. @@ -155,11 +155,7 @@ log1pf (float x) PL_SIG (S, F, 1, log1p, -0.9, 10.0) PL_TEST_ULP (log1pf, 1.52) -PL_TEST_INTERVAL (log1pf, -10.0, 10.0, 10000) -PL_TEST_INTERVAL (log1pf, 0.0, 0x1p-23, 50000) -PL_TEST_INTERVAL (log1pf, 0x1p-23, 0.001, 50000) -PL_TEST_INTERVAL (log1pf, 0.001, 1.0, 50000) -PL_TEST_INTERVAL (log1pf, 0.0, -0x1p-23, 50000) -PL_TEST_INTERVAL (log1pf, -0x1p-23, -0.001, 50000) -PL_TEST_INTERVAL (log1pf, -0.001, -1.0, 50000) -PL_TEST_INTERVAL (log1pf, -1.0, inf, 5000) +PL_TEST_SYM_INTERVAL (log1pf, 0.0, 0x1p-23, 50000) +PL_TEST_SYM_INTERVAL (log1pf, 0x1p-23, 0.001, 50000) +PL_TEST_SYM_INTERVAL (log1pf, 0.001, 1.0, 50000) +PL_TEST_SYM_INTERVAL (log1pf, 1.0, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/math_config.h b/contrib/arm-optimized-routines/pl/math/math_config.h index dccb3ce4c775..c3dd8f2db8c7 100644 --- a/contrib/arm-optimized-routines/pl/math/math_config.h +++ b/contrib/arm-optimized-routines/pl/math/math_config.h @@ -13,9 +13,9 @@ #ifndef WANT_ROUNDING /* If defined to 1, return correct results for special cases in non-nearest - rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f). - This may be set to 0 if there is no fenv support or if math functions only - get called in round to nearest mode. */ + rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than + -0.0f). This may be set to 0 if there is no fenv support or if math + functions only get called in round to nearest mode. */ # define WANT_ROUNDING 1 #endif #ifndef WANT_ERRNO @@ -27,33 +27,34 @@ #ifndef WANT_SIMD_EXCEPT /* If defined to 1, trigger fp exceptions in vector routines, consistently with behaviour expected from the corresponding scalar routine. */ -#define WANT_SIMD_EXCEPT 0 +# define WANT_SIMD_EXCEPT 0 #endif /* Compiler can inline round as a single instruction. */ #ifndef HAVE_FAST_ROUND # if __aarch64__ -# define HAVE_FAST_ROUND 1 +# define HAVE_FAST_ROUND 1 # else -# define HAVE_FAST_ROUND 0 +# define HAVE_FAST_ROUND 0 # endif #endif /* Compiler can inline lround, but not (long)round(x). */ #ifndef HAVE_FAST_LROUND -# if __aarch64__ && (100*__GNUC__ + __GNUC_MINOR__) >= 408 && __NO_MATH_ERRNO__ -# define HAVE_FAST_LROUND 1 +# if __aarch64__ && (100 * __GNUC__ + __GNUC_MINOR__) >= 408 \ + && __NO_MATH_ERRNO__ +# define HAVE_FAST_LROUND 1 # else -# define HAVE_FAST_LROUND 0 +# define HAVE_FAST_LROUND 0 # endif #endif /* Compiler can inline fma as a single instruction. */ #ifndef HAVE_FAST_FMA # if defined FP_FAST_FMA || __aarch64__ -# define HAVE_FAST_FMA 1 +# define HAVE_FAST_FMA 1 # else -# define HAVE_FAST_FMA 0 +# define HAVE_FAST_FMA 0 # endif #endif @@ -62,9 +63,9 @@ to interpose math functions with both static and dynamic linking. 
*/ #ifndef USE_GLIBC_ABI # if __GNUC__ -# define USE_GLIBC_ABI 1 +# define USE_GLIBC_ABI 1 # else -# define USE_GLIBC_ABI 0 +# define USE_GLIBC_ABI 0 # endif #endif @@ -76,15 +77,15 @@ # define likely(x) __builtin_expect (!!(x), 1) # define unlikely(x) __builtin_expect (x, 0) # if __GNUC__ >= 9 -# define attribute_copy(f) __attribute__ ((copy (f))) +# define attribute_copy(f) __attribute__ ((copy (f))) # else -# define attribute_copy(f) +# define attribute_copy(f) # endif -# define strong_alias(f, a) \ - extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); -# define hidden_alias(f, a) \ - extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ - attribute_copy (f); +# define strong_alias(f, a) \ + extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); +# define hidden_alias(f, a) \ + extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ + attribute_copy (f); #else # define HIDDEN # define NOINLINE @@ -93,6 +94,31 @@ # define unlikely(x) (x) #endif +/* Return ptr but hide its value from the compiler so accesses through it + cannot be optimized based on the contents. */ +#define ptr_barrier(ptr) \ + ({ \ + __typeof (ptr) __ptr = (ptr); \ + __asm("" : "+r"(__ptr)); \ + __ptr; \ + }) + +/* Symbol renames to avoid libc conflicts. */ +#define __math_oflowf arm_math_oflowf +#define __math_uflowf arm_math_uflowf +#define __math_may_uflowf arm_math_may_uflowf +#define __math_divzerof arm_math_divzerof +#define __math_oflow arm_math_oflow +#define __math_uflow arm_math_uflow +#define __math_may_uflow arm_math_may_uflow +#define __math_divzero arm_math_divzero +#define __math_invalidf arm_math_invalidf +#define __math_invalid arm_math_invalid +#define __math_check_oflow arm_math_check_oflow +#define __math_check_uflow arm_math_check_uflow +#define __math_check_oflowf arm_math_check_oflowf +#define __math_check_uflowf arm_math_check_uflowf + #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ @@ -128,7 +154,7 @@ asuint (float f) { float f; uint32_t i; - } u = {f}; + } u = { f }; return u.i; } @@ -139,7 +165,7 @@ asfloat (uint32_t i) { uint32_t i; float f; - } u = {i}; + } u = { i }; return u.f; } @@ -150,7 +176,7 @@ asuint64 (double f) { double f; uint64_t i; - } u = {f}; + } u = { f }; return u.i; } @@ -161,7 +187,7 @@ asdouble (uint64_t i) { uint64_t i; double f; - } u = {i}; + } u = { i }; return u.f; } @@ -320,10 +346,26 @@ check_uflowf (float x) extern const struct erff_data { - float erff_poly_A[6]; - float erff_poly_B[7]; + struct + { + float erf, scale; + } tab[513]; } __erff_data HIDDEN; +extern const struct sv_erff_data +{ + float erf[513]; + float scale[513]; +} __sv_erff_data HIDDEN; + +extern const struct erfcf_data +{ + struct + { + float erfc, scale; + } tab[645]; +} __erfcf_data HIDDEN; + /* Data for logf and log10f. */ #define LOGF_TABLE_BITS 4 #define LOGF_POLY_ORDER 4 @@ -349,9 +391,15 @@ extern const struct log10_data double invln10; double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). */ double poly1[LOG10_POLY1_ORDER - 1]; - struct {double invc, logc;} tab[1 << LOG10_TABLE_BITS]; + struct + { + double invc, logc; + } tab[1 << LOG10_TABLE_BITS]; #if !HAVE_FAST_FMA - struct {double chi, clo;} tab2[1 << LOG10_TABLE_BITS]; + struct + { + double chi, clo; + } tab2[1 << LOG10_TABLE_BITS]; #endif } __log10_data HIDDEN; @@ -374,44 +422,38 @@ extern const struct exp_data double poly[4]; /* Last four coefficients. 
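
/* A minimal sketch of how ptr_barrier is used inside a routine that
   includes math_config.h (the struct and function below are illustrative):
   coefficient tables are loaded through the barrier so the compiler treats
   their contents as opaque and keeps the table accesses as real loads.  */
#include "math_config.h"

static const struct example_data { double c0, c1; } example_data = { 1.0, 0.5 };

static double
eval_example (double x)
{
  const struct example_data *d = ptr_barrier (&example_data);
  return d->c0 + d->c1 * x; /* cannot be constant-folded through d */
}
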
*/ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; - uint64_t tab[2*(1 << EXP_TABLE_BITS)]; + uint64_t tab[2 * (1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; -#define ERFC_NUM_INTERVALS 20 -#define ERFC_POLY_ORDER 12 -extern const struct erfc_data -{ - double interval_bounds[ERFC_NUM_INTERVALS + 1]; - double poly[ERFC_NUM_INTERVALS][ERFC_POLY_ORDER + 1]; -} __erfc_data HIDDEN; -extern const struct v_erfc_data -{ - double interval_bounds[ERFC_NUM_INTERVALS + 1]; - double poly[ERFC_NUM_INTERVALS + 1][ERFC_POLY_ORDER + 1]; -} __v_erfc_data HIDDEN; - -#define ERFCF_POLY_NCOEFFS 16 -extern const struct erfcf_poly_data -{ - double poly[4][ERFCF_POLY_NCOEFFS]; -} __erfcf_poly_data HIDDEN; - +/* Copied from math/v_exp.h for use in vector exp_tail. */ #define V_EXP_TAIL_TABLE_BITS 8 extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN; -#define V_ERF_NINTS 49 -#define V_ERF_NCOEFFS 10 -extern const struct v_erf_data -{ - double shifts[V_ERF_NINTS]; - double coeffs[V_ERF_NCOEFFS][V_ERF_NINTS]; -} __v_erf_data HIDDEN; +/* Copied from math/v_exp.h for use in vector exp2. */ +#define V_EXP_TABLE_BITS 7 +extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; -#define V_ERFF_NCOEFFS 7 -extern const struct v_erff_data +extern const struct erf_data { - float coeffs[V_ERFF_NCOEFFS][2]; -} __v_erff_data HIDDEN; + struct + { + double erf, scale; + } tab[769]; +} __erf_data HIDDEN; + +extern const struct sv_erf_data +{ + double erf[769]; + double scale[769]; +} __sv_erf_data HIDDEN; + +extern const struct erfc_data +{ + struct + { + double erfc, scale; + } tab[3488]; +} __erfc_data HIDDEN; #define ATAN_POLY_NCOEFFS 20 extern const struct atan_poly_data @@ -465,7 +507,6 @@ extern const struct log1p_data } __log1p_data HIDDEN; #define LOG1PF_2U5 -#define V_LOG1PF_2U5 #define LOG1PF_NCOEFFS 9 extern const struct log1pf_data { @@ -481,61 +522,52 @@ extern const struct tanf_poly_data float poly_cotan[TANF_Q_POLY_NCOEFFS]; } __tanf_poly_data HIDDEN; -#define V_LOG2F_POLY_NCOEFFS 9 -extern const struct v_log2f_data -{ - float poly[V_LOG2F_POLY_NCOEFFS]; -} __v_log2f_data HIDDEN; - #define V_LOG2_TABLE_BITS 7 -#define V_LOG2_POLY_ORDER 6 extern const struct v_log2_data { - double poly[V_LOG2_POLY_ORDER - 1]; + double poly[5]; + double invln2; struct { double invc, log2c; - } tab[1 << V_LOG2_TABLE_BITS]; + } table[1 << V_LOG2_TABLE_BITS]; } __v_log2_data HIDDEN; -#define V_SINF_NCOEFFS 4 -extern const struct sv_sinf_data -{ - float coeffs[V_SINF_NCOEFFS]; -} __sv_sinf_data HIDDEN; - #define V_LOG10_TABLE_BITS 7 -#define V_LOG10_POLY_ORDER 6 extern const struct v_log10_data { + double poly[5]; + double invln10, log10_2; struct { double invc, log10c; - } tab[1 << V_LOG10_TABLE_BITS]; - double poly[V_LOG10_POLY_ORDER - 1]; - double invln10, log10_2; + } table[1 << V_LOG10_TABLE_BITS]; } __v_log10_data HIDDEN; -#define V_LOG10F_POLY_ORDER 9 -extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN; - -#define SV_LOGF_POLY_ORDER 8 -extern const float __sv_logf_poly[SV_LOGF_POLY_ORDER - 1] HIDDEN; - -#define SV_LOG_POLY_ORDER 6 -#define SV_LOG_TABLE_BITS 7 -extern const struct sv_log_data +/* Some data for SVE powf's internal exp and log. 
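
/* The erf/erfc tables above support a lookup-and-correct scheme (a sketch
   inferred from the table layout, not stated by the patch): for a grid
   point r near x, tab[i].erf holds erf(r) and tab[i].scale holds the
   derivative 2/sqrt(pi) * exp(-r*r), so
     erf(x) ~= erf(r) + scale * (x - r) * P(x - r)
   for a short correction polynomial P.  The sizes are consistent with a
   1/128 grid: 769 entries reach r = 6, where double erf has saturated to 1,
   and 3488 entries reach r ~= 27.25, where double erfc underflows.  */
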
*/ +#define V_POWF_EXP2_TABLE_BITS 5 +#define V_POWF_EXP2_N (1 << V_POWF_EXP2_TABLE_BITS) +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_POWF_LOG2_N (1 << V_POWF_LOG2_TABLE_BITS) +extern const struct v_powf_data { - double invc[1 << SV_LOG_TABLE_BITS]; - double logc[1 << SV_LOG_TABLE_BITS]; - double poly[SV_LOG_POLY_ORDER - 1]; -} __sv_log_data HIDDEN; + double invc[V_POWF_LOG2_N]; + double logc[V_POWF_LOG2_N]; + uint64_t scale[V_POWF_EXP2_N]; +} __v_powf_data HIDDEN; -#ifndef SV_EXPF_USE_FEXPA -#define SV_EXPF_USE_FEXPA 0 -#endif -#define SV_EXPF_POLY_ORDER 6 -extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN; +#define V_LOG_POLY_ORDER 6 +#define V_LOG_TABLE_BITS 7 +extern const struct v_log_data +{ + /* Shared data for vector log and log-derived routines (e.g. asinh). */ + double poly[V_LOG_POLY_ORDER - 1]; + double ln2; + struct + { + double invc, logc; + } table[1 << V_LOG_TABLE_BITS]; +} __v_log_data HIDDEN; #define EXPM1F_POLY_ORDER 5 extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN; @@ -564,9 +596,29 @@ extern const struct cbrt_data double table[5]; } __cbrt_data HIDDEN; -extern const struct v_tan_data +#define ASINF_POLY_ORDER 4 +extern const float __asinf_poly[ASINF_POLY_ORDER + 1] HIDDEN; + +#define ASIN_POLY_ORDER 11 +extern const double __asin_poly[ASIN_POLY_ORDER + 1] HIDDEN; + +/* Some data for AdvSIMD and SVE pow's internal exp and log. */ +#define V_POW_EXP_TABLE_BITS 8 +extern const struct v_pow_exp_data { - double neg_half_pi_hi, neg_half_pi_lo; - double poly[9]; -} __v_tan_data HIDDEN; + double poly[3]; + double n_over_ln2, ln2_over_n_hi, ln2_over_n_lo, shift; + uint64_t sbits[1 << V_POW_EXP_TABLE_BITS]; +} __v_pow_exp_data HIDDEN; + +#define V_POW_LOG_TABLE_BITS 7 +extern const struct v_pow_log_data +{ + double poly[7]; /* First coefficient is 1. */ + double ln2_hi, ln2_lo; + double invc[1 << V_POW_LOG_TABLE_BITS]; + double logc[1 << V_POW_LOG_TABLE_BITS]; + double logctail[1 << V_POW_LOG_TABLE_BITS]; +} __v_pow_log_data HIDDEN; + #endif diff --git a/contrib/arm-optimized-routines/pl/math/math_err.c b/contrib/arm-optimized-routines/pl/math/math_err.c index d246a89982de..74db54a5b2cd 100644 --- a/contrib/arm-optimized-routines/pl/math/math_err.c +++ b/contrib/arm-optimized-routines/pl/math/math_err.c @@ -8,7 +8,7 @@ #include "math_config.h" #if WANT_ERRNO -#include +# include /* NOINLINE reduces code size and avoids making math functions non-leaf when the error handling is inlined. */ NOINLINE static double @@ -18,7 +18,7 @@ with_errno (double y, int e) return y; } #else -#define with_errno(x, e) (x) +# define with_errno(x, e) (x) #endif /* NOINLINE reduces code size. */ diff --git a/contrib/arm-optimized-routines/pl/math/math_errf.c b/contrib/arm-optimized-routines/pl/math/math_errf.c index 96271ff18bc1..2b8c6bd25753 100644 --- a/contrib/arm-optimized-routines/pl/math/math_errf.c +++ b/contrib/arm-optimized-routines/pl/math/math_errf.c @@ -8,7 +8,7 @@ #include "math_config.h" #if WANT_ERRNO -#include +# include /* NOINLINE reduces code size and avoids making math functions non-leaf when the error handling is inlined. */ NOINLINE static float @@ -18,7 +18,7 @@ with_errnof (float y, int e) return y; } #else -#define with_errnof(x, e) (x) +# define with_errnof(x, e) (x) #endif /* NOINLINE reduces code size. 
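
/* The invc/logc tables follow the usual log reduction (a sketch; the exact
   scheme lives in the v_log/v_pow sources, not in this header): write
     x = 2^k * c * (1 + r),  with c a table grid point and |r| small,
   so that
     log(x) = k * ln2 + logc + log1p(r),  r = m * invc - 1,
   where m is the mantissa of x, the index i is taken from its top mantissa
   bits, and invc ~= 1/c, logc ~= log(c) come from table[i].  The pow
   variant splits ln2 into ln2_hi/ln2_lo and keeps logctail for the extra
   precision pow needs.  */
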
*/ diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_horner.h b/contrib/arm-optimized-routines/pl/math/pairwise_horner.h deleted file mode 100644 index 6ad98dccd6aa..000000000000 --- a/contrib/arm-optimized-routines/pl/math/pairwise_horner.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for double-precision pairwise Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f64 -#else -#define FMA fma -#endif - -#include "pairwise_horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h b/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h deleted file mode 100644 index e56f059514ad..000000000000 --- a/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Helper macros for pairwise Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -// clang-format off -#define PW_HORNER_1_(x, c, i) FMA(x, c(i + 1), c(i)) -#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_ (x, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_13_(x, x2, c, i) FMA(x2, PW_HORNER_11_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_15_(x, x2, c, i) FMA(x2, PW_HORNER_13_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_17_(x, x2, c, i) FMA(x2, PW_HORNER_15_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) - -#define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0) -#define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0) -#define PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0) -#define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0) -#define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0) -#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0) -#define PAIRWISE_HORNER_13(x, x2, c) PW_HORNER_13_(x, x2, c, 0) -#define PAIRWISE_HORNER_15(x, x2, c) PW_HORNER_15_(x, x2, c, 0) -#define PAIRWISE_HORNER_17(x, x2, c) PW_HORNER_17_(x, x2, c, 0) - -#define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_14_(x, x2, c, i) FMA(x2, PW_HORNER_12_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_16_(x, x2, c, i) FMA(x2, PW_HORNER_14_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_18_(x, x2, c, i) FMA(x2, PW_HORNER_16_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) - -#define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0) -#define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0) -#define PAIRWISE_HORNER_6(x, x2, c) 
PW_HORNER_6_ (x, x2, c, 0) -#define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0) -#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0) -#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0) -#define PAIRWISE_HORNER_14(x, x2, c) PW_HORNER_14_(x, x2, c, 0) -#define PAIRWISE_HORNER_16(x, x2, c) PW_HORNER_16_(x, x2, c, 0) -#define PAIRWISE_HORNER_18(x, x2, c) PW_HORNER_18_(x, x2, c, 0) -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h b/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h deleted file mode 100644 index 784750cde0b6..000000000000 --- a/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for single-precision pairwise Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f32 -#else -#define FMA fmaf -#endif - -#include "pairwise_horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/pl_sig.h b/contrib/arm-optimized-routines/pl/math/pl_sig.h index 686d24f0d9a5..52d988f0e1ce 100644 --- a/contrib/arm-optimized-routines/pl/math/pl_sig.h +++ b/contrib/arm-optimized-routines/pl/math/pl_sig.h @@ -4,35 +4,51 @@ * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. */ + +#define V_NAME_F1(fun) _ZGVnN4v_##fun##f +#define V_NAME_D1(fun) _ZGVnN2v_##fun +#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f +#define V_NAME_D2(fun) _ZGVnN2vv_##fun + +#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f +#define SV_NAME_D1(fun) _ZGVsMxv_##fun +#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f +#define SV_NAME_D2(fun) _ZGVsMxvv_##fun + #define PL_DECL_SF1(fun) float fun##f (float); #define PL_DECL_SF2(fun) float fun##f (float, float); #define PL_DECL_SD1(fun) double fun (double); #define PL_DECL_SD2(fun) double fun (double, double); -#if V_SUPPORTED -#define PL_DECL_VF1(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t); -#define PL_DECL_VF2(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t, v_f32_t); -#define PL_DECL_VD1(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t); -#define PL_DECL_VD2(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t, v_f64_t); +#if WANT_VMATH +# define PL_DECL_VF1(fun) \ + VPCS_ATTR float32x4_t V_NAME_F1 (fun##f) (float32x4_t); +# define PL_DECL_VF2(fun) \ + VPCS_ATTR float32x4_t V_NAME_F2 (fun##f) (float32x4_t, float32x4_t); +# define PL_DECL_VD1(fun) VPCS_ATTR float64x2_t V_NAME_D1 (fun) (float64x2_t); +# define PL_DECL_VD2(fun) \ + VPCS_ATTR float64x2_t V_NAME_D2 (fun) (float64x2_t, float64x2_t); #else -#define PL_DECL_VF1(fun) -#define PL_DECL_VF2(fun) -#define PL_DECL_VD1(fun) -#define PL_DECL_VD2(fun) +# define PL_DECL_VF1(fun) +# define PL_DECL_VF2(fun) +# define PL_DECL_VD1(fun) +# define PL_DECL_VD2(fun) #endif -#if SV_SUPPORTED -#define PL_DECL_SVF1(fun) sv_f32_t __sv_##fun##f_x (sv_f32_t, svbool_t); -#define PL_DECL_SVF2(fun) \ - sv_f32_t __sv_##fun##f_x (sv_f32_t, sv_f32_t, svbool_t); -#define PL_DECL_SVD1(fun) sv_f64_t __sv_##fun##_x (sv_f64_t, svbool_t); -#define PL_DECL_SVD2(fun) \ - sv_f64_t __sv_##fun##_x (sv_f64_t, sv_f64_t, svbool_t); +#if WANT_SVE_MATH +# define PL_DECL_SVF1(fun) \ + svfloat32_t SV_NAME_F1 (fun) (svfloat32_t, svbool_t); +# define PL_DECL_SVF2(fun) \ + svfloat32_t SV_NAME_F2 (fun) (svfloat32_t, svfloat32_t, svbool_t); +# define PL_DECL_SVD1(fun) \ + svfloat64_t SV_NAME_D1 (fun) (svfloat64_t, svbool_t); +# define PL_DECL_SVD2(fun) \ + svfloat64_t 
SV_NAME_D2 (fun) (svfloat64_t, svfloat64_t, svbool_t); #else -#define PL_DECL_SVF1(fun) -#define PL_DECL_SVF2(fun) -#define PL_DECL_SVD1(fun) -#define PL_DECL_SVD2(fun) +# define PL_DECL_SVF1(fun) +# define PL_DECL_SVF2(fun) +# define PL_DECL_SVD1(fun) +# define PL_DECL_SVD2(fun) #endif /* For building the routines, emit function prototype from PL_SIG. This diff --git a/contrib/arm-optimized-routines/pl/math/poly_advsimd_f32.h b/contrib/arm-optimized-routines/pl/math/poly_advsimd_f32.h new file mode 100644 index 000000000000..438e153dff90 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_advsimd_f32.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on single-precision AdvSIMD input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_ADVSIMD_F32_H +#define PL_MATH_POLY_ADVSIMD_F32_H + +#include + +/* Wrap AdvSIMD f32 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f32. */ +#define VTYPE float32x4_t +#define FMA(x, y, z) vfmaq_f32 (z, x, y) +#define VWRAP(f) v_##f##_f32 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_advsimd_f64.h b/contrib/arm-optimized-routines/pl/math/poly_advsimd_f64.h new file mode 100644 index 000000000000..7ea249a91225 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_advsimd_f64.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on double-precision AdvSIMD input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_ADVSIMD_F64_H +#define PL_MATH_POLY_ADVSIMD_F64_H + +#include + +/* Wrap AdvSIMD f64 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f64. */ +#define VTYPE float64x2_t +#define FMA(x, y, z) vfmaq_f64 (z, x, y) +#define VWRAP(f) v_##f##_f64 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_generic.h b/contrib/arm-optimized-routines/pl/math/poly_generic.h new file mode 100644 index 000000000000..3fc25f8762f2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_generic.h @@ -0,0 +1,277 @@ +/* + * Generic helpers for evaluating polynomials with various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef VTYPE +# error Cannot use poly_generic without defining VTYPE +#endif +#ifndef VWRAP +# error Cannot use poly_generic without defining VWRAP +#endif +#ifndef FMA +# error Cannot use poly_generic without defining FMA +#endif + +static inline VTYPE VWRAP (pairwise_poly_3) (VTYPE x, VTYPE x2, + const VTYPE *poly) +{ + /* At order 3, Estrin and Pairwise Horner are identical. 
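
/* Two token-pasting conventions tie the new scheme together.  From
   pl_sig.h above, the ABI-name macros expand, for example, as
     V_NAME_F1 (expm1)  ->  _ZGVnN4v_expm1f
     SV_NAME_D2 (pow)   ->  _ZGVsMxvv_pow
   and each poly_*.h wrapper instantiates the generic helpers once per
   type: under poly_advsimd_f32.h's defines,
     VWRAP (estrin_4)  ->  v_estrin_4_f32
     FMA (a, b, c)     ->  vfmaq_f32 (c, a, b)
   so one generic source serves scalar and AdvSIMD alike, with a predicated
   twin (poly_sve_generic.h, below) for SVE.  */
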
*/ + VTYPE p01 = FMA (poly[1], x, poly[0]); + VTYPE p23 = FMA (poly[3], x, poly[2]); + return FMA (p23, x2, p01); +} + +static inline VTYPE VWRAP (estrin_4) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + return FMA (poly[4], x4, p03); +} +static inline VTYPE VWRAP (estrin_5) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + VTYPE p45 = FMA (poly[5], x, poly[4]); + return FMA (p45, x4, p03); +} +static inline VTYPE VWRAP (estrin_6) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + VTYPE p45 = FMA (poly[5], x, poly[4]); + VTYPE p46 = FMA (poly[6], x2, p45); + return FMA (p46, x4, p03); +} +static inline VTYPE VWRAP (estrin_7) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + VTYPE p47 = VWRAP (pairwise_poly_3) (x, x2, poly + 4); + return FMA (p47, x4, p03); +} +static inline VTYPE VWRAP (estrin_8) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (poly[8], x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_9) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + VTYPE p89 = FMA (poly[9], x, poly[8]); + return FMA (p89, x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_10) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + VTYPE p89 = FMA (poly[9], x, poly[8]); + VTYPE p8_10 = FMA (poly[10], x2, p89); + return FMA (p8_10, x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_11) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + VTYPE p8_11 = VWRAP (pairwise_poly_3) (x, x2, poly + 8); + return FMA (p8_11, x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_12) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_4) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_13) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_5) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_14) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_6) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_15) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_7) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_16) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + return FMA (poly[16], x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} +static inline VTYPE VWRAP (estrin_17) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + VTYPE p16_17 = FMA (poly[17], x, poly[16]); + return FMA (p16_17, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} +static inline VTYPE VWRAP (estrin_18) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + VTYPE p16_17 = FMA (poly[17], x, poly[16]); + VTYPE p16_18 = FMA (poly[18], x2, p16_17); + return FMA (p16_18, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} +static inline VTYPE VWRAP (estrin_19) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + VTYPE p16_19 = VWRAP (pairwise_poly_3) (x, x2, poly + 16); + return 
FMA (p16_19, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} + +static inline VTYPE VWRAP (horner_2) (VTYPE x, const VTYPE *poly) +{ + VTYPE p = FMA (poly[2], x, poly[1]); + return FMA (x, p, poly[0]); +} +static inline VTYPE VWRAP (horner_3) (VTYPE x, const VTYPE *poly) +{ + VTYPE p = FMA (poly[3], x, poly[2]); + p = FMA (x, p, poly[1]); + p = FMA (x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_4) (VTYPE x, const VTYPE *poly) +{ + VTYPE p = FMA (poly[4], x, poly[3]); + p = FMA (x, p, poly[2]); + p = FMA (x, p, poly[1]); + p = FMA (x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_5) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_4) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_6) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_5) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_7) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_6) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_8) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_7) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_9) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_8) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_10) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_9) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_11) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_10) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_12) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_11) (x, poly + 1), poly[0]); +} + +static inline VTYPE VWRAP (pw_horner_4) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p01 = FMA (poly[1], x, poly[0]); + VTYPE p23 = FMA (poly[3], x, poly[2]); + VTYPE p; + p = FMA (x2, poly[4], p23); + p = FMA (x2, p, p01); + return p; +} +static inline VTYPE VWRAP (pw_horner_5) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p01 = FMA (poly[1], x, poly[0]); + VTYPE p23 = FMA (poly[3], x, poly[2]); + VTYPE p45 = FMA (poly[5], x, poly[4]); + VTYPE p; + p = FMA (x2, p45, p23); + p = FMA (x2, p, p01); + return p; +} +static inline VTYPE VWRAP (pw_horner_6) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p26 = VWRAP (pw_horner_4) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p26, p01); +} +static inline VTYPE VWRAP (pw_horner_7) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p27 = VWRAP (pw_horner_5) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p27, p01); +} +static inline VTYPE VWRAP (pw_horner_8) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p28 = VWRAP (pw_horner_6) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p28, p01); +} +static inline VTYPE VWRAP (pw_horner_9) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p29 = VWRAP (pw_horner_7) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p29, p01); +} +static inline VTYPE VWRAP (pw_horner_10) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_10 = VWRAP (pw_horner_8) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_10, p01); +} +static inline VTYPE VWRAP (pw_horner_11) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_11 = VWRAP (pw_horner_9) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_11, p01); +} +static inline VTYPE VWRAP (pw_horner_12) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_12 = VWRAP 
(pw_horner_10) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_12, p01); +} +static inline VTYPE VWRAP (pw_horner_13) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_13 = VWRAP (pw_horner_11) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_13, p01); +} +static inline VTYPE VWRAP (pw_horner_14) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_14 = VWRAP (pw_horner_12) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_14, p01); +} +static inline VTYPE VWRAP (pw_horner_15) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_15 = VWRAP (pw_horner_13) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_15, p01); +} +static inline VTYPE VWRAP (pw_horner_16) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_16 = VWRAP (pw_horner_14) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_16, p01); +} +static inline VTYPE VWRAP (pw_horner_17) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_17 = VWRAP (pw_horner_15) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_17, p01); +} +static inline VTYPE VWRAP (pw_horner_18) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_18 = VWRAP (pw_horner_16) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_18, p01); +} diff --git a/contrib/arm-optimized-routines/pl/math/poly_scalar_f32.h b/contrib/arm-optimized-routines/pl/math/poly_scalar_f32.h new file mode 100644 index 000000000000..a9b1c5544494 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_scalar_f32.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on siongle-precision scalar input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_SCALAR_F32_H +#define PL_MATH_POLY_SCALAR_F32_H + +#include + +/* Wrap scalar f32 helpers: evaluation of some scheme/order has form: + [scheme]_[order]_f32. */ +#define VTYPE float +#define FMA fmaf +#define VWRAP(f) f##_f32 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_scalar_f64.h b/contrib/arm-optimized-routines/pl/math/poly_scalar_f64.h new file mode 100644 index 000000000000..207dccee30ad --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_scalar_f64.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on double-precision scalar input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_SCALAR_F64_H +#define PL_MATH_POLY_SCALAR_F64_H + +#include + +/* Wrap scalar f64 helpers: evaluation of some scheme/order has form: + [scheme]_[order]_f64. */ +#define VTYPE double +#define FMA fma +#define VWRAP(f) f##_f64 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_sve_f32.h b/contrib/arm-optimized-routines/pl/math/poly_sve_f32.h new file mode 100644 index 000000000000..a97e2ced027a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_sve_f32.h @@ -0,0 +1,26 @@ +/* + * Helpers for evaluating polynomials on single-precision SVE input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. 
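
/* A minimal sketch using the scalar instantiation just above (eval_quartic
   is an illustrative name): evaluate p(x) = 1 + 2x + 3x^2 + 4x^3 + 5x^4
   with the Estrin helper, passing the precomputed powers it expects.  */
#include <math.h>
#include "poly_scalar_f64.h"

static double
eval_quartic (double x)
{
  static const double coeffs[] = { 1.0, 2.0, 3.0, 4.0, 5.0 };
  double x2 = x * x;
  return estrin_4_f64 (x, x2, x2 * x2, coeffs);
}
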
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_SVE_F32_H +#define PL_MATH_POLY_SVE_F32_H + +#include + +/* Wrap SVE f32 helpers: evaluation of some scheme/order has form: + sv_[scheme]_[order]_f32_x. */ +#define VTYPE svfloat32_t +#define STYPE float +#define VWRAP(f) sv_##f##_f32_x +#define DUP svdup_f32 +#include "poly_sve_generic.h" +#undef DUP +#undef VWRAP +#undef STYPE +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_sve_f64.h b/contrib/arm-optimized-routines/pl/math/poly_sve_f64.h new file mode 100644 index 000000000000..5fb14b3c1700 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_sve_f64.h @@ -0,0 +1,26 @@ +/* + * Helpers for evaluating polynomials on double-precision SVE input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_SVE_F64_H +#define PL_MATH_POLY_SVE_F64_H + +#include + +/* Wrap SVE f64 helpers: evaluation of some scheme/order has form: + sv_[scheme]_[order]_f64_x. */ +#define VTYPE svfloat64_t +#define STYPE double +#define VWRAP(f) sv_##f##_f64_x +#define DUP svdup_f64 +#include "poly_sve_generic.h" +#undef DUP +#undef VWRAP +#undef STYPE +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_sve_generic.h b/contrib/arm-optimized-routines/pl/math/poly_sve_generic.h new file mode 100644 index 000000000000..b568e4cddff3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_sve_generic.h @@ -0,0 +1,301 @@ +/* + * Helpers for evaluating polynomials with various schemes - specific to SVE + * but precision-agnostic. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef VTYPE +# error Cannot use poly_generic without defining VTYPE +#endif +#ifndef STYPE +# error Cannot use poly_generic without defining STYPE +#endif +#ifndef VWRAP +# error Cannot use poly_generic without defining VWRAP +#endif +#ifndef DUP +# error Cannot use poly_generic without defining DUP +#endif + +static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + /* At order 3, Estrin and Pairwise Horner are identical. 
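
/* The SVE twins below mirror the generic helpers, with two differences: a
   governing predicate pg threads through every operation, and the
   coefficients arrive as scalars (STYPE) rather than pre-broadcast
   vectors -- DUP and the vector-by-scalar forms of svmla_x handle the
   broadcast at each step.  */
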
*/ + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); + return svmla_x (pg, p01, p23, x2); +} + +static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + return svmla_x (pg, p03, x4, poly[4]); +} +static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); + return svmla_x (pg, p03, p45, x4); +} +static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); + VTYPE p46 = svmla_x (pg, p45, x, poly[6]); + return svmla_x (pg, p03, p46, x4); +} +static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 4); + return svmla_x (pg, p03, p47, x4); +} +static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]); +} +static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + VTYPE x8, const STYPE *poly) +{ + VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8); +} +static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); + VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]); + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8); +} +static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 8); + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8); +} +static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_4) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_5) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_6) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_7) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16, + poly[16]); +} +static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); + return svmla_x (pg, VWRAP (estrin_15) (pg, 
x, x2, x4, x8, poly), p16_17, + x16); +} +static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); + VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]); + return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18, + x16); +} +static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), + VWRAP (pairwise_poly_3) (pg, x, x2, poly + 16), x16); +} + +static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]); + p = svmad_x (pg, x, p, poly[1]); + p = svmad_x (pg, x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]); + p = svmad_x (pg, x, p, poly[2]); + p = svmad_x (pg, x, p, poly[1]); + p = svmad_x (pg, x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE +sv_horner_10_f32_x (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE +sv_horner_11_f32_x (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, sv_horner_10_f32_x (pg, x, poly + 1), poly[0]); +} +static inline VTYPE +sv_horner_12_f32_x (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, sv_horner_11_f32_x (pg, x, poly + 1), poly[0]); +} + +static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); + VTYPE p; + p = svmla_x (pg, p23, x2, poly[4]); + p = svmla_x (pg, p01, x2, p); + return p; +} +static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); + VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); + VTYPE p; + p = svmla_x (pg, p23, x2, p45); + p = svmla_x (pg, p01, x2, p); + return p; +} +static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p26); +} +static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p27); +} +static inline VTYPE 
VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p28); +} +static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p29); +} +static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_10); +} +static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_11); +} +static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_12); +} +static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_13); +} +static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_14); +} +static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_15); +} +static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_16); +} +static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_17); +} +static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_18); +} diff --git a/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c deleted file mode 100644 index f62cbd6b53f0..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_acosh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c deleted file mode 100644 index 374066622a0f..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2023, Arm Limited. 
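
/* A predicated usage sketch of the SVE helpers above (eval_cubic is an
   illustrative name; build with SVE enabled, e.g. -march=armv8-a+sve):
   evaluate a cubic on the active lanes only.  */
#include <arm_sve.h>
#include "poly_sve_f64.h"

static svfloat64_t
eval_cubic (svbool_t pg, svfloat64_t x)
{
  static const double coeffs[] = { 1.0, -0.5, 0.25, -0.125 };
  svfloat64_t x2 = svmul_x (pg, x, x);
  return sv_pairwise_poly_3_f64_x (pg, x, x2, coeffs);
}
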
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_acoshf_3u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c deleted file mode 100644 index ab8fbd9c3d69..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_asinh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c deleted file mode 100644 index 13e1a5fd314a..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_asinhf_2u7.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c deleted file mode 100644 index 4603e5f72615..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atan2_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c deleted file mode 100644 index 894d843273ea..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atan2f_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c deleted file mode 100644 index 4b61bc4d1460..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atan_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c deleted file mode 100644 index 6b6571927195..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atanf_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c deleted file mode 100644 index f6a5f75b1779..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atanh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c deleted file mode 100644 index e7e5c6197406..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atanhf_3u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c deleted file mode 100644 index 435e74a546c6..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cbrt_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c deleted file mode 100644 index 5c793704b62a..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cbrtf_1u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c deleted file mode 100644 index cdf352cf5793..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cosh_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c deleted file mode 100644 index 8f7d5da6e6ef..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_coshf_2u4.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erf_2u.c b/contrib/arm-optimized-routines/pl/math/s_erf_2u.c deleted file mode 100644 index 839535c3897f..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_erf_2u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_erf_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c deleted file mode 100644 index bf9e3e62bd31..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_erfc_4u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c deleted file mode 100644 index 024d22498ff5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_erfcf_1u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c deleted file mode 100644 index a5b9bf9afa72..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_erff_1u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_exp_tail.c b/contrib/arm-optimized-routines/pl/math/s_exp_tail.c deleted file mode 100644 index 20b1b41a9689..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_exp_tail.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_exp_tail.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expf.c b/contrib/arm-optimized-routines/pl/math/s_expf.c deleted file mode 100644 index 557a2e3d36af..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_expf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expf.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c deleted file mode 100644 index da2d6e7ebf82..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expm1_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c deleted file mode 100644 index eea8089da989..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expm1f_1u6.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c deleted file mode 100644 index 2480e5aa2cf1..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log10_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c deleted file mode 100644 index 173e0fdc3400..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log10f_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c deleted file mode 100644 index 20b395a5a2d0..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log1p_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c deleted file mode 100644 index 013ec4c1d903..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log1pf_2u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log2_3u.c b/contrib/arm-optimized-routines/pl/math/s_log2_3u.c deleted file mode 100644 index d46f3f998190..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log2_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log2_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c deleted file mode 100644 index e76c67dceb62..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log2f_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c deleted file mode 100644 index 27e5e65db178..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_sinh_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c deleted file mode 100644 index 607f94298a79..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_sinhf_2u3.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c deleted file mode 100644 index adb807c5beb8..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_tan_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c deleted file mode 100644 index fa64c8aef697..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_tanf_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c deleted file mode 100644 index a4d7bce649f1..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_tanh_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c deleted file mode 100644 index 896fc62ebe9b..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_tanhf_2u6.c"
diff --git a/contrib/arm-optimized-routines/pl/math/sinh_3u.c b/contrib/arm-optimized-routines/pl/math/sinh_3u.c
index f534815c6674..1d86629ee2a3 100644
--- a/contrib/arm-optimized-routines/pl/math/sinh_3u.c
+++ b/contrib/arm-optimized-routines/pl/math/sinh_3u.c
@@ -58,9 +58,6 @@ sinh (double x)
 PL_SIG (S, D, 1, sinh, -10.0, 10.0)
 PL_TEST_ULP (sinh, 2.08)
-PL_TEST_INTERVAL (sinh, 0, 0x1p-51, 100)
-PL_TEST_INTERVAL (sinh, -0, -0x1p-51, 100)
-PL_TEST_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
-PL_TEST_INTERVAL (sinh, -0x1p-51, -0x1.62e42fefa39fp+9, 100000)
-PL_TEST_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000)
-PL_TEST_INTERVAL (sinh, -0x1.62e42fefa39fp+9, -inf, 1000)
+PL_TEST_SYM_INTERVAL (sinh, 0, 0x1p-51, 100)
+PL_TEST_SYM_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
+PL_TEST_SYM_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c
index de944288a02b..aa7aadcf67c5 100644
--- a/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c
+++ b/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c
@@ -68,9 +68,6 @@ sinhf (float x)
 PL_SIG (S, F, 1, sinh, -10.0, 10.0)
 PL_TEST_ULP (sinhf, 1.76)
-PL_TEST_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000)
-PL_TEST_INTERVAL (sinhf, -0, -0x1.62e43p+6, 100000)
-PL_TEST_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100)
-PL_TEST_INTERVAL (sinhf, -0x1.62e43p+6, -0x1.65a9fap+6, 100)
-PL_TEST_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100)
-PL_TEST_INTERVAL (sinhf, -0x1.65a9fap+6, -inf, 100)
+PL_TEST_SYM_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000)
+PL_TEST_SYM_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100)
+PL_TEST_SYM_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100)
diff --git a/contrib/arm-optimized-routines/pl/math/sinpi_3u.c b/contrib/arm-optimized-routines/pl/math/sinpi_3u.c
new file mode 100644
index 000000000000..a04a352a62e6
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sinpi_3u.c
@@ -0,0 +1,90 @@
+/*
+ * Double-precision scalar sinpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include <math.h>
+#include "mathlib.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_scalar_f64.h"
+
+/* Taylor series coefficients for sin(pi * x).
+   C2 coefficient (originally ~=5.16771278) has been split into two parts:
+   C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278)
+   This change in magnitude reduces floating point rounding errors.
+   C2_hi is then reintroduced after the polynomial approximation.  */
+static const double poly[]
+    = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1,
+	-0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+	0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21,
+	-0x1.012a9870eeb7dp-25 };
+
+#define Shift 0x1.8p+52
+
+/* Approximation for scalar double-precision sinpi(x).
+   Maximum error: 3.03 ULP:
+   sinpi(0x1.a90da2818f8b5p+7) got 0x1.fe358f255a4b3p-1
+			      want 0x1.fe358f255a4b6p-1.  */
+double
+sinpi (double x)
+{
+  if (isinf (x))
+    return __math_invalid (x);
+
+  double r = asdouble (asuint64 (x) & ~0x8000000000000000);
+  uint64_t sign = asuint64 (x) & 0x8000000000000000;
+
+  /* Edge cases for when sinpi should be exactly 0. (Integers)
+     0x1p53 is the limit for double precision to store any decimal places.  */
+  if (r >= 0x1p53)
+    return 0;
+
+  /* If x is an integer, return 0. */
+  uint64_t m = (uint64_t) r;
+  if (r == m)
+    return 0;
+
+  /* For very small inputs, squaring r causes underflow.
+     Values below this threshold can be approximated via sinpi(x) ≈ pi*x.  */
+  if (r < 0x1p-63)
+    return M_PI * x;
+
+  /* Any non-integer values >= 0x1p51 will be int + 0.5.
+     These values should return exactly 1 or -1.  */
+  if (r >= 0x1p51)
+    {
+      uint64_t iy = ((m & 1) << 63) ^ asuint64 (1.0);
+      return asdouble (sign ^ iy);
+    }
+
+  /* n = rint(|x|).  */
+  double n = r + Shift;
+  sign ^= (asuint64 (n) << 63);
+  n = n - Shift;
+
+  /* r = |x| - n (range reduction into -1/2 .. 1/2).  */
+  r = r - n;
+
+  /* y = sin(pi * r).  */
+  double r2 = r * r;
+  double y = horner_9_f64 (r2, poly);
+  y = y * r;
+
+  /* Reintroduce C2_hi.  */
+  y = fma (-4 * r2, r, y);
+
+  /* Copy sign of x to sin(|x|).  */
+  return asdouble (asuint64 (y) ^ sign);
+}
+
+PL_SIG (S, D, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (sinpi, 2.53)
+PL_TEST_SYM_INTERVAL (sinpi, 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (sinpi, 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (sinpi, 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (sinpi, 0x1p51, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sinpif_2u5.c b/contrib/arm-optimized-routines/pl/math/sinpif_2u5.c
new file mode 100644
index 000000000000..af9ca0573b37
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sinpif_2u5.c
@@ -0,0 +1,83 @@
+/*
+ * Single-precision scalar sinpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Taylor series coefficients for sin(pi * x).  */
+#define C0 0x1.921fb6p1f
+#define C1 -0x1.4abbcep2f
+#define C2 0x1.466bc6p1f
+#define C3 -0x1.32d2ccp-1f
+#define C4 0x1.50783p-4f
+#define C5 -0x1.e30750p-8f
+
+#define Shift 0x1.0p+23f
+
+/* Approximation for scalar single-precision sinpi(x) - sinpif.
+   Maximum error: 2.48 ULP:
+   sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1
+			want 0x1.fa8c02p-1.  */
+float
+sinpif (float x)
+{
+  if (isinf (x))
+    return __math_invalidf (x);
+
+  float r = asfloat (asuint (x) & ~0x80000000);
+  uint32_t sign = asuint (x) & 0x80000000;
+
+  /* Edge cases for when sinpif should be exactly 0. (Integers)
+     0x1p23 is the limit for single precision to store any decimal places.  */
+  if (r >= 0x1p23f)
+    return 0;
+
+  int32_t m = roundf (r);
+  if (m == r)
+    return 0;
+
+  /* For very small inputs, squaring r causes underflow.
+     Values below this threshold can be approximated via sinpi(x) ~= pi*x.  */
+  if (r < 0x1p-31f)
+    return C0 * x;
+
+  /* Any non-integer values >= 0x1p22f will be int + 0.5.
+     These values should return exactly 1 or -1.  */
+  if (r >= 0x1p22f)
+    {
+      uint32_t iy = ((m & 1) << 31) ^ asuint (-1.0f);
+      return asfloat (sign ^ iy);
+    }
+
+  /* n = rint(|x|).  */
+  float n = r + Shift;
+  sign ^= (asuint (n) << 31);
+  n = n - Shift;
+
+  /* r = |x| - n (range reduction into -1/2 .. 1/2).  */
+  r = r - n;
+
+  /* y = sin(pi * r).  */
+  float r2 = r * r;
+  float y = fmaf (C5, r2, C4);
+  y = fmaf (y, r2, C3);
+  y = fmaf (y, r2, C2);
+  y = fmaf (y, r2, C1);
+  y = fmaf (y, r2, C0);
+
+  /* Copy sign of x to sin(|x|).
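The Shift constant in both sinpi routines implements rint without a rounding instruction: adding and subtracting 0x1.8p52 (0x1.0p23f in single precision) pushes the fraction bits off the end of the mantissa, and the parity of the rounded integer is left in the low mantissa bit of the biased sum, which the asuint64 (n) << 63 above moves to the sign position. A minimal standalone sketch of the idiom, assuming round-to-nearest mode and no extended-precision evaluation; round_via_shift is an illustrative name, not part of the library:

#include <assert.h>

/* Illustrative only: rounds r to the nearest integer (ties-to-even)
   by letting the fraction bits fall off the 52-bit mantissa.  Valid
   for 0 <= r < 0x1p51 under round-to-nearest.  */
static double
round_via_shift (double r)
{
  const double shift = 0x1.8p52;
  volatile double n = r + shift; /* fraction bits rounded away */
  return n - shift;              /* nearest integer to r */
}

int
main (void)
{
  assert (round_via_shift (2.5) == 2.0); /* ties go to even */
  assert (round_via_shift (3.7) == 4.0);
  return 0;
}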
*/ + return asfloat (asuint (y * r) ^ sign); +} + +PL_SIG (S, F, 1, sinpi, -0.9, 0.9) +PL_TEST_ULP (sinpif, 1.99) +PL_TEST_SYM_INTERVAL (sinpif, 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (sinpif, 0x1p-31, 0.5, 10000) +PL_TEST_SYM_INTERVAL (sinpif, 0.5, 0x1p22f, 10000) +PL_TEST_SYM_INTERVAL (sinpif, 0x1p22f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_acos_2u.c b/contrib/arm-optimized-routines/pl/math/sv_acos_2u.c new file mode 100644 index 000000000000..e06db6cae6af --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_acos_2u.c @@ -0,0 +1,91 @@ +/* + * Double-precision SVE acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[12]; + float64_t pi, pi_over_2; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5, + 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, + 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8, + 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, + .pi = 0x1.921fb54442d18p+1, + .pi_over_2 = 0x1.921fb54442d18p+0, +}; + +/* Double-precision SVE implementation of vector acos(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.18 ulps, + _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0 + want 0x1.0d4d0f55667f7p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.52 ulps, + _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1 + want 0x1.ed82df4243f0bp-1. */ +svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); + + svbool_t a_gt_half = svacgt (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + svfloat64_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat64_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); + svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. 
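The three acos cases above collapse into a single select-plus-fma in the vector code that follows. A scalar sketch of that finalisation, assuming q_signed is Q(|x|) with the sign bit of x ORed in, as the svorr below arranges; acos_finalize is an illustrative name:

#include <math.h>

/* Hypothetical scalar analogue of the off/mul/add selection:
   |x| < 0.5: pi/2 - q, x >= 0.5: 2q, x <= -0.5: pi - 2q.  */
static double
acos_finalize (double q_signed, double x, int a_gt_half)
{
  const double pi = 0x1.921fb54442d18p+1;
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  double add = a_gt_half ? (x < 0 ? pi : 0.0) : pi_over_2;
  double mul = a_gt_half ? 2.0 : -1.0;
  return fma (mul, q_signed, add);
}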
*/ + svfloat64_t y + = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign)); + + svbool_t is_neg = svcmplt (pg, x, 0.0); + svfloat64_t off = svdup_f64_z (is_neg, d->pi); + svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0)); + svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2)); + + return svmla_x (pg, add, mul, y); +} + +PL_SIG (SV, D, 1, acos, -1.0, 1.0) +PL_TEST_ULP (SV_NAME_D1 (acos), 1.02) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0, 0.5, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_acosf_1u4.c b/contrib/arm-optimized-routines/pl/math/sv_acosf_1u4.c new file mode 100644 index 000000000000..7ac59ceedfbd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_acosf_1u4.c @@ -0,0 +1,84 @@ +/* + * Single-precision SVE acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32_t poly[5]; + float32_t pi, pi_over_2; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, + 0x1.3af7d8p-5, }, + .pi = 0x1.921fb6p+1f, + .pi_over_2 = 0x1.921fb6p+0f, +}; + +/* Single-precision SVE implementation of vector acos(x). + + For |x| in [0, 0.5], use order 4 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.16 ulps, + _ZGVsMxv_acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0 + want 0x1.0c27f6p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.32 ulps, + _ZGVsMxv_acosf (0x1.15ba56p-1) got 0x1.feb33p-1 + want 0x1.feb32ep-1. */ +svfloat32_t SV_NAME_F1 (acos) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000); + svfloat32_t ax = svabs_x (pg, x); + svbool_t a_gt_half = svacgt (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + svfloat32_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat32_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. 
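One idiom shared by both acos variants is worth spelling out: svsqrt_m is a merging operation, so the square root is taken only on the lanes that need it while the remaining lanes pass |x| through untouched. A per-lane sketch of the semantics; sqrt_merging is an illustrative name:

#include <math.h>

/* Per-lane semantics of svsqrt_m (ax, a_gt_half, z2): active lanes
   yield sqrtf (z2), inactive lanes keep the merge operand ax.  */
static float
sqrt_merging (float ax, int a_gt_half, float z2)
{
  return a_gt_half ? sqrtf (z2) : ax;
}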
*/ + svfloat32_t y + = svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (p), sign)); + + svbool_t is_neg = svcmplt (pg, x, 0.0); + svfloat32_t off = svdup_f32_z (is_neg, d->pi); + svfloat32_t mul = svsel (a_gt_half, sv_f32 (2.0), sv_f32 (-1.0)); + svfloat32_t add = svsel (a_gt_half, off, sv_f32 (d->pi_over_2)); + + return svmla_x (pg, add, mul, y); +} + +PL_SIG (SV, F, 1, acos, -1.0, 1.0) +PL_TEST_ULP (SV_NAME_F1 (acos), 0.82) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0, 0.5, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_acosh_3u5.c new file mode 100644 index 000000000000..faf351331464 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_acosh_3u5.c @@ -0,0 +1,50 @@ +/* + * Double-precision SVE acosh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define WANT_SV_LOG1P_K0_SHORTCUT 1 +#include "sv_log1p_inline.h" + +#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */ +#define OneTop 0x3ff + +static NOINLINE svfloat64_t +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (acosh, x, y, special); +} + +/* SVE approximation for double-precision acosh, based on log1p. + The largest observed error is 3.19 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. x close to 1: + SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2 + want 0x1.ed23399f51373p-2. */ +svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) +{ + svuint64_t itop = svlsr_x (pg, svreinterpret_u64 (x), 52); + /* (itop - OneTop) >= (BigBoundTop - OneTop). */ + svbool_t special = svcmpge (pg, svsub_x (pg, itop, OneTop), sv_u64 (0x1ff)); + + svfloat64_t xm1 = svsub_x (pg, x, 1); + svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1)); + svfloat64_t y = sv_log1p_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg); + + /* Fall back to scalar routine for special lanes. */ + if (unlikely (svptest_any (pg, special))) + return special_case (x, y, special); + + return y; +} + +PL_SIG (SV, D, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (SV_NAME_D1 (acosh), 2.69) +PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 1, 0x1p511, 90000) +PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0x1p511, inf, 10000) +PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0, 1, 1000) +PL_TEST_INTERVAL (SV_NAME_D1 (acosh), -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_acoshf_2u8.c b/contrib/arm-optimized-routines/pl/math/sv_acoshf_2u8.c new file mode 100644 index 000000000000..f527083af40a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_acoshf_2u8.c @@ -0,0 +1,47 @@ +/* + * Single-precision SVE acosh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define One 0x3f800000 +#define Thres 0x20000000 /* asuint(0x1p64) - One. */ + +#include "sv_log1pf_inline.h" + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (acoshf, x, y, special); +} + +/* Single-precision SVE acosh(x) routine. 
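Both this routine and the double-precision one above rest on the same rewrite of acosh in terms of log1p. A scalar sketch of the identity, with acosh_via_log1p as a hypothetical name:

#include <math.h>

/* x + sqrt (x*x - 1) == 1 + (x - 1) + sqrt ((x - 1) * (x + 1)), so
   the logarithm can be taken with log1p, which avoids catastrophic
   cancellation for x close to 1.  */
static double
acosh_via_log1p (double x)
{
  double xm1 = x - 1.0;
  return log1p (xm1 + sqrt (xm1 * (x + 1.0)));
}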
Implements the same algorithm as + vector acoshf and log1p. + + Maximum error is 2.78 ULPs: + SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4 + want 0x1.f45b3cp-4. */ +svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) +{ + svuint32_t ix = svreinterpret_u32 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres); + + svfloat32_t xm1 = svsub_x (pg, x, 1.0f); + svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f)); + svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, y, special); + return y; +} + +PL_SIG (SV, F, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (acosh), 2.29) +PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0, 1, 500) +PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 1, 0x1p64, 100000) +PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0x1p64, inf, 1000) +PL_TEST_INTERVAL (SV_NAME_F1 (acosh), -0, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_asin_3u.c b/contrib/arm-optimized-routines/pl/math/sv_asin_3u.c new file mode 100644 index 000000000000..c3dd37b145ae --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_asin_3u.c @@ -0,0 +1,84 @@ +/* + * Double-precision SVE asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[12]; + float64_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, + 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6, + 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, + 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, + 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6, + -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, + .pi_over_2f = 0x1.921fb54442d18p+0, +}; + +#define P(i) sv_f64 (d->poly[i]) + +/* Double-precision SVE implementation of vector asin(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.52 ulps, + _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2 + want 0x1.ec13757305f26p-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.69 ulps, + _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 + want 0x1.110d7e85fdd53p-1. */ +svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); + svbool_t a_ge_half = svacge (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + svfloat64_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat64_t z = svsqrt_m (ax, a_ge_half, z2); + + /* Use a single polynomial approximation P for both intervals. 
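The sv_estrin_* helpers evaluate P with the Estrin scheme: the powers z4, z8, z16 built below feed independent fused multiply-adds, so the dependency chain has logarithmic rather than linear depth. A degree-3 scalar sketch of the pattern; estrin_3 is an illustrative name:

#include <math.h>

/* Pair coefficients with one fma each, then combine the pairs with
   x^2; higher degrees repeat the step with x^4, x^8, ...  */
static double
estrin_3 (double x, const double *c)
{
  double p01 = fma (c[1], x, c[0]); /* c0 + c1*x */
  double p23 = fma (c[3], x, c[2]); /* c2 + c3*x */
  return fma (p23, x * x, p01);     /* full degree-3 polynomial */
}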
*/ + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); + svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f); + + /* Copy sign. */ + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); +} + +PL_SIG (SV, D, 1, asin, -1.0, 1.0) +PL_TEST_ULP (SV_NAME_D1 (asin), 2.19) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0, 0.5, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_asinf_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_asinf_2u5.c new file mode 100644 index 000000000000..8e9edc2439f5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_asinf_2u5.c @@ -0,0 +1,76 @@ +/* + * Single-precision SVE asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32_t poly[5]; + float32_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, + 0x1.3af7d8p-5, }, + .pi_over_2f = 0x1.921fb6p+0f, +}; + +/* Single-precision SVE implementation of vector asin(x). + + For |x| in [0, 0.5], use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, + _ZGVsMxv_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 + want 0x1.fef15cp-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.41 ulps, + _ZGVsMxv_asinf (-0x1.00203ep-1) got -0x1.0c3a64p-1 + want -0x1.0c3a6p-1. */ +svfloat32_t SV_NAME_F1 (asin) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000); + + svfloat32_t ax = svabs_x (pg, x); + svbool_t a_ge_half = svacge (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + svfloat32_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat32_t z = svsqrt_m (ax, a_ge_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat32_t y = svmad_m (a_ge_half, p, sv_f32 (-2.0), d->pi_over_2f); + + /* Copy sign. 
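The predicated svmad_m used in both asin variants folds the two-interval identity into one operation. Its per-lane behaviour, sketched in scalar form; asinf_finalize is an illustrative name:

#include <math.h>

/* svmad_m (a_ge_half, p, sv_f32 (-2.0), pi_over_2): active lanes
   compute p * -2 + pi/2, inactive lanes keep p unchanged.  */
static float
asinf_finalize (float p, int a_ge_half)
{
  const float pi_over_2 = 0x1.921fb6p+0f;
  return a_ge_half ? fmaf (p, -2.0f, pi_over_2) : p;
}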
 */
+  return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
+}
+
+PL_SIG (SV, F, 1, asin, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_F1 (asin), 1.91)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0, 0.5, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), -0, -inf, 20000)
\ No newline at end of file
diff --git a/contrib/arm-optimized-routines/pl/math/sv_asinh_3u0.c b/contrib/arm-optimized-routines/pl/math/sv_asinh_3u0.c
new file mode 100644
index 000000000000..711f0dfdbedc
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_asinh_3u0.c
@@ -0,0 +1,129 @@
+/*
+ * Double-precision SVE asinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define OneTop sv_u64 (0x3ff) /* top12(asuint64(1.0)). */
+#define HugeBound sv_u64 (0x5fe) /* top12(asuint64(0x1p511)). */
+#define TinyBound (0x3e5) /* top12(asuint64(0x1p-26)). */
+#define SignMask (0x8000000000000000)
+
+/* Constants & data for log. */
+#define A(i) __v_log_data.poly[i]
+#define Ln2 (0x1.62e42fefa39efp-1)
+#define N (1 << V_LOG_TABLE_BITS)
+#define OFF (0x3fe6900900000000)
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+  return sv_call_f64 (asinh, x, y, special);
+}
+
+static inline svfloat64_t
+__sv_log_inline (svfloat64_t x, const svbool_t pg)
+{
+  /* Double-precision SVE log, copied from pl/math/sv_log_2u5.c with some
+     cosmetic modification and special-cases removed. See that file for
+     details of the algorithm used.  */
+  svuint64_t ix = svreinterpret_u64 (x);
+  svuint64_t tmp = svsub_x (pg, ix, OFF);
+  svuint64_t i
+      = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
+  svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
+  svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+  svfloat64_t z = svreinterpret_f64 (iz);
+  svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+  svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
+  svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
+  svfloat64_t kd = svcvt_f64_x (pg, k);
+  svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, Ln2);
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t y = svmla_x (pg, sv_f64 (A (2)), r, A (3));
+  svfloat64_t p = svmla_x (pg, sv_f64 (A (0)), r, A (1));
+  y = svmla_x (pg, y, r2, A (4));
+  y = svmla_x (pg, p, r2, y);
+  y = svmla_x (pg, hi, r2, y);
+  return y;
+}
+
+/* Double-precision implementation of SVE asinh(x).
+   asinh is very sensitive around 1, so it is impractical to devise a single
+   low-cost algorithm which is sufficiently accurate on a wide range of input.
+   Instead we use two different algorithms:
+   asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
+	    = sign(x) * (|x| + |x|^3 * P(x^2))   otherwise
+   where log(x) is an optimized log approximation, and P(x) is a polynomial
+   shared with the scalar routine. The greatest observed error is 2.51 ULP, in
+   |x| >= 1:
+   _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1
+					want 0x1.e3181c43b0f39p-1.
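Since asinh is odd, the routine below works on |x| and restores the sign at the end with an exclusive-or (the sveor near the bottom). The scalar counterpart of that bit manipulation, with apply_sign as an illustrative name:

#include <stdint.h>
#include <string.h>

/* Transfer a saved sign bit onto a result computed from |x|;
   sign_bit is either 0 or 0x8000000000000000.  */
static double
apply_sign (double y, uint64_t sign_bit)
{
  uint64_t iy;
  memcpy (&iy, &y, sizeof iy);
  iy ^= sign_bit;
  memcpy (&y, &iy, sizeof y);
  return y;
}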
 */
+svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
+{
+  svuint64_t ix = svreinterpret_u64 (x);
+  svuint64_t iax = svbic_x (pg, ix, SignMask);
+  svuint64_t sign = svand_x (pg, ix, SignMask);
+  svfloat64_t ax = svreinterpret_f64 (iax);
+  svuint64_t top12 = svlsr_x (pg, iax, 52);
+
+  svbool_t ge1 = svcmpge (pg, top12, OneTop);
+  svbool_t special = svcmpge (pg, top12, HugeBound);
+
+  /* Option 1: |x| >= 1.
+     Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)).  */
+  svfloat64_t option_1 = sv_f64 (0);
+  if (likely (svptest_any (pg, ge1)))
+    {
+      svfloat64_t axax = svmul_x (pg, ax, ax);
+      option_1 = __sv_log_inline (
+	  svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, axax, 1))), pg);
+    }
+
+  /* Option 2: |x| < 1.
+     Compute asinh(x) using a polynomial.
+     The largest observed error in this region is 1.51 ULPs:
+     _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
+					 want 0x1.c1e649ee2681dp-1.  */
+  svfloat64_t option_2 = sv_f64 (0);
+  if (likely (svptest_any (pg, svnot_z (pg, ge1))))
+    {
+      svfloat64_t x2 = svmul_x (pg, ax, ax);
+      svfloat64_t z2 = svmul_x (pg, x2, x2);
+      svfloat64_t z4 = svmul_x (pg, z2, z2);
+      svfloat64_t z8 = svmul_x (pg, z4, z4);
+      svfloat64_t z16 = svmul_x (pg, z8, z8);
+      svfloat64_t p
+	  = sv_estrin_17_f64_x (pg, x2, z2, z4, z8, z16, __asinh_data.poly);
+      option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
+    }
+
+  /* Choose the right option for each lane. */
+  svfloat64_t y = svsel (ge1, option_1, option_2);
+
+  /* Apply sign of x to y. */
+  y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, y, special);
+  return y;
+}
+
+PL_SIG (SV, D, 1, asinh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (asinh), 2.52)
+/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
+   Ensures the svsel is choosing the right option in all cases.  */
+#define SV_ASINH_INTERVAL(lo, hi, n)                                          \
+  PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0.5)                 \
+  PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 2)                   \
+  PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0x1p600)
SV_ASINH_INTERVAL (0, 0x1p-26, 50000)
SV_ASINH_INTERVAL (0x1p-26, 1, 50000)
SV_ASINH_INTERVAL (1, 0x1p511, 50000)
SV_ASINH_INTERVAL (0x1p511, inf, 40000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_asinhf_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_asinhf_2u5.c
new file mode 100644
index 000000000000..1f1f6e5c846f
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_asinhf_2u5.c
@@ -0,0 +1,55 @@
+/*
+ * Single-precision SVE asinh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "include/mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_log1pf_inline.h"
+
+#define BigBound (0x5f800000) /* asuint(0x1p64). */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+  return sv_call_f32 (asinhf, x, y, special);
+}
+
+/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
+   vector asinhf and log1p.
+
+   Maximum error is 2.48 ULPs:
+   SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
+				     want 0x1.ffbbb8p-4.
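The argument handed to sv_log1pf_inline below comes from an exact algebraic rewrite. A scalar sketch, valid for x >= 0, with asinhf_pos as an illustrative name:

#include <math.h>

/* sqrt(x^2 + 1) - 1 == x^2 / (sqrt(x^2 + 1) + 1), so
   asinh(x) = log1p (x + x^2 / (1 + sqrt (x^2 + 1))),
   which keeps accuracy for small x where a plain log() would cancel.  */
static float
asinhf_pos (float x)
{
  float x2 = x * x;
  float d = sqrtf (x2 + 1.0f) + 1.0f;
  return log1pf (x + x2 / d);
}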
*/ +svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) +{ + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + svbool_t special = svcmpge (pg, iax, BigBound); + + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ + svfloat32_t ax2 = svmul_x (pg, ax, ax); + svfloat32_t d = svadd_x (pg, svsqrt_x (pg, svadd_x (pg, ax2, 1.0f)), 1.0f); + svfloat32_t y + = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg); + + if (unlikely (svptest_any (pg, special))) + return special_case ( + x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))), + special); + return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))); +} + +PL_SIG (SV, F, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (asinh), 1.98) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0, 0x1p-12, 4000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p-12, 1.0, 20000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 1.0, 0x1p64, 20000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p64, inf, 4000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c index a4bea1dcba09..00530a324a76 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c @@ -8,86 +8,109 @@ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f64.h" -#if SV_SUPPORTED - -#include "sv_atan_common.h" +static const struct data +{ + float64_t poly[20]; + float64_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, + .pi_over_2 = 0x1.921fb54442d18p+0, +}; /* Useful constants. */ -#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) #define SignMask sv_u64 (0x8000000000000000) /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ -__attribute__ ((noinline)) static sv_f64_t -specialcase (sv_f64_t y, sv_f64_t x, sv_f64_t ret, const svbool_t cmp) +static svfloat64_t NOINLINE +special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, + const svbool_t cmp) { return sv_call2_f64 (atan2, y, x, ret, cmp); } -/* Returns a predicate indicating true if the input is the bit representation of - 0, infinity or nan. */ +/* Returns a predicate indicating true if the input is the bit representation + of 0, infinity or nan. */ static inline svbool_t -zeroinfnan (sv_u64_t i, const svbool_t pg) +zeroinfnan (svuint64_t i, const svbool_t pg) { - return svcmpge_u64 (pg, svsub_n_u64_x (pg, svlsl_n_u64_x (pg, i, 1), 1), - sv_u64 (2 * asuint64 (INFINITY) - 1)); + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1), + sv_u64 (2 * asuint64 (INFINITY) - 1)); } /* Fast implementation of SVE atan2. Errors are greatest when y and x are reasonably close together. 
The greatest observed error is 2.28 ULP: - sv_atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ -sv_f64_t -__sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg) +svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) { - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t iy = sv_as_u64_f64 (y); + const struct data *data_ptr = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iy = svreinterpret_u64 (y); svbool_t cmp_x = zeroinfnan (ix, pg); svbool_t cmp_y = zeroinfnan (iy, pg); - svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); - sv_u64_t sign_x = svand_u64_x (pg, ix, SignMask); - sv_u64_t sign_y = svand_u64_x (pg, iy, SignMask); - sv_u64_t sign_xy = sveor_u64_x (pg, sign_x, sign_y); + svuint64_t sign_x = svand_x (pg, ix, SignMask); + svuint64_t sign_y = svand_x (pg, iy, SignMask); + svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y); - sv_f64_t ax = svabs_f64_x (pg, x); - sv_f64_t ay = svabs_f64_x (pg, y); + svfloat64_t ax = svabs_x (pg, x); + svfloat64_t ay = svabs_x (pg, y); - svbool_t pred_xlt0 = svcmplt_f64 (pg, x, sv_f64 (0.0)); - svbool_t pred_aygtax = svcmpgt_f64 (pg, ay, ax); + svbool_t pred_xlt0 = svcmplt (pg, x, 0.0); + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); /* Set up z for call to atan. */ - sv_f64_t n = svsel_f64 (pred_aygtax, svneg_f64_x (pg, ax), ay); - sv_f64_t d = svsel_f64 (pred_aygtax, ay, ax); - sv_f64_t z = svdiv_f64_x (pg, n, d); + svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat64_t d = svsel (pred_aygtax, ay, ax); + svfloat64_t z = svdiv_x (pg, n, d); /* Work out the correct shift. */ - sv_f64_t shift = svsel_f64 (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); - shift = svsel_f64 (pred_aygtax, svadd_n_f64_x (pg, shift, 1.0), shift); - shift = svmul_f64_x (pg, shift, PiOver2); + svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); + shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift); + shift = svmul_x (pg, shift, data_ptr->pi_over_2); - sv_f64_t ret = __sv_atan_common (pg, pg, z, z, shift); + /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ + svfloat64_t z2 = svmul_x (pg, z, z); + svfloat64_t x2 = svmul_x (pg, z2, z2); + svfloat64_t x4 = svmul_x (pg, x2, x2); + svfloat64_t x8 = svmul_x (pg, x4, x4); + + svfloat64_t ret = svmla_x ( + pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly), + sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat64_t z3 = svmul_x (pg, z2, z); + ret = svmla_x (pg, z, z3, ret); + + ret = svadd_m (pg, ret, shift); /* Account for the sign of x and y. */ - ret = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (ret), sign_xy)); + ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)); if (unlikely (svptest_any (pg, cmp_xy))) - { - return specialcase (y, x, ret, cmp_xy); - } + return special_case (y, x, ret, cmp_xy); return ret; } -PL_ALIAS (__sv_atan2_x, _ZGVsMxvv_atan2) - /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
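For reference, the quadrant reduction performed by the function above has this scalar shape, with the zero/inf/nan cases omitted and the polynomial stubbed by the libm atan; atan2_reduce is an illustrative name:

#include <math.h>

/* Reduce to atan on [-1, 1] plus a multiple of pi/2, then restore
   the combined sign of x and y, as the final sveor does above.  */
static double
atan2_reduce (double y, double x)
{
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  double ax = fabs (x), ay = fabs (y);
  int aygtax = ay > ax;
  double z = aygtax ? -ax / ay : ay / ax;
  double shift = (x < 0.0 ? -2.0 : 0.0) + (aygtax ? 1.0 : 0.0);
  double ret = shift * pi_over_2 + atan (z);
  return (signbit (x) != signbit (y)) ? -ret : ret;
}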
*/ PL_SIG (SV, D, 2, atan2) -PL_TEST_ULP (__sv_atan2, 1.78) -PL_TEST_INTERVAL (__sv_atan2, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (__sv_atan2, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan2, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan2, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (__sv_atan2, 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (SV_NAME_D2 (atan2), 1.78) +PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 100, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_D2 (atan2), -0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c index f7674c441f2f..9ff73ecb74ba 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c +++ b/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c @@ -8,87 +8,101 @@ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f32.h" -#if SV_SUPPORTED +static const struct data +{ + float32_t poly[8]; + float32_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ + .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, + .pi_over_2 = 0x1.921fb6p+0f, +}; -#include "sv_atanf_common.h" - -/* Useful constants. */ -#define PiOver2 sv_f32 (0x1.921fb6p+0f) #define SignMask sv_u32 (0x80000000) /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ -static inline sv_f32_t -specialcase (sv_f32_t y, sv_f32_t x, sv_f32_t ret, const svbool_t cmp) +static inline svfloat32_t +special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret, + const svbool_t cmp) { return sv_call2_f32 (atan2f, y, x, ret, cmp); } -/* Returns a predicate indicating true if the input is the bit representation of - 0, infinity or nan. */ +/* Returns a predicate indicating true if the input is the bit representation + of 0, infinity or nan. */ static inline svbool_t -zeroinfnan (sv_u32_t i, const svbool_t pg) +zeroinfnan (svuint32_t i, const svbool_t pg) { - return svcmpge_u32 (pg, svsub_n_u32_x (pg, svlsl_n_u32_x (pg, i, 1), 1), - sv_u32 (2 * 0x7f800000lu - 1)); + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1), + sv_u32 (2 * 0x7f800000lu - 1)); } -/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2) - with reduction to [0,1] using z=1/x and shift = pi/2. - Maximum observed error is 2.95 ULP: - __sv_atan2f(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ -sv_f32_t -__sv_atan2f_x (sv_f32_t y, sv_f32_t x, const svbool_t pg) +/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * + P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum + observed error is 2.95 ULP: + _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. 
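The zeroinfnan helper in both atan2 variants leans on a compact encoding trick: doubling the bit pattern discards the sign, and the unsigned wrap-around of the subtraction sends zero to UINT32_MAX, so a single unsigned compare catches +/-0, +/-inf and every NaN. A scalar sketch; is_zeroinfnan is an illustrative name:

#include <stdint.h>

/* True iff i is the bit pattern of +/-0, +/-inf or a NaN.  */
static int
is_zeroinfnan (uint32_t i)
{
  return (uint32_t) (2 * i - 1) >= 2u * 0x7f800000 - 1;
}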
*/ +svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) { - sv_u32_t ix = sv_as_u32_f32 (x); - sv_u32_t iy = sv_as_u32_f32 (y); + const struct data *data_ptr = ptr_barrier (&data); + + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t iy = svreinterpret_u32 (y); svbool_t cmp_x = zeroinfnan (ix, pg); svbool_t cmp_y = zeroinfnan (iy, pg); - svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); - sv_u32_t sign_x = svand_u32_x (pg, ix, SignMask); - sv_u32_t sign_y = svand_u32_x (pg, iy, SignMask); - sv_u32_t sign_xy = sveor_u32_x (pg, sign_x, sign_y); + svuint32_t sign_x = svand_x (pg, ix, SignMask); + svuint32_t sign_y = svand_x (pg, iy, SignMask); + svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y); - sv_f32_t ax = svabs_f32_x (pg, x); - sv_f32_t ay = svabs_f32_x (pg, y); + svfloat32_t ax = svabs_x (pg, x); + svfloat32_t ay = svabs_x (pg, y); - svbool_t pred_xlt0 = svcmplt_f32 (pg, x, sv_f32 (0.0)); - svbool_t pred_aygtax = svcmpgt_f32 (pg, ay, ax); + svbool_t pred_xlt0 = svcmplt (pg, x, 0.0); + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); /* Set up z for call to atan. */ - sv_f32_t n = svsel_f32 (pred_aygtax, svneg_f32_x (pg, ax), ay); - sv_f32_t d = svsel_f32 (pred_aygtax, ay, ax); - sv_f32_t z = svdiv_f32_x (pg, n, d); + svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat32_t d = svsel (pred_aygtax, ay, ax); + svfloat32_t z = svdiv_x (pg, n, d); /* Work out the correct shift. */ - sv_f32_t shift = svsel_f32 (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0)); - shift = svsel_f32 (pred_aygtax, svadd_n_f32_x (pg, shift, 1.0), shift); - shift = svmul_f32_x (pg, shift, PiOver2); + svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0)); + shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift); + shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2)); - sv_f32_t ret = __sv_atanf_common (pg, pg, z, z, shift); + /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ + svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t z8 = svmul_x (pg, z4, z4); + + svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly); + + /* ret = shift + z + z^3 * P(z^2). */ + svfloat32_t z3 = svmul_x (pg, z2, z); + ret = svmla_x (pg, z, z3, ret); + + ret = svadd_m (pg, ret, shift); /* Account for the sign of x and y. */ - ret = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (ret), sign_xy)); + ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)); if (unlikely (svptest_any (pg, cmp_xy))) - { - return specialcase (y, x, ret, cmp_xy); - } + return special_case (y, x, ret, cmp_xy); return ret; } -PL_ALIAS (__sv_atan2f_x, _ZGVsMxvv_atan2f) - /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ PL_SIG (SV, F, 2, atan2) -PL_TEST_ULP (__sv_atan2f, 2.45) -PL_TEST_INTERVAL (__sv_atan2f, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (__sv_atan2f, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan2f, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan2f, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (__sv_atan2f, 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (SV_NAME_F2 (atan2), 2.45) +PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 100, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_F2 (atan2), -0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c index 02ac331970c9..7ab486a4c9d2 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c @@ -8,55 +8,80 @@ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f64.h" -#if SV_SUPPORTED - -#include "sv_atan_common.h" +static const struct data +{ + float64_t poly[20]; + float64_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, + .pi_over_2 = 0x1.921fb54442d18p+0, +}; /* Useful constants. */ -#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) -#define AbsMask (0x7fffffffffffffff) +#define SignMask (0x8000000000000000) /* Fast implementation of SVE atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed error is 2.27 ulps: - __sv_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ -sv_f64_t -__sv_atan_x (sv_f64_t x, const svbool_t pg) + _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* No need to trigger special case. Small cases, infs and nans are supported by our approximation technique. */ - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t sign = svand_x (pg, ix, SignMask); /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt_n_f64 (pg, x, 1.0); + svbool_t red = svacgt (pg, x, 1.0); /* Avoid dependency in abs(x) in division (and comparison). */ - sv_f64_t z = svsel_f64 (red, svdiv_f64_x (pg, sv_f64 (-1.0), x), x); + svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x); /* Use absolute value only when needed (odd powers of z). */ - sv_f64_t az = svabs_f64_x (pg, z); - az = svneg_f64_m (az, red, az); + svfloat64_t az = svabs_x (pg, z); + az = svneg_m (az, red, az); - sv_f64_t y = __sv_atan_common (pg, red, z, az, PiOver2); + /* Use split Estrin scheme for P(z^2) with deg(P)=19. 
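The reduction above applies atan(x) = sign(x) * (pi/2 - atan(1/|x|)) for |x| > 1, realised in vector form as z = 1/x, a negated |z| and a predicated pi/2 shift. A scalar sketch with the core polynomial stubbed by the libm call; atan_reduced is an illustrative name:

#include <math.h>

/* For |x| > 1, fold into the primary interval and shift by pi/2;
   the sign is restored at the end because atan is odd.  */
static double
atan_reduced (double x)
{
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  double ax = fabs (x);
  double y = ax > 1.0 ? pi_over_2 - atan (1.0 / ax) : atan (ax);
  return copysign (y, x);
}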
*/ + svfloat64_t z2 = svmul_x (pg, z, z); + svfloat64_t x2 = svmul_x (pg, z2, z2); + svfloat64_t x4 = svmul_x (pg, x2, x2); + svfloat64_t x8 = svmul_x (pg, x4, x4); + + svfloat64_t y + = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly), + sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat64_t z3 = svmul_x (pg, z2, az); + y = svmla_x (pg, az, z3, y); + + /* Apply shift as indicated by `red` predicate. */ + y = svadd_m (red, y, d->pi_over_2); /* y = atan(x) if x>0, -atan(-x) otherwise. */ - y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); return y; } -PL_ALIAS (__sv_atan_x, _ZGVsMxv_atan) - PL_SIG (SV, D, 1, atan, -3.1, 3.1) -PL_TEST_ULP (__sv_atan, 1.78) -PL_TEST_INTERVAL (__sv_atan, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (__sv_atan, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (__sv_atan, 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (SV_NAME_D1 (atan), 1.78) +PL_TEST_INTERVAL (SV_NAME_D1 (atan), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (atan), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (atan), 100, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (atan), -0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan_common.h b/contrib/arm-optimized-routines/pl/math/sv_atan_common.h deleted file mode 100644 index bfe6998d2416..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_atan_common.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Double-precision polynomial evaluation function for SVE atan(x) and - * atan2(y,x). - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" -#include "sv_math.h" - -#define P(i) sv_f64 (__atan_poly_data.poly[i]) - -/* Polynomial used in fast SVE atan(x) and atan2(y,x) implementations - The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ -static inline sv_f64_t -__sv_atan_common (svbool_t pg, svbool_t red, sv_f64_t z, sv_f64_t az, - sv_f64_t shift) -{ - /* Use full Estrin scheme for P(z^2) with deg(P)=19. */ - sv_f64_t z2 = svmul_f64_x (pg, z, z); - - /* Level 1. */ - sv_f64_t P_1_0 = sv_fma_f64_x (pg, P (1), z2, P (0)); - sv_f64_t P_3_2 = sv_fma_f64_x (pg, P (3), z2, P (2)); - sv_f64_t P_5_4 = sv_fma_f64_x (pg, P (5), z2, P (4)); - sv_f64_t P_7_6 = sv_fma_f64_x (pg, P (7), z2, P (6)); - sv_f64_t P_9_8 = sv_fma_f64_x (pg, P (9), z2, P (8)); - sv_f64_t P_11_10 = sv_fma_f64_x (pg, P (11), z2, P (10)); - sv_f64_t P_13_12 = sv_fma_f64_x (pg, P (13), z2, P (12)); - sv_f64_t P_15_14 = sv_fma_f64_x (pg, P (15), z2, P (14)); - sv_f64_t P_17_16 = sv_fma_f64_x (pg, P (17), z2, P (16)); - sv_f64_t P_19_18 = sv_fma_f64_x (pg, P (19), z2, P (18)); - - /* Level 2. */ - sv_f64_t x2 = svmul_f64_x (pg, z2, z2); - sv_f64_t P_3_0 = sv_fma_f64_x (pg, P_3_2, x2, P_1_0); - sv_f64_t P_7_4 = sv_fma_f64_x (pg, P_7_6, x2, P_5_4); - sv_f64_t P_11_8 = sv_fma_f64_x (pg, P_11_10, x2, P_9_8); - sv_f64_t P_15_12 = sv_fma_f64_x (pg, P_15_14, x2, P_13_12); - sv_f64_t P_19_16 = sv_fma_f64_x (pg, P_19_18, x2, P_17_16); - - /* Level 3. */ - sv_f64_t x4 = svmul_f64_x (pg, x2, x2); - sv_f64_t P_7_0 = sv_fma_f64_x (pg, P_7_4, x4, P_3_0); - sv_f64_t P_15_8 = sv_fma_f64_x (pg, P_15_12, x4, P_11_8); - - /* Level 4. 
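The split Estrin evaluation above computes P(w) = E7(w) + w^8 * E11(w) for w = z^2, where E7 uses poly[0..7] and E11 uses poly[8..19]. A scalar sketch with an illustrative helper (the real sv_estrin_* helpers in poly_sve_f64.h do the same pairing with predicated FMLAs):

/* Illustrative scalar split Estrin: a degree-19 polynomial in w is
   built from degree-3 blocks.  c points at 20 coefficients.  */
static double
estrin_3 (double w, double w2, const double *c)
{
  return (c[0] + c[1] * w) + w2 * (c[2] + c[3] * w);
}

static double
poly_19 (double w, const double *c)
{
  double w2 = w * w, w4 = w2 * w2, w8 = w4 * w4;
  double e7 = estrin_3 (w, w2, c) + w4 * estrin_3 (w, w2, c + 4);
  double e11 = estrin_3 (w, w2, c + 8) + w4 * estrin_3 (w, w2, c + 12)
	       + w8 * estrin_3 (w, w2, c + 16);
  return e7 + w8 * e11;
}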
*/ - sv_f64_t x8 = svmul_f64_x (pg, x4, x4); - sv_f64_t y = sv_fma_f64_x (pg, P_19_16, x8, P_15_8); - y = sv_fma_f64_x (pg, y, x8, P_7_0); - - /* Finalize. y = shift + z + z^3 * P(z^2). */ - sv_f64_t z3 = svmul_f64_x (pg, z2, az); - y = sv_fma_f64_x (pg, y, z3, az); - - /* Apply shift as indicated by `red` predicate. */ - y = svadd_f64_m (red, y, shift); - - return y; -} diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c b/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c index 8d38e42b2290..4defb356e7f9 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c +++ b/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c @@ -8,52 +8,69 @@ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f32.h" -#if SV_SUPPORTED +static const struct data +{ + float32_t poly[8]; + float32_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ + .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, + .pi_over_2 = 0x1.921fb6p+0f, +}; -#include "sv_atanf_common.h" - -#define PiOver2 sv_f32 (0x1.921fb6p+0f) -#define AbsMask (0x7fffffff) +#define SignMask (0x80000000) /* Fast implementation of SVE atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. Largest observed error is 2.9 ULP, close to +/-1.0: - __sv_atanf(0x1.0468f6p+0) got -0x1.967f06p-1 - want -0x1.967fp-1. */ -sv_f32_t -__sv_atanf_x (sv_f32_t x, const svbool_t pg) + _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1 + want -0x1.967fp-1. */ +svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* No need to trigger special case. Small cases, infs and nans are supported by our approximation technique. */ - sv_u32_t ix = sv_as_u32_f32 (x); - sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask); + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t sign = svand_x (pg, ix, SignMask); /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt_n_f32 (pg, x, 1.0f); + svbool_t red = svacgt (pg, x, 1.0f); /* Avoid dependency in abs(x) in division (and comparison). */ - sv_f32_t z = svsel_f32 (red, svdiv_f32_x (pg, sv_f32 (-1.0f), x), x); + svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x); /* Use absolute value only when needed (odd powers of z). */ - sv_f32_t az = svabs_f32_x (pg, z); - az = svneg_f32_m (az, red, az); + svfloat32_t az = svabs_x (pg, z); + az = svneg_m (az, red, az); - sv_f32_t y = __sv_atanf_common (pg, red, z, az, PiOver2); + /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ + svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t z8 = svmul_x (pg, z4, z4); + + svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat32_t z3 = svmul_x (pg, z2, az); + y = svmla_x (pg, az, z3, y); + + /* Apply shift as indicated by 'red' predicate. */ + y = svadd_m (red, y, sv_f32 (d->pi_over_2)); /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ - return sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign)); + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); } -PL_ALIAS (__sv_atanf_x, _ZGVsMxv_atanf) - PL_SIG (SV, F, 1, atan, -3.1, 3.1) -PL_TEST_ULP (__sv_atanf, 2.9) -PL_TEST_INTERVAL (__sv_atanf, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (__sv_atanf, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atanf, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atanf, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (__sv_atanf, 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (SV_NAME_F1 (atan), 2.9) +PL_TEST_INTERVAL (SV_NAME_F1 (atan), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (atan), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (atan), 100, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (atan), -0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h b/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h deleted file mode 100644 index dc45effec1cd..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Single-precision polynomial evaluation function for SVE atan(x) and - * atan2(y,x). - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_SV_ATANF_COMMON_H -#define PL_MATH_SV_ATANF_COMMON_H - -#include "math_config.h" -#include "sv_math.h" - -#define P(i) sv_f32 (__atanf_poly_data.poly[i]) - -/* Polynomial used in fast SVE atanf(x) and atan2f(y,x) implementations - The order 7 polynomial P approximates (f(sqrt(x))-sqrt(x))/x^(3/2). */ -static inline sv_f32_t -__sv_atanf_common (svbool_t pg, svbool_t red, sv_f32_t z, sv_f32_t az, - sv_f32_t shift) -{ - /* Use full Estrin scheme for P(z^2) with deg(P)=7. */ - - /* First compute square powers of z. */ - sv_f32_t z2 = svmul_f32_x (pg, z, z); - sv_f32_t z4 = svmul_f32_x (pg, z2, z2); - sv_f32_t z8 = svmul_f32_x (pg, z4, z4); - - /* Then assemble polynomial. */ - sv_f32_t p_4_7 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (7), P (6))), - (sv_fma_f32_x (pg, z2, P (5), P (4)))); - sv_f32_t p_0_3 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (3), P (2))), - (sv_fma_f32_x (pg, z2, P (1), P (0)))); - sv_f32_t y = sv_fma_f32_x (pg, z8, p_4_7, p_0_3); - - /* Finalize. y = shift + z + z^3 * P(z^2). */ - sv_f32_t z3 = svmul_f32_x (pg, z2, az); - y = sv_fma_f32_x (pg, y, z3, az); - - /* Apply shift as indicated by 'red' predicate. */ - y = svadd_f32_m (red, y, shift); - - return y; -} - -#endif // PL_MATH_SV_ATANF_COMMON_H diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanh_3u3.c b/contrib/arm-optimized-routines/pl/math/sv_atanh_3u3.c new file mode 100644 index 000000000000..dcc9350b4962 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atanh_3u3.c @@ -0,0 +1,60 @@ +/* + * Double-precision SVE atanh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define WANT_SV_LOG1P_K0_SHORTCUT 0 +#include "sv_log1p_inline.h" + +#define One (0x3ff0000000000000) +#define Half (0x3fe0000000000000) + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (atanh, x, y, special); +} + +/* SVE approximation for double-precision atanh, based on log1p. + The greatest observed error is 2.81 ULP: + _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. 
*/
+svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
+{
+  svfloat64_t ax = svabs_x (pg, x);
+  svuint64_t iax = svreinterpret_u64 (ax);
+  svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+  svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
+
+  /* It is special if iax >= 1, i.e. |x| >= 1 or x is NaN.  */
+  svbool_t special = svcmpge (pg, iax, One);
+
+  /* Computation is performed based on the following equality:
+     (1+x)/(1-x) = 1 + 2x/(1-x). */
+  svfloat64_t y;
+  y = svadd_x (pg, ax, ax);
+  y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1), ax));
+  /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
+  y = sv_log1p_inline (y, pg);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svmul_x (pg, halfsign, y), special);
+  return svmul_x (pg, halfsign, y);
+}
+
+PL_SIG (SV, D, 1, atanh, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_D1 (atanh), 3.32)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+   -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+   control lane is irrelevant if fp exceptions are disabled). */
+PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0)
+PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0)
+PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 1, inf, 100, 0)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanhf_2u8.c b/contrib/arm-optimized-routines/pl/math/sv_atanhf_2u8.c
new file mode 100644
index 000000000000..413c60ce05da
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_atanhf_2u8.c
@@ -0,0 +1,56 @@
+/*
+ * Single-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_log1pf_inline.h"
+
+#define One (0x3f800000)
+#define Half (0x3f000000)
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+  return sv_call_f32 (atanhf, x, y, special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+   The maximum error is 2.28 ULP:
+   _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
+				 want 0x1.ffbbb6p-5. */
+svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
+{
+  svfloat32_t ax = svabs_x (pg, x);
+  svuint32_t iax = svreinterpret_u32 (ax);
+  svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+  svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, Half));
+  svbool_t special = svcmpge (pg, iax, One);
+
+  /* Computation is performed based on the following equality:
+   * (1+x)/(1-x) = 1 + 2x/(1-x). */
+  svfloat32_t y = svadd_x (pg, ax, ax);
+  y = svdiv_x (pg, y, svsub_x (pg, sv_f32 (1), ax));
+  /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
+  y = sv_log1pf_inline (y, pg);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svmul_x (pg, halfsign, y), special);
+
+  return svmul_x (pg, halfsign, y);
+}
+
+PL_SIG (SV, F, 1, atanh, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_F1 (atanh), 2.59)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+   -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+   control lane is irrelevant if fp exceptions are disabled).
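Both routines above implement the same identity. A scalar model (atanh_via_log1p is a hypothetical name, and the |x| >= 1 special cases are excluded):

#include <math.h>

/* Scalar model: atanh(x) = 0.5 * sign(x) * log1p (2|x| / (1 - |x|)),
   since (1+|x|)/(1-|x|) = 1 + 2|x|/(1-|x|) and atanh is odd.  */
static double
atanh_via_log1p (double x)
{
  double ax = fabs (x);
  double halfsign = copysign (0.5, x); /* sign bit ORed into 0.5.  */
  double y = (ax + ax) / (1.0 - ax);   /* 2x/(1-x).  */
  return halfsign * log1p (y);	       /* ln ((1+x)/(1-x)) / 2.  */
}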
*/ +PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0, 0x1p-12, 1000, 0) +PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0x1p-12, 1, 20000, 0) +PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 1, inf, 1000, 0) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/sv_cbrt_2u.c new file mode 100644 index 000000000000..192f1cd80d59 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cbrt_2u.c @@ -0,0 +1,122 @@ +/* + * Double-precision SVE cbrt(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_sve_f64.h" + +const static struct data +{ + float64_t poly[4]; + float64_t table[5]; + float64_t one_third, two_thirds, shift; + int64_t exp_bias; + uint64_t tiny_bound, thresh; +} data = { + /* Generated with FPMinimax in [0.5, 1]. */ + .poly = { 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, + 0x1.2c74eaa3ba428p-3, }, + /* table[i] = 2^((i - 2) / 3). */ + .table = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, + 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0, }, + .one_third = 0x1.5555555555555p-2, + .two_thirds = 0x1.5555555555555p-1, + .shift = 0x1.8p52, + .exp_bias = 1022, + .tiny_bound = 0x0010000000000000, /* Smallest normal. */ + .thresh = 0x7fe0000000000000, /* asuint64 (infinity) - tiny_bound. */ +}; + +#define MantissaMask 0x000fffffffffffff +#define HalfExp 0x3fe0000000000000 + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (cbrt, x, y, special); +} + +static inline svfloat64_t +shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i) +{ + return svld1_gather_index (pg, table, svadd_x (pg, i, 2)); +} + +/* Approximation for double-precision vector cbrt(x), using low-order + polynomial and two Newton iterations. Greatest observed error is 1.79 ULP. + Errors repeat according to the exponent, for instance an error observed for + double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i + is an integer. + _ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342 + want 0x1.965f53b0e5d95p-342. */ +svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svuint64_t iax = svreinterpret_u64 (ax); + svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax); + + /* Subnormal, +/-0 and special values. */ + svbool_t special = svcmpge (pg, svsub_x (pg, iax, d->tiny_bound), d->thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. */ + svfloat64_t m = svreinterpret_f64 (svorr_x ( + pg, svand_x (pg, svreinterpret_u64 (x), MantissaMask), HalfExp)); + svint64_t e + = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, iax, 52)), d->exp_bias); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point + for Newton iterations. */ + svfloat64_t p + = sv_pairwise_poly_3_f64_x (pg, m, svmul_x (pg, m, m), d->poly); + + /* Two iterations of Newton's method for iteratively approximating cbrt. 
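The iteration referred to in the comment above is a plain Newton step for f(a) = a^3 - m, so each application roughly squares the error of the polynomial guess; two steps suffice in double precision. A scalar sketch (newton_cbrt_step is a hypothetical name):

/* Scalar form of the update used below: with f (a) = a^3 - m,
   a - f (a) / f' (a) = (2/3) a + (m/3) / a^2, which is exactly the
   svmla/svdiv pair in the vector code.  */
static double
newton_cbrt_step (double a, double m_by_3)
{
  return a * (2.0 / 3.0) + m_by_3 / (a * a);
}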
*/ + svfloat64_t m_by_3 = svmul_x (pg, m, d->one_third); + svfloat64_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p, + d->two_thirds); + a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, a, a)), a, d->two_thirds); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + svfloat64_t eb3f = svmul_x (pg, svcvt_f64_x (pg, e), d->one_third); + svint64_t ey = svcvt_s64_x (pg, eb3f); + svint64_t em3 = svmls_x (pg, e, ey, 3); + + svfloat64_t my = shifted_lookup (pg, d->table, em3); + my = svmul_x (pg, my, a); + + /* Vector version of ldexp. */ + svfloat64_t y = svscale_x (pg, my, ey); + + if (unlikely (svptest_any (pg, special))) + return special_case ( + x, svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)), + special); + + /* Copy sign. */ + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); +} + +PL_SIG (SV, D, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_D1 (cbrt), 1.30) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cbrtf_1u7.c b/contrib/arm-optimized-routines/pl/math/sv_cbrtf_1u7.c new file mode 100644 index 000000000000..5b625f308827 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cbrtf_1u7.c @@ -0,0 +1,116 @@ +/* + * Single-precision SVE cbrt(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_sve_f32.h" + +const static struct data +{ + float32_t poly[4]; + float32_t table[5]; + float32_t one_third, two_thirds; +} data = { + /* Very rough approximation of cbrt(x) in [0.5, 1], generated with FPMinimax. + */ + .poly = { 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, + 0x1.2c74c2p-3, }, + /* table[i] = 2^((i - 2) / 3). */ + .table = { 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 }, + .one_third = 0x1.555556p-2f, + .two_thirds = 0x1.555556p-1f, +}; + +#define SmallestNormal 0x00800000 +#define Thresh 0x7f000000 /* asuint(INFINITY) - SmallestNormal. */ +#define MantissaMask 0x007fffff +#define HalfExp 0x3f000000 + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (cbrtf, x, y, special); +} + +static inline svfloat32_t +shifted_lookup (const svbool_t pg, const float32_t *table, svint32_t i) +{ + return svld1_gather_index (pg, table, svadd_x (pg, i, 2)); +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration + with initial guess obtained by a low-order polynomial. Greatest error + is 1.64 ULP. This is observed for every value where the mantissa is + 0x1.85a2aa and the exponent is a multiple of 3, for example: + _ZGVsMxv_cbrtf (0x1.85a2aap+3) got 0x1.267936p+1 + want 0x1.267932p+1. 
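The decomposition and reassembly described above condense into a short scalar skeleton (cbrt_skeleton is a hypothetical name, libm cbrt stands in for the polynomial guess plus Newton steps, ldexp stands in for svscale, and subnormal/special inputs are excluded):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Scalar skeleton of the algorithm above: split |x| = m * 2^e with
   m in [0.5, 1.0), take cbrt(m), then scale by 2^(i/3) from the table
   and by 2^ey, where e = 3 * ey + i with i in [-2, 2].  */
static double
cbrt_skeleton (double x)
{
  static const double T[5]	/* 2^((i - 2) / 3).  */
      = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
	  0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 };
  uint64_t ix;
  memcpy (&ix, &x, sizeof x);
  uint64_t iax = ix & 0x7fffffffffffffff;
  /* Bit-level frexp: mask in the exponent of 0.5 (HalfExp).  */
  int64_t e = (int64_t) (iax >> 52) - 1022;	/* exp_bias.  */
  uint64_t im = (iax & 0x000fffffffffffff) | 0x3fe0000000000000;
  double m;
  memcpy (&m, &im, sizeof m);
  double a = cbrt (m);		/* poly guess + Newton steps above.  */
  /* Truncating cast matches svcvt; the rounding of 1/3 makes exact
     multiples of 3 land on integers before truncation.  */
  int64_t ey = (int64_t) ((double) e * 0x1.5555555555555p-2);
  int64_t i = e - 3 * ey;	/* in [-2, 2].  */
  return copysign (ldexp (a * T[i + 2], (int) ey), x);
}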
*/ +svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + + /* Subnormal, +/-0 and special values. */ + svbool_t special = svcmpge (pg, svsub_x (pg, iax, SmallestNormal), Thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + svfloat32_t m = svreinterpret_f32 (svorr_x ( + pg, svand_x (pg, svreinterpret_u32 (x), MantissaMask), HalfExp)); + svint32_t e = svsub_x (pg, svreinterpret_s32 (svlsr_x (pg, iax, 23)), 126); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + svfloat32_t p + = sv_pairwise_poly_3_f32_x (pg, m, svmul_x (pg, m, m), d->poly); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + svfloat32_t m_by_3 = svmul_x (pg, m, d->one_third); + svfloat32_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p, + d->two_thirds); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + svfloat32_t ef = svmul_x (pg, svcvt_f32_x (pg, e), d->one_third); + svint32_t ey = svcvt_s32_x (pg, ef); + svint32_t em3 = svmls_x (pg, e, ey, 3); + + svfloat32_t my = shifted_lookup (pg, d->table, em3); + my = svmul_x (pg, my, a); + + /* Vector version of ldexpf. */ + svfloat32_t y = svscale_x (pg, my, ey); + + if (unlikely (svptest_any (pg, special))) + return special_case ( + x, svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)), + special); + + /* Copy sign. */ + return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); +} + +PL_SIG (SV, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (cbrt), 1.15) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cexpi_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_cexpi_3u5.c new file mode 100644 index 000000000000..920acfea5da0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cexpi_3u5.c @@ -0,0 +1,45 @@ +/* + * Double-precision vector cexpi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_sincos_common.h" +#include "sv_math.h" +#include "pl_test.h" + +static svfloat64x2_t NOINLINE +special_case (svfloat64_t x, svbool_t special, svfloat64x2_t y) +{ + return svcreate2 (sv_call_f64 (sin, x, svget2 (y, 0), special), + sv_call_f64 (cos, x, svget2 (y, 1), special)); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + sv_cexpi_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. 
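A hypothetical caller of the cexpi routine defined below, showing how the paired results are consumed (polar_to_cart is an illustrative name; this must be compiled for SVE):

#include <arm_sve.h>

/* Prototype as defined below.  */
svfloat64x2_t _ZGVsMxv_cexpi (svfloat64_t, svbool_t);

/* One cexpi call yields both coordinates of a unit vector per lane,
   sharing a single argument reduction between sin and cos.  */
static void
polar_to_cart (const double *theta, double *xs, double *ys, int n)
{
  for (int i = 0; i < n; i += svcntd ())
    {
      svbool_t pg = svwhilelt_b64 (i, n);
      svfloat64x2_t sc = _ZGVsMxv_cexpi (svld1 (pg, theta + i), pg);
      svst1 (pg, ys + i, svget2 (sc, 0)); /* sin lanes.  */
      svst1 (pg, xs + i, svget2 (sc, 1)); /* cos lanes.  */
    }
}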
*/ +svfloat64x2_t +_ZGVsMxv_cexpi (svfloat64_t x, svbool_t pg) +{ + const struct sv_sincos_data *d = ptr_barrier (&sv_sincos_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat64x2_t sc = sv_sincos_inline (pg, x, d); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, special, sc); + return sc; +} + +PL_TEST_ULP (_ZGVsMxv_cexpi_sin, 2.73) +PL_TEST_ULP (_ZGVsMxv_cexpi_cos, 2.73) +#define SV_CEXPI_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_cexpi_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_cexpi_cos, lo, hi, n) +SV_CEXPI_INTERVAL (0, 0x1p23, 500000) +SV_CEXPI_INTERVAL (-0, -0x1p23, 500000) +SV_CEXPI_INTERVAL (0x1p23, inf, 10000) +SV_CEXPI_INTERVAL (-0x1p23, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cexpif_1u8.c b/contrib/arm-optimized-routines/pl/math/sv_cexpif_1u8.c new file mode 100644 index 000000000000..93f2f998cb38 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cexpif_1u8.c @@ -0,0 +1,47 @@ +/* + * Single-precision vector cexpi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_sincosf_common.h" +#include "sv_math.h" +#include "pl_test.h" + +static svfloat32x2_t NOINLINE +special_case (svfloat32_t x, svbool_t special, svfloat32x2_t y) +{ + return svcreate2 (sv_call_f32 (sinf, x, svget2 (y, 0), special), + sv_call_f32 (cosf, x, svget2 (y, 1), special)); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +svfloat32x2_t +_ZGVsMxv_cexpif (svfloat32_t x, svbool_t pg) +{ + const struct sv_sincosf_data *d = ptr_barrier (&sv_sincosf_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat32x2_t sc = sv_sincosf_inline (pg, x, d); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, special, sc); + return sc; +} + +PL_TEST_ULP (_ZGVsMxv_cexpif_sin, 1.17) +PL_TEST_ULP (_ZGVsMxv_cexpif_cos, 1.31) +#define SV_CEXPIF_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_cexpif_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_cexpif_cos, lo, hi, n) +SV_CEXPIF_INTERVAL (0, 0x1p20, 500000) +SV_CEXPIF_INTERVAL (-0, -0x1p20, 500000) +SV_CEXPIF_INTERVAL (0x1p20, inf, 10000) +SV_CEXPIF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c index 194034802452..76af3459b3f2 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c @@ -9,76 +9,78 @@ #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED - -#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) -#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) -#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) -#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) -/* Original shift used in Neon cos, - plus a contribution to set the bit #0 of q - as expected by trigonometric instructions. 
*/ -#define Shift (sv_f64 (0x1.8000000000001p52)) -#define RangeVal (sv_f64 (0x1p23)) -#define AbsMask (0x7fffffffffffffff) - -static NOINLINE sv_f64_t -__sv_cos_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +static const struct data { - return sv_call_f64 (cos, x, y, cmp); + double inv_pio2, pio2_1, pio2_2, pio2_3, shift; +} data = { + /* Polynomial coefficients are hardwired in FTMAD instructions. */ + .inv_pio2 = 0x1.45f306dc9c882p-1, + .pio2_1 = 0x1.921fb50000000p+0, + .pio2_2 = 0x1.110b460000000p-26, + .pio2_3 = 0x1.1a62633145c07p-54, + /* Original shift used in AdvSIMD cos, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. */ + .shift = 0x1.8000000000001p52 +}; + +#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */ + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t oob) +{ + return sv_call_f64 (cos, x, y, oob); } /* A fast SVE implementation of cos based on trigonometric instructions (FTMAD, FTSSEL, FTSMUL). Maximum measured error: 2.108 ULPs. - __sv_cos(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 - want -0x1.fddd4c65c7f05p-3. */ -sv_f64_t -__sv_cos_x (sv_f64_t x, const svbool_t pg) + SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 + want -0x1.fddd4c65c7f05p-3. */ +svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg) { - sv_f64_t n, r, r2, y; - svbool_t cmp; + const struct data *d = ptr_barrier (&data); - r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); - cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); + svfloat64_t r = svabs_x (pg, x); + svbool_t oob = svcmpge (pg, svreinterpret_u64 (r), RangeVal); + + /* Load some constants in quad-word chunks to minimise memory access. */ + svbool_t ptrue = svptrue_b64 (); + svfloat64_t invpio2_and_pio2_1 = svld1rq (ptrue, &d->inv_pio2); + svfloat64_t pio2_23 = svld1rq (ptrue, &d->pio2_2); /* n = rint(|x|/(pi/2)). */ - sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); - n = svsub_f64_x (pg, q, Shift); + svfloat64_t q = svmla_lane (sv_f64 (d->shift), r, invpio2_and_pio2_1, 0); + svfloat64_t n = svsub_x (pg, q, d->shift); /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ - r = sv_fma_f64_x (pg, NegPio2_1, n, r); - r = sv_fma_f64_x (pg, NegPio2_2, n, r); - r = sv_fma_f64_x (pg, NegPio2_3, n, r); + r = svmls_lane (r, n, invpio2_and_pio2_1, 1); + r = svmls_lane (r, n, pio2_23, 0); + r = svmls_lane (r, n, pio2_23, 1); /* cos(r) poly approx. */ - r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); - y = sv_f64 (0.0); - y = svtmad_f64 (y, r2, 7); - y = svtmad_f64 (y, r2, 6); - y = svtmad_f64 (y, r2, 5); - y = svtmad_f64 (y, r2, 4); - y = svtmad_f64 (y, r2, 3); - y = svtmad_f64 (y, r2, 2); - y = svtmad_f64 (y, r2, 1); - y = svtmad_f64 (y, r2, 0); + svfloat64_t r2 = svtsmul (r, svreinterpret_u64 (q)); + svfloat64_t y = sv_f64 (0.0); + y = svtmad (y, r2, 7); + y = svtmad (y, r2, 6); + y = svtmad (y, r2, 5); + y = svtmad (y, r2, 4); + y = svtmad (y, r2, 3); + y = svtmad (y, r2, 2); + y = svtmad (y, r2, 1); + y = svtmad (y, r2, 0); /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ - sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); - /* Apply factor. */ - y = svmul_f64_x (pg, f, y); + svfloat64_t f = svtssel (r, svreinterpret_u64 (q)); - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. 
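The shift constant above does two jobs: it rounds x/(pi/2) to an integer inside the FP pipeline, and its extra low bit presets bit #0 of q as FTSSEL/FTMAD expect. A scalar sketch of the rounding half (round_with_shift is a hypothetical name; valid for |x| below RangeVal):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Adding 0x1.8p52 forces the ulp of the sum to 1, so the FPU's
   round-to-nearest turns the fractional part into an integer held in
   the low mantissa bits; subtracting the shift back recovers n, and
   the low bit of q's payload gives the parity of n.  */
static double
round_with_shift (double x, uint64_t *q_bits)
{
  const double shift = 0x1.8p52;
  double q = fma (x, 0x1.45f306dc9c882p-1, shift); /* x/(pi/2) + shift.  */
  memcpy (q_bits, &q, sizeof q);   /* low bits hold n plus an offset.  */
  return q - shift;		   /* n = rint (x / (pi/2)).  */
}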
*/ - if (unlikely (svptest_any (pg, cmp))) - return __sv_cos_specialcase (x, y, cmp); - return y; + if (unlikely (svptest_any (pg, oob))) + return special_case (x, svmul_x (svnot_z (pg, oob), y, f), oob); + + /* Apply factor. */ + return svmul_x (pg, f, y); } -PL_ALIAS (__sv_cos_x, _ZGVsMxv_cos) - PL_SIG (SV, D, 1, cos, -3.1, 3.1) -PL_TEST_ULP (__sv_cos, 1.61) -PL_TEST_INTERVAL (__sv_cos, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (__sv_cos, 0x1p-4, 0x1p4, 500000) -#endif +PL_TEST_ULP (SV_NAME_D1 (cos), 1.61) +PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0x1p-4, 0x1p4, 500000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c b/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c index 8f138bcba7af..4bdb0dd146bb 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c +++ b/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c @@ -9,74 +9,72 @@ #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED - -#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) -#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) -#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) -#define RangeVal (sv_f32 (0x1p20f)) -#define InvPio2 (sv_f32 (0x1.45f306p-1f)) -/* Original shift used in Neon cosf, - plus a contribution to set the bit #0 of q - as expected by trigonometric instructions. */ -#define Shift (sv_f32 (0x1.800002p+23f)) -#define AbsMask (0x7fffffff) - -static NOINLINE sv_f32_t -__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +static const struct data { - return sv_call_f32 (cosf, x, y, cmp); + float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift; +} data = { + /* Polynomial coefficients are hard-wired in FTMAD instructions. */ + .neg_pio2_1 = -0x1.921fb6p+0f, + .neg_pio2_2 = 0x1.777a5cp-25f, + .neg_pio2_3 = 0x1.ee59dap-50f, + .inv_pio2 = 0x1.45f306p-1f, + /* Original shift used in AdvSIMD cosf, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. */ + .shift = 0x1.800002p+23f +}; + +#define RangeVal 0x49800000 /* asuint32(0x1p20f). */ + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t oob) +{ + return sv_call_f32 (cosf, x, y, oob); } /* A fast SVE implementation of cosf based on trigonometric instructions (FTMAD, FTSSEL, FTSMUL). Maximum measured error: 2.06 ULPs. - __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6 - want 0x1.fffe76p-6. */ -sv_f32_t -__sv_cosf_x (sv_f32_t x, const svbool_t pg) + SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6 + want 0x1.fffe76p-6. */ +svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg) { - sv_f32_t n, r, r2, y; - svbool_t cmp; + const struct data *d = ptr_barrier (&data); - r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); - cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + svfloat32_t r = svabs_x (pg, x); + svbool_t oob = svcmpge (pg, svreinterpret_u32 (r), RangeVal); + + /* Load some constants in quad-word chunks to minimise memory access. */ + svfloat32_t negpio2_and_invpio2 = svld1rq (svptrue_b32 (), &d->neg_pio2_1); /* n = rint(|x|/(pi/2)). */ - sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift); - n = svsub_f32_x (pg, q, Shift); + svfloat32_t q = svmla_lane (sv_f32 (d->shift), r, negpio2_and_invpio2, 3); + svfloat32_t n = svsub_x (pg, q, d->shift); /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). 
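The three svmla_lane steps below implement a Cody-Waite style reduction: pi/2 is split into three pieces whose leading terms multiply n exactly, so the cancellation in |x| - n*(pi/2) loses almost no bits. A scalar sketch (reduce_pio2f is a hypothetical name, valid below RangeVal):

#include <math.h>

/* Scalar model of the single-precision reduction below; the constants
   are the neg_pio2_1/2/3 entries of the data struct.  */
static float
reduce_pio2f (float ax, float n)
{
  float r = ax;
  r = fmaf (n, -0x1.921fb6p+0f, r); /* + n * neg_pio2_1.  */
  r = fmaf (n, 0x1.777a5cp-25f, r); /* + n * neg_pio2_2.  */
  r = fmaf (n, 0x1.ee59dap-50f, r); /* + n * neg_pio2_3.  */
  return r;			    /* in [-pi/4, pi/4].  */
}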
*/ - r = sv_fma_f32_x (pg, NegPio2_1, n, r); - r = sv_fma_f32_x (pg, NegPio2_2, n, r); - r = sv_fma_f32_x (pg, NegPio2_3, n, r); + r = svmla_lane (r, n, negpio2_and_invpio2, 0); + r = svmla_lane (r, n, negpio2_and_invpio2, 1); + r = svmla_lane (r, n, negpio2_and_invpio2, 2); /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ - sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q)); + svfloat32_t f = svtssel (r, svreinterpret_u32 (q)); /* cos(r) poly approx. */ - r2 = svtsmul_f32 (r, sv_as_u32_f32 (q)); - y = sv_f32 (0.0f); - y = svtmad_f32 (y, r2, 4); - y = svtmad_f32 (y, r2, 3); - y = svtmad_f32 (y, r2, 2); - y = svtmad_f32 (y, r2, 1); - y = svtmad_f32 (y, r2, 0); + svfloat32_t r2 = svtsmul (r, svreinterpret_u32 (q)); + svfloat32_t y = sv_f32 (0.0f); + y = svtmad (y, r2, 4); + y = svtmad (y, r2, 3); + y = svtmad (y, r2, 2); + y = svtmad (y, r2, 1); + y = svtmad (y, r2, 0); + if (unlikely (svptest_any (pg, oob))) + return special_case (x, svmul_x (svnot_z (pg, oob), f, y), oob); /* Apply factor. */ - y = svmul_f32_x (pg, f, y); - - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. */ - if (unlikely (svptest_any (pg, cmp))) - return __sv_cosf_specialcase (x, y, cmp); - return y; + return svmul_x (pg, f, y); } -PL_ALIAS (__sv_cosf_x, _ZGVsMxv_cosf) - PL_SIG (SV, F, 1, cos, -3.1, 3.1) -PL_TEST_ULP (__sv_cosf, 1.57) -PL_TEST_INTERVAL (__sv_cosf, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (__sv_cosf, 0x1p-4, 0x1p4, 500000) -#endif +PL_TEST_ULP (SV_NAME_F1 (cos), 1.57) +PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0x1p-4, 0x1p4, 500000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/sv_cosh_2u.c new file mode 100644 index 000000000000..a6d743fb9b96 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cosh_2u.c @@ -0,0 +1,100 @@ +/* + * Double-precision SVE cosh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[3]; + float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; + uint64_t index_mask, special_bound; +} data = { + .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, + 0x1.5555576a59599p-5, }, + + .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */ + /* -ln2/N. */ + .ln2_hi = -0x1.62e42fefa39efp-9, + .ln2_lo = -0x1.abc9e3b39803f3p-64, + .shift = 0x1.8p+52, + .thres = 704.0, + + .index_mask = 0xff, + /* 0x1.6p9, above which exp overflows. */ + .special_bound = 0x4086000000000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (cosh, x, y, special); +} + +/* Helper for approximating exp(x). Copied from sv_exp_tail, with no + special-case handling or tail. */ +static inline svfloat64_t +exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +{ + /* Calculate exp(x). 
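A scalar model of the exp_inline scheme below, with N = 256 (exp_sketch is a hypothetical name; libm exp2 stands in for the table-plus-exponent-add that the vector code performs on __v_exp_tail_data):

#include <math.h>

/* exp(x) = 2^(n/256) * exp(r), with n = rint (x * 256/ln2) obtained
   by the shift trick and r the remainder split over hi/lo parts of
   ln2/256.  The polynomial approximates exp(r) - 1 on the tiny
   remainder interval, as in the code below.  */
static double
exp_sketch (double x)
{
  const double shift = 0x1.8p52;
  double z = fma (x, 0x1.71547652b82fep8, shift); /* x * 256/ln2.  */
  double n = z - shift;
  double r = fma (n, -0x1.62e42fefa39efp-9, x);	  /* hi part.  */
  r = fma (n, -0x1.abc9e3b39803f3p-64, r);	  /* lo part.  */
  double y = r * (1.0 + r * (0x1.fffffffffffd4p-2
			     + r * (0x1.5555571d6b68cp-3
				    + r * 0x1.5555576a59599p-5)));
  double s = exp2 (n * 0x1p-8);	/* table lookup in the vector code.  */
  return fma (s, y, s);		/* s * exp(r).  */
}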
*/ + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svfloat64_t n = svsub_x (pg, z, d->shift); + + svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi); + r = svmla_x (pg, r, n, d->ln2_lo); + + svuint64_t u = svreinterpret_u64 (z); + svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); + svuint64_t i = svand_x (pg, u, d->index_mask); + + svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); + y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); + y = svmla_x (pg, sv_f64 (1.0), r, y); + y = svmul_x (pg, r, y); + + /* s = 2^(n/N). */ + u = svld1_gather_index (pg, __v_exp_tail_data, i); + svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e)); + + return svmla_x (pg, s, s, y); +} + +/* Approximation for SVE double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the + same as the scalar routine, 1.93 ULP: + _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021 + want 0x1.fd774e958236fp+1021. + + The greatest observed error in the non-special region is 1.54 ULP: + _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8 + want 0x1.f5e2bb8d5c991p+8. */ +svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound); + + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + svfloat64_t t = exp_inline (ax, pg, d); + svfloat64_t half_t = svmul_x (pg, t, 0.5); + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + + /* Fall back to scalar for any special cases. */ + if (unlikely (svptest_any (pg, special))) + return special_case (x, svadd_x (pg, half_t, half_over_t), special); + + return svadd_x (pg, half_t, half_over_t); +} + +PL_SIG (SV, D, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_D1 (cosh), 1.43) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0, 0x1.6p9, 100000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0x1.6p9, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_coshf_2u.c b/contrib/arm-optimized-routines/pl/math/sv_coshf_2u.c new file mode 100644 index 000000000000..81680fef318e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_coshf_2u.c @@ -0,0 +1,56 @@ +/* + * Single-precision SVE cosh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#include "sv_expf_inline.h" + +static const struct data +{ + struct sv_expf_data expf_consts; + uint32_t special_bound; +} data = { + .expf_consts = SV_EXPF_DATA, + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .special_bound = 0x42ad496c, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t pg) +{ + return sv_call_f32 (coshf, x, y, pg); +} + +/* Single-precision vector cosh, using vector expf. + Maximum error is 1.89 ULP: + _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127 + want 0x1.f00adcp+127. */ +svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound); + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
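A scalar model of the combination below, valid under the special-case bound where exp(|x|) cannot overflow (cosh_from_exp is a hypothetical name; libm exp stands in for exp_inline):

#include <math.h>

/* cosh(x) = (exp(|x|) + exp(-|x|)) / 2 = t/2 + 0.5/t; the second term
   is what svdivr_x computes in one instruction.  */
static double
cosh_from_exp (double x)
{
  double t = exp (fabs (x));
  return t * 0.5 + 0.5 / t;
}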
*/ + svfloat32_t t = expf_inline (ax, pg, &d->expf_consts); + svfloat32_t half_t = svmul_x (pg, t, 0.5); + svfloat32_t half_over_t = svdivr_x (pg, t, 0.5); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, svadd_x (pg, half_t, half_over_t), special); + + return svadd_x (pg, half_t, half_over_t); +} + +PL_SIG (SV, F, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (cosh), 1.39) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1p-63, 100) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cospi_3u2.c b/contrib/arm-optimized-routines/pl/math/sv_cospi_3u2.c new file mode 100644 index 000000000000..d80f899c41e4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cospi_3u2.c @@ -0,0 +1,63 @@ +/* + * Double-precision SVE cospi(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_sve_f64.h" + +static const struct data +{ + double poly[10]; + double range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, + 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 }, + .range_val = 0x1p53, +}; + +/* A fast SVE implementation of cospi. + Maximum error 3.20 ULP: + _ZGVsMxv_cospi(0x1.f18ba32c63159p-6) got 0x1.fdabf595f9763p-1 + want 0x1.fdabf595f9766p-1. */ +svfloat64_t SV_NAME_D1 (cospi) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Using cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + r = 0.5 - |x - rint(x)|. */ + svfloat64_t n = svrinta_x (pg, x); + svfloat64_t r = svsub_x (pg, x, n); + r = svsub_x (pg, sv_f64 (0.5), svabs_x (pg, r)); + + /* Result should be negated based on if n is odd or not. + If ax >= 2^53, the result will always be positive. */ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint64_t intn = svreinterpret_u64 (svcvt_s64_z (pg, n)); + svuint64_t sign = svlsl_z (cmp, intn, 63); + + /* y = sin(r). */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); + svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly); + y = svmul_x (pg, y, r); + + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); +} + +PL_SIG (SV, D, 1, cospi, -0.9, 0.9) +PL_TEST_ULP (SV_NAME_D1 (cospi), 2.71) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0, 0x1p-63, 5000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p-63, 0.5, 10000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0.5, 0x1p51, 10000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p51, inf, 100000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cospif_2u6.c b/contrib/arm-optimized-routines/pl/math/sv_cospif_2u6.c new file mode 100644 index 000000000000..fb2922d0533a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cospif_2u6.c @@ -0,0 +1,59 @@ +/* + * Single-precision SVE cospi(x) function. + * + * Copyright (c) 2023, Arm Limited. 
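A scalar model of the cospi reduction above (cospi_sketch is a hypothetical name; libm sin stands in for the sinpi polynomial):

#include <math.h>
#include <stdint.h>

/* cospi(x) = sinpi(r) with r = 0.5 - |x - rint(x)|, negated when
   rint(x) is odd.  Above 2^53 every double is an even integer, so the
   result is always 1 and the parity test is skipped.  */
static double
cospi_sketch (double x)
{
  double n = rint (x);
  double r = 0.5 - fabs (x - n);
  double y = sin (r * 3.141592653589793); /* sinpi (r).  */
  return (fabs (x) < 0x1p53 && ((int64_t) n & 1)) ? -y : y;
}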
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+  float poly[6];
+  float range_val;
+} data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
+	    0x1.50783p-4f, -0x1.e30750p-8f },
+  .range_val = 0x1p31f,
+};
+
+/* A fast SVE implementation of cospif.
+   Maximum error: 2.60 ULP:
+   _ZGVsMxv_cospif(+/-0x1.cae664p-4) got 0x1.e09c9ep-1
+				    want 0x1.e09c98p-1. */
+svfloat32_t SV_NAME_F1 (cospi) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Using cospi(x) = sinpi(0.5 - x)
+     range reduction and offset into sinpi range -1/2 .. 1/2
+     r = 0.5 - |x - rint(x)|.  */
+  svfloat32_t n = svrinta_x (pg, x);
+  svfloat32_t r = svsub_x (pg, x, n);
+  r = svsub_x (pg, sv_f32 (0.5f), svabs_x (pg, r));
+
+  /* Result should be negated based on if n is odd or not.
+     If ax >= 2^31, the result will always be positive.  */
+  svbool_t cmp = svaclt (pg, x, d->range_val);
+  svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n));
+  svuint32_t sign = svlsl_z (cmp, intn, 31);
+
+  /* y = sin(r).  */
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly);
+  y = svmul_x (pg, y, r);
+
+  return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
+}
+
+PL_SIG (SV, F, 1, cospi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_F1 (cospi), 2.08)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0.5, 0x1p31f, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p31f, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_erf_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_erf_2u5.c
new file mode 100644
index 000000000000..cbf9718e5bb0
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_erf_2u5.c
@@ -0,0 +1,111 @@
+/*
+ * Double-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  double third;
+  double tenth, two_over_five, two_over_fifteen;
+  double two_over_nine, two_over_fortyfive;
+  double max, shift;
+} data = {
+  .third = 0x1.5555555555556p-2, /* used to compute 2/3 and 1/6 too. */
+  .two_over_fifteen = 0x1.1111111111111p-3,
+  .tenth = -0x1.999999999999ap-4,
+  .two_over_five = -0x1.999999999999ap-2,
+  .two_over_nine = -0x1.c71c71c71c71cp-3,
+  .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
+  .max = 5.9921875, /* 6 - 1/128. */
+  .shift = 0x1p45,
+};
+
+#define SignMask (0x8000000000000000)
+
+/* Double-precision implementation of vector erf(x).
+   Approximation based on series expansion near x rounded to
+   nearest multiple of 1/128.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+   erf(x) ~ erf(r) + scale * d * [
+       + 1
+       - r d
+       + 1/3 (2 r^2 - 1) d^2
+       - 1/6 (r (2 r^2 - 3)) d^3
+       + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+       - 1/90 (4 r^4 - 20 r^2 + 15) d^5
+     ]
+
+   Maximum measured error: 2.29 ULP
+   _ZGVsMxv_erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
+				      want -0x1.20dd59132ebafp-8. */
+svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *dat = ptr_barrier (&data);
+
+  /* |x| >= 6.0 - 1/128.
Opposite conditions except none of them catch NaNs so + they can be used in lookup and BSLs to yield the expected results. */ + svbool_t a_ge_max = svacge (pg, x, dat->max); + svbool_t a_lt_max = svaclt (pg, x, dat->max); + + /* Set r to multiple of 1/128 nearest to |x|. */ + svfloat64_t a = svabs_x (pg, x); + svfloat64_t shift = sv_f64 (dat->shift); + svfloat64_t z = svadd_x (pg, a, shift); + svuint64_t i + = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift)); + + /* Lookup without shortcut for small values but with predicate to avoid + segfault for large values and NaNs. */ + svfloat64_t r = svsub_x (pg, z, shift); + svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i); + svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i); + + /* erf(x) ~ erf(r) + scale * d * poly (r, d). */ + svfloat64_t d = svsub_x (pg, a, r); + svfloat64_t d2 = svmul_x (pg, d, d); + svfloat64_t r2 = svmul_x (pg, r, r); + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ + svfloat64_t p1 = r; + svfloat64_t third = sv_f64 (dat->third); + svfloat64_t twothird = svmul_x (pg, third, 2.0); + svfloat64_t sixth = svmul_x (pg, third, 0.5); + svfloat64_t p2 = svmls_x (pg, third, r2, twothird); + svfloat64_t p3 = svmad_x (pg, r2, third, -0.5); + p3 = svmul_x (pg, r, p3); + svfloat64_t p4 + = svmla_x (pg, sv_f64 (dat->two_over_five), r2, dat->two_over_fifteen); + p4 = svmls_x (pg, sv_f64 (dat->tenth), r2, p4); + svfloat64_t p5 + = svmla_x (pg, sv_f64 (dat->two_over_nine), r2, dat->two_over_fortyfive); + p5 = svmla_x (pg, sixth, r2, p5); + p5 = svmul_x (pg, r, p5); + + svfloat64_t p34 = svmla_x (pg, p3, d, p4); + svfloat64_t p12 = svmla_x (pg, p1, d, p2); + svfloat64_t y = svmla_x (pg, p34, d2, p5); + y = svmla_x (pg, p12, d2, y); + + y = svmla_x (pg, erfr, scale, svmls_x (pg, d, d2, y)); + + /* Solves the |x| = inf and NaN cases. */ + y = svsel (a_ge_max, sv_f64 (1.0), y); + + /* Copy sign. */ + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iy = svreinterpret_u64 (y); + svuint64_t sign = svand_x (pg, ix, SignMask); + return svreinterpret_f64 (svorr_x (pg, sign, iy)); +} + +PL_SIG (SV, D, 1, erf, -6.0, 6.0) +PL_TEST_ULP (SV_NAME_D1 (erf), 1.79) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, 5.9921875, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 5.9921875, inf, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, inf, 4000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c b/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c deleted file mode 100644 index bec7f8a819d2..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Double-precision SVE erf(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if SV_SUPPORTED - -#define Scale (8.0) -#define AbsMask (0x7fffffffffffffff) - -static NOINLINE sv_f64_t -__sv_erf_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) -{ - return sv_call_f64 (erf, x, y, cmp); -} - -/* Optimized double precision SVE error function erf. - Maximum observed error is 2.62 ULP: - __sv_erf(0x1.79cab7e3078fap+2) got 0x1.0000000000001p+0 - want 0x1.fffffffffffffp-1. */ -sv_f64_t -__sv_erf_x (sv_f64_t x, const svbool_t pg) -{ - /* Use top 16 bits to test for special cases and small values. 
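A scalar skeleton of the table-plus-correction scheme above (erf_sketch, erf_tab and scale_tab are hypothetical names standing in for __sv_erf_data; the degree-5 correction polynomial is truncated to first order, so only the mechanism is faithful):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* erf_tab and scale_tab stand for the 769-entry tables holding erf(r)
   and 2/sqrt(pi) * exp(-r^2) for r = i/128.  */
extern const double erf_tab[769], scale_tab[769];

static double
erf_sketch (double x)
{
  double a = fabs (x);
  if (a >= 5.9921875)		/* 6 - 1/128: erf(|x|) rounds to 1.  */
    return copysign (1.0, x);
  /* Round a to a multiple of 1/128: 0x1p45 has ulp 1/128, so the add
     keeps exactly 7 fractional bits, and the bit-pattern difference
     from the shift is the table index.  */
  const double shift = 0x1p45;
  double z = a + shift;
  uint64_t zbits, sbits;
  memcpy (&zbits, &z, sizeof z);
  memcpy (&sbits, &shift, sizeof shift);
  uint64_t i = zbits - sbits;	/* index of r = i/128.  */
  double r = z - shift;
  double d = a - r;		/* |d| <= 1/256.  */
  double y = erf_tab[i] + scale_tab[i] * d * (1.0 - r * d);
  return copysign (y, x);
}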
*/ - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t atop = svand_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 48), 0x7fff); - - /* Handle both inf/nan as well as small values (|x|<2^-28). */ - svbool_t cmp - = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30); - - /* Get sign and absolute value. */ - sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask)); - sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); - - /* i = trunc(Scale*x). */ - sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale); - /* Saturate index of intervals. */ - svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018); - sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale); - - /* Load polynomial coefficients. */ - sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i); - sv_f64_t P_1 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[1], i); - sv_f64_t P_2 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[2], i); - sv_f64_t P_3 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[3], i); - sv_f64_t P_4 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[4], i); - sv_f64_t P_5 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[5], i); - sv_f64_t P_6 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[6], i); - sv_f64_t P_7 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[7], i); - sv_f64_t P_8 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[8], i); - sv_f64_t P_9 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[9], i); - - /* Get shift and scale. */ - sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i); - - /* Transform polynomial variable. - Set z = 0 in the boring domain to avoid overflow. */ - sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a); - - /* Evaluate polynomial P(z) using level-2 Estrin. */ - sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0); - sv_f64_t r2 = sv_fma_f64_x (pg, z, P_3, P_2); - sv_f64_t r3 = sv_fma_f64_x (pg, z, P_5, P_4); - sv_f64_t r4 = sv_fma_f64_x (pg, z, P_7, P_6); - sv_f64_t r5 = sv_fma_f64_x (pg, z, P_9, P_8); - - sv_f64_t z2 = svmul_f64_x (pg, z, z); - sv_f64_t z4 = svmul_f64_x (pg, z2, z2); - - sv_f64_t q2 = sv_fma_f64_x (pg, r4, z2, r3); - sv_f64_t q1 = sv_fma_f64_x (pg, r2, z2, r1); - - sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2); - y = sv_fma_f64_x (pg, z4, y, q1); - - /* y = erf(x) if x > 0, -erf(-x) otherwise. */ - y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); - - if (unlikely (svptest_any (pg, cmp))) - return __sv_erf_specialcase (x, y, cmp); - return y; -} - -PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf) - -PL_SIG (SV, D, 1, erf, -4.0, 4.0) -PL_TEST_ULP (__sv_erf, 2.13) -PL_TEST_INTERVAL (__sv_erf, 0, 0x1p-28, 20000) -PL_TEST_INTERVAL (__sv_erf, 0x1p-28, 1, 60000) -PL_TEST_INTERVAL (__sv_erf, 1, 0x1p28, 60000) -PL_TEST_INTERVAL (__sv_erf, 0x1p28, inf, 20000) -PL_TEST_INTERVAL (__sv_erf, -0, -0x1p-28, 20000) -PL_TEST_INTERVAL (__sv_erf, -0x1p-28, -1, 60000) -PL_TEST_INTERVAL (__sv_erf, -1, -0x1p28, 60000) -PL_TEST_INTERVAL (__sv_erf, -0x1p28, -inf, 20000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erf_data.c b/contrib/arm-optimized-routines/pl/math/sv_erf_data.c new file mode 100644 index 000000000000..7244aceda5a5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erf_data.c @@ -0,0 +1,1558 @@ +/* + * Data for approximation of erf. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Lookup table used in vector erf. 
+ For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = 6.0 (769 values): + - the first entry __erf_data.tab.erf contains the values of erf(r), + - the second entry __erf_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the + algorithm, since lookup is performed only for x >= 1/64-1/512. */ +const struct sv_erf_data __sv_erf_data = { + .erf = { 0x0.0000000000000p+0, + 0x1.20dbf3deb1340p-7, + 0x1.20d77083f17a0p-6, + 0x1.b137e0cf584dcp-6, + 0x1.20c5645dd2538p-5, + 0x1.68e5d3bbc9526p-5, + 0x1.b0fafef135745p-5, + 0x1.f902a77bd3821p-5, + 0x1.207d480e90658p-4, + 0x1.44703e87e8593p-4, + 0x1.68591a1e83b5dp-4, + 0x1.8c36beb8a8d23p-4, + 0x1.b0081148a873ap-4, + 0x1.d3cbf7e70a4b3p-4, + 0x1.f78159ec8bb50p-4, + 0x1.0d939005f65e5p-3, + 0x1.1f5e1a35c3b89p-3, + 0x1.311fc15f56d14p-3, + 0x1.42d7fc2f64959p-3, + 0x1.548642321d7c6p-3, + 0x1.662a0bdf7a89fp-3, + 0x1.77c2d2a765f9ep-3, + 0x1.895010fdbdbfdp-3, + 0x1.9ad142662e14dp-3, + 0x1.ac45e37fe2526p-3, + 0x1.bdad72110a648p-3, + 0x1.cf076d1233237p-3, + 0x1.e05354b96ff36p-3, + 0x1.f190aa85540e2p-3, + 0x1.015f78a3dcf3dp-2, + 0x1.09eed6982b948p-2, + 0x1.127631eb8de32p-2, + 0x1.1af54e232d609p-2, + 0x1.236bef825d9a2p-2, + 0x1.2bd9db0f7827fp-2, + 0x1.343ed6989b7d9p-2, + 0x1.3c9aa8b84bedap-2, + 0x1.44ed18d9f6462p-2, + 0x1.4d35ef3e5372ep-2, + 0x1.5574f4ffac98ep-2, + 0x1.5da9f415ff23fp-2, + 0x1.65d4b75b00471p-2, + 0x1.6df50a8dff772p-2, + 0x1.760aba57a76bfp-2, + 0x1.7e15944d9d3e4p-2, + 0x1.861566f5fd3c0p-2, + 0x1.8e0a01cab516bp-2, + 0x1.95f3353cbb146p-2, + 0x1.9dd0d2b721f39p-2, + 0x1.a5a2aca209394p-2, + 0x1.ad68966569a87p-2, + 0x1.b522646bbda68p-2, + 0x1.bccfec24855b8p-2, + 0x1.c4710406a65fcp-2, + 0x1.cc058392a6d2dp-2, + 0x1.d38d4354c3bd0p-2, + 0x1.db081ce6e2a48p-2, + 0x1.e275eaf25e458p-2, + 0x1.e9d68931ae650p-2, + 0x1.f129d471eabb1p-2, + 0x1.f86faa9428f9dp-2, + 0x1.ffa7ea8eb5fd0p-2, + 0x1.03693a371519cp-1, + 0x1.06f794ab2cae7p-1, + 0x1.0a7ef5c18edd2p-1, + 0x1.0dff4f247f6c6p-1, + 0x1.1178930ada115p-1, + 0x1.14eab43841b55p-1, + 0x1.1855a5fd3dd50p-1, + 0x1.1bb95c3746199p-1, + 0x1.1f15cb50bc4dep-1, + 0x1.226ae840d4d70p-1, + 0x1.25b8a88b6dd7fp-1, + 0x1.28ff0240d52cdp-1, + 0x1.2c3debfd7d6c1p-1, + 0x1.2f755ce9a21f4p-1, + 0x1.32a54cb8db67bp-1, + 0x1.35cdb3a9a144dp-1, + 0x1.38ee8a84beb71p-1, + 0x1.3c07ca9cb4f9ep-1, + 0x1.3f196dcd0f135p-1, + 0x1.42236e79a5fa6p-1, + 0x1.4525c78dd5966p-1, + 0x1.4820747ba2dc2p-1, + 0x1.4b13713ad3513p-1, + 0x1.4dfeba47f63ccp-1, + 0x1.50e24ca35fd2cp-1, + 0x1.53be25d016a4fp-1, + 0x1.569243d2b3a9bp-1, + 0x1.595ea53035283p-1, + 0x1.5c2348ecc4dc3p-1, + 0x1.5ee02e8a71a53p-1, + 0x1.61955607dd15dp-1, + 0x1.6442bfdedd397p-1, + 0x1.66e86d0312e82p-1, + 0x1.69865ee075011p-1, + 0x1.6c1c9759d0e5fp-1, + 0x1.6eab18c74091bp-1, + 0x1.7131e5f496a5ap-1, + 0x1.73b1021fc0cb8p-1, + 0x1.762870f720c6fp-1, + 0x1.78983697dc96fp-1, + 0x1.7b00578c26037p-1, + 0x1.7d60d8c979f7bp-1, + 0x1.7fb9bfaed8078p-1, + 0x1.820b1202f27fbp-1, + 0x1.8454d5f25760dp-1, + 0x1.8697120d92a4ap-1, + 0x1.88d1cd474a2e0p-1, + 0x1.8b050ef253c37p-1, + 0x1.8d30debfc572ep-1, + 0x1.8f5544bd00c04p-1, + 0x1.91724951b8fc6p-1, + 0x1.9387f53df5238p-1, + 0x1.959651980da31p-1, + 0x1.979d67caa6631p-1, + 0x1.999d4192a5715p-1, + 0x1.9b95e8fd26abap-1, + 0x1.9d8768656cc42p-1, + 0x1.9f71ca72cffb6p-1, + 0x1.a1551a16aaeafp-1, + 0x1.a331628a45b92p-1, + 0x1.a506af4cc00f4p-1, + 0x1.a6d50c20fa293p-1, + 0x1.a89c850b7d54dp-1, + 0x1.aa5d265064366p-1, + 0x1.ac16fc7143263p-1, + 0x1.adca142b10f98p-1, + 0x1.af767a741088bp-1, + 0x1.b11c3c79bb424p-1, + 
0x1.b2bb679ead19cp-1, + 0x1.b4540978921eep-1, + 0x1.b5e62fce16095p-1, + 0x1.b771e894d602ep-1, + 0x1.b8f741ef54f83p-1, + 0x1.ba764a2af2b78p-1, + 0x1.bbef0fbde6221p-1, + 0x1.bd61a1453ab44p-1, + 0x1.bece0d82d1a5cp-1, + 0x1.c034635b66e23p-1, + 0x1.c194b1d49a184p-1, + 0x1.c2ef0812fc1bdp-1, + 0x1.c443755820d64p-1, + 0x1.c5920900b5fd1p-1, + 0x1.c6dad2829ec62p-1, + 0x1.c81de16b14cefp-1, + 0x1.c95b455cce69dp-1, + 0x1.ca930e0e2a825p-1, + 0x1.cbc54b476248dp-1, + 0x1.ccf20ce0c0d27p-1, + 0x1.ce1962c0e0d8bp-1, + 0x1.cf3b5cdaf0c39p-1, + 0x1.d0580b2cfd249p-1, + 0x1.d16f7dbe41ca0p-1, + 0x1.d281c49d818d0p-1, + 0x1.d38eefdf64fddp-1, + 0x1.d4970f9ce00d9p-1, + 0x1.d59a33f19ed42p-1, + 0x1.d6986cfa798e7p-1, + 0x1.d791cad3eff01p-1, + 0x1.d8865d98abe01p-1, + 0x1.d97635600bb89p-1, + 0x1.da61623cb41e0p-1, + 0x1.db47f43b2980dp-1, + 0x1.dc29fb60715afp-1, + 0x1.dd0787a8bb39dp-1, + 0x1.dde0a90611a0dp-1, + 0x1.deb56f5f12d28p-1, + 0x1.df85ea8db188ep-1, + 0x1.e0522a5dfda73p-1, + 0x1.e11a3e8cf4eb8p-1, + 0x1.e1de36c75ba58p-1, + 0x1.e29e22a89d766p-1, + 0x1.e35a11b9b61cep-1, + 0x1.e4121370224ccp-1, + 0x1.e4c6372cd8927p-1, + 0x1.e5768c3b4a3fcp-1, + 0x1.e62321d06c5e0p-1, + 0x1.e6cc0709c8a0dp-1, + 0x1.e7714aec96534p-1, + 0x1.e812fc64db369p-1, + 0x1.e8b12a44944a8p-1, + 0x1.e94be342e6743p-1, + 0x1.e9e335fb56f87p-1, + 0x1.ea7730ed0bbb9p-1, + 0x1.eb07e27a133aap-1, + 0x1.eb9558e6b42cep-1, + 0x1.ec1fa258c4beap-1, + 0x1.eca6ccd709544p-1, + 0x1.ed2ae6489ac1ep-1, + 0x1.edabfc7453e63p-1, + 0x1.ee2a1d004692cp-1, + 0x1.eea5557137ae0p-1, + 0x1.ef1db32a2277cp-1, + 0x1.ef93436bc2daap-1, + 0x1.f006135426b26p-1, + 0x1.f0762fde45ee6p-1, + 0x1.f0e3a5e1a1788p-1, + 0x1.f14e8211e8c55p-1, + 0x1.f1b6d0fea5f4dp-1, + 0x1.f21c9f12f0677p-1, + 0x1.f27ff89525acfp-1, + 0x1.f2e0e9a6a8b09p-1, + 0x1.f33f7e43a706bp-1, + 0x1.f39bc242e43e6p-1, + 0x1.f3f5c1558b19ep-1, + 0x1.f44d870704911p-1, + 0x1.f4a31ebcd47dfp-1, + 0x1.f4f693b67bd77p-1, + 0x1.f547f10d60597p-1, + 0x1.f59741b4b97cfp-1, + 0x1.f5e4907982a07p-1, + 0x1.f62fe80272419p-1, + 0x1.f67952cff6282p-1, + 0x1.f6c0db3c34641p-1, + 0x1.f7068b7b10fd9p-1, + 0x1.f74a6d9a38383p-1, + 0x1.f78c8b812d498p-1, + 0x1.f7cceef15d631p-1, + 0x1.f80ba18636f07p-1, + 0x1.f848acb544e95p-1, + 0x1.f88419ce4e184p-1, + 0x1.f8bdf1fb78370p-1, + 0x1.f8f63e416ebffp-1, + 0x1.f92d077f8d56dp-1, + 0x1.f96256700da8ep-1, + 0x1.f99633a838a57p-1, + 0x1.f9c8a7989af0dp-1, + 0x1.f9f9ba8d3c733p-1, + 0x1.fa2974addae45p-1, + 0x1.fa57ddfe27376p-1, + 0x1.fa84fe5e05c8dp-1, + 0x1.fab0dd89d1309p-1, + 0x1.fadb831a9f9c3p-1, + 0x1.fb04f6868a944p-1, + 0x1.fb2d3f20f9101p-1, + 0x1.fb54641aebbc9p-1, + 0x1.fb7a6c834b5a2p-1, + 0x1.fb9f5f4739170p-1, + 0x1.fbc3433260ca5p-1, + 0x1.fbe61eef4cf6ap-1, + 0x1.fc07f907bc794p-1, + 0x1.fc28d7e4f9cd0p-1, + 0x1.fc48c1d033c7ap-1, + 0x1.fc67bcf2d7b8fp-1, + 0x1.fc85cf56ecd38p-1, + 0x1.fca2fee770c79p-1, + 0x1.fcbf5170b578bp-1, + 0x1.fcdacca0bfb73p-1, + 0x1.fcf57607a6e7cp-1, + 0x1.fd0f5317f582fp-1, + 0x1.fd2869270a56fp-1, + 0x1.fd40bd6d7a785p-1, + 0x1.fd58550773cb5p-1, + 0x1.fd6f34f52013ap-1, + 0x1.fd85621b0876dp-1, + 0x1.fd9ae142795e3p-1, + 0x1.fdafb719e6a69p-1, + 0x1.fdc3e835500b3p-1, + 0x1.fdd7790ea5bc0p-1, + 0x1.fdea6e062d0c9p-1, + 0x1.fdfccb62e52d3p-1, + 0x1.fe0e9552ebdd6p-1, + 0x1.fe1fcfebe2083p-1, + 0x1.fe307f2b503d0p-1, + 0x1.fe40a6f70af4bp-1, + 0x1.fe504b1d9696cp-1, + 0x1.fe5f6f568b301p-1, + 0x1.fe6e1742f7cf6p-1, + 0x1.fe7c466dc57a1p-1, + 0x1.fe8a004c19ae6p-1, + 0x1.fe97483db8670p-1, + 0x1.fea4218d6594ap-1, + 0x1.feb08f7146046p-1, + 0x1.febc950b3fa75p-1, + 0x1.fec835695932ep-1, + 0x1.fed37386190fbp-1, + 0x1.fede5248e38f4p-1, + 
0x1.fee8d486585eep-1, + 0x1.fef2fd00af31ap-1, + 0x1.fefcce6813974p-1, + 0x1.ff064b5afffbep-1, + 0x1.ff0f766697c76p-1, + 0x1.ff18520700971p-1, + 0x1.ff20e0a7ba8c2p-1, + 0x1.ff2924a3f7a83p-1, + 0x1.ff312046f2339p-1, + 0x1.ff38d5cc4227fp-1, + 0x1.ff404760319b4p-1, + 0x1.ff47772010262p-1, + 0x1.ff4e671a85425p-1, + 0x1.ff55194fe19dfp-1, + 0x1.ff5b8fb26f5f6p-1, + 0x1.ff61cc26c1578p-1, + 0x1.ff67d08401202p-1, + 0x1.ff6d9e943c231p-1, + 0x1.ff733814af88cp-1, + 0x1.ff789eb6130c9p-1, + 0x1.ff7dd41ce2b4dp-1, + 0x1.ff82d9e1a76d8p-1, + 0x1.ff87b1913e853p-1, + 0x1.ff8c5cad200a5p-1, + 0x1.ff90dcaba4096p-1, + 0x1.ff9532f846ab0p-1, + 0x1.ff9960f3eb327p-1, + 0x1.ff9d67f51ddbap-1, + 0x1.ffa14948549a7p-1, + 0x1.ffa506302ebaep-1, + 0x1.ffa89fe5b3625p-1, + 0x1.ffac17988ef4bp-1, + 0x1.ffaf6e6f4f5c0p-1, + 0x1.ffb2a5879f35ep-1, + 0x1.ffb5bdf67fe6fp-1, + 0x1.ffb8b8c88295fp-1, + 0x1.ffbb970200110p-1, + 0x1.ffbe599f4f9d9p-1, + 0x1.ffc10194fcb64p-1, + 0x1.ffc38fcffbb7cp-1, + 0x1.ffc60535dd7f5p-1, + 0x1.ffc862a501fd7p-1, + 0x1.ffcaa8f4c9beap-1, + 0x1.ffccd8f5c66d1p-1, + 0x1.ffcef371ea4d7p-1, + 0x1.ffd0f92cb6ba7p-1, + 0x1.ffd2eae369a07p-1, + 0x1.ffd4c94d29fdbp-1, + 0x1.ffd6951b33686p-1, + 0x1.ffd84ef9009eep-1, + 0x1.ffd9f78c7524ap-1, + 0x1.ffdb8f7605ee7p-1, + 0x1.ffdd1750e1220p-1, + 0x1.ffde8fb314ebfp-1, + 0x1.ffdff92db56e5p-1, + 0x1.ffe1544d01ccbp-1, + 0x1.ffe2a1988857cp-1, + 0x1.ffe3e19349dc7p-1, + 0x1.ffe514bbdc197p-1, + 0x1.ffe63b8c8b5f7p-1, + 0x1.ffe7567b7b5e1p-1, + 0x1.ffe865fac722bp-1, + 0x1.ffe96a78a04a9p-1, + 0x1.ffea645f6d6dap-1, + 0x1.ffeb5415e7c44p-1, + 0x1.ffec39ff380b9p-1, + 0x1.ffed167b12ac2p-1, + 0x1.ffede9e5d3262p-1, + 0x1.ffeeb49896c6dp-1, + 0x1.ffef76e956a9fp-1, + 0x1.fff0312b010b5p-1, + 0x1.fff0e3ad91ec2p-1, + 0x1.fff18ebe2b0e1p-1, + 0x1.fff232a72b48ep-1, + 0x1.fff2cfb0453d9p-1, + 0x1.fff3661e9569dp-1, + 0x1.fff3f634b79f9p-1, + 0x1.fff48032dbe40p-1, + 0x1.fff50456dab8cp-1, + 0x1.fff582dc48d30p-1, + 0x1.fff5fbfc8a439p-1, + 0x1.fff66feee5129p-1, + 0x1.fff6dee89352ep-1, + 0x1.fff7491cd4af6p-1, + 0x1.fff7aebcff755p-1, + 0x1.fff80ff8911fdp-1, + 0x1.fff86cfd3e657p-1, + 0x1.fff8c5f702ccfp-1, + 0x1.fff91b102fca8p-1, + 0x1.fff96c717b695p-1, + 0x1.fff9ba420e834p-1, + 0x1.fffa04a7928b1p-1, + 0x1.fffa4bc63ee9ap-1, + 0x1.fffa8fc0e5f33p-1, + 0x1.fffad0b901755p-1, + 0x1.fffb0ecebee1bp-1, + 0x1.fffb4a210b172p-1, + 0x1.fffb82cd9dcbfp-1, + 0x1.fffbb8f1049c6p-1, + 0x1.fffbeca6adbe9p-1, + 0x1.fffc1e08f25f5p-1, + 0x1.fffc4d3120aa1p-1, + 0x1.fffc7a37857d2p-1, + 0x1.fffca53375ce3p-1, + 0x1.fffcce3b57bffp-1, + 0x1.fffcf564ab6b7p-1, + 0x1.fffd1ac4135f9p-1, + 0x1.fffd3e6d5cd87p-1, + 0x1.fffd607387b07p-1, + 0x1.fffd80e8ce0dap-1, + 0x1.fffd9fdeabccep-1, + 0x1.fffdbd65e5ad0p-1, + 0x1.fffdd98e903b2p-1, + 0x1.fffdf46816833p-1, + 0x1.fffe0e0140857p-1, + 0x1.fffe26683972ap-1, + 0x1.fffe3daa95b18p-1, + 0x1.fffe53d558ae9p-1, + 0x1.fffe68f4fa777p-1, + 0x1.fffe7d156d244p-1, + 0x1.fffe904222101p-1, + 0x1.fffea2860ee1ep-1, + 0x1.fffeb3ebb267bp-1, + 0x1.fffec47d19457p-1, + 0x1.fffed443e2787p-1, + 0x1.fffee34943b15p-1, + 0x1.fffef1960d85dp-1, + 0x1.fffeff32af7afp-1, + 0x1.ffff0c273bea2p-1, + 0x1.ffff187b6bc0ep-1, + 0x1.ffff2436a21dcp-1, + 0x1.ffff2f5fefcaap-1, + 0x1.ffff39fe16963p-1, + 0x1.ffff44178c8d2p-1, + 0x1.ffff4db27f146p-1, + 0x1.ffff56d4d5e5ep-1, + 0x1.ffff5f8435efcp-1, + 0x1.ffff67c604180p-1, + 0x1.ffff6f9f67e55p-1, + 0x1.ffff77154e0d6p-1, + 0x1.ffff7e2c6aea2p-1, + 0x1.ffff84e93cd75p-1, + 0x1.ffff8b500e77cp-1, + 0x1.ffff9164f8e46p-1, + 0x1.ffff972be5c59p-1, + 0x1.ffff9ca891572p-1, + 0x1.ffffa1de8c582p-1, + 0x1.ffffa6d13de73p-1, + 
0x1.ffffab83e54b8p-1, + 0x1.ffffaff99bac4p-1, + 0x1.ffffb43555b5fp-1, + 0x1.ffffb839e52f3p-1, + 0x1.ffffbc09fa7cdp-1, + 0x1.ffffbfa82616bp-1, + 0x1.ffffc316d9ed0p-1, + 0x1.ffffc6586abf6p-1, + 0x1.ffffc96f1165ep-1, + 0x1.ffffcc5cec0c1p-1, + 0x1.ffffcf23ff5fcp-1, + 0x1.ffffd1c637b2bp-1, + 0x1.ffffd4456a10dp-1, + 0x1.ffffd6a3554a1p-1, + 0x1.ffffd8e1a2f22p-1, + 0x1.ffffdb01e8546p-1, + 0x1.ffffdd05a75eap-1, + 0x1.ffffdeee4f810p-1, + 0x1.ffffe0bd3e852p-1, + 0x1.ffffe273c15b7p-1, + 0x1.ffffe41314e06p-1, + 0x1.ffffe59c6698bp-1, + 0x1.ffffe710d565ep-1, + 0x1.ffffe8717232dp-1, + 0x1.ffffe9bf4098cp-1, + 0x1.ffffeafb377d5p-1, + 0x1.ffffec2641a9ep-1, + 0x1.ffffed413e5b7p-1, + 0x1.ffffee4d01cd6p-1, + 0x1.ffffef4a55bd4p-1, + 0x1.fffff039f9e8fp-1, + 0x1.fffff11ca4876p-1, + 0x1.fffff1f302bc1p-1, + 0x1.fffff2bdb904dp-1, + 0x1.fffff37d63a36p-1, + 0x1.fffff43297019p-1, + 0x1.fffff4dde0118p-1, + 0x1.fffff57fc4a95p-1, + 0x1.fffff618c3da6p-1, + 0x1.fffff6a956450p-1, + 0x1.fffff731ee681p-1, + 0x1.fffff7b2f8ed6p-1, + 0x1.fffff82cdcf1bp-1, + 0x1.fffff89ffc4aap-1, + 0x1.fffff90cb3c81p-1, + 0x1.fffff9735b73bp-1, + 0x1.fffff9d446cccp-1, + 0x1.fffffa2fc5015p-1, + 0x1.fffffa8621251p-1, + 0x1.fffffad7a2652p-1, + 0x1.fffffb248c39dp-1, + 0x1.fffffb6d1e95dp-1, + 0x1.fffffbb196132p-1, + 0x1.fffffbf22c1e2p-1, + 0x1.fffffc2f171e3p-1, + 0x1.fffffc688a9cfp-1, + 0x1.fffffc9eb76acp-1, + 0x1.fffffcd1cbc28p-1, + 0x1.fffffd01f36afp-1, + 0x1.fffffd2f57d68p-1, + 0x1.fffffd5a2041fp-1, + 0x1.fffffd8271d12p-1, + 0x1.fffffda86faa9p-1, + 0x1.fffffdcc3b117p-1, + 0x1.fffffdedf37edp-1, + 0x1.fffffe0db6b91p-1, + 0x1.fffffe2ba0ea5p-1, + 0x1.fffffe47ccb60p-1, + 0x1.fffffe62534d4p-1, + 0x1.fffffe7b4c81ep-1, + 0x1.fffffe92ced93p-1, + 0x1.fffffea8ef9cfp-1, + 0x1.fffffebdc2ec6p-1, + 0x1.fffffed15bcbap-1, + 0x1.fffffee3cc32cp-1, + 0x1.fffffef5251c2p-1, + 0x1.ffffff0576917p-1, + 0x1.ffffff14cfb92p-1, + 0x1.ffffff233ee1dp-1, + 0x1.ffffff30d18e8p-1, + 0x1.ffffff3d9480fp-1, + 0x1.ffffff4993c46p-1, + 0x1.ffffff54dab72p-1, + 0x1.ffffff5f74141p-1, + 0x1.ffffff6969fb8p-1, + 0x1.ffffff72c5fb6p-1, + 0x1.ffffff7b91176p-1, + 0x1.ffffff83d3d07p-1, + 0x1.ffffff8b962bep-1, + 0x1.ffffff92dfba2p-1, + 0x1.ffffff99b79d2p-1, + 0x1.ffffffa0248e8p-1, + 0x1.ffffffa62ce54p-1, + 0x1.ffffffabd69b4p-1, + 0x1.ffffffb127525p-1, + 0x1.ffffffb624592p-1, + 0x1.ffffffbad2affp-1, + 0x1.ffffffbf370cdp-1, + 0x1.ffffffc355dfdp-1, + 0x1.ffffffc733572p-1, + 0x1.ffffffcad3626p-1, + 0x1.ffffffce39b67p-1, + 0x1.ffffffd169d0cp-1, + 0x1.ffffffd466fa5p-1, + 0x1.ffffffd7344aap-1, + 0x1.ffffffd9d4aabp-1, + 0x1.ffffffdc4ad7ap-1, + 0x1.ffffffde9964ep-1, + 0x1.ffffffe0c2bf0p-1, + 0x1.ffffffe2c92dbp-1, + 0x1.ffffffe4aed5ep-1, + 0x1.ffffffe675bbdp-1, + 0x1.ffffffe81fc4ep-1, + 0x1.ffffffe9aeb97p-1, + 0x1.ffffffeb24467p-1, + 0x1.ffffffec81ff2p-1, + 0x1.ffffffedc95e7p-1, + 0x1.ffffffeefbc85p-1, + 0x1.fffffff01a8b6p-1, + 0x1.fffffff126e1ep-1, + 0x1.fffffff221f30p-1, + 0x1.fffffff30cd3fp-1, + 0x1.fffffff3e8892p-1, + 0x1.fffffff4b606fp-1, + 0x1.fffffff57632dp-1, + 0x1.fffffff629e44p-1, + 0x1.fffffff6d1e56p-1, + 0x1.fffffff76ef3fp-1, + 0x1.fffffff801c1fp-1, + 0x1.fffffff88af67p-1, + 0x1.fffffff90b2e3p-1, + 0x1.fffffff982fc1p-1, + 0x1.fffffff9f2e9fp-1, + 0x1.fffffffa5b790p-1, + 0x1.fffffffabd229p-1, + 0x1.fffffffb18582p-1, + 0x1.fffffffb6d844p-1, + 0x1.fffffffbbd0aap-1, + 0x1.fffffffc0748fp-1, + 0x1.fffffffc4c96cp-1, + 0x1.fffffffc8d462p-1, + 0x1.fffffffcc9a41p-1, + 0x1.fffffffd01f89p-1, + 0x1.fffffffd36871p-1, + 0x1.fffffffd678edp-1, + 0x1.fffffffd954aep-1, + 0x1.fffffffdbff2ap-1, + 0x1.fffffffde7ba0p-1, + 
0x1.fffffffe0cd16p-1, + 0x1.fffffffe2f664p-1, + 0x1.fffffffe4fa30p-1, + 0x1.fffffffe6daf7p-1, + 0x1.fffffffe89b0cp-1, + 0x1.fffffffea3c9ap-1, + 0x1.fffffffebc1a9p-1, + 0x1.fffffffed2c21p-1, + 0x1.fffffffee7dc8p-1, + 0x1.fffffffefb847p-1, + 0x1.ffffffff0dd2bp-1, + 0x1.ffffffff1ede9p-1, + 0x1.ffffffff2ebdap-1, + 0x1.ffffffff3d843p-1, + 0x1.ffffffff4b453p-1, + 0x1.ffffffff58126p-1, + 0x1.ffffffff63fc3p-1, + 0x1.ffffffff6f121p-1, + 0x1.ffffffff79626p-1, + 0x1.ffffffff82fabp-1, + 0x1.ffffffff8be77p-1, + 0x1.ffffffff94346p-1, + 0x1.ffffffff9bec8p-1, + 0x1.ffffffffa319fp-1, + 0x1.ffffffffa9c63p-1, + 0x1.ffffffffaffa4p-1, + 0x1.ffffffffb5be5p-1, + 0x1.ffffffffbb1a2p-1, + 0x1.ffffffffc014ep-1, + 0x1.ffffffffc4b56p-1, + 0x1.ffffffffc901cp-1, + 0x1.ffffffffccfffp-1, + 0x1.ffffffffd0b56p-1, + 0x1.ffffffffd4271p-1, + 0x1.ffffffffd759dp-1, + 0x1.ffffffffda520p-1, + 0x1.ffffffffdd13cp-1, + 0x1.ffffffffdfa2dp-1, + 0x1.ffffffffe202dp-1, + 0x1.ffffffffe4371p-1, + 0x1.ffffffffe642ap-1, + 0x1.ffffffffe8286p-1, + 0x1.ffffffffe9eb0p-1, + 0x1.ffffffffeb8d0p-1, + 0x1.ffffffffed10ap-1, + 0x1.ffffffffee782p-1, + 0x1.ffffffffefc57p-1, + 0x1.fffffffff0fa7p-1, + 0x1.fffffffff218fp-1, + 0x1.fffffffff3227p-1, + 0x1.fffffffff4188p-1, + 0x1.fffffffff4fc9p-1, + 0x1.fffffffff5cfdp-1, + 0x1.fffffffff6939p-1, + 0x1.fffffffff748ep-1, + 0x1.fffffffff7f0dp-1, + 0x1.fffffffff88c5p-1, + 0x1.fffffffff91c6p-1, + 0x1.fffffffff9a1bp-1, + 0x1.fffffffffa1d2p-1, + 0x1.fffffffffa8f6p-1, + 0x1.fffffffffaf92p-1, + 0x1.fffffffffb5b0p-1, + 0x1.fffffffffbb58p-1, + 0x1.fffffffffc095p-1, + 0x1.fffffffffc56dp-1, + 0x1.fffffffffc9e8p-1, + 0x1.fffffffffce0dp-1, + 0x1.fffffffffd1e1p-1, + 0x1.fffffffffd56cp-1, + 0x1.fffffffffd8b3p-1, + 0x1.fffffffffdbbap-1, + 0x1.fffffffffde86p-1, + 0x1.fffffffffe11dp-1, + 0x1.fffffffffe380p-1, + 0x1.fffffffffe5b6p-1, + 0x1.fffffffffe7c0p-1, + 0x1.fffffffffe9a2p-1, + 0x1.fffffffffeb60p-1, + 0x1.fffffffffecfbp-1, + 0x1.fffffffffee77p-1, + 0x1.fffffffffefd6p-1, + 0x1.ffffffffff11ap-1, + 0x1.ffffffffff245p-1, + 0x1.ffffffffff359p-1, + 0x1.ffffffffff457p-1, + 0x1.ffffffffff542p-1, + 0x1.ffffffffff61bp-1, + 0x1.ffffffffff6e3p-1, + 0x1.ffffffffff79bp-1, + 0x1.ffffffffff845p-1, + 0x1.ffffffffff8e2p-1, + 0x1.ffffffffff973p-1, + 0x1.ffffffffff9f8p-1, + 0x1.ffffffffffa73p-1, + 0x1.ffffffffffae4p-1, + 0x1.ffffffffffb4cp-1, + 0x1.ffffffffffbadp-1, + 0x1.ffffffffffc05p-1, + 0x1.ffffffffffc57p-1, + 0x1.ffffffffffca2p-1, + 0x1.ffffffffffce7p-1, + 0x1.ffffffffffd27p-1, + 0x1.ffffffffffd62p-1, + 0x1.ffffffffffd98p-1, + 0x1.ffffffffffdcap-1, + 0x1.ffffffffffdf8p-1, + 0x1.ffffffffffe22p-1, + 0x1.ffffffffffe49p-1, + 0x1.ffffffffffe6cp-1, + 0x1.ffffffffffe8dp-1, + 0x1.ffffffffffeabp-1, + 0x1.ffffffffffec7p-1, + 0x1.ffffffffffee1p-1, + 0x1.ffffffffffef8p-1, + 0x1.fffffffffff0ep-1, + 0x1.fffffffffff22p-1, + 0x1.fffffffffff34p-1, + 0x1.fffffffffff45p-1, + 0x1.fffffffffff54p-1, + 0x1.fffffffffff62p-1, + 0x1.fffffffffff6fp-1, + 0x1.fffffffffff7bp-1, + 0x1.fffffffffff86p-1, + 0x1.fffffffffff90p-1, + 0x1.fffffffffff9ap-1, + 0x1.fffffffffffa2p-1, + 0x1.fffffffffffaap-1, + 0x1.fffffffffffb1p-1, + 0x1.fffffffffffb8p-1, + 0x1.fffffffffffbep-1, + 0x1.fffffffffffc3p-1, + 0x1.fffffffffffc8p-1, + 0x1.fffffffffffcdp-1, + 0x1.fffffffffffd1p-1, + 0x1.fffffffffffd5p-1, + 0x1.fffffffffffd9p-1, + 0x1.fffffffffffdcp-1, + 0x1.fffffffffffdfp-1, + 0x1.fffffffffffe2p-1, + 0x1.fffffffffffe4p-1, + 0x1.fffffffffffe7p-1, + 0x1.fffffffffffe9p-1, + 0x1.fffffffffffebp-1, + 0x1.fffffffffffedp-1, + 0x1.fffffffffffeep-1, + 0x1.ffffffffffff0p-1, + 0x1.ffffffffffff1p-1, + 
0x1.ffffffffffff3p-1, + 0x1.ffffffffffff4p-1, + 0x1.ffffffffffff5p-1, + 0x1.ffffffffffff6p-1, + 0x1.ffffffffffff7p-1, + 0x1.ffffffffffff7p-1, + 0x1.ffffffffffff8p-1, + 0x1.ffffffffffff9p-1, + 0x1.ffffffffffff9p-1, + 0x1.ffffffffffffap-1, + 0x1.ffffffffffffbp-1, + 0x1.ffffffffffffbp-1, + 0x1.ffffffffffffbp-1, + 0x1.ffffffffffffcp-1, + 0x1.ffffffffffffcp-1, + 0x1.ffffffffffffdp-1, + 0x1.ffffffffffffdp-1, + 0x1.ffffffffffffdp-1, + 0x1.ffffffffffffdp-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + }, + .scale = { 0x1.20dd750429b6dp+0, + 0x1.20d8f1975c85dp+0, + 0x1.20cb67bd452c7p+0, + 0x1.20b4d8bac36c1p+0, + 0x1.209546ad13ccfp+0, + 0x1.206cb4897b148p+0, + 0x1.203b261cd0052p+0, + 0x1.2000a00ae3804p+0, + 0x1.1fbd27cdc72d3p+0, + 0x1.1f70c3b4f2cc7p+0, + 0x1.1f1b7ae44867fp+0, + 0x1.1ebd5552f795bp+0, + 0x1.1e565bca400d4p+0, + 0x1.1de697e413d28p+0, + 0x1.1d6e14099944ap+0, + 0x1.1cecdb718d61cp+0, + 0x1.1c62fa1e869b6p+0, + 0x1.1bd07cdd189acp+0, + 0x1.1b357141d95d5p+0, + 0x1.1a91e5a748165p+0, + 0x1.19e5e92b964abp+0, + 0x1.19318bae53a04p+0, + 0x1.1874ddcdfce24p+0, + 0x1.17aff0e56ec10p+0, + 0x1.16e2d7093cd8cp+0, + 0x1.160da304ed92fp+0, + 0x1.153068581b781p+0, + 0x1.144b3b337c90cp+0, + 0x1.135e3075d076bp+0, + 0x1.12695da8b5bdep+0, + 0x1.116cd8fd67618p+0, + 0x1.1068b94962e5ep+0, + 0x1.0f5d1602f7e41p+0, + 0x1.0e4a073dc1b91p+0, + 0x1.0d2fa5a70c168p+0, + 0x1.0c0e0a8223359p+0, + 0x1.0ae54fa490722p+0, + 0x1.09b58f724416bp+0, + 0x1.087ee4d9ad247p+0, + 0x1.07416b4fbfe7cp+0, + 0x1.05fd3ecbec297p+0, + 0x1.04b27bc403d30p+0, + 0x1.03613f2812dafp+0, + 0x1.0209a65e29545p+0, + 0x1.00abcf3e187a9p+0, + 0x1.fe8fb01a47307p-1, + 0x1.fbbbbef34b4b2p-1, + 0x1.f8dc092d58ff8p-1, + 0x1.f5f0cdaf15313p-1, + 0x1.f2fa4c16c0019p-1, + 0x1.eff8c4b1375dbp-1, + 0x1.ecec7870ebca7p-1, + 0x1.e9d5a8e4c934ep-1, + 0x1.e6b4982f158b9p-1, + 0x1.e38988fc46e72p-1, + 0x1.e054be79d3042p-1, + 0x1.dd167c4cf9d2ap-1, + 0x1.d9cf06898cdafp-1, + 0x1.d67ea1a8b5368p-1, + 0x1.d325927fb9d89p-1, + 0x1.cfc41e36c7df9p-1, + 0x1.cc5a8a3fbea40p-1, + 0x1.c8e91c4d01368p-1, + 0x1.c5701a484ef9dp-1, + 0x1.c1efca49a5011p-1, + 0x1.be68728e29d5dp-1, + 0x1.bada596f25436p-1, + 0x1.b745c55905bf8p-1, + 0x1.b3aafcc27502ep-1, + 0x1.b00a46237d5bep-1, + 0x1.ac63e7ecc1411p-1, + 0x1.a8b8287ec6a09p-1, + 0x1.a5074e2157620p-1, + 0x1.a1519efaf889ep-1, + 0x1.9d97610879642p-1, + 0x1.99d8da149c13fp-1, + 0x1.96164fafd8de3p-1, + 0x1.925007283d7aap-1, + 0x1.8e86458169af8p-1, + 0x1.8ab94f6caa71dp-1, + 0x1.86e9694134b9ep-1, + 0x1.8316d6f48133dp-1, + 0x1.7f41dc12c9e89p-1, + 0x1.7b6abbb7aaf19p-1, + 0x1.7791b886e7403p-1, + 0x1.73b714a552763p-1, + 0x1.6fdb11b1e0c34p-1, + 0x1.6bfdf0beddaf5p-1, + 0x1.681ff24b4ab04p-1, + 0x1.6441563c665d4p-1, + 0x1.60625bd75d07bp-1, + 0x1.5c8341bb23767p-1, + 0x1.58a445da7c74cp-1, + 0x1.54c5a57629db0p-1, + 0x1.50e79d1749ac9p-1, + 0x1.4d0a6889dfd9fp-1, + 0x1.492e42d78d2c5p-1, + 0x1.4553664273d24p-1, + 0x1.417a0c4049fd0p-1, + 0x1.3da26d759aef5p-1, + 
0x1.39ccc1b136d5ap-1, + 0x1.35f93fe7d1b3dp-1, + 0x1.32281e2fd1a92p-1, + 0x1.2e5991bd4cbfcp-1, + 0x1.2a8dcede3673bp-1, + 0x1.26c508f6bd0ffp-1, + 0x1.22ff727dd6f7bp-1, + 0x1.1f3d3cf9ffe5ap-1, + 0x1.1b7e98fe26217p-1, + 0x1.17c3b626c7a11p-1, + 0x1.140cc3173f007p-1, + 0x1.1059ed7740313p-1, + 0x1.0cab61f084b93p-1, + 0x1.09014c2ca74dap-1, + 0x1.055bd6d32e8d7p-1, + 0x1.01bb2b87c6968p-1, + 0x1.fc3ee5d1524b0p-2, + 0x1.f511a91a67d2ap-2, + 0x1.edeeee0959518p-2, + 0x1.e6d6ffaa65a25p-2, + 0x1.dfca26f5bbf88p-2, + 0x1.d8c8aace11e63p-2, + 0x1.d1d2cfff91594p-2, + 0x1.cae8d93f1d7b6p-2, + 0x1.c40b0729ed547p-2, + 0x1.bd3998457afdap-2, + 0x1.b674c8ffc6283p-2, + 0x1.afbcd3afe8ab6p-2, + 0x1.a911f096fbc26p-2, + 0x1.a27455e14c93cp-2, + 0x1.9be437a7de946p-2, + 0x1.9561c7f23a47bp-2, + 0x1.8eed36b886d93p-2, + 0x1.8886b1e5ecfd1p-2, + 0x1.822e655b417e6p-2, + 0x1.7be47af1f5d89p-2, + 0x1.75a91a7f4d2edp-2, + 0x1.6f7c69d7d3ef8p-2, + 0x1.695e8cd31867ep-2, + 0x1.634fa54fa285fp-2, + 0x1.5d4fd33729015p-2, + 0x1.575f3483021c3p-2, + 0x1.517de540ce2a3p-2, + 0x1.4babff975a04cp-2, + 0x1.45e99bcbb7915p-2, + 0x1.4036d0468a7a2p-2, + 0x1.3a93b1998736cp-2, + 0x1.35005285227f1p-2, + 0x1.2f7cc3fe6f423p-2, + 0x1.2a09153529381p-2, + 0x1.24a55399ea239p-2, + 0x1.1f518ae487dc8p-2, + 0x1.1a0dc51a9934dp-2, + 0x1.14da0a961fd14p-2, + 0x1.0fb6620c550afp-2, + 0x1.0aa2d09497f2bp-2, + 0x1.059f59af7a906p-2, + 0x1.00abff4dec7a3p-2, + 0x1.f79183b101c5bp-3, + 0x1.edeb406d9c824p-3, + 0x1.e4652fadcb6b2p-3, + 0x1.daff4969c0b04p-3, + 0x1.d1b982c501370p-3, + 0x1.c893ce1dcbef7p-3, + 0x1.bf8e1b1ca2279p-3, + 0x1.b6a856c3ed54fp-3, + 0x1.ade26b7fbed95p-3, + 0x1.a53c4135a6526p-3, + 0x1.9cb5bd549b111p-3, + 0x1.944ec2e4f5630p-3, + 0x1.8c07329874652p-3, + 0x1.83deeada4d25ap-3, + 0x1.7bd5c7df3fe9cp-3, + 0x1.73eba3b5b07b7p-3, + 0x1.6c205655be71fp-3, + 0x1.6473b5b15a7a1p-3, + 0x1.5ce595c455b0ap-3, + 0x1.5575c8a468361p-3, + 0x1.4e241e912c305p-3, + 0x1.46f066040a832p-3, + 0x1.3fda6bc016994p-3, + 0x1.38e1fae1d6a9dp-3, + 0x1.3206dceef5f87p-3, + 0x1.2b48d9e5dea1cp-3, + 0x1.24a7b84d38971p-3, + 0x1.1e233d434b813p-3, + 0x1.17bb2c8d41535p-3, + 0x1.116f48a6476ccp-3, + 0x1.0b3f52ce8c383p-3, + 0x1.052b0b1a174eap-3, + 0x1.fe6460fef4680p-4, + 0x1.f2a901ccafb37p-4, + 0x1.e723726b824a9p-4, + 0x1.dbd32ac4c99b0p-4, + 0x1.d0b7a0f921e7cp-4, + 0x1.c5d0497c09e74p-4, + 0x1.bb1c972f23e50p-4, + 0x1.b09bfb7d11a83p-4, + 0x1.a64de673e8837p-4, + 0x1.9c31c6df3b1b8p-4, + 0x1.92470a61b6965p-4, + 0x1.888d1d8e510a3p-4, + 0x1.7f036c0107294p-4, + 0x1.75a96077274bap-4, + 0x1.6c7e64e7281cbp-4, + 0x1.6381e2980956bp-4, + 0x1.5ab342383d177p-4, + 0x1.5211ebf41880bp-4, + 0x1.499d478bca735p-4, + 0x1.4154bc68d75c3p-4, + 0x1.3937b1b319259p-4, + 0x1.31458e6542847p-4, + 0x1.297db960e4f63p-4, + 0x1.21df9981f8e53p-4, + 0x1.1a6a95b1e786fp-4, + 0x1.131e14fa1625dp-4, + 0x1.0bf97e95f2a64p-4, + 0x1.04fc3a0481321p-4, + 0x1.fc4b5e32d6259p-5, + 0x1.eeea8c1b1db93p-5, + 0x1.e1d4cf1e2450ap-5, + 0x1.d508f9a1ea64ep-5, + 0x1.c885df3451a07p-5, + 0x1.bc4a54a84e834p-5, + 0x1.b055303221015p-5, + 0x1.a4a549829587ep-5, + 0x1.993979e14fffdp-5, + 0x1.8e109c4622913p-5, + 0x1.83298d717210ep-5, + 0x1.78832c03aa2b1p-5, + 0x1.6e1c5893c380bp-5, + 0x1.63f3f5c4de13bp-5, + 0x1.5a08e85af27e0p-5, + 0x1.505a174e9c929p-5, + 0x1.46e66be002240p-5, + 0x1.3dacd1a8d8ccdp-5, + 0x1.34ac36ad8dafep-5, + 0x1.2be38b6d92415p-5, + 0x1.2351c2f2d1449p-5, + 0x1.1af5d2e04f3f6p-5, + 0x1.12ceb37ff9bc3p-5, + 0x1.0adb5fcfa8c75p-5, + 0x1.031ad58d56279p-5, + 0x1.f7182a851bca2p-6, + 0x1.e85c449e377f2p-6, + 0x1.da0005e5f28dfp-6, + 0x1.cc0180af00a8bp-6, + 0x1.be5ecd2fcb5f9p-6, + 
0x1.b1160991ff737p-6, + 0x1.a4255a00b9f03p-6, + 0x1.978ae8b55ce1bp-6, + 0x1.8b44e6031383ep-6, + 0x1.7f5188610ddc8p-6, + 0x1.73af0c737bb45p-6, + 0x1.685bb5134ef13p-6, + 0x1.5d55cb54cd53ap-6, + 0x1.529b9e8cf9a1ep-6, + 0x1.482b8455dc491p-6, + 0x1.3e03d891b37dep-6, + 0x1.3422fd6d12e2bp-6, + 0x1.2a875b5ffab56p-6, + 0x1.212f612dee7fbp-6, + 0x1.181983e5133ddp-6, + 0x1.0f443edc5ce49p-6, + 0x1.06ae13b0d3255p-6, + 0x1.fcab1483ea7fcp-7, + 0x1.ec72615a894c4p-7, + 0x1.dcaf3691fc448p-7, + 0x1.cd5ec93c12431p-7, + 0x1.be7e5ac24963bp-7, + 0x1.b00b38d6b3575p-7, + 0x1.a202bd6372dcep-7, + 0x1.94624e78e0fafp-7, + 0x1.87275e3a6869dp-7, + 0x1.7a4f6aca256cbp-7, + 0x1.6dd7fe3358230p-7, + 0x1.61beae53b72b7p-7, + 0x1.56011cc3b036dp-7, + 0x1.4a9cf6bda3f4cp-7, + 0x1.3f8ff5042a88ep-7, + 0x1.34d7dbc76d7e5p-7, + 0x1.2a727a89a3f14p-7, + 0x1.205dac02bd6b9p-7, + 0x1.1697560347b25p-7, + 0x1.0d1d69569b82dp-7, + 0x1.03ede1a45bfeep-7, + 0x1.f60d8aa2a88f2p-8, + 0x1.e4cc4abf7d065p-8, + 0x1.d4143a9dfe965p-8, + 0x1.c3e1a5f5c077cp-8, + 0x1.b430ecf4a83a8p-8, + 0x1.a4fe83fb9db25p-8, + 0x1.9646f35a76623p-8, + 0x1.8806d70b2fc36p-8, + 0x1.7a3ade6c8b3e4p-8, + 0x1.6cdfcbfc1e263p-8, + 0x1.5ff2750fe7820p-8, + 0x1.536fc18f7ce5cp-8, + 0x1.4754abacdf1dcp-8, + 0x1.3b9e3f9d06e3fp-8, + 0x1.30499b503957fp-8, + 0x1.2553ee2a336bfp-8, + 0x1.1aba78ba3af89p-8, + 0x1.107a8c7323a6ep-8, + 0x1.06918b6355624p-8, + 0x1.f9f9cfd9c3035p-9, + 0x1.e77448fb66bb9p-9, + 0x1.d58da68fd1170p-9, + 0x1.c4412bf4b8f0bp-9, + 0x1.b38a3af2e55b4p-9, + 0x1.a3645330550ffp-9, + 0x1.93cb11a30d765p-9, + 0x1.84ba3004a50d0p-9, + 0x1.762d84469c18fp-9, + 0x1.6821000795a03p-9, + 0x1.5a90b00981d93p-9, + 0x1.4d78bba8ca5fdp-9, + 0x1.40d564548fad7p-9, + 0x1.34a305080681fp-9, + 0x1.28de11c5031ebp-9, + 0x1.1d83170fbf6fbp-9, + 0x1.128eb96be8798p-9, + 0x1.07fdb4dafea5fp-9, + 0x1.fb99b8b8279e1p-10, + 0x1.e7f232d9e2630p-10, + 0x1.d4fed7195d7e8p-10, + 0x1.c2b9cf7f893bfp-10, + 0x1.b11d702b3deb1p-10, + 0x1.a024365f771bdp-10, + 0x1.8fc8c794b03b5p-10, + 0x1.8005f08d6f1efp-10, + 0x1.70d6a46e07ddap-10, + 0x1.6235fbd7a4345p-10, + 0x1.541f340697987p-10, + 0x1.468dadf4080abp-10, + 0x1.397ced7af2b15p-10, + 0x1.2ce898809244ep-10, + 0x1.20cc76202c5fap-10, + 0x1.15246dda49d47p-10, + 0x1.09ec86c75d497p-10, + 0x1.fe41cd9bb4eeep-11, + 0x1.e97ba3b77f306p-11, + 0x1.d57f524723822p-11, + 0x1.c245d4b998479p-11, + 0x1.afc85e0f82e12p-11, + 0x1.9e005769dbc1dp-11, + 0x1.8ce75e9f6f8a0p-11, + 0x1.7c7744d9378f7p-11, + 0x1.6caa0d3582fe9p-11, + 0x1.5d79eb71e893bp-11, + 0x1.4ee1429bf7cc0p-11, + 0x1.40daa3c89f5b6p-11, + 0x1.3360ccd23db3ap-11, + 0x1.266ea71d4f71ap-11, + 0x1.19ff4663ae9dfp-11, + 0x1.0e0de78654d1ep-11, + 0x1.0295ef6591848p-11, + 0x1.ef25d37f49fe1p-12, + 0x1.da01102b5f851p-12, + 0x1.c5b5412dcafadp-12, + 0x1.b23a5a23e4210p-12, + 0x1.9f8893d8fd1c1p-12, + 0x1.8d986a4187285p-12, + 0x1.7c629a822bc9ep-12, + 0x1.6be02102b3520p-12, + 0x1.5c0a378c90bcap-12, + 0x1.4cda5374ea275p-12, + 0x1.3e4a23d1f4702p-12, + 0x1.30538fbb77ecdp-12, + 0x1.22f0b496539bdp-12, + 0x1.161be46ad3b50p-12, + 0x1.09cfa445b00ffp-12, + 0x1.fc0d55470cf51p-13, + 0x1.e577bbcd49935p-13, + 0x1.cfd4a5adec5bfp-13, + 0x1.bb1a9657ce465p-13, + 0x1.a740684026555p-13, + 0x1.943d4a1d1ed39p-13, + 0x1.8208bc334a6a5p-13, + 0x1.709a8db59f25cp-13, + 0x1.5feada379d8b7p-13, + 0x1.4ff207314a102p-13, + 0x1.40a8c1949f75ep-13, + 0x1.3207fb7420eb9p-13, + 0x1.2408e9ba3327fp-13, + 0x1.16a501f0e42cap-13, + 0x1.09d5f819c9e29p-13, + 0x1.fb2b792b40a22p-14, + 0x1.e3bcf436a1a95p-14, + 0x1.cd55277c18d05p-14, + 0x1.b7e94604479dcp-14, + 0x1.a36eec00926ddp-14, + 0x1.8fdc1b2dcf7b9p-14, + 
0x1.7d2737527c3f9p-14, + 0x1.6b4702d7d5849p-14, + 0x1.5a329b7d30748p-14, + 0x1.49e17724f4d41p-14, + 0x1.3a4b60ba9aa4dp-14, + 0x1.2b6875310f785p-14, + 0x1.1d312098e9dbap-14, + 0x1.0f9e1b4dd36dfp-14, + 0x1.02a8673a94691p-14, + 0x1.ec929a665b449p-15, + 0x1.d4f4b4c8e09edp-15, + 0x1.be6abbb10a5aap-15, + 0x1.a8e8cc1fadef6p-15, + 0x1.94637d5bacfdbp-15, + 0x1.80cfdc72220cfp-15, + 0x1.6e2367dc27f95p-15, + 0x1.5c540b4936fd2p-15, + 0x1.4b581b8d170fcp-15, + 0x1.3b2652b06c2b2p-15, + 0x1.2bb5cc22e5db6p-15, + 0x1.1cfe010e2052dp-15, + 0x1.0ef6c4c84a0fep-15, + 0x1.01984165a5f36p-15, + 0x1.e9b5e8d00ce76p-16, + 0x1.d16f5716c6c1ap-16, + 0x1.ba4f035d60e02p-16, + 0x1.a447b7b03f045p-16, + 0x1.8f4ccca7fc90dp-16, + 0x1.7b5223dac7336p-16, + 0x1.684c227fcacefp-16, + 0x1.562fac4329b48p-16, + 0x1.44f21e49054f2p-16, + 0x1.34894a5e24657p-16, + 0x1.24eb7254ccf83p-16, + 0x1.160f438c70913p-16, + 0x1.07ebd2a2d2844p-16, + 0x1.f4f12e9ab070ap-17, + 0x1.db5ad0b27805cp-17, + 0x1.c304efa2c6f4ep-17, + 0x1.abe09e9144b5ep-17, + 0x1.95df988e76644p-17, + 0x1.80f439b4ee04bp-17, + 0x1.6d11788a69c64p-17, + 0x1.5a2adfa0b4bc4p-17, + 0x1.4834877429b8fp-17, + 0x1.37231085c7d9ap-17, + 0x1.26eb9daed6f7ep-17, + 0x1.1783ceac28910p-17, + 0x1.08e1badf0fcedp-17, + 0x1.f5f7d88472604p-18, + 0x1.db92b5212fb8dp-18, + 0x1.c282cd3957edap-18, + 0x1.aab7abace48dcp-18, + 0x1.94219bfcb4928p-18, + 0x1.7eb1a2075864dp-18, + 0x1.6a597219a93d9p-18, + 0x1.570b69502f313p-18, + 0x1.44ba864670882p-18, + 0x1.335a62115bce2p-18, + 0x1.22df298214423p-18, + 0x1.133d96ae7e0ddp-18, + 0x1.046aeabcfcdecp-18, + 0x1.ecb9cfe1d8642p-19, + 0x1.d21397ead99cbp-19, + 0x1.b8d094c86d374p-19, + 0x1.a0df0f0c626dcp-19, + 0x1.8a2e269750a39p-19, + 0x1.74adc8f4064d3p-19, + 0x1.604ea819f007cp-19, + 0x1.4d0231928c6f9p-19, + 0x1.3aba85fe22e1fp-19, + 0x1.296a70f414053p-19, + 0x1.1905613b3abf2p-19, + 0x1.097f6156f32c5p-19, + 0x1.f59a20caf6695p-20, + 0x1.d9c73698fb1dcp-20, + 0x1.bf716c6168baep-20, + 0x1.a6852c6b58392p-20, + 0x1.8eefd70594a88p-20, + 0x1.789fb715aae95p-20, + 0x1.6383f726a8e04p-20, + 0x1.4f8c96f26a26ap-20, + 0x1.3caa61607f920p-20, + 0x1.2acee2f5ecdb8p-20, + 0x1.19ec60b1242edp-20, + 0x1.09f5cf4dd2877p-20, + 0x1.f5bd95d8730d8p-21, + 0x1.d9371e2ff7c35p-21, + 0x1.be41de54d155ap-21, + 0x1.a4c89e08ef4f3p-21, + 0x1.8cb738399b12cp-21, + 0x1.75fa8dbc84becp-21, + 0x1.608078a70dcbcp-21, + 0x1.4c37c0394d094p-21, + 0x1.39100d5687bfep-21, + 0x1.26f9df8519bd6p-21, + 0x1.15e6827001f18p-21, + 0x1.05c803e4831c1p-21, + 0x1.ed22548cffd35p-22, + 0x1.d06ad6ecdf971p-22, + 0x1.b551c847fbc96p-22, + 0x1.9bc09f112b494p-22, + 0x1.83a1ff0aa239dp-22, + 0x1.6ce1aa3fd7bddp-22, + 0x1.576c72b514859p-22, + 0x1.43302cc4a0da8p-22, + 0x1.301ba221dc9bbp-22, + 0x1.1e1e857adc568p-22, + 0x1.0d2966b1746f7p-22, + 0x1.fa5b4f49cc6b2p-23, + 0x1.dc3ae30b55c16p-23, + 0x1.bfd7555a3bd68p-23, + 0x1.a517d9e61628ap-23, + 0x1.8be4f8f6c951fp-23, + 0x1.74287ded49339p-23, + 0x1.5dcd669f2cd34p-23, + 0x1.48bfd38302870p-23, + 0x1.34ecf8a3c124ap-23, + 0x1.22430f521cbcfp-23, + 0x1.10b1488aeb235p-23, + 0x1.0027c00a263a6p-23, + 0x1.e12ee004efc37p-24, + 0x1.c3e44ae32b16bp-24, + 0x1.a854ea14102a8p-24, + 0x1.8e6761569f45dp-24, + 0x1.7603bac345f65p-24, + 0x1.5f1353cdad001p-24, + 0x1.4980cb3c80949p-24, + 0x1.3537f00b6ad4dp-24, + 0x1.2225b12bffc68p-24, + 0x1.10380e1adb7e9p-24, + 0x1.febc107d5efaap-25, + 0x1.df0f2a0ee6946p-25, + 0x1.c14b2188bcee4p-25, + 0x1.a553644f7f07dp-25, + 0x1.8b0cfce0579dfp-25, + 0x1.725e7c5dd20f7p-25, + 0x1.5b2fe547a1340p-25, + 0x1.456a974e92e93p-25, + 0x1.30f93c3699078p-25, + 0x1.1dc7b5b978cf8p-25, + 0x1.0bc30c5d52f15p-25, + 
0x1.f5b2be65a0c7fp-26, + 0x1.d5f3a8dea7357p-26, + 0x1.b82915b03515bp-26, + 0x1.9c3517e789488p-26, + 0x1.81fb7df06136ep-26, + 0x1.6961b8d641d06p-26, + 0x1.524ec4d916caep-26, + 0x1.3cab1343d18d1p-26, + 0x1.2860757487a01p-26, + 0x1.155a09065d4f7p-26, + 0x1.0384250e4c9fcp-26, + 0x1.e59890b926c78p-27, + 0x1.c642116a8a9e3p-27, + 0x1.a8e405e651ab6p-27, + 0x1.8d5f98114f872p-27, + 0x1.7397c5a66e307p-27, + 0x1.5b71456c5a4c4p-27, + 0x1.44d26de513197p-27, + 0x1.2fa31d6371537p-27, + 0x1.1bcca373b7b43p-27, + 0x1.0939ab853339fp-27, + 0x1.efac5187b2863p-28, + 0x1.cf1e86235d0e6p-28, + 0x1.b0a68a2128babp-28, + 0x1.9423165bc4444p-28, + 0x1.7974e743dea3cp-28, + 0x1.607e9eacd1050p-28, + 0x1.4924a74dec728p-28, + 0x1.334d19e0c2160p-28, + 0x1.1edfa3c5f5ccap-28, + 0x1.0bc56f1b54701p-28, + 0x1.f3d2185e047d9p-29, + 0x1.d26cb87945e87p-29, + 0x1.b334fac4b9f99p-29, + 0x1.96076f7918d1cp-29, + 0x1.7ac2d72fc2c63p-29, + 0x1.614801550319ep-29, + 0x1.4979ac8b28926p-29, + 0x1.333c68e2d0548p-29, + 0x1.1e767bce37dd7p-29, + 0x1.0b0fc5b6d05a0p-29, + 0x1.f1e3523b41d7dp-30, + 0x1.d00de6608effep-30, + 0x1.b0778b7b3301ap-30, + 0x1.92fb04ec0f6cfp-30, + 0x1.77756ec9f78fap-30, + 0x1.5dc61922d5a06p-30, + 0x1.45ce65699ff6dp-30, + 0x1.2f71a5f159970p-30, + 0x1.1a94ff571654fp-30, + 0x1.071f4bbea09ecp-30, + 0x1.e9f1ff8ddd774p-31, + 0x1.c818223a202c7p-31, + 0x1.a887bd2b4404dp-31, + 0x1.8b1a336c5eb6bp-31, + 0x1.6fab63324088ap-31, + 0x1.56197e30205bap-31, + 0x1.3e44e45301b92p-31, + 0x1.281000bfe4c3fp-31, + 0x1.135f28f2d50b4p-31, + 0x1.00187dded5975p-31, + 0x1.dc479de0ef001p-32, + 0x1.bad4fdad3caa1p-32, + 0x1.9baed3ed27ab8p-32, + 0x1.7ead9ce4285bbp-32, + 0x1.63ac6b4edc88ep-32, + 0x1.4a88be2a6390cp-32, + 0x1.332259185f1a0p-32, + 0x1.1d5b1f3793044p-32, + 0x1.0916f04b6e18bp-32, + 0x1.ec77101de6926p-33, + 0x1.c960bf23153e0p-33, + 0x1.a8bd20fc65ef7p-33, + 0x1.8a61745ec7d1dp-33, + 0x1.6e25d0e756261p-33, + 0x1.53e4f7d1666cbp-33, + 0x1.3b7c27a7ddb0ep-33, + 0x1.24caf2c32af14p-33, + 0x1.0fb3186804d0fp-33, + 0x1.f830c0bb41fd7p-34, + 0x1.d3c0f1a91c846p-34, + 0x1.b1e5acf351d87p-34, + 0x1.92712d259ce66p-34, + 0x1.7538c60a04476p-34, + 0x1.5a14b04b47879p-34, + 0x1.40dfd87456f4cp-34, + 0x1.2977b1172b9d5p-34, + 0x1.13bc07e891491p-34, + 0x1.ff1dbb4300811p-35, + 0x1.d9a880f306bd8p-35, + 0x1.b6e45220b55e0p-35, + 0x1.96a0b33f2c4dap-35, + 0x1.78b07e9e924acp-35, + 0x1.5ce9ab1670dd2p-35, + 0x1.4325167006bb0p-35, + 0x1.2b3e53538ff3fp-35, + 0x1.15137a7f44864p-35, + 0x1.0084ff125639dp-35, + 0x1.daeb0b7311ec7p-36, + 0x1.b7937d1c40c52p-36, + 0x1.96d082f59ab06p-36, + 0x1.7872d9fa10aadp-36, + 0x1.5c4e8e37bc7d0p-36, + 0x1.423ac0df49a40p-36, + 0x1.2a117230ad284p-36, + 0x1.13af4f04f9998p-36, + 0x1.fde703724e560p-37, + 0x1.d77f0c82e7641p-37, + 0x1.b3ee02611d7ddp-37, + 0x1.92ff33023d5bdp-37, + 0x1.7481a9e69f53fp-37, + 0x1.5847eda620959p-37, + 0x1.3e27c1fcc74bdp-37, + 0x1.25f9ee0b923dcp-37, + 0x1.0f9a0686531ffp-37, + 0x1.f5cc7718082afp-38, + 0x1.cf7e53d6a2ca5p-38, + 0x1.ac0f5f3229372p-38, + 0x1.8b498644847eap-38, + 0x1.6cfa9bcca59dcp-38, + 0x1.50f411d4fd2cdp-38, + 0x1.370ab8327af5ep-38, + 0x1.1f167f88c6b6ep-38, + 0x1.08f24085d4597p-38, + 0x1.e8f70e181d619p-39, + 0x1.c324c20e337dcp-39, + 0x1.a03261574b54ep-39, + 0x1.7fe903cdf5855p-39, + 0x1.6215c58da3450p-39, + 0x1.46897d4b69fc6p-39, + 0x1.2d1877d731b7bp-39, + 0x1.159a386b11517p-39, + 0x1.ffd27ae9393cep-40, + 0x1.d7c593130dd0bp-40, + 0x1.b2cd607c79bcfp-40, + 0x1.90ae4d3405651p-40, + 0x1.71312dd1759e2p-40, + 0x1.5422ef5d8949dp-40, + 0x1.39544b0ecc957p-40, + 0x1.20997f73e73ddp-40, + 0x1.09ca0eaacd277p-40, + 0x1.e9810295890ecp-41, + 
0x1.c2b45b5aa4a1dp-41, + 0x1.9eee068fa7596p-41, + 0x1.7df2b399c10a8p-41, + 0x1.5f8b87a31bd85p-41, + 0x1.4385c96e9a2d9p-41, + 0x1.29b2933ef4cbcp-41, + 0x1.11e68a6378f8ap-41, + 0x1.f7f338086a86bp-42, + 0x1.cf8d7d9ce040ap-42, + 0x1.aa577251ae484p-42, + 0x1.8811d739efb5ep-42, + 0x1.68823e52970bep-42, + 0x1.4b72ae68e8b4cp-42, + 0x1.30b14dbe876bcp-42, + 0x1.181012ef86610p-42, + 0x1.01647ba798744p-42, + 0x1.d90e917701675p-43, + 0x1.b2a87e86d0c8ap-43, + 0x1.8f53dcb377293p-43, + 0x1.6ed2f2515e933p-43, + 0x1.50ecc9ed47f19p-43, + 0x1.356cd5ce7799ep-43, + 0x1.1c229a587ab78p-43, + 0x1.04e15ecc7f3f6p-43, + 0x1.deffc7e6a6017p-44, + 0x1.b7b040832f310p-44, + 0x1.938e021f36d76p-44, + 0x1.7258610b3b233p-44, + 0x1.53d3bfc82a909p-44, + 0x1.37c92babdc2fdp-44, + 0x1.1e06010120f6ap-44, + 0x1.065b9616170d4p-44, + 0x1.e13dd96b3753ap-45, + 0x1.b950d32467392p-45, + 0x1.94a72263259a5p-45, + 0x1.72fd93e036cdcp-45, + 0x1.54164576929abp-45, + 0x1.37b83c521fe96p-45, + 0x1.1daf033182e96p-45, + 0x1.05ca50205d26ap-45, + 0x1.dfbb6235639fap-46, + 0x1.b7807e294781fp-46, + 0x1.9298add70a734p-46, + 0x1.70beaf9c7ffb6p-46, + 0x1.51b2cd6709222p-46, + 0x1.353a6cf7f7fffp-46, + 0x1.1b1fa8cbe84a7p-46, + 0x1.0330f0fd69921p-46, + 0x1.da81670f96f9bp-47, + 0x1.b24a16b4d09aap-47, + 0x1.8d6eeb6efdbd6p-47, + 0x1.6ba91ac734785p-47, + 0x1.4cb7966770ab5p-47, + 0x1.305e9721d0981p-47, + 0x1.1667311fff70ap-47, + 0x1.fd3de10d62855p-48, + 0x1.d1aefbcd48d0cp-48, + 0x1.a9cc93c25aca9p-48, + 0x1.85487ee3ea735p-48, + 0x1.63daf8b4b1e0cp-48, + 0x1.45421e69a6ca1p-48, + 0x1.294175802d99ap-48, + 0x1.0fa17bf41068fp-48, + 0x1.f05e82aae2bb9p-49, + 0x1.c578101b29058p-49, + 0x1.9e39dc5dd2f7cp-49, + 0x1.7a553a728bbf2p-49, + 0x1.5982008db1304p-49, + 0x1.3b7e00422e51bp-49, + 0x1.200c898d9ee3ep-49, + 0x1.06f5f7eb65a56p-49, + 0x1.e00e9148a1d25p-50, + 0x1.b623734024e92p-50, + 0x1.8fd4e01891bf8p-50, + 0x1.6cd44c7470d89p-50, + 0x1.4cd9c04158cd7p-50, + 0x1.2fa34bf5c8344p-50, + 0x1.14f4890ff2461p-50, + 0x1.f92c49dfa4df5p-51, + 0x1.ccaaea71ab0dfp-51, + 0x1.a40829f001197p-51, + 0x1.7eef13b59e96cp-51, + 0x1.5d11e1a252bf5p-51, + 0x1.3e296303b2297p-51, + 0x1.21f47009f43cep-51, + 0x1.083768c5e4541p-51, + 0x1.e1777d831265ep-52, + 0x1.b69f10b0191b5p-52, + 0x1.8f8a3a05b5b52p-52, + 0x1.6be573c40c8e7p-52, + 0x1.4b645ba991fdbp-52, + 0x1.2dc119095729fp-52, + }, +};
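
The table just added stores, for each r = i/128 between 0 and 6 (769 entries), erf(r) and 2/sqrt(pi)*exp(-r^2). As a sketch of what the entries mean (not part of the patch), the standalone C program below regenerates them with libm; the hex constant is the table's own value of 2/sqrt(pi) at r = 0, and %a prints the same hex-float notation the table uses. Compile with -lm.

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* First .scale entry of the table is 2/sqrt(pi) itself (r = 0). */
      const double two_over_sqrt_pi = 0x1.20dd750429b6dp+0;
      for (int i = 0; i <= 768; i++)
        {
          double r = i / 128.0;
          printf ("erf = %a  scale = %a\n", erf (r),
                  two_over_sqrt_pi * exp (-r * r));
        }
      return 0;
    }
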
diff --git a/contrib/arm-optimized-routines/pl/math/sv_erfc_1u8.c b/contrib/arm-optimized-routines/pl/math/sv_erfc_1u8.c
new file mode 100644
index 000000000000..a91bef96f2e7
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_erfc_1u8.c
@@ -0,0 +1,164 @@
+/*
+ * Double-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  uint64_t off_idx, off_arr;
+  double max, shift;
+  double p20, p40, p41, p42;
+  double p51, p52;
+  double q5, r5;
+  double q6, r6;
+  double q7, r7;
+  double q8, r8;
+  double q9, r9;
+  uint64_t table_scale;
+} data = {
+  /* Set an offset so the range of the index used for lookup is 3487, and it
+     can be clamped using a saturated add on an offset index.
+     Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */
+  .off_idx = 0xbd3ffffffffff260,
+  .off_arr = 0xfffffffffffff260, /* 0xffffffffffffffff - 3487. */
+  .max = 0x1.b3ep+4, /* 3487/128. */
+  .shift = 0x1p45,
+  .table_scale = 0x37f0000000000000, /* asuint64(0x1p-128). */
+  .p20 = 0x1.5555555555555p-2, /* 1/3, used to compute 2/3 and 1/6. */
+  .p40 = -0x1.999999999999ap-4, /* 1/10. */
+  .p41 = -0x1.999999999999ap-2, /* 2/5. */
+  .p42 = 0x1.1111111111111p-3, /* 2/15. */
+  .p51 = -0x1.c71c71c71c71cp-3, /* 2/9. */
+  .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
+  /* Qi = (i+1) / i, for i = 5, ..., 9. */
+  .q5 = 0x1.3333333333333p0,
+  .q6 = 0x1.2aaaaaaaaaaabp0,
+  .q7 = 0x1.2492492492492p0,
+  .q8 = 0x1.2p0,
+  .q9 = 0x1.1c71c71c71c72p0,
+  /* Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
+  .r5 = -0x1.e79e79e79e79ep-3,
+  .r6 = -0x1.b6db6db6db6dbp-3,
+  .r7 = -0x1.8e38e38e38e39p-3,
+  .r8 = -0x1.6c16c16c16c17p-3,
+  .r9 = -0x1.4f2094f2094f2p-3,
+};
+
+/* Optimized double-precision vector erfc(x).
+   Approximation based on series expansion near x rounded to
+   nearest multiple of 1/128.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+   erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+   poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+              + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+              - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
+              + p6(r) d^6 + ... + p10(r) d^10
+
+   Polynomials p6(r) to p10(r) are computed using recurrence relation
+
+   2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
+   with p0 = 1, and p1(r) = -r.
+
+   Values of erfc(r) and scale are read from lookup tables. Stored values
+   are scaled to avoid hitting the subnormal range.
+
+   Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+   Maximum measured error: 1.71 ULP
+   _ZGVsMxv_erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
+                                    want 0x1.e15fcbea3e7adp-608. */
+svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *dat = ptr_barrier (&data);
+
+  svfloat64_t a = svabs_x (pg, x);
+
+  /* Clamp input at |x| <= 3487/128. */
+  a = svmin_x (pg, a, dat->max);
+
+  /* Reduce x to the nearest multiple of 1/128. */
+  svfloat64_t shift = sv_f64 (dat->shift);
+  svfloat64_t z = svadd_x (pg, a, shift);
+
+  /* Saturate index for the NaN case. */
+  svuint64_t i = svqadd (svreinterpret_u64 (z), dat->off_idx);
+
+  /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
+  i = svadd_x (pg, i, i);
+  const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr;
+  svfloat64_t erfcr = svld1_gather_index (pg, p, i);
+  svfloat64_t scale = svld1_gather_index (pg, p + 1, i);
+
+  /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+  svfloat64_t r = svsub_x (pg, z, shift);
+  svfloat64_t d = svsub_x (pg, a, r);
+  svfloat64_t d2 = svmul_x (pg, d, d);
+  svfloat64_t r2 = svmul_x (pg, r, r);
+
+  /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p9(r) * d^9. */
+  svfloat64_t p1 = r;
+  svfloat64_t third = sv_f64 (dat->p20);
+  svfloat64_t twothird = svmul_x (pg, third, 2.0);
+  svfloat64_t sixth = svmul_x (pg, third, 0.5);
+  svfloat64_t p2 = svmls_x (pg, third, r2, twothird);
+  svfloat64_t p3 = svmad_x (pg, r2, third, -0.5);
+  p3 = svmul_x (pg, r, p3);
+  svfloat64_t p4 = svmla_x (pg, sv_f64 (dat->p41), r2, dat->p42);
+  p4 = svmls_x (pg, sv_f64 (dat->p40), r2, p4);
+  svfloat64_t p5 = svmla_x (pg, sv_f64 (dat->p51), r2, dat->p52);
+  p5 = svmla_x (pg, sixth, r2, p5);
+  p5 = svmul_x (pg, r, p5);
+  /* Compute p_i using recurrence relation:
+     p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
+  svfloat64_t qr5 = svld1rq (svptrue_b64 (), &dat->q5);
+  svfloat64_t qr6 = svld1rq (svptrue_b64 (), &dat->q6);
+  svfloat64_t qr7 = svld1rq (svptrue_b64 (), &dat->q7);
+  svfloat64_t qr8 = svld1rq (svptrue_b64 (), &dat->q8);
+  svfloat64_t qr9 = svld1rq (svptrue_b64 (), &dat->q9);
+  svfloat64_t p6 = svmla_x (pg, p4, p5, svmul_lane (r, qr5, 0));
+  p6 = svmul_lane (p6, qr5, 1);
+  svfloat64_t p7 = svmla_x (pg, p5, p6, svmul_lane (r, qr6, 0));
+  p7 = svmul_lane (p7, qr6, 1);
+  svfloat64_t p8 = svmla_x (pg, p6, p7, svmul_lane (r, qr7, 0));
+  p8 = svmul_lane (p8, qr7, 1);
+  svfloat64_t p9 = svmla_x (pg, p7, p8, svmul_lane (r, qr8, 0));
+  p9 = svmul_lane (p9, qr8, 1);
+  svfloat64_t p10 = svmla_x (pg, p8, p9, svmul_lane (r, qr9, 0));
+  p10 = svmul_lane (p10, qr9, 1);
+  /* Compute polynomial in d using pairwise Horner scheme. */
+  svfloat64_t p90 = svmla_x (pg, p9, d, p10);
+  svfloat64_t p78 = svmla_x (pg, p7, d, p8);
+  svfloat64_t p56 = svmla_x (pg, p5, d, p6);
+  svfloat64_t p34 = svmla_x (pg, p3, d, p4);
+  svfloat64_t p12 = svmla_x (pg, p1, d, p2);
+  svfloat64_t y = svmla_x (pg, p78, d2, p90);
+  y = svmla_x (pg, p56, d2, y);
+  y = svmla_x (pg, p34, d2, y);
+  y = svmla_x (pg, p12, d2, y);
+
+  y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y));
+
+  /* Offset equals 2.0 if sign, else 0.0. */
+  svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
+  svfloat64_t off = svreinterpret_f64 (svlsr_x (pg, sign, 1));
+  /* Handle sign and scale back in a single fma. */
+  svfloat64_t fac = svreinterpret_f64 (svorr_x (pg, sign, dat->table_scale));
+
+  return svmla_x (pg, off, fac, y);
+}
+
+PL_SIG (SV, D, 1, erfc, -6.0, 28.0)
+PL_TEST_ULP (SV_NAME_D1 (erfc), 1.21)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erfc), 0.0, 0x1p-26, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 28.0, inf, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (erfc), -6.0, -inf, 40000)
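
The recurrence quoted in the routine above can be exercised in plain scalar code. The sketch below (illustrative only, not the routine's actual code path; erfc_series is a hypothetical name) rebuilds poly(r, d) from p0 = 1 and p1 = -r with Q_i = (i+1)/i and R_i = -2i/((i+1)(i+2)), and substitutes libm's erfc(r) and exp(-r^2) for the table lookups. It tracks the documented expansion for x in roughly [0, 28].

    #include <math.h>

    /* Scalar model of erfc(x) ~ erfc(r) - scale * d * poly(r, d), where
       r is x rounded to the nearest multiple of 1/128 and d = x - r. */
    static double
    erfc_series (double x)
    {
      double r = nearbyint (x * 128.0) / 128.0;
      double d = x - r;
      double p[11] = { 1.0, -r };
      /* p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1},
         with Q_i = (i+1)/i and R_i = -2 i / ((i+1)(i+2)). */
      for (int i = 0; i <= 8; i++)
        p[i + 2] = (p[i] + r * ((double) (i + 2) / (i + 1)) * p[i + 1])
                   * (-2.0 * (i + 1) / ((i + 2) * (i + 3)));
      /* Horner evaluation of poly(r, d) in d. */
      double poly = p[10];
      for (int i = 9; i >= 0; i--)
        poly = poly * d + p[i];
      double scale = 0x1.20dd750429b6dp+0 * exp (-r * r); /* 2/sqrt(pi) e^-r^2. */
      return erfc (r) - scale * d * poly;
    }
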
diff --git a/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c
deleted file mode 100644
index 076b47129862..000000000000
--- a/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Double-precision SVE erfc(x) function.
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if SV_SUPPORTED
-#include "sv_exp_tail.h"
-
-sv_f64_t __sv_exp_x (sv_f64_t, svbool_t);
-
-static NOINLINE sv_f64_t
-specialcase (sv_f64_t x, sv_f64_t y, svbool_t special)
-{
-  return sv_call_f64 (erfc, x, y, special);
-}
-
-static inline sv_u64_t
-lookup_interval_idx (const svbool_t pg, sv_f64_t abs_x)
-{
-  /* Interval index is calculated by (((abs(x) + 1)^4) >> 53) - 1023, bounded by
-     the number of polynomials. */
-  sv_f64_t xp1 = svadd_n_f64_x (pg, abs_x, 1);
-  xp1 = svmul_f64_x (pg, xp1, xp1);
-  xp1 = svmul_f64_x (pg, xp1, xp1);
-  sv_u64_t interval_idx
-    = svsub_n_u64_x (pg, svlsr_n_u64_x (pg, sv_as_u64_f64 (xp1), 52), 1023);
-  return svsel_u64 (svcmple_n_u64 (pg, interval_idx, ERFC_NUM_INTERVALS),
-                    interval_idx, sv_u64 (ERFC_NUM_INTERVALS));
-}
-
-static inline sv_f64_t
-sv_eval_poly (const svbool_t pg, sv_f64_t z, sv_u64_t idx)
-{
-  sv_u64_t offset = svmul_n_u64_x (pg, idx, ERFC_POLY_ORDER + 1);
-  const double *base = &__v_erfc_data.poly[0][12];
-  sv_f64_t r = sv_lookup_f64_x (pg, base, offset);
-  for (int i = 0; i < ERFC_POLY_ORDER; i++)
-    {
-      base--;
-      sv_f64_t c = sv_lookup_f64_x (pg, base, offset);
-      r = sv_fma_f64_x (pg, z, r, c);
-    }
-  return r;
-}
-
-static inline sv_f64_t
-sv_eval_gauss (const svbool_t pg, sv_f64_t abs_x)
-{
-  /* Accurate evaluation of exp(-x^2). This operation is sensitive to rounding
-     errors in x^2, so we compute an estimate for the error and use a custom exp
-     helper which corrects for the calculated error estimate. */
-  sv_f64_t a2 = svmul_f64_x (pg, abs_x, abs_x);
-
-  /* Split abs_x into (a_hi + a_lo), where a_hi is the 'large' component and
-     a_lo is the 'small' component. */
-  const sv_f64_t scale = sv_f64 (0x1.0000002p27);
-  sv_f64_t a_hi = svneg_f64_x (pg, sv_fma_f64_x (pg, scale, abs_x,
-                                                 svneg_f64_x (pg, abs_x)));
-  a_hi = sv_fma_f64_x (pg, scale, abs_x, a_hi);
-  sv_f64_t a_lo = svsub_f64_x (pg, abs_x, a_hi);
-
-  sv_f64_t a_hi_neg = svneg_f64_x (pg, a_hi);
-  sv_f64_t a_lo_neg = svneg_f64_x (pg, a_lo);
-
-  /* We can then estimate the error in abs_x^2 by computing (abs_x * abs_x) -
-     (a_hi + a_lo) * (a_hi + a_lo). */
-  sv_f64_t e2 = sv_fma_f64_x (pg, a_hi_neg, a_hi, a2);
-  e2 = sv_fma_f64_x (pg, a_hi_neg, a_lo, e2);
-  e2 = sv_fma_f64_x (pg, a_lo_neg, a_hi, e2);
-  e2 = sv_fma_f64_x (pg, a_lo_neg, a_lo, e2);
-
-  return sv_exp_tail (pg, svneg_f64_x (pg, a2), e2);
-}
-
-/* Optimized double precision vector complementary error function erfc.
-   Maximum measured error is 3.64 ULP:
-   __sv_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42
-                                  want 0x1.ff3f4c8e200d9p-42. */
-sv_f64_t
-__sv_erfc_x (sv_f64_t x, const svbool_t pg)
-{
-  sv_u64_t ix = sv_as_u64_f64 (x);
-  sv_f64_t abs_x = svabs_f64_x (pg, x);
-  sv_u64_t atop = svlsr_n_u64_x (pg, sv_as_u64_f64 (abs_x), 52);
-
-  /* Outside of the 'interesting' bounds, [-6, 28], +ve goes to 0, -ve goes
-     to 2. As long as the polynomial is 0 in the boring zone, we can assemble
-     the result correctly. This is dealt with in two ways:
-
-     The 'coarse approach' is that the approximation algorithm is
-     zero-predicated on in_bounds = |x| < 32, which saves the need to do
-     coefficient lookup etc for |x| >= 32.
-
-     The coarse approach misses [-32, -6] and [28, 32], which are dealt with in
-     the polynomial and index calculation, such that the polynomial evaluates to
-     0 in these regions. */
-  /* in_bounds is true for lanes where |x| < 32. */
-  svbool_t in_bounds = svcmplt_n_u64 (pg, atop, 0x404);
-  /* boring_zone = 2 for x < 0, 0 otherwise. */
-  sv_f64_t boring_zone
-    = sv_as_f64_u64 (svlsl_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 63), 62));
-  /* Very small, nan and inf. */
-  svbool_t special_cases
-    = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3cd), 0x432);
-
-  /* erfc(|x|) ~= P_i(|x|-x_i)*exp(-x^2)
-
-     Where P_i is a polynomial and x_i is an offset, both defined in
-     v_erfc_data.c. i is chosen based on which interval x falls in. */
-  sv_u64_t i = lookup_interval_idx (in_bounds, abs_x);
-  sv_f64_t x_i = sv_lookup_f64_x (in_bounds, __v_erfc_data.interval_bounds, i);
-  sv_f64_t p = sv_eval_poly (in_bounds, svsub_f64_x (pg, abs_x, x_i), i);
-  /* 'copy' sign of x to p, i.e. negate p if x is negative. */
-  sv_u64_t sign = svbic_n_u64_z (in_bounds, ix, 0x7fffffffffffffff);
-  p = sv_as_f64_u64 (sveor_u64_z (in_bounds, sv_as_u64_f64 (p), sign));
-
-  sv_f64_t e = sv_eval_gauss (in_bounds, abs_x);
-
-  /* Assemble result: 2-p*e if x<0, p*e otherwise. No need to conditionally
-     select boring_zone because P[V_ERFC_NINTS-1]=0. */
-  sv_f64_t y = sv_fma_f64_x (pg, p, e, boring_zone);
-
-  if (unlikely (svptest_any (pg, special_cases)))
-    {
-      return specialcase (x, y, special_cases);
-    }
-  return y;
-}
-
-PL_ALIAS (__sv_erfc_x, _ZGVsMxv_erfc)
-
-PL_SIG (SV, D, 1, erfc, -4.0, 10.0)
-PL_TEST_ULP (__sv_erfc, 3.15)
-PL_TEST_INTERVAL (__sv_erfc, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (__sv_erfc, 0x1p-127, 0x1p-26, 40000)
-PL_TEST_INTERVAL (__sv_erfc, -0x1p-127, -0x1p-26, 40000)
-PL_TEST_INTERVAL (__sv_erfc, 0x1p-26, 0x1p5, 40000)
-PL_TEST_INTERVAL (__sv_erfc, -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (__sv_erfc, 0, inf, 40000)
-#endif
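
Both replacement erfc routines (the double-precision one above and the single-precision one below) clamp their table index with a single saturating unsigned add instead of a compare-and-select. A scalar model of the single-precision variant, using the off_idx/off_arr constants from the file below (asuint and qadd32 are illustrative helpers, and in the routine proper |x| is clamped to 10.0625 first, so saturation only matters for NaN payloads):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static uint32_t
    asuint (float x)
    {
      uint32_t u;
      memcpy (&u, &x, sizeof (u));
      return u;
    }

    /* Scalar stand-in for the saturating add behind svqadd. */
    static uint32_t
    qadd32 (uint32_t a, uint32_t b)
    {
      uint32_t s = a + b;
      return s < a ? UINT32_MAX : s;
    }

    int
    main (void)
    {
      const uint32_t off_idx = 0xb7fffd7b, off_arr = 0xfffffd7b;
      float xs[] = { 0.0f, 1.0f, 10.0625f, INFINITY, nanf ("") };
      for (int k = 0; k < 5; k++)
        {
          /* Adding 2^17 leaves round(64*|x|) in the low mantissa bits. */
          float z = fabsf (xs[k]) + 0x1p17f;
          uint32_t i = qadd32 (asuint (z), off_idx);
          /* i - off_arr is the table row; NaN/inf saturate to row 644. */
          printf ("x = %-8g -> row %u\n", xs[k], (unsigned) (i - off_arr));
        }
      return 0;
    }
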
diff --git a/contrib/arm-optimized-routines/pl/math/sv_erfcf_1u7.c b/contrib/arm-optimized-routines/pl/math/sv_erfcf_1u7.c
new file mode 100644
index 000000000000..cda8f0b3752e
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_erfcf_1u7.c
@@ -0,0 +1,111 @@
+/*
+ * Single-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  uint32_t off_idx, off_arr;
+  float max, shift;
+  float third, two_thirds, two_over_fifteen, two_over_five, tenth;
+} data = {
+  /* Set an offset so the range of the index used for lookup is 644, and it can
+     be clamped using a saturated add. */
+  .off_idx = 0xb7fffd7b, /* 0xffffffff - asuint(shift) - 644. */
+  .off_arr = 0xfffffd7b, /* 0xffffffff - 644. */
+  .max = 10.0625f, /* 644/64. */
+  .shift = 0x1p17f,
+  .third = 0x1.555556p-2f,
+  .two_thirds = 0x1.555556p-1f,
+  .two_over_fifteen = 0x1.111112p-3f,
+  .two_over_five = -0x1.99999ap-2f,
+  .tenth = -0x1.99999ap-4f,
+};
+
+#define SignMask 0x80000000
+#define TableScale 0x28000000 /* 0x1p-47. */
+
+/* Optimized single-precision vector erfcf(x).
+   Approximation based on series expansion near x rounded to
+   nearest multiple of 1/64.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+   erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+   poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+              + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+
+   Values of erfc(r) and scale are read from lookup tables. Stored values
+   are scaled to avoid hitting the subnormal range.
+
+   Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+   Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
+   _ZGVsMxv_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
+                                want 0x1.f51216p-120. */
+svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *dat = ptr_barrier (&data);
+
+  svfloat32_t a = svabs_x (pg, x);
+
+  /* Clamp input at |x| <= 10.0 + 4/64. */
+  a = svmin_x (pg, a, dat->max);
+
+  /* Reduce x to the nearest multiple of 1/64. */
+  svfloat32_t shift = sv_f32 (dat->shift);
+  svfloat32_t z = svadd_x (pg, a, shift);
+
+  /* Saturate index for the NaN case. */
+  svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);
+
+  /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
+  i = svmul_x (pg, i, 2);
+  const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr;
+  svfloat32_t erfcr = svld1_gather_index (pg, p, i);
+  svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
+
+  /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+  svfloat32_t r = svsub_x (pg, z, shift);
+  svfloat32_t d = svsub_x (pg, a, r);
+  svfloat32_t d2 = svmul_x (pg, d, d);
+  svfloat32_t r2 = svmul_x (pg, r, r);
+
+  svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
+  svfloat32_t third = svdup_lane (coeffs, 0);
+
+  svfloat32_t p1 = r;
+  svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
+  svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
+  svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
+  p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);
+
+  svfloat32_t y = svmla_x (pg, p3, d, p4);
+  y = svmla_x (pg, p2, d, y);
+  y = svmla_x (pg, p1, d, y);
+
+  /* Solves the |x| = inf/nan case. */
+  y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y));
+
+  /* Offset equals 2.0f if sign, else 0.0f. */
+  svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), SignMask);
+  svfloat32_t off = svreinterpret_f32 (svlsr_x (pg, sign, 1));
+  /* Handle sign and scale back in a single fma. */
+  svfloat32_t fac = svreinterpret_f32 (svorr_x (pg, sign, TableScale));
+
+  return svmla_x (pg, off, fac, y);
+}
+
+PL_SIG (SV, F, 1, erfc, -4.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (erfc), 1.14)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erfc), 0.0, 0x1p-26, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 10.0625, inf, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -4.0, -inf, 40000)
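
The tail of the routine above folds two fixups into a single fma: erfc(-x) = 2 - erfc(x), and the 2^47 scaling that keeps the stored table values out of the subnormal range. A scalar model of that assembly step (a sketch; asuint/asfloat are illustrative helpers, and y stands for the scaled-up polynomial result):

    #include <stdint.h>
    #include <string.h>

    static uint32_t
    asuint (float x)
    {
      uint32_t u;
      memcpy (&u, &x, sizeof (u));
      return u;
    }

    static float
    asfloat (uint32_t u)
    {
      float x;
      memcpy (&x, &u, sizeof (x));
      return x;
    }

    static float
    erfc_assemble (float x, float y)
    {
      uint32_t sign = asuint (x) & 0x80000000;
      float off = asfloat (sign >> 1);         /* 2.0f if x < 0, else 0.0f. */
      float fac = asfloat (sign | 0x28000000); /* +/-0x1p-47f. */
      return off + fac * y;                    /* One fma in the vector code. */
    }
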
diff --git a/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c b/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c
deleted file mode 100644
index c7a738c55f7b..000000000000
--- a/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Single-precision vector erf(x) function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if SV_SUPPORTED
-
-#define AbsMask (0x7fffffff)
-
-static NOINLINE sv_f32_t
-__sv_erff_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
-{
-  return sv_call_f32 (erff, x, y, cmp);
-}
-
-sv_f32_t __sv_expf_x (svbool_t, sv_f32_t);
-
-/* Optimized single precision vector erf. Worst-case error is 1.25 ULP:
-   __sv_erff(0x1.dc59fap-1) got 0x1.9f9c88p-1
-                           want 0x1.9f9c8ap-1. */
-sv_f32_t
-__sv_erff_x (sv_f32_t x, const svbool_t pg)
-{
-  sv_u32_t ix = sv_as_u32_f32 (x);
-  sv_u32_t atop = svand_n_u32_x (pg, svlsr_n_u32_x (pg, ix, 16), 0x7fff);
-  /* Handle both inf/nan as well as small values (|x|<2^-28). */
-  svbool_t cmp
-    = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, atop, 0x3180), 0x7ff0 - 0x3180);
-
-  sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask);
-  /* |x| < 0.921875. */
-  svbool_t red = svaclt_n_f32 (pg, x, 0.921875f);
-  /* |x| > 4.0. */
-  svbool_t bor = svacgt_n_f32 (pg, x, 4.0f);
-
-  /* Load polynomial coefficients. */
-  sv_u32_t idx_lo = svsel (red, sv_u32 (0), sv_u32 (1));
-  sv_u32_t idx_hi = svadd_n_u32_x (pg, idx_lo, 2);
-
-  const float *base = (float *) __v_erff_data.coeffs;
-  sv_f32_t c_2_5 = svld1rq (svptrue_b32 (), base + 2);
-  sv_f32_t c_6_9 = svld1rq (svptrue_b32 (), base + 6);
-  sv_f32_t c_10_13 = svld1rq (svptrue_b32 (), base + 10);
-
-  /* Do not need to store elem 0 of __v_erff_data as it is not used. */
-  sv_f32_t p1 = svtbl (c_2_5, idx_lo);
-  sv_f32_t p2 = svtbl (c_2_5, idx_hi);
-  sv_f32_t p3 = svtbl (c_6_9, idx_lo);
-  sv_f32_t p4 = svtbl (c_6_9, idx_hi);
-  sv_f32_t p5 = svtbl (c_10_13, idx_lo);
-  sv_f32_t p6 = svtbl (c_10_13, idx_hi);
-
-  sv_f32_t a = svabs_f32_x (pg, x);
-  /* Square with merging mul - z is x^2 for reduced, |x| otherwise. */
-  sv_f32_t z = svmul_f32_m (red, a, a);
-
-  /* Evaluate polynomial on |x| or x^2. */
-  sv_f32_t r = sv_fma_f32_x (pg, z, p6, p5);
-  r = sv_fma_f32_x (pg, z, r, p4);
-  r = sv_fma_f32_x (pg, z, r, p3);
-  r = sv_fma_f32_x (pg, z, r, p2);
-  r = sv_fma_f32_x (pg, z, r, p1);
-  /* Use merging svmad for last operation - apply first coefficient if not
-     reduced, otherwise r is propagated unchanged. This is because the reduced
-     polynomial has lower order than the non-reduced. */
-  r = svmad_n_f32_m (svnot_b_z (pg, red), r, z, base[1]);
-  r = sv_fma_f32_x (pg, a, r, a);
-
-  /* y = |x| + |x| * P(x^2) if |x| < 0.921875
-     y = 1 - exp (-(|x| + |x| * P(|x|))) otherwise. */
-  sv_f32_t y = __sv_expf_x (pg, svneg_f32_x (pg, r));
-  y = svsel_f32 (red, r, svsubr_n_f32_x (pg, y, 1.0));
-
-  /* Boring domain (absolute value is required to get the sign of erf(-nan)
-     right). */
-  y = svsel_f32 (bor, sv_f32 (1.0f), svabs_f32_x (pg, y));
-
-  /* y = erf(x) if x>0, -erf(-x) otherwise. */
-  y = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign));
-
-  if (unlikely (svptest_any (pg, cmp)))
-    return __sv_erff_specialcase (x, y, cmp);
-  return y;
-}
-
-PL_ALIAS (__sv_erff_x, _ZGVsMxv_erff)
-
-PL_SIG (SV, F, 1, erf, -4.0, 4.0)
-PL_TEST_ULP (__sv_erff, 0.76)
-PL_TEST_INTERVAL (__sv_erff, 0, 0x1p-28, 20000)
-PL_TEST_INTERVAL (__sv_erff, 0x1p-28, 1, 60000)
-PL_TEST_INTERVAL (__sv_erff, 1, 0x1p28, 60000)
-PL_TEST_INTERVAL (__sv_erff, 0x1p28, inf, 20000)
-PL_TEST_INTERVAL (__sv_erff, -0, -0x1p-28, 20000)
-PL_TEST_INTERVAL (__sv_erff, -0x1p-28, -1, 60000)
-PL_TEST_INTERVAL (__sv_erff, -1, -0x1p28, 60000)
-PL_TEST_INTERVAL (__sv_erff, -0x1p28, -inf, 20000)
-#endif
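
The replacement below drops the two polynomial regimes of the file removed above in favour of the same table-driven expansion used by the new erfc routines, with a shorter polynomial: erf(x) ~ erf(r) + scale * d * (1 - r*d - d^2/3). A scalar model with libm standing in for the erf/scale tables (erff_series is a hypothetical name, for illustration only):

    #include <math.h>

    static float
    erff_series (float x)
    {
      float a = fabsf (x);
      float r = nearbyintf (a * 128.0f) / 128.0f; /* Nearest multiple of 1/128. */
      float d = a - r;
      float scale = 0x1.20dd76p+0f * expf (-r * r); /* 2/sqrt(pi) * e^-r^2. */
      float y = erff (r) + scale * d * (1.0f - r * d - d * d / 3.0f);
      return copysignf (y, x); /* erf is odd. */
    }
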
diff --git a/contrib/arm-optimized-routines/pl/math/sv_erff_2u.c b/contrib/arm-optimized-routines/pl/math/sv_erff_2u.c
new file mode 100644
index 000000000000..adeee798ee2e
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_erff_2u.c
@@ -0,0 +1,90 @@
+/*
+ * Single-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  float min, max, scale, shift, third;
+} data = {
+  .min = 0x1.cp-7f, /* 1/64 - 1/512. */
+  .max = 3.9375, /* 4 - 8/128. */
+  .scale = 0x1.20dd76p+0f, /* 2/sqrt(pi). */
+  .shift = 0x1p16f,
+  .third = 0x1.555556p-2f, /* 1/3. */
+};
+
+#define SignMask (0x80000000)
+
+/* Single-precision implementation of vector erf(x).
+   Approximation based on series expansion near x rounded to
+   nearest multiple of 1/128.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+   erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
+
+   Values of erf(r) and scale are read from lookup tables.
+   For |x| < 0x1.cp-7, the algorithm sets r = 0, erf(r) = 0, and scale = 2 /
+   sqrt(pi), so it simply boils down to a Taylor series expansion near 0. For
+   |x| > 3.9375, erf(|x|) rounds to 1.0f.
+
+   Maximum error on each interval:
+   - [0, 0x1.cp-7]: 1.93 ULP
+     _ZGVsMxv_erff(0x1.c373e6p-9) got 0x1.fd686cp-9 want 0x1.fd6868p-9
+   - [0x1.cp-7, 4.0]: 1.26 ULP
+     _ZGVsMxv_erff(0x1.1d002ep+0) got 0x1.c4eb9ap-1 want 0x1.c4eb98p-1. */
+svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *dat = ptr_barrier (&data);
+
+  /* |x| > 1/64 - 1/512. */
+  svbool_t a_gt_min = svacgt (pg, x, dat->min);
+
+  /* |x| >= 4.0 - 8/128. */
+  svbool_t a_ge_max = svacge (pg, x, dat->max);
+  svfloat32_t a = svabs_x (pg, x);
+
+  svfloat32_t shift = sv_f32 (dat->shift);
+  svfloat32_t z = svadd_x (pg, a, shift);
+  svuint32_t i
+    = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift));
+
+  /* Saturate lookup index. */
+  i = svsel (a_ge_max, sv_u32 (512), i);
+
+  /* r and erf(r) set to 0 for |x| below min. */
+  svfloat32_t r = svsub_z (a_gt_min, z, shift);
+  svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i);
+
+  /* scale set to 2/sqrt(pi) for |x| below min. */
+  svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i);
+  scale = svsel (a_gt_min, scale, sv_f32 (dat->scale));
+
+  /* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */
+  svfloat32_t d = svsub_x (pg, a, r);
+  svfloat32_t d2 = svmul_x (pg, d, d);
+  svfloat32_t y = svmla_x (pg, r, d, dat->third);
+  y = svmla_x (pg, erfr, scale, svmls_x (pg, d, d2, y));
+
+  /* Solves the |x| = inf case. */
+  y = svsel (a_ge_max, sv_f32 (1.0f), y);
+
+  /* Copy sign. */
+  svuint32_t ix = svreinterpret_u32 (x);
+  svuint32_t iy = svreinterpret_u32 (y);
+  svuint32_t sign = svand_x (pg, ix, SignMask);
+  return svreinterpret_f32 (svorr_x (pg, sign, iy));
+}
+
+PL_SIG (SV, F, 1, erf, -4.0, 4.0)
+PL_TEST_ULP (SV_NAME_F1 (erf), 1.43)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, 0x1.cp-7, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0x1.cp-7, 3.9375, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 3.9375, inf, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, inf, 4000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_erff_data.c b/contrib/arm-optimized-routines/pl/math/sv_erff_data.c
new file mode 100644
index 000000000000..154d3c188874
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_erff_data.c
@@ -0,0 +1,1046 @@
+/*
+ * Data for approximation of vector erff.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Lookup table used in SVE erff.
+   For each possible rounded input r (multiples of 1/128), between
+   r = 0.0 and r = 4.0 (513 values):
+   - __erff_data.erf contains the values of erf(r),
+   - __erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2).
+   Note that indices 0 and 1 are never hit by the algorithm, since lookup is
+   performed only for x >= 1/64-1/512.
*/ +const struct sv_erff_data __sv_erff_data = { + .erf = { 0x0.000000p+0, + 0x1.20dbf4p-7, + 0x1.20d770p-6, + 0x1.b137e0p-6, + 0x1.20c564p-5, + 0x1.68e5d4p-5, + 0x1.b0fafep-5, + 0x1.f902a8p-5, + 0x1.207d48p-4, + 0x1.44703ep-4, + 0x1.68591ap-4, + 0x1.8c36bep-4, + 0x1.b00812p-4, + 0x1.d3cbf8p-4, + 0x1.f7815ap-4, + 0x1.0d9390p-3, + 0x1.1f5e1ap-3, + 0x1.311fc2p-3, + 0x1.42d7fcp-3, + 0x1.548642p-3, + 0x1.662a0cp-3, + 0x1.77c2d2p-3, + 0x1.895010p-3, + 0x1.9ad142p-3, + 0x1.ac45e4p-3, + 0x1.bdad72p-3, + 0x1.cf076ep-3, + 0x1.e05354p-3, + 0x1.f190aap-3, + 0x1.015f78p-2, + 0x1.09eed6p-2, + 0x1.127632p-2, + 0x1.1af54ep-2, + 0x1.236bf0p-2, + 0x1.2bd9dcp-2, + 0x1.343ed6p-2, + 0x1.3c9aa8p-2, + 0x1.44ed18p-2, + 0x1.4d35f0p-2, + 0x1.5574f4p-2, + 0x1.5da9f4p-2, + 0x1.65d4b8p-2, + 0x1.6df50ap-2, + 0x1.760abap-2, + 0x1.7e1594p-2, + 0x1.861566p-2, + 0x1.8e0a02p-2, + 0x1.95f336p-2, + 0x1.9dd0d2p-2, + 0x1.a5a2acp-2, + 0x1.ad6896p-2, + 0x1.b52264p-2, + 0x1.bccfecp-2, + 0x1.c47104p-2, + 0x1.cc0584p-2, + 0x1.d38d44p-2, + 0x1.db081cp-2, + 0x1.e275eap-2, + 0x1.e9d68ap-2, + 0x1.f129d4p-2, + 0x1.f86faap-2, + 0x1.ffa7eap-2, + 0x1.03693ap-1, + 0x1.06f794p-1, + 0x1.0a7ef6p-1, + 0x1.0dff50p-1, + 0x1.117894p-1, + 0x1.14eab4p-1, + 0x1.1855a6p-1, + 0x1.1bb95cp-1, + 0x1.1f15ccp-1, + 0x1.226ae8p-1, + 0x1.25b8a8p-1, + 0x1.28ff02p-1, + 0x1.2c3decp-1, + 0x1.2f755cp-1, + 0x1.32a54cp-1, + 0x1.35cdb4p-1, + 0x1.38ee8ap-1, + 0x1.3c07cap-1, + 0x1.3f196ep-1, + 0x1.42236ep-1, + 0x1.4525c8p-1, + 0x1.482074p-1, + 0x1.4b1372p-1, + 0x1.4dfebap-1, + 0x1.50e24cp-1, + 0x1.53be26p-1, + 0x1.569244p-1, + 0x1.595ea6p-1, + 0x1.5c2348p-1, + 0x1.5ee02ep-1, + 0x1.619556p-1, + 0x1.6442c0p-1, + 0x1.66e86ep-1, + 0x1.69865ep-1, + 0x1.6c1c98p-1, + 0x1.6eab18p-1, + 0x1.7131e6p-1, + 0x1.73b102p-1, + 0x1.762870p-1, + 0x1.789836p-1, + 0x1.7b0058p-1, + 0x1.7d60d8p-1, + 0x1.7fb9c0p-1, + 0x1.820b12p-1, + 0x1.8454d6p-1, + 0x1.869712p-1, + 0x1.88d1cep-1, + 0x1.8b050ep-1, + 0x1.8d30dep-1, + 0x1.8f5544p-1, + 0x1.91724ap-1, + 0x1.9387f6p-1, + 0x1.959652p-1, + 0x1.979d68p-1, + 0x1.999d42p-1, + 0x1.9b95e8p-1, + 0x1.9d8768p-1, + 0x1.9f71cap-1, + 0x1.a1551ap-1, + 0x1.a33162p-1, + 0x1.a506b0p-1, + 0x1.a6d50cp-1, + 0x1.a89c86p-1, + 0x1.aa5d26p-1, + 0x1.ac16fcp-1, + 0x1.adca14p-1, + 0x1.af767ap-1, + 0x1.b11c3cp-1, + 0x1.b2bb68p-1, + 0x1.b4540ap-1, + 0x1.b5e630p-1, + 0x1.b771e8p-1, + 0x1.b8f742p-1, + 0x1.ba764ap-1, + 0x1.bbef10p-1, + 0x1.bd61a2p-1, + 0x1.bece0ep-1, + 0x1.c03464p-1, + 0x1.c194b2p-1, + 0x1.c2ef08p-1, + 0x1.c44376p-1, + 0x1.c5920ap-1, + 0x1.c6dad2p-1, + 0x1.c81de2p-1, + 0x1.c95b46p-1, + 0x1.ca930ep-1, + 0x1.cbc54cp-1, + 0x1.ccf20cp-1, + 0x1.ce1962p-1, + 0x1.cf3b5cp-1, + 0x1.d0580cp-1, + 0x1.d16f7ep-1, + 0x1.d281c4p-1, + 0x1.d38ef0p-1, + 0x1.d49710p-1, + 0x1.d59a34p-1, + 0x1.d6986cp-1, + 0x1.d791cap-1, + 0x1.d8865ep-1, + 0x1.d97636p-1, + 0x1.da6162p-1, + 0x1.db47f4p-1, + 0x1.dc29fcp-1, + 0x1.dd0788p-1, + 0x1.dde0aap-1, + 0x1.deb570p-1, + 0x1.df85eap-1, + 0x1.e0522ap-1, + 0x1.e11a3ep-1, + 0x1.e1de36p-1, + 0x1.e29e22p-1, + 0x1.e35a12p-1, + 0x1.e41214p-1, + 0x1.e4c638p-1, + 0x1.e5768cp-1, + 0x1.e62322p-1, + 0x1.e6cc08p-1, + 0x1.e7714ap-1, + 0x1.e812fcp-1, + 0x1.e8b12ap-1, + 0x1.e94be4p-1, + 0x1.e9e336p-1, + 0x1.ea7730p-1, + 0x1.eb07e2p-1, + 0x1.eb9558p-1, + 0x1.ec1fa2p-1, + 0x1.eca6ccp-1, + 0x1.ed2ae6p-1, + 0x1.edabfcp-1, + 0x1.ee2a1ep-1, + 0x1.eea556p-1, + 0x1.ef1db4p-1, + 0x1.ef9344p-1, + 0x1.f00614p-1, + 0x1.f07630p-1, + 0x1.f0e3a6p-1, + 0x1.f14e82p-1, + 0x1.f1b6d0p-1, + 0x1.f21ca0p-1, + 0x1.f27ff8p-1, + 0x1.f2e0eap-1, + 0x1.f33f7ep-1, + 0x1.f39bc2p-1, + 
0x1.f3f5c2p-1, + 0x1.f44d88p-1, + 0x1.f4a31ep-1, + 0x1.f4f694p-1, + 0x1.f547f2p-1, + 0x1.f59742p-1, + 0x1.f5e490p-1, + 0x1.f62fe8p-1, + 0x1.f67952p-1, + 0x1.f6c0dcp-1, + 0x1.f7068cp-1, + 0x1.f74a6ep-1, + 0x1.f78c8cp-1, + 0x1.f7cceep-1, + 0x1.f80ba2p-1, + 0x1.f848acp-1, + 0x1.f8841ap-1, + 0x1.f8bdf2p-1, + 0x1.f8f63ep-1, + 0x1.f92d08p-1, + 0x1.f96256p-1, + 0x1.f99634p-1, + 0x1.f9c8a8p-1, + 0x1.f9f9bap-1, + 0x1.fa2974p-1, + 0x1.fa57dep-1, + 0x1.fa84fep-1, + 0x1.fab0dep-1, + 0x1.fadb84p-1, + 0x1.fb04f6p-1, + 0x1.fb2d40p-1, + 0x1.fb5464p-1, + 0x1.fb7a6cp-1, + 0x1.fb9f60p-1, + 0x1.fbc344p-1, + 0x1.fbe61ep-1, + 0x1.fc07fap-1, + 0x1.fc28d8p-1, + 0x1.fc48c2p-1, + 0x1.fc67bcp-1, + 0x1.fc85d0p-1, + 0x1.fca2fep-1, + 0x1.fcbf52p-1, + 0x1.fcdaccp-1, + 0x1.fcf576p-1, + 0x1.fd0f54p-1, + 0x1.fd286ap-1, + 0x1.fd40bep-1, + 0x1.fd5856p-1, + 0x1.fd6f34p-1, + 0x1.fd8562p-1, + 0x1.fd9ae2p-1, + 0x1.fdafb8p-1, + 0x1.fdc3e8p-1, + 0x1.fdd77ap-1, + 0x1.fdea6ep-1, + 0x1.fdfcccp-1, + 0x1.fe0e96p-1, + 0x1.fe1fd0p-1, + 0x1.fe3080p-1, + 0x1.fe40a6p-1, + 0x1.fe504cp-1, + 0x1.fe5f70p-1, + 0x1.fe6e18p-1, + 0x1.fe7c46p-1, + 0x1.fe8a00p-1, + 0x1.fe9748p-1, + 0x1.fea422p-1, + 0x1.feb090p-1, + 0x1.febc96p-1, + 0x1.fec836p-1, + 0x1.fed374p-1, + 0x1.fede52p-1, + 0x1.fee8d4p-1, + 0x1.fef2fep-1, + 0x1.fefccep-1, + 0x1.ff064cp-1, + 0x1.ff0f76p-1, + 0x1.ff1852p-1, + 0x1.ff20e0p-1, + 0x1.ff2924p-1, + 0x1.ff3120p-1, + 0x1.ff38d6p-1, + 0x1.ff4048p-1, + 0x1.ff4778p-1, + 0x1.ff4e68p-1, + 0x1.ff551ap-1, + 0x1.ff5b90p-1, + 0x1.ff61ccp-1, + 0x1.ff67d0p-1, + 0x1.ff6d9ep-1, + 0x1.ff7338p-1, + 0x1.ff789ep-1, + 0x1.ff7dd4p-1, + 0x1.ff82dap-1, + 0x1.ff87b2p-1, + 0x1.ff8c5cp-1, + 0x1.ff90dcp-1, + 0x1.ff9532p-1, + 0x1.ff9960p-1, + 0x1.ff9d68p-1, + 0x1.ffa14ap-1, + 0x1.ffa506p-1, + 0x1.ffa8a0p-1, + 0x1.ffac18p-1, + 0x1.ffaf6ep-1, + 0x1.ffb2a6p-1, + 0x1.ffb5bep-1, + 0x1.ffb8b8p-1, + 0x1.ffbb98p-1, + 0x1.ffbe5ap-1, + 0x1.ffc102p-1, + 0x1.ffc390p-1, + 0x1.ffc606p-1, + 0x1.ffc862p-1, + 0x1.ffcaa8p-1, + 0x1.ffccd8p-1, + 0x1.ffcef4p-1, + 0x1.ffd0fap-1, + 0x1.ffd2eap-1, + 0x1.ffd4cap-1, + 0x1.ffd696p-1, + 0x1.ffd84ep-1, + 0x1.ffd9f8p-1, + 0x1.ffdb90p-1, + 0x1.ffdd18p-1, + 0x1.ffde90p-1, + 0x1.ffdffap-1, + 0x1.ffe154p-1, + 0x1.ffe2a2p-1, + 0x1.ffe3e2p-1, + 0x1.ffe514p-1, + 0x1.ffe63cp-1, + 0x1.ffe756p-1, + 0x1.ffe866p-1, + 0x1.ffe96ap-1, + 0x1.ffea64p-1, + 0x1.ffeb54p-1, + 0x1.ffec3ap-1, + 0x1.ffed16p-1, + 0x1.ffedeap-1, + 0x1.ffeeb4p-1, + 0x1.ffef76p-1, + 0x1.fff032p-1, + 0x1.fff0e4p-1, + 0x1.fff18ep-1, + 0x1.fff232p-1, + 0x1.fff2d0p-1, + 0x1.fff366p-1, + 0x1.fff3f6p-1, + 0x1.fff480p-1, + 0x1.fff504p-1, + 0x1.fff582p-1, + 0x1.fff5fcp-1, + 0x1.fff670p-1, + 0x1.fff6dep-1, + 0x1.fff74ap-1, + 0x1.fff7aep-1, + 0x1.fff810p-1, + 0x1.fff86cp-1, + 0x1.fff8c6p-1, + 0x1.fff91cp-1, + 0x1.fff96cp-1, + 0x1.fff9bap-1, + 0x1.fffa04p-1, + 0x1.fffa4cp-1, + 0x1.fffa90p-1, + 0x1.fffad0p-1, + 0x1.fffb0ep-1, + 0x1.fffb4ap-1, + 0x1.fffb82p-1, + 0x1.fffbb8p-1, + 0x1.fffbecp-1, + 0x1.fffc1ep-1, + 0x1.fffc4ep-1, + 0x1.fffc7ap-1, + 0x1.fffca6p-1, + 0x1.fffccep-1, + 0x1.fffcf6p-1, + 0x1.fffd1ap-1, + 0x1.fffd3ep-1, + 0x1.fffd60p-1, + 0x1.fffd80p-1, + 0x1.fffda0p-1, + 0x1.fffdbep-1, + 0x1.fffddap-1, + 0x1.fffdf4p-1, + 0x1.fffe0ep-1, + 0x1.fffe26p-1, + 0x1.fffe3ep-1, + 0x1.fffe54p-1, + 0x1.fffe68p-1, + 0x1.fffe7ep-1, + 0x1.fffe90p-1, + 0x1.fffea2p-1, + 0x1.fffeb4p-1, + 0x1.fffec4p-1, + 0x1.fffed4p-1, + 0x1.fffee4p-1, + 0x1.fffef2p-1, + 0x1.ffff00p-1, + 0x1.ffff0cp-1, + 0x1.ffff18p-1, + 0x1.ffff24p-1, + 0x1.ffff30p-1, + 0x1.ffff3ap-1, + 0x1.ffff44p-1, + 0x1.ffff4ep-1, + 0x1.ffff56p-1, + 
0x1.ffff60p-1, + 0x1.ffff68p-1, + 0x1.ffff70p-1, + 0x1.ffff78p-1, + 0x1.ffff7ep-1, + 0x1.ffff84p-1, + 0x1.ffff8cp-1, + 0x1.ffff92p-1, + 0x1.ffff98p-1, + 0x1.ffff9cp-1, + 0x1.ffffa2p-1, + 0x1.ffffa6p-1, + 0x1.ffffacp-1, + 0x1.ffffb0p-1, + 0x1.ffffb4p-1, + 0x1.ffffb8p-1, + 0x1.ffffbcp-1, + 0x1.ffffc0p-1, + 0x1.ffffc4p-1, + 0x1.ffffc6p-1, + 0x1.ffffcap-1, + 0x1.ffffccp-1, + 0x1.ffffd0p-1, + 0x1.ffffd2p-1, + 0x1.ffffd4p-1, + 0x1.ffffd6p-1, + 0x1.ffffd8p-1, + 0x1.ffffdcp-1, + 0x1.ffffdep-1, + 0x1.ffffdep-1, + 0x1.ffffe0p-1, + 0x1.ffffe2p-1, + 0x1.ffffe4p-1, + 0x1.ffffe6p-1, + 0x1.ffffe8p-1, + 0x1.ffffe8p-1, + 0x1.ffffeap-1, + 0x1.ffffeap-1, + 0x1.ffffecp-1, + 0x1.ffffeep-1, + 0x1.ffffeep-1, + 0x1.fffff0p-1, + 0x1.fffff0p-1, + 0x1.fffff2p-1, + 0x1.fffff2p-1, + 0x1.fffff2p-1, + 0x1.fffff4p-1, + 0x1.fffff4p-1, + 0x1.fffff4p-1, + 0x1.fffff6p-1, + 0x1.fffff6p-1, + 0x1.fffff6p-1, + 0x1.fffff8p-1, + 0x1.fffff8p-1, + 0x1.fffff8p-1, + 0x1.fffff8p-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + }, + .scale = { 0x1.20dd76p+0, + 0x1.20d8f2p+0, + 0x1.20cb68p+0, + 0x1.20b4d8p+0, + 0x1.209546p+0, + 0x1.206cb4p+0, + 0x1.203b26p+0, + 0x1.2000a0p+0, + 0x1.1fbd28p+0, + 0x1.1f70c4p+0, + 0x1.1f1b7ap+0, + 0x1.1ebd56p+0, + 0x1.1e565cp+0, + 0x1.1de698p+0, + 0x1.1d6e14p+0, + 0x1.1cecdcp+0, + 0x1.1c62fap+0, + 0x1.1bd07cp+0, + 0x1.1b3572p+0, + 0x1.1a91e6p+0, + 0x1.19e5eap+0, + 0x1.19318cp+0, + 0x1.1874dep+0, + 0x1.17aff0p+0, + 0x1.16e2d8p+0, + 0x1.160da4p+0, + 0x1.153068p+0, + 0x1.144b3cp+0, + 0x1.135e30p+0, + 0x1.12695ep+0, + 0x1.116cd8p+0, + 0x1.1068bap+0, + 0x1.0f5d16p+0, + 0x1.0e4a08p+0, + 0x1.0d2fa6p+0, + 0x1.0c0e0ap+0, + 0x1.0ae550p+0, + 0x1.09b590p+0, + 0x1.087ee4p+0, + 0x1.07416cp+0, + 0x1.05fd3ep+0, + 0x1.04b27cp+0, + 0x1.036140p+0, + 0x1.0209a6p+0, + 0x1.00abd0p+0, + 0x1.fe8fb0p-1, + 0x1.fbbbbep-1, + 0x1.f8dc0ap-1, + 0x1.f5f0cep-1, + 0x1.f2fa4cp-1, + 0x1.eff8c4p-1, + 0x1.ecec78p-1, + 0x1.e9d5a8p-1, + 0x1.e6b498p-1, + 0x1.e38988p-1, + 0x1.e054bep-1, + 0x1.dd167cp-1, + 0x1.d9cf06p-1, + 0x1.d67ea2p-1, + 0x1.d32592p-1, + 0x1.cfc41ep-1, + 0x1.cc5a8ap-1, + 0x1.c8e91cp-1, + 0x1.c5701ap-1, + 0x1.c1efcap-1, + 0x1.be6872p-1, + 0x1.bada5ap-1, + 0x1.b745c6p-1, + 0x1.b3aafcp-1, + 0x1.b00a46p-1, + 0x1.ac63e8p-1, + 0x1.a8b828p-1, + 0x1.a5074ep-1, + 0x1.a1519ep-1, + 0x1.9d9762p-1, + 0x1.99d8dap-1, + 0x1.961650p-1, + 0x1.925008p-1, + 0x1.8e8646p-1, + 0x1.8ab950p-1, + 0x1.86e96ap-1, + 0x1.8316d6p-1, + 0x1.7f41dcp-1, + 0x1.7b6abcp-1, + 0x1.7791b8p-1, + 0x1.73b714p-1, + 0x1.6fdb12p-1, + 0x1.6bfdf0p-1, + 0x1.681ff2p-1, + 0x1.644156p-1, + 0x1.60625cp-1, + 0x1.5c8342p-1, + 0x1.58a446p-1, + 0x1.54c5a6p-1, + 0x1.50e79ep-1, + 0x1.4d0a68p-1, + 0x1.492e42p-1, + 0x1.455366p-1, + 0x1.417a0cp-1, + 0x1.3da26ep-1, + 0x1.39ccc2p-1, + 0x1.35f940p-1, + 0x1.32281ep-1, + 0x1.2e5992p-1, + 0x1.2a8dcep-1, + 0x1.26c508p-1, + 0x1.22ff72p-1, + 0x1.1f3d3cp-1, + 0x1.1b7e98p-1, + 
0x1.17c3b6p-1, + 0x1.140cc4p-1, + 0x1.1059eep-1, + 0x1.0cab62p-1, + 0x1.09014cp-1, + 0x1.055bd6p-1, + 0x1.01bb2cp-1, + 0x1.fc3ee6p-2, + 0x1.f511aap-2, + 0x1.edeeeep-2, + 0x1.e6d700p-2, + 0x1.dfca26p-2, + 0x1.d8c8aap-2, + 0x1.d1d2d0p-2, + 0x1.cae8dap-2, + 0x1.c40b08p-2, + 0x1.bd3998p-2, + 0x1.b674c8p-2, + 0x1.afbcd4p-2, + 0x1.a911f0p-2, + 0x1.a27456p-2, + 0x1.9be438p-2, + 0x1.9561c8p-2, + 0x1.8eed36p-2, + 0x1.8886b2p-2, + 0x1.822e66p-2, + 0x1.7be47ap-2, + 0x1.75a91ap-2, + 0x1.6f7c6ap-2, + 0x1.695e8cp-2, + 0x1.634fa6p-2, + 0x1.5d4fd4p-2, + 0x1.575f34p-2, + 0x1.517de6p-2, + 0x1.4bac00p-2, + 0x1.45e99cp-2, + 0x1.4036d0p-2, + 0x1.3a93b2p-2, + 0x1.350052p-2, + 0x1.2f7cc4p-2, + 0x1.2a0916p-2, + 0x1.24a554p-2, + 0x1.1f518ap-2, + 0x1.1a0dc6p-2, + 0x1.14da0ap-2, + 0x1.0fb662p-2, + 0x1.0aa2d0p-2, + 0x1.059f5ap-2, + 0x1.00ac00p-2, + 0x1.f79184p-3, + 0x1.edeb40p-3, + 0x1.e46530p-3, + 0x1.daff4ap-3, + 0x1.d1b982p-3, + 0x1.c893cep-3, + 0x1.bf8e1cp-3, + 0x1.b6a856p-3, + 0x1.ade26cp-3, + 0x1.a53c42p-3, + 0x1.9cb5bep-3, + 0x1.944ec2p-3, + 0x1.8c0732p-3, + 0x1.83deeap-3, + 0x1.7bd5c8p-3, + 0x1.73eba4p-3, + 0x1.6c2056p-3, + 0x1.6473b6p-3, + 0x1.5ce596p-3, + 0x1.5575c8p-3, + 0x1.4e241ep-3, + 0x1.46f066p-3, + 0x1.3fda6cp-3, + 0x1.38e1fap-3, + 0x1.3206dcp-3, + 0x1.2b48dap-3, + 0x1.24a7b8p-3, + 0x1.1e233ep-3, + 0x1.17bb2cp-3, + 0x1.116f48p-3, + 0x1.0b3f52p-3, + 0x1.052b0cp-3, + 0x1.fe6460p-4, + 0x1.f2a902p-4, + 0x1.e72372p-4, + 0x1.dbd32ap-4, + 0x1.d0b7a0p-4, + 0x1.c5d04ap-4, + 0x1.bb1c98p-4, + 0x1.b09bfcp-4, + 0x1.a64de6p-4, + 0x1.9c31c6p-4, + 0x1.92470ap-4, + 0x1.888d1ep-4, + 0x1.7f036cp-4, + 0x1.75a960p-4, + 0x1.6c7e64p-4, + 0x1.6381e2p-4, + 0x1.5ab342p-4, + 0x1.5211ecp-4, + 0x1.499d48p-4, + 0x1.4154bcp-4, + 0x1.3937b2p-4, + 0x1.31458ep-4, + 0x1.297dbap-4, + 0x1.21df9ap-4, + 0x1.1a6a96p-4, + 0x1.131e14p-4, + 0x1.0bf97ep-4, + 0x1.04fc3ap-4, + 0x1.fc4b5ep-5, + 0x1.eeea8cp-5, + 0x1.e1d4d0p-5, + 0x1.d508fap-5, + 0x1.c885e0p-5, + 0x1.bc4a54p-5, + 0x1.b05530p-5, + 0x1.a4a54ap-5, + 0x1.99397ap-5, + 0x1.8e109cp-5, + 0x1.83298ep-5, + 0x1.78832cp-5, + 0x1.6e1c58p-5, + 0x1.63f3f6p-5, + 0x1.5a08e8p-5, + 0x1.505a18p-5, + 0x1.46e66cp-5, + 0x1.3dacd2p-5, + 0x1.34ac36p-5, + 0x1.2be38cp-5, + 0x1.2351c2p-5, + 0x1.1af5d2p-5, + 0x1.12ceb4p-5, + 0x1.0adb60p-5, + 0x1.031ad6p-5, + 0x1.f7182ap-6, + 0x1.e85c44p-6, + 0x1.da0006p-6, + 0x1.cc0180p-6, + 0x1.be5ecep-6, + 0x1.b1160ap-6, + 0x1.a4255ap-6, + 0x1.978ae8p-6, + 0x1.8b44e6p-6, + 0x1.7f5188p-6, + 0x1.73af0cp-6, + 0x1.685bb6p-6, + 0x1.5d55ccp-6, + 0x1.529b9ep-6, + 0x1.482b84p-6, + 0x1.3e03d8p-6, + 0x1.3422fep-6, + 0x1.2a875cp-6, + 0x1.212f62p-6, + 0x1.181984p-6, + 0x1.0f443ep-6, + 0x1.06ae14p-6, + 0x1.fcab14p-7, + 0x1.ec7262p-7, + 0x1.dcaf36p-7, + 0x1.cd5ecap-7, + 0x1.be7e5ap-7, + 0x1.b00b38p-7, + 0x1.a202bep-7, + 0x1.94624ep-7, + 0x1.87275ep-7, + 0x1.7a4f6ap-7, + 0x1.6dd7fep-7, + 0x1.61beaep-7, + 0x1.56011cp-7, + 0x1.4a9cf6p-7, + 0x1.3f8ff6p-7, + 0x1.34d7dcp-7, + 0x1.2a727ap-7, + 0x1.205dacp-7, + 0x1.169756p-7, + 0x1.0d1d6ap-7, + 0x1.03ede2p-7, + 0x1.f60d8ap-8, + 0x1.e4cc4ap-8, + 0x1.d4143ap-8, + 0x1.c3e1a6p-8, + 0x1.b430ecp-8, + 0x1.a4fe84p-8, + 0x1.9646f4p-8, + 0x1.8806d8p-8, + 0x1.7a3adep-8, + 0x1.6cdfccp-8, + 0x1.5ff276p-8, + 0x1.536fc2p-8, + 0x1.4754acp-8, + 0x1.3b9e40p-8, + 0x1.30499cp-8, + 0x1.2553eep-8, + 0x1.1aba78p-8, + 0x1.107a8cp-8, + 0x1.06918cp-8, + 0x1.f9f9d0p-9, + 0x1.e77448p-9, + 0x1.d58da6p-9, + 0x1.c4412cp-9, + 0x1.b38a3ap-9, + 0x1.a36454p-9, + 0x1.93cb12p-9, + 0x1.84ba30p-9, + 0x1.762d84p-9, + 0x1.682100p-9, + 0x1.5a90b0p-9, + 0x1.4d78bcp-9, + 0x1.40d564p-9, + 
0x1.34a306p-9, + 0x1.28de12p-9, + 0x1.1d8318p-9, + 0x1.128ebap-9, + 0x1.07fdb4p-9, + 0x1.fb99b8p-10, + 0x1.e7f232p-10, + 0x1.d4fed8p-10, + 0x1.c2b9d0p-10, + 0x1.b11d70p-10, + 0x1.a02436p-10, + 0x1.8fc8c8p-10, + 0x1.8005f0p-10, + 0x1.70d6a4p-10, + 0x1.6235fcp-10, + 0x1.541f34p-10, + 0x1.468daep-10, + 0x1.397ceep-10, + 0x1.2ce898p-10, + 0x1.20cc76p-10, + 0x1.15246ep-10, + 0x1.09ec86p-10, + 0x1.fe41cep-11, + 0x1.e97ba4p-11, + 0x1.d57f52p-11, + 0x1.c245d4p-11, + 0x1.afc85ep-11, + 0x1.9e0058p-11, + 0x1.8ce75ep-11, + 0x1.7c7744p-11, + 0x1.6caa0ep-11, + 0x1.5d79ecp-11, + 0x1.4ee142p-11, + 0x1.40daa4p-11, + 0x1.3360ccp-11, + 0x1.266ea8p-11, + 0x1.19ff46p-11, + 0x1.0e0de8p-11, + 0x1.0295f0p-11, + 0x1.ef25d4p-12, + 0x1.da0110p-12, + 0x1.c5b542p-12, + 0x1.b23a5ap-12, + 0x1.9f8894p-12, + 0x1.8d986ap-12, + 0x1.7c629ap-12, + 0x1.6be022p-12, + 0x1.5c0a38p-12, + 0x1.4cda54p-12, + 0x1.3e4a24p-12, + 0x1.305390p-12, + 0x1.22f0b4p-12, + 0x1.161be4p-12, + 0x1.09cfa4p-12, + 0x1.fc0d56p-13, + 0x1.e577bcp-13, + 0x1.cfd4a6p-13, + 0x1.bb1a96p-13, + 0x1.a74068p-13, + 0x1.943d4ap-13, + 0x1.8208bcp-13, + 0x1.709a8ep-13, + 0x1.5feadap-13, + 0x1.4ff208p-13, + 0x1.40a8c2p-13, + 0x1.3207fcp-13, + 0x1.2408eap-13, + 0x1.16a502p-13, + 0x1.09d5f8p-13, + 0x1.fb2b7ap-14, + 0x1.e3bcf4p-14, + 0x1.cd5528p-14, + 0x1.b7e946p-14, + 0x1.a36eecp-14, + 0x1.8fdc1cp-14, + 0x1.7d2738p-14, + 0x1.6b4702p-14, + 0x1.5a329cp-14, + 0x1.49e178p-14, + 0x1.3a4b60p-14, + 0x1.2b6876p-14, + 0x1.1d3120p-14, + 0x1.0f9e1cp-14, + 0x1.02a868p-14, + 0x1.ec929ap-15, + 0x1.d4f4b4p-15, + 0x1.be6abcp-15, + 0x1.a8e8ccp-15, + 0x1.94637ep-15, + 0x1.80cfdcp-15, + 0x1.6e2368p-15, + 0x1.5c540cp-15, + 0x1.4b581cp-15, + 0x1.3b2652p-15, + 0x1.2bb5ccp-15, + 0x1.1cfe02p-15, + 0x1.0ef6c4p-15, + 0x1.019842p-15, + 0x1.e9b5e8p-16, + 0x1.d16f58p-16, + 0x1.ba4f04p-16, + 0x1.a447b8p-16, + 0x1.8f4cccp-16, + 0x1.7b5224p-16, + 0x1.684c22p-16, + 0x1.562facp-16, + 0x1.44f21ep-16, + 0x1.34894ap-16, + 0x1.24eb72p-16, + 0x1.160f44p-16, + 0x1.07ebd2p-16, + 0x1.f4f12ep-17, + 0x1.db5ad0p-17, + 0x1.c304f0p-17, + 0x1.abe09ep-17, + 0x1.95df98p-17, + 0x1.80f43ap-17, + 0x1.6d1178p-17, + 0x1.5a2ae0p-17, + 0x1.483488p-17, + 0x1.372310p-17, + 0x1.26eb9ep-17, + 0x1.1783cep-17, + 0x1.08e1bap-17, + 0x1.f5f7d8p-18, + 0x1.db92b6p-18, + 0x1.c282cep-18, + 0x1.aab7acp-18, + 0x1.94219cp-18, + 0x1.7eb1a2p-18, + 0x1.6a5972p-18, + 0x1.570b6ap-18, + 0x1.44ba86p-18, + 0x1.335a62p-18, + 0x1.22df2ap-18, + 0x1.133d96p-18, + 0x1.046aeap-18, + 0x1.ecb9d0p-19, + 0x1.d21398p-19, + 0x1.b8d094p-19, + 0x1.a0df10p-19, + 0x1.8a2e26p-19, + 0x1.74adc8p-19, + 0x1.604ea8p-19, + 0x1.4d0232p-19, + 0x1.3aba86p-19, + 0x1.296a70p-19, + 0x1.190562p-19, + 0x1.097f62p-19, + 0x1.f59a20p-20, + 0x1.d9c736p-20, + 0x1.bf716cp-20, + 0x1.a6852cp-20, + 0x1.8eefd8p-20, + 0x1.789fb8p-20, + 0x1.6383f8p-20, + 0x1.4f8c96p-20, + 0x1.3caa62p-20, + 0x1.2acee2p-20, + 0x1.19ec60p-20, + 0x1.09f5d0p-20, + 0x1.f5bd96p-21, + 0x1.d9371ep-21, + 0x1.be41dep-21, + 0x1.a4c89ep-21, + 0x1.8cb738p-21, + 0x1.75fa8ep-21, + 0x1.608078p-21, + 0x1.4c37c0p-21, + 0x1.39100ep-21, + 0x1.26f9e0p-21, + 0x1.15e682p-21, + 0x1.05c804p-21, + 0x1.ed2254p-22, + 0x1.d06ad6p-22, + 0x1.b551c8p-22, + 0x1.9bc0a0p-22, + 0x1.83a200p-22, + 0x1.6ce1aap-22, + 0x1.576c72p-22, + 0x1.43302cp-22, + 0x1.301ba2p-22, + 0x1.1e1e86p-22, + 0x1.0d2966p-22, + 0x1.fa5b50p-23, + 0x1.dc3ae4p-23, + 0x1.bfd756p-23, + 0x1.a517dap-23, + 0x1.8be4f8p-23, + 0x1.74287ep-23, + 0x1.5dcd66p-23, + 0x1.48bfd4p-23, + 0x1.34ecf8p-23, + 0x1.224310p-23, + 0x1.10b148p-23, + }, +}; diff --git 
a/contrib/arm-optimized-routines/pl/math/sv_exp10_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_exp10_1u5.c new file mode 100644 index 000000000000..519693afcab0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_exp10_1u5.c @@ -0,0 +1,122 @@ +/* + * Double-precision SVE 10^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_sve_f64.h" + +#define SpecialBound 307.0 /* floor (log10 (2^1023)). */ + +static const struct data +{ + double poly[5]; + double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound; +} data = { + /* Coefficients generated using Remez algorithm. + rel error: 0x1.9fcb9b3p-60 + abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ] + max ulp err 0.52 +0.5. */ + .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1, + 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 }, + /* 1.5*2^46+1023. This value is further explained below. */ + .shift = 0x1.800000000ffc0p+46, + .log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */ + .log2_10_hi = 0x1.34413509f79ffp-2, /* log2(10). */ + .log2_10_lo = -0x1.9dc1da994fd21p-59, + .scale_thres = 1280.0, + .special_bound = SpecialBound, +}; + +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +/* Update of both special and non-special cases, if any special case is + detected. */ +static inline svfloat64_t +special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, + const struct data *d) +{ + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x6, 0 otherwise. */ + svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ + svuint64_t b = svdup_u64_z (p_sign, SpecialOffset); + + /* Set s1 to generate overflow depending on sign of exponent n. */ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. */ + svfloat64_t s2 = svreinterpret_f64 ( + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacgt (pg, n, d->scale_thres); + + svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, y); + svfloat64_t r0 = svmul_x (pg, r2, s1); + + return svsel (p_cmp, r1, r0); +} + +/* Fast vector implementation of exp10 using FEXPA instruction. + Maximum measured error is 1.02 ulp. + SV_NAME_D1 (exp10)(-0x1.2862fec805e58p+2) got 0x1.885a89551d782p-16 + want 0x1.885a89551d781p-16. */ +svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t no_big_scale = svacle (pg, x, d->special_bound); + svbool_t special = svnot_z (pg, no_big_scale); + + /* n = round(x/(log10(2)/N)). */ + svfloat64_t shift = sv_f64 (d->shift); + svfloat64_t z = svmla_x (pg, shift, x, d->log10_2); + svfloat64_t n = svsub_x (pg, z, shift); + + /* r = x - n*log10(2)/N. */ + svfloat64_t log2_10 = svld1rq (svptrue_b64 (), &d->log2_10_hi); + svfloat64_t r = x; + r = svmls_lane (r, n, log2_10, 0); + r = svmls_lane (r, n, log2_10, 1); + + /* scale = 2^(n/N), computed using FEXPA. 
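+     (FEXPA reads bits 5:0 of its input as an index i into a hardwired table
+     of 2^(i/64) values and bits 16:6 as the biased exponent of the result;
+     see the longer description in sv_exp_1u5.c below.)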
+     FEXPA does not propagate NaNs, so
+     for consistent NaN handling we have to manually propagate them. This
+     comes at significant performance cost. */
+  svuint64_t u = svreinterpret_u64 (z);
+  svfloat64_t scale = svexpa (u);
+
+  /* Approximate exp10(r) using polynomial. */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2,
+                           sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1));
+
+  /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound
+     multiplication may overflow, so use special case routine. */
+  if (unlikely (svptest_any (pg, special)))
+    {
+      /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+	 special case function so needs to be copied.
+	 e = sign bit of u << 46. */
+      svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+      /* Copy sign to scale. */
+      scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+      return special_case (pg, scale, y, n, d);
+    }
+
+  /* No special case. */
+  return svmla_x (pg, scale, scale, y);
+}
+
+PL_SIG (SV, D, 1, exp10, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_D1 (exp10), 0.52)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 0, 307, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 307, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp10f_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_exp10f_1u5.c
new file mode 100644
index 000000000000..9ecde8f1aa52
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_exp10f_1u5.c
@@ -0,0 +1,87 @@
+/*
+ * Single-precision SVE 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "include/mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+/* For x < -SpecialBound, the result is subnormal and not handled correctly by
+   FEXPA. */
+#define SpecialBound 37.9
+
+static const struct data
+{
+  float poly[5];
+  float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
+} data = {
+  /* Coefficients generated using Remez algorithm with minimisation of relative
+     error.
+     rel error: 0x1.89dafa3p-24
+     abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
+     maxerr: 0.52 +0.5 ulp. */
+  .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
+	    0x1.12b41ap-1f },
+  /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
+  .shift = 0x1.903f8p17f,
+  .log10_2 = 0x1.a934fp+1,
+  .log2_10_hi = 0x1.344136p-2,
+  .log2_10_lo = -0x1.ec10cp-27,
+  .special_bound = SpecialBound,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+  return sv_call_f32 (exp10f, x, y, special);
+}
+
+/* Single-precision SVE exp10f routine. Implements the same algorithm
+   as AdvSIMD exp10f.
+   Worst case error is 1.02 ULPs.
+   _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
+				  want 0x1.ba5f9cp-1. */
+svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
+     with poly(r) in [1/sqrt(2), sqrt(2)] and
+     x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
+
+  /* Load some constants in quad-word chunks to minimise memory access (last
+     lane is wasted). */
+  svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
+
+  /* n = round(x/(log10(2)/N)).
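+     The rounding uses the usual shift trick: adding the large constant
+     shift leaves n in the low bits of z, and subtracting shift recovers n
+     as a float.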
*/ + svfloat32_t shift = sv_f32 (d->shift); + svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0); + svfloat32_t n = svsub_x (pg, z, shift); + + /* r = x - n*log10(2)/N. */ + svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1); + r = svmls_lane (r, n, log10_2_and_inv, 2); + + svbool_t special = svacgt (pg, x, d->special_bound); + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t poly + = svmla_x (pg, svmul_x (pg, r, d->poly[0]), + sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, svmla_x (pg, scale, scale, poly), special); + + return svmla_x (pg, scale, scale, poly); +} + +PL_SIG (SV, F, 1, exp10, -9.9, 9.9) +PL_TEST_ULP (SV_NAME_F1 (exp10), 0.52) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), 0, SpecialBound, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), SpecialBound, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp2_2u.c b/contrib/arm-optimized-routines/pl/math/sv_exp2_2u.c new file mode 100644 index 000000000000..dcbca8adddd1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_exp2_2u.c @@ -0,0 +1,107 @@ +/* + * Double-precision SVE 2^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define N (1 << V_EXP_TABLE_BITS) + +#define BigBound 1022 +#define UOFlowBound 1280 + +static const struct data +{ + double poly[4]; + double shift, big_bound, uoflow_bound; +} data = { + /* Coefficients are computed using Remez algorithm with + minimisation of the absolute error. */ + .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5, + 0x1.3b2abf5571ad8p-7 }, + .shift = 0x1.8p52 / N, + .uoflow_bound = UOFlowBound, + .big_bound = BigBound, +}; + +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +/* Update of both special and non-special cases, if any special case is + detected. */ +static inline svfloat64_t +special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, + const struct data *d) +{ + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x6, 0 otherwise. */ + svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ + svuint64_t b = svdup_u64_z (p_sign, SpecialOffset); + + /* Set s1 to generate overflow depending on sign of exponent n. */ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. */ + svfloat64_t s2 = svreinterpret_f64 ( + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); + + svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, y); + svfloat64_t r0 = svmul_x (pg, r2, s1); + + return svsel (p_cmp, r1, r0); +} + +/* Fast vector implementation of exp2. + Maximum measured error is 1.65 ulp. + _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 + want 0x1.f8db0d4df721dp-1. 
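+   Unlike the FEXPA-based exp and exp10 routines in this series, this
+   routine derives scale by gathering from the shared __v_exp_data table.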
*/ +svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t no_big_scale = svacle (pg, x, d->big_bound); + svbool_t special = svnot_z (pg, no_big_scale); + + /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */ + svfloat64_t shift = sv_f64 (d->shift); + svfloat64_t kd = svadd_x (pg, x, shift); + svuint64_t ki = svreinterpret_u64 (kd); + /* kd = k/N. */ + kd = svsub_x (pg, kd, shift); + svfloat64_t r = svsub_x (pg, x, kd); + + /* scale ~= 2^(k/N). */ + svuint64_t idx = svand_x (pg, ki, N - 1); + svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); + svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); + + /* Approximate exp2(r) using polynomial. */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly); + svfloat64_t y = svmul_x (pg, r, p); + + /* Assemble exp2(x) = exp2(r) * scale. */ + if (unlikely (svptest_any (pg, special))) + return special_case (pg, scale, y, kd, d); + return svmla_x (pg, scale, scale, y); +} + +PL_SIG (SV, D, 1, exp2, -9.9, 9.9) +PL_TEST_ULP (SV_NAME_D1 (exp2), 1.15) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), 0, BigBound, 1000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), BigBound, UOFlowBound, 100000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), UOFlowBound, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp2f_1u6.c b/contrib/arm-optimized-routines/pl/math/sv_exp2f_1u6.c new file mode 100644 index 000000000000..9698ff6f0682 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_exp2f_1u6.c @@ -0,0 +1,80 @@ +/* + * Single-precision SVE 2^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float poly[5]; + float shift, thres; +} data = { + /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for + compatibility with polynomial helpers. */ + .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f, + 0x1.59977ap-10f }, + /* 1.5*2^17 + 127. */ + .shift = 0x1.903f8p17f, + /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled + correctly by FEXPA. */ + .thres = 0x1.5d5e2ap+6f, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (exp2f, x, y, special); +} + +/* Single-precision SVE exp2f routine. Implements the same algorithm + as AdvSIMD exp2f. + Worst case error is 1.04 ULPs. + SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0 + want 0x1.ba7ebp+0. */ +svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + svfloat32_t shift = sv_f32 (d->shift); + svfloat32_t z = svadd_x (pg, x, shift); + svfloat32_t n = svsub_x (pg, z, shift); + svfloat32_t r = svsub_x (pg, x, n); + + svbool_t special = svacgt (pg, x, d->thres); + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* Polynomial evaluation: poly(r) ~ exp2(r)-1. + Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for + coefficients 1 to 4, and apply most significant coefficient directly. 
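+     In symbols: poly(r) = C0 * r + r^2 * P(r), with P the pairwise
+     evaluation of coefficients 1 to 4.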
*/ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1); + svfloat32_t p0 = svmul_x (pg, r, d->poly[0]); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, svmla_x (pg, scale, scale, poly), special); + + return svmla_x (pg, scale, scale, poly); +} + +PL_SIG (SV, F, 1, exp2, -9.9, 9.9) +PL_TEST_ULP (SV_NAME_F1 (exp2), 0.55) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, 1, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 1, Thres, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, inf, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, -0x1p-23, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p-23, -1, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, -0x1p23, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p23, -inf, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, ScaleThres, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -1, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, ScaleThres, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_exp_1u5.c new file mode 100644 index 000000000000..c187def9e625 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_exp_1u5.c @@ -0,0 +1,137 @@ +/* + * Double-precision vector e^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + double poly[4]; + double ln2_hi, ln2_lo, inv_ln2, shift, thres; +} data = { + .poly = { /* ulp error: 0.53. */ + 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5, + 0x1.1111266d28935p-7 }, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* 1/ln2. */ + .inv_ln2 = 0x1.71547652b82fep+0, + /* 1.5*2^46+1023. This value is further explained below. */ + .shift = 0x1.800000000ffc0p+46, + .thres = 704.0, +}; + +#define C(i) sv_f64 (d->poly[i]) +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +/* Update of both special and non-special cases, if any special case is + detected. */ +static inline svfloat64_t +special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n) +{ + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x6, 0 otherwise. */ + svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ + svuint64_t b + = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ + + /* Set s1 to generate overflow depending on sign of exponent n. */ + svfloat64_t s1 = svreinterpret_f64 ( + svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */ + /* Offset s to avoid overflow in final result if n is below threshold. */ + svfloat64_t s2 = svreinterpret_f64 ( + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), + b)); /* as_u64 (s) - 0x3010...0 + b. */ + + /* |n| > 1280 => 2^(n) overflows. 
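+     (Finite double exponents lie in [-1074, 1023], so for |n| > 1280 even
+     the s1*s2 split above cannot keep the result representable.)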
*/ + svbool_t p_cmp = svacgt (pg, n, 1280.0); + + svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, y); + svfloat64_t r0 = svmul_x (pg, r2, s1); + + return svsel (p_cmp, r1, r0); +} + +/* SVE exp algorithm. Maximum measured error is 1.01ulps: + SV_NAME_D1 (exp)(0x1.4619d7b04da41p+6) got 0x1.885d9acc41da7p+117 + want 0x1.885d9acc41da6p+117. */ +svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svbool_t special = svacgt (pg, x, d->thres); + + /* Use a modifed version of the shift used for flooring, such that x/ln2 is + rounded to a multiple of 2^-6=1/64, shift = 1.5 * 2^52 * 2^-6 = 1.5 * + 2^46. + + n is not an integer but can be written as n = m + i/64, with i and m + integer, 0 <= i < 64 and m <= n. + + Bits 5:0 of z will be null every time x/ln2 reaches a new integer value + (n=m, i=0), and is incremented every time z (or n) is incremented by 1/64. + FEXPA expects i in bits 5:0 of the input so it can be used as index into + FEXPA hardwired table T[i] = 2^(i/64) for i = 0:63, that will in turn + populate the mantissa of the output. Therefore, we use u=asuint(z) as + input to FEXPA. + + We add 1023 to the modified shift value in order to set bits 16:6 of u to + 1, such that once these bits are moved to the exponent of the output of + FEXPA, we get the exponent of 2^n right, i.e. we get 2^m. */ + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); + + /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */ + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); + svfloat64_t r = svmls_lane (x, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t p01 = svmla_x (pg, C (0), C (1), r); + svfloat64_t p23 = svmla_x (pg, C (2), C (3), r); + svfloat64_t p04 = svmla_x (pg, p01, p23, r2); + svfloat64_t y = svmla_x (pg, r, p04, r2); + + /* s = 2^n, computed using FEXPA. FEXPA does not propagate NaNs, so for + consistent NaN handling we have to manually propagate them. This comes at + significant performance cost. */ + svfloat64_t s = svexpa (u); + + /* Assemble result as exp(x) = 2^n * exp(r). If |x| > Thresh the + multiplication may overflow, so use special case routine. */ + + if (unlikely (svptest_any (pg, special))) + { + /* FEXPA zeroes the sign bit, however the sign is meaningful to the + special case function so needs to be copied. + e = sign bit of u << 46. */ + svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000); + /* Copy sign to s. */ + s = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (s))); + return special_case (pg, s, y, n); + } + + /* No special case. */ + return svmla_x (pg, s, s, y); +} + +PL_SIG (SV, D, 1, exp, -9.9, 9.9) +PL_TEST_ULP (SV_NAME_D1 (exp), 1.46) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0, 0x1p-23, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p-23, 1, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 1, 0x1p23, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p23, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h b/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h deleted file mode 100644 index 9b739da9d82a..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Double-precision SVE e^(x+tail) function. - * - * Copyright (c) 2021-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef SV_EXP_TAIL_H -#define SV_EXP_TAIL_H - -#include "sv_math.h" -#if SV_SUPPORTED - -#include "v_exp_tail.h" - -#define C1 sv_f64 (C1_scal) -#define C2 sv_f64 (C2_scal) -#define C3 sv_f64 (C3_scal) -#define MinusLn2hi (-Ln2hi_scal) -#define MinusLn2lo (-Ln2lo_scal) - -#define N (1 << V_EXP_TAIL_TABLE_BITS) -#define Tab __v_exp_tail_data -#define IndexMask (N - 1) -#define Shift sv_f64 (0x1.8p+52) -#define Thres 704.0 - -static inline sv_f64_t -sv_exp_tail_special_case (svbool_t pg, sv_f64_t s, sv_f64_t y, sv_f64_t n) -{ - sv_f64_t absn = svabs_f64_x (pg, n); - - /* 2^(n/N) may overflow, break it up into s1*s2. */ - sv_u64_t b = svsel_u64 (svcmple_n_f64 (pg, n, 0), sv_u64 (0x6000000000000000), - sv_u64 (0)); - sv_f64_t s1 = sv_as_f64_u64 (svsubr_n_u64_x (pg, b, 0x7000000000000000)); - sv_f64_t s2 = sv_as_f64_u64 ( - svadd_u64_x (pg, svsub_n_u64_x (pg, sv_as_u64_f64 (s), 0x3010000000000000), - b)); - - svbool_t cmp = svcmpgt_n_f64 (pg, absn, 1280.0 * N); - sv_f64_t r1 = svmul_f64_x (pg, s1, s1); - sv_f64_t r0 = svmul_f64_x (pg, sv_fma_f64_x (pg, y, s2, s2), s1); - return svsel_f64 (cmp, r1, r0); -} - -static inline sv_f64_t -sv_exp_tail (const svbool_t pg, sv_f64_t x, sv_f64_t xtail) -{ - /* Calculate exp(x + xtail). */ - sv_f64_t z = sv_fma_n_f64_x (pg, InvLn2_scal, x, Shift); - sv_f64_t n = svsub_f64_x (pg, z, Shift); - - sv_f64_t r = sv_fma_n_f64_x (pg, MinusLn2hi, n, x); - r = sv_fma_n_f64_x (pg, MinusLn2lo, n, r); - - sv_u64_t u = sv_as_u64_f64 (z); - sv_u64_t e = svlsl_n_u64_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); - sv_u64_t i = svand_n_u64_x (pg, u, IndexMask); - - sv_f64_t y = sv_fma_f64_x (pg, C3, r, C2); - y = sv_fma_f64_x (pg, y, r, C1); - y = sv_fma_f64_x (pg, y, r, sv_f64 (1.0)); - y = sv_fma_f64_x (pg, y, r, xtail); - - /* s = 2^(n/N). */ - u = sv_lookup_u64_x (pg, Tab, i); - sv_f64_t s = sv_as_f64_u64 (svadd_u64_x (pg, u, e)); - - svbool_t cmp = svcmpgt_n_f64 (pg, svabs_f64_x (pg, x), Thres); - if (unlikely (svptest_any (pg, cmp))) - { - return sv_exp_tail_special_case (pg, s, y, n); - } - return sv_fma_f64_x (pg, y, s, s); -} - -#endif -#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c b/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c index 87fbe45df5fd..93d705ce420a 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c +++ b/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c @@ -9,148 +9,78 @@ #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED - -#define C(i) __sv_expf_poly[i] - -#define InvLn2 (0x1.715476p+0f) -#define Ln2hi (0x1.62e4p-1f) -#define Ln2lo (0x1.7f7d1cp-20f) - -#if SV_EXPF_USE_FEXPA - -#define Shift (0x1.903f8p17f) /* 1.5*2^17 + 127. */ -#define Thres \ - (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal \ - and not handled correctly by FEXPA. */ - -static NOINLINE sv_f32_t -special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +static const struct data +{ + float poly[5]; + float inv_ln2, ln2_hi, ln2_lo, shift, thres; +} data = { + /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for + compatibility with polynomial helpers. */ + .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, + 0x1.0e4020p-7f }, + .inv_ln2 = 0x1.715476p+0f, + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, + /* 1.5*2^17 + 127. */ + .shift = 0x1.903f8p17f, + /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled + correctly by FEXPA. 
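+     (0x1.5d5e2ap+6 is approximately 87.34 = -ln(0x1p-126); below -Thres the
+     result of expf lies in the subnormal range.)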
*/ + .thres = 0x1.5d5e2ap+6f, +}; + +#define C(i) sv_f32 (d->poly[i]) +#define ExponentBias 0x3f800000 + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) { - /* The special-case handler from the Neon routine does not handle subnormals - in a way that is compatible with FEXPA. For the FEXPA variant we just fall - back to scalar expf. */ return sv_call_f32 (expf, x, y, special); } -#else - -#define Shift (0x1.8p23f) /* 1.5 * 2^23. */ -#define Thres (126.0f) - -/* Special-case handler adapted from Neon variant. Uses s, y and n to produce - the final result (normal cases included). It performs an update of all lanes! - Therefore: - - all previous computation need to be done on all lanes indicated by input - pg - - we cannot simply apply the special case to the special-case-activated - lanes. Besides it is likely that this would not increase performance (no - scatter/gather). */ -static inline sv_f32_t -specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, - svbool_t p_cmp1, sv_f32_t scale) +/* Optimised single-precision SVE exp function. + Worst-case error is 1.04 ulp: + SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4 + want 0x1.ba74bap+4. */ +svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) { - /* s=2^(n/N) may overflow, break it up into s=s1*s2, - such that exp = s + s*y can be computed as s1*(s2+s2*y) - and s1*s1 overflows only if n>0. */ + const struct data *d = ptr_barrier (&data); - /* If n<=0 then set b to 0x820...0, 0 otherwise. */ - svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0. */ - sv_u32_t b - = svdup_n_u32_z (p_sign, 0x82000000); /* Inactive lanes set to 0. */ - - /* Set s1 to generate overflow depending on sign of exponent n. */ - sv_f32_t s1 - = sv_as_f32_u32 (svadd_n_u32_x (pg, b, 0x7f000000)); /* b + 0x7f000000. */ - /* Offset s to avoid overflow in final result if n is below threshold. */ - sv_f32_t s2 = sv_as_f32_u32 ( - svsub_u32_x (pg, e, b)); /* as_u32 (s) - 0x3010...0 + b. */ - - /* |n| > 192 => 2^(n/N) overflows. */ - svbool_t p_cmp2 = svacgt_n_f32 (pg, n, 192.0f); - - sv_f32_t r2 = svmul_f32_x (pg, s1, s1); - sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2); - r1 = svmul_f32_x (pg, r1, s1); - sv_f32_t r0 = sv_fma_f32_x (pg, poly, scale, scale); - - /* Apply condition 1 then 2. - Returns r2 if cond2 is true, otherwise - if cond1 is true then return r1, otherwise return r0. */ - sv_f32_t r = svsel_f32 (p_cmp1, r1, r0); - - return svsel_f32 (p_cmp2, r2, r); -} - -#endif - -/* Optimised single-precision SVE exp function. By default this is an SVE port - of the Neon algorithm from math/. Alternatively, enable a modification of - that algorithm that looks up scale using SVE FEXPA instruction with - SV_EXPF_USE_FEXPA. - - Worst-case error of the default algorithm is 1.95 ulp: - __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8 - want 0x1.6a023p-8. - - Worst-case error when using FEXPA is 1.04 ulp: - __sv_expf(0x1.a8eda4p+1) got 0x1.ba74bcp+4 - want 0x1.ba74bap+4. */ -sv_f32_t -__sv_expf_x (sv_f32_t x, const svbool_t pg) -{ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + /* Load some constants in quad-word chunks to minimise memory access (last + lane is wasted). */ + svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2); + /* n = round(x/(ln2/N)). 
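+     (Same shift trick as the exp10f variant above: adding the large
+     constant rounds x/ln2 to a multiple of 1/64 held in the low bits of
+     z.) */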
*/ - sv_f32_t z = sv_fma_n_f32_x (pg, InvLn2, x, sv_f32 (Shift)); - sv_f32_t n = svsub_n_f32_x (pg, z, Shift); + svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0); + svfloat32_t n = svsub_x (pg, z, d->shift); /* r = x - n*ln2/N. */ - sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x); - r = sv_fma_n_f32_x (pg, -Ln2lo, n, r); + svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1); + r = svmls_lane (r, n, invln2_and_ln2, 2); -/* scale = 2^(n/N). */ -#if SV_EXPF_USE_FEXPA - /* NaNs also need special handling with FEXPA. */ - svbool_t is_special_case - = svorr_b_z (pg, svacgt_n_f32 (pg, x, Thres), svcmpne_f32 (pg, x, x)); - sv_f32_t scale = svexpa_f32 (sv_as_u32_f32 (z)); -#else - sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23); - svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres); - sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000)); -#endif + /* scale = 2^(n/N). */ + svbool_t is_special_case = svacgt (pg, x, d->thres); + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - sv_f32_t r2 = svmul_f32_x (pg, r, r); - sv_f32_t p = sv_fma_n_f32_x (pg, C (0), r, sv_f32 (C (1))); - sv_f32_t q = sv_fma_n_f32_x (pg, C (2), r, sv_f32 (C (3))); - q = sv_fma_f32_x (pg, p, r2, q); - p = svmul_n_f32_x (pg, r, C (4)); - sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p); + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ + svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); + svfloat32_t p34 = svmla_x (pg, C (3), C (4), r); + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_x (pg, r, C (0)); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); if (unlikely (svptest_any (pg, is_special_case))) -#if SV_EXPF_USE_FEXPA - return special_case (x, sv_fma_f32_x (pg, poly, scale, scale), - is_special_case); -#else - return specialcase (pg, poly, n, e, is_special_case, scale); -#endif + return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case); - return sv_fma_f32_x (pg, poly, scale, scale); + return svmla_x (pg, scale, scale, poly); } -PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf) - PL_SIG (SV, F, 1, exp, -9.9, 9.9) -PL_TEST_ULP (__sv_expf, 1.46) -PL_TEST_INTERVAL (__sv_expf, 0, 0x1p-23, 40000) -PL_TEST_INTERVAL (__sv_expf, 0x1p-23, 1, 50000) -PL_TEST_INTERVAL (__sv_expf, 1, 0x1p23, 50000) -PL_TEST_INTERVAL (__sv_expf, 0x1p23, inf, 50000) -PL_TEST_INTERVAL (__sv_expf, -0, -0x1p-23, 40000) -PL_TEST_INTERVAL (__sv_expf, -0x1p-23, -1, 50000) -PL_TEST_INTERVAL (__sv_expf, -1, -0x1p23, 50000) -PL_TEST_INTERVAL (__sv_expf, -0x1p23, -inf, 50000) -#endif // SV_SUPPORTED +PL_TEST_ULP (SV_NAME_F1 (exp), 0.55) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0, 0x1p-23, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p-23, 1, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 1, 0x1p23, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p23, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_data.c b/contrib/arm-optimized-routines/pl/math/sv_expf_data.c deleted file mode 100644 index 6875adf857b6..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_expf_data.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Coefficients for single-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Coefficients copied from the polynomial in math/v_expf.c. 
*/ -const float __sv_expf_poly[] = {0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f, - 0x1.fffdb6p-2f, 0x1.ffffecp-1f}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_inline.h b/contrib/arm-optimized-routines/pl/math/sv_expf_inline.h new file mode 100644 index 000000000000..0ef4e0fda946 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expf_inline.h @@ -0,0 +1,66 @@ +/* + * SVE helper for single-precision routines which calculate exp(x) and do + * not need special-case handling + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_SV_EXPF_INLINE_H +#define PL_MATH_SV_EXPF_INLINE_H + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +struct sv_expf_data +{ + float poly[5]; + float inv_ln2, ln2_hi, ln2_lo, shift; +}; + +/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for + compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ +#define SV_EXPF_DATA \ + { \ + .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \ + 0x1.0e4020p-7f }, \ + \ + .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ + .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ + } + +#define C(i) sv_f32 (d->poly[i]) + +static inline svfloat32_t +expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) +{ + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + + /* Load some constants in quad-word chunks to minimise memory access. */ + svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]); + + /* n = round(x/(ln2/N)). */ + svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1); + svfloat32_t n = svsub_x (pg, z, d->shift); + + /* r = x - n*ln2/N. */ + svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2); + r = svmls_lane (r, n, c4_invln2_and_ln2, 3); + + /* scale = 2^(n/N). */ + svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z)); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ + svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); + svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0); + svfloat32_t r2 = svmul_f32_x (pg, r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_f32_x (pg, r, C (0)); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); +} + +#endif // PL_MATH_SV_EXPF_INLINE_H \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/sv_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_expm1_2u5.c new file mode 100644 index 000000000000..82a31f6d9c0e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expm1_2u5.c @@ -0,0 +1,95 @@ +/* + * Double-precision vector exp(x) - 1 function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define SpecialBound 0x1.62b7d369a5aa9p+9 +#define ExponentBias 0x3ff0000000000000 + +static const struct data +{ + double poly[11]; + double shift, inv_ln2, special_bound; + /* To be loaded in one quad-word. */ + double ln2_hi, ln2_lo; +} data = { + /* Generated using fpminimax. 
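+     The leading coefficients approach the Taylor series of
+     (e^x - 1 - x) / x^2, i.e. 1/2, 1/6, 1/24, ...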
*/ + .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, + 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13, + 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, + + .special_bound = SpecialBound, + .inv_ln2 = 0x1.71547652b82fep0, + .ln2_hi = 0x1.62e42fefa39efp-1, + .ln2_lo = 0x1.abc9e3b39803fp-56, + .shift = 0x1.8p52, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t pg) +{ + return sv_call_f64 (expm1, x, y, pg); +} + +/* Double-precision vector exp(x) - 1 function. + The maximum error observed error is 2.18 ULP: + _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 + want 0x1.a8b9ea8d66e2p-2. */ +svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Large, Nan/Inf. */ + svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound)); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + svfloat64_t shift = sv_f64 (d->shift); + svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift); + svint64_t i = svcvt_s64_x (pg, n); + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); + svfloat64_t f = svmls_lane (x, n, ln2, 0); + f = svmls_lane (f, n, ln2, 1); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + svfloat64_t f2 = svmul_x (pg, f, f); + svfloat64_t f4 = svmul_x (pg, f2, f2); + svfloat64_t f8 = svmul_x (pg, f4, f4); + svfloat64_t p + = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias); + svfloat64_t t = svreinterpret_f64 (u); + + /* expm1(x) ~= p * t + (t - 1). */ + svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, y, special); + + return y; +} + +PL_SIG (SV, D, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (SV_NAME_D1 (expm1), 1.68) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0, 0x1p-23, 1000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0x1p-23, SpecialBound, 200000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), SpecialBound, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/sv_expm1f_1u6.c new file mode 100644 index 000000000000..0ec7c00f5300 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expm1f_1u6.c @@ -0,0 +1,93 @@ +/* + * Single-precision vector exp(x) - 1 function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Largest value of x for which expm1(x) should round to -1. */ +#define SpecialBound 0x1.5ebc4p+6f + +static const struct data +{ + /* These 4 are grouped together so they can be loaded as one quadword, then + used with _lane forms of svmla/svmls. */ + float c2, c4, ln2_hi, ln2_lo; + float c0, c1, c3, inv_ln2, special_bound, shift; +} data = { + /* Generated using fpminimax. 
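+     As in the double-precision variant, c0-c4 track the Taylor
+     coefficients of (e^x - 1 - x) / x^2: c0 sits just below 1/2 and c1
+     just below 1/6.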
*/ + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, + .c4 = 0x1.6b55a2p-10, + + .special_bound = SpecialBound, .shift = 0x1.8p23f, + .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, +}; + +#define C(i) sv_f32 (d->c##i) + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t pg) +{ + return sv_call_f32 (expm1f, x, x, pg); +} + +/* Single-precision SVE exp(x) - 1. Maximum error is 1.52 ULP: + _ZGVsMxv_expm1f(0x1.8f4ebcp-2) got 0x1.e859dp-2 + want 0x1.e859d4p-2. */ +svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Large, NaN/Inf. */ + svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound)); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, pg); + + /* This vector is reliant on layout of data - it contains constants + that can be used with _lane forms of svmla/svmls. Values are: + [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */ + svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); + j = svsub_x (pg, j, d->shift); + svint32_t i = svcvt_s32_x (pg, j); + + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); + f = svmls_lane (f, j, lane_constants, 3); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (pg, f, f); + svfloat32_t p = svmla_x (pg, p12, f2, p34); + p = svmla_x (pg, C (0), f, p); + p = svmla_x (pg, f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + svfloat32_t t = svreinterpret_f32 ( + svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000)); + return svmla_x (pg, svsub_x (pg, t, 1), p, t); +} + +PL_SIG (SV, F, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (SV_NAME_F1 (expm1), 1.02) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), 0, SpecialBound, 100000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), SpecialBound, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_expm1f_inline.h b/contrib/arm-optimized-routines/pl/math/sv_expm1f_inline.h new file mode 100644 index 000000000000..a6e2050ff4a6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expm1f_inline.h @@ -0,0 +1,73 @@ +/* + * SVE helper for single-precision routines which calculate exp(x) - 1 and do + * not need special-case handling + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_SV_EXPM1F_INLINE_H +#define PL_MATH_SV_EXPM1F_INLINE_H + +#include "sv_math.h" + +struct sv_expm1f_data +{ + /* These 4 are grouped together so they can be loaded as one quadword, then + used with _lane forms of svmla/svmls. */ + float32_t c2, c4, ln2_hi, ln2_lo; + float32_t c0, c1, c3, inv_ln2, shift; +}; + +/* Coefficients generated using fpminimax. 
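+   These are the same values as in sv_expm1f_1u6.c, repackaged as a macro
+   initialiser for routines that inline expm1f.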
*/ +#define SV_EXPM1F_DATA \ + { \ + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \ + .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ + \ + .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ + .ln2_lo = 0x1.7f7d1cp-20f, \ + } + +#define C(i) sv_f32 (d->c##i) + +static inline svfloat32_t +expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) +{ + /* This vector is reliant on layout of data - it contains constants + that can be used with _lane forms of svmla/svmls. Values are: + [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */ + svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); + j = svsub_x (pg, j, d->shift); + svint32_t i = svcvt_s32_x (pg, j); + + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); + f = svmls_lane (f, j, lane_constants, 3); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (pg, f, f); + svfloat32_t p = svmla_x (pg, p12, f2, p34); + p = svmla_x (pg, C (0), f, p); + p = svmla_x (pg, f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + svfloat32_t t = svscale_x (pg, sv_f32 (1), i); + return svmla_x (pg, svsub_x (pg, t, 1), p, t); +} + +#endif // PL_MATH_SV_EXPM1F_INLINE_H \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/sv_hypot_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_hypot_1u5.c new file mode 100644 index 000000000000..cf1590e4b9ab --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_hypot_1u5.c @@ -0,0 +1,51 @@ +/* + * Double-precision SVE hypot(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + uint64_t tiny_bound, thres; +} data = { + .tiny_bound = 0x0c80000000000000, /* asuint (0x1p-102). */ + .thres = 0x7300000000000000, /* asuint (inf) - tiny_bound. */ +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t sqsum, svfloat64_t x, svfloat64_t y, svbool_t pg, + svbool_t special) +{ + return sv_call2_f64 (hypot, x, y, svsqrt_x (pg, sqsum), special); +} + +/* SVE implementation of double-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVsMxvv_hypot (-0x1.6a22d0412cdd3p+352, 0x1.d3d89bd66fb1ap+330) + got 0x1.6a22d0412cfp+352 + want 0x1.6a22d0412cf01p+352. 
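Note the difference from sv_expm1f_1u6.c above: the inline helper builds t = 2^i with svscale rather than by adding shifted exponent bits to the pattern of 1.0f. A scalar analogue of that final step, assuming p and i were computed as in the helper:

#include <math.h>

static float
expm1f_finish (float p, int i)
{
  float t = scalbnf (1.0f, i); /* like svscale_x (pg, sv_f32 (1), i) */
  return p * t + (t - 1.0f);   /* expm1(x) ~= 2^i * (p + 1) - 1 */
}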
*/ +svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y); + + svbool_t special = svcmpge ( + pg, svsub_x (pg, svreinterpret_u64 (sqsum), d->tiny_bound), d->thres); + + if (unlikely (svptest_any (pg, special))) + return special_case (sqsum, x, y, pg, special); + return svsqrt_x (pg, sqsum); +} + +PL_SIG (SV, D, 2, hypot, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_D2 (hypot), 0.71) +PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_hypotf_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_hypotf_1u5.c new file mode 100644 index 000000000000..f428832b3dbc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_hypotf_1u5.c @@ -0,0 +1,45 @@ +/* + * Single-precision SVE hypot(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define TinyBound 0x0c800000 /* asuint (0x1p-102). */ +#define Thres 0x73000000 /* 0x70000000 - TinyBound. */ + +static svfloat32_t NOINLINE +special_case (svfloat32_t sqsum, svfloat32_t x, svfloat32_t y, svbool_t pg, + svbool_t special) +{ + return sv_call2_f32 (hypotf, x, y, svsqrt_x (pg, sqsum), special); +} + +/* SVE implementation of single-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVsMxvv_hypotf (0x1.6a213cp-19, -0x1.32b982p-26) got 0x1.6a2346p-19 + want 0x1.6a2344p-19. */ +svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y, + const svbool_t pg) +{ + svfloat32_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y); + + svbool_t special = svcmpge ( + pg, svsub_x (pg, svreinterpret_u32 (sqsum), TinyBound), Thres); + + if (unlikely (svptest_any (pg, special))) + return special_case (sqsum, x, y, pg, special); + + return svsqrt_x (pg, sqsum); +} + +PL_SIG (SV, F, 2, hypot, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_F2 (hypot), 0.71) +PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) +PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c index 884e2011d2f8..f55e068fd442 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c @@ -6,84 +6,70 @@ */ #include "sv_math.h" -#include "math_config.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f64.h" -#if SV_SUPPORTED - -#define OFF 0x3fe6900900000000 +#define Min 0x0010000000000000 +#define Max 0x7ff0000000000000 +#define Thres 0x7fe0000000000000 /* Max - Min. */ +#define Off 0x3fe6900900000000 #define N (1 << V_LOG10_TABLE_BITS) -#define A(i) __v_log10_data.poly[i] - -static inline sv_f64_t -specialcase (sv_f64_t x, sv_f64_t y, svbool_t special) +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) { return sv_call_f64 (log10, x, y, special); } -/* SVE log10 algorithm. Maximum measured error is 2.46 ulps. 
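Both hypot variants flag tiny and non-finite squared sums with one biased unsigned compare: subtracting TinyBound makes every too-small bit pattern wrap around to a huge unsigned value, so a single >= test covers both ends. A scalar model using the single-precision constants from the patch (memcpy stands in for asuint):

#include <stdint.h>
#include <string.h>

static int
hypotf_is_special (float sqsum)
{
  const uint32_t tiny_bound = 0x0c800000; /* asuint (0x1p-102) */
  const uint32_t thres = 0x73000000;	  /* asuint (inf) - tiny_bound */
  uint32_t u;
  memcpy (&u, &sqsum, sizeof u);
  /* Values below tiny_bound wrap to huge, so one compare catches both
     potential underflow in the sum of squares and inf/nan inputs. */
  return (u - tiny_bound) >= thres;
}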
- __sv_log10(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 - want 0x1.fffbdf6eaa667p-6. */ -sv_f64_t -__sv_log10_x (sv_f64_t x, const svbool_t pg) +/* SVE log10 algorithm. + Maximum measured error is 2.46 ulps. + SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 + want 0x1.fffbdf6eaa667p-6. */ +svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) { - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); - svbool_t is_special_case - = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x07ff0 - 0x0010); - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); - sv_u64_t i - = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG10_TABLE_BITS), N); - sv_f64_t k - = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); - sv_f64_t z = sv_as_f64_u64 ( - svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + svfloat64_t z = svreinterpret_f64 ( + svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52))); /* log(x) = k*log(2) + log(c) + log(z/c). */ - - sv_u64_t idx = svmul_n_u64_x (pg, i, 2); - sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].invc, idx); - sv_f64_t logc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].log10c, idx); + svfloat64_t invc = svld1_gather_index (pg, &__v_log10_data.table[0].invc, i); + svfloat64_t logc + = svld1_gather_index (pg, &__v_log10_data.table[0].log10c, i); /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1): r = z/c - 1 (we look up precomputed 1/c) log(z/c) ~= P(r). */ - sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + svfloat64_t r = svmad_x (pg, invc, z, -1.0); /* hi = log(c) + k*log(2). */ - sv_f64_t w = sv_fma_n_f64_x (pg, __v_log10_data.invln10, r, logc); - sv_f64_t hi = sv_fma_n_f64_x (pg, __v_log10_data.log10_2, k, w); + svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10); + svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
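The shift by 51 - V_LOG10_TABLE_BITS (rather than 52) deserves a gloss: the new table interleaves { invc, log10c } pairs, so the gather wants the logical index pre-doubled. A scalar sketch of the index computation, with b standing for V_LOG10_TABLE_BITS; the identity with 2 * ((tmp >> (52 - b)) & (n - 1)) holds because masking with (n - 1) << 1 clears the extra low bit:

#include <stdint.h>

static uint64_t
log10_table_index (uint64_t ix, unsigned b)
{
  const uint64_t off = 0x3fe6900900000000;
  uint64_t n = 1ULL << b;
  uint64_t tmp = ix - off;
  /* Equals twice the plain table index, matching the pair layout. */
  return (tmp >> (51 - b)) & ((n - 1) << 1);
}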
*/ - sv_f64_t r2 = svmul_f64_x (pg, r, r); - sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); - sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); - y = sv_fma_n_f64_x (pg, A (4), r2, y); - y = sv_fma_f64_x (pg, y, r2, p); - y = sv_fma_f64_x (pg, y, r2, hi); + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly); - if (unlikely (svptest_any (pg, is_special_case))) - { - return specialcase (x, y, is_special_case); - } - return y; + if (unlikely (svptest_any (pg, special))) + return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), + special); + return svmla_x (pg, hi, r2, y); } -PL_ALIAS (__sv_log10_x, _ZGVsMxv_log10) - PL_SIG (SV, D, 1, log10, 0.01, 11.1) -PL_TEST_ULP (__sv_log10, 1.97) -PL_TEST_INTERVAL (__sv_log10, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_log10, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log10, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log10, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log10, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log10, 100, inf, 50000) -#endif +PL_TEST_ULP (SV_NAME_D1 (log10), 1.97) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c index e7b1e9801fa9..a685b23e5de5 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c @@ -9,80 +9,85 @@ #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED +static const struct data +{ + float poly_0246[4]; + float poly_1357[4]; + float ln2, inv_ln10; +} data = { + .poly_1357 = { + /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs + 1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane + variant of MLA intrinsic. */ + 0x1.2879c8p-3f, 0x1.6408f8p-4f, 0x1.f0e514p-5f, 0x1.f5f76ap-5f + }, + .poly_0246 = { -0x1.bcb79cp-3f, -0x1.bcd472p-4f, -0x1.246f8p-4f, + -0x1.0fc92cp-4f }, + .ln2 = 0x1.62e43p-1f, + .inv_ln10 = 0x1.bcb7b2p-2f, +}; -#define SpecialCaseMin 0x00800000 -#define SpecialCaseMax 0x7f800000 +#define Min 0x00800000 +#define Max 0x7f800000 +#define Thres 0x7f000000 /* Max - Min. */ #define Offset 0x3f2aaaab /* 0.666667. */ -#define Mask 0x007fffff -#define Ln2 0x1.62e43p-1f /* 0x3f317218. */ -#define InvLn10 0x1.bcb7b2p-2f +#define MantissaMask 0x007fffff -#define P(i) __v_log10f_poly[i] - -static NOINLINE sv_f32_t -special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) { return sv_call_f32 (log10f, x, y, special); } /* Optimised implementation of SVE log10f using the same algorithm and - polynomial as v_log10f. Maximum error is 3.31ulps: - __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 - want 0x1.ffe2f4p-4. */ -sv_f32_t -__sv_log10f_x (sv_f32_t x, const svbool_t pg) + polynomial as AdvSIMD log10f. + Maximum error is 3.31ulps: + SV_NAME_F1 (log10)(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. 
*/ +svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) { - sv_u32_t ix = sv_as_u32_f32 (x); - svbool_t special_cases - = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin), - SpecialCaseMax - SpecialCaseMin); + const struct data *d = ptr_barrier (&data); + svuint32_t ix = svreinterpret_u32 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - ix = svsub_n_u32_x (pg, ix, Offset); - sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix), - 23)); /* signextend. */ - ix = svand_n_u32_x (pg, ix, Mask); - ix = svadd_n_u32_x (pg, ix, Offset); - sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f); + ix = svsub_x (pg, ix, Offset); + svfloat32_t n = svcvt_f32_x ( + pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */ + ix = svand_x (pg, ix, MantissaMask); + ix = svadd_x (pg, ix, Offset); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f); /* y = log10(1+r) + n*log10(2) log10(1+r) ~ r * InvLn(10) + P(r) where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for - log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3) + log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t r4 = svmul_x (pg, r2, r2); + svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); + svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0); + svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1); + svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_0246[2]), r, p_1357, 2); + svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_0246[3]), r, p_1357, 3); + svfloat32_t q_47 = svmla_x (pg, q_45, r2, q_67); + svfloat32_t q_03 = svmla_x (pg, q_01, r2, q_23); + svfloat32_t y = svmla_x (pg, q_03, r4, q_47); - P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67))) - and Qij = Pi + r * Pj. */ - sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); - sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); - sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); - sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); + /* Using hi = Log10(2)*n + r*InvLn(10) is faster but less accurate. */ + svfloat32_t hi = svmla_x (pg, r, n, d->ln2); + hi = svmul_x (pg, hi, d->inv_ln10); - sv_f32_t r2 = svmul_f32_x (pg, r, r); - sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56); - y = sv_fma_f32_x (pg, y, r2, q34); - y = sv_fma_f32_x (pg, y, r2, q12); - - /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster but less - accurate. 
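The trade-off named in that comment, spelled out in scalar form; the routine below uses the first ordering. The log10_2 constant in the second function is my own float rounding of log10(2), not a value taken from the patch:

static float
log10f_hi_accurate (float n, float r)
{
  const float ln2 = 0x1.62e43p-1f, inv_ln10 = 0x1.bcb7b2p-2f;
  return (n * ln2 + r) * inv_ln10; /* one product over the whole sum */
}

static float
log10f_hi_fast (float n, float r)
{
  const float log10_2 = 0x1.344136p-2f, inv_ln10 = 0x1.bcb7b2p-2f;
  return n * log10_2 + r * inv_ln10; /* two products, two extra roundings */
}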
*/ - sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r); - y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10)); - - if (unlikely (svptest_any (pg, special_cases))) - { - return special_case (x, y, special_cases); - } - return y; + if (unlikely (svptest_any (pg, special))) + return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), + special); + return svmla_x (pg, hi, r2, y); } -PL_ALIAS (__sv_log10f_x, _ZGVsMxv_log10f) - PL_SIG (SV, F, 1, log10, 0.01, 11.1) -PL_TEST_ULP (__sv_log10f, 2.82) -PL_TEST_INTERVAL (__sv_log10f, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_log10f, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log10f, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log10f, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log10f, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log10f, 100, inf, 50000) -#endif +PL_TEST_ULP (SV_NAME_F1 (log10), 2.82) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log1p_2u5.c new file mode 100644 index 000000000000..f178ab16238a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log1p_2u5.c @@ -0,0 +1,116 @@ +/* + * Double-precision SVE log(1+x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + double poly[19]; + double ln2_hi, ln2_lo; + uint64_t hfrt2_top, onemhfrt2_top, inf, mone; +} data = { + /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20 + polynomial, however first 2 coefficients are 0 and 1 so are not stored. */ + .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6, }, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* top32(asuint64(sqrt(2)/2)) << 32. */ + .hfrt2_top = 0x3fe6a09e00000000, + /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ + .onemhfrt2_top = 0x00095f6200000000, + .inf = 0x7ff0000000000000, + .mone = 0xbff0000000000000, +}; + +#define AbsMask 0x7fffffffffffffff +#define BottomMask 0xffffffff + +static svfloat64_t NOINLINE +special_case (svbool_t special, svfloat64_t x, svfloat64_t y) +{ + return sv_call_f64 (log1p, x, y, special); +} + +/* Vector approximation for log1p using polynomial on reduced interval. Maximum + observed error is 2.46 ULP: + _ZGVsMxv_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2 + want 0x1.fd5565fb590f6p+2. 
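Before the function body below, the correction term may be easier to follow in scalar double: m = fl(1 + x) drops low bits of x, and c/m restores them to first order, since log(1 + x) - log(m) ~ c/m for small c. A minimal sketch, reduction elided:

static double
log1p_correction (double x)
{
  double m = x + 1.0;	    /* rounded value of 1 + x */
  double c = x - (m - 1.0); /* the bits lost when forming m; exact in the
			       range of interest by Sterbenz-style cancellation */
  return c / m;		    /* first-order fixup, folded into ylo below */
}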
*/ +svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t ax = svand_x (pg, ix, AbsMask); + svbool_t special + = svorr_z (pg, svcmpge (pg, ax, d->inf), svcmpge (pg, ix, d->mone)); + + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f + is in [sqrt(2)/2, sqrt(2)]): + log1p(x) = k*log(2) + log1p(f). + + f may not be representable exactly, so we need a correction term: + let m = round(1 + x), c = (1 + x) - m. + c << m: at very small x, log1p(x) ~ x, hence: + log(1+x) - log(m) ~ c/m. + + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ + + /* Obtain correctly scaled k by manipulation in the exponent. + The scalar algorithm casts down to 32-bit at this point to calculate k and + u_red. We stay in double-width to obtain f and k, using the same constants + as the scalar algorithm but shifted left by 32. */ + svfloat64_t m = svadd_x (pg, x, 1); + svuint64_t mi = svreinterpret_u64 (m); + svuint64_t u = svadd_x (pg, mi, d->onemhfrt2_top); + + svint64_t ki = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), 0x3ff); + svfloat64_t k = svcvt_f64_x (pg, ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + svuint64_t utop + = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top); + svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask)); + svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); + + /* Correction term c/m. */ + svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m); + + /* Approximate log1p(x) on the reduced input using a polynomial. Because + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... + Assembling this all correctly is dealt with at the final step. */ + svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2), + f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8); + svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly); + + svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo); + svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi); + svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); + + if (unlikely (svptest_any (pg, special))) + return special_case (special, x, y); + + return y; +} + +PL_SIG (SV, D, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (SV_NAME_D1 (log1p), 1.97) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log1p), 1, inf, 10000) +PL_TEST_INTERVAL (SV_NAME_D1 (log1p), -1, -inf, 10) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log1p_inline.h b/contrib/arm-optimized-routines/pl/math/sv_log1p_inline.h new file mode 100644 index 000000000000..983f8e1b0413 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log1p_inline.h @@ -0,0 +1,96 @@ +/* + * Helper for SVE double-precision routines which calculate log(1 + x) and do + * not need special-case handling + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#ifndef PL_MATH_SV_LOG1P_INLINE_H +#define PL_MATH_SV_LOG1P_INLINE_H + +#include "sv_math.h" +#include "poly_sve_f64.h" + +static const struct sv_log1p_data +{ + double poly[19], ln2[2]; + uint64_t hf_rt2_top; + uint64_t one_m_hf_rt2_top; + uint32_t bottom_mask; + int64_t one_top; +} sv_log1p_data = { + /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. + */ + .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6 }, + .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, + .hf_rt2_top = 0x3fe6a09e00000000, + .one_m_hf_rt2_top = 0x00095f6200000000, + .bottom_mask = 0xffffffff, + .one_top = 0x3ff +}; + +static inline svfloat64_t +sv_log1p_inline (svfloat64_t x, const svbool_t pg) +{ + /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which + differs from v_log1p_2u5.c by: + - No special-case handling - this should be dealt with by the caller. + - Pairwise Horner polynomial evaluation for improved accuracy. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using svsel, for improved accuracy when the argument to log1p is close + to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1 + in the source of the caller before including this file. + See sv_log1p_2u1.c for details of the algorithm. */ + const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data); + svfloat64_t m = svadd_x (pg, x, 1); + svuint64_t mi = svreinterpret_u64 (m); + svuint64_t u = svadd_x (pg, mi, d->one_m_hf_rt2_top); + + svint64_t ki + = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), d->one_top); + svfloat64_t k = svcvt_f64_x (pg, ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + svuint64_t utop + = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hf_rt2_top); + svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, d->bottom_mask)); + svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); + + /* Correction term c/m. */ + svfloat64_t c = svsub_x (pg, x, svsub_x (pg, m, 1)); + svfloat64_t cm; + +#ifndef WANT_SV_LOG1P_K0_SHORTCUT +#error \ + "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +#elif WANT_SV_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is + that the approximation is solely the polynomial. */ + svbool_t knot0 = svcmpne (pg, k, 0); + cm = svdiv_z (knot0, c, m); + if (likely (!svptest_any (pg, knot0))) + { + f = svsel (knot0, f, x); + } +#else + /* No shortcut. */ + cm = svdiv_x (pg, c, m); +#endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ + svfloat64_t f2 = svmul_x (pg, f, f); + svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. 
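Per the #error above, consumers of this header must choose the shortcut behaviour explicitly. A minimal sketch of a caller opting in; the function name is illustrative and special-case handling is assumed to happen before the call, as the header requires:

#define WANT_SV_LOG1P_K0_SHORTCUT 1 /* or 0 for the unconditional c/m path */
#include "sv_log1p_inline.h"

static svfloat64_t
log1p_user_demo (svfloat64_t x, svbool_t pg)
{
  /* Special cases (x <= -1, inf, nan) must already be excluded here. */
  return sv_log1p_inline (x, pg);
}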
*/ + svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]); + svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]); + + return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); +} +#endif // PL_MATH_SV_LOG1P_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/sv_log1pf_1u3.c b/contrib/arm-optimized-routines/pl/math/sv_log1pf_1u3.c new file mode 100644 index 000000000000..ea1a3dbf723a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log1pf_1u3.c @@ -0,0 +1,97 @@ +/* + * Single-precision vector log(x + 1) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_sve_f32.h" + +static const struct data +{ + float poly[8]; + float ln2, exp_bias; + uint32_t four, three_quarters; +} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as + this can be fmov-ed directly instead of including it in + the main load-and-mla polynomial schedule. */ + 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, + -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, + 0x1.abcb6p-4f, -0x1.6f0d5ep-5f}, + .ln2 = 0x1.62e43p-1f, + .exp_bias = 0x1p-23f, + .four = 0x40800000, + .three_quarters = 0x3f400000}; + +#define SignExponentMask 0xff800000 + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (log1pf, x, y, special); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.27 ULP very close to 0.5. + _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2 + want 0x1.9f323ep-2. */ +svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + /* x < -1, Inf/Nan. */ + svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000); + special = svorn_z (pg, special, svcmpge (pg, x, -1)); + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + svfloat32_t m = svadd_x (pg, x, 1); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ + svint32_t k + = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), + sv_s32 (SignExponentMask)); + + /* Scale x by exponent manipulation. */ + svfloat32_t m_scale = svreinterpret_f32 ( + svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); + m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25)); + + /* Evaluate polynomial on reduced interval. */ + svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale), + ms4 = svmul_x (pg, ms2, ms2); + svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly); + p = svmad_x (pg, m_scale, p, -0.5); + p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias); + + /* Apply the scaling back. 
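The scale-back step is a neat bit manipulation worth a scalar gloss: k holds the chosen exponent as raw sign-exponent bits (an integer multiple of 2^23), so converting to float and multiplying by 0x1p-23f yields the unbiased exponent itself, ready to scale ln2. A sketch under that assumption:

#include <stdint.h>

static float
log1pf_scale_back (int32_t k_bits)
{
  const float ln2 = 0x1.62e43p-1f;
  float k = (float) k_bits * 0x1p-23f; /* recover the unbiased exponent */
  return k * ln2;		       /* the k*log(2) term of the result */
}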
*/ + svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, y, special); + + return y; +} + +PL_SIG (SV, F, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (SV_NAME_F1 (log1p), 0.77) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0, 0x1p-23, 5000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0x1p-23, 1, 5000) +PL_TEST_INTERVAL (SV_NAME_F1 (log1p), 1, inf, 10000) +PL_TEST_INTERVAL (SV_NAME_F1 (log1p), -1, -inf, 10) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log1pf_inline.h b/contrib/arm-optimized-routines/pl/math/sv_log1pf_inline.h new file mode 100644 index 000000000000..d13b094f6b5d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log1pf_inline.h @@ -0,0 +1,65 @@ +/* + * Helper for SVE routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_SV_LOG1PF_INLINE_H +#define PL_MATH_SV_LOG1PF_INLINE_H + +#include "v_math.h" +#include "math_config.h" +#include "poly_sve_f32.h" + +static const struct sv_log1pf_data +{ + float32_t poly[9]; + float32_t ln2; + float32_t scale_back; +} sv_log1pf_data = { + /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */ + .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, + -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, + -0x1.6f0d5ep-5f }, + .scale_back = 0x1.0p-23f, + .ln2 = 0x1.62e43p-1f, +}; + +static inline svfloat32_t +eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg) +{ + svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1])); + svfloat32_t m2 = svmul_x (pg, m, m); + svfloat32_t q = svmla_x (pg, m, m2, p_12); + svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2); + p = svmul_x (pg, m2, p); + + return svmla_x (pg, q, m2, p); +} + +static inline svfloat32_t +sv_log1pf_inline (svfloat32_t x, svbool_t pg) +{ + const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data); + + svfloat32_t m = svadd_x (pg, x, 1.0f); + + svint32_t ks = svsub_x (pg, svreinterpret_s32 (m), + svreinterpret_s32 (svdup_f32 (0.75f))); + ks = svand_x (pg, ks, 0xff800000); + svuint32_t k = svreinterpret_u32 (ks); + svfloat32_t s = svreinterpret_f32 ( + svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k)); + + svfloat32_t m_scale + = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k)); + m_scale + = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s)); + svfloat32_t p = eval_poly (m_scale, d->poly, pg); + svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back); + return svmla_x (pg, p, scale_back, d->ln2); +} + +#endif // PL_MATH_SV_LOG1PF_INLINE_H \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c b/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c index a0815bb5646f..0775a39cc85d 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c +++ b/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c @@ -8,78 +8,66 @@ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f64.h" -#if SV_SUPPORTED - -#define InvLn2 sv_f64 (0x1.71547652b82fep0) #define N (1 << V_LOG2_TABLE_BITS) -#define OFF 0x3fe6900900000000 -#define P(i) sv_f64 (__v_log2_data.poly[i]) +#define Off 0x3fe6900900000000 +#define Max (0x7ff0000000000000) +#define Min (0x0010000000000000) +#define Thresh (0x7fe0000000000000) /* Max - Min. 
*/ -NOINLINE static sv_f64_t -specialcase (sv_f64_t x, sv_f64_t y, const svbool_t cmp) +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) { return sv_call_f64 (log2, x, y, cmp); } -/* Double-precision SVE log2 routine. Implements the same algorithm as vector - log10, with coefficients and table entries scaled in extended precision. +/* Double-precision SVE log2 routine. + Implements the same algorithm as AdvSIMD log10, with coefficients and table + entries scaled in extended precision. The maximum observed error is 2.58 ULP: - __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 - want 0x1.fffb34198d9ddp-5. */ -sv_f64_t -__sv_log2_x (sv_f64_t x, const svbool_t pg) + SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) { - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); - svbool_t special - = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x7ff0 - 0x0010); - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); - sv_u64_t i - = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG2_TABLE_BITS), N); - sv_f64_t k - = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); - sv_f64_t z = sv_as_f64_u64 ( - svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + svfloat64_t z = svreinterpret_f64 ( + svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52))); - sv_u64_t idx = svmul_n_u64_x (pg, i, 2); - sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].invc, idx); - sv_f64_t log2c = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].log2c, idx); + svfloat64_t invc = svld1_gather_index (pg, &__v_log2_data.table[0].invc, i); + svfloat64_t log2c + = svld1_gather_index (pg, &__v_log2_data.table[0].log2c, i); /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
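The identity in that comment, in scalar double, with a hypothetical table pair (invc, log2c) standing in for the gather; inv_ln2 is the standard double rounding of 1/ln(2) and the polynomial tail is elided:

static double
log2_decomp (double z, double k, double invc, double log2c)
{
  const double inv_ln2 = 0x1.71547652b82fep0;
  double r = z * invc - 1.0;	  /* z/c - 1, small by table construction */
  double w = r * inv_ln2 + log2c; /* log2(c) plus the linear term of log2(1+r) */
  /* A short polynomial in r (elided) supplies the higher-order terms. */
  return k + w;
}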
*/ - sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); - sv_f64_t w = sv_fma_f64_x (pg, r, InvLn2, log2c); + svfloat64_t r = svmad_x (pg, invc, z, -1.0); + svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2); - sv_f64_t r2 = svmul_f64_x (pg, r, r); - sv_f64_t p_23 = sv_fma_f64_x (pg, P (3), r, P (2)); - sv_f64_t p_01 = sv_fma_f64_x (pg, P (1), r, P (0)); - sv_f64_t y = sv_fma_f64_x (pg, P (4), r2, p_23); - y = sv_fma_f64_x (pg, y, r2, p_01); - y = sv_fma_f64_x (pg, y, r2, svadd_f64_x (pg, k, w)); + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly); + w = svadd_x (pg, k, w); if (unlikely (svptest_any (pg, special))) - { - return specialcase (x, y, special); - } - return y; + return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y), + special); + return svmla_x (pg, w, r2, y); } -PL_ALIAS (__sv_log2_x, _ZGVsMxv_log2) - PL_SIG (SV, D, 1, log2, 0.01, 11.1) -PL_TEST_ULP (__sv_log2, 2.09) -PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2) -PL_TEST_INTERVAL (__sv_log2, -0.0, -0x1p126, 1000) -PL_TEST_INTERVAL (__sv_log2, 0.0, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log2, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log2, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log2, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log2, 100, inf, 50000) - -#endif +PL_TEST_ULP (SV_NAME_D1 (log2), 2.09) +PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_D1 (log2)) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), -0.0, -0x1p126, 1000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c index fe2ab16b90b7..9e96c62bbcc6 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c @@ -9,71 +9,78 @@ #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED +static const struct data +{ + float poly_02468[5]; + float poly_1357[4]; +} data = { + .poly_1357 = { + /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs + 1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane + variant of MLA intrinsic. */ + -0x1.715458p-1f, -0x1.7171a4p-2f, -0x1.e5143ep-3f, -0x1.c675bp-3f + }, + .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f, + 0x1.9d8ecap-3f, 0x1.9e495p-3f }, +}; -#define P(i) __v_log2f_data.poly[i] - -#define Ln2 (0x1.62e43p-1f) /* 0x3f317218. */ #define Min (0x00800000) #define Max (0x7f800000) -#define Mask (0x007fffff) +#define Thres (0x7f000000) /* Max - Min. */ +#define MantissaMask (0x007fffff) #define Off (0x3f2aaaab) /* 0.666667. */ -static NOINLINE sv_f32_t -specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) { return sv_call_f32 (log2f, x, y, cmp); } /* Optimised implementation of SVE log2f, using the same algorithm - and polynomial as Neon log2f. Maximum error is 2.48 ULPs: - __sv_log2f(0x1.558174p+0) got 0x1.a9be84p-2 - want 0x1.a9be8p-2. */ -sv_f32_t -__sv_log2f_x (sv_f32_t x, const svbool_t pg) + and polynomial as AdvSIMD log2f. + Maximum error is 2.48 ULPs: + SV_NAME_F1 (log2)(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. 
*/ +svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) { - sv_u32_t u = sv_as_u32_f32 (x); - svbool_t special - = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + const struct data *d = ptr_barrier (&data); + + svuint32_t u = svreinterpret_u32 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = svsub_n_u32_x (pg, u, Off); - sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), - 23)); /* Sign-extend. */ - u = svand_n_u32_x (pg, u, Mask); - u = svadd_n_u32_x (pg, u, Off); - sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + u = svsub_x (pg, u, Off); + svfloat32_t n = svcvt_f32_x ( + pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ + u = svand_x (pg, u, MantissaMask); + u = svadd_x (pg, u, Off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); /* y = log2(1+r) + n. */ - sv_f32_t r2 = svmul_f32_x (pg, r, r); + svfloat32_t r2 = svmul_x (pg, r, r); /* Evaluate polynomial using pairwise Horner scheme. */ - sv_f32_t p67 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); - sv_f32_t p45 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); - sv_f32_t p23 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); - sv_f32_t p01 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); - sv_f32_t y; - y = sv_fma_n_f32_x (pg, P (8), r2, p67); - y = sv_fma_f32_x (pg, y, r2, p45); - y = sv_fma_f32_x (pg, y, r2, p23); - y = sv_fma_f32_x (pg, y, r2, p01); - y = sv_fma_f32_x (pg, y, r, n); + svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); + svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_02468[0]), r, p_1357, 0); + svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_02468[1]), r, p_1357, 1); + svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_02468[2]), r, p_1357, 2); + svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_02468[3]), r, p_1357, 3); + svfloat32_t y = svmla_x (pg, q_67, r2, sv_f32 (d->poly_02468[4])); + y = svmla_x (pg, q_45, r2, y); + y = svmla_x (pg, q_23, r2, y); + y = svmla_x (pg, q_01, r2, y); if (unlikely (svptest_any (pg, special))) - return specialcase (x, y, special); - return y; + return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special); + return svmla_x (pg, n, r, y); } -PL_ALIAS (__sv_log2f_x, _ZGVsMxv_log2f) - PL_SIG (SV, F, 1, log2, 0.01, 11.1) -PL_TEST_ULP (__sv_log2f, 1.99) -PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2f) -PL_TEST_INTERVAL (__sv_log2f, -0.0, -0x1p126, 4000) -PL_TEST_INTERVAL (__sv_log2f, 0.0, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log2f, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log2f, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log2f, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log2f, 100, inf, 50000) - -#endif // SV_SUPPORTED +PL_TEST_ULP (SV_NAME_F1 (log2), 1.99) +PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_F1 (log2)) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), -0.0, -0x1p126, 4000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c index 7f06fd31ebf1..2530c9e3f62c 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c @@ -9,77 +9,68 @@ #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED +#define 
P(i) sv_f64 (__v_log_data.poly[i]) +#define N (1 << V_LOG_TABLE_BITS) +#define Off (0x3fe6900900000000) +#define MaxTop (0x7ff) +#define MinTop (0x001) +#define ThreshTop (0x7fe) /* MaxTop - MinTop. */ -#define A(i) __sv_log_data.poly[i] -#define Ln2 (0x1.62e42fefa39efp-1) -#define N (1 << SV_LOG_TABLE_BITS) -#define OFF (0x3fe6900900000000) - -double -optr_aor_log_f64 (double); - -static NOINLINE sv_f64_t -__sv_log_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) { - return sv_call_f64 (optr_aor_log_f64, x, y, cmp); + return sv_call_f64 (log, x, y, cmp); } -/* SVE port of Neon log algorithm from math/. +/* SVE port of AdvSIMD log algorithm. Maximum measured error is 2.17 ulp: - __sv_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 - want 0x1.ffffff1cca045p-2. */ -sv_f64_t -__sv_log_x (sv_f64_t x, const svbool_t pg) + SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 + want 0x1.ffffff1cca045p-2. */ +svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) { - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); - svbool_t cmp = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), - sv_u64 (0x7ff0 - 0x0010)); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t top = svlsr_x (pg, ix, 52); + svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop)); - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); - /* Equivalent to (tmp >> (52 - SV_LOG_TABLE_BITS)) % N, since N is a power - of 2. */ - sv_u64_t i - = svand_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, (52 - SV_LOG_TABLE_BITS)), - N - 1); - sv_s64_t k - = svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52); /* Arithmetic shift. */ - sv_u64_t iz = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52)); - sv_f64_t z = sv_as_f64_u64 (iz); + svuint64_t tmp = svsub_x (pg, ix, Off); + /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. + The actual value of i is double this due to table layout. */ + svuint64_t i + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); + svint64_t k + = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */ + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svfloat64_t z = svreinterpret_f64 (iz); /* Lookup in 2 global lists (length N). */ - sv_f64_t invc = sv_lookup_f64_x (pg, __sv_log_data.invc, i); - sv_f64_t logc = sv_lookup_f64_x (pg, __sv_log_data.logc, i); + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); - sv_f64_t kd = sv_to_f64_s64_x (pg, k); + svfloat64_t r = svmad_x (pg, invc, z, -1); + svfloat64_t kd = svcvt_f64_x (pg, k); /* hi = r + log(c) + k*Ln2. */ - sv_f64_t hi = sv_fma_n_f64_x (pg, Ln2, kd, svadd_f64_x (pg, logc, r)); + svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ - sv_f64_t r2 = svmul_f64_x (pg, r, r); - sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); - sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); - y = sv_fma_n_f64_x (pg, A (4), r2, y); - y = sv_fma_f64_x (pg, y, r2, p); - y = sv_fma_f64_x (pg, y, r2, hi); + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t y = svmla_x (pg, P (2), r, P (3)); + svfloat64_t p = svmla_x (pg, P (0), r, P (1)); + y = svmla_x (pg, y, r2, P (4)); + y = svmla_x (pg, p, r2, y); if (unlikely (svptest_any (pg, cmp))) - return __sv_log_specialcase (x, y, cmp); - return y; + return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp); + return svmla_x (pg, hi, r2, y); } -PL_ALIAS (__sv_log_x, _ZGVsMxv_log) - PL_SIG (SV, D, 1, log, 0.01, 11.1) -PL_TEST_ULP (__sv_log, 1.68) -PL_TEST_INTERVAL (__sv_log, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_log, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log, 100, inf, 50000) -#endif // SV_SUPPORTED +PL_TEST_ULP (SV_NAME_D1 (log), 1.68) +PL_TEST_INTERVAL (SV_NAME_D1 (log), -0.0, -inf, 1000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 0, 0x1p-149, 1000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log_data.c b/contrib/arm-optimized-routines/pl/math/sv_log_data.c deleted file mode 100644 index 77f9989444f5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_log_data.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Coefficients for double-precision SVE log(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -const struct sv_log_data __sv_log_data = { - /* All coefficients and table entries are copied from the Neon routine in - math/. See math/v_log_data.c for an explanation of the algorithm. 
*/ - - .invc = {0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0, - 0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0, - 0x1.623f1d916f323p+0, 0x1.60578da220f65p+0, - 0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0, - 0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0, - 0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0, - 0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0, - 0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0, - 0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0, - 0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0, - 0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0, - 0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0, - 0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0, - 0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0, - 0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0, - 0x1.36987540fbf53p+0, 0x1.352166b648f61p+0, - 0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0, - 0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0, - 0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0, - 0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0, - 0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0, - 0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0, - 0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0, - 0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0, - 0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0, - 0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0, - 0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0, - 0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0, - 0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0, - 0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0, - 0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0, - 0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0, - 0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0, - 0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0, - 0x1.07321489b13eap+0, 0x1.062491aee9904p+0, - 0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0, - 0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0, - 0x1.010037d38bcc2p+0, 1.0, - 0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1, - 0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1, - 0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1, - 0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1, - 0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1, - 0x1.d77a71145a2dap-1, 0x1.d41c51166623ep-1, - 0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1, - 0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1, - 0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1, - 0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1, - 0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1, - 0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1, - 0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1, - 0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1, - 0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1, - 0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1, - 0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1, - 0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1, - 0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1, - 0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1, - 0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1, - 0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1, - 0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1, - 0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1, - 0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1, - 0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1}, - - .logc = {-0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2, - -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2, - -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2, - -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2, - -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2, - -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2, - -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2, - -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2, - -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2, - -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3, - -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3, - -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3, - -0x1.c6fced287c3bdp-3, 
-0x1.bd05a7b317c29p-3, - -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3, - -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3, - -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3, - -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3, - -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3, - -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3, - -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3, - -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3, - -0x1.1aa0229f25ec2p-3, -0x1.117655ddebc3bp-3, - -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4, - -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4, - -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4, - -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4, - -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4, - -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4, - -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4, - -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4, - -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5, - -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5, - -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5, - -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5, - -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6, - -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6, - -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7, - -0x1.ff6fe1feb4e53p-9, 0.0, - 0x1.fe91885ec8e20p-8, 0x1.fc516f716296dp-7, - 0x1.7bb4dd70a015bp-6, 0x1.f84c99b34b674p-6, - 0x1.39f9ce4fb2d71p-5, 0x1.7756c0fd22e78p-5, - 0x1.b43ee82db8f3ap-5, 0x1.f0b3fced60034p-5, - 0x1.165bd78d4878ep-4, 0x1.3425d2715ebe6p-4, - 0x1.51b8bd91b7915p-4, 0x1.6f15632c76a47p-4, - 0x1.8c3c88ecbe503p-4, 0x1.a92ef077625dap-4, - 0x1.c5ed5745fa006p-4, 0x1.e27876de1c993p-4, - 0x1.fed104fce4cdcp-4, 0x1.0d7bd9c17d78bp-3, - 0x1.1b76986cef97bp-3, 0x1.295913d24f750p-3, - 0x1.37239fa295d17p-3, 0x1.44d68dd78714bp-3, - 0x1.52722ebe5d780p-3, 0x1.5ff6d12671f98p-3, - 0x1.6d64c2389484bp-3, 0x1.7abc4da40fddap-3, - 0x1.87fdbda1e8452p-3, 0x1.95295b06a5f37p-3, - 0x1.a23f6d34abbc5p-3, 0x1.af403a28e04f2p-3, - 0x1.bc2c06a85721ap-3, 0x1.c903161240163p-3, - 0x1.d5c5aa93287ebp-3, 0x1.e274051823fa9p-3, - 0x1.ef0e656300c16p-3, 0x1.fb9509f05aa2ap-3, - 0x1.04041821f37afp-2, 0x1.0a340a49b3029p-2, - 0x1.105a7918a126dp-2, 0x1.1677819812b84p-2, - 0x1.1c8b405b40c0ep-2, 0x1.2295d16cfa6b1p-2, - 0x1.28975066318a2p-2, 0x1.2e8fd855d86fcp-2, - 0x1.347f83d605e59p-2, 0x1.3a666d1244588p-2, - 0x1.4044adb6f8ec4p-2, 0x1.461a5f077558cp-2, - 0x1.4be799e20b9c8p-2, 0x1.51ac76a6b79dfp-2, - 0x1.57690d5744a45p-2, 0x1.5d1d758e45217p-2}, - - .poly = {-0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, - 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3}, -}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c b/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c index 11f0b8aa12c5..967355247036 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c +++ b/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c @@ -9,69 +9,78 @@ #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED +static const struct data +{ + float poly_0135[4]; + float poly_246[3]; + float ln2; +} data = { + .poly_0135 = { + /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so + that coeffs 0, 1, 3 and 5 can be loaded as a single quad-word, hence used + with _lane variant of MLA intrinsic. */ + -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f + }, + .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f }, + .ln2 = 0x1.62e43p-1f +}; -#define P(i) __sv_logf_poly[i] - -#define Ln2 (0x1.62e43p-1f) /* 0x3f317218 */ #define Min (0x00800000) #define Max (0x7f800000) +#define Thresh (0x7f000000) /* Max - Min. 
*/ #define Mask (0x007fffff) -#define Off (0x3f2aaaab) /* 0.666667 */ +#define Off (0x3f2aaaab) /* 0.666667. */ -float -optr_aor_log_f32 (float); +float optr_aor_log_f32 (float); -static NOINLINE sv_f32_t -__sv_logf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) { return sv_call_f32 (optr_aor_log_f32, x, y, cmp); } -/* Optimised implementation of SVE logf, using the same algorithm and polynomial - as the Neon routine in math/. Maximum error is 3.34 ULPs: - __sv_logf(0x1.557298p+0) got 0x1.26edecp-2 - want 0x1.26ede6p-2. */ -sv_f32_t -__sv_logf_x (sv_f32_t x, const svbool_t pg) +/* Optimised implementation of SVE logf, using the same algorithm and + polynomial as the AdvSIMD routine. Maximum error is 3.34 ULPs: + SV_NAME_F1 (log)(0x1.557298p+0) got 0x1.26edecp-2 + want 0x1.26ede6p-2. */ +svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) { - sv_u32_t u = sv_as_u32_f32 (x); - svbool_t cmp - = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + const struct data *d = ptr_barrier (&data); + + svuint32_t u = svreinterpret_u32 (x); + svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = svsub_n_u32_x (pg, u, Off); - sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), - 23)); /* Sign-extend. */ - u = svand_n_u32_x (pg, u, Mask); - u = svadd_n_u32_x (pg, u, Off); - sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + u = svsub_x (pg, u, Off); + svfloat32_t n = svcvt_f32_x ( + pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ + u = svand_x (pg, u, Mask); + u = svadd_x (pg, u, Off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); /* y = log(1+r) + n*ln2. */ - sv_f32_t r2 = svmul_f32_x (pg, r, r); + svfloat32_t r2 = svmul_x (pg, r, r); /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). 
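The 2/3-centred reduction used here (and in the log2f/log10f variants above) reads more easily in scalar form; memcpy stands in for the reinterpret casts. Subtracting Off = asuint(2/3) makes the exponent field read n such that x = 2^n * (1 + r) with 1 + r in [2/3, 4/3), so r straddles zero:

#include <stdint.h>
#include <string.h>

static float
logf_reduce (float x, float *r)
{
  const uint32_t off = 0x3f2aaaab; /* asuint (2/3) */
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  u -= off;
  int32_t n = (int32_t) u >> 23; /* arithmetic shift sign-extends n */
  u = (u & 0x007fffff) + off;	 /* recentre the mantissa around 2/3 */
  float m;
  memcpy (&m, &u, sizeof m);
  *r = m - 1.0f;		 /* r in [-1/3, 1/3) */
  return (float) n;		 /* x = 2^n * (1 + r) */
}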
*/ - sv_f32_t p = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (2))); - sv_f32_t q = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (4))); - sv_f32_t y = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (6))); - p = sv_fma_n_f32_x (pg, P (0), r2, p); - q = sv_fma_f32_x (pg, p, r2, q); - y = sv_fma_f32_x (pg, q, r2, y); - p = sv_fma_n_f32_x (pg, Ln2, n, r); - y = sv_fma_f32_x (pg, y, r2, p); + svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]); + svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1); + svfloat32_t q = svmla_lane (sv_f32 (d->poly_246[1]), r, p_0135, 2); + svfloat32_t y = svmla_lane (sv_f32 (d->poly_246[2]), r, p_0135, 3); + p = svmla_lane (p, r2, p_0135, 0); + + q = svmla_x (pg, q, r2, p); + y = svmla_x (pg, y, r2, q); + p = svmla_x (pg, r, n, d->ln2); if (unlikely (svptest_any (pg, cmp))) - return __sv_logf_specialcase (x, y, cmp); - return y; + return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp); + return svmla_x (pg, p, r2, y); } -PL_ALIAS (__sv_logf_x, _ZGVsMxv_logf) - PL_SIG (SV, F, 1, log, 0.01, 11.1) -PL_TEST_ULP (__sv_logf, 2.85) -PL_TEST_INTERVAL (__sv_logf, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_logf, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_logf, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_logf, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_logf, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_logf, 100, inf, 50000) -#endif // SV_SUPPORTED +PL_TEST_ULP (SV_NAME_F1 (log), 2.85) +PL_TEST_INTERVAL (SV_NAME_F1 (log), -0.0, -inf, 100) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 0, 0x1p-126, 100) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_logf_data.c b/contrib/arm-optimized-routines/pl/math/sv_logf_data.c deleted file mode 100644 index 51dd7a7eeb37..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_logf_data.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Coefficients for single-precision SVE log function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -const float __sv_logf_poly[] = { - /* Copied from coeffs for the Neon routine in math/. */ - -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, - -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, -}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_math.h b/contrib/arm-optimized-routines/pl/math/sv_math.h index 5ef0ad3bd5e0..f67fe91803ba 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_math.h +++ b/contrib/arm-optimized-routines/pl/math/sv_math.h @@ -10,236 +10,124 @@ #ifndef WANT_VMATH /* Enable the build of vector math code. */ -#define WANT_VMATH 1 +# define WANT_VMATH 1 #endif + #if WANT_VMATH -#if WANT_SVE_MATH -#define SV_SUPPORTED 1 +# include +# include -#include -#include - -#include "math_config.h" - -typedef float f32_t; -typedef uint32_t u32_t; -typedef int32_t s32_t; -typedef double f64_t; -typedef uint64_t u64_t; -typedef int64_t s64_t; - -typedef svfloat64_t sv_f64_t; -typedef svuint64_t sv_u64_t; -typedef svint64_t sv_s64_t; - -typedef svfloat32_t sv_f32_t; -typedef svuint32_t sv_u32_t; -typedef svint32_t sv_s32_t; +# include "math_config.h" /* Double precision. 
*/ -static inline sv_s64_t -sv_s64 (s64_t x) +static inline svint64_t +sv_s64 (int64_t x) { - return svdup_n_s64 (x); + return svdup_s64 (x); } -static inline sv_u64_t -sv_u64 (u64_t x) +static inline svuint64_t +sv_u64 (uint64_t x) { - return svdup_n_u64 (x); + return svdup_u64 (x); } -static inline sv_f64_t -sv_f64 (f64_t x) +static inline svfloat64_t +sv_f64 (double x) { - return svdup_n_f64 (x); + return svdup_f64 (x); } -static inline sv_f64_t -sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z) -{ - return svmla_f64_x (pg, z, x, y); -} - -/* res = z + x * y with x scalar. */ -static inline sv_f64_t -sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z) -{ - return svmla_n_f64_x (pg, z, y, x); -} - -static inline sv_s64_t -sv_as_s64_u64 (sv_u64_t x) -{ - return svreinterpret_s64_u64 (x); -} - -static inline sv_u64_t -sv_as_u64_f64 (sv_f64_t x) -{ - return svreinterpret_u64_f64 (x); -} - -static inline sv_f64_t -sv_as_f64_u64 (sv_u64_t x) -{ - return svreinterpret_f64_u64 (x); -} - -static inline sv_f64_t -sv_to_f64_s64_x (svbool_t pg, sv_s64_t s) -{ - return svcvt_f64_x (pg, s); -} - -static inline sv_f64_t -sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp) +static inline svfloat64_t +sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp) { svbool_t p = svpfirst (cmp, svpfalse ()); while (svptest_any (cmp, p)) { - f64_t elem = svclastb_n_f64 (p, 0, x); + double elem = svclastb (p, 0, x); elem = (*f) (elem); - sv_f64_t y2 = svdup_n_f64 (elem); - y = svsel_f64 (p, y2, y); + svfloat64_t y2 = sv_f64 (elem); + y = svsel (p, y2, y); p = svpnext_b64 (cmp, p); } return y; } -static inline sv_f64_t -sv_call2_f64 (f64_t (*f) (f64_t, f64_t), sv_f64_t x1, sv_f64_t x2, sv_f64_t y, - svbool_t cmp) +static inline svfloat64_t +sv_call2_f64 (double (*f) (double, double), svfloat64_t x1, svfloat64_t x2, + svfloat64_t y, svbool_t cmp) { svbool_t p = svpfirst (cmp, svpfalse ()); while (svptest_any (cmp, p)) { - f64_t elem1 = svclastb_n_f64 (p, 0, x1); - f64_t elem2 = svclastb_n_f64 (p, 0, x2); - f64_t ret = (*f) (elem1, elem2); - sv_f64_t y2 = svdup_n_f64 (ret); - y = svsel_f64 (p, y2, y); + double elem1 = svclastb (p, 0, x1); + double elem2 = svclastb (p, 0, x2); + double ret = (*f) (elem1, elem2); + svfloat64_t y2 = sv_f64 (ret); + y = svsel (p, y2, y); p = svpnext_b64 (cmp, p); } return y; } -/* Load array of uint64_t into svuint64_t. */ -static inline sv_u64_t -sv_lookup_u64_x (svbool_t pg, const u64_t *tab, sv_u64_t idx) +static inline svuint64_t +sv_mod_n_u64_x (svbool_t pg, svuint64_t x, uint64_t y) { - return svld1_gather_u64index_u64 (pg, tab, idx); -} - -/* Load array of double into svfloat64_t. */ -static inline sv_f64_t -sv_lookup_f64_x (svbool_t pg, const f64_t *tab, sv_u64_t idx) -{ - return svld1_gather_u64index_f64 (pg, tab, idx); -} - -static inline sv_u64_t -sv_mod_n_u64_x (svbool_t pg, sv_u64_t x, u64_t y) -{ - sv_u64_t q = svdiv_n_u64_x (pg, x, y); - return svmls_n_u64_x (pg, x, q, y); + svuint64_t q = svdiv_x (pg, x, y); + return svmls_x (pg, x, q, y); } /* Single precision. 
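Note the shape of sv_call_f64/sv_call2_f64 above: svpfirst/svpnext walk the true lanes of cmp one at a time, svclastb extracts that lane's value, and svsel merges the scalar result back into the vector. Stripped of the predicate mechanics, the behaviour is equivalent to this per-lane loop (illustrative only; N stands for the vector length in lanes and is not a name from the source):

  for (int i = 0; i < N; i++)
    if (cmp[i])
      y[i] = f (x[i]);

Only the lanes flagged as special pay for the scalar call; all other lanes keep the fast-path result already in y.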
 */
-static inline sv_s32_t
-sv_s32 (s32_t x)
+static inline svint32_t
+sv_s32 (int32_t x)
 {
-  return svdup_n_s32 (x);
+  return svdup_s32 (x);
 }
 
-static inline sv_u32_t
-sv_u32 (u32_t x)
+static inline svuint32_t
+sv_u32 (uint32_t x)
 {
-  return svdup_n_u32 (x);
+  return svdup_u32 (x);
 }
 
-static inline sv_f32_t
-sv_f32 (f32_t x)
+static inline svfloat32_t
+sv_f32 (float x)
 {
-  return svdup_n_f32 (x);
+  return svdup_f32 (x);
 }
 
-static inline sv_f32_t
-sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z)
-{
-  return svmla_f32_x (pg, z, x, y);
-}
-
-/* res = z + x * y with x scalar.  */
-static inline sv_f32_t
-sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z)
-{
-  return svmla_n_f32_x (pg, z, y, x);
-}
-
-static inline sv_u32_t
-sv_as_u32_f32 (sv_f32_t x)
-{
-  return svreinterpret_u32_f32 (x);
-}
-
-static inline sv_f32_t
-sv_as_f32_u32 (sv_u32_t x)
-{
-  return svreinterpret_f32_u32 (x);
-}
-
-static inline sv_s32_t
-sv_as_s32_u32 (sv_u32_t x)
-{
-  return svreinterpret_s32_u32 (x);
-}
-
-static inline sv_f32_t
-sv_to_f32_s32_x (svbool_t pg, sv_s32_t s)
-{
-  return svcvt_f32_x (pg, s);
-}
-
-static inline sv_s32_t
-sv_to_s32_f32_x (svbool_t pg, sv_f32_t x)
-{
-  return svcvt_s32_f32_x (pg, x);
-}
-
-static inline sv_f32_t
-sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp)
+static inline svfloat32_t
+sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp)
 {
   svbool_t p = svpfirst (cmp, svpfalse ());
   while (svptest_any (cmp, p))
     {
-      f32_t elem = svclastb_n_f32 (p, 0, x);
+      float elem = svclastb (p, 0, x);
       elem = (*f) (elem);
-      sv_f32_t y2 = svdup_n_f32 (elem);
-      y = svsel_f32 (p, y2, y);
+      svfloat32_t y2 = sv_f32 (elem);
+      y = svsel (p, y2, y);
       p = svpnext_b32 (cmp, p);
     }
   return y;
 }
 
-static inline sv_f32_t
-sv_call2_f32 (f32_t (*f) (f32_t, f32_t), sv_f32_t x1, sv_f32_t x2, sv_f32_t y,
-	      svbool_t cmp)
+static inline svfloat32_t
+sv_call2_f32 (float (*f) (float, float), svfloat32_t x1, svfloat32_t x2,
+	      svfloat32_t y, svbool_t cmp)
 {
   svbool_t p = svpfirst (cmp, svpfalse ());
   while (svptest_any (cmp, p))
     {
-      f32_t elem1 = svclastb_n_f32 (p, 0, x1);
-      f32_t elem2 = svclastb_n_f32 (p, 0, x2);
-      f32_t ret = (*f) (elem1, elem2);
-      sv_f32_t y2 = svdup_n_f32 (ret);
-      y = svsel_f32 (p, y2, y);
+      float elem1 = svclastb (p, 0, x1);
+      float elem2 = svclastb (p, 0, x2);
+      float ret = (*f) (elem1, elem2);
+      svfloat32_t y2 = sv_f32 (ret);
+      y = svsel (p, y2, y);
       p = svpnext_b32 (cmp, p);
     }
   return y;
 }
+#endif
 #endif
-#endif
-#endif
diff --git a/contrib/arm-optimized-routines/pl/math/sv_pow_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_pow_1u5.c
new file mode 100644
index 000000000000..0838810206a1
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_pow_1u5.c
@@ -0,0 +1,444 @@
+/*
+ * Double-precision SVE pow(x, y) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* This version shares a similar algorithm with the AOR scalar pow.
+
+   The core computation consists of computing pow(x, y) as
+
+     exp (y * log (x)).
+
+   The algorithms for exp and log are very similar to scalar exp and log.
+   The log relies on table lookup for 3 variables and an order-8 polynomial.
+   It returns a high and a low contribution that are then passed to the exp,
+   to minimise the loss of accuracy in both routines.
+   The exp is based on an 8-bit table lookup for scale and an order-4
+   polynomial.
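The reason for carrying a high and a low log contribution becomes visible in the exp preamble further down (vehi/vemi/velo): y * log(x) is formed in two pieces so that the rounding error of the product can be recovered with an FMA. A scalar sketch of that compensation, where hi + lo approximates log(x):

  double ehi = y * hi;
  double emi = fma (y, hi, -ehi); /* Exact rounding error of y * hi.  */
  double elo = y * lo + emi;	  /* Low-order part fed into exp.  */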
+   The SVE algorithm drops the tail in the exp computation at the price of
+   lower accuracy, slightly above 1 ULP.
+   The SVE algorithm also drops the special treatment of small (< 2^-65) and
+   large (> 2^63) finite values of |y|, as they only affect
+   non-round-to-nearest rounding modes.
+
+   Maximum measured error is 1.04 ULPs:
+   SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
+     got 0x1.f7116284221fcp-1
+    want 0x1.f7116284221fdp-1.  */
+
+/* Data is defined in v_pow_log_data.c.  */
+#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
+#define A __v_pow_log_data.poly
+#define Off 0x3fe6955500000000
+
+/* Data is defined in v_pow_exp_data.c.  */
+#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
+#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
+#define C __v_pow_exp_data.poly
+#define SmallExp 0x3c9 /* top12(0x1p-54).  */
+#define BigExp 0x408   /* top12(512.).  */
+#define ThresExp 0x03f /* BigExp - SmallExp.  */
+#define HugeExp 0x409  /* top12(1024.).  */
+
+/* Constants associated with pow.  */
+#define SmallPowX 0x001 /* top12(0x1p-126).  */
+#define BigPowX 0x7ff	/* top12(INFINITY).  */
+#define ThresPowX 0x7fe /* BigPowX - SmallPowX.  */
+#define SmallPowY 0x3be /* top12(0x1.e7b6p-65).  */
+#define BigPowY 0x43e	/* top12(0x1.749p62).  */
+#define ThresPowY 0x080 /* BigPowY - SmallPowY.  */
+
+/* Check if x is an integer.  */
+static inline svbool_t
+sv_isint (svbool_t pg, svfloat64_t x)
+{
+  return svcmpeq (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is real, not integer-valued.  */
+static inline svbool_t
+sv_isnotint (svbool_t pg, svfloat64_t x)
+{
+  return svcmpne (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is an odd integer.  */
+static inline svbool_t
+sv_isodd (svbool_t pg, svfloat64_t x)
+{
+  svfloat64_t y = svmul_x (pg, x, 0.5);
+  return sv_isnotint (pg, y);
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
+   the bit representation of a non-zero finite floating-point value.  */
+static inline int
+checkint (uint64_t iy)
+{
+  int e = iy >> 52 & 0x7ff;
+  if (e < 0x3ff)
+    return 0;
+  if (e > 0x3ff + 52)
+    return 2;
+  if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
+    return 0;
+  if (iy & (1ULL << (0x3ff + 52 - e)))
+    return 1;
+  return 2;
+}
+
+/* Top 12 bits (sign and exponent of each double float lane).  */
+static inline svuint64_t
+sv_top12 (svfloat64_t x)
+{
+  return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan.  */
+static inline int
+zeroinfnan (uint64_t i)
+{
+  return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan.  */
+static inline svbool_t
+sv_zeroinfnan (svbool_t pg, svuint64_t i)
+{
+  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
+		  2 * asuint64 (INFINITY) - 1);
+}
+
+/* Handle cases that may overflow or underflow when computing the result that
+   is scale*(1+TMP) without intermediate rounding.  The bit representation of
+   scale is in SBITS, however it has a computed exponent that may have
+   overflowed into the sign bit, so that needs to be adjusted before using it
+   as a double.  (int32_t)KI is the k used in the argument reduction and
+   exponent adjustment of scale; positive k here means the result may
+   overflow and negative k means the result may underflow.  */
+static inline double
+specialcase (double tmp, uint64_t sbits, uint64_t ki)
+{
+  double scale;
+  if ((ki & 0x80000000) == 0)
+    {
+      /* k > 0, the exponent of scale might have overflowed by <= 460.
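checkint's three-way result drives the negative-x cases below; a few concrete values make the encoding clear (hypothetical calls, with asuint64 as in math_config.h):

  checkint (asuint64 (0.5)); /* 0: not an integer.  */
  checkint (asuint64 (3.0)); /* 1: odd integer, pow (-x, 3.0) is negative.  */
  checkint (asuint64 (8.0)); /* 2: even integer, pow (-x, 8.0) is positive.  */

Exponents above 0x3ff + 52 leave no fractional bits at all, so such values are necessarily even integers in double precision, which is why the early return is 2.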
*/
+      sbits -= 1009ull << 52;
+      scale = asdouble (sbits);
+      return 0x1p1009 * (scale + scale * tmp);
+    }
+  /* k < 0, need special care in the subnormal range.  */
+  sbits += 1022ull << 52;
+  /* Note: sbits is signed scale.  */
+  scale = asdouble (sbits);
+  double y = scale + scale * tmp;
+  return 0x1p-1022 * y;
+}
+
+/* Scalar fallback for special cases of SVE pow's exp.  */
+static inline svfloat64_t
+sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
+		     svfloat64_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      double sx1 = svclastb (p, 0, x1);
+      uint64_t su1 = svclastb (p, 0, u1);
+      uint64_t su2 = svclastb (p, 0, u2);
+      double elem = specialcase (sx1, su1, su2);
+      svfloat64_t y2 = sv_f64 (elem);
+      y = svsel (p, y2, y);
+      p = svpnext_b64 (cmp, p);
+    }
+  return y;
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+   15 bits of additional precision.  IX is the bit representation of x, but
+   normalized in the subnormal range using the sign bit for the exponent.  */
+static inline svfloat64_t
+sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
+{
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  svuint64_t tmp = svsub_x (pg, ix, Off);
+  svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
+			  sv_u64 (N_LOG - 1));
+  svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
+  svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
+  svfloat64_t z = svreinterpret_f64 (iz);
+  svfloat64_t kd = svcvt_f64_x (pg, k);
+
+  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
+  /* SVE lookup requires 3 separate lookup tables, as opposed to the scalar
+     version, which uses an array of structures.  We also do the lookup
+     earlier in the code to make sure it finishes as early as possible.  */
+  svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
+  svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
+  svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
+
+  /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable.  */
+  svfloat64_t r = svmad_x (pg, z, invc, -1.0);
+  /* k*Ln2 + log(c) + r.  */
+  svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
+  svfloat64_t t2 = svadd_x (pg, t1, r);
+  svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
+  svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);
+
+  /* Evaluation is optimized assuming superscalar pipelined execution.  */
+  svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5.  */
+  svfloat64_t ar2 = svmul_x (pg, r, ar);
+  svfloat64_t ar3 = svmul_x (pg, r, ar2);
+  /* k*Ln2 + log(c) + r + A[0]*r*r.  */
+  svfloat64_t hi = svadd_x (pg, t2, ar2);
+  svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
+  svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
+  /* p = log1p(r) - r - A[0]*r*r.  */
+  /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
+     A[6])))).
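The t1/t2/lo1/lo2 sequence above is a compensated sum: t2 = t1 + r is the rounded result, and (t1 - t2) + r recovers the rounding error of that addition, exactly so when |t1| >= |r| (the classic Fast2Sum step). In scalar form:

  double t2 = t1 + r;
  double lo2 = (t1 - t2) + r; /* Error of t1 + r; exact if |t1| >= |r|.  */

All four lo terms are then accumulated and folded into the TAIL output, which is what gives the log its roughly 15 extra bits.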
*/ + svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]); + svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]); + svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]); + svfloat64_t p = svmla_x (pg, a34, ar2, a56); + p = svmla_x (pg, a12, ar2, p); + p = svmul_x (pg, ar3, p); + svfloat64_t lo = svadd_x ( + pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); + svfloat64_t y = svadd_x (pg, hi, lo); + *tail = svadd_x (pg, svsub_x (pg, hi, y), lo); + return y; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ +static inline svfloat64_t +sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, + svuint64_t sign_bias) +{ + /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow) + and other cases of large values of x (scale * (1 + TMP) oflow). */ + svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff); + /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */ + svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp); + + /* Conditions special, uflow and oflow are all expressed as uoflow && + something, hence do not bother computing anything if no lane in uoflow is + true. */ + svbool_t special = svpfalse_b (); + svbool_t uflow = svpfalse_b (); + svbool_t oflow = svpfalse_b (); + if (unlikely (svptest_any (pg, uoflow))) + { + /* |x| is tiny (|x| <= 0x1p-54). */ + uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); + uflow = svand_z (pg, uoflow, uflow); + /* |x| is huge (|x| >= 1024). */ + oflow = svcmpge (pg, abstop, HugeExp); + oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow)); + /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow + or underflow. */ + special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2); + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift); + svfloat64_t kd = svadd_x (pg, z, shift); + svuint64_t ki = svreinterpret_u64 (kd); + kd = svsub_x (pg, kd, shift); + svfloat64_t r = x; + r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi); + r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = svadd_x (pg, r, xtail); + /* 2^(k/N) ~= scale. */ + svuint64_t idx = svand_x (pg, ki, N_EXP - 1); + svuint64_t top + = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); + sbits = svadd_x (pg, sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]); + tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp); + tmp = svmla_x (pg, r, r2, tmp); + svfloat64_t scale = svreinterpret_f64 (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + z = svmla_x (pg, scale, scale, tmp); + + /* Update result with special and large cases. */ + if (unlikely (svptest_any (pg, special))) + z = sv_call_specialcase (tmp, sbits, ki, z, special); + + /* Handle underflow and overflow. 
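The shift constant 0x1.8p52 used for kd implements round-to-nearest-integer without an explicit rint: adding it forces the integer part of z into the low mantissa bits, where it can be read back directly. A scalar equivalent, valid while |z| stays far below 2^51 (which the range checks above guarantee):

  double kd = z + 0x1.8p52;		/* Integer part lands in the mantissa.  */
  int64_t ki = (int64_t) asuint64 (kd); /* Low bits hold the rounded integer.  */
  kd -= 0x1.8p52;			/* kd = (double) nearest integer to z.  */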
*/ + svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63); + svbool_t x_is_neg = svcmpne (pg, sign_bit, 0); + svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); + svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); + res_uoflow = svreinterpret_f64 ( + svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); + z = svsel (oflow, res_uoflow, z); + /* Avoid spurious underflow for tiny x. */ + svfloat64_t res_spurious_uflow + = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); + z = svsel (uflow, res_spurious_uflow, z); + + return z; +} + +static inline double +pow_sc (double x, double y) +{ + uint64_t ix = asuint64 (x); + uint64_t iy = asuint64 (y); + /* Special cases: |x| or |y| is 0, inf or nan. */ + if (unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignaling_inline (x) ? x + y : 1.0; + if (ix == asuint64 (1.0)) + return issignaling_inline (y) ? x + y : 1.0; + if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64 (1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (unlikely (zeroinfnan (ix))) + { + double_t x2 = x * x; + if (ix >> 63 && checkint (iy) == 1) + x2 = -x2; + /* Without the barrier some versions of clang hoist the 1/x2 and + thus division by zero exception can be signaled spuriously. */ + return (iy >> 63) ? opt_barrier_double (1 / x2) : x2; + } + return x; +} + +svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) +{ + /* This preamble handles special case conditions used in the final scalar + fallbacks. It also updates ix and sign_bias, that are used in the core + computation too, i.e., exp( y * log (x) ). */ + svuint64_t vix0 = svreinterpret_u64 (x); + svuint64_t viy0 = svreinterpret_u64 (y); + svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52); + + /* Negative x cases. */ + svuint64_t sign_bit = svlsr_m (pg, vix0, 63); + svbool_t xisneg = svcmpeq (pg, sign_bit, 1); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ + svbool_t yisnotint_xisneg = svpfalse_b (); + svuint64_t sign_bias = sv_u64 (0); + svuint64_t vix = vix0; + svuint64_t vtopx1 = vtopx0; + if (unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ + yisnotint_xisneg = sv_isnotint (xisneg, y); + svbool_t yisint_xisneg = sv_isint (xisneg, y); + svbool_t yisodd_xisneg = sv_isodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ + vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff); + vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0)); + } + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (pg, vix0); + svbool_t yspecial = sv_zeroinfnan (pg, viy0); + svbool_t special = svorr_z (pg, xspecial, yspecial); + + /* Small cases of x: |x| < 0x1p-126. */ + svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff); + svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX); + if (unlikely (svptest_any (pg, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. 
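The subnormal normalization that follows is the standard bit trick, easiest to read for one scalar lane (a sketch; ix is the bit pattern of x as in the surrounding code):

  if ((ix >> 52 & 0x7ff) == 0) /* Exponent field zero: x is subnormal.  */
    {
      ix = asuint64 (asdouble (ix) * 0x1p52); /* Now normal, exponent + 52.  */
      ix &= 0x7fffffffffffffff;
      ix -= 52ULL << 52; /* Undo the 2^52 scaling in the exponent field.  */
    }

Multiplying by 0x1p52 makes the value normal, and subtracting 52 from the biased exponent restores the true scale, which sv_log_inline can then handle through the sign bit as its comment describes.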
*/ + svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0); + + svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52)); + vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff); + vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52); + vix = svsel (topx_is_null, vix_norm, vix); + } + + /* y_hi = log(ix, &y_lo). */ + svfloat64_t vlo; + svfloat64_t vhi = sv_log_inline (pg, vix, &vlo); + + /* z = exp(y_hi, y_lo, sign_bias). */ + svfloat64_t vehi = svmul_x (pg, y, vhi); + svfloat64_t velo = svmul_x (pg, y, vlo); + svfloat64_t vemi = svmls_x (pg, vehi, y, vhi); + velo = svsub_x (pg, velo, vemi); + svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias); + + /* Cases of finite y and finite negative x. */ + vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz); + + /* Cases of zero/inf/nan x or y. */ + if (unlikely (svptest_any (pg, special))) + vz = sv_call2_f64 (pow_sc, x, y, vz, special); + + return vz; +} + +PL_SIG (SV, D, 2, pow) +PL_TEST_ULP (SV_NAME_D2 (pow), 0.55) +/* Wide intervals spanning the whole domain but shared between x and y. */ +#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define EXPAND(str) str##000000000 +#define SHL52(str) EXPAND (str) +SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000) +SV_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000) +SV_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000) +SV_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000) +SV_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000) +SV_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000) +SV_POW_INTERVAL2 (0, inf, 0, inf, 1000) +/* x~1 or y~1. */ +SV_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000) +SV_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000) +SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) +/* around estimated argmaxs of ULP error. */ +SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) +SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) +/* x is negative, y is odd or even integer, or y is real not integer. */ +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +/* |x| is inf, y is odd or even integer, or y is real not integer. */ +SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1) +SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1) +SV_POW_INTERVAL2 (inf, inf, 2.0, 2.0, 1) +SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1) +/* 0.0^y. */ +SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000) +/* 1.0^y. */ +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_powf_2u6.c b/contrib/arm-optimized-routines/pl/math/sv_powf_2u6.c new file mode 100644 index 000000000000..2db0636aea62 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_powf_2u6.c @@ -0,0 +1,360 @@ +/* + * Single-precision SVE powf function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* The following data is used in the SVE pow core computation
+   and special case detection.  */
+#define Tinvc __v_powf_data.invc
+#define Tlogc __v_powf_data.logc
+#define Texp __v_powf_data.scale
+#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
+#define Shift 0x1.8p52
+#define Norm 0x1p23f /* 0x4b000000.  */
+
+/* Overall ULP error bound for pow is 2.6 ulp
+   ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2).  */
+static const struct data
+{
+  double log_poly[4];
+  double exp_poly[3];
+  float uflow_bound, oflow_bound, small_bound;
+  uint32_t sign_bias, sign_mask, subnormal_bias, off;
+} data = {
+  /* rel err: 1.5 * 2^-30.  Each coefficient is multiplied by the value of
+     V_POWF_EXP2_N.  */
+  .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
+		-0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
+  /* rel err: 1.69 * 2^-34.  */
+  .exp_poly = {
+    0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3.  */
+    0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2.  */
+    0x1.62e42ff0c52d6p-6,  /* A3 / V_POWF_EXP2_N.  */
+  },
+  .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N.  */
+  .oflow_bound = 0x1p+12f,     /* 128.0 * V_POWF_EXP2_N.  */
+  .small_bound = 0x1p-126f,
+  .off = 0x3f35d000,
+  .sign_bias = SignBias,
+  .sign_mask = 0x80000000,
+  .subnormal_bias = 0x0b800000, /* 23 << 23.  */
+};
+
+#define A(i) sv_f64 (d->log_poly[i])
+#define C(i) sv_f64 (d->exp_poly[i])
+
+/* Check if x is an integer.  */
+static inline svbool_t
+svisint (svbool_t pg, svfloat32_t x)
+{
+  return svcmpeq (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is real, not integer-valued.  */
+static inline svbool_t
+svisnotint (svbool_t pg, svfloat32_t x)
+{
+  return svcmpne (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is an odd integer.  */
+static inline svbool_t
+svisodd (svbool_t pg, svfloat32_t x)
+{
+  svfloat32_t y = svmul_x (pg, x, 0.5f);
+  return svisnotint (pg, y);
+}
+
+/* Check if zero, inf or nan.  */
+static inline svbool_t
+sv_zeroinfnan (svbool_t pg, svuint32_t i)
+{
+  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
+		  2u * 0x7f800000 - 1);
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
+   the bit representation of a non-zero finite floating-point value.  */
+static inline int
+checkint (uint32_t iy)
+{
+  int e = iy >> 23 & 0xff;
+  if (e < 0x7f)
+    return 0;
+  if (e > 0x7f + 23)
+    return 2;
+  if (iy & ((1 << (0x7f + 23 - e)) - 1))
+    return 0;
+  if (iy & (1 << (0x7f + 23 - e)))
+    return 1;
+  return 2;
+}
+
+/* Check if zero, inf or nan.  */
+static inline int
+zeroinfnan (uint32_t ix)
+{
+  return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
+}
+
+/* A scalar subroutine used to fix main power special cases.  Similar to the
+   preamble of finite_powf except that we do not update ix and sign_bias.
+   This is done in the preamble of the SVE powf.  */
+static inline float
+powf_specialcase (float x, float y, float z)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iy = asuint (y);
+  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
+  if (unlikely (zeroinfnan (iy)))
+    {
+      if (2 * iy == 0)
+	return issignalingf_inline (x) ? x + y : 1.0f;
+      if (ix == 0x3f800000)
+	return issignalingf_inline (y) ? x + y : 1.0f;
+      if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
+	return x + y;
+      if (2 * ix == 2 * 0x3f800000)
+	return 1.0f;
+      if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
+	return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.
*/ + return y * y; + } + if (unlikely (zeroinfnan (ix))) + { + float_t x2 = x * x; + if (ix & 0x80000000 && checkint (iy) == 1) + x2 = -x2; + return iy & 0x80000000 ? 1 / x2 : x2; + } + /* We need a return here in case x<0 and y is integer, but all other tests + need to be run. */ + return z; +} + +/* Scalar fallback for special case routines with custom signature. */ +static inline svfloat32_t +sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + float sx1 = svclastb (p, 0, x1); + float sx2 = svclastb (p, 0, x2); + float elem = svclastb (p, 0, y); + elem = powf_specialcase (sx1, sx2, elem); + svfloat32_t y2 = sv_f32 (elem); + y = svsel (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +/* Compute core for half of the lanes in double precision. */ +static inline svfloat64_t +sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k, + svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx, + const struct data *d) +{ + svfloat64_t invc = svld1_gather_index (pg, Tinvc, i); + svfloat64_t logc = svld1_gather_index (pg, Tlogc, i); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc); + svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k)); + + /* Polynomial to approximate log1p(r)/ln2. */ + svfloat64_t logx = A (0); + logx = svmla_x (pg, A (1), r, logx); + logx = svmla_x (pg, A (2), r, logx); + logx = svmla_x (pg, A (3), r, logx); + logx = svmla_x (pg, y0, r, logx); + *pylogx = svmul_x (pg, y, logx); + + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t kd = svadd_x (pg, *pylogx, Shift); + svuint64_t ki = svreinterpret_u64 (kd); + kd = svsub_x (pg, kd, Shift); + + r = svsub_x (pg, *pylogx, kd); + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + svuint64_t t + = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1)); + svuint64_t ski = svadd_x (pg, ki, sign_bias); + t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS)); + svfloat64_t s = svreinterpret_f64 (t); + + svfloat64_t p = C (0); + p = svmla_x (pg, C (1), p, r); + p = svmla_x (pg, C (2), p, r); + p = svmla_x (pg, s, p, svmul_x (pg, s, r)); + + return p; +} + +/* Widen vector to double precision and compute core on both halves of the + vector. Lower cost of promotion by considering all lanes active. */ +static inline svfloat32_t +sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, + svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx, + const struct data *d) +{ + const svbool_t ptrue = svptrue_b64 (); + + /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in + order to perform core computation in double precision. 
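Conceptually this is the standard widen-compute-narrow pattern: each float lane is promoted to double, the pow core runs at double precision, and the two halves are demoted and re-interleaved with svuzp1. Per lane it amounts to no more than the following (illustrative scalar view; core stands in for sv_powf_core_ext and is not a name from the source):

  for (int i = 0; i < N; i++)
    out[i] = (float) core ((double) in[i]);

Doing the polynomial and table arithmetic in double is what helps keep the single-precision error bound at 2.6 ULP.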
*/ + const svbool_t pg_lo = svunpklo (pg); + const svbool_t pg_hi = svunpkhi (pg); + svfloat64_t y_lo = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); + svfloat64_t y_hi = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); + svfloat32_t z = svreinterpret_f32 (iz); + svfloat64_t z_lo = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z)))); + svfloat64_t z_hi = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z)))); + svuint64_t i_lo = svunpklo (i); + svuint64_t i_hi = svunpkhi (i); + svint64_t k_lo = svunpklo (k); + svint64_t k_hi = svunpkhi (k); + svuint64_t sign_bias_lo = svunpklo (sign_bias); + svuint64_t sign_bias_hi = svunpkhi (sign_bias); + + /* Compute each part in double precision. */ + svfloat64_t ylogx_lo, ylogx_hi; + svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo, + sign_bias_lo, &ylogx_lo, d); + svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi, + sign_bias_hi, &ylogx_hi, d); + + /* Convert back to single-precision and interleave. */ + svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo); + svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi); + *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32); + svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo); + svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi); + return svuzp1 (lo_32, hi_32); +} + +/* Implementation of SVE powf. + Provides the same accuracy as AdvSIMD powf, since it relies on the same + algorithm. The theoretical maximum error is under 2.60 ULPs. + Maximum measured error is 2.56 ULPs: + SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127 + want 0x1.fd4b06p+127. */ +svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t vix0 = svreinterpret_u32 (x); + svuint32_t viy0 = svreinterpret_u32 (y); + + /* Negative x cases. */ + svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask); + svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ + svbool_t yisnotint_xisneg = svpfalse_b (); + svuint32_t sign_bias = sv_u32 (0); + svuint32_t vix = vix0; + if (unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ + yisnotint_xisneg = svisnotint (xisneg, y); + svbool_t yisint_xisneg = svisint (xisneg, y); + svbool_t yisodd_xisneg = svisodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ + vix = svand_m (yisint_xisneg, vix0, 0x7fffffff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0)); + } + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (pg, vix0); + svbool_t yspecial = sv_zeroinfnan (pg, viy0); + svbool_t cmp = svorr_z (pg, xspecial, yspecial); + + /* Small cases of x: |x| < 0x1p-126. */ + svbool_t xsmall = svaclt (pg, x, d->small_bound); + if (unlikely (svptest_any (pg, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. */ + svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm)); + vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff); + vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias); + vix = svsel (xsmall, vix_norm, vix); + } + /* Part of core computation carried in working precision. 
*/
+  svuint32_t tmp = svsub_x (pg, vix, d->off);
+  svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+			  V_POWF_LOG2_N - 1);
+  svuint32_t top = svand_x (pg, tmp, 0xff800000);
+  svuint32_t iz = svsub_x (pg, vix, top);
+  svint32_t k
+      = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
+
+  /* Compute core in extended precision and return intermediate ylogx results
+     to handle cases of underflow and overflow in exp.  */
+  svfloat32_t ylogx;
+  svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
+
+  /* Handle exp special cases of underflow and overflow.  */
+  svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+  svfloat32_t ret_oflow
+      = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
+  svfloat32_t ret_uflow = svreinterpret_f32 (sign);
+  ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
+  ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
+
+  /* Cases of finite y and finite negative x.  */
+  ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
+
+  if (unlikely (svptest_any (pg, cmp)))
+    return sv_call_powf_sc (x, y, ret, cmp);
+
+  return ret;
+}
+
+PL_SIG (SV, F, 2, pow)
+PL_TEST_ULP (SV_NAME_F2 (pow), 2.06)
+/* Wide intervals spanning the whole domain but shared between x and y.  */
+#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n)                              \
+  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n)                 \
+  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n)               \
+  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n)               \
+  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
+SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
+SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
+/* x~1 or y~1.  */
+SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
+SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
+SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
+/* around estimated argmaxs of ULP error.  */
+SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
+SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
+/* x is negative, y is odd or even integer, or y is real not integer.  */
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+/* |x| is inf, y is odd or even integer, or y is real not integer.  */
+SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
+SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
+SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
+SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
+/* 0.0^y.  */
+SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
+/* 1.0^y.  */
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_powi.c b/contrib/arm-optimized-routines/pl/math/sv_powi.c
index 1bb0eb3d3498..e53bf2195533 100644
--- a/contrib/arm-optimized-routines/pl/math/sv_powi.c
+++ b/contrib/arm-optimized-routines/pl/math/sv_powi.c
@@ -6,23 +6,22 @@
  */
 
 #include "sv_math.h"
-#if SV_SUPPORTED
 
 /* Optimized double-precision vector powi (double base, long integer power).
powi is developed for environments in which accuracy is of much less importance than performance, hence we provide no estimate for worst-case error. */ svfloat64_t -__sv_powi_x (svfloat64_t as, svint64_t ns, svbool_t p) +_ZGVsMxvv_powk (svfloat64_t as, svint64_t ns, svbool_t p) { /* Compute powi by successive squaring, right to left. */ - svfloat64_t acc = svdup_n_f64 (1.0); - svbool_t want_recip = svcmplt_n_s64 (p, ns, 0); - svuint64_t ns_abs = svreinterpret_u64_s64 (svabs_s64_x (p, ns)); + svfloat64_t acc = sv_f64 (1.0); + svbool_t want_recip = svcmplt (p, ns, 0); + svuint64_t ns_abs = svreinterpret_u64 (svabs_x (p, ns)); /* We use a max to avoid needing to check whether any lane != 0 on each iteration. */ - uint64_t max_n = svmaxv_u64 (p, ns_abs); + uint64_t max_n = svmaxv (p, ns_abs); svfloat64_t c = as; /* Successively square c, and use merging predication (_m) to determine @@ -30,24 +29,20 @@ __sv_powi_x (svfloat64_t as, svint64_t ns, svbool_t p) iteration. */ while (true) { - svbool_t px = svcmpeq_n_u64 (p, svand_n_u64_x (p, ns_abs, 1ull), 1ull); - acc = svmul_f64_m (px, acc, c); + svbool_t px = svcmpeq (p, svand_x (p, ns_abs, 1ull), 1ull); + acc = svmul_m (px, acc, c); max_n >>= 1; if (max_n == 0) break; - ns_abs = svlsr_n_u64_x (p, ns_abs, 1); - c = svmul_f64_x (p, c, c); + ns_abs = svlsr_x (p, ns_abs, 1); + c = svmul_x (p, c, c); } /* Negative powers are handled by computing the abs(n) version and then taking the reciprocal. */ if (svptest_any (want_recip, want_recip)) - acc = svdivr_n_f64_m (want_recip, acc, 1.0); + acc = svdivr_m (want_recip, acc, 1.0); return acc; } - -strong_alias (__sv_powi_x, _ZGVsMxvv_powk) - -#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_powif.c b/contrib/arm-optimized-routines/pl/math/sv_powif.c index d0567e393927..7e032fd86a20 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_powif.c +++ b/contrib/arm-optimized-routines/pl/math/sv_powif.c @@ -6,23 +6,22 @@ */ #include "sv_math.h" -#if SV_SUPPORTED /* Optimized single-precision vector powi (float base, integer power). powi is developed for environments in which accuracy is of much less importance than performance, hence we provide no estimate for worst-case error. */ svfloat32_t -__sv_powif_x (svfloat32_t as, svint32_t ns, svbool_t p) +_ZGVsMxvv_powi (svfloat32_t as, svint32_t ns, svbool_t p) { /* Compute powi by successive squaring, right to left. */ - svfloat32_t acc = svdup_n_f32 (1.f); - svbool_t want_recip = svcmplt_n_s32 (p, ns, 0); - svuint32_t ns_abs = svreinterpret_u32_s32 (svabs_s32_x (p, ns)); + svfloat32_t acc = sv_f32 (1.f); + svbool_t want_recip = svcmplt (p, ns, 0); + svuint32_t ns_abs = svreinterpret_u32 (svabs_x (p, ns)); /* We use a max to avoid needing to check whether any lane != 0 on each iteration. */ - uint32_t max_n = svmaxv_u32 (p, ns_abs); + uint32_t max_n = svmaxv (p, ns_abs); svfloat32_t c = as; /* Successively square c, and use merging predication (_m) to determine @@ -30,25 +29,20 @@ __sv_powif_x (svfloat32_t as, svint32_t ns, svbool_t p) iteration. */ while (true) { - svbool_t px = svcmpeq_n_u32 (p, svand_n_u32_x (p, ns_abs, 1), 1); - acc = svmul_f32_m (px, acc, c); + svbool_t px = svcmpeq (p, svand_x (p, ns_abs, 1), 1); + acc = svmul_m (px, acc, c); max_n >>= 1; if (max_n == 0) break; - ns_abs = svlsr_n_u32_x (p, ns_abs, 1); - c = svmul_f32_x (p, c, c); + ns_abs = svlsr_x (p, ns_abs, 1); + c = svmul_x (p, c, c); } /* Negative powers are handled by computing the abs(n) version and then taking the reciprocal. 
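The loop above is binary exponentiation: each iteration multiplies the accumulator by the current square of the base whenever the low bit of the remaining exponent is set. A scalar reference version of the same scheme (a sketch only; powi_ref is not a name from the source, and the vector code additionally predicates per lane and uses svmaxv to bound the trip count):

  static double
  powi_ref (double a, int64_t n)
  {
    uint64_t m = n < 0 ? -(uint64_t) n : (uint64_t) n;
    double acc = 1.0, c = a;
    for (; m != 0; m >>= 1)
      {
	if (m & 1)
	  acc *= c; /* This bit of the exponent is set.  */
	c *= c;	    /* Successively square the base.  */
      }
    return n < 0 ? 1.0 / acc : acc;
  }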
*/ if (svptest_any (want_recip, want_recip)) - acc = svdivr_n_f32_m (want_recip, acc, 1.0f); + acc = svdivr_m (want_recip, acc, 1.0f); return acc; } - -/* Note no trailing f for ZGV... name - 64-bit integer version is powk. */ -strong_alias (__sv_powif_x, _ZGVsMxvv_powi) - -#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c b/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c deleted file mode 100644 index 3fee08061918..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Double-precision SVE sin(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if SV_SUPPORTED - -#define InvPi (sv_f64 (0x1.45f306dc9c883p-2)) -#define HalfPi (sv_f64 (0x1.921fb54442d18p+0)) -#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) -#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) -#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) -#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) -#define Shift (sv_f64 (0x1.8p52)) -#define RangeVal (sv_f64 (0x1p23)) -#define AbsMask (0x7fffffffffffffff) - -static NOINLINE sv_f64_t -__sv_sin_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) -{ - return sv_call_f64 (sin, x, y, cmp); -} - -/* A fast SVE implementation of sin based on trigonometric - instructions (FTMAD, FTSSEL, FTSMUL). - Maximum observed error in 2.52 ULP: - __sv_sin(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40 - want 0x1.10ace8f3e7868p-40. */ -sv_f64_t -__sv_sin_x (sv_f64_t x, const svbool_t pg) -{ - sv_f64_t n, r, r2, y; - sv_u64_t sign; - svbool_t cmp; - - r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); - sign = svand_n_u64_x (pg, sv_as_u64_f64 (x), ~AbsMask); - cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); - - /* n = rint(|x|/(pi/2)). */ - sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); - n = svsub_f64_x (pg, q, Shift); - - /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ - r = sv_fma_f64_x (pg, NegPio2_1, n, r); - r = sv_fma_f64_x (pg, NegPio2_2, n, r); - r = sv_fma_f64_x (pg, NegPio2_3, n, r); - - /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ - sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); - - /* sin(r) poly approx. */ - r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); - y = sv_f64 (0.0); - y = svtmad_f64 (y, r2, 7); - y = svtmad_f64 (y, r2, 6); - y = svtmad_f64 (y, r2, 5); - y = svtmad_f64 (y, r2, 4); - y = svtmad_f64 (y, r2, 3); - y = svtmad_f64 (y, r2, 2); - y = svtmad_f64 (y, r2, 1); - y = svtmad_f64 (y, r2, 0); - - /* Apply factor. */ - y = svmul_f64_x (pg, f, y); - - /* sign = y^sign. */ - y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); - - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. */ - if (unlikely (svptest_any (pg, cmp))) - return __sv_sin_specialcase (x, y, cmp); - return y; -} - -PL_ALIAS (__sv_sin_x, _ZGVsMxv_sin) - -PL_SIG (SV, D, 1, sin, -3.1, 3.1) -PL_TEST_ULP (__sv_sin, 2.03) -PL_TEST_INTERVAL (__sv_sin, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (__sv_sin, 0x1p-4, 0x1p4, 500000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_sin_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_sin_3u5.c new file mode 100644 index 000000000000..a81f3fc80f3d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sin_3u5.c @@ -0,0 +1,96 @@ +/* + * Double-precision SVE sin(x) function. 
+ * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + double inv_pi, pi_1, pi_2, pi_3, shift, range_val; + double poly[7]; +} data = { + .poly = { -0x1.555555555547bp-3, 0x1.1111111108a4dp-7, -0x1.a01a019936f27p-13, + 0x1.71de37a97d93ep-19, -0x1.ae633919987c6p-26, + 0x1.60e277ae07cecp-33, -0x1.9e9540300a1p-41, }, + + .inv_pi = 0x1.45f306dc9c883p-2, + .pi_1 = 0x1.921fb54442d18p+1, + .pi_2 = 0x1.1a62633145c06p-53, + .pi_3 = 0x1.c1cd129024e09p-106, + .shift = 0x1.8p52, + .range_val = 0x1p23, +}; + +#define C(i) sv_f64 (d->poly[i]) + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) +{ + return sv_call_f64 (sin, x, y, cmp); +} + +/* A fast SVE implementation of sin. + Maximum observed error in [-pi/2, pi/2], where argument is not reduced, + is 2.87 ULP: + _ZGVsMxv_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 + want 0x1.fffffffa7dc05p-1 + Maximum observed error in the entire non-special domain ([-2^23, 2^23]) + is 3.22 ULP: + _ZGVsMxv_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 + want 0x1.ffdcd125c84f8p-3. */ +svfloat64_t SV_NAME_D1 (sin) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Load some values in quad-word chunks to minimise memory access. */ + const svbool_t ptrue = svptrue_b64 (); + svfloat64_t shift = sv_f64 (d->shift); + svfloat64_t inv_pi_and_pi1 = svld1rq (ptrue, &d->inv_pi); + svfloat64_t pi2_and_pi3 = svld1rq (ptrue, &d->pi_2); + + /* n = rint(|x|/pi). */ + svfloat64_t n = svmla_lane (shift, x, inv_pi_and_pi1, 0); + svuint64_t odd = svlsl_x (pg, svreinterpret_u64 (n), 63); + n = svsub_x (pg, n, shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/2 .. pi/2). */ + svfloat64_t r = x; + r = svmls_lane (r, n, inv_pi_and_pi1, 1); + r = svmls_lane (r, n, pi2_and_pi3, 0); + r = svmls_lane (r, n, pi2_and_pi3, 1); + + /* sin(r) poly approx. */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r3 = svmul_x (pg, r2, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); + + svfloat64_t t1 = svmla_x (pg, C (4), C (5), r2); + svfloat64_t t2 = svmla_x (pg, C (2), C (3), r2); + svfloat64_t t3 = svmla_x (pg, C (0), C (1), r2); + + svfloat64_t y = svmla_x (pg, t1, C (6), r4); + y = svmla_x (pg, t2, y, r4); + y = svmla_x (pg, t3, y, r4); + y = svmla_x (pg, r, y, r3); + + svbool_t cmp = svacle (pg, x, d->range_val); + cmp = svnot_z (pg, cmp); + if (unlikely (svptest_any (pg, cmp))) + return special_case (x, + svreinterpret_f64 (sveor_z ( + svnot_z (pg, cmp), svreinterpret_u64 (y), odd)), + cmp); + + /* Copy sign. */ + return svreinterpret_f64 (sveor_z (pg, svreinterpret_u64 (y), odd)); +} + +PL_SIG (SV, D, 1, sin, -3.1, 3.1) +PL_TEST_ULP (SV_NAME_D1 (sin), 2.73) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0, 0x1p23, 1000000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0x1p23, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_sincos_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_sincos_3u5.c new file mode 100644 index 000000000000..f73550082d5b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sincos_3u5.c @@ -0,0 +1,61 @@ +/* + * Double-precision vector sincos function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincos declaration. 
If building
+   pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+   be linked against the scalar sincos from math/.  */
+#define _GNU_SOURCE
+#include <math.h>
+#undef _GNU_SOURCE
+
+#include "sv_sincos_common.h"
+#include "sv_math.h"
+#include "pl_test.h"
+
+static void NOINLINE
+special_case (svfloat64_t x, svbool_t special, double *out_sin,
+	      double *out_cos)
+{
+  svbool_t p = svptrue_pat_b64 (SV_VL1);
+  for (int i = 0; i < svcntd (); i++)
+    {
+      if (svptest_any (special, p))
+	sincos (svlastb (p, x), out_sin + i, out_cos + i);
+      p = svpnext_b64 (svptrue_b64 (), p);
+    }
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos
+   in one function call, using shared argument reduction and separate
+   polynomials.
+   Largest observed error is for sin, 3.22 ULP:
+   sv_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+					 want -0x1.ffe9537d5dbb4p-3.  */
+void
+_ZGVsMxvl8l8_sincos (svfloat64_t x, double *out_sin, double *out_cos,
+		     svbool_t pg)
+{
+  const struct sv_sincos_data *d = ptr_barrier (&sv_sincos_data);
+  svbool_t special = check_ge_rangeval (pg, x, d);
+
+  svfloat64x2_t sc = sv_sincos_inline (pg, x, d);
+
+  svst1 (pg, out_sin, svget2 (sc, 0));
+  svst1 (pg, out_cos, svget2 (sc, 1));
+
+  if (unlikely (svptest_any (pg, special)))
+    special_case (x, special, out_sin, out_cos);
+}
+
+PL_TEST_ULP (_ZGVsMxv_sincos_sin, 2.73)
+PL_TEST_ULP (_ZGVsMxv_sincos_cos, 2.73)
+#define SV_SINCOS_INTERVAL(lo, hi, n)                                         \
+  PL_TEST_INTERVAL (_ZGVsMxv_sincos_sin, lo, hi, n)                           \
+  PL_TEST_INTERVAL (_ZGVsMxv_sincos_cos, lo, hi, n)
+SV_SINCOS_INTERVAL (0, 0x1p23, 500000)
+SV_SINCOS_INTERVAL (-0, -0x1p23, 500000)
+SV_SINCOS_INTERVAL (0x1p23, inf, 10000)
+SV_SINCOS_INTERVAL (-0x1p23, -inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_sincos_common.h b/contrib/arm-optimized-routines/pl/math/sv_sincos_common.h
new file mode 100644
index 000000000000..f7b58deb90bd
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_sincos_common.h
@@ -0,0 +1,85 @@
+/*
+ * Core approximation for double-precision vector sincos
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct sv_sincos_data
+{
+  double sin_poly[7], cos_poly[6], pio2[3];
+  double inv_pio2, shift, range_val;
+} sv_sincos_data = {
+  .inv_pio2 = 0x1.45f306dc9c882p-1,
+  .pio2 = { 0x1.921fb50000000p+0, 0x1.110b460000000p-26,
+	    0x1.1a62633145c07p-54 },
+  .shift = 0x1.8p52,
+  .sin_poly = { /* Computed using Remez in [-pi/2, pi/2].  */
+		-0x1.555555555547bp-3, 0x1.1111111108a4dp-7,
+		-0x1.a01a019936f27p-13, 0x1.71de37a97d93ep-19,
+		-0x1.ae633919987c6p-26, 0x1.60e277ae07cecp-33,
+		-0x1.9e9540300a1p-41 },
+  .cos_poly = { /* Computed using Remez in [-pi/4, pi/4].  */
+		0x1.555555555554cp-5, -0x1.6c16c16c1521fp-10,
+		0x1.a01a019cbf62ap-16, -0x1.27e4f812b681ep-22,
+		0x1.1ee9f152a57cdp-29, -0x1.8fb131098404bp-37 },
+  .range_val = 0x1p23, };
+
+static inline svbool_t
+check_ge_rangeval (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d)
+{
+  svbool_t in_bounds = svaclt (pg, x, d->range_val);
+  return svnot_z (pg, in_bounds);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos
+   in one function call, using shared argument reduction and separate
+   polynomials.
+   Largest observed error is for sin, 3.22 ULP:
+   v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+					want -0x1.ffe9537d5dbb4p-3.
*/
+static inline svfloat64x2_t
+sv_sincos_inline (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d)
+{
+  /* q = nearest integer to 2 * x / pi.  */
+  svfloat64_t q = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_pio2),
+			   d->shift);
+  svint64_t n = svcvt_s64_x (pg, q);
+
+  /* Reduce x such that r is in [ -pi/4, pi/4 ].  */
+  svfloat64_t r = x;
+  r = svmls_x (pg, r, q, d->pio2[0]);
+  r = svmls_x (pg, r, q, d->pio2[1]);
+  r = svmls_x (pg, r, q, d->pio2[2]);
+
+  svfloat64_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r2, r),
+	      r4 = svmul_x (pg, r2, r2);
+
+  /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2).  */
+  svfloat64_t s = sv_pw_horner_6_f64_x (pg, r2, r4, d->sin_poly);
+  s = svmla_x (pg, r, r3, s);
+
+  /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2).  */
+  svfloat64_t c = sv_pw_horner_5_f64_x (pg, r2, r4, d->cos_poly);
+  c = svmad_x (pg, c, r2, -0.5);
+  c = svmad_x (pg, c, r2, 1);
+
+  svuint64_t un = svreinterpret_u64 (n);
+  /* If odd quadrant, swap cos and sin.  */
+  svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 63), 0);
+  svfloat64_t ss = svsel (swap, s, c);
+  svfloat64_t cc = svsel (swap, c, s);
+
+  /* Fix signs according to quadrant.
+     ss = asdouble(asuint64(ss) ^ ((n & 2) << 62))
+     cc = asdouble(asuint64(cc) ^ (((n + 1) & 2) << 62)).  */
+  svuint64_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 62);
+  svuint64_t cos_sign = svlsl_x (
+      pg, svand_x (pg, svreinterpret_u64 (svadd_x (pg, n, 1)), 2), 62);
+  ss = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ss), sin_sign));
+  cc = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (cc), cos_sign));
+
+  return svcreate2 (ss, cc);
+}
diff --git a/contrib/arm-optimized-routines/pl/math/sv_sincosf_1u8.c b/contrib/arm-optimized-routines/pl/math/sv_sincosf_1u8.c
new file mode 100644
index 000000000000..c335de8d3dbb
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_sincosf_1u8.c
@@ -0,0 +1,62 @@
+/*
+ * Single-precision vector sincos function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include sincosf declaration.  If building
+   pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+   be linked against the scalar sincosf from math/.  */
+#define _GNU_SOURCE
+#include <math.h>
+#undef _GNU_SOURCE
+
+#include "sv_sincosf_common.h"
+#include "sv_math.h"
+#include "pl_test.h"
+
+static void NOINLINE
+special_case (svfloat32_t x, svbool_t special, float *out_sin, float *out_cos)
+{
+  svbool_t p = svptrue_pat_b32 (SV_VL1);
+  for (int i = 0; i < svcntw (); i++)
+    {
+      if (svptest_any (special, p))
+	sincosf (svlastb (p, x), out_sin + i, out_cos + i);
+      p = svpnext_b32 (svptrue_b32 (), p);
+    }
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos
+   in one function call, using shared argument reduction and separate
+   low-order polynomials.
+   Worst-case error for sin is 1.67 ULP:
+   sv_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+   Worst-case error for cos is 1.81 ULP:
+   sv_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6.
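The quadrant bookkeeping used by both sincos kernels reduces to bit tests on n = rint(x / (pi/2)); with s = sin(r) and c = cos(r) of the reduced argument, a scalar rendering is:

  if (n & 1) /* Odd quadrant: sin and cos swap roles.  */
    {
      double t = s;
      s = c;
      c = t;
    }
  if (n & 2) /* Quadrants 2 and 3: sin(x) is negated.  */
    s = -s;
  if ((n + 1) & 2) /* Quadrants 1 and 2: cos(x) is negated.  */
    c = -c;

The vector code performs the swap with svsel on bit 0 and applies the negations branch-free, by XORing (n & 2) shifted up into the sign bit (by 62 for doubles, 30 for floats).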
*/ +void +_ZGVsMxvl4l4_sincosf (svfloat32_t x, float *out_sin, float *out_cos, + svbool_t pg) +{ + const struct sv_sincosf_data *d = ptr_barrier (&sv_sincosf_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat32x2_t sc = sv_sincosf_inline (pg, x, d); + + svst1_f32 (pg, out_sin, svget2 (sc, 0)); + svst1_f32 (pg, out_cos, svget2 (sc, 1)); + + if (unlikely (svptest_any (pg, special))) + special_case (x, special, out_sin, out_cos); +} + +PL_TEST_ULP (_ZGVsMxv_sincosf_sin, 1.17) +PL_TEST_ULP (_ZGVsMxv_sincosf_cos, 1.31) +#define SV_SINCOSF_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_sincosf_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_sincosf_cos, lo, hi, n) +SV_SINCOSF_INTERVAL (0, 0x1p20, 500000) +SV_SINCOSF_INTERVAL (-0, -0x1p20, 500000) +SV_SINCOSF_INTERVAL (0x1p20, inf, 10000) +SV_SINCOSF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_sincosf_common.h b/contrib/arm-optimized-routines/pl/math/sv_sincosf_common.h new file mode 100644 index 000000000000..714e996443b3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sincosf_common.h @@ -0,0 +1,81 @@ +/* + * Core approximation for single-precision vector sincos + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" + +const static struct sv_sincosf_data +{ + float poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val; +} sv_sincosf_data = { + .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */ + -0x1.555546p-3, 0x1.11076p-7, -0x1.994eb4p-13 }, + .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */ + 0x1.55554ap-5, -0x1.6c0c1ap-10, 0x1.99e0eep-16 }, + .pio2 = { 0x1.921fb6p+0f, -0x1.777a5cp-25f, -0x1.ee59dap-50f }, + .inv_pio2 = 0x1.45f306p-1f, + .shift = 0x1.8p23, + .range_val = 0x1p20 +}; + +static inline svbool_t +check_ge_rangeval (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d) +{ + svbool_t in_bounds = svaclt (pg, x, d->range_val); + return svnot_z (pg, in_bounds); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + sv_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + sv_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +static inline svfloat32x2_t +sv_sincosf_inline (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d) +{ + /* n = rint ( x / (pi/2) ). */ + svfloat32_t q = svmla_x (pg, sv_f32 (d->shift), x, d->inv_pio2); + q = svsub_x (pg, q, d->shift); + svint32_t n = svcvt_s32_x (pg, q); + + /* Reduce x such that r is in [ -pi/4, pi/4 ]. */ + svfloat32_t r = x; + r = svmls_x (pg, r, q, d->pio2[0]); + r = svmls_x (pg, r, q, d->pio2[1]); + r = svmls_x (pg, r, q, d->pio2[2]); + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + svfloat32_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r, r2); + svfloat32_t s = svmla_x (pg, sv_f32 (d->poly_sin[1]), r2, d->poly_sin[2]); + s = svmad_x (pg, r2, s, d->poly_sin[0]); + s = svmla_x (pg, r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). 
*/
+  svfloat32_t r4 = svmul_x (pg, r2, r2);
+  svfloat32_t p = svmla_x (pg, sv_f32 (d->poly_cos[1]), r2, d->poly_cos[2]);
+  svfloat32_t c = svmad_x (pg, sv_f32 (d->poly_cos[0]), r2, -0.5);
+  c = svmla_x (pg, c, r4, p);
+  c = svmad_x (pg, r2, c, 1);
+
+  svuint32_t un = svreinterpret_u32 (n);
+  /* If odd quadrant, swap cos and sin. */
+  svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 31), 0);
+  svfloat32_t ss = svsel (swap, s, c);
+  svfloat32_t cc = svsel (swap, c, s);
+
+  /* Fix signs according to quadrant.
+     ss = asfloat(asuint(ss) ^ ((n & 2) << 30))
+     cc = asfloat(asuint(cc) ^ (((n + 1) & 2) << 30)). */
+  svuint32_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 30);
+  svuint32_t cos_sign = svlsl_x (
+      pg, svand_x (pg, svreinterpret_u32 (svadd_x (pg, n, 1)), 2), 30);
+  ss = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ss), sin_sign));
+  cc = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (cc), cos_sign));
+
+  return svcreate2 (ss, cc);
+}
diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c b/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c
index 9184ccd3cf0c..675d7b2480f7 100644
--- a/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c
+++ b/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c
@@ -9,23 +9,31 @@
 #include "pl_sig.h"
 #include "pl_test.h"
 
-#if SV_SUPPORTED
+static const struct data
+{
+  float poly[4];
+  /* Pi-related values to be loaded as one quad-word and used with
+     svmla_lane. */
+  float negpi1, negpi2, negpi3, invpi;
+  float shift;
+} data = {
+  .poly = {
+    /* Non-zero coefficients from the degree 9 Taylor series expansion of
+       sin. */
+    -0x1.555548p-3f, 0x1.110df4p-7f, -0x1.9f42eap-13f, 0x1.5b2e76p-19f
+  },
+  .negpi1 = -0x1.921fb6p+1f,
+  .negpi2 = 0x1.777a5cp-24f,
+  .negpi3 = 0x1.ee59dap-49f,
+  .invpi = 0x1.45f306p-2f,
+  .shift = 0x1.8p+23f
+};
 
-#define A3 (sv_f32 (__sv_sinf_data.coeffs[3]))
-#define A5 (sv_f32 (__sv_sinf_data.coeffs[2]))
-#define A7 (sv_f32 (__sv_sinf_data.coeffs[1]))
-#define A9 (sv_f32 (__sv_sinf_data.coeffs[0]))
+#define RangeVal 0x49800000 /* asuint32 (0x1p20f). */
+#define C(i) sv_f32 (d->poly[i])
 
-#define NegPi1 (sv_f32 (-0x1.921fb6p+1f))
-#define NegPi2 (sv_f32 (0x1.777a5cp-24f))
-#define NegPi3 (sv_f32 (0x1.ee59dap-49f))
-#define RangeVal (sv_f32 (0x1p20f))
-#define InvPi (sv_f32 (0x1.45f306p-2f))
-#define Shift (sv_f32 (0x1.8p+23f))
-#define AbsMask (0x7fffffff)
-
-static NOINLINE sv_f32_t
-__sv_sinf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
 {
   return sv_call_f32 (sinf, x, y, cmp);
 }
@@ -34,51 +42,52 @@ __sv_sinf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
    Maximum error: 1.89 ULPs.
    This maximum error is achieved at multiple values in [-2^18, 2^18]
    but one example is:
-   __sv_sinf(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1. */
-sv_f32_t
-__sv_sinf_x (sv_f32_t x, const svbool_t pg)
+   SV_NAME_F1 (sin)(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1.
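
In scalar terms, the sin and cos cores above reduce to a handful of fused multiply-adds over the poly_sin and poly_cos coefficients from sv_sincosf_data. A minimal sketch, with fmaf standing in for the predicated svmla/svmad calls:

    #include <math.h>

    static void
    sincosf_poly_model (float r, float *s_out, float *c_out)
    {
      float r2 = r * r, r3 = r2 * r, r4 = r2 * r2;
      /* sin(r) ~= r + r^3 * poly_sin(r^2).  */
      float s = fmaf (r2, -0x1.994eb4p-13f, 0x1.11076p-7f);
      s = fmaf (r2, s, -0x1.555546p-3f);
      *s_out = fmaf (r3, s, r);
      /* cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2).  */
      float p = fmaf (r2, 0x1.99e0eep-16f, -0x1.6c0c1ap-10f);
      float c = fmaf (0x1.55554ap-5f, r2, -0.5f);
      c = fmaf (r4, p, c);
      *c_out = fmaf (r2, c, 1.0f);
    }
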
*/ +svfloat32_t SV_NAME_F1 (sin) (svfloat32_t x, const svbool_t pg) { - sv_f32_t n, r, r2, y; - sv_u32_t sign, odd; - svbool_t cmp; + const struct data *d = ptr_barrier (&data); - r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); - sign = svand_n_u32_x (pg, sv_as_u32_f32 (x), ~AbsMask); - cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + svfloat32_t ax = svabs_x (pg, x); + svuint32_t sign + = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax)); + svbool_t cmp = svcmpge (pg, svreinterpret_u32 (ax), RangeVal); + + /* pi_vals are a quad-word of helper values - the first 3 elements contain + -pi in extended precision, the last contains 1 / pi. */ + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->negpi1); /* n = rint(|x|/pi). */ - n = sv_fma_f32_x (pg, InvPi, r, Shift); - odd = svlsl_n_u32_x (pg, sv_as_u32_f32 (n), 31); - n = svsub_f32_x (pg, n, Shift); + svfloat32_t n = svmla_lane (sv_f32 (d->shift), ax, pi_vals, 3); + svuint32_t odd = svlsl_x (pg, svreinterpret_u32 (n), 31); + n = svsub_x (pg, n, d->shift); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = sv_fma_f32_x (pg, NegPi1, n, r); - r = sv_fma_f32_x (pg, NegPi2, n, r); - r = sv_fma_f32_x (pg, NegPi3, n, r); + svfloat32_t r; + r = svmla_lane (ax, n, pi_vals, 0); + r = svmla_lane (r, n, pi_vals, 1); + r = svmla_lane (r, n, pi_vals, 2); /* sin(r) approx using a degree 9 polynomial from the Taylor series expansion. Note that only the odd terms of this are non-zero. */ - r2 = svmul_f32_x (pg, r, r); - y = sv_fma_f32_x (pg, A9, r2, A7); - y = sv_fma_f32_x (pg, y, r2, A5); - y = sv_fma_f32_x (pg, y, r2, A3); - y = sv_fma_f32_x (pg, svmul_f32_x (pg, y, r2), r, r); + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t y; + y = svmla_x (pg, C (2), r2, C (3)); + y = svmla_x (pg, C (1), r2, y); + y = svmla_x (pg, C (0), r2, y); + y = svmla_x (pg, r, r, svmul_x (pg, y, r2)); /* sign = y^sign^odd. */ - y = sv_as_f32_u32 ( - sveor_u32_x (pg, sv_as_u32_f32 (y), sveor_u32_x (pg, sign, odd))); + sign = sveor_x (pg, sign, odd); - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. */ if (unlikely (svptest_any (pg, cmp))) - return __sv_sinf_specialcase (x, y, cmp); - return y; + return special_case (x, + svreinterpret_f32 (sveor_x ( + svnot_z (pg, cmp), svreinterpret_u32 (y), sign)), + cmp); + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); } -PL_ALIAS (__sv_sinf_x, _ZGVsMxv_sinf) - PL_SIG (SV, F, 1, sin, -3.1, 3.1) -PL_TEST_ULP (__sv_sinf, 1.40) -PL_TEST_INTERVAL (__sv_sinf, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (__sv_sinf, 0x1p-4, 0x1p4, 500000) -#endif +PL_TEST_ULP (SV_NAME_F1 (sin), 1.40) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0, 0x1p23, 1000000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0x1p23, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c b/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c deleted file mode 100644 index 1e1ab5e48df1..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Data used in single-precision sin(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Polynomial coefficients for approximating sin(x) in single - precision. These are the non-zero coefficients from the - degree 9 Taylor series expansion of sin. 
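
The quad-word constant trick used above deserves spelling out: svld1rq loads 128 bits (four consecutive floats) and replicates them to every quadword of the vector, after which svmla_lane can address any of the four as the multiplicand without spending a vector register per constant. A standalone sketch of the reduction step, assuming a hypothetical struct mirroring the layout of the data above (negpi1..negpi3 followed by 1/pi):

    #include <arm_sve.h>

    struct pi_consts { float negpi1, negpi2, negpi3, invpi; };

    static svfloat32_t
    reduce_sketch (svbool_t pg, svfloat32_t ax, const struct pi_consts *d,
                   float shift)
    {
      /* Broadcast the 128-bit quad of constants across the vector.  */
      svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->negpi1);
      /* n = rint(ax / pi) via the shift trick, lane 3 holding 1/pi.  */
      svfloat32_t n = svmla_lane (svdup_f32 (shift), ax, pi_vals, 3);
      n = svsub_x (pg, n, svdup_f32 (shift));
      /* r = ax - n * pi, using the three -pi terms in lanes 0..2.  */
      svfloat32_t r = svmla_lane (ax, n, pi_vals, 0);
      r = svmla_lane (r, n, pi_vals, 1);
      r = svmla_lane (r, n, pi_vals, 2);
      return r;
    }

Storing -pi rather than pi lets the reduction use svmla_lane throughout, since there is no svmls_lane-with-add form that would otherwise be needed.
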
*/ - -const struct sv_sinf_data __sv_sinf_data = {.coeffs = { - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, - }}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/sv_sinh_3u.c new file mode 100644 index 000000000000..a01e19caecda --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sinh_3u.c @@ -0,0 +1,103 @@ +/* + * Double-precision SVE sinh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[11]; + float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift; + uint64_t halff; + int64_t onef; + uint64_t large_bound; +} data = { + /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ + .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, + 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, + 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, + 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, + + .inv_ln2 = 0x1.71547652b82fep0, + .m_ln2_hi = -0x1.62e42fefa39efp-1, + .m_ln2_lo = -0x1.abc9e3b39803fp-56, + .shift = 0x1.8p52, + + .halff = 0x3fe0000000000000, + .onef = 0x3ff0000000000000, + /* 2^9. expm1 helper overflows for large input. */ + .large_bound = 0x4080000000000000, +}; + +static inline svfloat64_t +expm1_inline (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Reduce argument: + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where i = round(x / ln2) + and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ + svfloat64_t j + = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); + svint64_t i = svcvt_s64_x (pg, j); + svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi); + f = svmla_x (pg, f, j, d->m_ln2_lo); + /* Approximate expm1(f) using polynomial. */ + svfloat64_t f2 = svmul_x (pg, f, f); + svfloat64_t f4 = svmul_x (pg, f2, f2); + svfloat64_t f8 = svmul_x (pg, f4, f4); + svfloat64_t p + = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); + /* t = 2^i. */ + svfloat64_t t = svscale_x (pg, sv_f64 (1), i); + /* expm1(x) ~= p * t + (t - 1). */ + return svmla_x (pg, svsub_x (pg, t, 1.0), p, t); +} + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svbool_t pg) +{ + return sv_call_f64 (sinh, x, x, pg); +} + +/* Approximation for SVE double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.57 ULP: + _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2 + want 0x1.ab929fc64bd63p-2. */ +svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svuint64_t sign + = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); + svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff)); + + svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound); + + /* Fall back to scalar variant for all lanes if any are special. */ + if (unlikely (svptest_any (pg, special))) + return special_case (x, pg); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of sinh. This allows us to + retain acceptable accuracy for very small inputs. 
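
The expm1_inline helper above follows the classic reduce-approximate-reconstruct scheme, and its reconstruction step is easy to model in scalar code. A sketch using the same constants as the data above, where nearbyint stands in for the shift trick, expm1 stands in for the degree-12 polynomial, and ldexp plays the role of svscale:

    #include <math.h>

    static double
    expm1_reconstruct_model (double x)
    {
      double j = nearbyint (x * 0x1.71547652b82fep0);  /* round (x / ln2) */
      double f = fma (j, -0x1.62e42fefa39efp-1, x);    /* x - j * ln2_hi */
      f = fma (j, -0x1.abc9e3b39803fp-56, f);          /* ... - j * ln2_lo */
      double p = expm1 (f);            /* stands in for the polynomial */
      double t = ldexp (1.0, (int) j); /* t = 2^j, the svscale step */
      /* expm1(x) = p * t + (t - 1), since t * (expm1(f) + 1) - 1 = t*e^f - 1.  */
      return fma (p, t, t - 1.0);
    }
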
*/
+  svfloat64_t t = expm1_inline (ax, pg);
+  t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+  return svmul_x (pg, t, halfsign);
+}
+
+PL_SIG (SV, D, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (sinh), 2.08)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p9, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/sv_sinhf_2u3.c
new file mode 100644
index 000000000000..e34ecf378ad3
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_sinhf_2u3.c
@@ -0,0 +1,64 @@
+/*
+ * Single-precision SVE sinh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_expm1f_inline.h"
+
+static const struct data
+{
+  struct sv_expm1f_data expm1f_consts;
+  uint32_t halff, large_bound;
+} data = {
+  .expm1f_consts = SV_EXPM1F_DATA,
+  .halff = 0x3f000000,
+  /* 0x1.61814ep+6, above which expm1f helper overflows. */
+  .large_bound = 0x42b0c0a7,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+  return sv_call_f32 (sinhf, x, y, pg);
+}
+
+/* Approximation for SVE single-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The maximum error is 2.26 ULP:
+   _ZGVsMxv_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+				  want 0x1.e469e4p-4. */
+svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  svfloat32_t ax = svabs_x (pg, x);
+  svuint32_t sign
+      = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+  svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+
+  svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->large_bound);
+
+  /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+     using a slight rearrangement of the definition of sinh. This allows us to
+     retain acceptable accuracy for very small inputs. */
+  svfloat32_t t = expm1f_inline (ax, pg, &d->expm1f_consts);
+  t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+
+  /* Fall back to the scalar variant for any lanes which would cause
+     expm1f to overflow. */
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svmul_x (pg, t, halfsign), special);
+
+  return svmul_x (pg, t, halfsign);
+}
+
+PL_SIG (SV, F, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (sinh), 1.76)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0, 0x1.6a09e8p-32, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x1.6a09e8p-32, 0x42b0c0a7, 100000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinpi_3u1.c b/contrib/arm-optimized-routines/pl/math/sv_sinpi_3u1.c
new file mode 100644
index 000000000000..c9f23da1b19b
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_sinpi_3u1.c
@@ -0,0 +1,57 @@
+/*
+ * Double-precision SVE sinpi(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+  double poly[10];
+} data = {
+  /* Polynomial coefficients generated using Remez algorithm,
+     see sinpi.sollya for details.
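
The rearrangement used by both sinh variants can be stated directly: with t = expm1(|x|), we have t/(t + 1) = 1 - e^-|x|, so t + t/(t + 1) = e^|x| - e^-|x| = 2 sinh(|x|), and multiplying by halfsign (which is +-0.5 with the sign of x) restores both scale and sign. A scalar rendering, with libm expm1 standing in for the inline helper:

    #include <math.h>

    static double
    sinh_identity_model (double x)
    {
      double halfsign = copysign (0.5, x);
      double t = expm1 (fabs (x));
      return (t + t / (t + 1.0)) * halfsign;
    }

This is why the routines stay accurate for tiny inputs: expm1(|x|) ~ |x| keeps full relative precision where exp(x) - exp(-x) would cancel catastrophically.
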
*/
+  .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1,
+	    -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+	    0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16,
+	    0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 },
+};
+
+/* A fast SVE implementation of sinpi.
+   Maximum error 3.10 ULP:
+   _ZGVsMxv_sinpi(0x1.df1a14f1b235p-2) got 0x1.fd64f541606cp-1
+				       want 0x1.fd64f541606c3p-1. */
+svfloat64_t SV_NAME_D1 (sinpi) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Range reduction into [-1/2, 1/2]
+     with n = rint(x) and r = x - n. */
+  svfloat64_t n = svrinta_x (pg, x);
+  svfloat64_t r = svsub_x (pg, x, n);
+
+  /* Result should be negated based on whether n is odd or not. */
+  svuint64_t intn = svreinterpret_u64 (svcvt_s64_x (pg, n));
+  svuint64_t sign = svlsl_z (pg, intn, 63);
+
+  /* y = sin(pi * r). */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t r4 = svmul_x (pg, r2, r2);
+  svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly);
+  y = svmul_x (pg, y, r);
+
+  return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+}
+
+PL_SIG (SV, D, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_D1 (sinpi), 2.61)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p51, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinpif_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_sinpif_2u5.c
new file mode 100644
index 000000000000..ac3f924bed68
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_sinpif_2u5.c
@@ -0,0 +1,53 @@
+/*
+ * Single-precision SVE sinpi(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+  float poly[6];
+} data = {
+  /* Taylor series coefficients for sin(pi * x). */
+  .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
+	    0x1.50783p-4f, -0x1.e30750p-8f },
+};
+
+/* A fast SVE implementation of sinpif.
+   Maximum error 2.48 ULP:
+   _ZGVsMxv_sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1
+				  want 0x1.fa8c02p-1. */
+svfloat32_t SV_NAME_F1 (sinpi) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Range reduction into [-1/2, 1/2]
+     with n = rint(x) and r = x - n. */
+  svfloat32_t n = svrinta_x (pg, x);
+  svfloat32_t r = svsub_x (pg, x, n);
+
+  /* Result should be negated based on whether n is odd or not. */
+  svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n));
+  svuint32_t sign = svlsl_z (pg, intn, 31);
+
+  /* y = sin(pi * r).
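
Both sinpi variants share the same skeleton, which is easy to model in scalar code. In the sketch below, nearbyint plays the role of svrinta, libm sin stands in for the odd polynomial in r^2, and the integer cast assumes |x| is small enough for the conversion to be exact, just as the vector code does:

    #include <math.h>
    #include <stdint.h>

    static double
    sinpi_model (double x)
    {
      double n = nearbyint (x);   /* svrinta */
      double r = x - n;           /* r in [-1/2, 1/2] */
      double y = sin (M_PI * r);  /* stand-in for r * poly(r^2) */
      /* Negate when n is odd; the vector code does this by shifting the
         low bit of n into the sign-bit position and xor-ing.  */
      return ((int64_t) n & 1) ? -y : y;
    }
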
*/
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly);
+  y = svmul_x (pg, y, r);
+
+  return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
+}
+
+PL_SIG (SV, F, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_F1 (sinpi), 1.99)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0.5, 0x1p22f, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p22f, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_tan_3u5.c
new file mode 100644
index 000000000000..746396e98a10
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_tan_3u5.c
@@ -0,0 +1,99 @@
+/*
+ * Double-precision SVE tan(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  double poly[9];
+  double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift;
+} data = {
+  /* Polynomial generated with FPMinimax. */
+  .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5,
+	    0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9,
+	    0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11,
+	    0x1.4e4fd14147622p-12, },
+  .half_pi_hi = 0x1.921fb54442d18p0,
+  .half_pi_lo = 0x1.1a62633145c07p-54,
+  .inv_half_pi = 0x1.45f306dc9c883p-1,
+  .range_val = 0x1p23,
+  .shift = 0x1.8p52,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+  return sv_call_f64 (tan, x, y, special);
+}
+
+/* Vector approximation for double-precision tan.
+   Maximum measured error is 3.48 ULP:
+   _ZGVsMxv_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+				       want -0x1.f6ccd8ecf7deap+37. */
+svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
+{
+  const struct data *dat = ptr_barrier (&data);
+
+  /* Invert condition to catch NaNs and Infs as well as large values. */
+  svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
+
+  /* q = nearest integer to 2 * x / pi. */
+  svfloat64_t shift = sv_f64 (dat->shift);
+  svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi);
+  q = svsub_x (pg, q, shift);
+  svint64_t qi = svcvt_s64_x (pg, q);
+
+  /* Use q to reduce x to r in [-pi/4, pi/4], by:
+     r = x - q * pi/2, in extended precision. */
+  svfloat64_t r = x;
+  svfloat64_t half_pi = svld1rq (svptrue_b64 (), &dat->half_pi_hi);
+  r = svmls_lane (r, q, half_pi, 0);
+  r = svmls_lane (r, q, half_pi, 1);
+  /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
+     formula. */
+  r = svmul_x (pg, r, 0.5);
+
+  /* Approximate tan(r) using order 8 polynomial.
+     tan(x) is odd, so polynomial has the form:
+     tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+     Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
+     Then compute the approximation by:
+     tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t r4 = svmul_x (pg, r2, r2);
+  svfloat64_t r8 = svmul_x (pg, r4, r4);
+  /* Offset the coefficient array by 1 to evaluate from C1 onwards.
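
The offset-by-one evaluation described above amounts to the following scalar scheme (plain Horner here for brevity, where the vector code uses Estrin over dat->poly + 1):

    static double
    tan_poly_model (const double poly[9], double r)
    {
      double r2 = r * r;
      /* P(r2) = C1 + C2*r2 + ... + C8*r2^7, i.e. the array offset by 1.  */
      double p = poly[8];
      for (int i = 7; i >= 1; i--)
        p = p * r2 + poly[i];
      p = p * r2 + poly[0];    /* C0 + r2 * P(r2) */
      return r + r2 * (p * r); /* r + r^3 * (C0 + r^2 * P(r2)) */
    }
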
*/ + svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1); + p = svmad_x (pg, p, r2, dat->poly[0]); + p = svmla_x (pg, r, r2, svmul_x (pg, p, r)); + + /* Recombination uses double-angle formula: + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) + and reciprocity around pi/2: + tan(x) = 1 / (tan(pi/2 - x)) + to assemble result using change-of-sign and conditional selection of + numerator/denominator dependent on odd/even-ness of q (hence quadrant). */ + svbool_t use_recip + = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0); + + svfloat64_t n = svmad_x (pg, p, p, -1); + svfloat64_t d = svmul_x (pg, p, 2); + svfloat64_t swap = n; + n = svneg_m (n, use_recip, d); + d = svsel (use_recip, swap, d); + if (unlikely (svptest_any (pg, special))) + return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special); + return svdiv_x (pg, n, d); +} + +PL_SIG (SV, D, 1, tan, -3.1, 3.1) +PL_TEST_ULP (SV_NAME_D1 (tan), 2.99) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0, 0x1p23, 500000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0x1p23, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c index cca43bd886fd..6b8cd1e64b44 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c @@ -9,63 +9,67 @@ #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED - -/* Constants. */ -#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) -#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) -#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) -#define InvPio2 (sv_f32 (0x1.45f306p-1f)) -#define RangeVal (sv_f32 (0x1p15f)) -#define Shift (sv_f32 (0x1.8p+23f)) - -#define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i]) - -/* Use full Estrin's scheme to evaluate polynomial. */ -static inline sv_f32_t -eval_poly (svbool_t pg, sv_f32_t z) +static const struct data { - sv_f32_t z2 = svmul_f32_x (pg, z, z); - sv_f32_t z4 = svmul_f32_x (pg, z2, z2); - sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0)); - sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2)); - sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4)); - sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10); - sv_f32_t y = sv_fma_f32_x (pg, z4, y_54, y_32_10); - return y; -} + float pio2_1, pio2_2, pio2_3, invpio2; + float c1, c3, c5; + float c0, c2, c4, range_val, shift; +} data = { + /* Coefficients generated using: + poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), + deg, + [|single ...|], + [a*a;b*b]); + optimize relative error + final prec : 23 bits + deg : 5 + a : 0x1p-126 ^ 2 + b : ((pi) / 0x1p2) ^ 2 + dirty rel error: 0x1.f7c2e4p-25 + dirty abs error: 0x1.f7c2ecp-25. */ + .c0 = 0x1.55555p-2, .c1 = 0x1.11166p-3, + .c2 = 0x1.b88a78p-5, .c3 = 0x1.7b5756p-6, + .c4 = 0x1.4ef4cep-8, .c5 = 0x1.0e1e74p-7, -static NOINLINE sv_f32_t -__sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) + .pio2_1 = 0x1.921fb6p+0f, .pio2_2 = -0x1.777a5cp-25f, + .pio2_3 = -0x1.ee59dap-50f, .invpio2 = 0x1.45f306p-1f, + .range_val = 0x1p15f, .shift = 0x1.8p+23f +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) { return sv_call_f32 (tanf, x, y, cmp); } /* Fast implementation of SVE tanf. Maximum error is 3.45 ULP: - __sv_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 - want 0x1.ff9850p-1. */ -sv_f32_t -__sv_tanf_x (sv_f32_t x, const svbool_t pg) + SV_NAME_F1 (tan)(-0x1.e5f0cap+13) got 0x1.ff9856p-1 + want 0x1.ff9850p-1. 
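
The recombination step above can be checked in scalar form. With p ~= tan(r/2), the double-angle formula gives tan(r) = 2p / (1 - p^2), which is -d/n in the code's terms, and for odd q the reciprocity identity gives tan(x) = -cot(r) = (p^2 - 1) / (2p) = n/d. A sketch:

    static double
    tan_recombine_model (double p, long long q)
    {
      double n = p * p - 1.0; /* negated denominator of the double angle */
      double d = 2.0 * p;
      return (q & 1) ? n / d : -d / n;
    }

Expressing both quadrant cases as one division with conditionally selected numerator and denominator, as the vector code does, means only a single svdiv per call rather than a divide plus a reciprocal.
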
*/ +svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* Determine whether input is too large to perform fast regression. */ - svbool_t cmp = svacge_f32 (pg, x, RangeVal); - svbool_t pred_minuszero = svcmpeq_f32 (pg, x, sv_f32 (-0.0)); + svbool_t cmp = svacge (pg, x, d->range_val); + + svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); /* n = rint(x/(pi/2)). */ - sv_f32_t q = sv_fma_f32_x (pg, InvPio2, x, Shift); - sv_f32_t n = svsub_f32_x (pg, q, Shift); + svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3); + svfloat32_t n = svsub_x (pg, q, d->shift); /* n is already a signed integer, simply convert it. */ - sv_s32_t in = sv_to_s32_f32_x (pg, n); + svint32_t in = svcvt_s32_x (pg, n); /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ - sv_s32_t alt = svand_s32_x (pg, in, sv_s32 (1)); - svbool_t pred_alt = svcmpne_s32 (pg, alt, sv_s32 (0)); + svint32_t alt = svand_x (pg, in, 1); + svbool_t pred_alt = svcmpne (pg, alt, 0); /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ - sv_f32_t r; - r = sv_fma_f32_x (pg, NegPio2_1, n, x); - r = sv_fma_f32_x (pg, NegPio2_2, n, r); - r = sv_fma_f32_x (pg, NegPio2_3, n, r); + svfloat32_t r; + r = svmls_lane (x, n, pi_vals, 0); + r = svmls_lane (r, n, pi_vals, 1); + r = svmls_lane (r, n, pi_vals, 2); /* If x lives in an interval, where |tan(x)| - is finite, then use a polynomial approximation of the form @@ -75,38 +79,41 @@ __sv_tanf_x (sv_f32_t x, const svbool_t pg) the same polynomial approximation of tan as above. */ /* Perform additional reduction if required. */ - sv_f32_t z = svneg_f32_m (r, pred_alt, r); + svfloat32_t z = svneg_m (r, pred_alt, r); - /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ - sv_f32_t z2 = svmul_f32_x (pg, z, z); - sv_f32_t p = eval_poly (pg, z2); - sv_f32_t y = sv_fma_f32_x (pg, svmul_f32_x (pg, z, z2), p, z); + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], + using Estrin on z^2. */ + svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); + + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t p = svmla_x (pg, p01, z4, p23); + + svfloat32_t z8 = svmul_x (pg, z4, z4); + p = svmla_x (pg, p, z8, p45); + + svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); /* Transform result back, if necessary. */ - sv_f32_t inv_y = svdiv_f32_x (pg, sv_f32 (1.0f), y); - y = svsel_f32 (pred_alt, inv_y, y); - - /* Fast reduction does not handle the x = -0.0 case well, - therefore it is fixed here. */ - y = svsel_f32 (pred_minuszero, x, y); + svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); /* No need to pass pg to specialcase here since cmp is a strict subset, guaranteed by the cmpge above. 
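
The single-precision kernel uses a simpler quadrant scheme than the double one: for odd n, tan(x) = -cot(r) = 1 / tan(-r), so the same polynomial is evaluated on z = -r and the result inverted. A scalar model of that select-and-invert logic, with libm tanf standing in for the shared polynomial:

    #include <math.h>

    static float
    tanf_quadrant_model (float r, int n)
    {
      float z = (n & 1) ? -r : r;    /* additional reduction for odd n */
      float y = tanf (z);            /* stand-in for the poly on [-pi/4, pi/4] */
      return (n & 1) ? 1.0f / y : y; /* svdivr provides the reciprocal lanes */
    }
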
*/ if (unlikely (svptest_any (pg, cmp))) - return __sv_tanf_specialcase (x, y, cmp); - return y; + return special_case (x, svsel (pred_alt, inv_y, y), cmp); + + return svsel (pred_alt, inv_y, y); } -PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf) - PL_SIG (SV, F, 1, tan, -3.1, 3.1) -PL_TEST_ULP (__sv_tanf, 2.96) -PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_tanf, 0x1p-23, 0.7, 50000) -PL_TEST_INTERVAL (__sv_tanf, 0.7, 1.5, 50000) -PL_TEST_INTERVAL (__sv_tanf, 1.5, 100, 50000) -PL_TEST_INTERVAL (__sv_tanf, 100, 0x1p17, 50000) -PL_TEST_INTERVAL (__sv_tanf, 0x1p17, inf, 50000) -#endif +PL_TEST_ULP (SV_NAME_F1 (tan), 2.96) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-23, 0.7, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0.7, 1.5, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 1.5, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 100, 0x1p17, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p17, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/sv_tanh_3u.c new file mode 100644 index 000000000000..f54139f1ddbc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_tanh_3u.c @@ -0,0 +1,96 @@ +/* + * Double-precision SVE tanh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[11]; + float64_t inv_ln2, ln2_hi, ln2_lo, shift; + uint64_t thresh, tiny_bound; +} data = { + /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ + .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, + 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, + 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, + 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, + + .inv_ln2 = 0x1.71547652b82fep0, + .ln2_hi = -0x1.62e42fefa39efp-1, + .ln2_lo = -0x1.abc9e3b39803fp-56, + .shift = 0x1.8p52, + + .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */ + /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ + .thresh = 0x01f241bf835f9d5f, +}; + +static inline svfloat64_t +expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +{ + /* Helper routine for calculating exp(x) - 1. Vector port of the helper from + the scalar variant of tanh. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + svfloat64_t j + = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); + svint64_t i = svcvt_s64_x (pg, j); + svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi); + f = svmla_x (pg, f, j, d->ln2_lo); + + /* Approximate expm1(f) using polynomial. */ + svfloat64_t f2 = svmul_x (pg, f, f); + svfloat64_t f4 = svmul_x (pg, f2, f2); + svfloat64_t p = svmla_x ( + pg, f, f2, + sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly)); + + /* t = 2 ^ i. */ + svfloat64_t t = svscale_x (pg, sv_f64 (1), i); + /* expm1(x) = p * t + (t - 1). 
*/ + return svmla_x (pg, svsub_x (pg, t, 1), p, t); +} + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (tanh, x, y, special); +} + +/* SVE approximation for double-precision tanh(x), using a simplified + version of expm1. The greatest observed error is 2.77 ULP: + _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 + want -0x1.bd6a21a163624p-3. */ +svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x)); + + /* Trigger special-cases for tiny, boring and infinity/NaN. */ + svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh); + + svfloat64_t u = svadd_x (pg, x, x); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + svfloat64_t q = expm1_inline (u, pg, d); + svfloat64_t qp2 = svadd_x (pg, q, 2); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, svdiv_x (pg, q, qp2), special); + return svdiv_x (pg, q, qp2); +} + +PL_SIG (SV, D, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_D1 (tanh), 2.27) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0, 0x1p-27, 5000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/sv_tanhf_2u6.c new file mode 100644 index 000000000000..988a56de0b2e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_tanhf_2u6.c @@ -0,0 +1,59 @@ +/* + * Single-precision SVE tanh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#include "sv_expm1f_inline.h" + +static const struct data +{ + struct sv_expm1f_data expm1f_consts; + uint32_t boring_bound, onef; +} data = { + .expm1f_consts = SV_EXPM1F_DATA, + /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */ + .boring_bound = 0x41102cb3, + .onef = 0x3f800000, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (tanhf, x, y, special); +} + +/* Approximation for single-precision SVE tanh(x), using a simplified + version of expm1f. The maximum error is 2.57 ULP: + _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5 + want 0x1.fb71aap-5. */ +svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound); + svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef)); + + svbool_t special = svcmpgt (pg, iax, 0x7f800000); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
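
The driver above is a direct transcription of tanh(x) = (e^2x - 1) / (e^2x + 1): substituting q = expm1(2x) turns this into q / (q + 2), which is the entire non-special path. In scalar form:

    #include <math.h>

    static double
    tanh_model (double x)
    {
      double q = expm1 (2.0 * x); /* expm1_inline in the vector code */
      return q / (q + 2.0);
    }
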
*/ + svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts); + svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); + if (unlikely (svptest_any (pg, special))) + return special_case (x, svsel_f32 (is_boring, boring, y), special); + return svsel_f32 (is_boring, boring, y); +} + +PL_SIG (SV, F, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (tanh), 2.07) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0, 0x1p-23, 1000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1.205966p+3, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/tanf_3u3.c b/contrib/arm-optimized-routines/pl/math/tanf_3u3.c index ec006dc04c4c..30c86fa89730 100644 --- a/contrib/arm-optimized-routines/pl/math/tanf_3u3.c +++ b/contrib/arm-optimized-routines/pl/math/tanf_3u3.c @@ -7,7 +7,7 @@ #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" -#include "pairwise_hornerf.h" +#include "poly_scalar_f32.h" /* Useful constants. */ #define NegPio2_1 (-0x1.921fb6p+0f) @@ -22,19 +22,16 @@ /* 2PI * 2^-64. */ #define Pio2p63 (0x1.921FB54442D18p-62) -#define P(i) __tanf_poly_data.poly_tan[i] -#define Q(i) __tanf_poly_data.poly_cotan[i] - static inline float eval_P (float z) { - return PAIRWISE_HORNER_5 (z, z * z, P); + return pw_horner_5_f32 (z, z * z, __tanf_poly_data.poly_tan); } static inline float eval_Q (float z) { - return PAIRWISE_HORNER_3 (z, z * z, Q); + return pairwise_poly_3_f32 (z, z * z, __tanf_poly_data.poly_cotan); } /* Reduction of the input argument x using Cody-Waite approach, such that x = r @@ -188,15 +185,9 @@ tanf (float x) PL_SIG (S, F, 1, tan, -3.1, 3.1) PL_TEST_ULP (tanf, 2.80) PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) -PL_TEST_INTERVAL (tanf, -0x1p-127, -0x1p-14, 50000) -PL_TEST_INTERVAL (tanf, 0x1p-14, 0.7, 50000) -PL_TEST_INTERVAL (tanf, -0x1p-14, -0.7, 50000) -PL_TEST_INTERVAL (tanf, 0.7, 1.5, 50000) -PL_TEST_INTERVAL (tanf, -0.7, -1.5, 50000) -PL_TEST_INTERVAL (tanf, 1.5, 0x1p17, 50000) -PL_TEST_INTERVAL (tanf, -1.5, -0x1p17, 50000) -PL_TEST_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) -PL_TEST_INTERVAL (tanf, -0x1p17, -0x1p54, 50000) -PL_TEST_INTERVAL (tanf, 0x1p54, inf, 50000) -PL_TEST_INTERVAL (tanf, -0x1p54, -inf, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000) +PL_TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/tanh_3u.c b/contrib/arm-optimized-routines/pl/math/tanh_3u.c index 46d9fb3fd7e1..86f2904afc32 100644 --- a/contrib/arm-optimized-routines/pl/math/tanh_3u.c +++ b/contrib/arm-optimized-routines/pl/math/tanh_3u.c @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "estrin.h" +#include "poly_scalar_f64.h" #include "pl_sig.h" #include "pl_test.h" @@ -14,7 +14,6 @@ #define Ln2hi 0x1.62e42fefa39efp-1 #define Ln2lo 0x1.abc9e3b39803fp-56 #define Shift 0x1.8p52 -#define C(i) __expm1_poly[i] #define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ #define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */ @@ -38,7 +37,7 @@ expm1_inline (double x) /* Approximate expm1(f) using polynomial. 
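
The single-precision variant adds the "boring" shortcut documented by the constants above: beyond 0x1.205966p+3, tanhf rounds to +-1, so the result is just the sign of x pasted onto 1.0f, and only NaN inputs need the scalar fallback. A scalar sketch of the whole scheme:

    #include <math.h>

    static float
    tanhf_model (float x)
    {
      if (fabsf (x) > 0x1.205966p+3f) /* boring_bound */
        return copysignf (1.0f, x);
      float q = expm1f (2.0f * x);
      return q / (q + 2.0f);
    }
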
*/ double f2 = f * f; double f4 = f2 * f2; - double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + double p = fma (f2, estrin_10_f64 (f, f2, f4, f4 * f4, __expm1_poly), f); /* t = 2 ^ i. */ double t = asdouble ((uint64_t) (i + 1023) << 52); @@ -47,9 +46,9 @@ expm1_inline (double x) } /* Approximation for double-precision tanh(x), using a simplified version of - expm1. The greatest observed error is 2.75 ULP: - tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 - want -0x1.ba31ba4691ab4p-3. */ + expm1. The greatest observed error is 2.77 ULP: + tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 + want -0x1.bd6a21a163624p-3. */ double tanh (double x) { @@ -73,10 +72,7 @@ tanh (double x) } PL_SIG (S, D, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (tanh, 2.26) -PL_TEST_INTERVAL (tanh, 0, TinyBound, 1000) -PL_TEST_INTERVAL (tanh, -0, -TinyBound, 1000) -PL_TEST_INTERVAL (tanh, TinyBound, BoringBound, 100000) -PL_TEST_INTERVAL (tanh, -TinyBound, -BoringBound, 100000) -PL_TEST_INTERVAL (tanh, BoringBound, inf, 1000) -PL_TEST_INTERVAL (tanh, -BoringBound, -inf, 1000) +PL_TEST_ULP (tanh, 2.27) +PL_TEST_SYM_INTERVAL (tanh, 0, TinyBound, 1000) +PL_TEST_SYM_INTERVAL (tanh, TinyBound, BoringBound, 100000) +PL_TEST_SYM_INTERVAL (tanh, BoringBound, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c index 76e54a438e57..93ea3cf5d865 100644 --- a/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c +++ b/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c @@ -83,9 +83,6 @@ tanhf (float x) PL_SIG (S, F, 1, tanh, -10.0, 10.0) PL_TEST_ULP (tanhf, 2.09) -PL_TEST_INTERVAL (tanhf, 0, 0x1p-23, 1000) -PL_TEST_INTERVAL (tanhf, -0, -0x1p-23, 1000) -PL_TEST_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000) -PL_TEST_INTERVAL (tanhf, -0x1p-23, -0x1.205966p+3, 100000) -PL_TEST_INTERVAL (tanhf, 0x1.205966p+3, inf, 100) -PL_TEST_INTERVAL (tanhf, -0x1.205966p+3, -inf, 100) +PL_TEST_SYM_INTERVAL (tanhf, 0, 0x1p-23, 1000) +PL_TEST_SYM_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000) +PL_TEST_SYM_INTERVAL (tanhf, 0x1.205966p+3, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h b/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h index e0f6ac70912c..f2710a979d40 100644 --- a/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h +++ b/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h @@ -9,20 +9,10 @@ #define _ZSF1(fun, a, b) F(fun##f, a, b) #define _ZSD1(f, a, b) D(f, a, b) -#ifdef __vpcs +#if defined(__vpcs) && __aarch64__ -#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) VNF(__vn_##fun##f, a, b) VNF(_ZGVnN4v_##fun##f, a, b) -#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) VND(__vn_##f, a, b) VND(_ZGVnN2v_##f, a, b) - -#elif __aarch64__ - -#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) -#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) - -#elif WANT_VMATH - -#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) -#define _ZVD1(f, a, b) D(__s_##f, a, b) +#define _ZVF1(fun, a, b) VNF(_ZGVnN4v_##fun##f, a, b) +#define _ZVD1(f, a, b) VND(_ZGVnN2v_##f, a, b) #else @@ -33,8 +23,8 @@ #if WANT_SVE_MATH -#define _ZSVF1(fun, a, b) SVF(__sv_##fun##f_x, a, b) SVF(_ZGVsMxv_##fun##f, a, b) -#define _ZSVD1(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b) +#define _ZSVF1(fun, a, b) SVF(_ZGVsMxv_##fun##f, a, b) +#define _ZSVD1(f, a, b) SVD(_ZGVsMxv_##f, a, b) #else @@ -64,23 +54,34 @@ {"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, {"powi", 
'd', 0, 0.01, 11.1, {.d = powi_wrap}}, -{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, -{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, -{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, -{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, -{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, {"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, -{"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, +{"_ZGVnN4vv_hypotf", 'f', 'n', -10.0, 10.0, {.vnf = _Z_hypotf_wrap}}, +{"_ZGVnN2vv_hypot", 'd', 'n', -10.0, 10.0, {.vnd = _Z_hypot_wrap}}, +{"_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = xy_Z_pow}}, +{"x_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = x_Z_pow}}, +{"y_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = y_Z_pow}}, +{"_ZGVnN4vl4l4_sincosf", 'f', 'n', -3.1, 3.1, {.vnf = _Z_sincosf_wrap}}, +{"_ZGVnN2vl8l8_sincos", 'd', 'n', -3.1, 3.1, {.vnd = _Z_sincos_wrap}}, +{"_ZGVnN4v_cexpif", 'f', 'n', -3.1, 3.1, {.vnf = _Z_cexpif_wrap}}, +{"_ZGVnN2v_cexpi", 'd', 'n', -3.1, 3.1, {.vnd = _Z_cexpi_wrap}}, #if WANT_SVE_MATH -{"__sv_atan2f_x", 'f', 's', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, {"_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, -{"__sv_atan2_x", 'd', 's', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, -{"_ZGVsM2vv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, -{"__sv_powif_x", 'f', 's', -10.0, 10.0, {.svf = __sv_powif_wrap}}, +{"_ZGVsMxvv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, +{"_ZGVsMxvv_hypotf", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_hypotf_wrap}}, +{"_ZGVsMxvv_hypot", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_hypot_wrap}}, {"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, -{"__sv_powi_x", 'd', 's', -10.0, 10.0, {.svd = __sv_powi_wrap}}, {"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, +{"_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = xy_Z_sv_powf}}, +{"x_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = x_Z_sv_powf}}, +{"y_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = y_Z_sv_powf}}, +{"_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = xy_Z_sv_pow}}, +{"x_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = x_Z_sv_pow}}, +{"y_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = y_Z_sv_pow}}, +{"_ZGVsMxvl4l4_sincosf", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_sincosf_wrap}}, +{"_ZGVsMxvl8l8_sincos", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_sincos_wrap}}, +{"_ZGVsMxv_cexpif", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_cexpif_wrap}}, +{"_ZGVsMxv_cexpi", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_cexpi_wrap}}, #endif - // clang-format on + // clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h b/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h index eba960eb96ac..fe7f8963cdee 100644 --- a/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h +++ b/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h @@ -23,46 +23,7 @@ powi_wrap (double x) return __builtin_powi (x, (int) round (x)); } -#if WANT_VMATH -#if __aarch64__ - -static double -__s_atan2_wrap (double x) -{ - return __s_atan2 (5.0, x); -} - -static float -__s_atan2f_wrap (float x) -{ - return __s_atan2f (5.0f, x); -} - -static v_double -__v_atan2_wrap (v_double x) -{ - return __v_atan2 (v_double_dup (5.0), x); -} - -static v_float -__v_atan2f_wrap (v_float x) -{ - return __v_atan2f (v_float_dup (5.0f), x); -} - -#ifdef __vpcs - -__vpcs static v_double 
-__vn_atan2_wrap (v_double x) -{ - return __vn_atan2 (v_double_dup (5.0), x); -} - -__vpcs static v_float -__vn_atan2f_wrap (v_float x) -{ - return __vn_atan2f (v_float_dup (5.0f), x); -} +#if __aarch64__ && defined(__vpcs) __vpcs static v_double _Z_atan2_wrap (v_double x) @@ -76,34 +37,92 @@ _Z_atan2f_wrap (v_float x) return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x); } -#endif // __vpcs -#endif // __arch64__ -#endif // WANT_VMATH +__vpcs static v_float +_Z_hypotf_wrap (v_float x) +{ + return _ZGVnN4vv_hypotf (v_float_dup (5.0f), x); +} + +__vpcs static v_double +_Z_hypot_wrap (v_double x) +{ + return _ZGVnN2vv_hypot (v_double_dup (5.0), x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); +} + +__vpcs static v_double +x_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, v_double_dup (23.4)); +} + +__vpcs static v_double +y_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (v_double_dup (2.34), x); +} + +__vpcs static v_float +_Z_sincosf_wrap (v_float x) +{ + v_float s, c; + _ZGVnN4vl4l4_sincosf (x, &s, &c); + return s + c; +} + +__vpcs static v_float +_Z_cexpif_wrap (v_float x) +{ + __f32x4x2_t sc = _ZGVnN4v_cexpif (x); + return sc.val[0] + sc.val[1]; +} + +__vpcs static v_double +_Z_sincos_wrap (v_double x) +{ + v_double s, c; + _ZGVnN2vl8l8_sincos (x, &s, &c); + return s + c; +} + +__vpcs static v_double +_Z_cexpi_wrap (v_double x) +{ + __f64x2x2_t sc = _ZGVnN2v_cexpi (x); + return sc.val[0] + sc.val[1]; +} + +#endif // __arch64__ && __vpcs #if WANT_SVE_MATH -static sv_float -__sv_atan2f_wrap (sv_float x, sv_bool pg) -{ - return __sv_atan2f_x (x, svdup_n_f32 (5.0f), pg); -} - static sv_float _Z_sv_atan2f_wrap (sv_float x, sv_bool pg) { - return _ZGVsMxvv_atan2f (x, svdup_n_f32 (5.0f), pg); -} - -static sv_double -__sv_atan2_wrap (sv_double x, sv_bool pg) -{ - return __sv_atan2_x (x, svdup_n_f64 (5.0), pg); + return _ZGVsMxvv_atan2f (x, svdup_f32 (5.0f), pg); } static sv_double _Z_sv_atan2_wrap (sv_double x, sv_bool pg) { - return _ZGVsMxvv_atan2 (x, svdup_n_f64 (5.0), pg); + return _ZGVsMxvv_atan2 (x, svdup_f64 (5.0), pg); +} + +static sv_float +_Z_sv_hypotf_wrap (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_hypotf (x, svdup_f32 (5.0), pg); +} + +static sv_double +_Z_sv_hypot_wrap (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_hypot (x, svdup_f64 (5.0), pg); } static sv_float @@ -112,22 +131,76 @@ _Z_sv_powi_wrap (sv_float x, sv_bool pg) return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg); } -static sv_float -__sv_powif_wrap (sv_float x, sv_bool pg) -{ - return __sv_powif_x (x, svcvt_s32_f32_x (pg, x), pg); -} - static sv_double _Z_sv_powk_wrap (sv_double x, sv_bool pg) { return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg); } -static sv_double -__sv_powi_wrap (sv_double x, sv_bool pg) +static sv_float +xy_Z_sv_powf (sv_float x, sv_bool pg) { - return __sv_powi_x (x, svcvt_s64_f64_x (pg, x), pg); + return _ZGVsMxvv_powf (x, x, pg); +} + +static sv_float +x_Z_sv_powf (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_powf (x, svdup_f32 (23.4f), pg); +} + +static sv_float +y_Z_sv_powf (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_powf (svdup_f32 (2.34f), x, pg); +} + +static sv_double +xy_Z_sv_pow (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_pow (x, x, pg); +} + +static sv_double +x_Z_sv_pow (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_pow (x, svdup_f64 (23.4), pg); +} + +static sv_double +y_Z_sv_pow (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_pow (svdup_f64 (2.34), x, pg); +} + +static sv_float +_Z_sv_sincosf_wrap (sv_float x, sv_bool pg) +{ + 
float s[svcntw ()], c[svcntw ()];
+  _ZGVsMxvl4l4_sincosf (x, s, c, pg);
+  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
+}
+
+static sv_float
+_Z_sv_cexpif_wrap (sv_float x, sv_bool pg)
+{
+  svfloat32x2_t sc = _ZGVsMxv_cexpif (x, pg);
+  return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
+}
+
+static sv_double
+_Z_sv_sincos_wrap (sv_double x, sv_bool pg)
+{
+  double s[svcntd ()], c[svcntd ()];
+  _ZGVsMxvl8l8_sincos (x, s, c, pg);
+  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
+}
+
+static sv_double
+_Z_sv_cexpi_wrap (sv_double x, sv_bool pg)
+{
+  svfloat64x2_t sc = _ZGVsMxv_cexpi (x, pg);
+  return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
+}
 
 #endif // WANT_SVE_MATH
diff --git a/contrib/arm-optimized-routines/pl/math/test/pl_test.h b/contrib/arm-optimized-routines/pl/math/test/pl_test.h
index 467d1cac0c36..e7ed4eed634e 100644
--- a/contrib/arm-optimized-routines/pl/math/test/pl_test.h
+++ b/contrib/arm-optimized-routines/pl/math/test/pl_test.h
@@ -8,18 +8,14 @@
 /* Emit the max ULP threshold, l, for routine f. Piggy-back PL_TEST_EXPECT_FENV
    on PL_TEST_ULP to add EXPECT_FENV to all scalar routines. */
-#if !(V_SUPPORTED || SV_SUPPORTED)
-#define PL_TEST_ULP(f, l)                                                     \
-  PL_TEST_EXPECT_FENV_ALWAYS (f)                                              \
-  PL_TEST_ULP f l
+#if WANT_VMATH || defined(IGNORE_SCALAR_FENV)
+# define PL_TEST_ULP(f, l) PL_TEST_ULP f l
 #else
-#define PL_TEST_ULP(f, l) PL_TEST_ULP f l
+# define PL_TEST_ULP(f, l)                                                    \
+  PL_TEST_EXPECT_FENV_ALWAYS (f)                                              \
+  PL_TEST_ULP f l
 #endif
 
-/* Emit aliases to allow test params to be mapped from aliases back to their
-   aliasees. */
-#define PL_ALIAS(a, b) PL_TEST_ALIAS a b
-
 /* Emit routine name if e == 1 and f is expected to correctly trigger fenv
    exceptions. e allows declaration to be emitted conditionally upon certain
    build flags - defer expansion by one pass to allow those flags to be expanded
@@ -30,4 +26,14 @@
 #define PL_TEST_EXPECT_FENV_ALWAYS(f) PL_TEST_EXPECT_FENV (f, 1)
 
 #define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n
+#define PL_TEST_SYM_INTERVAL(f, lo, hi, n)                                    \
+  PL_TEST_INTERVAL (f, lo, hi, n)                                             \
+  PL_TEST_INTERVAL (f, -lo, -hi, n)
 #define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c
+#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c)                               \
+  PL_TEST_INTERVAL_C (f, lo, hi, n, c)                                        \
+  PL_TEST_INTERVAL_C (f, -lo, -hi, n, c)
+// clang-format off
+#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n)                           \
+  PL_TEST_INTERVAL f xlo,ylo xhi,yhi n
+// clang-format on
diff --git a/contrib/arm-optimized-routines/pl/math/test/runulp.sh b/contrib/arm-optimized-routines/pl/math/test/runulp.sh
index 4d02530d44b1..0f5a41f76b25 100755
--- a/contrib/arm-optimized-routines/pl/math/test/runulp.sh
+++ b/contrib/arm-optimized-routines/pl/math/test/runulp.sh
@@ -21,55 +21,55 @@ FAIL=0
 PASS=0
 
 t() {
-	key=$(cat $ALIASES | { grep " $1$" || echo $1; } | awk '{print $1}')
-	L=$(cat $LIMITS | grep "^$key " | awk '{print $2}')
+	routine=$1
+	L=$(cat $LIMITS | grep "^$routine " | awk '{print $2}')
 	[[ $L =~ ^[0-9]+\.[0-9]+$ ]]
-	extra_flags=""
+	extra_flags=
 	[[ -z "${5:-}" ]] || extra_flags="$extra_flags -c $5"
-	grep -q "^$key$" $FENV || extra_flags="$extra_flags -f"
-	$emu ./ulp -e $L $flags ${extra_flags} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
+	grep -q "^$routine$" $FENV || extra_flags="$extra_flags -f"
+	IFS=',' read -ra LO <<< "$2"
+	IFS=',' read -ra HI <<< "$3"
+	ITV="${LO[0]} ${HI[0]}"
+	for i in "${!LO[@]}"; do
+		[[ "$i" -eq "0" ]] || ITV="$ITV x ${LO[$i]} ${HI[$i]}"
+	done
+	# Add -z flag to ignore zero sign for vector routines
+	{ echo $routine |
grep -q "ZGV"; } && extra_flags="$extra_flags -z" + $emu ./ulp -e $L $flags ${extra_flags} $routine $ITV $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } check() { $emu ./ulp -f -q "$@" #>/dev/null } -# Regression-test for correct NaN handling in atan2 -check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 -check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan -check atan2 nan nan x -nan -nan +if [ "$FUNC" == "atan2" ] || [ -z "$FUNC" ]; then + # Regression-test for correct NaN handling in atan2 + check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 + check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan + check atan2 nan nan x -nan -nan +fi # vector functions flags="${ULPFLAGS:--q}" -runs= -check __s_log10f 1 && runs=1 -runv= -check __v_log10f 1 && runv=1 -runvn= -check __vn_log10f 1 && runvn=1 runsv= if [ $WANT_SVE_MATH -eq 1 ]; then -check __sv_cosf 0 && runsv=1 -check __sv_cos 0 && runsv=1 -check __sv_sinf 0 && runsv=1 -check __sv_sin 0 && runsv=1 # No guarantees about powi accuracy, so regression-test for exactness # w.r.t. the custom reference impl in ulp_wrappers.h -check -q -f -e 0 __sv_powif 0 inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 __sv_powif -0 -inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 __sv_powif 0 inf x -0 -1000 100000 && runsv=1 -check -q -f -e 0 __sv_powif -0 -inf x -0 -1000 100000 && runsv=1 -check -q -f -e 0 __sv_powi 0 inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 __sv_powi -0 -inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 __sv_powi 0 inf x -0 -1000 100000 && runsv=1 -check -q -f -e 0 __sv_powi -0 -inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powi 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powi 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powk 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powk 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x -0 -1000 100000 && runsv=1 fi while read F LO HI N C do t $F $LO $HI $N $C done << EOF -$(cat $INTERVALS) +$(cat $INTERVALS | grep "\b$FUNC\b") EOF [ 0 -eq $FAIL ] || { diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acos.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acos.tst new file mode 100644 index 000000000000..a73dcd25965b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acos.tst @@ -0,0 +1,17 @@ +; acos.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acos op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=acos op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=acos op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acos op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acos op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acos op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acos op1=00000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=acos op1=80000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=acos op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=acos op1=bff00000.00000000 result=400921fb.54442d18.469 errno=0 +func=acos op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=acos op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosf.tst new file mode 100644 index 000000000000..9e453e3bff5e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosf.tst @@ -0,0 +1,21 @@ +; acosf.tst +; +; Copyright (c) 2009-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acosf op1=7fc00001 result=7fc00001 errno=0 +func=acosf op1=ffc00001 result=7fc00001 errno=0 +func=acosf op1=7f800001 result=7fc00001 errno=0 status=i +func=acosf op1=ff800001 result=7fc00001 errno=0 status=i +func=acosf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=acosf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=acosf op1=00000000 result=3fc90fda.a22 errno=0 +func=acosf op1=80000000 result=3fc90fda.a22 errno=0 +func=acosf op1=3f800000 result=00000000 errno=0 +func=acosf op1=bf800000 result=40490fda.a22 errno=0 +func=acosf op1=3f800001 result=7fc00001 errno=EDOM status=i +func=acosf op1=bf800001 result=7fc00001 errno=EDOM status=i +func=acosf op1=33000000 result=3fc90fda.622 error=0 +func=acosf op1=30000000 result=3fc90fda.a12 error=0 +func=acosf op1=2d000000 result=3fc90fda.a21 error=0 +func=acosf op1=2a000000 result=3fc90fda.a22 error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asin.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asin.tst new file mode 100644 index 000000000000..6180d7849d90 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asin.tst @@ -0,0 +1,24 @@ +; asin.tst +; +; Copyright (c) 2009-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=asin op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=asin op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=asin op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asin op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asin op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=asin op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=asin op1=00000000.00000000 result=00000000.00000000 errno=0 +func=asin op1=80000000.00000000 result=80000000.00000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. 
+func=asin op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=asin op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux + +func=asin op1=3ff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=asin op1=bff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=asin op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=asin op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinf.tst new file mode 100644 index 000000000000..a85b2593768d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinf.tst @@ -0,0 +1,24 @@ +; asinf.tst +; +; Copyright (c) 2009-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=asinf op1=7fc00001 result=7fc00001 errno=0 +func=asinf op1=ffc00001 result=7fc00001 errno=0 +func=asinf op1=7f800001 result=7fc00001 errno=0 status=i +func=asinf op1=ff800001 result=7fc00001 errno=0 status=i +func=asinf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=asinf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=asinf op1=00000000 result=00000000 errno=0 +func=asinf op1=80000000 result=80000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=asinf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=asinf op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=asinf op1=3f800000 result=3fc90fda.a22 errno=0 +func=asinf op1=bf800000 result=bfc90fda.a22 errno=0 +func=asinf op1=3f800001 result=7fc00001 errno=EDOM status=i +func=asinf op1=bf800001 result=7fc00001 errno=EDOM status=i diff --git a/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h b/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h index 5e3133e1db4c..4929b481ffe1 100644 --- a/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h +++ b/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h @@ -5,26 +5,12 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifdef __vpcs +#if defined(__vpcs) && __aarch64__ -#define _ZVF1(f) SF1 (f) VF1 (f) ZVNF1 (f) -#define _ZVD1(f) SD1 (f) VD1 (f) ZVND1 (f) -#define _ZVF2(f) SF2 (f) VF2 (f) ZVNF2 (f) -#define _ZVD2(f) SD2 (f) VD2 (f) ZVND2 (f) - -#elif __aarch64 - -#define _ZVF1(f) SF1 (f) VF1 (f) -#define _ZVD1(f) SD1 (f) VD1 (f) -#define _ZVF2(f) SF2 (f) VF2 (f) -#define _ZVD2(f) SD2 (f) VD2 (f) - -#elif WANT_VMATH - -#define _ZVF1(f) SF1 (f) -#define _ZVD1(f) SD1 (f) -#define _ZVF2(f) SF2 (f) -#define _ZVD2(f) SD2 (f) +#define _ZVF1(f) ZVF1 (f) +#define _ZVD1(f) ZVD1 (f) +#define _ZVF2(f) ZVF2 (f) +#define _ZVD2(f) ZVD2 (f) #else @@ -37,10 +23,10 @@ #if WANT_SVE_MATH -#define _ZSVF1(f) SVF1 (f) ZSVF1 (f) -#define _ZSVF2(f) SVF2 (f) ZSVF2 (f) -#define _ZSVD1(f) SVD1 (f) ZSVD1 (f) -#define _ZSVD2(f) SVD2 (f) ZSVD2 (f) +#define _ZSVF1(f) ZSVF1 (f) +#define _ZSVF2(f) ZSVF2 (f) +#define _ZSVD1(f) ZSVD1 (f) +#define _ZSVD2(f) ZSVD2 (f) #else @@ -58,9 +44,27 @@ #include "ulp_funcs_gen.h" +F (_ZGVnN4v_sincosf_sin, v_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0) +F (_ZGVnN4v_sincosf_cos, v_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0) +F (_ZGVnN4v_cexpif_sin, v_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0) +F (_ZGVnN4v_cexpif_cos, v_cexpif_cos, cos, mpfr_cos, 
1, 1, f1, 0) + +F (_ZGVnN2v_sincos_sin, v_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0) +F (_ZGVnN2v_sincos_cos, v_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0) +F (_ZGVnN2v_cexpi_sin, v_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0) +F (_ZGVnN2v_cexpi_cos, v_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0) + #if WANT_SVE_MATH -F (__sv_powi, sv_powi, ref_powi, mpfr_powi, 2, 0, d2, 0) F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0) -F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0) F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) + +F (_ZGVsMxv_sincosf_sin, sv_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0) +F (_ZGVsMxv_sincosf_cos, sv_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0) +F (_ZGVsMxv_cexpif_sin, sv_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0) +F (_ZGVsMxv_cexpif_cos, sv_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0) + +F (_ZGVsMxv_sincos_sin, sv_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0) +F (_ZGVsMxv_sincos_cos, sv_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0) +F (_ZGVsMxv_cexpi_sin, sv_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0) +F (_ZGVsMxv_cexpi_cos, sv_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0) #endif diff --git a/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h b/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h index b682e939054a..0f7b68949c7b 100644 --- a/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h +++ b/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h @@ -6,7 +6,9 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#define _GNU_SOURCE #include +#include <arm_neon.h> #if USE_MPFR static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { @@ -36,7 +38,7 @@ static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t double. This is fine since a round-trip to higher precision and back down is correctly rounded.
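+   (A note on why the round trip is harmless: the exponent is an int, and + every int value fits exactly in a double's 53-bit significand, so e.g. + (int) (double) 23 == 23 with no rounding at any step.)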
*/ #define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \ - static DBL_T NAME (DBL_T in_val, DBL_T y) \ + static DBL_T __attribute__((unused)) NAME (DBL_T in_val, DBL_T y) \ { \ INT_T n = (INT_T) round (y); \ FLT_T acc = 1.0; \ @@ -60,41 +62,17 @@ static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t DECL_POW_INT_REF(ref_powif, double, float, int) DECL_POW_INT_REF(ref_powi, long double, double, int) -#define VF1_WRAP(func) static float v_##func##f(float x) { return __v_##func##f(argf(x))[0]; } -#define VF2_WRAP(func) static float v_##func##f(float x, float y) { return __v_##func##f(argf(x), argf(y))[0]; } -#define VD1_WRAP(func) static double v_##func(double x) { return __v_##func(argd(x))[0]; } -#define VD2_WRAP(func) static double v_##func(double x, double y) { return __v_##func(argd(x), argd(y))[0]; } - -#define VNF1_WRAP(func) static float vn_##func##f(float x) { return __vn_##func##f(argf(x))[0]; } -#define VNF2_WRAP(func) static float vn_##func##f(float x, float y) { return __vn_##func##f(argf(x), argf(y))[0]; } -#define VND1_WRAP(func) static double vn_##func(double x) { return __vn_##func(argd(x))[0]; } -#define VND2_WRAP(func) static double vn_##func(double x, double y) { return __vn_##func(argd(x), argd(y))[0]; } - #define ZVF1_WRAP(func) static float Z_##func##f(float x) { return _ZGVnN4v_##func##f(argf(x))[0]; } #define ZVF2_WRAP(func) static float Z_##func##f(float x, float y) { return _ZGVnN4vv_##func##f(argf(x), argf(y))[0]; } #define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; } #define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; } -#ifdef __vpcs +#if defined(__vpcs) && __aarch64__ -#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) ZVF1_WRAP(func) -#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) ZVF2_WRAP(func) -#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) ZVD1_WRAP(func) -#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) ZVD2_WRAP(func) - -#elif __aarch64__ - -#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) -#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) -#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) -#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) - -#elif WANT_VMATH - -#define ZVNF1_WRAP(func) VF1_WRAP(func) -#define ZVNF2_WRAP(func) VF2_WRAP(func) -#define ZVND1_WRAP(func) VD1_WRAP(func) -#define ZVND2_WRAP(func) VD2_WRAP(func) +#define ZVNF1_WRAP(func) ZVF1_WRAP(func) +#define ZVNF2_WRAP(func) ZVF2_WRAP(func) +#define ZVND1_WRAP(func) ZVD1_WRAP(func) +#define ZVND2_WRAP(func) ZVD2_WRAP(func) #else @@ -105,11 +83,6 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) #endif -#define SVF1_WRAP(func) static float sv_##func##f(float x) { return svretf(__sv_##func##f_x(svargf(x), svptrue_b32())); } -#define SVF2_WRAP(func) static float sv_##func##f(float x, float y) { return svretf(__sv_##func##f_x(svargf(x), svargf(y), svptrue_b32())); } -#define SVD1_WRAP(func) static double sv_##func(double x) { return svretd(__sv_##func##_x(svargd(x), svptrue_b64())); } -#define SVD2_WRAP(func) static double sv_##func(double x, double y) { return svretd(__sv_##func##_x(svargd(x), svargd(y), svptrue_b64())); } - #define ZSVF1_WRAP(func) static float Z_sv_##func##f(float x) { return svretf(_ZGVsMxv_##func##f(svargf(x), svptrue_b32())); } #define ZSVF2_WRAP(func) static float Z_sv_##func##f(float x, float y) { return svretf(_ZGVsMxvv_##func##f(svargf(x), svargf(y), svptrue_b32())); } #define 
ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); } @@ -117,10 +90,10 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) #if WANT_SVE_MATH -#define ZSVNF1_WRAP(func) SVF1_WRAP(func) ZSVF1_WRAP(func) -#define ZSVNF2_WRAP(func) SVF2_WRAP(func) ZSVF2_WRAP(func) -#define ZSVND1_WRAP(func) SVD1_WRAP(func) ZSVD1_WRAP(func) -#define ZSVND2_WRAP(func) SVD2_WRAP(func) ZSVD2_WRAP(func) +#define ZSVNF1_WRAP(func) ZSVF1_WRAP(func) +#define ZSVNF2_WRAP(func) ZSVF2_WRAP(func) +#define ZSVND1_WRAP(func) ZSVD1_WRAP(func) +#define ZSVND2_WRAP(func) ZSVD2_WRAP(func) #else @@ -139,10 +112,29 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) #include "ulp_wrappers_gen.h" +float v_sincosf_sin(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return s[0]; } +float v_sincosf_cos(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return c[0]; } +float v_cexpif_sin(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[0][0]; } +float v_cexpif_cos(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[1][0]; } + +double v_sincos_sin(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return s[0]; } +double v_sincos_cos(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return c[0]; } +double v_cexpi_sin(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[0][0]; } +double v_cexpi_cos(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[1][0]; } + #if WANT_SVE_MATH -static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } -static float sv_powif(float x, float y) { return svretf(__sv_powif_x(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } -static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } -static double sv_powi(double x, double y) { return svretd(__sv_powi_x(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } +static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_s32((int)round(y)), svptrue_b32())); } +static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_s64((long)round(y)), svptrue_b64())); } + +float sv_sincosf_sin(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return s[0]; } +float sv_sincosf_cos(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return c[0]; } +float sv_cexpif_sin(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 0)); } +float sv_cexpif_cos(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 1)); } + +double sv_sincos_sin(double x) { double s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return s[0]; } +double sv_sincos_cos(double x) { double s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return c[0]; } +double sv_cexpi_sin(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 0)); } +double sv_cexpi_cos(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 1)); } + #endif // clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/tools/asin.sollya b/contrib/arm-optimized-routines/pl/math/tools/asin.sollya new file mode 100644 index 000000000000..8ef861d0898b --- /dev/null +++ 
b/contrib/arm-optimized-routines/pl/math/tools/asin.sollya @@ -0,0 +1,29 @@ +// polynomial for approximating asin(x) +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +f = asin(x); +dtype = double; + +prec=256; + +a = 0x1p-106; +b = 0.25; + +deg = 11; + +backward = proc(poly, d) { + return d + d ^ 3 * poly(d * d); +}; + +forward = proc(f, d) { + return (f(sqrt(d))-sqrt(d))/(d*sqrt(d)); +}; + +poly = fpminimax(forward(f, x), [|0,...,deg|], [|dtype ...|], [a;b], relative, floating); + +display = hexadecimal!; +print("rel error:", dirtyinfnorm(1-backward(poly, x)/f(x), [a;b])); +print("in [", a, b, "]"); +for i from 0 to deg do print(coeff(poly, i)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/asinf.sollya b/contrib/arm-optimized-routines/pl/math/tools/asinf.sollya new file mode 100644 index 000000000000..5b627e546c73 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/asinf.sollya @@ -0,0 +1,36 @@ +// polynomial for approximating asinf(x) +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +f = asin(x); +dtype = single; + +a = 0x1p-24; +b = 0.25; + +deg = 4; + +backward = proc(poly, d) { + return d + d ^ 3 * poly(d * d); +}; + +forward = proc(f, d) { + return (f(sqrt(d))-sqrt(d))/(d*sqrt(d)); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x) / forward(f, x), deg - d, [a;b], x^d/forward(f, x), 1e-16); +}; + +poly = 0; +for i from 0 to deg do { + i; + p = roundcoefficients(approx(poly,i), [|dtype ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal!; +print("rel error:", accurateinfnorm(1-backward(poly, x)/f(x), [a;b], 30)); +print("in [", a, b, "]"); +for i from 0 to deg do print(coeff(poly, i)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erf.sollya b/contrib/arm-optimized-routines/pl/math/tools/erf.sollya new file mode 100644 index 000000000000..b2fc559b511e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/erf.sollya @@ -0,0 +1,25 @@ +// tables and constants for approximating erf(x). +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +display = hexadecimal; +prec=128; + +// Tables +print("{ i, r, erf(r), 2/sqrt(pi) * exp(-r^2)}"); +for i from 0 to 768 do { + r = i / 128; + t0 = double(erf(r)); + t1 = double(2/sqrt(pi) * exp(-r * r)); + print("{ " @ i @ ",\t" @ r @ ",\t" @ t0 @ ",\t" @ t1 @ " },"); +}; + +// Constants +double(1/3); +double(1/10); +double(2/15); +double(2/9); +double(2/45); +double(2/sqrt(pi)); + diff --git a/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya b/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya index 8c40b4b5db6b..1e2791291ebb 100644 --- a/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya +++ b/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya @@ -1,23 +1,51 @@ -// polynomial for approximating erfc(x)*exp(x*x) +// tables and constants for approximating erfc(x). // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2023, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception -deg = 12; // poly degree +display = hexadecimal; +prec=128; -// interval bounds -a = 0x1.60dfc14636e2ap0; -b = 0x1.d413cccfe779ap0; - -f = proc(y) { - t = y + a; - return erfc(t) * exp(t*t); +// Tables +print("{ i, r, erfc(r), 2/sqrt(pi) * exp(-r^2) }"); +for i from 0 to 3787 do { + r = 0.0 + i / 128; + t0 = double(erfc(r) * 2^128); + t1 = double(2/sqrt(pi) * exp(-r * r) * 2^128); + print("{ " @ t0 @ ",\t" @ t1 @ " },"); }; -poly = remez(f(x), deg, [0;b-a], 1, 1e-16); +// Constants +print("> 2/sqrt(pi)"); +double(2/sqrt(pi)); -display = hexadecimal; -print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); -print("in [",a,b,"]"); -print("coeffs:"); -for i from 0 to deg do round(coeff(poly,i), 52, RN); +print("> 1/3"); +double(1/3); + +print("> P5"); +double(2/15); +double(1/10); +double(2/9); +double(2/45); + +print("> P6"); +double(1/42); +double(1/7); +double(2/21); +double(4/315); + +print("> Q"); +double( 5.0 / 4.0); +double( 6.0 / 5.0); +double( 7.0 / 6.0); +double( 8.0 / 7.0); +double( 9.0 / 8.0); +double(10.0 / 9.0); + +print("> R"); +double(-2.0 * 4.0 / (5.0 * 6.0)); +double(-2.0 * 5.0 / (6.0 * 7.0)); +double(-2.0 * 6.0 / (7.0 * 8.0)); +double(-2.0 * 7.0 / (8.0 * 9.0)); +double(-2.0 * 8.0 / (9.0 * 10.0)); +double(-2.0 * 9.0 / (10.0 * 11.0)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya b/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya index 69c683647af7..1d7fc264d99d 100644 --- a/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya +++ b/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya @@ -1,31 +1,22 @@ -// polynomial for approximating erfc(x)*exp(x*x) +// tables and constants for approximating erfcf(x). // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception -deg = 15; // poly degree - -// interval bounds -a = 0x1.0p-26; -b = 2; - -f = proc(y) { - return erfc(y) * exp(y*y); -}; - -approx = proc(poly, d) { - return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); -}; - -poly = 0; -for i from 0 to deg do { - p = roundcoefficients(approx(poly,i), [|D ...|]); - poly = poly + x^i*coeff(p,0); - print(i); -}; - display = hexadecimal; -print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); -print("in [",a,b,"]"); -print("coeffs:"); -for i from 0 to deg do coeff(poly,i); +prec=128; + +// Tables +print("{ i, r, erfc(r), 2/sqrt(pi) * exp(-r^2) }"); +for i from 0 to 644 do { + r = 0.0 + i / 64; + t0 = single(erfc(r) * 2^47); + t1 = single(2/sqrt(pi) * exp(-r * r) * 2^47); + print("{ " @ t0 @ ",\t" @ t1 @ " },"); +}; + +// Constants +single(1/3); +single(2/15); +single(1/10); +single(2/sqrt(pi)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erff.sollya b/contrib/arm-optimized-routines/pl/math/tools/erff.sollya new file mode 100644 index 000000000000..59b23ef021f0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/erff.sollya @@ -0,0 +1,20 @@ +// tables and constants for approximating erff(x). +// +// Copyright (c) 2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +display = hexadecimal; +prec=128; + +// Tables +print("{ i, r, erf(r), 2/sqrt(pi) * exp(-r^2)}"); +for i from 0 to 512 do { + r = i / 128; + t0 = single(erf(r)); + t1 = single(2/sqrt(pi) * exp(-r * r)); + print("{ " @ i @ ",\t" @ r @ ",\t" @ t0 @ ",\t" @ t1 @ " },"); +}; + +// Constants +single(1/3); +single(2/sqrt(pi)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/exp10.sollya b/contrib/arm-optimized-routines/pl/math/tools/exp10.sollya new file mode 100644 index 000000000000..9f30b4018209 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/exp10.sollya @@ -0,0 +1,55 @@ +// polynomial for approximating 10^x +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// exp10f parameters +deg = 5; // poly degree +N = 1; // Neon 1, SVE 64 +b = log(2)/(2 * N * log(10)); // interval +a = -b; +wp = single; + +// exp10 parameters +//deg = 4; // poly degree - bump to 5 for ~1 ULP +//N = 128; // table size +//b = log(2)/(2 * N * log(10)); // interval +//a = -b; +//wp = D; + + +// find polynomial with minimal relative error + +f = 10^x; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; +// return p that minimizes |f(x) - poly(x) - x^d*p(x)| +approx_abs = proc(poly,d) { + return remez(f(x) - poly(x), deg-d, [a;b], x^d, 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = 1; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|wp ...|]); +// p = roundcoefficients(approx_abs(poly,i), [|wp ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/10^x, [a;b], 30)); +print("abs error:", accurateinfnorm(10^x-poly(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); + +log10_2 = round(N * log(10) / log(2), wp, RN); +log2_10 = log(2) / (N * log(10)); +log2_10_hi = round(log2_10, wp, RN); +log2_10_lo = round(log2_10 - log2_10_hi, wp, RN); +print(log10_2); +print(log2_10_hi); +print(log2_10_lo); diff --git a/contrib/arm-optimized-routines/pl/math/tools/sincos.sollya b/contrib/arm-optimized-routines/pl/math/tools/sincos.sollya new file mode 100644 index 000000000000..7d36266b446b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/sincos.sollya @@ -0,0 +1,33 @@ +// polynomial for approximating cos(x) +// +// Copyright (c) 2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// This script only finds the coeffs for cos - see math/aarch64/v_sin.c for sin coeffs + +deg = 14; // polynomial degree +a = -pi/4; // interval +b = pi/4; + +// find even polynomial with minimal abs error compared to cos(x) + +f = cos(x); + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)| +approx = proc(poly,d) { + return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = 1; +for i from 1 to deg/2 do { + p = roundcoefficients(approx(poly,2*i), [|double ...|]); + poly = poly + x^(2*i)*coeff(p,0); +}; + +display = hexadecimal; +//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +//print("abs error:", accurateinfnorm(f(x)-poly(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/sincosf.sollya b/contrib/arm-optimized-routines/pl/math/tools/sincosf.sollya new file mode 100644 index 000000000000..178ee83ac196 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/sincosf.sollya @@ -0,0 +1,33 @@ +// polynomial for approximating cos(x) +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// This script only finds the coeffs for cos - see math/tools/sin.sollya for sin coeffs. + +deg = 8; // polynomial degree +a = -pi/4; // interval +b = pi/4; + +// find even polynomial with minimal abs error compared to cos(x) + +f = cos(x); + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)| +approx = proc(poly,d) { + return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = 1; +for i from 1 to deg/2 do { + p = roundcoefficients(approx(poly,2*i), [|single ...|]); + poly = poly + x^(2*i)*coeff(p,0); +}; + +display = hexadecimal; +//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +//print("abs error:", accurateinfnorm(f(x)-poly(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/sinpi.sollya b/contrib/arm-optimized-routines/pl/math/tools/sinpi.sollya new file mode 100644 index 000000000000..62cc87e7697d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/sinpi.sollya @@ -0,0 +1,33 @@ +// polynomial for approximating sinpi(x) +// +// Copyright (c) 2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 19; // polynomial degree +a = -1/2; // interval +b = 1/2; + +// find odd polynomial with minimal abs error compared to sinpi(x) + +// f = sin(pi* x); +f = pi*x; +c = 1; +for i from 1 to 80 do { c = 2*i*(2*i + 1)*c; f = f + (-1)^i*(pi*x)^(2*i+1)/c; }; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)| +approx = proc(poly,d) { + return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10); +}; + +// first coeff is predefined, iteratively find optimal double prec coeffs +poly = pi*x; +for i from 0 to (deg-1)/2 do { + p = roundcoefficients(approx(poly,2*i+1), [|D ...|]); + poly = poly + x^(2*i+1)*coeff(p,0); +}; + +display = hexadecimal; +print("abs error:", accurateinfnorm(sin(pi*x)-poly(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/trigpi_references.c b/contrib/arm-optimized-routines/pl/math/trigpi_references.c new file mode 100644 index 000000000000..4b0514b6766a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/trigpi_references.c @@ -0,0 +1,57 @@ +/* + * Extended precision scalar reference functions for trigpi. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include "math_config.h" +#include "mathlib.h" + +long double +sinpil (long double x) +{ + /* sin(inf) should return nan, as defined by C23. */ + if (isinf (x)) + return __math_invalid (x); + + long double ax = fabsl (x); + + /* Return 0 for all values above 2^64 to prevent + overflow when casting to uint64_t. */ + if (ax >= 0x1p64) + return 0; + + /* All integer cases should return 0. */ + if (ax == (uint64_t) ax) + return 0; + + return sinl (x * M_PIl); +} + +long double +cospil (long double x) +{ + /* cos(inf) should return nan, as defined by C23. */ + if (isinf (x)) + return __math_invalid (x); + + long double ax = fabsl (x); + + if (ax >= 0x1p64) + return 1; + + uint64_t m = (uint64_t) ax; + + /* Integer values of cospi(x) should return +/-1. + The sign depends on whether x is odd or even. */ + if (m == ax) + return (m & 1) ? -1 : 1; + + /* Values of integer + 0.5 should always return 0. */ + if (ax - 0.5 == m || ax + 0.5 == m) + return 0; + + return cosl (ax * M_PIl); +} \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/v_acos_2u.c b/contrib/arm-optimized-routines/pl/math/v_acos_2u.c new file mode 100644 index 000000000000..581f8506c0d6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_acos_2u.c @@ -0,0 +1,122 @@ +/* + * Double-precision vector acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64x2_t poly[12]; + float64x2_t pi, pi_over_2; + uint64x2_t abs_mask; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57.
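+      These coefficients appear to match the output of the tools/asin.sollya + script added earlier in this patch (deg = 11 on [0x1p-106, 0.25]); acos + reuses the asin polynomial since acos(x) = pi/2 - asin(x).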
*/ + .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), + V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), + V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), + V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), + V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), + V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, + .pi = V2 (0x1.921fb54442d18p+1), + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .abs_mask = V2 (0x7fffffffffffffff), +}; + +#define AllMask v_u64 (0xffffffffffffffff) +#define Oneu (0x3ff0000000000000) +#define Small (0x3e50000000000000) /* 2^-26. */ + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (acos, x, y, special); +} +#endif + +/* Double-precision implementation of vector acos(x). + + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct + rounding. + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following + approximation. + + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.18 ulps, + _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 + want 0x1.0d54d1985c069p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.52 ulps, + _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 + want 0x1.edbbedf8a7d6cp-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + +#if WANT_SIMD_EXCEPT + /* A single comparison for One, Small and QNaN. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)), + v_u64 (Oneu - Small)); + if (unlikely (v_any_u64 (special))) + return special_case (x, x, AllMask); +#endif + + uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x), + vfmaq_f64 (v_f64 (0.5), v_f64 (-0.5), ax)); + float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5.
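+      For example, a lane with x = -0.7 has a_le_half false and is_neg true, + so below mul = 2, add = pi and y = -Q(0.7), giving pi - 2 Q(0.7) as + required.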
*/ + float64x2_t y = vbslq_f64 (d->abs_mask, p, x); + + uint64x2_t is_neg = vcltzq_f64 (x); + float64x2_t off = vreinterpretq_f64_u64 ( + vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi))); + float64x2_t mul = vbslq_f64 (a_le_half, v_f64 (-1.0), v_f64 (2.0)); + float64x2_t add = vbslq_f64 (a_le_half, d->pi_over_2, off); + + return vfmaq_f64 (add, mul, y); +} + +PL_SIG (V, D, 1, acos, -1.0, 1.0) +PL_TEST_ULP (V_NAME_D1 (acos), 1.02) +PL_TEST_EXPECT_FENV (V_NAME_D1 (acos), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_acosf_1u4.c b/contrib/arm-optimized-routines/pl/math/v_acosf_1u4.c new file mode 100644 index 000000000000..bb17b1df18f3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_acosf_1u4.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t pi_over_2f, pif; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, + .pi_over_2f = V4 (0x1.921fb6p+0f), + .pif = V4 (0x1.921fb6p+1f), +}; + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Small 0x32800000 /* 2^-26. */ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (acosf, x, y, special); +} +#endif + +/* Single-precision implementation of vector acos(x). + + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct + rounding. + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following + approximation. + + For |x| in [Small, 0.5], use order 4 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.26 ulps, + _ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.32 ulps, + _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1 + want 0x1.feb32ep-1. */ +float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + +#if WANT_SIMD_EXCEPT + /* A single comparison for One, Small and QNaN. 
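+     The unsigned subtraction ia - Small wraps around for |x| < Small, so the + single vcgtq_u32 against One - Small flags tiny, out-of-domain and NaN + lanes at once.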
*/ + uint32x4_t special + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (0xffffffff)); +#endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float32x4_t p = v_horner_4_f32 (z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */ + float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x); + + uint32x4_t is_neg = vcltzq_f32 (x); + float32x4_t off = vreinterpretq_f32_u32 ( + vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg)); + float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-1.0), v_f32 (2.0)); + float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off); + + return vfmaq_f32 (add, mul, y); +} + +PL_SIG (V, F, 1, acos, -1.0, 1.0) +PL_TEST_ULP (V_NAME_F1 (acos), 0.82) +PL_TEST_EXPECT_FENV (V_NAME_F1 (acos), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c index 22f69d7636e4..42fa2616d562 100644 --- a/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c @@ -11,41 +11,56 @@ #define WANT_V_LOG1P_K0_SHORTCUT 1 #include "v_log1p_inline.h" -#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */ - -#if V_SUPPORTED - -static NOINLINE VPCS_ATTR v_f64_t -special_case (v_f64_t x) +const static struct data { - return v_call_f64 (acosh, x, x, v_u64 (-1)); + struct v_log1p_data log1p_consts; + uint64x2_t one, thresh; +} data = { + .log1p_consts = V_LOG1P_CONSTANTS_TABLE, + .one = V2 (0x3ff0000000000000), + .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */ +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special, + const struct v_log1p_data *d) +{ + return v_call_f64 (acosh, x, log1p_inline (y, d), special); } /* Vector approximation for double-precision acosh, based on log1p. The largest observed error is 3.02 ULP in the region where the argument to log1p falls in the k=0 interval, i.e. x close to 1: - __v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5 - want 0x1.f2d6d823bc9e2p-5. */ -VPCS_ATTR v_f64_t V_NAME (acosh) (v_f64_t x) + _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5 + want 0x1.f2d6d823bc9e2p-5. 
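+   The identity used is acosh(x) = log1p (x - 1 + sqrt ((x - 1) * (x + 1))); + the argument to log1p is built below as xm1 + sqrt (xm1 * (x + 1)).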
*/ +VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) { - v_u64_t itop = v_as_u64_f64 (x) >> 52; - v_u64_t special = v_cond_u64 ((itop - OneTop) >= (BigBoundTop - OneTop)); + const struct data *d = ptr_barrier (&data); + uint64x2_t special + = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh); + float64x2_t special_arg = x; - /* Fall back to scalar routine for all lanes if any of them are special. */ +#if WANT_SIMD_EXCEPT if (unlikely (v_any_u64 (special))) - return special_case (x); + x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x); +#endif - v_f64_t xm1 = x - 1; - v_f64_t u = xm1 * (x + 1); - return log1p_inline (xm1 + v_sqrt_f64 (u)); + float64x2_t xm1 = vsubq_f64 (x, v_f64 (1)); + float64x2_t y; + y = vaddq_f64 (x, v_f64 (1)); + y = vmulq_f64 (y, xm1); + y = vsqrtq_f64 (y); + y = vaddq_f64 (xm1, y); + + if (unlikely (v_any_u64 (special))) + return special_case (special_arg, y, special, &d->log1p_consts); + return log1p_inline (y, &d->log1p_consts); } -VPCS_ALIAS PL_SIG (V, D, 1, acosh, 1.0, 10.0) -PL_TEST_ULP (V_NAME (acosh), 2.53) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (acosh)) -PL_TEST_INTERVAL (V_NAME (acosh), 1, 0x1p511, 90000) -PL_TEST_INTERVAL (V_NAME (acosh), 0x1p511, inf, 10000) -PL_TEST_INTERVAL (V_NAME (acosh), 0, 1, 1000) -PL_TEST_INTERVAL (V_NAME (acosh), -0, -inf, 10000) -#endif +PL_TEST_ULP (V_NAME_D1 (acosh), 2.53) +PL_TEST_EXPECT_FENV (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000) +PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000) +PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000) +PL_TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c index 2b5aff591a74..a2ff0f02635b 100644 --- a/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c +++ b/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c @@ -7,19 +7,26 @@ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" - -#define SignMask 0x80000000 -#define One 0x3f800000 -#define SquareLim 0x5f800000 /* asuint(0x1p64). */ - -#if V_SUPPORTED - #include "v_log1pf_inline.h" -static NOINLINE VPCS_ATTR v_f32_t -special_case (v_f32_t x, v_f32_t y, v_u32_t special) +const static struct data { - return v_call_f32 (acoshf, x, y, special); + struct v_log1pf_data log1pf_consts; + uint32x4_t one; + uint16x4_t thresh; +} data = { + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .one = V4 (0x3f800000), + .thresh = V4 (0x2000) /* asuint(0x1p64) - asuint(1). */ +}; + +#define SignMask 0x80000000 + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint16x4_t special, + const struct v_log1pf_data d) +{ + return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special)); } /* Vector approximation for single-precision acosh, based on log1p. Maximum @@ -32,37 +39,40 @@ special_case (v_f32_t x, v_f32_t y, v_u32_t special) __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4 want 0x1.fbc7f4p-4. */ -VPCS_ATTR v_f32_t V_NAME (acoshf) (v_f32_t x) +VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t special = v_cond_u32 ((ix - One) >= (SquareLim - One)); + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh); #if WANT_SIMD_EXCEPT /* Mask special lanes with 1 to side-step spurious invalid or overflow. 
Use - only xm1 to calculate u, as operating on x will trigger invalid for NaN. */ - v_f32_t xm1 = v_sel_f32 (special, v_f32 (1), x - 1); - v_f32_t u = v_fma_f32 (xm1, xm1, 2 * xm1); + only xm1 to calculate u, as operating on x will trigger invalid for NaN. + Widening sign-extend special predicate in order to mask with it. */ + uint32x4_t p + = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special))); + float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p); + float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1); #else - v_f32_t xm1 = x - 1; - v_f32_t u = xm1 * (x + 1.0f); + float32x4_t xm1 = vsubq_f32 (x, v_f32 (1)); + float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f))); #endif - v_f32_t y = log1pf_inline (xm1 + v_sqrt_f32 (u)); - if (unlikely (v_any_u32 (special))) - return special_case (x, y, special); - return y; + float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, special, d->log1pf_consts); + return log1pf_inline (y, d->log1pf_consts); } -VPCS_ALIAS PL_SIG (V, F, 1, acosh, 1.0, 10.0) #if WANT_SIMD_EXCEPT -PL_TEST_ULP (V_NAME (acoshf), 2.29) +PL_TEST_ULP (V_NAME_F1 (acosh), 2.29) #else -PL_TEST_ULP (V_NAME (acoshf), 2.58) -#endif -PL_TEST_EXPECT_FENV (V_NAME (acoshf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (acoshf), 0, 1, 500) -PL_TEST_INTERVAL (V_NAME (acoshf), 1, SquareLim, 100000) -PL_TEST_INTERVAL (V_NAME (acoshf), SquareLim, inf, 1000) -PL_TEST_INTERVAL (V_NAME (acoshf), -0, -inf, 1000) +PL_TEST_ULP (V_NAME_F1 (acosh), 2.58) #endif +PL_TEST_EXPECT_FENV (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500) +PL_TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000) +PL_TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000) +PL_TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_asin_3u.c b/contrib/arm-optimized-routines/pl/math/v_asin_3u.c new file mode 100644 index 000000000000..756443c6b320 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_asin_3u.c @@ -0,0 +1,113 @@ +/* + * Double-precision vector asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64x2_t poly[12]; + float64x2_t pi_over_2; + uint64x2_t abs_mask; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), + V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), + V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), + V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), + V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), + V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .abs_mask = V2 (0x7fffffffffffffff), +}; + +#define AllMask v_u64 (0xffffffffffffffff) +#define One (0x3ff0000000000000) +#define Small (0x3e50000000000000) /* 2^-26. */ + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (asin, x, y, special); } +#endif + +/* Double-precision implementation of vector asin(x). + + For |x| < Small, approximate asin(x) by x. Small = 2^-26 for correct + rounding.
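+   (Correct rounding follows from asin(x) - x ~ x^3 / 6: the relative error + of this approximation is about x^2 / 6 < 2^-53 for such x.)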
If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the + following approximation. + + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 1.01 ulps, + _ZGVnN2v_asin (0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2 + want 0x1.ed78525a927eep-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.69 ulps, + _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 + want 0x1.110d7e85fdd53p-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + +#if WANT_SIMD_EXCEPT + /* Special values need to be computed with scalar fallbacks so + that appropriate exceptions are raised. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)), + v_u64 (One - Small)); + if (unlikely (v_any_u64 (special))) + return special_case (x, x, AllMask); +#endif + + uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x), + vfmsq_n_f64 (v_f64 (0.5), ax, 0.5)); + float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0)); + + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); +} + +PL_SIG (V, D, 1, asin, -1.0, 1.0) +PL_TEST_ULP (V_NAME_D1 (asin), 2.19) +PL_TEST_EXPECT_FENV (V_NAME_D1 (asin), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_asinf_2u5.c b/contrib/arm-optimized-routines/pl/math/v_asinf_2u5.c new file mode 100644 index 000000000000..eb978cd956ab --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_asinf_2u5.c @@ -0,0 +1,104 @@ +/* + * Single-precision vector asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . 
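+     Coefficients appear to match the output of the tools/asinf.sollya script + added earlier in this patch (deg = 4 on [0x1p-24, 0.25]).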
*/ + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, + .pi_over_2f = V4 (0x1.921fb6p+0f), +}; + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Small 0x39800000 /* 2^-12. */ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (asinf, x, y, special); +} +#endif + +/* Single-precision implementation of vector asin(x). + + For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct + rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the + following approximation. + + For |x| in [Small, 0.5], use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, + _ZGVnN4v_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.41 ulps, + _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */ +float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + +#if WANT_SIMD_EXCEPT + /* Special values need to be computed with scalar fallbacks so + that appropriate fp exceptions are raised. */ + uint32x4_t special + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (0xffffffff)); +#endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float32x4_t p = v_horner_4_f32 (z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float32x4_t y + = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0)); + + /* Copy sign. 
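+     vbslq_f32 takes the magnitude bits from y and the sign bit from x, so + e.g. asinf(-0.5) == -asinf(0.5).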
*/ + return vbslq_f32 (v_u32 (AbsMask), y, x); +} + +PL_SIG (V, F, 1, asin, -1.0, 1.0) +PL_TEST_ULP (V_NAME_F1 (asin), 1.91) +PL_TEST_EXPECT_FENV (V_NAME_F1 (asin), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c index fd329b6b7f69..4862bef94861 100644 --- a/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c @@ -6,75 +6,81 @@ */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +#define A(i) v_f64 (__v_log_data.poly[i]) +#define N (1 << V_LOG_TABLE_BITS) -#define OneTop 0x3ff /* top12(asuint64(1.0f)). */ -#define HugeBound 0x5fe /* top12(asuint64(0x1p511)). */ -#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)). */ -#define AbsMask v_u64 (0x7fffffffffffffff) -#define C(i) v_f64 (__asinh_data.poly[i]) +const static struct data +{ + float64x2_t poly[18]; + uint64x2_t off, huge_bound, abs_mask; + float64x2_t ln2, tiny_bound; +} data = { + .off = V2 (0x3fe6900900000000), + .ln2 = V2 (0x1.62e42fefa39efp-1), + .huge_bound = V2 (0x5fe0000000000000), + .tiny_bound = V2 (0x1p-26), + .abs_mask = V2 (0x7fffffffffffffff), + /* Even terms of polynomial s.t. asinh(x) is approximated by + asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). + Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */ + .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4), + V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6), + V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6), + V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7), + V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7), + V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8), + V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9), + V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12), + V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) }, +}; -/* Constants & data for log. 
*/ -#define OFF 0x3fe6000000000000 -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) -#define A(i) v_f64 (__sv_log_data.poly[i]) -#define T(i) __log_data.tab[i] -#define N (1 << LOG_TABLE_BITS) - -static NOINLINE v_f64_t -special_case (v_f64_t x, v_f64_t y, v_u64_t special) +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) { return v_call_f64 (asinh, x, y, special); } struct entry { - v_f64_t invc; - v_f64_t logc; + float64x2_t invc; + float64x2_t logc; }; static inline struct entry -lookup (v_u64_t i) +lookup (uint64x2_t i) { - struct entry e; -#ifdef SCALAR - e.invc = T (i).invc; - e.logc = T (i).logc; -#else - e.invc[0] = T (i[0]).invc; - e.logc[0] = T (i[0]).logc; - e.invc[1] = T (i[1]).invc; - e.logc[1] = T (i[1]).logc; -#endif - return e; + float64x2_t e0 = vld1q_f64 ( + &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); + float64x2_t e1 = vld1q_f64 ( + &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); + return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) }; } -static inline v_f64_t -log_inline (v_f64_t x) +static inline float64x2_t +log_inline (float64x2_t x, const struct data *d) { - /* Double-precision vector log, copied from math/v_log.c with some cosmetic - modification and special-cases removed. See that file for details of the - algorithm used. */ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t tmp = ix - OFF; - v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N; - v_s64_t k = v_as_s64_u64 (tmp) >> 52; - v_u64_t iz = ix - (tmp & 0xfffULL << 52); - v_f64_t z = v_as_f64_u64 (iz); - struct entry e = lookup (i); - v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - v_f64_t kd = v_to_f64_s64 (k); - v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r); - v_f64_t r2 = r * r; - v_f64_t y = v_fma_f64 (A (3), r, A (2)); - v_f64_t p = v_fma_f64 (A (1), r, A (0)); - y = v_fma_f64 (A (4), r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); + /* Double-precision vector log, copied from ordinary vector log with some + cosmetic modification and special-cases removed. */ + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t tmp = vsubq_u64 (ix, d->off); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz + = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52))); + float64x2_t z = vreinterpretq_f64_u64 (iz); + struct entry e = lookup (tmp); + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = vfmaq_f64 (A (2), A (3), r); + float64x2_t p = vfmaq_f64 (A (0), A (1), r); + y = vfmaq_f64 (y, A (4), r2); + y = vfmaq_f64 (p, y, r2); + y = vfmaq_f64 (hi, y, r2); return y; } @@ -89,34 +95,35 @@ log_inline (v_f64_t x) |x| >= 1: __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1 want 0x1.ffffcfd0e2352p-1. 
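+   Lanes with |x| >= 1 take option 1 below, log(x + sqrt(x^2 + 1)) via the + inlined log, while lanes with |x| < 1 take option 2, the odd polynomial + x + x^3 * P(x^2); the two results are then merged per lane.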
*/ -VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) +VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iax = ix & AbsMask; - v_f64_t ax = v_as_f64_u64 (iax); - v_u64_t top12 = iax >> 52; + const struct data *d = ptr_barrier (&data); - v_u64_t gt1 = v_cond_u64 (top12 >= OneTop); - v_u64_t special = v_cond_u64 (top12 >= HugeBound); + float64x2_t ax = vabsq_f64 (x); + uint64x2_t iax = vreinterpretq_u64_f64 (ax); + + uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); + uint64x2_t special = vcgeq_u64 (iax, d->huge_bound); #if WANT_SIMD_EXCEPT - v_u64_t tiny = v_cond_u64 (top12 < TinyBound); - special |= tiny; + uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); + special = vorrq_u64 (special, tiny); #endif /* Option 1: |x| >= 1. Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)). If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will overflow, by setting special lanes to 1. These will be fixed later. */ - v_f64_t option_1 = v_f64 (0); + float64x2_t option_1 = v_f64 (0); if (likely (v_any_u64 (gt1))) { #if WANT_SIMD_EXCEPT - v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax); + float64x2_t xm = v_zerofy_f64 (ax, special); #else - v_f64_t xm = ax; + float64x2_t xm = ax; #endif - option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1)); + option_1 = log_inline ( + vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d); } /* Option 2: |x| < 1. @@ -127,49 +134,42 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) special-case. The largest observed error in this region is 1.47 ULPs: __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 want 0x1.c1d6bf874019cp-1. */ - v_f64_t option_2 = v_f64 (0); - if (likely (v_any_u64 (~gt1))) + float64x2_t option_2 = v_f64 (0); + if (likely (v_any_u64 (vceqzq_u64 (gt1)))) { #if WANT_SIMD_EXCEPT - ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); + ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); #endif - v_f64_t x2 = ax * ax; - v_f64_t z2 = x2 * x2; - v_f64_t z4 = z2 * z2; - v_f64_t z8 = z4 * z4; - v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); - option_2 = v_fma_f64 (p, x2 * ax, ax); + float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2), + z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2), + z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8); + float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly); + option_2 = vfmaq_f64 (ax, p, x3); #if WANT_SIMD_EXCEPT - option_2 = v_sel_f64 (tiny, x, option_2); + option_2 = vbslq_f64 (tiny, x, option_2); #endif } /* Choose the right option for each lane. */ - v_f64_t y = v_sel_f64 (gt1, option_1, option_2); + float64x2_t y = vbslq_f64 (gt1, option_1, option_2); /* Copy sign. */ - y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix)); + y = vbslq_f64 (d->abs_mask, y, x); if (unlikely (v_any_u64 (special))) return special_case (x, y, special); return y; } -VPCS_ALIAS PL_SIG (V, D, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (asinh), 2.80) -PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT) +PL_TEST_ULP (V_NAME_D1 (asinh), 2.80) +PL_TEST_EXPECT_FENV (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT) /* Test vector asinh 3 times, with control lane < 1, > 1 and special. Ensures the v_sel is choosing the right option in all cases.
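+   (The control values 0.5, 2 and 0x1p600 respectively exercise the + polynomial path, the log path and the special-case fallback.)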
*/ -#define V_ASINH_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5) \ - PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2) \ - PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600) +#define V_ASINH_INTERVAL(lo, hi, n) \ + PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0.5) \ + PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 2) \ + PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0x1p600) V_ASINH_INTERVAL (0, 0x1p-26, 50000) V_ASINH_INTERVAL (0x1p-26, 1, 50000) V_ASINH_INTERVAL (1, 0x1p511, 50000) V_ASINH_INTERVAL (0x1p511, inf, 40000) -V_ASINH_INTERVAL (-0, -0x1p-26, 50000) -V_ASINH_INTERVAL (-0x1p-26, -1, 50000) -V_ASINH_INTERVAL (-1, -0x1p511, 50000) -V_ASINH_INTERVAL (-0x1p511, -inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c index 9d8c8a936ae3..1723ba90d2f3 100644 --- a/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c +++ b/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c @@ -6,21 +6,29 @@ */ #include "v_math.h" -#include "include/mathlib.h" #include "pl_sig.h" #include "pl_test.h" - -#if V_SUPPORTED - -#define SignMask v_u32 (0x80000000) -#define One v_f32 (1.0f) -#define BigBound v_u32 (0x5f800000) /* asuint(0x1p64). */ -#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30). */ - #include "v_log1pf_inline.h" -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +#define SignMask v_u32 (0x80000000) + +const static struct data +{ + struct v_log1pf_data log1pf_consts; + uint32x4_t big_bound; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; +#endif +} data = { + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ +#if WANT_SIMD_EXCEPT + .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */ +#endif +}; + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { return v_call_f32 (asinhf, x, y, special); } @@ -28,43 +36,45 @@ specialcase (v_f32_t x, v_f32_t y, v_u32_t special) /* Single-precision implementation of vector asinh(x), using vector log1p. Worst-case error is 2.66 ULP, at roughly +/-0.25: __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ -VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) +VPCS_ATTR float32x4_t V_NAME_F1 (asinh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & ~SignMask; - v_u32_t sign = ix & SignMask; - v_f32_t ax = v_as_f32_u32 (iax); - v_u32_t special = v_cond_u32 (iax >= BigBound); + const struct data *dat = ptr_barrier (&data); + uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask); + float32x4_t ax = vreinterpretq_f32_u32 (iax); + uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); + float32x4_t special_arg = x; #if WANT_SIMD_EXCEPT /* Sidestep tiny and large values to avoid inadvertently triggering under/overflow. */ - special |= v_cond_u32 (iax < TinyBound); + special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound)); if (unlikely (v_any_u32 (special))) - ax = v_sel_f32 (special, One, ax); + { + ax = v_zerofy_f32 (ax, special); + x = v_zerofy_f32 (x, special); + } #endif /* asinh(x) = log(x + sqrt(x * x + 1)). For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). 
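+   The second form follows from sqrt(x*x + 1) - 1 = x*x / (1 + sqrt(x*x + 1)) + and avoids the cancellation that log(x + sqrt(x*x + 1)) suffers for + small x.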
*/ - v_f32_t d = One + v_sqrt_f32 (ax * ax + One); - v_f32_t y = log1pf_inline (ax + ax * ax / d); - y = v_as_f32_u32 (sign | v_as_u32_f32 (y)); + float32x4_t d + = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x))); + float32x4_t y = log1pf_inline ( + vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts); if (unlikely (v_any_u32 (special))) - return specialcase (x, y, special); - return y; + return special_case (special_arg, vbslq_f32 (SignMask, x, y), special); + return vbslq_f32 (SignMask, x, y); } -VPCS_ALIAS PL_SIG (V, F, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (asinhf), 2.17) -PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (asinhf), 0, 0x1p-12, 40000) -PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p-12, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (asinhf), 1.0, 0x1p11, 40000) -PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p11, inf, 40000) -PL_TEST_INTERVAL (V_NAME (asinhf), 0, -0x1p-12, 20000) -PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p-12, -1.0, 20000) -PL_TEST_INTERVAL (V_NAME (asinhf), -1.0, -0x1p11, 20000) -PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p11, -inf, 20000) -#endif +PL_TEST_ULP (V_NAME_F1 (asinh), 2.17) +PL_TEST_EXPECT_FENV (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c index 6327fea8eb2c..f24667682dec 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c @@ -8,83 +8,114 @@ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED +static const struct data +{ + float64x2_t pi_over_2; + float64x2_t poly[20]; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + the interval [2**-1022, 1.0]. */ + .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), + V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), + V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), + V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), + V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), + V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), + V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), + V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), + V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), + V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), +}; -#include "atan_common.h" - -#define PiOver2 v_f64 (0x1.921fb54442d18p+0) #define SignMask v_u64 (0x8000000000000000) /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t y, v_f64_t x, v_f64_t ret, v_u64_t cmp) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp) { return v_call2_f64 (atan2, y, x, ret, cmp); } /* Returns 1 if input is the bit representation of 0, infinity or nan. 
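The trick behind the zeroinfnan predicate described above is easiest to see in scalar form. A sketch, assuming IEEE doubles: doubling the bit pattern discards the sign, and the unsigned wrap of 2*i - 1 sends +/-0 to UINT64_MAX, so a single compare catches zero, infinity and NaN at once.

#include <stdint.h>
#include <string.h>

static int
zeroinfnan_sketch (double x)
{
  uint64_t i;
  memcpy (&i, &x, sizeof i);            /* asuint64 (x).  */
  uint64_t inf = 0x7ff0000000000000ULL; /* asuint64 (INFINITY).  */
  return 2 * i - 1 >= 2 * inf - 1;
}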
*/ -static inline v_u64_t -zeroinfnan (v_u64_t i) +static inline uint64x2_t +zeroinfnan (uint64x2_t i) { - return v_cond_u64 (2 * i - 1 >= v_u64 (2 * asuint64 (INFINITY) - 1)); + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), + v_u64 (2 * asuint64 (INFINITY) - 1)); } /* Fast implementation of vector atan2. Maximum observed error is 2.8 ulps: - v_atan2(0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) + _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) got 0x1.92d628ab678ccp-1 want 0x1.92d628ab678cfp-1. */ -VPCS_ATTR -v_f64_t V_NAME (atan2) (v_f64_t y, v_f64_t x) +float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iy = v_as_u64_f64 (y); + const struct data *data_ptr = ptr_barrier (&data); - v_u64_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t iy = vreinterpretq_u64_f64 (y); - v_u64_t sign_x = ix & SignMask; - v_u64_t sign_y = iy & SignMask; - v_u64_t sign_xy = sign_x ^ sign_y; + uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy)); - v_f64_t ax = v_abs_f64 (x); - v_f64_t ay = v_abs_f64 (y); + uint64x2_t sign_x = vandq_u64 (ix, SignMask); + uint64x2_t sign_y = vandq_u64 (iy, SignMask); + uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y); - v_u64_t pred_xlt0 = x < 0.0; - v_u64_t pred_aygtax = ay > ax; + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t pred_xlt0 = vcltzq_f64 (x); + uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax); /* Set up z for call to atan. */ - v_f64_t n = v_sel_f64 (pred_aygtax, -ax, ay); - v_f64_t d = v_sel_f64 (pred_aygtax, ay, ax); - v_f64_t z = v_div_f64 (n, d); + float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); + float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (n, d); /* Work out the correct shift. */ - v_f64_t shift = v_sel_f64 (pred_xlt0, v_f64 (-2.0), v_f64 (0.0)); - shift = v_sel_f64 (pred_aygtax, shift + 1.0, shift); - shift *= PiOver2; + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); + shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); + shift = vmulq_f64 (shift, data_ptr->pi_over_2); - v_f64_t ret = eval_poly (z, z, shift); + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. + The order 19 polynomial P approximates + (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t ret + = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly), + v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); + ret = vaddq_f64 (ret, shift); /* Account for the sign of x and y. */ - ret = v_as_f64_u64 (v_as_u64_f64 (ret) ^ sign_xy); + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); if (unlikely (v_any_u64 (special_cases))) - { - return specialcase (y, x, ret, special_cases); - } + return special_case (y, x, ret, special_cases); return ret; } -VPCS_ALIAS /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
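The quadrant bookkeeping in the atan2 body above reduces to a few scalar selects. An illustrative sketch, with libm atan() standing in for the split-Estrin polynomial:

#include <math.h>

static double
atan2_sketch (double y, double x)
{
  double pi_over_2 = 0x1.921fb54442d18p+0;
  double ax = fabs (x), ay = fabs (y);
  int aygtax = ay > ax;
  double z = aygtax ? -ax / ay : ay / ax; /* z in [-1, 1].  */
  double shift = (x < 0.0 ? -2.0 : 0.0) + (aygtax ? 1.0 : 0.0);
  double ret = shift * pi_over_2 + atan (z);
  /* Account for the sign of x and y (an XOR of sign bits above).  */
  double sign_xy = copysign (1.0, x) * copysign (1.0, y);
  return sign_xy < 0.0 ? -ret : ret;
}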
*/ PL_SIG (V, D, 2, atan2) // TODO tighten this once __v_atan2 is fixed -PL_TEST_ULP (V_NAME (atan2), 2.9) -PL_TEST_INTERVAL (V_NAME (atan2), -10.0, 10.0, 50000) -PL_TEST_INTERVAL (V_NAME (atan2), -1.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2), 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (V_NAME_D2 (atan2), 2.9) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c index 5d1e6ca4488e..bbfc3cb552f6 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c @@ -8,82 +8,108 @@ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f32.h" -#if V_SUPPORTED +static const struct data +{ + float32x4_t poly[8]; + float32x4_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ + .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), + V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), + V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, + .pi_over_2 = V4 (0x1.921fb6p+0f), +}; -#include "atanf_common.h" - -/* Useful constants. */ -#define PiOver2 v_f32 (0x1.921fb6p+0f) #define SignMask v_u32 (0x80000000) /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp) { return v_call2_f32 (atan2f, y, x, ret, cmp); } /* Returns 1 if input is the bit representation of 0, infinity or nan. */ -static inline v_u32_t -zeroinfnan (v_u32_t i) +static inline uint32x4_t +zeroinfnan (uint32x4_t i) { - return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1)); + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ + return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), + v_u32 (2 * 0x7f800000lu - 1)); } /* Fast implementation of vector atan2f. Maximum observed error is 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: - v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ -VPCS_ATTR -v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x) + _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. 
*/ +float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iy = v_as_u32_f32 (y); + const struct data *data_ptr = ptr_barrier (&data); - v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t iy = vreinterpretq_u32_f32 (y); - v_u32_t sign_x = ix & SignMask; - v_u32_t sign_y = iy & SignMask; - v_u32_t sign_xy = sign_x ^ sign_y; + uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy)); - v_f32_t ax = v_abs_f32 (x); - v_f32_t ay = v_abs_f32 (y); + uint32x4_t sign_x = vandq_u32 (ix, SignMask); + uint32x4_t sign_y = vandq_u32 (iy, SignMask); + uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y); - v_u32_t pred_xlt0 = x < 0.0f; - v_u32_t pred_aygtax = ay > ax; + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t pred_xlt0 = vcltzq_f32 (x); + uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); /* Set up z for call to atanf. */ - v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay); - v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax); - v_f32_t z = v_div_f32 (n, d); + float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); + float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (n, d); /* Work out the correct shift. */ - v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f)); - shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift); - shift *= PiOver2; + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); + shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); + shift = vmulq_f32 (shift, data_ptr->pi_over_2); - v_f32_t ret = eval_poly (z, z, shift); + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. */ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + + float32x4_t ret = vfmaq_f32 ( + v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4, + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4))); + + /* y = shift + z * P(z^2). */ + ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); /* Account for the sign of y. */ - ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy); + ret = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); if (unlikely (v_any_u32 (special_cases))) { - return specialcase (y, x, ret, special_cases); + return special_case (y, x, ret, special_cases); } return ret; } -VPCS_ALIAS /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
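The split Estrin evaluation used by both atan2f and atanf can be written out in scalar form. A sketch, where c[] stands for the poly[] coefficients above: both halves are evaluated pairwise in z2 and z4, and the high half is scaled by z4 twice so that z8 is never formed on its own, which is the term that may spuriously underflow.

static float
estrin_7_split_sketch (const float *c, float z2)
{
  float z4 = z2 * z2;
  float lo = (c[0] + c[1] * z2) + z4 * (c[2] + c[3] * z2);
  float hi = (c[4] + c[5] * z2) + z4 * (c[6] + c[7] * z2);
  /* Multiply by z4, then by z4 again, rather than forming z8.  */
  return lo + z4 * (z4 * hi);
}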
*/ PL_SIG (V, F, 2, atan2) -PL_TEST_ULP (V_NAME (atan2f), 2.46) -PL_TEST_INTERVAL (V_NAME (atan2f), -10.0, 10.0, 50000) -PL_TEST_INTERVAL (V_NAME (atan2f), -1.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2f), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2f), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2f), 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (V_NAME_F2 (atan2), 2.46) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c index 0f3c2ccf2606..ba68cc3cc720 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c @@ -8,33 +8,51 @@ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED +static const struct data +{ + float64x2_t pi_over_2; + float64x2_t poly[20]; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), + V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), + V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), + V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), + V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), + V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), + V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), + V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), + V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), + V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), +}; -#include "atan_common.h" - -#define PiOver2 v_f64 (0x1.921fb54442d18p+0) -#define AbsMask v_u64 (0x7fffffffffffffff) -#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */ -#define BigBound 0x434 /* top12(asuint64(0x1p53)). */ +#define SignMask v_u64 (0x8000000000000000) +#define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */ +#define BigBound 0x4340000000000000 /* asuint64(0x1p53). */ /* Fast implementation of vector atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: - __v_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ -VPCS_ATTR -v_f64_t V_NAME (atan) (v_f64_t x) + _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) { + const struct data *d = ptr_barrier (&data); + /* Small cases, infs and nans are supported by our approximation technique, but do not set fenv flags correctly. Only trigger special case if we need fenv. */ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t sign = ix & ~AbsMask; + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t sign = vandq_u64 (ix, SignMask); #if WANT_SIMD_EXCEPT - v_u64_t ia12 = (ix >> 52) & 0x7ff; - v_u64_t special = v_cond_u64 (ia12 - TinyBound > BigBound - TinyBound); + uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000)); + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)), + v_u64 (BigBound - TinyBound)); /* If any lane is special, fall back to the scalar routine for all lanes. 
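The single-compare range test just above is a common idiom worth unpacking. A scalar sketch, assuming IEEE doubles: after subtracting TinyBound, exponent fields below it wrap around to huge unsigned values, so one unsigned compare flags both the tiny and the large/non-finite cases.

#include <stdint.h>
#include <string.h>

static int
atan_special_sketch (double x)
{
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint64_t ia12 = ix & 0x7ff0000000000000ULL; /* exponent bits.  */
  uint64_t tiny = 0x3e10000000000000ULL;      /* asuint64 (0x1p-30).  */
  uint64_t big = 0x4340000000000000ULL;       /* asuint64 (0x1p53).  */
  return ia12 - tiny > big - tiny;
}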
*/ if (unlikely (v_any_u64 (special))) return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1)); @@ -44,31 +62,43 @@ v_f64_t V_NAME (atan) (v_f64_t x) y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - v_u64_t red = v_cagt_f64 (x, v_f64 (1.0)); + uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0)); /* Avoid dependency in abs(x) in division (and comparison). */ - v_f64_t z = v_sel_f64 (red, v_div_f64 (v_f64 (-1.0), x), x); - v_f64_t shift = v_sel_f64 (red, PiOver2, v_f64 (0.0)); + float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x); + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2))); /* Use absolute value only when needed (odd powers of z). */ - v_f64_t az = v_abs_f64 (z); - az = v_sel_f64 (red, -az, az); + float64x2_t az = vbslq_f64 ( + SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z); - /* Calculate the polynomial approximation. */ - v_f64_t y = eval_poly (z, az, shift); + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. + The order 19 polynomial P approximates + (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t y + = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly), + v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); + y = vaddq_f64 (y, shift); /* y = atan(x) if x>0, -atan(-x) otherwise. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign)); return y; } -VPCS_ALIAS PL_SIG (V, D, 1, atan, -10.0, 10.0) -PL_TEST_ULP (V_NAME (atan), 1.78) -PL_TEST_EXPECT_FENV (V_NAME (atan), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (atan), 0, 0x1p-30, 10000) -PL_TEST_INTERVAL (V_NAME (atan), -0, -0x1p-30, 1000) -PL_TEST_INTERVAL (V_NAME (atan), 0x1p-30, 0x1p53, 900000) -PL_TEST_INTERVAL (V_NAME (atan), -0x1p-30, -0x1p53, 90000) -PL_TEST_INTERVAL (V_NAME (atan), 0x1p53, inf, 10000) -PL_TEST_INTERVAL (V_NAME (atan), -0x1p53, -inf, 1000) - -#endif +PL_TEST_ULP (V_NAME_D1 (atan), 1.78) +PL_TEST_EXPECT_FENV (V_NAME_D1 (atan), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c index 67d90b94f5d3..f522d957c1cc 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c @@ -8,19 +8,32 @@ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f32.h" -#if V_SUPPORTED +static const struct data +{ + float32x4_t poly[8]; + float32x4_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. 
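The argument reduction in the atan body above has a compact scalar equivalent. A sketch for x >= 0 (the vector code folds the sign back in with an XOR at the end), with libm atan() standing in for the polynomial:

#include <math.h>

static double
atan_reduce_sketch (double x)
{
  int red = x > 1.0;
  double z = red ? -1.0 / x : x;                   /* z in [-1, 1].  */
  double shift = red ? 0x1.921fb54442d18p+0 : 0.0; /* pi/2 or 0.  */
  return shift + atan (z);        /* shift + z + z^3 * P(z^2).  */
}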
*/ + .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), + V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), + V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, + .pi_over_2 = V4 (0x1.921fb6p+0f), +}; -#include "atanf_common.h" +#define SignMask v_u32 (0x80000000) -#define PiOver2 v_f32 (0x1.921fb6p+0f) -#define AbsMask v_u32 (0x7fffffff) -#define TinyBound 0x308 /* top12(asuint(0x1p-30)). */ -#define BigBound 0x4e8 /* top12(asuint(0x1p30)). */ +#define P(i) d->poly[i] + +#define TinyBound 0x30800000 /* asuint(0x1p-30). */ +#define BigBound 0x4e800000 /* asuint(0x1p30). */ #if WANT_SIMD_EXCEPT -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { return v_call_f32 (atanf, x, y, special); } @@ -29,55 +42,66 @@ specialcase (v_f32_t x, v_f32_t y, v_u32_t special) /* Fast implementation of vector atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: - v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ -VPCS_ATTR -v_f32_t V_NAME (atanf) (v_f32_t x) + _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ +float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x) { + const struct data *d = ptr_barrier (&data); + /* Small cases, infs and nans are supported by our approximation technique, but do not set fenv flags correctly. Only trigger special case if we need fenv. */ - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t sign = ix & ~AbsMask; + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t sign = vandq_u32 (ix, SignMask); #if WANT_SIMD_EXCEPT - v_u32_t ia12 = (ix >> 20) & 0x7ff; - v_u32_t special = v_cond_u32 (ia12 - TinyBound > BigBound - TinyBound); + uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); + uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), + v_u32 (BigBound - TinyBound)); /* If any lane is special, fall back to the scalar routine for all lanes. */ if (unlikely (v_any_u32 (special))) - return specialcase (x, x, v_u32 (-1)); + return special_case (x, x, v_u32 (-1)); #endif /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - v_u32_t red = v_cagt_f32 (x, v_f32 (1.0)); + uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0)); /* Avoid dependency in abs(x) in division (and comparison). */ - v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x); - v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f)); + float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2))); /* Use absolute value only when needed (odd powers of z). */ - v_f32_t az = v_abs_f32 (z); - az = v_sel_f32 (red, -az, az); + float32x4_t az = vbslq_f32 ( + SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z); - /* Calculate the polynomial approximation. */ - v_f32_t y = eval_poly (z, az, shift); + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. 
*/ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + + float32x4_t y = vfmaq_f32 ( + v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); + + /* y = shift + z * P(z^2). */ + y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift); /* y = atan(x) if x>0, -atan(-x) otherwise. */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)); return y; } -VPCS_ALIAS PL_SIG (V, F, 1, atan, -10.0, 10.0) -PL_TEST_ULP (V_NAME (atanf), 2.5) -PL_TEST_EXPECT_FENV (V_NAME (atanf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (atanf), 0, 0x1p-30, 5000) -PL_TEST_INTERVAL (V_NAME (atanf), -0, -0x1p-30, 5000) -PL_TEST_INTERVAL (V_NAME (atanf), 0x1p-30, 1, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), -0x1p-30, -1, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), 1, 0x1p30, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), -1, -0x1p30, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), 0x1p30, inf, 1000) -PL_TEST_INTERVAL (V_NAME (atanf), -0x1p30, -inf, 1000) -#endif +PL_TEST_ULP (V_NAME_F1 (atan), 2.5) +PL_TEST_EXPECT_FENV (V_NAME_F1 (atan), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c index bfaf5c2b917f..f282826a3f32 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c @@ -6,56 +6,61 @@ */ #include "v_math.h" -#include "pairwise_horner.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - #define WANT_V_LOG1P_K0_SHORTCUT 0 #include "v_log1p_inline.h" -#define AbsMask 0x7fffffffffffffff -#define Half 0x3fe0000000000000 -#define One 0x3ff0000000000000 +const static struct data +{ + struct v_log1p_data log1p_consts; + uint64x2_t one, half; +} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE, + .one = V2 (0x3ff0000000000000), + .half = V2 (0x3fe0000000000000) }; -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) { return v_call_f64 (atanh, x, y, special); } /* Approximation for vector double-precision atanh(x) using modified log1p. The greatest observed error is 3.31 ULP: - __v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 - want 0x1.ffd8ff31b501cp-6. */ + _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. */ VPCS_ATTR -v_f64_t V_NAME (atanh) (v_f64_t x) +float64x2_t V_NAME_D1 (atanh) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t sign = ix & ~AbsMask; - v_u64_t ia = ix & AbsMask; - v_u64_t special = v_cond_u64 (ia >= One); - v_f64_t halfsign = v_as_f64_u64 (sign | Half); + const struct data *d = ptr_barrier (&data); - /* Mask special lanes with 0 to prevent spurious underflow. 
*/ - v_f64_t ax = v_sel_f64 (special, v_f64 (0), v_as_f64_u64 (ia)); - v_f64_t y = halfsign * log1p_inline ((2 * ax) / (1 - ax)); + float64x2_t ax = vabsq_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (ax); + uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia); + uint64x2_t special = vcgeq_u64 (ia, d->one); + float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half)); + +#if WANT_SIMD_EXCEPT + ax = v_zerofy_f64 (ax, special); +#endif + + float64x2_t y; + y = vaddq_f64 (ax, ax); + y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax)); + y = log1p_inline (y, &d->log1p_consts); if (unlikely (v_any_u64 (special))) - return specialcase (x, y, special); - return y; + return special_case (x, vmulq_f64 (y, halfsign), special); + return vmulq_f64 (y, halfsign); } -VPCS_ALIAS PL_SIG (V, D, 1, atanh, -1.0, 1.0) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (atanh)) -PL_TEST_ULP (V_NAME (atanh), 3.32) -PL_TEST_INTERVAL_C (V_NAME (atanh), 0, 0x1p-23, 10000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), -0, -0x1p-23, 10000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), 0x1p-23, 1, 90000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), -0x1p-23, -1, 90000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), 1, inf, 100, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), -1, -inf, 100, 0) -#endif +PL_TEST_EXPECT_FENV (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT) +PL_TEST_ULP (V_NAME_D1 (atanh), 3.32) +/* atanh is asymptotic at 1, which is the default control value - have to set + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0) +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0) +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 1, inf, 100, 0) diff --git a/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c index cd3069661142..f6a5f25eca9a 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c +++ b/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c @@ -6,57 +6,72 @@ */ #include "v_math.h" -#include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" - -#if V_SUPPORTED - #include "v_log1pf_inline.h" -#define AbsMask 0x7fffffff -#define Half 0x3f000000 -#define One 0x3f800000 -#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ +const static struct data +{ + struct v_log1pf_data log1pf_consts; + uint32x4_t one; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; +#endif +} data = { + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .one = V4 (0x3f800000), +#if WANT_SIMD_EXCEPT + /* 0x1p-12, below which atanhf(x) rounds to x. */ + .tiny_bound = V4 (0x39800000), +#endif +}; + +#define AbsMask v_u32 (0x7fffffff) +#define Half v_u32 (0x3f000000) + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (atanhf, x, y, special); +} /* Approximation for vector single-precision atanh(x) using modified log1p. The maximum error is 3.08 ULP: __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 want 0x1.ffcb82p-5. 
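The modified-log1p scheme shared by atanh and atanhf is, in scalar terms, one line. A sketch for |x| < 1, with libm log1p standing in for the inlined variant and halfsign carrying the sign exactly as above:

#include <math.h>

static double
atanh_sketch (double x)
{
  double ax = fabs (x);
  double halfsign = copysign (0.5, x);
  /* atanh(x) = 0.5 * log1p(2x / (1 - x)).  */
  return halfsign * log1p (2.0 * ax / (1.0 - ax));
}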
*/ -VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x) +VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_f32_t halfsign - = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix)); - v_u32_t iax = ix & AbsMask; + const struct data *d = ptr_barrier (&data); - v_f32_t ax = v_as_f32_u32 (iax); + float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); #if WANT_SIMD_EXCEPT - v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound)); + uint32x4_t special + = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound)); /* Side-step special cases by setting those lanes to 0, which will trigger no exceptions. These will be fixed up later. */ if (unlikely (v_any_u32 (special))) - ax = v_sel_f32 (special, v_f32 (0), ax); + ax = v_zerofy_f32 (ax, special); #else - v_u32_t special = v_cond_u32 (iax >= One); + uint32x4_t special = vcgeq_u32 (iax, d->one); #endif - v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax)); + float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax)); + y = log1pf_inline (y, d->log1pf_consts); if (unlikely (v_any_u32 (special))) - return v_call_f32 (atanhf, x, y, special); - return y; + return special_case (x, vmulq_f32 (halfsign, y), special); + return vmulq_f32 (halfsign, y); } -VPCS_ALIAS PL_SIG (V, F, 1, atanh, -1.0, 1.0) -PL_TEST_ULP (V_NAME (atanhf), 2.59) -PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL_C (V_NAME (atanhf), 0, 0x1p-12, 500, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), 0x1p-12, 1, 200000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), 1, inf, 1000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), -0, -0x1p-12, 500, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), -0x1p-12, -1, 200000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), -1, -inf, 1000, 0) -#endif +PL_TEST_ULP (V_NAME_F1 (atanh), 2.59) +PL_TEST_EXPECT_FENV (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT) +/* atanh is asymptotic at 1, which is the default control value - have to set + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0, 0x1p-12, 500, 0) +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0x1p-12, 1, 200000, 0) +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 1, inf, 1000, 0) diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c index d5abe41024bc..cc7cff15dc0f 100644 --- a/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c +++ b/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c @@ -6,26 +6,38 @@ */ #include "v_math.h" -#include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED - -#define AbsMask 0x7fffffffffffffff -#define TwoThirds v_f64 (0x1.5555555555555p-1) -#define TinyBound 0x001 /* top12 (smallest_normal). */ -#define BigBound 0x7ff /* top12 (infinity). 
*/ -#define MantissaMask v_u64 (0x000fffffffffffff) -#define HalfExp v_u64 (0x3fe0000000000000) - -#define C(i) v_f64 (__cbrt_data.poly[i]) -#define T(i) v_lookup_f64 (__cbrt_data.table, i) - -static NOINLINE v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +const static struct data { - return v_call_f64 (cbrt, x, y, special); + float64x2_t poly[4], one_third, shift; + int64x2_t exp_bias; + uint64x2_t abs_mask, tiny_bound; + uint32x4_t thresh; + double table[5]; +} data = { + .shift = V2 (0x1.8p52), + .poly = { /* Generated with fpminimax in [0.5, 1]. */ + V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1), + V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) }, + .exp_bias = V2 (1022), + .abs_mask = V2(0x7fffffffffffffff), + .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */ + .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. */ + .one_third = V2(0x1.5555555555555p-2), + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, + 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 } +}; + +#define MantissaMask v_u64 (0x000fffffffffffff) + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint32x2_t special) +{ + return v_call_f64 (cbrt, x, y, vmovl_u32 (special)); } /* Approximation for double-precision vector cbrt(x), using low-order polynomial @@ -35,31 +47,33 @@ specialcase (v_f64_t x, v_f64_t y, v_u64_t special) integer. __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 want 0x1.965fe72821e99p+0. */ -VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x) +VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iax = ix & AbsMask; - v_u64_t ia12 = iax >> 52; + const struct data *d = ptr_barrier (&data); + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); /* Subnormal, +/-0 and special values. */ - v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound)); + uint32x2_t special + = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh)); /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector version of frexp, which gets subnormal values wrong - these have to be special-cased as a result. */ - v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp)); - v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022; + float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5)); + int64x2_t exp_bias = d->exp_bias; + uint64x2_t ia12 = vshrq_n_u64 (iax, 52); + int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias); /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for Newton iterations. */ - v_f64_t p_01 = v_fma_f64 (C (1), m, C (0)); - v_f64_t p_23 = v_fma_f64 (C (3), m, C (2)); - v_f64_t p = v_fma_f64 (m * m, p_23, p_01); - + float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly); + float64x2_t one_third = d->one_third; /* Two iterations of Newton's method for iteratively approximating cbrt. */ - v_f64_t m_by_3 = m / 3; - v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p)); - a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a)); + float64x2_t m_by_3 = vmulq_f64 (m, one_third); + float64x2_t two_thirds = vaddq_f64 (one_third, one_third); + float64x2_t a + = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p); + a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a); /* Assemble the result by the following: @@ -76,23 +90,27 @@ VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x) cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. 
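The assembly step described above, together with the Newton refinement, reads naturally as a scalar routine. A sketch that ignores zero and special inputs (which the real routine special-cases); the chord initial guess is a crude stand-in for the fpminimax polynomial:

#include <math.h>

static double
cbrt_sketch (double x)
{
  static const double t[5] = { /* t[i] = 2^((i - 2) / 3).  */
    0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
    0x1.428a2f98d728bp0,  0x1.965fea53d6e3dp0
  };
  int e;
  double m = frexp (fabs (x), &e);  /* |x| = m * 2^e, m in [0.5, 1).  */
  double a = 0.5874 + 0.4126 * m;   /* chord through cbrt(0.5), cbrt(1). */
  /* Newton steps for a^3 = m: a' = 2a/3 + m/(3a^2).  */
  a = (2.0 / 3.0) * a + m / (3.0 * a * a);
  a = (2.0 / 3.0) * a + m / (3.0 * a * a);
  int ey = (int) round (e / 3.0);   /* rem = e - 3*ey is in [-2, 2].  */
  return copysign (ldexp (a * t[e - 3 * ey + 2], ey), x);
}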
*/ - v_s64_t ey = e / 3; - v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2)); + float64x2_t ef = vcvtq_f64_s64 (e); + float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third)); + int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3))); + int64x2_t ey = vcvtq_s64_f64 (eb3f); + + float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] }; + my = vmulq_f64 (my, a); /* Vector version of ldexp. */ - v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my; + float64x2_t y = vreinterpretq_f64_s64 ( + vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52)); + y = vmulq_f64 (y, my); + + if (unlikely (v_any_u32h (special))) + return special_case (x, vbslq_f64 (d->abs_mask, y, x), special); + /* Copy sign. */ - y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix)); - - if (unlikely (v_any_u64 (special))) - return specialcase (x, y, special); - return y; + return vbslq_f64 (d->abs_mask, y, x); } -VPCS_ALIAS -PL_TEST_ULP (V_NAME (cbrt), 1.30) +PL_TEST_ULP (V_NAME_D1 (cbrt), 1.30) PL_SIG (V, D, 1, cbrt, -10.0, 10.0) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt)) -PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000) -PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000) -#endif +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cbrt)) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c deleted file mode 100644 index 62fa37505834..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Single-precision vector cbrt(x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "mathlib.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -#define AbsMask 0x7fffffff -#define SignMask v_u32 (0x80000000) -#define TwoThirds v_f32 (0x1.555556p-1f) -#define SmallestNormal 0x00800000 -#define MantissaMask 0x007fffff -#define HalfExp 0x3f000000 - -#define C(i) v_f32 (__cbrtf_data.poly[i]) -#define T(i) v_lookup_f32 (__cbrtf_data.table, i) - -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t special) -{ - return v_call_f32 (cbrtf, x, y, special); -} - -/* Approximation for vector single-precision cbrt(x) using Newton iteration with - initial guess obtained by a low-order polynomial. Greatest error is 1.5 ULP. - This is observed for every value where the mantissa is 0x1.81410e and the - exponent is a multiple of 3, for example: - __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 - want 0x1.255d92p+10. */ -VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x) -{ - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & AbsMask; - - /* Subnormal, +/-0 and special values. */ - v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000)); - - /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector - version of frexpf, which gets subnormal values wrong - these have to be - special-cased as a result. */ - v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp); - v_s32_t e = v_as_s32_u32 (iax >> 23) - 126; - - /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, - the less accurate the next stage of the algorithm needs to be. An order-4 - polynomial is enough for one Newton iteration. 
*/ - v_f32_t p_01 = v_fma_f32 (C (1), m, C (0)); - v_f32_t p_23 = v_fma_f32 (C (3), m, C (2)); - v_f32_t p = v_fma_f32 (m * m, p_23, p_01); - - /* One iteration of Newton's method for iteratively approximating cbrt. */ - v_f32_t m_by_3 = m / 3; - v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p)); - - /* Assemble the result by the following: - - cbrt(x) = cbrt(m) * 2 ^ (e / 3). - - We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is - not necessarily a multiple of 3 we lose some information. - - Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. - - Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is - an integer in [-2, 2], and can be looked up in the table T. Hence the - result is assembled as: - - cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ - - v_s32_t ey = e / 3; - v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2)); - - /* Vector version of ldexpf. */ - v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my; - /* Copy sign. */ - y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y))); - - if (unlikely (v_any_u32 (special))) - return specialcase (x, y, special); - return y; -} -VPCS_ALIAS - -PL_SIG (V, F, 1, cbrt, -10.0, 10.0) -PL_TEST_ULP (V_NAME (cbrtf), 1.03) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrtf)) -PL_TEST_INTERVAL (V_NAME (cbrtf), 0, inf, 1000000) -PL_TEST_INTERVAL (V_NAME (cbrtf), -0, -inf, 1000000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u7.c b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u7.c new file mode 100644 index 000000000000..74918765209f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u7.c @@ -0,0 +1,116 @@ +/* + * Single-precision vector cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_advsimd_f32.h" + +const static struct data +{ + float32x4_t poly[4], one_third; + float table[5]; +} data = { + .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with + FPMinimax. */ + V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1), + V4 (0x1.2c74c2p-3) }, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 }, + .one_third = V4 (0x1.555556p-2f), +}; + +#define SignMask v_u32 (0x80000000) +#define SmallestNormal v_u32 (0x00800000) +#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal. */ +#define MantissaMask v_u32 (0x007fffff) +#define HalfExp v_u32 (0x3f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint16x4_t special) +{ + return v_call_f32 (cbrtf, x, y, vmovl_u16 (special)); +} + +static inline float32x4_t +shifted_lookup (const float *table, int32x4_t i) +{ + return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2], + table[i[3] + 2] }; +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration + with initial guess obtained by a low-order polynomial. Greatest error + is 1.64 ULP. This is observed for every value where the mantissa is + 0x1.85a2aa and the exponent is a multiple of 3, for example: + _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1 + want 0x1.267932p+1. */ +VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x)); + + /* Subnormal, +/-0 and special values. 
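The narrowed special-case test that follows (vsubhn_u32 plus a 16-bit compare) has a simple scalar model: after subtracting the smallest normal, the high halfword of the result is >= 0x7f00 exactly when |x| was zero or subnormal (by unsigned wrap-around) or >= infinity. A sketch:

#include <stdint.h>
#include <string.h>

static int
cbrtf_special_sketch (float x)
{
  uint32_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint32_t iax = ix & 0x7fffffff;
  /* High half of (iax - smallest_normal) vs. the 16-bit threshold.  */
  return (uint16_t) ((iax - 0x00800000u) >> 16) >= 0x7f00;
}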
*/ + uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5)); + int32x4_t e + = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126)); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly); + + float32x4_t one_third = d->one_third; + float32x4_t two_thirds = vaddq_f32 (one_third, one_third); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + float32x4_t m_by_3 = vmulq_f32 (m, one_third); + float32x4_t a + = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third); + int32x4_t ey = vcvtq_s32_f32 (ef); + int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3))); + + float32x4_t my = shifted_lookup (d->table, em3); + my = vmulq_f32 (my, a); + + /* Vector version of ldexpf. */ + float32x4_t y + = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23)); + y = vmulq_f32 (y, my); + + if (unlikely (v_any_u16h (special))) + return special_case (x, vbslq_f32 (SignMask, x, y), special); + + /* Copy sign. */ + return vbslq_f32 (SignMask, x, y); +} + +PL_SIG (V, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (V_NAME_F1 (cbrt), 1.15) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (cbrt)) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cexpi_3u5.c b/contrib/arm-optimized-routines/pl/math/v_cexpi_3u5.c new file mode 100644 index 000000000000..5163b15926b8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cexpi_3u5.c @@ -0,0 +1,45 @@ +/* + * Double-precision vector sincos function - return-by-value interface. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincos_common.h" +#include "v_math.h" +#include "pl_test.h" + +static float64x2x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y) +{ + return (float64x2x2_t){ v_call_f64 (sin, x, y.val[0], special), + v_call_f64 (cos, x, y.val[1], special) }; +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. 
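A hypothetical caller of the cexpi interface documented above; note that in the library headers the declaration also carries the vector PCS attribute, so this is illustrative only.

#include <arm_neon.h>

float64x2x2_t _ZGVnN2v_cexpi (float64x2_t);

/* One call yields sin and cos of both lanes, sharing one argument
   reduction between them.  */
static void
cexpi_usage_sketch (const double in[2], double s[2], double c[2])
{
  float64x2x2_t sc = _ZGVnN2v_cexpi (vld1q_f64 (in));
  vst1q_f64 (s, sc.val[0]); /* sin lanes.  */
  vst1q_f64 (c, sc.val[1]); /* cos lanes.  */
}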
*/ +VPCS_ATTR float64x2x2_t +_ZGVnN2v_cexpi (float64x2_t x) +{ + const struct v_sincos_data *d = ptr_barrier (&v_sincos_data); + uint64x2_t special = check_ge_rangeval (x, d); + + float64x2x2_t sc = v_sincos_inline (x, d); + + if (unlikely (v_any_u64 (special))) + return special_case (x, special, sc); + return sc; +} + +PL_TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73) +PL_TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73) +#define V_CEXPI_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n) +V_CEXPI_INTERVAL (0, 0x1p23, 500000) +V_CEXPI_INTERVAL (-0, -0x1p23, 500000) +V_CEXPI_INTERVAL (0x1p23, inf, 10000) +V_CEXPI_INTERVAL (-0x1p23, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cexpif_1u8.c b/contrib/arm-optimized-routines/pl/math/v_cexpif_1u8.c new file mode 100644 index 000000000000..4897018d3090 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cexpif_1u8.c @@ -0,0 +1,47 @@ +/* + * Single-precision vector cexpi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincosf_common.h" +#include "v_math.h" +#include "pl_test.h" + +static float32x4x2_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y) +{ + return (float32x4x2_t){ v_call_f32 (sinf, x, y.val[0], special), + v_call_f32 (cosf, x, y.val[1], special) }; +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +VPCS_ATTR float32x4x2_t +_ZGVnN4v_cexpif (float32x4_t x) +{ + const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data); + uint32x4_t special = check_ge_rangeval (x, d); + + float32x4x2_t sc = v_sincosf_inline (x, d); + + if (unlikely (v_any_u32 (special))) + return special_case (x, special, sc); + return sc; +} + +PL_TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17) +PL_TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31) +#define V_CEXPIF_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n) +V_CEXPIF_INTERVAL (0, 0x1p20, 500000) +V_CEXPIF_INTERVAL (-0, -0x1p20, 500000) +V_CEXPIF_INTERVAL (0x1p20, inf, 10000) +V_CEXPIF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c index 0a9fbf817a10..649c390f4622 100644 --- a/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c +++ b/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c @@ -8,89 +8,97 @@ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" -#include "v_exp_tail.h" -#define C1 v_f64 (C1_scal) -#define C2 v_f64 (C2_scal) -#define C3 v_f64 (C3_scal) -#define InvLn2 v_f64 (InvLn2_scal) -#define Ln2hi v_f64 (Ln2hi_scal) -#define Ln2lo v_f64 (Ln2lo_scal) -#define IndexMask v_u64 (IndexMask_scal) -#define Shift v_f64 (Shift_scal) -#define Thres v_f64 (Thres_scal) - -#define AbsMask 0x7fffffffffffffff -#define Half v_f64 (0.5) -#define SpecialBound \ - 0x4086000000000000 /* 0x1.6p9, above which exp overflows. */ - -#if V_SUPPORTED - -static inline v_f64_t -exp_inline (v_f64_t x) +static const struct data { - /* Helper for approximating exp(x). 
Copied from v_exp_tail, with no - special-case handling or tail. */ + float64x2_t poly[3]; + float64x2_t inv_ln2, ln2, shift, thres; + uint64x2_t index_mask, special_bound; +} data = { + .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), + V2 (0x1.5555576a59599p-5), }, + + .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */ + /* -ln2/N. */ + .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64}, + .shift = V2 (0x1.8p+52), + .thres = V2 (704.0), + + .index_mask = V2 (0xff), + /* 0x1.6p9, above which exp overflows. */ + .special_bound = V2 (0x4086000000000000), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (cosh, x, y, special); +} + +/* Helper for approximating exp(x). Copied from v_exp_tail, with no + special-case handling or tail. */ +static inline float64x2_t +exp_inline (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); /* n = round(x/(ln2/N)). */ - v_f64_t z = v_fma_f64 (x, InvLn2, Shift); - v_u64_t u = v_as_u64_f64 (z); - v_f64_t n = z - Shift; + float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); /* r = x - n*ln2/N. */ - v_f64_t r = x; - r = v_fma_f64 (-Ln2hi, n, r); - r = v_fma_f64 (-Ln2lo, n, r); + float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0); + r = vfmaq_laneq_f64 (r, n, d->ln2, 1); - v_u64_t e = u << (52 - V_EXP_TAIL_TABLE_BITS); - v_u64_t i = u & IndexMask; + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); + uint64x2_t i = vandq_u64 (u, d->index_mask); /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - v_f64_t y = v_fma_f64 (C3, r, C2); - y = v_fma_f64 (y, r, C1); - y = v_fma_f64 (y, r, v_f64 (1)) * r; + float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r); + y = vfmaq_f64 (d->poly[0], y, r); + y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r); /* s = 2^(n/N). */ - u = v_lookup_u64 (Tab, i); - v_f64_t s = v_as_f64_u64 (u + e); + u = v_lookup_u64 (__v_exp_tail_data, i); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); - return v_fma_f64 (y, s, s); + return vfmaq_f64 (s, y, s); } /* Approximation for vector double-precision cosh(x) using exp_inline. cosh(x) = (exp(x) + exp(-x)) / 2. - The greatest observed error is in the scalar fall-back region, so is the same - as the scalar routine, 1.93 ULP: - __v_cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 - want 0x1.fdf28623ef923p+1021. + The greatest observed error is in the scalar fall-back region, so is the + same as the scalar routine, 1.93 ULP: + _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. The greatest observed error in the non-special region is 1.54 ULP: - __v_cosh(0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 - want 0x1.f711dcb0c77b1p+7. */ -VPCS_ATTR v_f64_t V_NAME (cosh) (v_f64_t x) + _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 + want 0x1.f711dcb0c77b1p+7. */ +float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iax = ix & AbsMask; - v_u64_t special = v_cond_u64 (iax > SpecialBound); + const struct data *d = ptr_barrier (&data); - /* If any inputs are special, fall back to scalar for all lanes. 
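The exp_inline helper above follows the standard table-driven design. A scalar sketch, assuming tab[] is the library's __v_exp_tail_data (256 entries for 2^(i/256), stored pre-adjusted so that adding the shifted round-bits of u reconstructs the scale); the polynomial is truncated to its first terms:

#include <stdint.h>
#include <string.h>

static double
exp_sketch (double x, const uint64_t *tab)
{
  /* 0x1.8p52 forces round-to-nearest and leaves n = round(x*256/ln2)
     in the low mantissa bits of z.  */
  double shift = 0x1.8p52;
  double z = x * 0x1.71547652b82fep8 + shift;  /* x * (256/ln2).  */
  uint64_t u;
  memcpy (&u, &z, sizeof u);
  double n = z - shift;
  double r = x - n * 0x1.62e42fefa39efp-9;     /* hi part of ln2/256.  */
  r -= n * 0x1.abc9e3b39803fp-64;              /* lo part.  */
  uint64_t sbits = tab[u & 0xff] + (u << 44);  /* s = 2^(n/256).  */
  double s;
  memcpy (&s, &sbits, sizeof s);
  double p = r + 0.5 * r * r;                  /* exp(r) - 1, truncated. */
  return s + s * p;
}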
*/ - if (unlikely (v_any_u64 (special))) - return v_call_f64 (cosh, x, x, v_u64 (-1)); + float64x2_t ax = vabsq_f64 (x); + uint64x2_t special + = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound); - v_f64_t ax = v_as_f64_u64 (iax); /* Up to the point that exp overflows, we can use it to calculate cosh by exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ - v_f64_t t = exp_inline (ax); - return t * Half + Half / t; + float64x2_t t = exp_inline (ax); + float64x2_t half_t = vmulq_n_f64 (t, 0.5); + float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t); + + /* Fall back to scalar for any special cases. */ + if (unlikely (v_any_u64 (special))) + return special_case (x, vaddq_f64 (half_t, half_over_t), special); + + return vaddq_f64 (half_t, half_over_t); } -VPCS_ALIAS PL_SIG (V, D, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (cosh), 1.43) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cosh)) -PL_TEST_INTERVAL (V_NAME (cosh), 0, 0x1.6p9, 100000) -PL_TEST_INTERVAL (V_NAME (cosh), -0, -0x1.6p9, 100000) -PL_TEST_INTERVAL (V_NAME (cosh), 0x1.6p9, inf, 1000) -PL_TEST_INTERVAL (V_NAME (cosh), -0x1.6p9, -inf, 1000) -#endif +PL_TEST_ULP (V_NAME_D1 (cosh), 1.43) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cosh)) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c index 1422d4d12b31..c622b0b183f1 100644 --- a/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c +++ b/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c @@ -5,70 +5,76 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "v_expf_inline.h" #include "v_math.h" #include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" -#define AbsMask 0x7fffffff -#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ -#define SpecialBound \ - 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ - special case. */ -#define Half v_f32 (0.5) +static const struct data +{ + struct v_expf_data expf_consts; + uint32x4_t tiny_bound, special_bound; +} data = { + .expf_consts = V_EXPF_DATA, + .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .special_bound = V4 (0x42ad496c), +}; -#if V_SUPPORTED - -v_f32_t V_NAME (expf) (v_f32_t); +#if !WANT_SIMD_EXCEPT +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (coshf, x, y, special); +} +#endif /* Single-precision vector cosh, using vector expf. Maximum error is 2.38 ULP: - __v_coshf(0x1.e8001ep+1) got 0x1.6a491ep+4 want 0x1.6a4922p+4. */ -VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x) + _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4 + want 0x1.6a4922p+4. */ +float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & AbsMask; - v_f32_t ax = v_as_f32_u32 (iax); - v_u32_t special = v_cond_u32 (iax >= SpecialBound); + const struct data *d = ptr_barrier (&data); + + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, d->special_bound); #if WANT_SIMD_EXCEPT /* If fp exceptions are to be triggered correctly, fall back to the scalar variant for all inputs if any input is a special value or above the bound - at which expf overflows. */ + at which expf overflows. 
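Both cosh variants in this patch compute the same identity once exp is available. A scalar sketch, valid while exp(|x|) does not overflow (the bound checked against special_bound above):

#include <math.h>

static double
cosh_sketch (double x)
{
  double t = exp (fabs (x));
  /* cosh(x) = exp(|x|)/2 + 1/(2*exp(|x|)).  */
  return 0.5 * t + 0.5 / t;
}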
*/ if (unlikely (v_any_u32 (special))) return v_call_f32 (coshf, x, x, v_u32 (-1)); - v_u32_t tiny = v_cond_u32 (iax <= TinyBound); + uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound); /* If any input is tiny, avoid underflow exception by fixing tiny lanes of - input to 1, which will generate no exceptions, and then also fixing tiny - lanes of output to 1 just before return. */ + input to 0, which will generate no exceptions. */ if (unlikely (v_any_u32 (tiny))) - ax = v_sel_f32 (tiny, v_f32 (1), ax); + ax = v_zerofy_f32 (ax, tiny); #endif /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ - v_f32_t t = V_NAME (expf) (ax); - v_f32_t y = t * Half + Half / t; + float32x4_t t = v_expf_inline (ax, &d->expf_consts); + float32x4_t half_t = vmulq_n_f32 (t, 0.5); + float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); #if WANT_SIMD_EXCEPT if (unlikely (v_any_u32 (tiny))) - return v_sel_f32 (tiny, v_f32 (1), y); + return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); #else if (unlikely (v_any_u32 (special))) - return v_call_f32 (coshf, x, y, special); + return special_case (x, vaddq_f32 (half_t, half_over_t), special); #endif - return y; + return vaddq_f32 (half_t, half_over_t); } -VPCS_ALIAS PL_SIG (V, F, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (coshf), 1.89) -PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1p-63, 100) -PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1.5a92d8p+6, 80000) -PL_TEST_INTERVAL (V_NAME (coshf), 0x1.5a92d8p+6, inf, 2000) -PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1p-63, 100) -PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1.5a92d8p+6, 80000) -PL_TEST_INTERVAL (V_NAME (coshf), -0x1.5a92d8p+6, -inf, 2000) -#endif +PL_TEST_ULP (V_NAME_F1 (cosh), 1.89) +PL_TEST_EXPECT_FENV (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cospi_3u1.c b/contrib/arm-optimized-routines/pl/math/v_cospi_3u1.c new file mode 100644 index 000000000000..3c2ee0b74c8e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cospi_3u1.c @@ -0,0 +1,86 @@ +/* + * Double-precision vector cospi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "poly_advsimd_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64x2_t poly[10]; + float64x2_t range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, + .range_val = V2 (0x1p63), +}; + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cospi, x, y, cmp); +} + +/* Approximation for vector double-precision cospi(x). + Maximum Error 3.06 ULP: + _ZGVnN2v_cospi(0x1.7dd4c0b03cc66p-5) got 0x1.fa854babfb6bep-1 + want 0x1.fa854babfb6c1p-1. 
*/
+float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  float64x2_t r = vabsq_f64 (x);
+  uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x);
+
+  /* When WANT_SIMD_EXCEPT = 1, special lanes should be zeroed
+     to avoid them overflowing and throwing exceptions.  */
+  r = v_zerofy_f64 (r, cmp);
+  uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63);
+
+#else
+  float64x2_t r = x;
+  uint64x2_t cmp = vcageq_f64 (r, d->range_val);
+  uint64x2_t odd
+      = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+#endif
+
+  r = vsubq_f64 (r, vrndaq_f64 (r));
+
+  /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2.  */
+  r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r));
+
+  /* y = sin(r).  */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t r4 = vmulq_f64 (r2, r2);
+  float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+  /* Fall back to scalar.  */
+  if (unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, odd, cmp);
+
+  /* Reintroduce the sign bit for inputs which round to odd.  */
+  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+PL_SIG (V, D, 1, cospi, -0.9, 0.9)
+PL_TEST_ULP (V_NAME_D1 (cospi), 2.56)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/v_cospif_3u2.c b/contrib/arm-optimized-routines/pl/math/v_cospif_3u2.c
new file mode 100644
index 000000000000..d88aa828439d
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_cospif_3u2.c
@@ -0,0 +1,83 @@
+/*
+ * Single-precision vector cospi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  float32x4_t poly[6];
+  float32x4_t range_val;
+} data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+            V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+  .range_val = V4 (0x1p31f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+  return v_call_f32 (cospif, x, y, cmp);
+}
+
+/* Approximation for vector single-precision cospi(x).
+   Maximum Error: 3.17 ULP:
+   _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
+                                  want 0x1.f7cd5p-1.  */
+float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  float32x4_t r = vabsq_f32 (x);
+  uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x);
+
+  /* When WANT_SIMD_EXCEPT = 1, special lanes should be zeroed
+     to avoid them overflowing and throwing exceptions.  */
+  r = v_zerofy_f32 (r, cmp);
+  uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31);
+
+#else
+  float32x4_t r = x;
+  uint32x4_t cmp = vcageq_f32 (r, d->range_val);
+
+  uint32x4_t odd
+      = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+#endif
+
+  /* r = x - rint(x).  */
+  r = vsubq_f32 (r, vrndaq_f32 (r));
+
+  /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. 
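
A scalar sketch of the reduction used by both cospi variants (sinpi_poly is a hypothetical stand-in for the odd polynomial above; the clamping of huge |x| shown earlier is omitted):

    #include <math.h>

    /* With n = nearest integer to x (ties away, as vcvtaq/vrndaq) and
       r = x - n in [-1/2, 1/2]:
       cos(pi*x) = (-1)^n cos(pi*r) = (-1)^n sin(pi*(1/2 - |r|)).  */
    static double
    cospi_sketch (double x)
    {
      double n = round (x);
      double r = 0.5 - fabs (x - n);
      double y = sinpi_poly (r); /* hypothetical: evaluates sin(pi*r) */
      return ((long long) n & 1) ? -y : y;
    }
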
*/ + r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + + /* Fallback to scalar. */ + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + + /* Reintroduce the sign bit for inputs which round to odd. */ + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +PL_SIG (V, F, 1, cospi, -0.9, 0.9) +PL_TEST_ULP (V_NAME_F1 (cospi), 2.67) +PL_TEST_EXPECT_FENV (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_2u.c b/contrib/arm-optimized-routines/pl/math/v_erf_2u.c deleted file mode 100644 index 1d7ddbb1ee3e..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erf_2u.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Double-precision vector erf(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "include/mathlib.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -#define AbsMask v_u64 (0x7fffffffffffffff) -#define AbsXMax v_f64 (0x1.8p+2) -#define Scale v_f64 (0x1p+3) - -/* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (erf, x, y, cmp); -} - -/* A structure to perform look-up in coeffs and other parameter tables. */ -struct entry -{ - v_f64_t P[V_ERF_NCOEFFS]; - v_f64_t shift; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - for (int j = 0; j < V_ERF_NCOEFFS; ++j) - e.P[j] = __v_erf_data.coeffs[j][i]; - e.shift = __v_erf_data.shifts[i]; -#else - for (int j = 0; j < V_ERF_NCOEFFS; ++j) - { - e.P[j][0] = __v_erf_data.coeffs[j][i[0]]; - e.P[j][1] = __v_erf_data.coeffs[j][i[1]]; - } - e.shift[0] = __v_erf_data.shifts[i[0]]; - e.shift[1] = __v_erf_data.shifts[i[1]]; -#endif - return e; -} - -/* Optimized double precision vector error function erf. Maximum - observed error is 1.75 ULP, in [0.110, 0.111]: - verf(0x1.c5e0c2d5d0543p-4) got 0x1.fe0ed62a54987p-4 - want 0x1.fe0ed62a54985p-4. */ -VPCS_ATTR -v_f64_t V_NAME (erf) (v_f64_t x) -{ - /* Handle both inf/nan as well as small values (|x|<2^-28) - If any condition in the lane is true then a loop over - scalar calls will be performed. */ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t atop = (ix >> 48) & v_u64 (0x7fff); - v_u64_t special_case - = v_cond_u64 (atop - v_u64 (0x3e30) >= v_u64 (0x7ff0 - 0x3e30)); - - /* Get sign and absolute value. */ - v_u64_t sign = v_as_u64_f64 (x) & ~AbsMask; - v_f64_t a = v_min_f64 (v_abs_f64 (x), AbsXMax); - - /* Compute index by truncating 8 * a with a=|x| saturated to 6.0. */ - -#ifdef SCALAR - v_u64_t i = v_trunc_u64 (a * Scale); -#else - v_u64_t i = vcvtq_n_u64_f64 (a, 3); -#endif - /* Get polynomial coefficients and shift parameter using lookup. */ - struct entry dat = lookup (i); - - /* Evaluate polynomial on transformed argument. 
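
For reference, the interval selection of the implementation removed here reduces to a single truncation (illustrative scalar form):

    #include <math.h>
    #include <stdint.h>

    /* One polynomial per 1/8-wide interval: i = trunc(8*|x|) with |x|
       saturated to AbsXMax = 6.0, so i lies in 0..48, where 48 is the
       dummy interval on which erf has already rounded to 1.  */
    static uint64_t
    erf_interval_sketch (double x)
    {
      double a = fmin (fabs (x), 0x1.8p+2);
      return (uint64_t) (a * 0x1p+3);
    }
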
*/
-  v_f64_t z = v_fma_f64 (a, Scale, dat.shift);
-
-  v_f64_t r1 = v_fma_f64 (z, dat.P[1], dat.P[0]);
-  v_f64_t r2 = v_fma_f64 (z, dat.P[3], dat.P[2]);
-  v_f64_t r3 = v_fma_f64 (z, dat.P[5], dat.P[4]);
-  v_f64_t r4 = v_fma_f64 (z, dat.P[7], dat.P[6]);
-  v_f64_t r5 = v_fma_f64 (z, dat.P[9], dat.P[8]);
-
-  v_f64_t z2 = z * z;
-  v_f64_t y = v_fma_f64 (z2, r5, r4);
-  y = v_fma_f64 (z2, y, r3);
-  y = v_fma_f64 (z2, y, r2);
-  y = v_fma_f64 (z2, y, r1);
-
-  /* y=erf(x) if x>0, -erf(-x) otherwise.  */
-  y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign);
-
-  if (unlikely (v_any_u64 (special_case)))
-    return specialcase (x, y, special_case);
-  return y;
-}
-VPCS_ALIAS
-
-PL_SIG (V, D, 1, erf, -6.0, 6.0)
-PL_TEST_ULP (V_NAME (erf), 1.26)
-PL_TEST_INTERVAL (V_NAME (erf), 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (V_NAME (erf), 0x1p-127, 0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erf), -0x1p-127, -0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erf), 0x1p-26, 0x1p3, 40000)
-PL_TEST_INTERVAL (V_NAME (erf), -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (V_NAME (erf), 0, inf, 40000)
-#endif
diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_2u5.c b/contrib/arm-optimized-routines/pl/math/v_erf_2u5.c
new file mode 100644
index 000000000000..e581ec5bb8a7
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_erf_2u5.c
@@ -0,0 +1,158 @@
+/*
+ * Double-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  float64x2_t third;
+  float64x2_t tenth, two_over_five, two_over_fifteen;
+  float64x2_t two_over_nine, two_over_fortyfive;
+  float64x2_t max, shift;
+#if WANT_SIMD_EXCEPT
+  float64x2_t tiny_bound, huge_bound, scale_minus_one;
+#endif
+} data = {
+  .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
+  .two_over_fifteen = V2 (0x1.1111111111111p-3),
+  .tenth = V2 (-0x1.999999999999ap-4),
+  .two_over_five = V2 (-0x1.999999999999ap-2),
+  .two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
+  .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
+  .max = V2 (5.9921875), /* 6 - 1/128. */
+  .shift = V2 (0x1p45),
+#if WANT_SIMD_EXCEPT
+  .huge_bound = V2 (0x1p205),
+  .tiny_bound = V2 (0x1p-226),
+  .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */
+#endif
+};
+
+#define AbsMask 0x7fffffffffffffff
+
+struct entry
+{
+  float64x2_t erf;
+  float64x2_t scale;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  struct entry e;
+  float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
+              e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
+  e.erf = vuzp1q_f64 (e1, e2);
+  e.scale = vuzp2q_f64 (e1, e2);
+  return e;
+}
+
+/* Double-precision implementation of vector erf(x).
+   Approximation based on series expansion near x rounded to
+   nearest multiple of 1/128.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+   erf(x) ~ erf(r) + scale * d * [
+       + 1
+       - r d
+       + 1/3 (2 r^2 - 1) d^2
+       - 1/6 (r (2 r^2 - 3)) d^3
+       + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+       - 1/90 (4 r^4 - 20 r^2 + 15) d^5
+     ]
+
+   Maximum measured error: 2.29 ULP
+   V_NAME_D1 (erf)(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
+                                          want -0x1.20dd59132ebafp-8.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
+{
+  const struct data *dat = ptr_barrier (&data);
+
+  float64x2_t a = vabsq_f64 (x);
+  /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
+     to return expected results. 
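
A scalar sketch of the expansion quoted above, truncated at d^3 for brevity (erf_r and scale are the two values fetched from the table):

    /* erf(x) ~ erf(r)
                + scale * d * (1 - r*d + (2r^2 - 1)/3 * d^2
                               - r*(2r^2 - 3)/6 * d^3 + ...).  */
    static double
    erf_series_sketch (double erf_r, double scale, double r, double d)
    {
      double r2 = r * r;
      double p1 = -r;
      double p2 = (2.0 * r2 - 1.0) / 3.0;
      double p3 = -r * (2.0 * r2 - 3.0) / 6.0;
      double poly = 1.0 + d * (p1 + d * (p2 + d * p3));
      return erf_r + scale * d * poly;
    }
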
*/ + uint64x2_t a_le_max = vcleq_f64 (a, dat->max); + uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max); + +#if WANT_SIMD_EXCEPT + /* |x| huge or tiny. */ + uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound); + uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound); + uint64x2_t cmp = vorrq_u64 (cmp1, cmp2); + /* If any lanes are special, mask them with 1 for small x or 8 for large + values and retain a copy of a to allow special case handler to fix special + lanes later. This is only necessary if fenv exceptions are to be triggered + correctly. */ + if (unlikely (v_any_u64 (cmp))) + { + a = vbslq_f64 (cmp1, v_f64 (8.0), a); + a = vbslq_f64 (cmp2, v_f64 (1.0), a); + } +#endif + + /* Set r to multiple of 1/128 nearest to |x|. */ + float64x2_t shift = dat->shift; + float64x2_t z = vaddq_f64 (a, shift); + + /* Lookup erf(r) and scale(r) in table, without shortcut for small values, + but with saturated indices for large values and NaNs in order to avoid + segfault. */ + uint64x2_t i + = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift)); + i = vbslq_u64 (a_le_max, i, v_u64 (768)); + struct entry e = lookup (i); + + float64x2_t r = vsubq_f64 (z, shift); + + /* erf(x) ~ erf(r) + scale * d * poly (r, d). */ + float64x2_t d = vsubq_f64 (a, r); + float64x2_t d2 = vmulq_f64 (d, d); + float64x2_t r2 = vmulq_f64 (r, r); + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ + float64x2_t p1 = r; + float64x2_t p2 + = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third)); + float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen); + p4 = vfmsq_f64 (dat->tenth, r2, p4); + float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5)); + + float64x2_t p34 = vfmaq_f64 (p3, d, p4); + float64x2_t p12 = vfmaq_f64 (p1, d, p2); + float64x2_t y = vfmaq_f64 (p34, d2, p5); + y = vfmaq_f64 (p12, d2, y); + + y = vfmaq_f64 (e.erf, e.scale, vfmsq_f64 (d, d2, y)); + + /* Solves the |x| = inf and NaN cases. */ + y = vbslq_f64 (a_gt_max, v_f64 (1.0), y); + + /* Copy sign. */ + y = vbslq_f64 (v_u64 (AbsMask), y, x); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp2))) + { + /* Neutralise huge values of x before fixing small values. */ + x = vbslq_f64 (cmp1, v_f64 (1.0), x); + /* Fix tiny values that trigger spurious underflow. */ + return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y); + } +#endif + return y; +} + +PL_SIG (V, D, 1, erf, -6.0, 6.0) +PL_TEST_ULP (V_NAME_D1 (erf), 1.79) +PL_TEST_EXPECT_FENV (V_NAME_D1 (erf), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_data.c b/contrib/arm-optimized-routines/pl/math/v_erf_data.c deleted file mode 100644 index 7bbb281ad912..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erf_data.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Polynomial coefficients and shifts for double-precision erf(x) vector - * function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* 48 intervals of the form [x_i, x_{i+1}] with x_i = i / 8 for - i=1,...,47 (x_0 = 2^-1022). 
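
The shift trick used above can be spelled out in scalar form (asu64 is a hypothetical bit-cast helper):

    #include <stdint.h>
    #include <string.h>

    static uint64_t
    asu64 (double x)
    {
      uint64_t u;
      memcpy (&u, &x, sizeof u);
      return u;
    }

    /* Adding 0x1p45 to |x| < 8 leaves 52 - 45 = 7 fractional bits, so
       the low mantissa bits of the sum hold round(128 * |x|);
       subtracting the shift again recovers r, the nearest multiple
       of 1/128.  */
    static double
    nearest_128th_sketch (double a, uint64_t *i)
    {
      double z = a + 0x1p45;
      *i = asu64 (z) - asu64 (0x1p45); /* raw table index, 0..767 */
      return z - 0x1p45;
    }
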
There is an extra dummy interval for - [6, +inf] with all coeffs = 0 except for P_0 = 1.0, as erf(x) == 1 - above 6. - - Coefficients for each interval generated using fpminimax algorithm. See - v_erf.sollya for details. Note the array is transposed, so for a set of - coefficients C generated on interval i, C[j] is at coeffs[j][i]. */ - -const struct v_erf_data __v_erf_data - = {.shifts - = {-0x1p-1019, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, - -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, - -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, - -39, -40, -41, -42, -43, -44, -45, -46, -47, 0}, - .coeffs = { - // clang-format off - -{0x1.20dd750429b6dp-1022, 0x1.1f5e1a35c3b8ap-3, 0x1.1af54e232d609p-2, 0x1.9dd0d2b721f39p-2, 0x1.0a7ef5c18edd2p-1, 0x1.3f196dcd0f135p-1, - 0x1.6c1c9759d0e5fp-1, 0x1.91724951b8fc6p-1, 0x1.af767a741088bp-1, 0x1.c6dad2829ec62p-1, 0x1.d8865d98abe00p-1, 0x1.e5768c3b4a3fcp-1, - 0x1.eea5557137ae0p-1, 0x1.f4f693b67bd77p-1, 0x1.f92d077f8d56dp-1, 0x1.fbe61eef4cf6ap-1, 0x1.fd9ae142795e3p-1, 0x1.fea4218d6594ap-1, - 0x1.ff404760319b4p-1, 0x1.ff9960f3eb327p-1, 0x1.ffcaa8f4c9beap-1, 0x1.ffe514bbdc197p-1, 0x1.fff2cfb0453d9p-1, 0x1.fff9ba420e834p-1, - 0x1.fffd1ac4135f9p-1, 0x1.fffeb3ebb267bp-1, 0x1.ffff6f9f67e55p-1, 0x1.ffffc316d9ed0p-1, 0x1.ffffe710d565ep-1, 0x1.fffff618c3da6p-1, - 0x1.fffffc2f171e3p-1, 0x1.fffffe92ced93p-1, 0x1.ffffff7b91176p-1, 0x1.ffffffd169d0cp-1, 0x1.fffffff01a8b6p-1, 0x1.fffffffabd229p-1, - 0x1.fffffffe4fa30p-1, 0x1.ffffffff79626p-1, 0x1.ffffffffd759dp-1, 0x1.fffffffff4188p-1, 0x1.fffffffffc9e8p-1, 0x1.ffffffffff11ap-1, - 0x1.ffffffffffc05p-1, 0x1.ffffffffffef8p-1, 0x1.fffffffffffbep-1, 0x1.ffffffffffff0p-1, 0x1.ffffffffffffcp-1, 0x1.fffffffffffffp-1, 1.0}, - -{0x1.20dd750429b6dp-3, 0x1.1c62fa1e86989p-3, 0x1.0f5d1602f7dfbp-3, 0x1.f5f0cdaf152b2p-4, 0x1.c1efca49a5051p-4, 0x1.86e9694134b22p-4, - 0x1.492e42d78d39cp-4, 0x1.0cab61f084b1bp-4, 0x1.a911f096fbb79p-5, 0x1.45e99bcbb78d4p-5, 0x1.e4652fadcbaa3p-6, 0x1.5ce595c455bccp-6, - 0x1.e723726b81ff1p-7, 0x1.499d478bca4acp-7, 0x1.b055303221566p-8, 0x1.12ceb37ffa389p-8, 0x1.529b9e8cfa59fp-9, 0x1.94624e78e084fp-10, - 0x1.d4143a9e023f5p-11, 0x1.06918b63537c2p-11, 0x1.1d83170fcc34bp-12, 0x1.2ce898808f08ep-13, 0x1.3360ccd26e06ap-14, 0x1.30538fbb986fbp-15, - 0x1.2408e9bb1b657p-16, 0x1.0f9e1b4e4baaep-17, 0x1.e9b5e8d71b5e3p-19, 0x1.abe09e85af38ap-20, 0x1.6a5972347c568p-21, 0x1.296a70eff1bd9p-22, - 0x1.d9371ee6bfc07p-24, 0x1.6ce1a88a01b3ap-25, 0x1.10b14985663f9p-26, 0x1.8b0d07ade43d8p-28, 0x1.155a098eceb0fp-29, 0x1.7974d3b397e7cp-31, - 0x1.f1e3bf5a6493ap-33, 0x1.3e47781d91b97p-34, 0x1.8a7038368986cp-36, 0x1.d9d4d7be5992cp-38, 0x1.137dabebc1319p-39, 0x1.367541123e46cp-41, - 0x1.58007ab162c1dp-43, 0x1.709f0d280b3f5p-45, 0x1.30a3dcf531ebfp-47, 0x1.d2707c055dedcp-50, 0x1.0d97f61945387p-49, 0x1.1dbc3ab728933p-50, 0}, - -{0x1.2411381609db0p-51, -0x1.1c62fa1e75c0ap-9, -0x1.0f5d1602eb436p-8, -0x1.78749a4346714p-8, -0x1.c1efca49a7b15p-8, -0x1.e8a3c39178d95p-8, - -0x1.edc5644363883p-8, -0x1.d62beb64e19eep-8, -0x1.a911f096f7a87p-8, -0x1.6ea6cf452dca3p-8, -0x1.2ebf3dccb166cp-8, -0x1.dfbbadedfcde6p-9, - -0x1.6d5a95d08c346p-9, -0x1.0bcfca21880c9p-9, -0x1.7a4a8a2bf1a0bp-10, -0x1.01a1c8481a466p-10, -0x1.529b9e8d29ddap-11, -0x1.ada873604cf20p-12, - -0x1.074b60f960c25p-12, -0x1.37ccd585732c6p-13, -0x1.64e3dcd73a1d3p-14, -0x1.8af14827e93bap-15, -0x1.a6a519ae712fbp-16, -0x1.b5781ea681265p-17, - -0x1.b60d5ed744563p-18, -0x1.a8670acc75c29p-19, -0x1.8de3ce2154088p-20, -0x1.690584329096ap-21, 
-0x1.3d0e478659a54p-22, -0x1.0d8875cb088d0p-23, - -0x1.bba3c56e56d69p-25, -0x1.617a60b4bcd87p-26, -0x1.10b16afb9ce08p-27, -0x1.9766e11f62828p-29, -0x1.26afbc55ef33cp-30, -0x1.9cd52c0e709a9p-32, - -0x1.18175f6758766p-33, -0x1.705a68dde7f3ap-35, -0x1.d65ba6d52556dp-37, -0x1.23af5c3865987p-38, -0x1.51c72cd64a6bcp-40, -0x1.79f63bbc02f5ap-42, - -0x1.2346f2840d7bfp-43, -0x1.8110f614395a8p-45, 0x1.c3309f1fe85a4p-46, 0x1.09e6fb6ee0b85p-46, -0x1.959834938224fp-46, -0x1.0e9a684ecee47p-46, 0}, - -{-0x1.812746b057b58p-11, -0x1.6f552dbf96b31p-11, -0x1.3c97445cee1b0p-11, -0x1.e106c523a966dp-12, -0x1.2bf5318638e21p-12, -0x1.c8105034ea92fp-14, - 0x1.b6e85963275c5p-15, 0x1.7c9d756585d29p-13, 0x1.1b614b0e78122p-12, 0x1.4cb3cf0b42031p-12, 0x1.571d01cf7eeb3p-12, 0x1.4374d82fe7f2ep-12, - 0x1.1c2a02b9199a0p-12, 0x1.d6631e131dabap-13, 0x1.7148c3d9d22bap-13, 0x1.143d1c76ae7c6p-13, 0x1.8b0ae3afc07e6p-14, 0x1.0ea475d5b3822p-14, - 0x1.63ef6208bd4adp-15, 0x1.c1ec100ec3e71p-16, 0x1.119da13709716p-16, 0x1.407fbd00318a5p-17, 0x1.69cf481b4666cp-18, 0x1.89e17d2b19c42p-19, - 0x1.9db7531fa76f6p-20, 0x1.a37382bd61dc8p-21, 0x1.9aa4a8e8fe8dfp-22, 0x1.8451fcde36f23p-23, 0x1.62cd605193fe9p-24, 0x1.394b0d46af85cp-25, - 0x1.0b6c0d1191ec9p-26, 0x1.b9581bcc8f4ebp-28, 0x1.603ea0f602119p-29, 0x1.0ff28bc88022cp-30, 0x1.95ecc71a0b4bep-32, 0x1.24ffe516534d4p-33, - 0x1.9aa89abeffd90p-35, 0x1.1ab57210158fap-36, 0x1.8b0c503eafbcbp-38, 0x1.166413b8ba611p-39, 0x1.5848fad1e38e9p-42, 0x1.3573cc6d6d4e6p-49, - 0x1.404c0dc8b5ffcp-42, 0x1.38779160f5f11p-43, -0x1.1dc84293acf27p-42, -0x1.2892755467252p-43, 0x1.8e40aed4a9e02p-43, 0x1.0cef3bce98bedp-43, 0}, - -{0x1.4ade8e6d47ef0p-43, 0x1.196c9ee6491cfp-16, 0x1.040e8be6a9625p-15, 0x1.5529ad049b967p-15, 0x1.76f27e1744b44p-15, 0x1.6963c95cd8395p-15, - 0x1.349b5d6ae76a6p-15, 0x1.cc6056b95eed3p-16, 0x1.1b614adacb10dp-16, 0x1.ca5080f4ec9b9p-18, -0x1.93a9d54fb750bp-20, -0x1.f3b8d7695d38cp-18, - -0x1.6d5a929bfde5fp-17, -0x1.974c013452be9p-17, -0x1.8a0da620ab60fp-17, -0x1.5a3166e1f5682p-17, -0x1.1a2c5ad80a584p-17, -0x1.afe552a6507eep-18, - -0x1.38a9879a760b8p-18, -0x1.ae595d5041755p-19, -0x1.1a89c93c4b9c8p-19, -0x1.62d4c3dc10fdbp-20, -0x1.ab0c620cf63d1p-21, -0x1.ed4aeff35fd90p-22, - -0x1.11c8e63fae76dp-22, -0x1.2454a1fb4749ap-23, -0x1.2c7f7846b0e7bp-24, -0x1.298c17acfd63ap-25, -0x1.1c0f6cc5baa18p-26, -0x1.0574c9f0e63fap-27, - -0x1.d0a5c4232f4cep-29, -0x1.8d9d301253af8p-30, -0x1.49cb78be34c81p-31, -0x1.08fc30eb50526p-32, -0x1.96e2f50cad458p-34, -0x1.2c888ddad994bp-35, - -0x1.c5dd3068e7fcap-37, -0x1.935b876ed56ffp-38, -0x1.e74a7c256ba0dp-39, -0x1.1681c73733b50p-39, 0x1.855ab0b8664dep-41, 0x1.4aebdf7fb67e5p-41, - -0x1.2aef07c393759p-40, -0x1.37e52b17505e6p-41, 0x1.394b997da7ed5p-40, 0x1.4345440ea9876p-41, -0x1.af227669dca68p-41, -0x1.23589e4f3cc49p-41, 0}, - -{0x1.ce2f1b1646d4bp-19, 0x1.aaba29a029bd5p-19, 0x1.47e57fbf662a0p-19, 0x1.74882f55f1bd4p-20, 0x1.dfed759bd9091p-23, -0x1.c124b2acb3ee8p-21, - -0x1.b429a82901889p-20, -0x1.1350ee93fbfb3p-19, -0x1.1b613a5e1e196p-19, -0x1.f65ceb61aa63ap-20, -0x1.82814da1daaa1p-20, -0x1.f5729185c040ep-21, - -0x1.e72489bfea503p-22, -0x1.17d784c065f21p-24, 0x1.b2229e5122850p-23, 0x1.779b916c44358p-22, 0x1.ace7a08f66cb0p-22, 0x1.9973788b8f181p-22, - 0x1.5d3bceb9c39d5p-22, 0x1.11da976499339p-22, 0x1.90eaa0d25df91p-23, 0x1.146c19a9f0ae8p-23, 0x1.693a52f5ccd0bp-24, 0x1.c122683fc1404p-25, - 0x1.0a866e311e50ap-25, 0x1.2e85588e08741p-26, 0x1.493501a3ee15cp-27, 0x1.572eec204dc18p-28, 0x1.590e0157d4dabp-29, 0x1.4c0619d7359e8p-30, - 0x1.36608b7b22d22p-31, 0x1.0e3f514a0d7fep-32, 
0x1.e04d29135056ep-34, 0x1.aa936eb977e33p-35, 0x1.3ce1ec4a299b6p-36, 0x1.aba42bc751130p-38, - 0x1.0861b5dc819e3p-38, 0x1.3bc7b1f0f8afbp-38, 0x1.7d6c896bf3579p-38, 0x1.14f24be91338cp-38, -0x1.2896024cf2ca9p-39, -0x1.c2e8399d1e8e7p-40, - 0x1.7836a61cc0f4bp-39, 0x1.8a98e07f8cdfcp-40, -0x1.8f332379c6ce4p-39, -0x1.9bbec3ab83755p-40, 0x1.126c9c6d24bd6p-39, 0x1.72eaeac065cc2p-40, 0}, - -{0x1.240b25b9a9823p-39, -0x1.733f879c52150p-24, -0x1.4c00873f3742fp-23, -0x1.9a6fe48163775p-23, -0x1.99ed7481d2399p-23, -0x1.52aea61425cf7p-23, - -0x1.b853c3ad1c781p-24, -0x1.53c3e486c1845p-25, 0x1.2e2a4e7a0286dp-26, 0x1.fd0e266132929p-25, 0x1.5cf1d8fe5611fp-24, 0x1.6b140ba72ac56p-24, - 0x1.3cab2fa73a9c4p-24, 0x1.d864967df5009p-25, 0x1.25b4551256078p-25, 0x1.0d029bc50b0cdp-26, 0x1.e126485c5dceep-30, -0x1.dd5e4bed818c0p-28, - -0x1.7cd1b44dbfdc3p-27, -0x1.981def704f39ep-27, -0x1.6f0e87a0f3e35p-27, -0x1.267c0dc9b6e95p-27, -0x1.b2ec3078bf153p-28, -0x1.2b066605239f5p-28, - -0x1.840473ed3d070p-29, -0x1.daf9b9b8c06cap-30, -0x1.1661520cf8a32p-30, -0x1.2fa49c29e30b5p-31, -0x1.4ddfd9d6a7cf4p-32, -0x1.4a55b8564425ap-33, - -0x1.5df1ca746f291p-34, -0x1.dd6b8d1ec2e4fp-36, -0x1.34c63d902f888p-36, -0x1.b55b65a1655c0p-37, -0x1.9c1cfd1e2142cp-39, 0x1.98f2b73f288c4p-43, - -0x1.3baba91a10af8p-39, -0x1.8cb03e5359e2bp-38, -0x1.16063ce2129afp-37, -0x1.9fd74120d8e00p-38, 0x1.cf0caf7defe71p-39, 0x1.5d029f324f3a7p-39, - -0x1.21268c2290cb5p-38, -0x1.2f6de12d74afdp-39, 0x1.332ead763d55ap-38, 0x1.3cd3a7103e138p-39, -0x1.a64e5d1cdb028p-39, -0x1.1d674b3db2a42p-39, 0}, - -{-0x1.b84a0abf33534p-27, -0x1.89c6cd0cf2b65p-27, -0x1.09bb37091d4aep-27, -0x1.68f777b72ca95p-29, 0x1.60a5240c5ece1p-29, 0x1.c7421c28ef551p-28, - 0x1.2e75b6acb2116p-27, 0x1.30f14412b258cp-27, 0x1.f153992d28a09p-28, 0x1.3b80153a3c97bp-28, 0x1.df36fe4b5094cp-30, -0x1.724a2b185f507p-31, - -0x1.37cb36ce4237dp-29, -0x1.963d70f677f90p-29, -0x1.8d5c135b0af66p-29, -0x1.42fbc01c11a3bp-29, -0x1.baba060b7adb1p-30, -0x1.eaf481fbc6feap-31, - -0x1.5b5d0a354e49cp-32, 0x1.fb57bbdb6f854p-35, 0x1.2423823b5dcaep-32, 0x1.64e9c7f44ececp-32, 0x1.59b6fb115bcefp-32, 0x1.179a1737c24d9p-32, - 0x1.a9515bcf95bb0p-33, 0x1.1ca83baba64bdp-33, 0x1.826e7ef89b3cap-34, 0x1.7ab5cb5ca2db0p-35, 0x1.2ce997226e82dp-35, 0x1.fdd14ca5a6d38p-37, - 0x1.d35252de2a363p-37, -0x1.8dd5e799b3695p-39, 0x1.047fd46786432p-38, 0x1.aa8639c65a4a4p-38, 0x1.10495d2cdaee5p-41, -0x1.24b2b7e751230p-40, - 0x1.e2ec0b9e9b211p-40, 0x1.6203cc50754ffp-38, 0x1.f95c0def7238bp-38, 0x1.7b31a463405b9p-38, -0x1.a826fa90b3c96p-39, -0x1.3f6315812b719p-39, - 0x1.0862d42832ac6p-38, 0x1.1575d5fa4614cp-39, -0x1.18eb527929cedp-38, -0x1.21bd844e0e3b8p-39, 0x1.8233e415548a0p-39, 0x1.0501b16f5819bp-39, 0}, - -{0x1.9b4497171a29dp-39, 0x1.7f9c0bcd4b3e7p-32, 0x1.4928133bccac3p-31, 0x1.7b5a70f49485bp-31, 0x1.4f71ee2c4aff3p-31, 0x1.bca22e6a9cd38p-32, - 0x1.1c93a34970852p-33, -0x1.03d86c164d20cp-33, -0x1.448222383eb95p-32, -0x1.95aa76b3417ddp-32, -0x1.80448ecd34689p-32, -0x1.19d3f547d1f1fp-32, - -0x1.2c65995a6a63fp-33, -0x1.01b5832823cc6p-35, 0x1.97d70f56a4524p-35, 0x1.7d57df58d20a9p-34, 0x1.a3d6fe32773b9p-34, 0x1.6ff53581ac827p-34, - 0x1.faff84d277a6fp-35, 0x1.39ff19e23455bp-35, 0x1.9b1e383b8e03dp-37, 0x1.fd37bce839816p-40, -0x1.31b58a910d109p-37, -0x1.480a28743a67fp-37, - -0x1.9a8b926ca51b4p-37, -0x1.14d6b0b9c8256p-37, -0x1.227dfd10a7f51p-37, -0x1.d1d5ba9e5676cp-42, -0x1.71c57d72b90eap-38, -0x1.018922e3bb1eap-40, - -0x1.e0970faab38e6p-39, 0x1.a442b8ab5ed33p-39, -0x1.3a6f0acbd7293p-40, -0x1.7c53be7062a3ap-39, -0x1.c562622693573p-44, 0x1.458e668db57cdp-41, - 
-0x1.d5f41a61e90a0p-41, -0x1.60d1f7c57cb11p-39, -0x1.f8fa4c98324fep-39, -0x1.7b178840b90e3p-39, 0x1.a8558cdf5220ap-40, 0x1.3f7acb241cdbbp-40, - -0x1.086dc81118428p-39, -0x1.15828db8b2da6p-40, 0x1.18f9d5a5099c3p-39, 0x1.21cd05249b8c9p-40, -0x1.82493a2d7a1fep-40, -0x1.0510a8a58c1abp-40, 0}, - -{0x1.4c0cf8eccd2e0p-35, 0x1.de696ed8004cbp-36, 0x1.62392d5363e58p-37, -0x1.21d68e1a8e4c7p-37, -0x1.867b57075ec9dp-36, -0x1.058af4c30abafp-35, - -0x1.dbb6594ed5127p-36, -0x1.6006d1f354794p-36, -0x1.311e96adfec96p-37, 0x1.2c82e5ef56703p-39, 0x1.6f2c1413cbe8ep-37, 0x1.c46886dd6c5d6p-37, - 0x1.92e273bf63d54p-37, 0x1.2982faf5df034p-37, 0x1.5ad37b1dc30c4p-38, 0x1.97104fd2630f8p-40, -0x1.38bcd955ecbb9p-40, -0x1.7779727d36c91p-39, - -0x1.4862c13c3ccf5p-39, -0x1.53facd6319433p-39, -0x1.de2f6e88b0926p-41, -0x1.fb0967f0fa611p-41, 0x1.5fadb405af344p-42, 0x1.e90319ef64411p-43, - 0x1.fc013fac4d3d7p-41, 0x1.0546d08a05cacp-41, 0x1.fa1b10c35012ep-41, -0x1.000d4354b8049p-41, 0x1.b68ee44b2b84bp-41, 0x1.cfa36d83ea2afp-48, - 0x1.5c41a6c8aaf3ap-41, -0x1.7edb2342ceb28p-41, 0x1.d9211942a37d9p-43, 0x1.39b815d399ba2p-41, 0x1.1fc46969db91bp-46, -0x1.1736507c25bafp-43, - 0x1.89bbcfdb5c677p-43, 0x1.28f22b295bc86p-41, 0x1.a9396e0b45a3bp-41, 0x1.3f409ac2dbfafp-41, -0x1.65682520f07a7p-42, -0x1.0d1586492d3b1p-42, - 0x1.bd6c9f236abc3p-42, 0x1.d376a4bd795bep-43, -0x1.d94e87dd31275p-42, -0x1.e82d04ff5649fp-43, 0x1.455b18d5d810fp-42, 0x1.b7c6a4ab711bdp-43, 0} - // clang-format on - }}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_1u8.c b/contrib/arm-optimized-routines/pl/math/v_erfc_1u8.c new file mode 100644 index 000000000000..10ef7e6a3c34 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfc_1u8.c @@ -0,0 +1,198 @@ +/* + * Double-precision vector erfc(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + uint64x2_t offset, table_scale; + float64x2_t max, shift; + float64x2_t p20, p40, p41, p42; + float64x2_t p51, p52; + float64x2_t qr5, qr6, qr7, qr8, qr9; +#if WANT_SIMD_EXCEPT + float64x2_t uflow_bound; +#endif +} data = { + /* Set an offset so the range of the index used for lookup is 3487, and it + can be clamped using a saturated add on an offset index. + Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */ + .offset = V2 (0xbd3ffffffffff260), + .table_scale = V2 (0x37f0000000000000 << 1), /* asuint64 (2^-128) << 1. */ + .max = V2 (0x1.b3ep+4), /* 3487/128. */ + .shift = V2 (0x1p45), + .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */ + .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */ + .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */ + .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */ + .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */ + .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */ + /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */ + .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 }, + .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 }, + .qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 }, + .qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 }, + .qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 }, +#if WANT_SIMD_EXCEPT + .uflow_bound = V2 (0x1.a8b12fc6e4892p+4), +#endif +}; + +#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. */ +#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. 
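
The clamp described above is min(k, 3487) computed with one saturating add; a scalar sketch of the same arithmetic:

    #include <stdint.h>

    /* With OFF = 2^64 - 1 - 3487, k + OFF saturates at 2^64 - 1 exactly
       when k > 3487, so sat_add(k, OFF) - OFF == min(k, 3487).  UQADD
       does the saturating add in a single vector instruction.  */
    static uint64_t
    clamp_3487_sketch (uint64_t k)
    {
      const uint64_t off = 0xffffffffffffffffULL - 3487;
      uint64_t s = k + off;
      if (s < k) /* wrapped: saturate, as UQADD would */
        s = ~0ULL;
      return s - off;
    }

Folding the un-bias into the table pointer (tab - Off) then makes the clamp free of any extra compare or min.
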
*/ + +struct entry +{ + float64x2_t erfc; + float64x2_t scale; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])), + e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1])); + e.erfc = vuzp1q_f64 (e1, e2); + e.scale = vuzp2q_f64 (e1, e2); + return e; +} + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + return v_call_f64 (erfc, x, y, cmp); +} +#endif + +/* Optimized double-precision vector erfc(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5 + + p6(r) d^6 + ... + p10(r) d^10 + + Polynomials p6(r) to p10(r) are computed using recurrence relation + + 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0, + with p0 = 1, and p1(r) = -r. + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum measured error: 1.71 ULP + V_NAME_D1 (erfc)(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608 + want 0x1.e15fcbea3e7adp-608. */ +VPCS_ATTR +float64x2_t V_NAME_D1 (erfc) (float64x2_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */ + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound)); + /* x >= ~26.54 (into subnormal case and uflow case). Comparison is done in + integer domain to avoid raising exceptions in presence of nans. */ + uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x), + vreinterpretq_s64_f64 (dat->uflow_bound)); + cmp = vorrq_u64 (cmp, uflow); + float64x2_t xm = x; + /* If any lanes are special, mask them with 0 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u64 (cmp))) + x = v_zerofy_f64 (x, cmp); +#endif + + float64x2_t a = vabsq_f64 (x); + a = vminq_f64 (a, dat->max); + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. */ + float64x2_t shift = dat->shift; + float64x2_t z = vaddq_f64 (a, shift); + + /* Clamp index to a range of 3487. A naive approach would use a subtract and + min. Instead we offset the table address and the index, then use a + saturating add. */ + uint64x2_t i = vqaddq_u64 (vreinterpretq_u64_f64 (z), dat->offset); + + struct entry e = lookup (i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). 
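
The recurrence quoted above generates each higher-order coefficient from the previous two; in scalar form (next_p is illustrative):

    /* From 2(i+1) p_i + 2r(i+2) p_{i+1} + (i+2)(i+3) p_{i+2} = 0:
       p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1},
       with Q_j = (j+1)/j and R_j = -2j / ((j+1)(j+2)).  */
    static double
    next_p (int i, double r, double p_i, double p_i1)
    {
      double q = (double) (i + 2) / (i + 1);                    /* Q_{i+1} */
      double s = -2.0 * (i + 1) / ((double) (i + 2) * (i + 3)); /* R_{i+1} */
      return (p_i + r * q * p_i1) * s;
    }
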
*/ + float64x2_t r = vsubq_f64 (z, shift); + float64x2_t d = vsubq_f64 (a, r); + float64x2_t d2 = vmulq_f64 (d, d); + float64x2_t r2 = vmulq_f64 (r, r); + + float64x2_t p1 = r; + float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20)); + float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42); + p4 = vfmsq_f64 (dat->p40, r2, p4); + float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5)); + /* Compute p_i using recurrence relation: + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ + float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0)); + p6 = vmulq_laneq_f64 (p6, dat->qr5, 1); + float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0)); + p7 = vmulq_laneq_f64 (p7, dat->qr6, 1); + float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0)); + p8 = vmulq_laneq_f64 (p8, dat->qr7, 1); + float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0)); + p9 = vmulq_laneq_f64 (p9, dat->qr8, 1); + float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0)); + p10 = vmulq_laneq_f64 (p10, dat->qr9, 1); + /* Compute polynomial in d using pairwise Horner scheme. */ + float64x2_t p90 = vfmaq_f64 (p9, d, p10); + float64x2_t p78 = vfmaq_f64 (p7, d, p8); + float64x2_t p56 = vfmaq_f64 (p5, d, p6); + float64x2_t p34 = vfmaq_f64 (p3, d, p4); + float64x2_t p12 = vfmaq_f64 (p1, d, p2); + float64x2_t y = vfmaq_f64 (p78, d2, p90); + y = vfmaq_f64 (p56, d2, y); + y = vfmaq_f64 (p34, d2, y); + y = vfmaq_f64 (p12, d2, y); + + y = vfmsq_f64 (e.erfc, e.scale, vfmsq_f64 (d, d2, y)); + + /* Offset equals 2.0 if sign, else 0.0. */ + uint64x2_t sign = vshrq_n_u64 (vreinterpretq_u64_f64 (x), 63); + float64x2_t off = vreinterpretq_f64_u64 (vshlq_n_u64 (sign, 62)); + /* Copy sign and scale back in a single fma. Since the bit patterns do not + overlap, then logical or and addition are equivalent here. */ + float64x2_t fac = vreinterpretq_f64_u64 ( + vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp))) + return special_case (xm, vfmaq_f64 (off, fac, y), cmp); +#endif + + return vfmaq_f64 (off, fac, y); +} + +PL_SIG (V, D, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (V_NAME_D1 (erfc), 1.21) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c deleted file mode 100644 index c30635153a20..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Double-precision vector erfc(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "horner.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -/* Accurate exponential (vector variant of exp_dd). */ -v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); - -#define One v_f64 (1.0) -#define AbsMask v_u64 (0x7fffffffffffffff) -#define Scale v_f64 (0x1.0000002p27) - -/* Coeffs for polynomial approximation on [0x1.0p-28., 31.]. 
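
A scalar sketch of the finishing step above: the stored table values carry a factor of 2^128 to stay clear of subnormals, and fac folds the sign and that scaling back in together:

    #include <math.h>

    /* erfc(-x) = 2 - erfc(x): off is 0.0 or 2.0 and fac is +/-2^-128,
       so one fma applies sign, descaling and offset at once.  */
    static double
    erfc_finish_sketch (double y_scaled, int negative)
    {
      double off = negative ? 2.0 : 0.0;
      double fac = negative ? -0x1p-128 : 0x1p-128;
      return fma (y_scaled, fac, off);
    }
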
*/ -#define PX __v_erfc_data.poly -#define xint __v_erfc_data.interval_bounds - -/* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (erfc, x, y, cmp); -} - -/* A structure to perform look-up in coeffs and other parameter - tables. */ -struct entry -{ - v_f64_t P[ERFC_POLY_ORDER + 1]; - v_f64_t xi; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - for (int j = 0; j <= ERFC_POLY_ORDER; ++j) - e.P[j] = PX[i][j]; - e.xi = xint[i]; -#else - for (int j = 0; j <= ERFC_POLY_ORDER; ++j) - { - e.P[j][0] = PX[i[0]][j]; - e.P[j][1] = PX[i[1]][j]; - } - e.xi[0] = xint[i[0]]; - e.xi[1] = xint[i[1]]; -#endif - return e; -} - -/* Accurate evaluation of exp(x^2) using compensated product - (x^2 ~ x*x + e2) and custom exp(y+d) routine for small - corrections d<> 63) << 62); - /* Use 12-bit for small, nan and inf case detection. */ - atop = (ix >> 52) & 0x7ff; - cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd)); - - struct entry dat; - - /* All entries of the vector are out of bounds, take a short path. - Use smallest possible number above 28 representable in 12 bits. */ - v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404)); - - /* Use sign to produce either 0 if x > 0, 2 otherwise. */ - if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp))) - return fac; - - /* erfc(|x|) = P(|x|-x_i)*exp(-x^2). */ - - v_f64_t a = v_abs_f64 (x); - - /* Interval bounds are a logarithmic scale, i.e. interval n has - lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain - the interval index. */ - v_f64_t xp1 = a + v_f64 (1.0); - xp1 = xp1 * xp1; - xp1 = xp1 * xp1; - v_u64_t ixp1 = v_as_u64_f64 (xp1); - i = (ixp1 >> 52) - v_u64 (1023); - - /* Index cannot exceed number of polynomials. */ -#ifdef SCALAR - i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS; -#else - i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS, - i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS}; -#endif - /* Get coeffs of i-th polynomial. */ - dat = lookup (i); - - /* Evaluate Polynomial: P(|x|-x_i). */ - z = a - dat.xi; -#define C(i) dat.P[i] - p = HORNER_12 (z, C); - - /* Evaluate Gaussian: exp(-x^2). */ - v_f64_t e = v_eval_gauss (a); - - /* Copy sign. */ - sign = v_as_u64_f64 (x) & ~AbsMask; - p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign); - - /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise. */ - y = v_fma_f64 (p, e, fac); - - /* No need to fix value of y if x is out of bound, as - P[ERFC_NUM_INTERVALS]=0. */ - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS - -PL_SIG (V, D, 1, erfc, -6.0, 28.0) -PL_TEST_ULP (V_NAME (erfc), 3.15) -PL_TEST_INTERVAL (V_NAME (erfc), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-1022, 0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-1022, -0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-26, 0x1p5, 40000) -PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erfc), 0, inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_data.c b/contrib/arm-optimized-routines/pl/math/v_erfc_data.c deleted file mode 100644 index 3c47033c1170..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erfc_data.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Polynomial coefficients for double-precision erfc(x) vector function. - * - * Copyright (c) 2020-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Coefficients for 20 order-12 polynomials used in v_erfc. The intervals have - the same bounds as the scalar algorithm, with the exception of the lower - bound of the first interval which is larger. This is because the vector - variants fall back to the scalar for tiny arguments, meaning that we can use - a slightly different approach which is more precise for larger inputs but - unacceptably imprecise for tiny inputs. */ - -const struct v_erfc_data __v_erfc_data = { - -/* Bounds for 20 intervals spanning [0x1.0p-28., 31.]. Interval bounds are a - logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the - exception of the first interval. */ -.interval_bounds = { - 0x1p-28, /* If xmin=2^-28, 0 otherwise. */ - 0x1.837f0518db8a9p-3, /* 0.189. */ - 0x1.a827999fcef32p-2, /* 0.414. */ - 0x1.5d13f32b5a75bp-1, /* 0.682. */ - 0x1.0p0, /* 1.000. */ - 0x1.60dfc14636e2ap0, /* 1.378. */ - 0x1.d413cccfe779ap0, /* 1.828. */ - 0x1.2e89f995ad3adp1, /* 2.364. */ - 0x1.8p1, /* 3.000. */ - 0x1.e0dfc14636e2ap1, /* 3.757. */ - 0x1.2a09e667f3bcdp2, /* 4.657. */ - 0x1.6e89f995ad3adp2, /* 5.727. */ - 0x1.cp2, /* 7.000. */ - 0x1.106fe0a31b715p3, /* 8.514. */ - 0x1.4a09e667f3bcdp3, /* 10.31. */ - 0x1.8e89f995ad3adp3, /* 12.45. */ - 0x1.ep3, /* 15.00. */ - 0x1.206fe0a31b715p4, /* 18.03. */ - 0x1.5a09e667f3bcdp4, /* 21.63. */ - 0x1.9e89f995ad3adp4, /* 25.91. */ - 0x1.fp4 /* 31.00. */ -}, - -/* Generated using fpminimax algorithm on each interval separately. The - polynomial approximates erfc(x + a) * exp((x + a) ^ 2) in the interval - [0;b-a], where [a;b] is the interval in which the input lies. Note this is - slightly different from the scalar polynomial, which approximates - erfc(x + a) * exp(x ^ 2). See v_erfc.sollya for more details. */ -.poly = { -/* 3.725290298461914e-9 < x < 0.18920711500272103. */ -{0x1.ffffffdbe4516p-1, -0x1.20dd74e429b54p0, 0x1.ffffffb7c6a67p-1, -0x1.8127466fa2ec9p-1, 0x1.ffffff6eeff5ap-2, -0x1.341f668c90dccp-2, 0x1.5554aca74e5d6p-3, -0x1.6014d9d3fed0dp-4, 0x1.546b5f2c85127p-5, -0x1.2f7ec79acc129p-6, 0x1.a27e53703b7abp-8, 0x1.7b18bce311fa3p-12, -0x1.1897cda04df3ap-9}, -/* 0.18920711500272103 < x < 0.41421356237309515. */ -{0x1.a2b43de077724p-1, -0x1.a3495bb58664cp-1, 0x1.535f3ff4547e6p-1, -0x1.d96eea2951a7cp-2, 0x1.269566a956371p-2, -0x1.4e281de026b47p-3, 0x1.5ea071b652a2fp-4, -0x1.57f46cfca7024p-5, 0x1.3db28243f06abp-6, -0x1.138745eef6f26p-7, 0x1.a9cd70bad344p-9, -0x1.c6e4fda8920c4p-11, 0x1.624709ca2bc71p-16}, -/* 0.41421356237309515 < x < 0.681792830507429. */ -{0x1.532e75764e513p-1, -0x1.28be34f327f9dp-1, 0x1.b088738cca84cp-2, -0x1.14377551bd5c8p-2, 0x1.3e1ecedd64246p-3, -0x1.5087f3110eb57p-4, 0x1.4b3c61efcb562p-5, -0x1.324cc70a4f459p-6, 0x1.0cd19a96af21bp-7, -0x1.cc2ccc725d07p-9, 0x1.a3ba67a7d02b4p-10, -0x1.b1943295882abp-11, 0x1.53a1c5fdf8e67p-12}, -/* 0.681792830507429 < x < 1. */ -{0x1.10f974588f63dp-1, -0x1.9b032139e3367p-2, 0x1.09b942b8a951dp-2, -0x1.327553909cb88p-3, 0x1.42819b6c9a14p-4, -0x1.3a6d6f1924825p-5, 0x1.1f1864dd6f28fp-6, -0x1.ef12c5e9f3232p-8, 0x1.962ac63d55aa1p-9, -0x1.4146d9206419cp-10, 0x1.f823f62268229p-12, -0x1.837ab488d5ed8p-13, 0x1.aa021ae16edfep-15}, -/* 1 < x < 1.378414230005442. 
*/ -{0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c034p-2, 0x1.3c27283c31939p-3, -0x1.44837f88a0ecdp-4, 0x1.33cad0dc779c8p-5, -0x1.10fcef8294e8dp-6, 0x1.c8cb3e5a6a5a6p-8, -0x1.6aedbd3a05f1cp-9, 0x1.1325c0bf9a0cap-10, -0x1.8e28d61a0f646p-12, 0x1.0d554e2ab3652p-13, -0x1.35b5f9ac296ebp-15, 0x1.b8faf07e2527dp-18}, -/* 1.378414230005442 < x < 1.8284271247461903. */ -{0x1.5ee444130b7dbp-2, -0x1.78396ab2083e8p-3, 0x1.6e617ec5bc039p-4, -0x1.49e60f6238765p-5, 0x1.16064fb4428c9p-6, -0x1.ba80a8575a434p-8, 0x1.4ec30f2efeb8p-9, -0x1.e40456c735f09p-11, 0x1.4f7ee6b7885b7p-12, -0x1.bc9997995fdecp-14, 0x1.1169f7327ff2p-15, -0x1.174826d000852p-17, 0x1.5506a7433e925p-20}, -/* 1.8284271247461903 < x < 2.363585661014858. */ -{0x1.19a22c064d4eap-2, -0x1.f645498cae1b3p-4, 0x1.a0565950e1256p-5, -0x1.446605c186f6dp-6, 0x1.df1231b47ff04p-8, -0x1.515164d13dfafp-9, 0x1.c72bde869ad61p-11, -0x1.2768fbf9b1d6ep-12, 0x1.71bd3a1b851e9p-14, -0x1.bca5b5942017cp-16, 0x1.f2d480b3a2e63p-18, -0x1.d339662d53467p-20, 0x1.06d67ebf792bp-22}, -/* 2.363585661014858 < x < 3. */ -{0x1.c57f0542a7637p-3, -0x1.4e5535c17af25p-4, 0x1.d31272523acfep-6, -0x1.3727cbbfd1bfcp-7, 0x1.8d6730b8c5a4cp-9, -0x1.e88548286036fp-11, 0x1.21f6e89456853p-12, -0x1.4d4b7787bd3c2p-14, 0x1.735dc84e7ff16p-16, -0x1.8eb02db832048p-18, 0x1.8dfb8add3b86ep-20, -0x1.47a340d76c72bp-22, 0x1.3e5925ffebe6bp-25}, -/* 3 < x < 3.756828460010884. */ -{0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b1adp-5, 0x1.043fe1a98c3b9p-6, -0x1.259061ba34453p-8, 0x1.409cc2cc96bedp-10, -0x1.53dec3fd6c443p-12, 0x1.5e72f7baf3554p-14, -0x1.601aa94bf21eep-16, 0x1.58e730ceaa91dp-18, -0x1.4762cbd256163p-20, 0x1.22b8bea5d4a5ap-22, -0x1.ac197af37fcadp-25, 0x1.74cdf138a0b73p-28}, -/* 3.756828460010884 < x < 4.656854249492381. */ -{0x1.29a8a4e95063ep-3, -0x1.29a8a316d331dp-5, 0x1.21876b3fe50cfp-7, -0x1.1276f2d8eefd9p-9, 0x1.fbff521741e5cp-12, -0x1.cb9ce996b9601p-14, 0x1.971075371ef81p-16, -0x1.61458571e4738p-18, 0x1.2c51c21b7ab9ep-20, -0x1.f01e444a666c3p-23, 0x1.7e8f2979b67f1p-25, -0x1.e505367843027p-28, 0x1.67809d68de49cp-31}, -/* 4.656854249492381 < x < 5.727171322029716. */ -{0x1.e583024e2bc7fp-4, -0x1.8fb458acb5acep-6, 0x1.42b9dffac075cp-8, -0x1.ff9fe9a48522p-11, 0x1.8e7e866f4f073p-13, -0x1.313aeee1c2d45p-15, 0x1.cc299efd7374cp-18, -0x1.5587e53442d66p-20, 0x1.f2aca160f159bp-23, -0x1.62ae4834dcda7p-25, 0x1.d6b070147cb37p-28, -0x1.fee399e7be1bfp-31, 0x1.41d6f9fbc9515p-34}, -/* 5.727171322029716 < x < 7. */ -{0x1.8d9cbafa30408p-4, -0x1.0dd14614ed1cfp-6, 0x1.6943976ea6bf4p-9, -0x1.dd6f05f3b914cp-12, 0x1.37891317e7bcfp-14, -0x1.91a81ce9014a2p-17, 0x1.ffcac303208b9p-20, -0x1.424f1af78feb3p-22, 0x1.90b8edbca12a5p-25, -0x1.e69bea0338c7fp-28, 0x1.13b974a710373p-30, -0x1.fdc9aa9359794p-34, 0x1.105fc772b5a66p-37}, -/* 7 < x < 8.513656920021768. */ -{0x1.46dc6bf900f68p-4, -0x1.6e4b45246f95p-7, 0x1.96a3de47d4bd7p-10, -0x1.bf5070eccb409p-13, 0x1.e7af6e83607a2p-16, -0x1.078bf5306f9eep-18, 0x1.1a6e8327243adp-21, -0x1.2c1e7368c7809p-24, 0x1.3bc83557dac43p-27, -0x1.45a6405b2e649p-30, 0x1.3aac4888689ebp-33, -0x1.f1fa23448a168p-37, 0x1.c868668755778p-41}, -/* 8.513656920021768 < x < 10.313708498984761. */ -{0x1.0d9a17e032288p-4, -0x1.f3e942ff4df7p-8, 0x1.cc77f09dabc5cp-11, -0x1.a56e8bfd32da8p-14, 0x1.7f49e31164409p-17, -0x1.5a73f46a6afc9p-20, 0x1.374240ce973d2p-23, -0x1.15e8d473b728cp-26, 0x1.ec3ec79699378p-30, -0x1.ab3b8aba63362p-33, 0x1.5a1381cfe2866p-36, -0x1.c78e252ce77ccp-40, 0x1.589857ceaaaeep-44}, -/* 10.313708498984761 < x < 12.454342644059432. 
*/ -{0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cbb1p-8, 0x1.0645980ecbbfcp-11, -0x1.8f86f887f6598p-15, 0x1.2ef80cd9e00b1p-18, -0x1.c97ffd66720e4p-22, 0x1.57f0eeecf030ap-25, -0x1.016df7d5e28d9p-28, 0x1.7f0d022922f1dp-32, -0x1.1849731f004aep-35, 0x1.8149e7ca0fb3cp-39, -0x1.b1fe4abe62d81p-43, 0x1.1ae4d60247651p-47}, -/* 12.454342644059432 < x < 15. */ -{0x1.71eafbd9f5877p-5, -0x1.d83714d90461fp-9, 0x1.2c74dbacd45fdp-12, -0x1.7d27f3cfe160ep-16, 0x1.e20b13b8d32e3p-20, -0x1.2fe33cb2bce33p-23, 0x1.7dfd564d69a07p-27, -0x1.dea62ef0f7d7ep-31, 0x1.2a7b946273ea5p-34, -0x1.6eb665bad5b72p-38, 0x1.a8191750e8bf9p-42, -0x1.92d8a86cbd0fcp-46, 0x1.bba272feef841p-51}, -/* 15 < x < 18.027313840043536. */ -{0x1.33714a024097ep-5, -0x1.467f441a50bc3p-9, 0x1.59fa2994c6f7ap-13, -0x1.6dd369d642b7dp-17, 0x1.81fb2aaf2e37p-21, -0x1.966040990b623p-25, 0x1.aaee55e15a079p-29, -0x1.bf756fc8ef04p-33, 0x1.d2daf554e0157p-37, -0x1.dec63e10d317p-41, 0x1.cae915bab7704p-45, -0x1.6537fbb62a8edp-49, 0x1.3f14bd5531da8p-54}, -/* 18.027313840043536 < x < 21.627416997969522. */ -{0x1.fff97acd75487p-6, -0x1.c502e8e46eb81p-10, 0x1.903b065062756p-14, -0x1.6110aa5e81885p-18, 0x1.36fd4c13c4f1fp-22, -0x1.11848650be987p-26, 0x1.e06596bf6a27p-31, -0x1.a527876771d55p-35, 0x1.6fe1b92a40eb8p-39, -0x1.3c6eb50b23bc6p-43, 0x1.fead2230125dp-48, -0x1.5073427c5207dp-52, 0x1.ff420973fa51dp-58}, -/* 21.627416997969522 < x < 25.908685288118864. */ -{0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf8e5p-10, 0x1.d0ddfb858b60ap-15, -0x1.5673f4a8bb08ep-19, 0x1.f80488e89ddb9p-24, -0x1.728391905fcf3p-28, 0x1.101538d7e30bap-32, -0x1.8f16f49d0fa3bp-37, 0x1.23bbaea534034p-41, -0x1.a40119533ee1p-46, 0x1.1b75770e435fdp-50, -0x1.3804bdeb33efdp-55, 0x1.8ba4e7838a4dp-61}, -/* 25.908685288118864 < x < 31. */ -{0x1.64839d636f92bp-6, -0x1.b7adf753623afp-11, 0x1.0eec0b635a0c4p-15, -0x1.4da09b802ef48p-20, 0x1.9a8b149f5ddf1p-25, -0x1.f8d1f722c65bap-30, 0x1.36247d9a20e19p-34, -0x1.7cbd25180c1d3p-39, 0x1.d243c7a5c8331p-44, -0x1.19e00cc6b1e08p-48, 0x1.418cb6823f2d9p-53, -0x1.2dfdc526c43acp-58, 0x1.49885a987486fp-64}, -/* Dummy interval for x>31 */ -{0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, - 0x0p0, 0x0p0, 0x0p0} -} -}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c deleted file mode 100644 index 963490d789bd..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Single-precision vector erfc(x) function. - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "erfcf.h" -#include "estrin.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)] - -VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); - -static VPCS_ATTR NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t special) -{ - return v_call_f32 (erfcf, x, y, special); -} - -static inline uint32_t -interval_index (uint32_t ia12) -{ - // clang-format off - return (ia12 < 0x400 ? 0 : - (ia12 < 0x408 ? 1 : - (ia12 < 0x410 ? 2 : - 3))); - // clang-format on -} - -/* The C macro wraps the coeffs argument in order to make the - poynomial evaluation more readable. In the scalarised variant the - second pointer is ignored. 
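
The removed routine evaluated both the polynomial and the Gaussian in double precision to meet its ULP target; a hypothetical scalar analogue of that promote/narrow path:

    #include <math.h>

    /* Promote a float lane, compute P(|x|) * exp(-x^2) in double, and
       round back to float once (poly is an illustrative callback).  */
    static float
    poly_gauss_sketch (float ax, double (*poly) (double))
    {
      double a = (double) ax;
      return (float) (poly (a) * exp (-a * a));
    }
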
*/ -#ifdef SCALAR -#define C(i) coeff1[i] -#else -#define C(i) ((v_f64_t){coeff1[i], coeff2[i]}) -#endif - -static inline v_f64_t -v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1, - const double *coeff2) -{ - v_f64_t x2 = x * x; - v_f64_t x4 = x2 * x2; - v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C); - v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0)); - return poly * gauss; -} - -static inline float -approx_poly_gauss (float abs_x, const double *coeff) -{ - return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x)); -} - -static v_f32_t -v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes) -{ -#ifdef SCALAR - float y = approx_poly_gauss (abs_x, P (ia12)); - return sign ? 2 - y : y; -#else - float32x2_t lo32 = {0, 0}; - float32x2_t hi32 = {0, 0}; - /* The polynomial and Gaussian components must be calculated in - double precision in order to meet the required ULP error. This - means we have to promote low and high halves of the - single-precision input vector to two separate double-precision - input vectors. This incurs some overhead, and there is also - overhead to loading the polynomial coefficients as this cannot be - done in a vector fashion. This would be wasted effort for - elements which lie in the 'boring' zone, as they will be - overwritten later. Hence we use the lanes parameter to only do - the promotion on a pair of lanes if both of those lanes are - interesting and not special cases. If one lane is inactive, we - use a scalar routine which is shared with the scalar variant. */ - if (lanes[0] & lanes[1]) - { - lo32 = vcvt_f32_f64 ( - v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)), - P (ia12[0]), P (ia12[1]))); - } - else if (lanes[0]) - { - lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0])); - } - else if (lanes[1]) - { - lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1])); - } - - if (lanes[2] & lanes[3]) - { - hi32 - = vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x), - P (ia12[2]), P (ia12[3]))); - } - else if (lanes[2]) - { - hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2])); - } - else if (lanes[3]) - { - hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3])); - } - - v_f32_t y = vcombine_f32 (lo32, hi32); - - if (v_any_u32 (sign)) - { - y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y); - } - - return y; -#endif -} - -/* Optimized single-precision vector complementary error function - erfcf. Max measured error: 0.750092 at various values between - -0x1.06521p-20 and -0x1.add1dap-17. For example: - __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0 - +0.249908 ulp err 0.250092. */ -VPCS_ATTR -v_f32_t V_NAME (erfcf) (v_f32_t x) -{ - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t ia = ix & 0x7fffffff; - v_u32_t ia12 = ia >> 20; - v_u32_t sign = ix >> 31; - v_u32_t inf_ia12 = v_u32 (0x7f8); - - v_u32_t special_cases - = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328)); - v_u32_t in_bounds - = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3))); - v_f32_t boring_zone = v_as_f32_u32 (sign << 30); - -#ifdef SCALAR - if (unlikely (special_cases)) - { - if (ia12 >= 0x7f8) - return (float) (sign << 1) + 1.0f / x; /* Special cases. */ - else - return 1.0f - x; /* Small case. */ - } - else if (likely (!in_bounds)) - { - return sign ? 
boring_zone : __math_uflowf (boring_zone); - } -#endif - - v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12, - in_bounds & ~special_cases); - -#ifndef SCALAR - y = vbslq_f32 (~in_bounds, boring_zone, y); - - if (unlikely (v_any_u32 (special_cases))) - { - return specialcase (x, y, special_cases); - } -#endif - - return y; -} -VPCS_ALIAS - -PL_SIG (V, F, 1, erfc, -6.0, 28.0) -PL_TEST_ULP (V_NAME (erfcf), 0.26) -PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000) -PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erfcf_1u7.c b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u7.c new file mode 100644 index 000000000000..c361d0704438 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u7.c @@ -0,0 +1,166 @@ +/* + * Single-precision vector erfc(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + uint32x4_t offset, table_scale; + float32x4_t max, shift; + float32x4_t coeffs, third, two_over_five, tenth; +#if WANT_SIMD_EXCEPT + float32x4_t uflow_bound; +#endif + +} data = { + /* Set an offset so the range of the index used for lookup is 644, and it can + be clamped using a saturated add. */ + .offset = V4 (0xb7fffd7b), /* 0xffffffff - asuint(shift) - 644. */ + .table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1. */ + .max = V4 (10.0625f), /* 10 + 1/16 = 644/64. */ + .shift = V4 (0x1p17f), + /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and + fmas. */ + .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, + .third = V4 (0x1.555556p-2f), + .two_over_five = V4 (-0x1.99999ap-2f), + .tenth = V4 (-0x1.99999ap-4f), +#if WANT_SIMD_EXCEPT + .uflow_bound = V4 (0x1.2639cp+3f), +#endif +}; + +#define TinyBound 0x41000000 /* 0x1p-62f << 1. */ +#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */ +#define Off 0xfffffd7b /* 0xffffffff - 644. */ + +struct entry +{ + float32x4_t erfc; + float32x4_t scale; +}; + +static inline struct entry +lookup (uint32x4_t i) +{ + struct entry e; + float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0])); + float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1])); + float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2])); + float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3])); + float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); + float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + e.erfc = vuzp1q_f32 (e1, e2); + e.scale = vuzp2q_f32 (e1, e2); + return e; +} + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + return v_call_f32 (erfcf, x, y, cmp); +} +#endif + +/* Optimized single-precision vector erfcf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/64. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). 
For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0). + _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120 + want 0x1.f51216p-120. */ +VPCS_ATTR +float32x4_t V_NAME_F1 (erfc) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */ + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound)); + /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in + integer domain to avoid raising exceptions in presence of nans. */ + uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x), + vreinterpretq_s32_f32 (dat->uflow_bound)); + cmp = vorrq_u32 (cmp, uflow); + float32x4_t xm = x; + /* If any lanes are special, mask them with 0 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_zerofy_f32 (x, cmp); +#endif + + float32x4_t a = vabsq_f32 (x); + a = vminq_f32 (a, dat->max); + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. */ + float32x4_t shift = dat->shift; + float32x4_t z = vaddq_f32 (a, shift); + + /* Clamp index to a range of 644. A naive approach would use a subtract and + min. Instead we offset the table address and the index, then use a + saturating add. */ + uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset); + + struct entry e = lookup (i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + float32x4_t r = vsubq_f32 (z, shift); + float32x4_t d = vsubq_f32 (a, r); + float32x4_t d2 = vmulq_f32 (d, d); + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t p1 = r; + float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1); + float32x4_t p3 + = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0)); + float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2); + p4 = vfmsq_f32 (dat->tenth, r2, p4); + + float32x4_t y = vfmaq_f32 (p3, d, p4); + y = vfmaq_f32 (p2, d, y); + y = vfmaq_f32 (p1, d, y); + y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y)); + + /* Offset equals 2.0f if sign, else 0.0f. */ + uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31); + float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30)); + /* Copy sign and scale back in a single fma. Since the bit patterns do not + overlap, then logical or and addition are equivalent here. 
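
Two pieces of the function above written out as scalar sketches: the degree-4 polynomial quoted in the header comment, and the branch-free offset (the bit pattern 0x40000000 is exactly 2.0f):

    #include <stdint.h>
    #include <string.h>

    /* poly(r,d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
                   + (2/15 r^4 - 2/5 r^2 + 1/10) d^4.  */
    static float
    erfcf_poly_sketch (float r, float d)
    {
      float r2 = r * r;
      float p1 = -r;
      float p2 = (2.0f / 3.0f) * r2 - (1.0f / 3.0f);
      float p3 = -r * ((1.0f / 3.0f) * r2 - 0.5f);
      float p4 = (2.0f / 15.0f) * (r2 * r2) - (2.0f / 5.0f) * r2 + 0.1f;
      return 1.0f + d * (p1 + d * (p2 + d * (p3 + d * p4)));
    }

    /* Shifting the sign bit to position 30 gives 0x40000000 = 2.0f for
       negative x and 0.0f otherwise, with no branch.  */
    static float
    erfcf_off_sketch (float x)
    {
      uint32_t u, bits;
      float off;
      memcpy (&u, &x, sizeof u);
      bits = (u >> 31) << 30;
      memcpy (&off, &bits, sizeof off);
      return off;
    }
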
*/ + float32x4_t fac = vreinterpretq_f32_u32 ( + vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return special_case (xm, vfmaq_f32 (off, fac, y), cmp); +#endif + + return vfmaq_f32 (off, fac, y); +} + +PL_SIG (V, F, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (V_NAME_F1 (erfc), 1.14) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c deleted file mode 100644 index 3a25cc8751d1..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Single-precision vector erf(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "include/mathlib.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -VPCS_ATTR v_f32_t V_NAME (expf) (v_f32_t); - -#define AbsMask v_u32 (0x7fffffff) - -/* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - return v_call_f32 (erff, x, y, cmp); -} - -/* A structure to perform look-up in coeffs and other parameter tables. */ -struct entry -{ - v_f32_t P[V_ERFF_NCOEFFS]; -}; - -static inline struct entry -lookup (v_u32_t i) -{ - struct entry e; -#ifdef SCALAR - for (int j = 0; j < V_ERFF_NCOEFFS; ++j) - e.P[j] = __v_erff_data.coeffs[j][i]; -#else - for (int j = 0; j < V_ERFF_NCOEFFS; ++j) - { - e.P[j][0] = __v_erff_data.coeffs[j][i[0]]; - e.P[j][1] = __v_erff_data.coeffs[j][i[1]]; - e.P[j][2] = __v_erff_data.coeffs[j][i[2]]; - e.P[j][3] = __v_erff_data.coeffs[j][i[3]]; - } -#endif - return e; -} - -/* Optimized single precision vector error function erf. - Maximum measured at +/- 0.931, 1.25ULP: - v_erff(-0x1.dc59fap-1) got -0x1.9f9c88p-1 - want -0x1.9f9c8ap-1. */ -VPCS_ATTR -v_f32_t V_NAME (erff) (v_f32_t x) -{ - /* Handle both inf/nan as well as small values (|x|<2^-28). If any condition - in the lane is true then a loop over scalar calls will be performed. */ - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t atop = (ix >> 16) & v_u32 (0x7fff); - v_u32_t cmp = v_cond_u32 (atop - v_u32 (0x3180) >= v_u32 (0x7ff0 - 0x3180)); - - /* Get sign and absolute value. */ - v_u32_t sign = ix & ~AbsMask; - /* |x| < 0.921875. */ - v_u32_t red = v_calt_f32 (x, v_f32 (0.921875f)); - /* |x| > 4.0. */ - v_u32_t bor = v_cagt_f32 (x, v_f32 (4.0f)); - /* Avoid dependency in abs(x) in division (and comparison). */ - v_u32_t i = v_sel_u32 (red, v_u32 (0), v_u32 (1)); - - /* Get polynomial coefficients. */ - struct entry dat = lookup (i); - - v_f32_t a = v_abs_f32 (x); - v_f32_t z = v_sel_f32 (red, x * x, a); - - /* Evaluate Polynomial of |x| or x^2. */ - v_f32_t r = dat.P[6]; - r = v_fma_f32 (z, r, dat.P[5]); - r = v_fma_f32 (z, r, dat.P[4]); - r = v_fma_f32 (z, r, dat.P[3]); - r = v_fma_f32 (z, r, dat.P[2]); - r = v_fma_f32 (z, r, dat.P[1]); - r = v_sel_f32 (red, r, v_fma_f32 (z, r, dat.P[0])); - r = v_fma_f32 (a, r, a); - - /* y = |x| + |x|*P(|x|) if |x| < 0.921875 - 1 - exp (-(|x|+|x|*P(x^2))) otherwise. 
*/ - v_f32_t y = v_sel_f32 (red, r, v_f32 (1.0f) - V_NAME (expf) (-r)); - - /* Boring domain (absolute value is required to get the sign of erf(-nan) - right). */ - y = v_sel_f32 (bor, v_f32 (1.0f), v_abs_f32 (y)); - - /* y=erf(x) if x>0, -erf(-x) otherwise. */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS - -PL_SIG (V, F, 1, erf, -4.0, 4.0) -PL_TEST_ULP (V_NAME (erff), 0.76) -PL_TEST_INTERVAL (V_NAME (erff), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (V_NAME (erff), 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erff), -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erff), 0x1p-26, 0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erff), -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erff), 0, inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_2u.c b/contrib/arm-optimized-routines/pl/math/v_erff_2u.c new file mode 100644 index 000000000000..502526407df2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erff_2u.c @@ -0,0 +1,118 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32x4_t max, shift, third; +#if WANT_SIMD_EXCEPT + float32x4_t tiny_bound, scale_minus_one; +#endif +} data = { + .max = V4 (3.9375), /* 4 - 8/128. */ + .shift = V4 (0x1p16f), + .third = V4 (0x1.555556p-2f), /* 1/3. */ +#if WANT_SIMD_EXCEPT + .tiny_bound = V4 (0x1p-62f), + .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */ +#endif +}; + +#define AbsMask 0x7fffffff + +struct entry +{ + float32x4_t erf; + float32x4_t scale; +}; + +static inline struct entry +lookup (uint32x4_t i) +{ + struct entry e; + float64_t t0 = *((float64_t *) (__erff_data.tab + i[0])); + float64_t t1 = *((float64_t *) (__erff_data.tab + i[1])); + float64_t t2 = *((float64_t *) (__erff_data.tab + i[2])); + float64_t t3 = *((float64_t *) (__erff_data.tab + i[3])); + float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); + float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + e.erf = vuzp1q_f32 (e1, e2); + e.scale = vuzp2q_f32 (e1, e2); + return e; +} + +/* Single-precision implementation of vector erf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2] + + Values of erf(r) and scale are read from lookup tables. + For |x| > 3.9375, erf(|x|) rounds to 1.0f. + + Maximum error: 1.93 ULP + _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9 + want 0x1.fd6868p-9. */ +float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-62. */ + uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + float32x4_t a = vabsq_f32 (x); + uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max); + + /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. 
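+     Since ulp (0x1p16f) is 1/128 in single precision, z = a + shift leaves
+     round (128 * a) in the low mantissa bits of z, so the table index is
+     simply asuint (z) - asuint (shift), clamped below.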
*/ + float32x4_t shift = dat->shift; + float32x4_t z = vaddq_f32 (a, shift); + + uint32x4_t i + = vsubq_u32 (vreinterpretq_u32_f32 (z), vreinterpretq_u32_f32 (shift)); + i = vminq_u32 (i, v_u32 (512)); + struct entry e = lookup (i); + + float32x4_t r = vsubq_f32 (z, shift); + + /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */ + float32x4_t d = vsubq_f32 (a, r); + float32x4_t d2 = vmulq_f32 (d, d); + float32x4_t y = vfmaq_f32 (r, dat->third, d); + y = vfmaq_f32 (e.erf, e.scale, vfmsq_f32 (d, d2, y)); + + /* Solves the |x| = inf case. */ + y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y); + + /* Copy sign. */ + y = vbslq_f32 (v_u32 (AbsMask), y, x); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y); +#endif + return y; +} + +PL_SIG (V, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (V_NAME_F1 (erf), 1.43) +PL_TEST_EXPECT_FENV (V_NAME_F1 (erf), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_data.c b/contrib/arm-optimized-routines/pl/math/v_erff_data.c deleted file mode 100644 index 73ccb5cbcfa8..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erff_data.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Data for approximation of vector erff. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Minimax approximation of erff. */ -const struct v_erff_data __v_erff_data - = {.coeffs = {{0x0p0f, 0x1.079d0cp-3f}, - {0x1.06eba6p-03f, 0x1.450aa0p-1}, - {-0x1.8126e0p-02f, 0x1.b55cb0p-4f}, - {0x1.ce1a46p-04f, -0x1.8d6300p-6f}, - {-0x1.b68bd2p-06f, 0x1.fd1336p-9f}, - {0x1.473f48p-08f, -0x1.91d2ccp-12f}, - {-0x1.3a1a82p-11f, 0x1.222900p-16f}}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfinv_25u.c b/contrib/arm-optimized-routines/pl/math/v_erfinv_25u.c new file mode 100644 index 000000000000..654a7336e85b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfinv_25u.c @@ -0,0 +1,161 @@ +/* + * Double-precision inverse error function (AdvSIMD variant). + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "pl_test.h" +#include "mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "poly_advsimd_f64.h" +#define V_LOG_INLINE_POLY_ORDER 4 +#include "v_log_inline.h" + +const static struct data +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. P is interleaved P_17 and P_37, similar for Q. P17 + and Q17 are provided as homogenous vectors as well for when the shortcut + can be taken. 
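+     Concretely, P[i] = { P_17[i], P_37[i] } (and likewise for Q, with the
+     final denominator coefficient materialised by a mask instead of a load),
+     so a byte-wise tbl lookup can select the per-lane coefficient for the
+     interval each lane falls in.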
*/ + double P[8][2], Q[7][2]; + float64x2_t tailshift; + uint8x16_t idx; + struct v_log_inline_data log_tbl; + float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6]; +} data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 }, + { -0x1.6b23cc5c6c6d7p+6, 0x1.60b8fe375999ep-2 }, + { 0x1.74e5f6ceb3548p+7, -0x1.779bb9bef7c0fp+1 }, + { -0x1.5200bb15cc6bbp+7, 0x1.786ea384470a2p+3 }, + { 0x1.05d193233a849p+6, -0x1.6a7c1453c85d3p+4 }, + { -0x1.148c5474ee5e1p+3, 0x1.31f0fc5613142p+4 }, + { 0x1.689181bbafd0cp-3, -0x1.5ea6c007d4dbbp+2 }, + { 0, 0x1.e66f265ce9e5p-3 } }, + .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 }, + { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 }, + { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 }, + { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 }, + { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 }, + { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 }, + { 0x1p+0, -0x1.4075c56404eecp+3 } }, + .P_57 = { V2 (0x1.b874f9516f7f1p-14), V2 (0x1.5921f2916c1c4p-7), + V2 (0x1.145ae7d5b8fa4p-2), V2 (0x1.29d6dcc3b2fb7p+1), + V2 (0x1.cabe2209a7985p+2), V2 (0x1.11859f0745c4p+3), + V2 (0x1.b7ec7bc6a2ce5p+2), V2 (0x1.d0419e0bb42aep+1), + V2 (0x1.c5aa03eef7258p-1) }, + .Q_57 = { V2 (0x1.b8747e12691f1p-14), V2 (0x1.59240d8ed1e0ap-7), + V2 (0x1.14aef2b181e2p-2), V2 (0x1.2cd181bcea52p+1), + V2 (0x1.e6e63e0b7aa4cp+2), V2 (0x1.65cf8da94aa3ap+3), + V2 (0x1.7e5c787b10a36p+3), V2 (0x1.0626d68b6cea3p+3), + V2 (0x1.065c5f193abf6p+2), V2 (0x1p+0) }, + .P_17 = { V2 (0x1.007ce8f01b2e8p+4), V2 (-0x1.6b23cc5c6c6d7p+6), + V2 (0x1.74e5f6ceb3548p+7), V2 (-0x1.5200bb15cc6bbp+7), + V2 (0x1.05d193233a849p+6), V2 (-0x1.148c5474ee5e1p+3), + V2 (0x1.689181bbafd0cp-3) }, + .Q_17 = { V2 (0x1.d8fb0f913bd7bp+3), V2 (-0x1.6d7f25a3f1c24p+6), + V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7), + V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) }, + .tailshift = V2 (-0.87890625), + .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + .log_tbl = V_LOG_CONSTANTS }; + +static inline float64x2_t +special (float64x2_t x, const struct data *d) +{ + /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf. + By using log here, instead of log1p, we return finite values for both + these inputs, and values outside [-1, 1]. This is non-compliant, but is an + acceptable optimisation at Ofast. To get correct behaviour for all finite + values use the log1p_inline helper on -abs(x) - note that erfinv(inf) + will still be finite. */ + float64x2_t t = vnegq_f64 ( + v_log_inline (vsubq_f64 (v_f64 (1), vabsq_f64 (x)), &d->log_tbl)); + t = vdivq_f64 (v_f64 (1), vsqrtq_f64 (t)); + float64x2_t ts = vbslq_f64 (v_u64 (0x7fffffffffffffff), t, x); + return vdivq_f64 (v_horner_8_f64 (t, d->P_57), + vmulq_f64 (ts, v_horner_9_f64 (t, d->Q_57))); +} + +static inline float64x2_t +lookup (const double *c, uint8x16_t idx) +{ + float64x2_t x = vld1q_f64 (c); + return vreinterpretq_f64_u8 (vqtbl1q_u8 (vreinterpretq_u8_f64 (x), idx)); +} + +static inline float64x2_t VPCS_ATTR +notails (float64x2_t x, const struct data *d) +{ + /* Shortcut when no input is in a tail region - no need to gather shift or + coefficients. */ + float64x2_t t = vfmaq_f64 (v_f64 (-0.5625), x, x); + float64x2_t p = vmulq_f64 (v_horner_6_f64 (t, d->P_17), x); + float64x2_t q = vaddq_f64 (d->Q_17[5], t); + for (int i = 4; i >= 0; i--) + q = vfmaq_f64 (d->Q_17[i], q, t); + return vdivq_f64 (p, q); +} + +/* Vector implementation of Blair et al's rational approximation to inverse + error function in single-precision. 
Largest observed error is 24.75 ULP: + _ZGVnN2v_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0 + want 0x1.ea0547268660cp+0. */ +float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + /* Calculate inverse error using algorithm described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7. + + Algorithm has 3 intervals: + - 'Normal' region [-0.75, 0.75] + - Tail region [0.75, 0.9375] U [-0.9375, -0.75] + - Extreme tail [-1, -0.9375] U [0.9375, 1] + Normal and tail are both rational approximation of similar order on + shifted input - these are typically performed in parallel using gather + loads to obtain correct coefficients depending on interval. */ + uint64x2_t is_tail = vcagtq_f64 (x, v_f64 (0.75)); + + if (unlikely (!v_any_u64 (is_tail))) + /* If input is normally distributed in [-1, 1] then likelihood of this is + 0.75^2 ~= 0.56. */ + return notails (x, d); + + uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375)); + + uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8)); + uint8x16_t idx = vaddq_u8 (d->idx, off); + + float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625)); + t = vfmaq_f64 (t, x, x); + + float64x2_t p = lookup (&d->P[7][0], idx); + /* Last coeff of q is either 0 or 1 - use mask instead of load. */ + float64x2_t q = vreinterpretq_f64_u64 ( + vandq_u64 (is_tail, vreinterpretq_u64_f64 (v_f64 (1)))); + for (int i = 6; i >= 0; i--) + { + p = vfmaq_f64 (lookup (&d->P[i][0], idx), p, t); + q = vfmaq_f64 (lookup (&d->Q[i][0], idx), q, t); + } + p = vmulq_f64 (p, x); + + if (unlikely (v_any_u64 (extreme_tail))) + return vbslq_f64 (extreme_tail, special (x, d), vdivq_f64 (p, q)); + + return vdivq_f64 (p, q); +} + +PL_SIG (V, D, 1, erfinv, -0.99, 0.99) +PL_TEST_ULP (V_NAME_D1 (erfinv), 24.8) +/* Test with control lane in each interval. */ +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000, + 0.5) +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000, + 0.8) +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000, + 0.95) diff --git a/contrib/arm-optimized-routines/pl/math/v_erfinvf_5u.c b/contrib/arm-optimized-routines/pl/math/v_erfinvf_5u.c new file mode 100644 index 000000000000..5a6800b86ae9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfinvf_5u.c @@ -0,0 +1,163 @@ +/* + * Single-precision inverse error function (AdvSIMD variant). + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_advsimd_f32.h" +#include "v_logf_inline.h" + +const static struct data +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. Coefficients are stored in various interleaved + formats to allow for table-based (vector-to-vector) lookup. + + Plo is first two coefficients of P_10 and P_29 interleaved. + PQ is third coeff of P_10 and first of Q_29 interleaved. + Qhi is second and third coeffs of Q_29 interleaved. + P29_3 is a homogenous vector with fourth coeff of P_29. + + P_10 and Q_10 are also stored in homogenous vectors to allow better + memory access when no lanes are in a tail region. 
*/ + float32x4_t Plo, PQ, Qhi, P29_3, tailshift; + float32x4_t P_50[6], Q_50[2]; + float32x4_t P_10[3], Q_10[3]; + uint8x16_t idxhi, idxlo; + struct v_logf_data logf_tbl; +} data = { + .idxlo = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + .idxhi = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 }, + .P29_3 = V4 (0x1.b13626p-2), + .tailshift = V4 (-0.87890625), + .Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 }, + .PQ = { -0x1.293ff6p+3, -0x1.f59ee2p+0, -0x1.8265eep+3, -0x1.69952p-4 }, + .Qhi = { 0x1.ef5eaep+4, 0x1.c7b7d2p-1, -0x1.12665p+4, -0x1.167d7p+1 }, + .P_50 = { V4 (0x1.3d8948p-3), V4 (0x1.61f9eap+0), V4 (0x1.61c6bcp-1), + V4 (-0x1.20c9f2p+0), V4 (0x1.5c704cp-1), V4 (-0x1.50c6bep-3) }, + .Q_50 = { V4 (0x1.3d7dacp-3), V4 (0x1.629e5p+0) }, + .P_10 = { V4 (-0x1.a31268p+3), V4 (0x1.ac9048p+4), V4 (-0x1.293ff6p+3) }, + .Q_10 = { V4 (-0x1.8265eep+3), V4 (0x1.ef5eaep+4), V4 (-0x1.12665p+4) }, + .logf_tbl = V_LOGF_CONSTANTS +}; + +static inline float32x4_t +special (float32x4_t x, const struct data *d) +{ + /* Note erfinvf(inf) should return NaN, and erfinvf(1) should return Inf. + By using log here, instead of log1p, we return finite values for both + these inputs, and values outside [-1, 1]. This is non-compliant, but is an + acceptable optimisation at Ofast. To get correct behaviour for all finite + values use the log1pf_inline helper on -abs(x) - note that erfinvf(inf) + will still be finite. */ + float32x4_t t = vdivq_f32 ( + v_f32 (1), vsqrtq_f32 (vnegq_f32 (v_logf_inline ( + vsubq_f32 (v_f32 (1), vabsq_f32 (x)), &d->logf_tbl)))); + float32x4_t ts = vbslq_f32 (v_u32 (0x7fffffff), t, x); + float32x4_t q = vfmaq_f32 (d->Q_50[0], vaddq_f32 (t, d->Q_50[1]), t); + return vdivq_f32 (v_horner_5_f32 (t, d->P_50), vmulq_f32 (ts, q)); +} + +static inline float32x4_t +notails (float32x4_t x, const struct data *d) +{ + /* Shortcut when no input is in a tail region - no need to gather shift or + coefficients. */ + float32x4_t t = vfmaq_f32 (v_f32 (-0.5625), x, x); + float32x4_t q = vaddq_f32 (t, d->Q_10[2]); + q = vfmaq_f32 (d->Q_10[1], t, q); + q = vfmaq_f32 (d->Q_10[0], t, q); + + return vdivq_f32 (vmulq_f32 (x, v_horner_2_f32 (t, d->P_10)), q); +} + +static inline float32x4_t +lookup (float32x4_t tbl, uint8x16_t idx) +{ + return vreinterpretq_f32_u8 (vqtbl1q_u8 (vreinterpretq_u8_f32 (tbl), idx)); +} + +/* Vector implementation of Blair et al's rational approximation to inverse + error function in single-precision. Worst-case error is 4.98 ULP, in the + tail region: + _ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0 + want 0x1.b4793ap+0 . */ +float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* Calculate inverse error using algorithm described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error + function", Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7. + + Algorithm has 3 intervals: + - 'Normal' region [-0.75, 0.75] + - Tail region [0.75, 0.9375] U [-0.9375, -0.75] + - Extreme tail [-1, -0.9375] U [0.9375, 1] + Normal and tail are both rational approximation of similar order on + shifted input - these are typically performed in parallel using gather + loads to obtain correct coefficients depending on interval. 
*/ + uint32x4_t is_tail = vcageq_f32 (x, v_f32 (0.75)); + uint32x4_t extreme_tail = vcageq_f32 (x, v_f32 (0.9375)); + + if (unlikely (!v_any_u32 (is_tail))) + /* Shortcut for if all lanes are in [-0.75, 0.75] - can avoid having to + gather coefficients. If input is uniform in [-1, 1] then likelihood of + this is 0.75^4 ~= 0.31. */ + return notails (x, d); + + /* Select requisite shift depending on interval: polynomial is evaluated on + x * x - shift. + Normal shift = 0.5625 + Tail shift = 0.87890625. */ + float32x4_t t + = vfmaq_f32 (vbslq_f32 (is_tail, d->tailshift, v_f32 (-0.5625)), x, x); + + /* Calculate indexes for tbl: tbl is byte-wise, so: + [0, 1, 2, 3, 4, 5, 6, ....] copies the vector + Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores + two pairs of coeffs, so we need two idx vectors - one for each pair. */ + uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4)); + uint8x16_t idx_lo = vaddq_u8 (d->idxlo, off); + uint8x16_t idx_hi = vaddq_u8 (d->idxhi, off); + + /* Load the tables. */ + float32x4_t p_lo = d->Plo; + float32x4_t pq = d->PQ; + float32x4_t qhi = d->Qhi; + + /* Do the lookup (and calculate p3 by masking non-tail lanes). */ + float32x4_t p3 = vreinterpretq_f32_u32 ( + vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3))); + float32x4_t p0 = lookup (p_lo, idx_lo), p1 = lookup (p_lo, idx_hi), + p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi), + q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi); + + float32x4_t p = vfmaq_f32 (p2, p3, t); + p = vfmaq_f32 (p1, p, t); + p = vfmaq_f32 (p0, p, t); + p = vmulq_f32 (x, p); + + float32x4_t q = vfmaq_f32 (q1, vaddq_f32 (q2, t), t); + q = vfmaq_f32 (q0, q, t); + + if (unlikely (v_any_u32 (extreme_tail))) + /* At least one lane is in the extreme tail - if input is uniform in + [-1, 1] the likelihood of this is ~0.23. */ + return vbslq_f32 (extreme_tail, special (x, d), vdivq_f32 (p, q)); + + return vdivq_f32 (p, q); +} + +PL_SIG (V, F, 1, erfinv, -0.99, 0.99) +PL_TEST_ULP (V_NAME_F1 (erfinv), 4.49) +/* Test with control lane in each interval. */ +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.5) +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.8) +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.95) diff --git a/contrib/arm-optimized-routines/pl/math/v_exp10_2u.c b/contrib/arm-optimized-routines/pl/math/v_exp10_2u.c new file mode 100644 index 000000000000..29072a60fb3a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp10_2u.c @@ -0,0 +1,144 @@ +/* + * Double-precision vector 10^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */ +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 163840.0 /* 1280.0 * N. */ + +const static struct data +{ + float64x2_t poly[4]; + float64x2_t log10_2, log2_10_hi, log2_10_lo, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t special_bound, scale_thresh; +#endif +} data = { + /* Coefficients generated using Remez algorithm. + rel error: 0x1.5ddf8f28p-54 + abs error: 0x1.5ed266c8p-54 in [ -log10(2)/256, log10(2)/256 ] + maxerr: 1.14432 +0.5 ulp. 
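+     The reduction below computes n = round (x * N/log10(2)) and
+     r = x - n * log10(2)/N, so that exp10(x) = 2^(n/N) * exp10(r) with
+     |r| <= log10(2)/(2N); these coefficients reconstruct exp10(r) - 1.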
*/ + .poly = { V2 (0x1.26bb1bbb5524p1), V2 (0x1.53524c73cecdap1), + V2 (0x1.047060efb781cp1), V2 (0x1.2bd76040f0d16p0) }, + .log10_2 = V2 (0x1.a934f0979a371p8), /* N/log2(10). */ + .log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log2(10)/N. */ + .log2_10_lo = V2 (-0x1.9dc1da994fd21p-66), + .shift = V2 (0x1.8p+52), +#if !WANT_SIMD_EXCEPT + .scale_thresh = V2 (ScaleBound), + .special_bound = V2 (SpecialBound), +#endif +}; + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask v_u64 (N - 1) + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ +# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */ +# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */ + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. */ + return v_call_f64 (exp10, x, y, cmp); +} + +#else + +# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n, + const struct data *d) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +/* Fast vector implementation of exp10. + Maximum measured error is 1.64 ulp. + _ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5 + want 0x1.f8dab6d7fed0ap+5. */ +float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t cmp; +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + float64x2_t xm = x; + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres); + if (unlikely (v_any_u64 (cmp))) + x = vbslq_f64 (cmp, v_f64 (1), x); +#else + cmp = vcageq_f64 (x, d->special_bound); +#endif + + /* n = round(x/(log10(2)/N)). */ + float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n*log10(2)/N. */ + float64x2_t r = x; + r = vfmsq_f64 (r, d->log2_10_hi, n); + r = vfmsq_f64 (r, d->log2_10_lo, n); + + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + uint64x2_t i = vandq_u64 (u, IndexMask); + + /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t p = vfmaq_f64 (d->poly[0], r, d->poly[1]); + float64x2_t y = vfmaq_f64 (d->poly[2], r, d->poly[3]); + p = vfmaq_f64 (p, y, r2); + y = vmulq_f64 (r, p); + + /* s = 2^(n/N). 
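+     The table supplies the fractional part 2^(i/N) for the low bits i of n,
+     and adding e folds the integer part of n/N straight into the exponent
+     field, so s = asdouble (__v_exp_data[i] + e).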
*/ + u = v_lookup_u64 (__v_exp_data, i); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f64 (s, y, s), cmp); +#else + return special_case (s, y, n, d); +#endif + + return vfmaq_f64 (s, y, s); +} + +PL_SIG (S, D, 1, exp10, -9.9, 9.9) +PL_SIG (V, D, 1, exp10, -9.9, 9.9) +PL_TEST_ULP (V_NAME_D1 (exp10), 1.15) +PL_TEST_EXPECT_FENV (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_exp10f_2u4.c b/contrib/arm-optimized-routines/pl/math/v_exp10f_2u4.c new file mode 100644 index 000000000000..0e91becfa612 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp10f_2u4.c @@ -0,0 +1,138 @@ +/* + * Single-precision vector 10^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_advsimd_f32.h" + +#define ScaleBound 192.0f + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t log10_2_and_inv, shift; + +#if !WANT_SIMD_EXCEPT + float32x4_t scale_thresh; +#endif +} data = { + /* Coefficients generated using Remez algorithm with minimisation of relative + error. + rel error: 0x1.89dafa3p-24 + abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] + maxerr: 1.85943 +0.5 ulp. */ + .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f), + V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) }, + .shift = V4 (0x1.8p23f), + + /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */ + .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 }, +#if !WANT_SIMD_EXCEPT + .scale_thresh = V4 (ScaleBound) +#endif +}; + +#define ExponentBias v_u32 (0x3f800000) + +#if WANT_SIMD_EXCEPT + +# define SpecialBound 38.0f /* rint(log10(2^127)). */ +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42180000) /* asuint (SpecialBound). */ +# define Thres v_u32 (0x22180000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (exp10f, x, y, cmp); +} + +#else + +# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. 
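+     Computing scale + poly * scale with a single fma incurs only one
+     rounding, which matters once the result goes subnormal.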
*/ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +/* Fast vector implementation of single-precision exp10. + Algorithm is accurate to 2.36 ULP. + _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11 + want 0x1.7e79cp+11. */ +float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); +#if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t cmp = vcgeq_u32 ( + vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_zerofy_f32 (x, cmp); +#endif + + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ + float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0); + float32x4_t n = vsubq_f32 (z, d->shift); + float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1); + r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + +#if !WANT_SIMD_EXCEPT + uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound)); +#endif + + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t poly + = vfmaq_f32 (vmulq_f32 (r, d->poly[0]), + v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} + +PL_SIG (S, F, 1, exp10, -9.9, 9.9) +PL_SIG (V, F, 1, exp10, -9.9, 9.9) +PL_TEST_ULP (V_NAME_F1 (exp10), 1.86) +PL_TEST_EXPECT_FENV (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_exp2_2u.c b/contrib/arm-optimized-routines/pl/math/v_exp2_2u.c new file mode 100644 index 000000000000..de59779689f5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp2_2u.c @@ -0,0 +1,128 @@ +/* + * Double-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) +#define BigBound 1022.0 +#define UOFlowBound 1280.0 + +static const struct data +{ + float64x2_t poly[4]; + float64x2_t shift, scale_big_bound, scale_uoflow_bound; +} data = { + /* Coefficients are computed using Remez algorithm with + minimisation of the absolute error. 
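+     They approximate (2^r - 1)/r on the reduced interval; note that poly[0]
+     is ln(2) to within the fit error.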
*/ + .poly = { V2 (0x1.62e42fefa3686p-1), V2 (0x1.ebfbdff82c241p-3), + V2 (0x1.c6b09b16de99ap-5), V2 (0x1.3b2abf5571ad8p-7) }, + .shift = V2 (0x1.8p52 / N), + .scale_big_bound = V2 (BigBound), + .scale_uoflow_bound = V2 (UOFlowBound), +}; + +static inline uint64x2_t +lookup_sbits (uint64x2_t i) +{ + return (uint64x2_t){ __v_exp_data[i[0] & IndexMask], + __v_exp_data[i[1] & IndexMask] }; +} + +#if WANT_SIMD_EXCEPT + +# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */ +# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */ + +/* Call scalar exp2 as a fallback. */ +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special) +{ + return v_call_f64 (exp2, x, y, is_special); +} + +#else + +# define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +# define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +# define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n, + const struct data *d) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset)); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b)); + uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +/* Fast vector implementation of exp2. + Maximum measured error is 1.65 ulp. + _ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 + want 0x1.f8db0d4df721dp-1. */ +VPCS_ATTR +float64x2_t V_NAME_D1 (exp2) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t cmp; +#if WANT_SIMD_EXCEPT + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres)); + /* Mask special lanes and retain a copy of x for passing to special-case + handler. */ + float64x2_t xc = x; + x = v_zerofy_f64 (x, cmp); +#else + cmp = vcagtq_f64 (x, d->scale_big_bound); +#endif + + /* n = round(x/N). */ + float64x2_t z = vaddq_f64 (d->shift, x); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n/N. */ + float64x2_t r = vsubq_f64 (x, n); + + /* s = 2^(n/N). */ + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + u = lookup_sbits (u); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + /* y ~ exp2(r) - 1. 
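+     Evaluated as r * poly(r), with the cubic in pairwise (Estrin-style) form
+     over r and r^2 to shorten the fma dependency chain.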
*/ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly); + y = vmulq_f64 (r, y); + + if (unlikely (v_any_u64 (cmp))) +#if !WANT_SIMD_EXCEPT + return special_case (s, y, n, d); +#else + return special_case (xc, vfmaq_f64 (s, s, y), cmp); +#endif + return vfmaq_f64 (s, s, y); +} + +PL_SIG (V, D, 1, exp2, -9.9, 9.9) +PL_TEST_ULP (V_NAME_D1 (exp2), 1.15) +PL_TEST_EXPECT_FENV (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_data.c b/contrib/arm-optimized-routines/pl/math/v_exp_data.c new file mode 100644 index 000000000000..fd01cf27606f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp_data.c @@ -0,0 +1,55 @@ +/* + * Scale values for vector exp and exp2 + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* 2^(j/N), j=0..N, N=2^7=128. Copied from math/v_exp_data.c. */ +const uint64_t __v_exp_data[] = { + 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, + 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, + 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, + 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, + 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, + 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, + 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, + 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, + 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, + 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, + 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, + 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, + 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, + 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, + 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, + 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, + 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, + 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, + 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, + 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, + 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, + 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, + 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, + 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, + 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, + 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, + 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, + 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, + 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, + 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, + 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, + 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, + 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, + 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, + 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, + 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, + 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 
0x3fef472d4a07897c, + 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, + 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, + 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, + 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, + 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, + 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail.c b/contrib/arm-optimized-routines/pl/math/v_exp_tail.c deleted file mode 100644 index fd38aa8ae6ea..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_exp_tail.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Double-precision vector e^(x+tail) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "math_config.h" -#if V_SUPPORTED -#include "v_exp_tail.h" - -#define C1 v_f64 (C1_scal) -#define C2 v_f64 (C2_scal) -#define C3 v_f64 (C3_scal) -#define InvLn2 v_f64 (InvLn2_scal) -#define Ln2hi v_f64 (Ln2hi_scal) -#define Ln2lo v_f64 (Ln2lo_scal) - -#define IndexMask v_u64 (IndexMask_scal) -#define Shift v_f64 (Shift_scal) -#define Thres v_f64 (Thres_scal) - -VPCS_ATTR -static v_f64_t -specialcase (v_f64_t s, v_f64_t y, v_f64_t n) -{ - v_f64_t absn = v_abs_f64 (n); - - /* 2^(n/N) may overflow, break it up into s1*s2. */ - v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); - v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); - v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); - v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); - v_f64_t r1 = s1 * s1; - v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; - return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); -} - -VPCS_ATTR -v_f64_t V_NAME (exp_tail) (v_f64_t x, v_f64_t xtail) -{ - v_f64_t n, r, s, y, z; - v_u64_t cmp, u, e, i; - - cmp = v_cond_u64 (v_abs_f64 (x) > Thres); - - /* n = round(x/(ln2/N)). */ - z = v_fma_f64 (x, InvLn2, Shift); - u = v_as_u64_f64 (z); - n = z - Shift; - - /* r = x - n*ln2/N. */ - r = x; - r = v_fma_f64 (-Ln2hi, n, r); - r = v_fma_f64 (-Ln2lo, n, r); - - e = u << (52 - V_EXP_TAIL_TABLE_BITS); - i = u & IndexMask; - - /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - y = v_fma_f64 (C3, r, C2); - y = v_fma_f64 (y, r, C1); - y = v_fma_f64 (y, r, v_f64 (1.0)); - y = v_fma_f64 (y, r, xtail); - - /* s = 2^(n/N). */ - u = v_lookup_u64 (Tab, i); - s = v_as_f64_u64 (u + e); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (s, y, n); - return v_fma_f64 (y, s, s); -} -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c b/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c index 675eb769bf07..989dd41d949a 100644 --- a/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c +++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c @@ -1,5 +1,5 @@ /* - * Lookup table for double-precision e^(x+tail) vector function. + * Lookup table for double-precision e^x vector function. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception @@ -7,91 +7,92 @@ #include "math_config.h" -/* 2^(j/N), j=0..N (where N = 256). 
*/ -const uint64_t __v_exp_tail_data[] - = {0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, - 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, - 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, - 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, - 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, - 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, - 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, - 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, - 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, - 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, - 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, - 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, - 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, - 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, - 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, - 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, - 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, - 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, - 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, - 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, - 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, - 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, - 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, - 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, - 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, - 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, - 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, - 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, - 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, - 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, - 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, - 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, - 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, - 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, - 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, - 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, - 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, - 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, - 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, - 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, - 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, - 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, - 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, - 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, - 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, - 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, - 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, - 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, - 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, - 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, - 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, - 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, - 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, - 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, - 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, - 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, - 0x3feeb737b0cdc5e5, 
0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, - 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, - 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, - 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, - 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, - 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, - 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, - 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, - 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, - 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, - 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, - 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, - 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, - 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, - 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, - 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, - 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, - 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, - 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, - 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, - 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, - 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, - 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, - 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, - 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, - 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, - 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, - 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, - 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, - 0x3feff9d96b2a23d9}; +/* 2^(j/N), j=0..N, N=2^8=256. Copied from math/v_exp_data.c. 
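+   Each entry is the bit pattern of 2^(j/N) rounded to double;
+   v_exp_tail_inline.h adds the integer exponent bits directly on top.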
*/ +const uint64_t __v_exp_tail_data[] = { + 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 
0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9, +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail_inline.h b/contrib/arm-optimized-routines/pl/math/v_exp_tail_inline.h new file mode 100644 index 000000000000..76ecc6b0a33a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail_inline.h @@ -0,0 +1,102 @@ +/* + * Double-precision vector e^(x+tail) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#ifndef PL_MATH_V_EXP_TAIL_INLINE_H +#define PL_MATH_V_EXP_TAIL_INLINE_H + +#include "v_math.h" +#include "poly_advsimd_f64.h" + +#ifndef WANT_V_EXP_TAIL_SPECIALCASE +#error \ + "Cannot use v_exp_tail_inline.h without specifying whether you need the special case computation." +#endif + +#define N (1 << V_EXP_TAIL_TABLE_BITS) + +static const struct data +{ + float64x2_t poly[4]; +#if WANT_V_EXP_TAIL_SPECIALCASE + float64x2_t big_bound, huge_bound; +#endif + float64x2_t shift, invln2, ln2_hi, ln2_lo; +} data = { +#if WANT_V_EXP_TAIL_SPECIALCASE + .big_bound = V2 (704.0), + .huge_bound = V2 (1280.0 * N), +#endif + .shift = V2 (0x1.8p52), + .invln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */ + .ln2_hi = V2 (0x1.62e42fefa39efp-9), /* ln2/N. */ + .ln2_lo = V2 (0x1.abc9e3b39803f3p-64), + .poly = { V2 (1.0), V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), + V2 (0x1.5555576a59599p-5) }, +}; + +static inline uint64x2_t +lookup_sbits (uint64x2_t i) +{ + return (uint64x2_t){__v_exp_tail_data[i[0]], __v_exp_tail_data[i[1]]}; +} + +#if WANT_V_EXP_TAIL_SPECIALCASE +#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ +/* The following 2 bias when combined form the exponent bias: + SpecialBias1 - SpecialBias2 = asuint64(1.0). 
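+   (0x7000000000000000 - 0x3010000000000000 == 0x3ff0000000000000, so
+   s1 * s2 == s whenever the offset b is zero.)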
*/ +#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ +static float64x2_t VPCS_ATTR +v_exp_tail_special_case (float64x2_t s, float64x2_t y, float64x2_t n, + const struct data *d) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vclezq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t oflow = vcagtq_f64 (n, d->huge_bound); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + float64x2_t r1 = vmulq_f64 (s1, s1); + return vbslq_f64 (oflow, r1, r0); +} +#endif + +static inline float64x2_t VPCS_ATTR +v_exp_tail_inline (float64x2_t x, float64x2_t xtail) +{ + const struct data *d = ptr_barrier (&data); +#if WANT_V_EXP_TAIL_SPECIALCASE + uint64x2_t special = vcgtq_f64 (vabsq_f64 (x), d->big_bound); +#endif + /* n = round(x/(ln2/N)). */ + float64x2_t z = vfmaq_f64 (d->shift, x, d->invln2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n*ln2/N. */ + float64x2_t r = x; + r = vfmsq_f64 (r, d->ln2_hi, n); + r = vfmsq_f64 (r, d->ln2_lo, n); + + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); + uint64x2_t i = vandq_u64 (u, v_u64 (N - 1)); + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4, using Horner. */ + float64x2_t y = v_horner_3_f64 (r, d->poly); + y = vfmaq_f64 (xtail, y, r); + + /* s = 2^(n/N). */ + u = lookup_sbits (i); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + +#if WANT_V_EXP_TAIL_SPECIALCASE + if (unlikely (v_any_u64 (special))) + return v_exp_tail_special_case (s, y, n, d); +#endif + return vfmaq_f64 (s, y, s); +} +#endif // PL_MATH_V_EXP_TAIL_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_expf.c b/contrib/arm-optimized-routines/pl/math/v_expf.c deleted file mode 100644 index a422e69feb62..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_expf.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "mathlib.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.45358 +0.5 ulp. */ - 0x1.0e4020p-7f, - 0x1.573e2ep-5f, - 0x1.555e66p-3f, - 0x1.fffdb6p-2f, - 0x1.ffffecp-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. 
*/ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -VPCS_ATTR -v_f32_t -V_NAME(expf) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, absn, z; - v_u32_t cmp, e; - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn, cmp, scale); - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_expf_inline.h b/contrib/arm-optimized-routines/pl/math/v_expf_inline.h new file mode 100644 index 000000000000..166683726b4d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_expf_inline.h @@ -0,0 +1,60 @@ +/* + * Helper for single-precision routines which calculate exp(x) and do not + * need special-case handling + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_V_EXPF_INLINE_H +#define PL_MATH_V_EXPF_INLINE_H + +#include "v_math.h" + +struct v_expf_data +{ + float32x4_t poly[5]; + float32x4_t shift, invln2_and_ln2; +}; + +/* maxerr: 1.45358 +0.5 ulp. */ +#define V_EXPF_DATA \ + { \ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \ + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \ + .shift = V4 (0x1.8p23f), \ + .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ + } + +#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */ +#define C(i) d->poly[i] + +static inline float32x4_t +v_expf_inline (float32x4_t x, const struct v_expf_data *d) +{ + /* Helper routine for calculating exp(x). + Copied from v_expf.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + float32x4_t n, r, z; + z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0); + n = vsubq_f32 (z, d->shift); + r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1); + r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + + /* Custom order-4 Estrin avoids building high order monomial. 
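
Written out in scalar form, the scheme above computes poly(r) = C4*r + C3*r^2 + C2*r^3 + C1*r^4 + C0*r^5 with two short FMA chains, and the highest power it ever materialises is r^2 (an editorial sketch; the helper name is ours):

#include <math.h>

static float
expf_poly (float r, const float c[5])
{
  float r2 = r * r;
  float p01 = fmaf (c[0], r, c[1]); /* C1 + C0*r. */
  float p23 = fmaf (c[2], r, c[3]); /* C3 + C2*r. */
  float q = fmaf (p01, r2, p23);    /* C3 + C2*r + C1*r^2 + C0*r^3. */
  return fmaf (q, r2, c[4] * r);    /* poly(r); no r^5 is ever formed. */
}
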
*/ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p, q, poly; + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + return vfmaq_f32 (scale, poly, scale); +} + +#endif // PL_MATH_V_EXPF_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c index 4b491d17feef..dd255472cec0 100644 --- a/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c @@ -6,65 +6,73 @@ */ #include "v_math.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - -#define InvLn2 v_f64 (0x1.71547652b82fep0) -#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) -#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) -#define Shift v_f64 (0x1.8p52) -#define TinyBound \ - 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ -#define SpecialBound \ - 0x40862b7d369a5aa9 /* 0x1.62b7d369a5aa9p+9. For |x| > SpecialBound, the \ - final stage of the algorithm overflows so fall back to \ - scalar. */ -#define AbsMask 0x7fffffffffffffff -#define One 0x3ff0000000000000 - -#define C(i) v_f64 (__expm1_poly[i]) - -static inline v_f64_t -eval_poly (v_f64_t f, v_f64_t f2) +static const struct data { - /* Evaluate custom polynomial using Estrin scheme. */ - v_f64_t p_01 = v_fma_f64 (f, C (1), C (0)); - v_f64_t p_23 = v_fma_f64 (f, C (3), C (2)); - v_f64_t p_45 = v_fma_f64 (f, C (5), C (4)); - v_f64_t p_67 = v_fma_f64 (f, C (7), C (6)); - v_f64_t p_89 = v_fma_f64 (f, C (9), C (8)); + float64x2_t poly[11]; + float64x2_t invln2, ln2, shift; + int64x2_t exponent_bias; +#if WANT_SIMD_EXCEPT + uint64x2_t thresh, tiny_bound; +#else + float64x2_t oflow_bound; +#endif +} data = { + /* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */ + .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), + V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), + V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), + V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), + V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) }, + .invln2 = V2 (0x1.71547652b82fep0), + .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, + .shift = V2 (0x1.8p52), + .exponent_bias = V2 (0x3ff0000000000000), +#if WANT_SIMD_EXCEPT + /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs + compare. */ + .thresh = V2 (0x78c56fa6d34b552), + /* asuint64(0x1p-51) << 1. */ + .tiny_bound = V2 (0x3cc0000000000000 << 1), +#else + /* Value above which expm1(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is + a small window where fallbacks are triggered unnecessarily. */ + .oflow_bound = V2 (0x1.62b7d369a5aa9p+9), +#endif +}; - v_f64_t p_03 = v_fma_f64 (f2, p_23, p_01); - v_f64_t p_47 = v_fma_f64 (f2, p_67, p_45); - v_f64_t p_8a = v_fma_f64 (f2, C (10), p_89); - - v_f64_t f4 = f2 * f2; - v_f64_t p_07 = v_fma_f64 (f4, p_47, p_03); - return v_fma_f64 (f4 * f4, p_8a, p_07); +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (expm1, x, y, special); } /* Double-precision vector exp(x) - 1 function. The maximum error observed error is 2.18 ULP: - __v_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 - want 0x1.a8b9ea8d66e2p-2. 
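
The tiny_bound/thresh encoding above turns the three-way special-case test into one unsigned compare: doubling the bits of x discards the sign, and the constants were pre-shifted by one offline to line up with it. A scalar model (editorial sketch; the helper name is ours):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* True for |x| < 0x1p-51, |x| > the overflow bound, and nan/inf -
   exactly the lanes the vector code sends to the scalar fallback. */
static bool
expm1_needs_fallback (double x)
{
  const uint64_t tiny2 = 0x3cc0000000000000ULL << 1;
  const uint64_t thresh = 0x78c56fa6d34b552ULL;
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  return (ix + ix) - tiny2 >= thresh; /* wraps below tiny2, hence >=. */
}
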
*/ -VPCS_ATTR -v_f64_t V_NAME (expm1) (v_f64_t x) + _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 + want 0x1.a8b9ea8d66e2p-2. */ +float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t ax = ix & AbsMask; + const struct data *d = ptr_barrier (&data); + + uint64x2_t ix = vreinterpretq_u64_f64 (x); #if WANT_SIMD_EXCEPT - /* If fp exceptions are to be triggered correctly, fall back to the scalar - variant for all lanes if any of them should trigger an exception. */ - v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound)); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint64x2_t special + = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh); if (unlikely (v_any_u64 (special))) - return v_call_f64 (expm1, x, x, v_u64 (-1)); + x = v_zerofy_f64 (x, special); #else /* Large input, NaNs and Infs. */ - v_u64_t special - = v_cond_u64 ((ax >= SpecialBound) | (ix == 0x8000000000000000)); + uint64x2_t special = vcageq_f64 (x, d->oflow_bound); #endif /* Reduce argument to smaller range: @@ -72,42 +80,39 @@ v_f64_t V_NAME (expm1) (v_f64_t x) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. */ - v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; - v_s64_t i = v_to_s64_f64 (j); - v_f64_t f = v_fma_f64 (j, MLn2hi, x); - f = v_fma_f64 (j, MLn2lo, f); + float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift); + int64x2_t i = vcvtq_s64_f64 (n); + float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0); + f = vfmsq_laneq_f64 (f, n, d->ln2, 1); /* Approximate expm1(f) using polynomial. Taylor expansion for expm1(x) has the form: x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - v_f64_t f2 = f * f; - v_f64_t p = v_fma_f64 (f2, eval_poly (f, f2), f); + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t f4 = vmulq_f64 (f2, f2); + float64x2_t f8 = vmulq_f64 (f4, f4); + float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly)); /* Assemble the result. expm1(x) ~= 2^i * (p + 1) - 1 Let t = 2^i. */ - v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); - /* expm1(x) ~= p * t + (t - 1). */ - v_f64_t y = v_fma_f64 (p, t, t - 1); + int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias); + float64x2_t t = vreinterpretq_f64_s64 (u); -#if !WANT_SIMD_EXCEPT if (unlikely (v_any_u64 (special))) - return v_call_f64 (expm1, x, y, special); -#endif + return special_case (vreinterpretq_f64_u64 (ix), + vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t), + special); - return y; + /* expm1(x) ~= p * t + (t - 1). 
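
The reconstruction named in this comment builds t = 2^i by adding i directly to the exponent field of 1.0. A scalar model (editorial sketch; two's-complement wraparound makes the same expression work for negative i, e.g. i = -1 yields asuint64 (0.5)):

#include <math.h>
#include <stdint.h>
#include <string.h>

static double
expm1_reconstruct (double p, int64_t i)
{
  uint64_t u = ((uint64_t) i << 52) + 0x3ff0000000000000ULL;
  double t; /* t = 2^i, for i in the normal-exponent range. */
  memcpy (&t, &u, sizeof t);
  return fma (p, t, t - 1.0); /* expm1(x) ~= p * t + (t - 1). */
}
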
*/ + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); } -VPCS_ALIAS PL_SIG (V, D, 1, expm1, -9.9, 9.9) -PL_TEST_ULP (V_NAME (expm1), 1.68) -PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (expm1), 0, 0x1p-51, 1000) -PL_TEST_INTERVAL (V_NAME (expm1), -0, -0x1p-51, 1000) -PL_TEST_INTERVAL (V_NAME (expm1), 0x1p-51, 0x1.63108c75a1937p+9, 100000) -PL_TEST_INTERVAL (V_NAME (expm1), -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) -PL_TEST_INTERVAL (V_NAME (expm1), 0x1.63108c75a1937p+9, inf, 100) -PL_TEST_INTERVAL (V_NAME (expm1), -0x1.740bf7c0d927dp+9, -inf, 100) -#endif +PL_TEST_ULP (V_NAME_D1 (expm1), 1.68) +PL_TEST_EXPECT_FENV (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c index ab132427e58d..6b282d0cc00f 100644 --- a/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c +++ b/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c @@ -6,44 +6,71 @@ */ #include "v_math.h" +#include "poly_advsimd_f32.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +static const struct data +{ + float32x4_t poly[5]; + float32x4_t invln2_and_ln2; + float32x4_t shift; + int32x4_t exponent_bias; +#if WANT_SIMD_EXCEPT + uint32x4_t thresh; +#else + float32x4_t oflow_bound; +#endif +} data = { + /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */ + .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), + V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, + /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */ + .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, + .shift = V4 (0x1.8p23f), + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + /* Value above which expm1f(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is + a small window where fallbacks are triggered unnecessarily. */ + .oflow_bound = V4 (0x1.5ebc4p+6), +#else + /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute + compare. */ + .thresh = V4 (0x1d5ebc40), +#endif +}; -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define MLn2hi v_f32 (-0x1.62e4p-1f) -#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) -#define AbsMask (0x7fffffff) -#define One (0x3f800000) -#define SpecialBound \ - (0x42af5e20) /* asuint(0x1.5ebc4p+6). Largest value of x for which expm1(x) \ - should round to -1. */ -#define TinyBound (0x34000000) /* asuint(0x1p-23). */ +/* asuint(0x1p-23), shifted by 1 for abs compare. */ +#define TinyBound v_u32 (0x34000000 << 1) -#define C(i) v_f32 (__expm1f_poly[i]) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (expm1f, x, y, special); +} /* Single-precision vector exp(x) - 1 function. The maximum error is 1.51 ULP: - expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 - want 0x1.e2fb94p-2. */ -VPCS_ATTR -v_f32_t V_NAME (expm1f) (v_f32_t x) + _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2 + want 0x1.e2fb94p-2. 
*/ +float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t ax = ix & AbsMask; + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); #if WANT_SIMD_EXCEPT - /* If fp exceptions are to be triggered correctly, fall back to the scalar - variant for all lanes if any of them should trigger an exception. */ - v_u32_t special - = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000) | (ax < TinyBound)); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint32x4_t special + = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); if (unlikely (v_any_u32 (special))) - return v_call_f32 (expm1f, x, x, v_u32 (0xffffffff)); + x = v_zerofy_f32 (x, special); #else - /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf and -0. */ - v_u32_t special = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000)); + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */ + uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); #endif /* Reduce argument to smaller range: @@ -51,44 +78,40 @@ v_f32_t V_NAME (expm1f) (v_f32_t x) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. */ - v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; - v_s32_t i = v_to_s32_f32 (j); - v_f32_t f = v_fma_f32 (j, MLn2hi, x); - f = v_fma_f32 (j, MLn2lo, f); + float32x4_t j = vsubq_f32 ( + vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift); + int32x4_t i = vcvtq_s32_f32 (j); + float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1); + f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2); /* Approximate expm1(f) using polynomial. Taylor expansion for expm1(x) has the form: x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - - v_f32_t p = v_fma_f32 (C (4), f, C (3)); - p = v_fma_f32 (p, f, C (2)); - p = v_fma_f32 (p, f, C (1)); - p = v_fma_f32 (p, f, C (0)); - p = v_fma_f32 (f * f, p, f); + float32x4_t p = v_horner_4_f32 (f, d->poly); + p = vfmaq_f32 (f, vmulq_f32 (f, f), p); /* Assemble the result. expm1(x) ~= 2^i * (p + 1) - 1 Let t = 2^i. */ - v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); - /* expm1(x) ~= p * t + (t - 1). */ - v_f32_t y = v_fma_f32 (p, t, t - 1); + int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); + float32x4_t t = vreinterpretq_f32_s32 (u); -#if !WANT_SIMD_EXCEPT if (unlikely (v_any_u32 (special))) - return v_call_f32 (expm1f, x, y, special); -#endif + return special_case (vreinterpretq_f32_u32 (ix), + vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t), + special); - return y; + /* expm1(x) ~= p * t + (t - 1). 
*/ + return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); } -VPCS_ALIAS PL_SIG (V, F, 1, expm1, -9.9, 9.9) -PL_TEST_ULP (V_NAME (expm1f), 1.02) -PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (expm1f), 0, 0x1p-23, 1000) -PL_TEST_INTERVAL (V_NAME (expm1f), -0, -0x1p-23, 1000) -PL_TEST_INTERVAL (V_NAME (expm1f), 0x1p-23, 0x1.644716p6, 1000000) -PL_TEST_INTERVAL (V_NAME (expm1f), -0x1p-23, -0x1.9bbabcp+6, 1000000) -#endif +PL_TEST_ULP (V_NAME_F1 (expm1), 1.02) +PL_TEST_EXPECT_FENV (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000) +PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000) +PL_TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000) +PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h b/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h index c261941ebed6..6ae94c452de2 100644 --- a/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h +++ b/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h @@ -11,39 +11,53 @@ #include "v_math.h" #include "math_config.h" -#include "estrinf.h" +#include "poly_advsimd_f32.h" -#define One 0x3f800000 -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define MLn2hi v_f32 (-0x1.62e4p-1f) -#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) +struct v_expm1f_data +{ + float32x4_t poly[5]; + float32x4_t invln2_and_ln2, shift; + int32x4_t exponent_bias; +}; -#define C(i) v_f32 (__expm1f_poly[i]) +/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2, + log(2)/2]. Exponent bias is asuint(1.0f). + invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */ +#define V_EXPM1F_DATA \ + { \ + .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \ + V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \ + .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \ + .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ + } -static inline v_f32_t -expm1f_inline (v_f32_t x) +static inline float32x4_t +expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) { /* Helper routine for calculating exp(x) - 1. Copied from v_expm1f_1u6.c, with all special-case handling removed - the calling routine should handle special values if required. */ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; - v_s32_t i = v_to_s32_f32 (j); - v_f32_t f = v_fma_f32 (j, MLn2hi, x); - f = v_fma_f32 (j, MLn2lo, f); + float32x4_t j = vsubq_f32 ( + vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift); + int32x4_t i = vcvtq_s32_f32 (j); + float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1); + f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2); /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). - Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */ - v_f32_t f2 = f * f; - v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C); - p = v_fma_f32 (f2, p, f); + Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses + Horner. */ + float32x4_t f2 = vmulq_f32 (f, f); + float32x4_t f4 = vmulq_f32 (f2, f2); + float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly); + p = vfmaq_f32 (f, f2, p); /* t = 2^i. 
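
The single-precision version of the same trick adds i to the exponent field of 1.0f; for example i = 3 gives 0x3f800000 + (3 << 23) = 0x41000000 = 8.0f. A scalar model (editorial sketch):

#include <stdint.h>
#include <string.h>

static float
exp2i_f (int32_t i)
{
  uint32_t u = ((uint32_t) i << 23) + 0x3f800000u; /* asuint (1.0f). */
  float t;
  memcpy (&t, &u, sizeof t);
  return t; /* t = 2^i, for i in the normal-exponent range. */
}
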
*/ - v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); + float32x4_t t = vreinterpretq_f32_s32 (u); /* expm1(x) ~= p * t + (t - 1). */ - return v_fma_f32 (p, t, t - 1); + return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); } #endif // PL_MATH_V_EXPM1F_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_hypot_1u5.c b/contrib/arm-optimized-routines/pl/math/v_hypot_1u5.c new file mode 100644 index 000000000000..d4ff7be89a8f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_hypot_1u5.c @@ -0,0 +1,95 @@ +/* + * Double-precision vector hypot(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint64x2_t tiny_bound, thres; +} data = { + .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */ + .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */ +}; +#else +static const struct data +{ + uint64x2_t tiny_bound; + uint32x4_t thres; +} data = { + .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */ + .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */ +}; +#endif + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum, + uint32x2_t special) +{ + return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special)); +} + +/* Vector implementation of double-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222) + got 0x1.6a1b19400964ep-204 + want 0x1.6a1b19400964dp-204. */ +#if WANT_SIMD_EXCEPT + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t ix = vreinterpretq_u64_f64 (ax); + uint64x2_t iy = vreinterpretq_u64_f64 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. 
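
The filter described in the comment above costs one unsigned compare per operand: subtracting the tiny bound rotates [0x1p-511, 0x1p511) to the bottom of the unsigned range, so everything outside it - zero, subnormals, infinities, NaNs - compares >= thres. A scalar model (editorial sketch; the helper name is ours):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

static bool
hypot_operand_special (double x)
{
  const uint64_t tiny = 0x2000000000000000ULL;  /* asuint64 (0x1p-511). */
  const uint64_t thres = 0x3fe0000000000000ULL; /* asuint64 (0x1p511) - tiny. */
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  ix &= 0x7fffffffffffffffULL; /* |x|, as the vabsq above does. */
  return ix - tiny >= thres;
}
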
*/ + uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres); + uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres); + ax = v_zerofy_f64 (ax, specialx); + ay = v_zerofy_f64 (ay, specialy); + uint32x2_t special = vaddhn_u64 (specialx, specialy); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay); + + if (unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#else + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y); + + uint32x2_t special = vcge_u32 ( + vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound), + vget_low_u32 (d->thres)); + + if (unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#endif + +PL_SIG (V, D, 2, hypot, -10.0, 10.0) +PL_TEST_ULP (V_NAME_D2 (hypot), 1.21) +PL_TEST_EXPECT_FENV (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_hypotf_1u5.c b/contrib/arm-optimized-routines/pl/math/v_hypotf_1u5.c new file mode 100644 index 000000000000..3227b0a3fd8b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_hypotf_1u5.c @@ -0,0 +1,94 @@ +/* + * Single-precision vector hypot(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint32x4_t tiny_bound, thres; +} data = { + .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */ + .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */ +}; +#else +static const struct data +{ + uint32x4_t tiny_bound; + uint16x8_t thres; +} data = { + .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */ + .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */ +}; +#endif + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, + uint16x4_t special) +{ + return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special)); +} + +/* Vector implementation of single-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13 + want 0x1.6a41dp-13. */ +#if WANT_SIMD_EXCEPT + +float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t ix = vreinterpretq_u32_f32 (ax); + uint32x4_t iy = vreinterpretq_u32_f32 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. 
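
The default (non-WANT_SIMD_EXCEPT) double-precision path above compares only the top halves of the 64-bit lanes: because the low 32 bits of tiny_bound are zero, vsubhn_u64 yields exactly top32 (sqsum) - top32 (tiny_bound) with no borrow, halving the width of the compare. A scalar model (editorial sketch):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

static bool
hypot_sqsum_special (double sqsum)
{
  uint64_t i;
  memcpy (&i, &sqsum, sizeof i);
  uint32_t top = (uint32_t) (i >> 32);      /* what vsubhn narrows to. */
  return top - 0x03600000u >= 0x7c900000u;  /* top32 (tiny), thres. */
}
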
*/ + uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres); + uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres); + ax = v_zerofy_f32 (ax, specialx); + ay = v_zerofy_f32 (ay, specialy); + uint16x4_t special = vaddhn_u32 (specialx, specialy); + + float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f32 (sqsum); +} +#else + +float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y); + + uint16x4_t special = vcge_u16 ( + vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound), + vget_low_u16 (d->thres)); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f32 (sqsum); +} +#endif + +PL_SIG (V, F, 2, hypot, -10.0, 10.0) +PL_TEST_ULP (V_NAME_F2 (hypot), 1.21) +PL_TEST_EXPECT_FENV (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c index 86d398ca13a9..35dd62fe5e3e 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c @@ -6,105 +6,115 @@ */ #include "v_math.h" -#include "include/mathlib.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED - -#define A(i) v_f64 (__v_log10_data.poly[i]) -#define T(s, i) __v_log10_data.tab[i].s -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) #define N (1 << V_LOG10_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) + +static const struct data +{ + uint64x2_t min_norm; + uint32x4_t special_bound; + float64x2_t poly[5]; + float64x2_t invln10, log10_2, ln2; + uint64x2_t sign_exp_mask; +} data = { + /* Computed from log coefficients divided by log(10) then rounded to double + precision. */ + .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3), + V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4), + V2 (-0x1.287461742fee4p-4) }, + .ln2 = V2 (0x1.62e42fefa39efp-1), + .invln10 = V2 (0x1.bcb7b1526e50ep-2), + .log10_2 = V2 (0x1.34413509f79ffp-2), + .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. 
*/ + .sign_exp_mask = V2 (0xfff0000000000000), +}; + +#define Off v_u64 (0x3fe6900900000000) +#define IndexMask (N - 1) + +#define T(s, i) __v_log10_data.s[i] struct entry { - v_f64_t invc; - v_f64_t log10c; + float64x2_t invc; + float64x2_t log10c; }; static inline struct entry -lookup (v_u64_t i) +lookup (uint64x2_t i) { struct entry e; -#ifdef SCALAR - e.invc = T (invc, i); - e.log10c = T (log10c, i); -#else - e.invc[0] = T (invc, i[0]); - e.log10c[0] = T (log10c, i[0]); - e.invc[1] = T (invc, i[1]); - e.log10c[1] = T (log10c, i[1]); -#endif + uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log10c = vuzp2q_f64 (e0, e1); return e; } -VPCS_ATTR -inline static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, + uint32x2_t special) { - return v_call_f64 (log10, x, y, cmp); + return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special)); } -/* Our implementation of v_log10 is a slight modification of v_log (1.660ulps). +/* Fast implementation of double-precision vector log10 + is a slight modification of double-precision vector log. Max ULP error: < 2.5 ulp (nearest rounding.) Maximum measured at 2.46 ulp for x in [0.96, 0.97] - __v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 - want 0x1.fff6be3cae4b9p-6 - -0.459999 ulp err 1.96. */ -VPCS_ATTR -v_f64_t V_NAME (log10) (v_f64_t x) + _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 + want 0x1.fff6be3cae4b9p-6. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) { - v_f64_t z, r, r2, p, y, kd, hi; - v_u64_t ix, iz, tmp, top, i, cmp; - v_s64_t k; - struct entry e; - - ix = v_as_u64_f64 (x); - top = ix >> 48; - cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), + vget_low_u32 (d->special_bound)); /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - V_LOG10_TABLE_BITS)) % N; - k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ - iz = ix - (tmp & v_u64 (0xfffULL << 52)); - z = v_as_f64_u64 (iz); - e = lookup (i); + uint64x2_t tmp = vsubq_u64 (ix, Off); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (tmp); /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ - r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - kd = v_to_f64_s64 (k); + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); /* hi = r / log(10) + log10(c) + k*log10(2). - Constants in `v_log10_data.c` are computed (in extended precision) as + Constants in v_log10_data.c are computed (in extended precision) as e.log10c := e.logc * ivln10. */ - v_f64_t w = v_fma_f64 (r, v_f64 (__v_log10_data.invln10), e.log10c); + float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10); /* y = log10(1+r) + n * log10(2). 
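
The decomposition walked through above - x = 2^k * z with z in [OFF, 2*OFF) - is all integer arithmetic on the bit pattern. A scalar model (editorial sketch; the invc/log10c table lookup is omitted):

#include <stdint.h>
#include <string.h>

static double
log10_reduce (double x, int64_t *k)
{
  const uint64_t off = 0x3fe6900900000000ULL;
  uint64_t ix, tmp, iz;
  double z;
  memcpy (&ix, &x, sizeof ix);
  tmp = ix - off;
  *k = (int64_t) tmp >> 52; /* arithmetic shift extracts the exponent. */
  iz = ix - (tmp & 0xfff0000000000000ULL); /* squash x into [OFF, 2*OFF). */
  memcpy (&z, &iz, sizeof z);
  return z; /* log10(x) = k*log10(2) + log10(c) + log1p(z/c - 1)/ln(10). */
}
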
*/ - hi = v_fma_f64 (kd, v_f64 (__v_log10_data.log10_2), w); + float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ - r2 = r * r; - y = v_fma_f64 (A (3), r, A (2)); - p = v_fma_f64 (A (1), r, A (0)); - y = v_fma_f64 (A (4), r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; + if (unlikely (v_any_u32h (special))) + return special_case (x, y, hi, r2, special); + return vfmaq_f64 (hi, r2, y); } -VPCS_ALIAS PL_SIG (V, D, 1, log10, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log10), 1.97) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10)) -PL_TEST_INTERVAL (V_NAME (log10), 0, 0xffff000000000000, 10000) -PL_TEST_INTERVAL (V_NAME (log10), 0x1p-4, 0x1p4, 400000) -PL_TEST_INTERVAL (V_NAME (log10), 0, inf, 400000) -#endif +PL_TEST_ULP (V_NAME_D1 (log10), 1.97) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log10)) +PL_TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log10_data.c b/contrib/arm-optimized-routines/pl/math/v_log10_data.c index fda85c886963..d9a624dab9ce 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log10_data.c +++ b/contrib/arm-optimized-routines/pl/math/v_log10_data.c @@ -7,161 +7,157 @@ #include "math_config.h" -#define N (1 << V_LOG10_TABLE_BITS) - -/* Algorithm: +const struct v_log10_data __v_log10_data = { + /* Computed from log's coefficients div by log(10) then rounded to double + precision. */ + .poly = { -0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4, + 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4 }, + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2, + /* Algorithm: x = 2^k z log10(x) = k log10(2) + log10(c) + poly(z/c - 1) / log(10) -where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) -and log(c) and 1/c for the ith subinterval comes from a lookup table: + where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, + N=128) and log(c) and 1/c for the ith subinterval comes from lookup + tables: - tab[i].invc = 1/c - tab[i].log10c = (double)log10(c) - -where c is near the center of the subinterval and is chosen by trying several -floating point invc candidates around 1/center and selecting one for which -the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval -that contains 1 and the previous one got tweaked to avoid cancellation. -NB: invc should be optimized to minimize error in (double)log10(c) instead. 
*/ -const struct v_log10_data __v_log10_data - = {.tab = {{0x1.6a133d0dec120p+0, -0x1.345825f221684p-3}, - {0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3}, - {0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3}, - {0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3}, - {0x1.623f1d916f323p+0, -0x1.20e7081762193p-3}, - {0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3}, - {0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3}, - {0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3}, - {0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3}, - {0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3}, - {0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3}, - {0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4}, - {0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4}, - {0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4}, - {0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4}, - {0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4}, - {0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4}, - {0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4}, - {0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4}, - {0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4}, - {0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4}, - {0x1.446f12b278001p+0, -0x1.a56c091954f87p-4}, - {0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4}, - {0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4}, - {0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4}, - {0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4}, - {0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4}, - {0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4}, - {0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4}, - {0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4}, - {0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4}, - {0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4}, - {0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4}, - {0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4}, - {0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4}, - {0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4}, - {0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4}, - {0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4}, - {0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4}, - {0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4}, - {0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4}, - {0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5}, - {0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5}, - {0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5}, - {0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5}, - {0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5}, - {0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5}, - {0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5}, - {0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5}, - {0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5}, - {0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5}, - {0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5}, - {0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5}, - {0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5}, - {0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5}, - {0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5}, - {0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5}, - {0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5}, - {0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6}, - {0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6}, - {0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6}, - {0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6}, - {0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6}, - {0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6}, - {0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6}, - {0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6}, - {0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7}, - {0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7}, - {0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7}, - {0x1.062491aee9904p+0, -0x1.517249c15a75cp-7}, - {0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7}, - {0x1.040ff6b5f5e9fp+0, 
-0x1.c01abc8cdc4e2p-8}, - {0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8}, - {0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9}, - {0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10}, - {1.0, 0.0}, - {0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9}, - {0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8}, - {0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7}, - {0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7}, - {0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6}, - {0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6}, - {0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6}, - {0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6}, - {0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6}, - {0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5}, - {0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5}, - {0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5}, - {0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5}, - {0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5}, - {0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5}, - {0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5}, - {0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5}, - {0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5}, - {0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5}, - {0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4}, - {0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4}, - {0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4}, - {0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4}, - {0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4}, - {0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4}, - {0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4}, - {0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4}, - {0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4}, - {0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4}, - {0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4}, - {0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4}, - {0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4}, - {0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4}, - {0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4}, - {0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4}, - {0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4}, - {0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4}, - {0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4}, - {0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4}, - {0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4}, - {0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4}, - {0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4}, - {0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3}, - {0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3}, - {0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3}, - {0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3}, - {0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3}, - {0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3}, - {0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3}, - {0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3}, - {0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3}, - {0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3}}, - - /* Computed from log coeffs div by log(10) then rounded to double - precision. */ - .poly - = {-0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4, - 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4}, - - .invln10 = 0x1.bcb7b1526e50ep-2, - .log10_2 = 0x1.34413509f79ffp-2 + table[i].invc = 1/c + table[i].log10c = (double)log10(c) + where c is near the center of the subinterval and is chosen by trying + several floating point invc candidates around 1/center and selecting one + for which the error in (double)log(c) is minimized (< 0x1p-74), except the + subinterval that contains 1 and the previous one got tweaked to avoid + cancellation. NB: invc should be optimized to minimize error in + (double)log10(c) instead. 
*/ + .table = { { 0x1.6a133d0dec120p+0, -0x1.345825f221684p-3 }, + { 0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3 }, + { 0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3 }, + { 0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3 }, + { 0x1.623f1d916f323p+0, -0x1.20e7081762193p-3 }, + { 0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3 }, + { 0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3 }, + { 0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3 }, + { 0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3 }, + { 0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3 }, + { 0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3 }, + { 0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4 }, + { 0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4 }, + { 0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4 }, + { 0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4 }, + { 0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4 }, + { 0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4 }, + { 0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4 }, + { 0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4 }, + { 0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4 }, + { 0x1.446f12b278001p+0, -0x1.a56c091954f87p-4 }, + { 0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4 }, + { 0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4 }, + { 0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4 }, + { 0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4 }, + { 0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4 }, + { 0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4 }, + { 0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4 }, + { 0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4 }, + { 0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4 }, + { 0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4 }, + { 0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4 }, + { 0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4 }, + { 0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4 }, + { 0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4 }, + { 0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4 }, + { 0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4 }, + { 0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4 }, + { 0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4 }, + { 0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5 }, + { 0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5 }, + { 0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5 }, + { 0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5 }, + { 0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5 }, + { 0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5 }, + { 0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5 }, + { 0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5 }, + { 0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5 }, + { 0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5 }, + { 0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5 }, + { 0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5 }, + { 0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5 }, + { 0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5 }, + { 0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5 }, + { 0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6 }, + { 0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6 }, + { 0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6 }, + { 0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6 }, + { 0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6 }, + { 0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6 }, + { 0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6 }, + { 0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7 }, + { 0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7 }, + { 0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7 }, + { 0x1.062491aee9904p+0, 
-0x1.517249c15a75cp-7 }, + { 0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8 }, + { 0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8 }, + { 0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9 }, + { 0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9 }, + { 0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8 }, + { 0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7 }, + { 0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7 }, + { 0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6 }, + { 0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6 }, + { 0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6 }, + { 0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6 }, + { 0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6 }, + { 0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5 }, + { 0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5 }, + { 0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5 }, + { 0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5 }, + { 0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5 }, + { 0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5 }, + { 0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5 }, + { 0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5 }, + { 0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4 }, + { 0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4 }, + { 0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4 }, + { 0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4 }, + { 0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4 }, + { 0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4 }, + { 0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4 }, + { 0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4 }, + { 0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4 }, + { 0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4 }, + { 0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4 }, + { 0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4 }, + { 0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4 }, + { 0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4 }, + { 0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4 }, + { 0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4 }, + { 0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4 }, + { 0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4 }, + { 0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4 }, + { 0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4 }, + { 0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4 }, + { 0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3 }, + { 0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3 }, + { 0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3 }, + { 0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3 }, + { 0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3 }, + { 0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3 }, + { 0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3 }, + { 0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3 }, + { 0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3 }, + { 0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3 } } }; diff --git a/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c index e9f7f0346ca2..92bc50ba5bd9 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c @@ -6,77 +6,77 @@ */ #include "v_math.h" -#include "mathlib.h" +#include "poly_advsimd_f32.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +static const struct data +{ + uint32x4_t min_norm; + uint16x8_t special_bound; + float32x4_t poly[8]; + float32x4_t inv_ln10, ln2; + uint32x4_t off, mantissa_mask; +} data = { + /* Use order 9 for log10(1+x), i.e. 
order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ + .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f), + V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f), + V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, + .ln2 = V4 (0x1.62e43p-1f), + .inv_ln10 = V4 (0x1.bcb7b2p-2f), + .min_norm = V4 (0x00800000), + .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), +}; -#define P(i) v_f32 (__v_log10f_poly[i]) - -#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218. */ -#define InvLn10 v_f32 (0x1.bcb7b2p-2f) -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Mask v_u32 (0x007fffff) -#define Off v_u32 (0x3f2aaaab) /* 0.666667. */ - -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, + uint16x4_t cmp) { /* Fall back to scalar code. */ - return v_call_f32 (log10f, x, y, cmp); + return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); } -/* Our fast implementation of v_log10f uses a similar approach as v_logf. - With the same offset as v_logf (i.e., 2/3) it delivers about 3.3ulps with - order 9. This is more efficient than using a low order polynomial computed in - double precision. +/* Fast implementation of AdvSIMD log10f, + uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and + an order 9 polynomial. Maximum error: 3.305ulps (nearest rounding.) - __v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 - want 0x1.ffe2f4p-4 -0.304916 ulp err 2.80492. */ -VPCS_ATTR -v_f32_t V_NAME (log10f) (v_f32_t x) + _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. */ +float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x) { - v_f32_t n, o, p, q, r, r2, y; - v_u32_t u, cmp; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); + const struct data *d = ptr_barrier (&data); + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), + vget_low_u16 (d->special_bound)); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u -= Off; - n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ - u &= Mask; - u += Off; - r = v_as_f32_u32 (u) - v_f32 (1.0f); + u = vsubq_u32 (u, d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); - /* y = log10(1+r) + n*log10(2). */ - r2 = r * r; - /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 + - r2*(P6+r*P7))). */ - o = v_fma_f32 (P (7), r, P (6)); - p = v_fma_f32 (P (5), r, P (4)); - q = v_fma_f32 (P (3), r, P (2)); - y = v_fma_f32 (P (1), r, P (0)); - p = v_fma_f32 (o, r2, p); - q = v_fma_f32 (p, r2, q); - y = v_fma_f32 (q, r2, y); - /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster - but less accurate. */ - p = v_fma_f32 (Ln2, n, r); - y = v_fma_f32 (y, r2, p * InvLn10); + /* y = log10(1+r) + n * log10(2). */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly); + /* y = Log10(2) * n + poly * InvLn(10). 
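
As the lines just below this comment show, a single multiply by 1/ln(10) converts the natural-log pieces, since (n*ln2 + r)/ln(10) = n*log10(2) + r/ln(10), and the polynomial correction is folded in last. A scalar model (editorial sketch):

#include <math.h>

static float
log10f_combine (float n, float r, float r2, float poly)
{
  const float ln2 = 0x1.62e43p-1f;
  const float inv_ln10 = 0x1.bcb7b2p-2f;
  float y = fmaf (n, ln2, r) * inv_ln10; /* n*log10(2) + r/ln(10). */
  return fmaf (poly, r2, y);             /* + log10(1+r) correction. */
}
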
*/ + float32x4_t y = vfmaq_f32 (r, d->ln2, n); + y = vmulq_f32 (y, d->inv_ln10); - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; + if (unlikely (v_any_u16h (special))) + return special_case (x, y, poly, r2, special); + return vfmaq_f32 (y, poly, r2); } -VPCS_ALIAS PL_SIG (V, F, 1, log10, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log10f), 2.81) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10f)) -PL_TEST_INTERVAL (V_NAME (log10f), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (V_NAME (log10f), 0x1p-4, 0x1p4, 500000) -#endif +PL_TEST_ULP (V_NAME_F1 (log10), 2.81) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log10)) +PL_TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log10f_data.c b/contrib/arm-optimized-routines/pl/math/v_log10f_data.c deleted file mode 100644 index 537482a92017..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_log10f_data.c +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Coefficients for single-precision vector log10 function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "math_config.h" - -const float __v_log10f_poly[] = { - /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in - [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ - -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f, - -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c index e48291081ab3..face02ddc6c3 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c @@ -6,55 +6,65 @@ */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - -#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) -#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) -#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ -#define OneMHfRt2Top \ - 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ - << 32. */ -#define OneTop12 0x3ff -#define BottomMask 0xffffffff -#define AbsMask 0x7fffffffffffffff -#define C(i) v_f64 (__log1p_data.coeffs[i]) - -static inline v_f64_t -eval_poly (v_f64_t f) +const static struct data { - v_f64_t f2 = f * f; - v_f64_t f4 = f2 * f2; - v_f64_t f8 = f4 * f4; - return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); -} + float64x2_t poly[19], ln2[2]; + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one; + int64x2_t one_top; +} data = { + /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. 
*/ + .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), + V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), + V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), + V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), + V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), + V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), + V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), + V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), + V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), + V2 (-0x1.cfa7385bdb37ep-6) }, + .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, + /* top32(asuint64(sqrt(2)/2)) << 32. */ + .hf_rt2_top = V2 (0x3fe6a09e00000000), + /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ + .one_m_hf_rt2_top = V2 (0x00095f6200000000), + .umask = V2 (0x000fffff00000000), + .one_top = V2 (0x3ff), + .inf = V2 (0x7ff0000000000000), + .minus_one = V2 (0xbff0000000000000) +}; -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +#define BottomMask v_u64 (0xffffffff) + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) { return v_call_f64 (log1p, x, y, special); } -/* Vector log1p approximation using polynomial on reduced interval. Routine is a - modification of the algorithm used in scalar log1p, with no shortcut for k=0 - and no narrowing for f and k. Maximum observed error is 2.46 ULP: - __v_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2 - want 0x1.fd5565fb590f6p+2 . */ -VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) +/* Vector log1p approximation using polynomial on reduced interval. Routine is + a modification of the algorithm used in scalar log1p, with no shortcut for + k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP: + _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2 + want 0x1.fd61d0727429fp+2 . */ +VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t ia = ix & AbsMask; - v_u64_t special - = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000)) - | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000)); + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + uint64x2_t special = vcgeq_u64 (ia, d->inf); #if WANT_SIMD_EXCEPT + special = vorrq_u64 (special, + vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1)))); if (unlikely (v_any_u64 (special))) - x = v_sel_f64 (special, v_f64 (0), x); + x = v_zerofy_f64 (x, special); +#else + special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1))); #endif /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f @@ -72,49 +82,47 @@ VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) The scalar algorithm casts down to 32-bit at this point to calculate k and u_red. We stay in double-width to obtain f and k, using the same constants as the scalar algorithm but shifted left by 32. */ - v_f64_t m = x + 1; - v_u64_t mi = v_as_u64_f64 (m); - v_u64_t u = mi + OneMHfRt2Top; + float64x2_t m = vaddq_f64 (x, v_f64 (1)); + uint64x2_t mi = vreinterpretq_u64_f64 (m); + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); - v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12; - v_f64_t k = v_to_f64_s64 (ki); + int64x2_t ki + = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); + float64x2_t k = vcvtq_f64_s64 (ki); /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. 
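
The reduction performed by the lines just below can be modelled in scalar code: with m = 1 + x, the biased exponent of m (nudged so the split lands at sqrt(2)/2) gives k, the mantissa is re-centred on [sqrt(2)/2, sqrt(2)], and the rounding error of m survives in the correction term c/m (editorial sketch; the helper name is ours):

#include <stdint.h>
#include <string.h>

static double
log1p_reduce (double x, double *f, double *cm)
{
  const uint64_t hf_rt2_top = 0x3fe6a09e00000000ULL;
  const uint64_t one_m_hf_rt2_top = 0x00095f6200000000ULL;
  double m = x + 1.0;
  uint64_t mi, u, u_red;
  double t;
  memcpy (&mi, &m, sizeof mi);
  u = mi + one_m_hf_rt2_top;
  int64_t ki = (int64_t) (u >> 52) - 0x3ff;
  u_red = ((u & 0x000fffff00000000ULL) + hf_rt2_top) | (mi & 0xffffffffULL);
  memcpy (&t, &u_red, sizeof t);
  *f = t - 1.0;              /* 1 + f in [sqrt(2)/2, sqrt(2)]. */
  *cm = (x - (m - 1.0)) / m; /* rounding error of m = x + 1. */
  return (double) ki;        /* k, so that m ~= 2^k * (1 + f). */
}
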
*/ - v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; - v_u64_t u_red = utop | (mi & BottomMask); - v_f64_t f = v_as_f64_u64 (u_red) - 1; + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); /* Correction term c/m. */ - v_f64_t cm = (x - (m - 1)) / m; + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); /* Approximate log1p(x) on the reduced input using a polynomial. Because - log1p(0)=0 we choose an approximation of the form: - x + C0*x^2 + C1*x^3 + C2x^4 + ... - Hence approximation has the form f + f^2 * P(f) + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) where P(x) = C0 + C1*x + C2x^2 + ... - Assembling this all correctly is dealt with at the final step. */ - v_f64_t p = eval_poly (f); + Assembling this all correctly is dealt with at the final step. */ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); - v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); - v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); - v_f64_t y = v_fma_f64 (f * f, p, ylo + yhi); + float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); + float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); + float64x2_t y = vaddq_f64 (ylo, yhi); if (unlikely (v_any_u64 (special))) - return specialcase (v_as_f64_u64 (ix), y, special); + return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p), + special); - return y; + return vfmaq_f64 (y, f2, p); } -VPCS_ALIAS PL_SIG (V, D, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (V_NAME (log1p), 1.97) -PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (log1p), -10.0, 10.0, 10000) -PL_TEST_INTERVAL (V_NAME (log1p), 0.0, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), 0x1p-23, 0.001, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), 0.001, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), 0.0, -0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), -0x1p-23, -0.001, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), -0.001, -1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), -1.0, inf, 5000) -#endif +PL_TEST_ULP (V_NAME_D1 (log1p), 1.97) +PL_TEST_EXPECT_FENV (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500) diff --git a/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h b/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h index e5c733964bc0..bd57bfc6fe6e 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h +++ b/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h @@ -9,22 +9,38 @@ #define PL_MATH_V_LOG1P_INLINE_H #include "v_math.h" -#include "pairwise_horner.h" +#include "poly_advsimd_f64.h" -#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) -#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) -#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ -#define OneMHfRt2Top \ - 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ - << 32. */ -#define OneTop 0x3ff -#define BottomMask 0xffffffff -#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). 
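   (The top 12 bits of an IEEE double are the sign and biased exponent,
   so top12 (asuint64 (0x1p511)) = 511 + 1023 = 1534 = 0x5fe.)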
*/ +struct v_log1p_data +{ + float64x2_t poly[19], ln2[2]; + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask; + int64x2_t one_top; +}; -#define C(i) v_f64 (__log1p_data.coeffs[i]) +/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */ +#define V_LOG1P_CONSTANTS_TABLE \ + { \ + .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \ + V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \ + V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \ + V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \ + V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \ + V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \ + V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \ + V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \ + V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \ + V2 (-0x1.cfa7385bdb37ep-6) }, \ + .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \ + .hf_rt2_top = V2 (0x3fe6a09e00000000), \ + .one_m_hf_rt2_top = V2 (0x00095f6200000000), \ + .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \ + } -static inline v_f64_t -log1p_inline (v_f64_t x) +#define BottomMask v_u64 (0xffffffff) + +static inline float64x2_t +log1p_inline (float64x2_t x, const struct v_log1p_data *d) { /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several modifications: @@ -35,43 +51,41 @@ log1p_inline (v_f64_t x) 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in the source of the caller before including this file. See v_log1pf_2u1.c for details of the algorithm. */ - v_f64_t m = x + 1; - v_u64_t mi = v_as_u64_f64 (m); - v_u64_t u = mi + OneMHfRt2Top; + float64x2_t m = vaddq_f64 (x, v_f64 (1)); + uint64x2_t mi = vreinterpretq_u64_f64 (m); + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); - v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop; - v_f64_t k = v_to_f64_s64 (ki); + int64x2_t ki + = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); + float64x2_t k = vcvtq_f64_s64 (ki); /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ - v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; - v_u64_t u_red = utop | (mi & BottomMask); - v_f64_t f = v_as_f64_u64 (u_red) - 1; + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); /* Correction term c/m. */ - v_f64_t cm = (x - (m - 1)) / m; + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); #ifndef WANT_V_LOG1P_K0_SHORTCUT #error \ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" #elif WANT_V_LOG1P_K0_SHORTCUT /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is - that the approximation is solely the polynomial. */ - v_u64_t k0 = k == 0; - if (unlikely (v_any_u64 (k0))) - { - cm = v_sel_f64 (k0, v_f64 (0), cm); - f = v_sel_f64 (k0, x, f); - } + that the approximation is solely the polynomial. */ + uint64x2_t k0 = vceqzq_f64 (k); + cm = v_zerofy_f64 (cm, k0); + f = vbslq_f64 (k0, x, f); #endif /* Approximate log1p(f) on the reduced input using a polynomial. */ - v_f64_t f2 = f * f; - v_f64_t p = PAIRWISE_HORNER_18 (f, f2, C); + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. 
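   ln2 is stored in two pieces, d->ln2[0] carrying the high bits exactly
   and d->ln2[1] the residual, so the dominant term is formed first and
   rounding error is confined to the small corrections:

     ylo = cm + k * ln2_lo        (low-order corrections)
     yhi = f  + k * ln2_hi        (dominant term)
     log1p(x) ~= (ylo + yhi) + f^2 * P(f)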
*/ - v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); - v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); - return v_fma_f64 (f2, p, ylo + yhi); + float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); + float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); + return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p); } #endif // PL_MATH_V_LOG1P_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c index 4a7732b403ec..153c88da9c88 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c +++ b/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c @@ -8,104 +8,72 @@ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f32.h" -#if V_SUPPORTED - -#define AbsMask 0x7fffffff -#define TinyBound 0x340 /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ -#define MinusOne 0xbf800000 -#define Ln2 (0x1.62e43p-1f) -#define Four 0x40800000 -#define ThreeQuarters v_u32 (0x3f400000) - -#define C(i) v_f32 (__log1pf_data.coeffs[i]) - -static inline v_f32_t -eval_poly (v_f32_t m) +const static struct data { -#ifdef V_LOG1PF_1U3 + float32x4_t poly[8], ln2; + uint32x4_t tiny_bound, minus_one, four, thresh; + int32x4_t three_quarters; +} data = { + .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients + (1, -0.5) are not stored as they can be generated more + efficiently. */ + V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), + V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), + V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, + .ln2 = V4 (0x1.62e43p-1f), + .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ + .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */ + .minus_one = V4 (0xbf800000), + .four = V4 (0x40800000), + .three_quarters = V4 (0x3f400000) +}; - /* Approximate log(1+m) on [-0.25, 0.5] using Horner scheme. */ - v_f32_t p = v_fma_f32 (C (8), m, C (7)); - p = v_fma_f32 (p, m, C (6)); - p = v_fma_f32 (p, m, C (5)); - p = v_fma_f32 (p, m, C (4)); - p = v_fma_f32 (p, m, C (3)); - p = v_fma_f32 (p, m, C (2)); - p = v_fma_f32 (p, m, C (1)); - p = v_fma_f32 (p, m, C (0)); - return v_fma_f32 (m, m * p, m); +static inline float32x4_t +eval_poly (float32x4_t m, const float32x4_t *p) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */ + float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]); + float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]); + float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]); + float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]); -#elif defined(V_LOG1PF_2U5) + float32x4_t m2 = vmulq_f32 (m, m); + float32x4_t p_02 = vfmaq_f32 (m, m2, p_12); + float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56); + float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]); - /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. 
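   (Estrin's scheme pairs the terms so that independent FMAs can issue
   in parallel: a cubic, for instance, is evaluated as
   (c0 + c1*m) + m^2*(c2 + c3*m), roughly halving the dependency chain
   relative to Horner's rule at the cost of one extra multiply for m^2.)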
*/ - v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); - v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); - v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); - v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); - - v_f32_t m2 = m * m; - v_f32_t p_02 = v_fma_f32 (m2, p_12, m); - v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); - v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); - - v_f32_t m4 = m2 * m2; - v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); - - return v_fma_f32 (m4, m4 * p_79, p_06); - -#else -#error No precision specified for v_log1pf -#endif + float32x4_t m4 = vmulq_f32 (m2, m2); + float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36); + return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79)); } -static inline float -handle_special (float x) +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { - uint32_t ix = asuint (x); - uint32_t ia = ix & AbsMask; - if (ix == 0xff800000 || ia > 0x7f800000 || ix > 0xbf800000) - { - /* x == -Inf => log1pf(x) = NaN. - x < -1.0 => log1pf(x) = NaN. - x == +/-NaN => log1pf(x) = NaN. */ -#if WANT_SIMD_EXCEPT - return __math_invalidf (asfloat (ia)); -#else - return NAN; -#endif - } - if (ix == 0xbf800000) - { - /* x == -1.0 => log1pf(x) = -Inf. */ -#if WANT_SIMD_EXCEPT - return __math_divzerof (ix); -#else - return -INFINITY; -#endif - } - /* |x| < TinyBound => log1p(x) = x. */ - return x; + return v_call_f32 (log1pf, x, y, special); } -/* Vector log1pf approximation using polynomial on reduced interval. Accuracy is - the same as for the scalar algorithm, i.e. worst-case error when using Estrin +/* Vector log1pf approximation using polynomial on reduced interval. Accuracy is roughly 2.02 ULP: log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ -VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) +VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t ia12 = (ix >> 20) & v_u32 (0x7f8); - v_u32_t special_cases - = v_cond_u32 (ia12 - v_u32 (TinyBound) >= (0x7f8 - TinyBound)) - | v_cond_u32 (ix >= MinusOne); - v_f32_t special_arg = x; + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + uint32x4_t special_cases + = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh), + vcgeq_u32 (ix, d->minus_one)); + float32x4_t special_arg = x; #if WANT_SIMD_EXCEPT if (unlikely (v_any_u32 (special_cases))) /* Side-step special lanes so fenv exceptions are not triggered inadvertently. */ - x = v_sel_f32 (special_cases, v_f32 (1), x); + x = v_zerofy_f32 (x, special_cases); #endif /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m @@ -117,44 +85,42 @@ VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) scale factor s = 4*k*log(2) to ensure the scale is representable as a normalised fp32 number. */ - v_f32_t m = x + v_f32 (1.0f); + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); /* Choose k to scale x to the range [-1/4, 1/2]. */ - v_s32_t k = (v_as_s32_f32 (m) - ThreeQuarters) & v_u32 (0xff800000); + int32x4_t k + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), + v_s32 (0xff800000)); + uint32x4_t ku = vreinterpretq_u32_s32 (k); /* Scale x by exponent manipulation. */ - v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - v_as_u32_s32 (k)); + float32x4_t m_scale + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); /* Scale up to ensure that the scale factor is representable as normalised fp32 number, and scale m down accordingly. 
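   For x close to FLT_MAX, k reaches 128 and 2^-k alone would be
   subnormal, whereas s = 4*2^-k = 2^-126 is still a normal number. The
   factor of 4 is folded back in below: m_scale + (0.25f * s - 1.0f)
   evaluates to (x + 1) * 2^-k - 1 without ever materialising 2^-k
   itself.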
*/ - v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); - m_scale = m_scale + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); /* Evaluate polynomial on the reduced interval. */ - v_f32_t p = eval_poly (m_scale); + float32x4_t p = eval_poly (m_scale, d->poly); /* The scale factor to be applied back at the end - by multiplying float(k) by 2^-23 we get the unbiased exponent of k. */ - v_f32_t scale_back = v_to_f32_s32 (k) * v_f32 (0x1p-23f); + float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23)); /* Apply the scaling back. */ - v_f32_t y = v_fma_f32 (scale_back, v_f32 (Ln2), p); + float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2); if (unlikely (v_any_u32 (special_cases))) - return v_call_f32 (handle_special, special_arg, y, special_cases); + return special_case (special_arg, y, special_cases); return y; } -VPCS_ALIAS PL_SIG (V, F, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (V_NAME (log1pf), 1.53) -PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (log1pf), -10.0, 10.0, 10000) -PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, 0x1p-23, 30000) -PL_TEST_INTERVAL (V_NAME (log1pf), 0x1p-23, 0.001, 50000) -PL_TEST_INTERVAL (V_NAME (log1pf), 0.001, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, -0x1p-23, 30000) -PL_TEST_INTERVAL (V_NAME (log1pf), -0x1p-23, -0.001, 30000) -PL_TEST_INTERVAL (V_NAME (log1pf), -0.001, -1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log1pf), -1.0, inf, 1000) -#endif +PL_TEST_ULP (V_NAME_F1 (log1p), 1.53) +PL_TEST_EXPECT_FENV (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h b/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h index e3048e667c26..c654c6bad08f 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h +++ b/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h @@ -10,46 +10,58 @@ #define PL_MATH_V_LOG1PF_INLINE_H #include "v_math.h" -#include "math_config.h" +#include "poly_advsimd_f32.h" -#define Four 0x40800000 -#define Ln2 v_f32 (0x1.62e43p-1f) - -#define C(i) v_f32 (__log1pf_data.coeffs[i]) - -static inline v_f32_t -eval_poly (v_f32_t m) +struct v_log1pf_data { - /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ - v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); - v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); - v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); - v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + float32x4_t poly[8], ln2; + uint32x4_t four; + int32x4_t three_quarters; +}; - v_f32_t m2 = m * m; - v_f32_t p_02 = v_fma_f32 (m2, p_12, m); - v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); - v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); +/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients + (1, -0.5) are not stored as they can be generated more efficiently. 
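   The stored values are therefore the coefficients of the m^3 term
   onwards (c[0] ~= 1/3, c[1] ~= -1/4 and so on, close to the Taylor
   series of log(1+m)); eval_poly below reconstructs the full
   approximation. A scalar model, with c[] denoting the eight stored
   coefficients widened to double (illustrative only):

     double m2 = m * m;
     double q  = m + m2 * (-0.5 + c[0] * m);
     double p  = c[7];
     for (int i = 6; i >= 1; i--)
       p = p * m + c[i];
     double log1p_m = q + m2 * m2 * p;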
*/ +#define V_LOG1PF_CONSTANTS_TABLE \ + { \ + .poly \ + = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \ + V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \ + V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \ + .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ + .three_quarters = V4 (0x3f400000) \ + } - v_f32_t m4 = m2 * m2; - v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); - - return v_fma_f32 (m4, m4 * p_79, p_06); +static inline float32x4_t +eval_poly (float32x4_t m, const float32x4_t *c) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine + uses split Estrin, but this way reduces register pressure in the calling + routine). */ + float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]); + float32x4_t m2 = vmulq_f32 (m, m); + q = vfmaq_f32 (m, m2, q); + float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1); + p = vmulq_f32 (m2, p); + return vfmaq_f32 (q, m2, p); } -static inline v_f32_t -log1pf_inline (v_f32_t x) +static inline float32x4_t +log1pf_inline (float32x4_t x, const struct v_log1pf_data d) { /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no special-case handling. See that file for details of the algorithm. */ - v_f32_t m = x + 1.0f; - v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000; - v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); - v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k) - + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); - v_f32_t p = eval_poly (m_scale); - v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f; - return v_fma_f32 (scale_back, Ln2, p); + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); + int32x4_t k + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters), + v_s32 (0xff800000)); + uint32x4_t ku = vreinterpretq_u32_s32 (k); + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku)); + float32x4_t m_scale + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); + float32x4_t p = eval_poly (m_scale, d.poly); + float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f)); + return vfmaq_f32 (p, scale_back, d.ln2); } #endif // PL_MATH_V_LOG1PF_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log2_3u.c b/contrib/arm-optimized-routines/pl/math/v_log2_3u.c index fac73f60c600..2dd2c34b7c97 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log2_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_log2_3u.c @@ -6,95 +6,104 @@ */ #include "v_math.h" -#include "include/mathlib.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED - -#define InvLn2 v_f64 (0x1.71547652b82fep0) #define N (1 << V_LOG2_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) -#define P(i) v_f64 (__v_log2_data.poly[i]) + +static const struct data +{ + uint64x2_t min_norm; + uint32x4_t special_bound; + float64x2_t poly[5]; + float64x2_t invln2; + uint64x2_t sign_exp_mask; +} data = { + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. */ + .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2), + V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2), + V2 (-0x1.ec738d616fe26p-3) }, + .invln2 = V2 (0x1.71547652b82fep0), + .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. 
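   A single unsigned compare against this bound classifies every special
   input at once: after subtracting min_norm, zeros and subnormals wrap
   around to huge values, negative inputs keep the sign bit set, and
   infinities and NaNs stay at or above the bound. A scalar model of the
   same test:

     int is_special
       = asuint64 (x) - 0x0010000000000000 >= 0x7fe0000000000000;

   The vector code performs it on the high 32 bits only, via vsubhn_u64,
   which is why special_bound is stored as a 32-bit vector; this is
   valid because the low half of min_norm is zero, so the subtraction
   never borrows from the high half.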
*/ + .sign_exp_mask = V2 (0xfff0000000000000), +}; + +#define Off v_u64 (0x3fe6900900000000) +#define IndexMask (N - 1) struct entry { - v_f64_t invc; - v_f64_t log2c; + float64x2_t invc; + float64x2_t log2c; }; static inline struct entry -lookup (v_u64_t i) +lookup (uint64x2_t i) { struct entry e; -#ifdef SCALAR - e.invc = __v_log2_data.tab[i].invc; - e.log2c = __v_log2_data.tab[i].log2c; -#else - e.invc[0] = __v_log2_data.tab[i[0]].invc; - e.log2c[0] = __v_log2_data.tab[i[0]].log2c; - e.invc[1] = __v_log2_data.tab[i[1]].invc; - e.log2c[1] = __v_log2_data.tab[i[1]].log2c; -#endif + uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log2c = vuzp2q_f64 (e0, e1); return e; } -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, + uint32x2_t special) { - return v_call_f64 (log2, x, y, cmp); + return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special)); } -/* Double-precision vector log2 routine. Implements the same algorithm as vector - log10, with coefficients and table entries scaled in extended precision. - The maximum observed error is 2.58 ULP: - __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 - want 0x1.fffb34198d9ddp-5. */ -VPCS_ATTR -v_f64_t V_NAME (log2) (v_f64_t x) +/* Double-precision vector log2 routine. Implements the same algorithm as + vector log10, with coefficients and table entries scaled in extended + precision. The maximum observed error is 2.58 ULP: + _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t top = ix >> 48; - v_u64_t special - = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), + vget_low_u32 (d->special_bound)); - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - v_u64_t tmp = ix - OFF; - v_u64_t i = (tmp >> (52 - V_LOG2_TABLE_BITS)) % N; - v_s64_t k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ - v_u64_t iz = ix - (tmp & v_u64 (0xfffULL << 52)); - v_f64_t z = v_as_f64_u64 (iz); - struct entry e = lookup (i); + uint64x2_t tmp = vsubq_u64 (ix, Off); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (tmp); /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
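   Concretely, the steps below compute, per lane, with invc = 1/c and
   log2c = log2(c) taken from the table entry:

     r = z * invc - 1           (one FMA, so nearly exact)
     w = log2c + r * InvLn2     (linear part of log2(1+r), plus log2(c))
     y = P(r)                   (correction beyond the linear term)
     result = (k + w) + r^2 * y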
*/ - v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - v_f64_t kd = v_to_f64_s64 (k); - v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c); + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2); - v_f64_t r2 = r * r; - v_f64_t p_23 = v_fma_f64 (P (3), r, P (2)); - v_f64_t p_01 = v_fma_f64 (P (1), r, P (0)); - v_f64_t y = v_fma_f64 (P (4), r2, p_23); - y = v_fma_f64 (r2, y, p_01); - y = v_fma_f64 (r2, y, kd + w); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); + w = vaddq_f64 (kd, w); - if (unlikely (v_any_u64 (special))) - return specialcase (x, y, special); - return y; + if (unlikely (v_any_u32h (special))) + return special_case (x, y, w, r2, special); + return vfmaq_f64 (w, r2, y); } -VPCS_ALIAS PL_SIG (V, D, 1, log2, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log2), 2.09) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2)) -PL_TEST_INTERVAL (V_NAME (log2), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME (log2), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME (log2), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log2), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log2), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME (log2), 100, inf, 50000) -#endif +PL_TEST_ULP (V_NAME_D1 (log2), 2.09) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log2)) +PL_TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log2_data.c b/contrib/arm-optimized-routines/pl/math/v_log2_data.c index 2a1da6823fbc..50697daff925 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log2_data.c +++ b/contrib/arm-optimized-routines/pl/math/v_log2_data.c @@ -9,147 +9,145 @@ #define N (1 << V_LOG2_TABLE_BITS) -// clang-format off - const struct v_log2_data __v_log2_data = { -/* Derived from the coefficients in log_data.c for N == 128 && LOG_POLY_ORDER == 6. - Each coefficient was scaled by log2(e) in extended precision and rounded back to - double. */ -.poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2, - 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 }, + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. */ + .poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2, + 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 }, -/* Derived from the table in v_log10_data.c. invc is unchanged. log2(c) was - calculated by scaling log10(c) by log2(10) in extended precision and rounding - back. 
*/ -.tab = { -{ 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 }, -{ 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 }, -{ 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 }, -{ 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 }, -{ 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 }, -{ 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 }, -{ 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 }, -{ 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 }, -{ 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 }, -{ 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 }, -{ 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 }, -{ 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 }, -{ 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 }, -{ 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 }, -{ 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 }, -{ 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 }, -{ 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 }, -{ 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 }, -{ 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 }, -{ 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 }, -{ 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 }, -{ 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 }, -{ 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 }, -{ 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 }, -{ 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 }, -{ 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 }, -{ 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 }, -{ 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 }, -{ 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 }, -{ 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 }, -{ 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 }, -{ 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 }, -{ 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 }, -{ 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 }, -{ 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 }, -{ 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 }, -{ 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 }, -{ 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 }, -{ 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 }, -{ 0x1.29e3b1211b25cp+0, -0x1.bfc2b38d315f9p-3 }, -{ 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 }, -{ 0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 }, -{ 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 }, -{ 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 }, -{ 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 }, -{ 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 }, -{ 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 }, -{ 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 }, -{ 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 }, -{ 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 }, -{ 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 }, -{ 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 }, -{ 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 }, -{ 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 }, -{ 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 }, -{ 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 }, -{ 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 }, -{ 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 }, -{ 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 }, -{ 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 }, -{ 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 }, -{ 0x1.0ecf062c51c3bp+0, -0x1.4c4f31c86d344p-4 }, -{ 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 }, -{ 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 }, -{ 0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 }, -{ 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 }, -{ 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 }, -{ 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 }, -{ 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 }, -{ 0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 }, -{ 0x1.05193497a7cc5p+0, 
-0x1.d2207fdaa1b85p-6 }, -{ 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 }, -{ 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 }, -{ 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 }, -{ 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 }, -{ 1.0, 0.0 }, -{ 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 }, -{ 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 }, -{ 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 }, -{ 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 }, -{ 0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 }, -{ 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 }, -{ 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 }, -{ 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 }, -{ 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 }, -{ 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 }, -{ 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 }, -{ 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 }, -{ 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 }, -{ 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 }, -{ 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 }, -{ 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 }, -{ 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 }, -{ 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 }, -{ 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 }, -{ 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 }, -{ 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 }, -{ 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 }, -{ 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 }, -{ 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 }, -{ 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 }, -{ 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 }, -{ 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 }, -{ 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 }, -{ 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 }, -{ 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 }, -{ 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 }, -{ 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 }, -{ 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 }, -{ 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 }, -{ 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 }, -{ 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 }, -{ 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 }, -{ 0x1.8acae5bb55badp-1, 0x1.800cd6f19c25ep-2 }, -{ 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 }, -{ 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 }, -{ 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 }, -{ 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 }, -{ 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 }, -{ 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 }, -{ 0x1.7ad182e54f65ap-1, 0x1.bd119575364c1p-2 }, -{ 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 }, -{ 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 }, -{ 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 }, -{ 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 }, -{ 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 }, -{ 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 }, -{ 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 }} + .invln2 = 0x1.71547652b82fep0, + + /* Derived from tables in v_log_data.c in a similar way as v_log10_data.c. + This means invc is unchanged and log2c was calculated by scaling log(c) by + log2(e) in extended precision and rounding back to double precision. 
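   That is, each entry uses the identity log2(c) = log(c) * log2(e),
   with log2(e) = 1/ln(2) ~= 0x1.71547652b82fep+0, the same constant
   stored as invln2 in the routine's data struct.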
*/ + .table = { { 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 }, + { 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 }, + { 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 }, + { 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 }, + { 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 }, + { 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 }, + { 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 }, + { 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 }, + { 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 }, + { 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 }, + { 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 }, + { 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 }, + { 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 }, + { 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 }, + { 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 }, + { 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 }, + { 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 }, + { 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 }, + { 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 }, + { 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 }, + { 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 }, + { 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 }, + { 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 }, + { 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 }, + { 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 }, + { 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 }, + { 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 }, + { 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 }, + { 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.bfc2b38d315f9p-3 }, + { 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 }, + { 0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 }, + { 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 }, + { 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 }, + { 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 }, + { 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 }, + { 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 }, + { 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 }, + { 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 }, + { 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 }, + { 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 }, + { 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 }, + { 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 }, + { 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 }, + { 0x1.0ecf062c51c3bp+0, -0x1.4c4f31c86d344p-4 }, + { 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 }, + { 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 }, + { 0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 }, + { 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 }, + { 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 }, + { 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 }, + { 
0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 }, + { 0x1.05193497a7cc5p+0, -0x1.d2207fdaa1b85p-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 }, + { 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 }, + { 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 }, + { 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 }, + { 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 }, + { 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 }, + { 0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 }, + { 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 }, + { 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 }, + { 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 }, + { 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 }, + { 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 }, + { 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 }, + { 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 }, + { 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 }, + { 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 }, + { 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 }, + { 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 }, + { 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 }, + { 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 }, + { 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 }, + { 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 }, + { 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 }, + { 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 }, + { 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 }, + { 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 }, + { 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 }, + { 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 }, + { 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 }, + { 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 }, + { 0x1.8acae5bb55badp-1, 0x1.800cd6f19c25ep-2 }, + { 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 }, + { 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 }, + { 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.bd119575364c1p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 }, + { 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 }, + { 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 }, + { 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 } } }; -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c index 8f9241bed8e6..c64d88742136 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c @@ -6,63 +6,72 @@ */ #include "v_math.h" -#include "pairwise_hornerf.h" +#include "poly_advsimd_f32.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED -#define C(i) v_f32 (__v_log2f_data.poly[i]) +static const struct data +{ + uint32x4_t min_norm; + uint16x8_t special_bound; + uint32x4_t off, mantissa_mask; + float32x4_t 
poly[9]; +} data = { + /* Coefficients generated using Remez algorithm approximate + log2(1+r)/r for r in [ -1/3, 1/3 ]. + rel error: 0x1.c4c4b0cp-26. */ + .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ + V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), + V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), + V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, + .min_norm = V4 (0x00800000), + .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), +}; -#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Mask v_u32 (0x007fffff) -#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ - -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, + uint16x4_t cmp) { /* Fall back to scalar code. */ - return v_call_f32 (log2f, x, y, cmp); + return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); } -/* Fast implementation for single precision log2, - relies on same argument reduction as Neon logf. +/* Fast implementation for single precision AdvSIMD log2, + relies on same argument reduction as AdvSIMD logf. Maximum error: 2.48 ULPs - __v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 - want 0x1.a9be8p-2. */ -VPCS_ATTR -v_f32_t V_NAME (log2f) (v_f32_t x) + _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x) { - v_u32_t u = v_as_u32_f32 (x); - v_u32_t cmp = v_cond_u32 (u - Min >= Max - Min); + const struct data *d = ptr_barrier (&data); + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), + vget_low_u16 (d->special_bound)); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u -= Off; - v_f32_t n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ - u &= Mask; - u += Off; - v_f32_t r = v_as_f32_u32 (u) - v_f32 (1.0f); + u = vsubq_u32 (u, d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); /* y = log2(1+r) + n. 
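   Since the first stored coefficient is 1/ln(2), p approximates
   log2(1+r)/r and the result is assembled as n + r*p by the final FMA.
   Worked example: for x = 3.0f, asuint (3.0f) - off = 0x01155555, so
   the arithmetic shift by 23 gives n = 2, the remasked mantissa is
   0.75f and r = -0.25f, reconstructing log2 (3) as 2 + log2 (0.75).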
*/ - v_f32_t r2 = r * r; - v_f32_t p = PAIRWISE_HORNER_8 (r, r2, C); - v_f32_t y = v_fma_f32 (p, r, n); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; + if (unlikely (v_any_u16h (special))) + return special_case (x, n, p, r, special); + return vfmaq_f32 (n, p, r); } -VPCS_ALIAS PL_SIG (V, F, 1, log2, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log2f), 1.99) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2f)) -PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000) -#endif +PL_TEST_ULP (V_NAME_F1 (log2), 1.99) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log2)) +PL_TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log2f_data.c b/contrib/arm-optimized-routines/pl/math/v_log2f_data.c deleted file mode 100644 index b144e8f4992d..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_log2f_data.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Coefficients for vector log2f - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* See tools/v_log2f.sollya for the algorithm used to generate these - coefficients. */ -const struct v_log2f_data __v_log2f_data - = {.poly = {0x1.715476p0f, /* (float)(1 / ln(2)). */ - -0x1.715458p-1f, 0x1.ec701cp-2f, -0x1.7171a4p-2f, 0x1.27a0b8p-2f, - -0x1.e5143ep-3f, 0x1.9d8ecap-3f, -0x1.c675bp-3f, 0x1.9e495p-3f}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log_data.c b/contrib/arm-optimized-routines/pl/math/v_log_data.c new file mode 100644 index 000000000000..a26e8a051d97 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log_data.c @@ -0,0 +1,161 @@ +/* + * Lookup table for double-precision log(x) vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct v_log_data __v_log_data = { + /* Worst-case error: 1.17 + 0.5 ulp. + Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, + 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 }, + .ln2 = 0x1.62e42fefa39efp-1, + /* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + poly(z/c - 1) + + where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, + N=128) and log(c) and 1/c for the ith subinterval comes from two lookup + tables: + + table[i].invc = 1/c + table[i].logc = (double)log(c) + + where c is near the center of the subinterval and is chosen by trying + several floating point invc candidates around 1/center and selecting one + for which the error in (double)log(c) is minimized (< 0x1p-74), except the + subinterval that contains 1 and the previous one got tweaked to avoid + cancellation. 
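   A scalar sketch of the whole scheme, assuming V_LOG_TABLE_BITS = 7
   (N = 128), the usual asuint64/asdouble bit casts, and P () standing
   for Horner evaluation of the five coefficients above:

     uint64_t tmp = asuint64 (x) - 0x3fe6900900000000;
     uint64_t i   = (tmp >> (52 - 7)) & 127;
     int64_t  k   = (int64_t) tmp >> 52;
     double   z   = asdouble (asuint64 (x)
                              - (tmp & 0xfff0000000000000));
     double   r   = z * __v_log_data.table[i].invc - 1.0;
     double   y   = k * 0x1.62e42fefa39efp-1
                    + __v_log_data.table[i].logc + r + r * r * P (r);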
*/ + .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, + { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, + { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, + { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, + { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, + { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, + { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, + { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, + { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, + { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, + { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, + { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, + { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, + { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, + { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, + { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, + { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, + { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, + { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, + { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, + { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, + { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, + { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, + { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, + { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, + { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, + { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, + { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, + { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, + { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, + { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, + { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, + { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, + { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, + { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, + { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, + { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, + { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, + { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, + { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, + { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, + { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, + { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, + { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, + { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, + { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, + { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, + { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, + { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, + { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, + { 
0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, + { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, + { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, + { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, + { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, + { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 }, + { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, + { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, + { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, + { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, + { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, + { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, + { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, + { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, + { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, + { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, + { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, + { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, + { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, + { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, + { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, + { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, + { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, + { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, + { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, + { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, + { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, + { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, + { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, + { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, + { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, + { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, + { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, + { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 }, + { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, + { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, + { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, + { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log_inline.h b/contrib/arm-optimized-routines/pl/math/v_log_inline.h new file mode 100644 index 000000000000..2df00cf4ddf4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log_inline.h @@ -0,0 +1,104 @@ +/* + * Double-precision vector log(x) function - inline version + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "math_config.h" + +#ifndef V_LOG_INLINE_POLY_ORDER +# error Cannot use inline log helper without specifying poly order (options are 4 or 5) +#endif + +#if V_LOG_INLINE_POLY_ORDER == 4 +# define POLY \ + { \ + V2 (-0x1.ffffffffcbad3p-2), V2 (0x1.555555578ed68p-2), \ + V2 (-0x1.0000d3a1e7055p-2), V2 (0x1.999392d02a63ep-3) \ + } +#elif V_LOG_INLINE_POLY_ORDER == 5 +# define POLY \ + { \ + V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), \ + V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), \ + V2 (-0x1.554e550bd501ep-3) \ + } +#else +# error Can only choose order 4 or 5 for log poly +#endif + +struct v_log_inline_data +{ + float64x2_t poly[V_LOG_INLINE_POLY_ORDER]; + float64x2_t ln2; + uint64x2_t off, sign_exp_mask; +}; + +#define V_LOG_CONSTANTS \ + { \ + .poly = POLY, .ln2 = V2 (0x1.62e42fefa39efp-1), \ + .sign_exp_mask = V2 (0xfff0000000000000), .off = V2 (0x3fe6900900000000) \ + } + +#define A(i) d->poly[i] +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +log_lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static inline float64x2_t +v_log_inline (float64x2_t x, const struct v_log_inline_data *d) +{ + float64x2_t z, r, r2, p, y, kd, hi; + uint64x2_t ix, iz, tmp; + int64x2_t k; + struct entry e; + + ix = vreinterpretq_u64_f64 (x); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = vsubq_u64 (ix, d->off); + k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ + iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + z = vreinterpretq_f64_u64 (iz); + e = log_lookup (tmp); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (A (2), A (3), r); + p = vfmaq_f64 (A (0), A (1), r); +#if V_LOG_POLY_ORDER == 5 + y = vfmaq_f64 (y, A (4), r2); +#endif + y = vfmaq_f64 (p, y, r2); + + return vfmaq_f64 (hi, y, r2); +} diff --git a/contrib/arm-optimized-routines/pl/math/v_logf_inline.h b/contrib/arm-optimized-routines/pl/math/v_logf_inline.h new file mode 100644 index 000000000000..c00fe0909afc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_logf_inline.h @@ -0,0 +1,59 @@ +/* + * Single-precision vector log function - inline version + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +struct v_logf_data +{ + float32x4_t poly[7]; + float32x4_t ln2; + uint32x4_t off, mantissa_mask; +}; + +#define V_LOGF_CONSTANTS \ + { \ + .poly \ + = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), \ + V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), \ + V4 (-0x1.ffffc8p-2f) }, \ + .ln2 = V4 (0x1.62e43p-1f), .off = V4 (0x3f2aaaab), \ + .mantissa_mask = V4 (0x007fffff) \ + } + +#define P(i) d->poly[7 - i] + +static inline float32x4_t +v_logf_inline (float32x4_t x, const struct v_logf_data *d) +{ + float32x4_t n, p, q, r, r2, y; + uint32x4_t u; + + u = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = vsubq_u32 (u, d->off); + n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vandq_u32 (u, d->mantissa_mask); + u = vaddq_u32 (u, d->off); + r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ + r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ + p = vfmaq_f32 (P (5), P (6), r); + q = vfmaq_f32 (P (3), P (4), r); + y = vfmaq_f32 (P (1), P (2), r); + p = vfmaq_f32 (p, P (7), r2); + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + + return vfmaq_f32 (p, y, r2); +} + +#undef P diff --git a/contrib/arm-optimized-routines/pl/math/v_math.h b/contrib/arm-optimized-routines/pl/math/v_math.h index a8fa091a7cbf..1b10929faccc 100644 --- a/contrib/arm-optimized-routines/pl/math/v_math.h +++ b/contrib/arm-optimized-routines/pl/math/v_math.h @@ -12,844 +12,164 @@ /* Enable the build of vector math code. */ # define WANT_VMATH 1 #endif + #if WANT_VMATH -/* The goal of this header is to allow vector (only Neon for now) - and scalar build of the same algorithm. */ +# if __aarch64__ +# define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) +# else +# error "Cannot build without AArch64" +# endif -#if SCALAR -#define V_NAME(x) __s_##x -#elif VPCS && __aarch64__ -#define V_NAME(x) __vn_##x -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) -#else -#define V_NAME(x) __v_##x -#endif +# include +# include "math_config.h" +# if __aarch64__ -#ifndef VPCS_ATTR -#define VPCS_ATTR -#endif -#ifndef VPCS_ALIAS -#define VPCS_ALIAS -#endif +# include -#include -#include "math_config.h" - -typedef float f32_t; -typedef uint32_t u32_t; -typedef int32_t s32_t; -typedef double f64_t; -typedef uint64_t u64_t; -typedef int64_t s64_t; - -/* reinterpret as type1 from type2. 
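   (These union-based casts reinterpret the bit pattern of one type as
   another without violating strict aliasing; the AdvSIMD replacements
   use the vreinterpretq_* intrinsics instead. A portable scalar
   equivalent, given <string.h>, would be memcpy-based, e.g.:

     static inline uint32_t
     as_u32 (float x)
     {
       uint32_t u;
       memcpy (&u, &x, sizeof u);
       return u;
     }

   which modern compilers reduce to a plain register move.)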
*/ -static inline u32_t -as_u32_f32 (f32_t x) -{ - union { f32_t f; u32_t u; } r = {x}; - return r.u; -} -static inline f32_t -as_f32_u32 (u32_t x) -{ - union { u32_t u; f32_t f; } r = {x}; - return r.f; -} -static inline s32_t -as_s32_u32 (u32_t x) -{ - union { u32_t u; s32_t i; } r = {x}; - return r.i; -} -static inline u32_t -as_u32_s32 (s32_t x) -{ - union { s32_t i; u32_t u; } r = {x}; - return r.u; -} -static inline u64_t -as_u64_f64 (f64_t x) -{ - union { f64_t f; u64_t u; } r = {x}; - return r.u; -} -static inline f64_t -as_f64_u64 (u64_t x) -{ - union { u64_t u; f64_t f; } r = {x}; - return r.f; -} -static inline s64_t -as_s64_u64 (u64_t x) -{ - union { u64_t u; s64_t i; } r = {x}; - return r.i; -} -static inline u64_t -as_u64_s64 (s64_t x) -{ - union { s64_t i; u64_t u; } r = {x}; - return r.u; -} - -#if SCALAR -#define V_SUPPORTED 1 -typedef f32_t v_f32_t; -typedef u32_t v_u32_t; -typedef s32_t v_s32_t; -typedef f64_t v_f64_t; -typedef u64_t v_u64_t; -typedef s64_t v_s64_t; +/* Shorthand helpers for declaring constants. */ +# define V2(X) { X, X } +# define V4(X) { X, X, X, X } +# define V8(X) { X, X, X, X, X, X, X, X } static inline int -v_lanes32 (void) +v_any_u16h (uint16x4_t x) { - return 1; + return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; } -static inline v_f32_t -v_f32 (f32_t x) +static inline float32x4_t +v_f32 (float x) { - return x; + return (float32x4_t) V4 (x); } -static inline v_u32_t -v_u32 (u32_t x) +static inline uint32x4_t +v_u32 (uint32_t x) { - return x; + return (uint32x4_t) V4 (x); } -static inline v_s32_t -v_s32 (s32_t x) +static inline int32x4_t +v_s32 (int32_t x) { - return x; + return (int32x4_t) V4 (x); } -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - *x = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - *x = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - *x = v; -} - -/* true if any elements of a v_cond result is non-zero. */ +/* true if any elements of a vector compare result is non-zero. */ static inline int -v_any_u32 (v_u32_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x ? -1 : 0; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return __builtin_fabsf (x); -} -static inline v_u32_t -v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) -{ - return (y & ~m) | (x & m); -} -static inline v_u32_t -v_cagt_f32 (v_f32_t x, v_f32_t y) -{ - return fabsf (x) > fabsf (y); -} -/* to wrap |x| >= |y|. */ -static inline v_u32_t -v_cage_f32 (v_f32_t x, v_f32_t y) -{ - return fabsf (x) >= fabsf (y); -} -static inline v_u32_t -v_calt_f32 (v_f32_t x, v_f32_t y) -{ - return fabsf (x) < fabsf (y); -} -static inline v_f32_t -v_div_f32 (v_f32_t x, v_f32_t y) -{ - return x / y; -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return __builtin_fmaf (x, y, z); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return __builtin_roundf (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return __builtin_lroundf (x); /* relies on -fno-math-errno. */ -} -static inline v_f32_t -v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) -{ - return p ? x : y; -} -static inline v_u32_t -v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) -{ - return p ? 
x : y; -} -static inline v_f32_t -v_sqrt_f32 (v_f32_t x) -{ - return __builtin_sqrtf (x); -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return x; -} -static inline v_s32_t -v_to_s32_f32 (v_f32_t x) -{ - return x; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return x; -} -/* reinterpret as type1 from type2. */ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_s32_t -v_as_s32_f32 (v_f32_t x) -{ - union - { - v_f32_t f; - v_s32_t u; - } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return f (x); -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return f (x1, x2); -} - -static inline int -v_lanes64 (void) -{ - return 1; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return x; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return x; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return x; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - *x = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - return x != 0; -} -/* true if all elements of a v_cond result is non-zero. */ -static inline int -v_all_u64 (v_u64_t x) -{ - return x; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x ? -1 : 0; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return __builtin_fabs (x); -} -static inline v_u64_t -v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) -{ - return (y & ~m) | (x & m); -} -static inline v_u64_t -v_cagt_f64 (v_f64_t x, v_f64_t y) -{ - return fabs (x) > fabs (y); -} -static inline v_f64_t -v_div_f64 (v_f64_t x, v_f64_t y) -{ - return x / y; -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return __builtin_fma (x, y, z); -} -static inline v_f64_t -v_min_f64(v_f64_t x, v_f64_t y) { - return x < y ? x : y; -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return __builtin_round (x); -} -static inline v_f64_t -v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) -{ - return p ? x : y; -} -static inline v_f64_t -v_sqrt_f64 (v_f64_t x) -{ - return __builtin_sqrt (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return __builtin_lround (x); /* relies on -fno-math-errno. */ -} -static inline v_u64_t -v_trunc_u64 (v_f64_t x) -{ - return __builtin_trunc (x); -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return x; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return x; -} - -static inline v_s64_t -v_to_s64_f64 (v_f64_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return f (x); -} -static inline v_f64_t -v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, - v_u64_t p) -{ - return f (x1, x2); -} - -#elif __aarch64__ -#define V_SUPPORTED 1 -#include -typedef float32x4_t v_f32_t; -typedef uint32x4_t v_u32_t; -typedef int32x4_t v_s32_t; -typedef float64x2_t v_f64_t; -typedef uint64x2_t v_u64_t; -typedef int64x2_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return (v_f32_t){x, x, x, x}; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return (v_u32_t){x, x, x, x}; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return (v_s32_t){x, x, x, x}; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x[i]; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x[i]; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x[i]; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - (*x)[i] = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) +v_any_u32 (uint32x4_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; } -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) +static inline int +v_any_u32h (uint32x2_t x) { - return x; + return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; } -static inline v_f32_t -v_abs_f32 (v_f32_t x) +static inline float32x4_t +v_lookup_f32 (const float *tab, uint32x4_t idx) { - return vabsq_f32 (x); + return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; } -static inline v_u32_t -v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) +static inline uint32x4_t +v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) { - return vbslq_u32 (m, x, y); + return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; } -static inline v_u32_t -v_cagt_f32 (v_f32_t x, v_f32_t y) +static inline float32x4_t +v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) { - return vcagtq_f32 (x, y); + return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] }; } -/* to wrap |x| >= |y|. */ -static inline v_u32_t -v_cage_f32 (v_f32_t x, v_f32_t y) +static inline float32x4_t +v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, + float32x4_t y, uint32x4_t p) { - return vcageq_f32 (x, y); + return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], + p[3] ? 
f (x1[3], x2[3]) : y[3] }; } -static inline v_u32_t -v_calt_f32 (v_f32_t x, v_f32_t y) +static inline float32x4_t +v_zerofy_f32 (float32x4_t x, uint32x4_t mask) { - return vcaltq_f32 (x, y); -} -static inline v_f32_t -v_div_f32 (v_f32_t x, v_f32_t y) -{ - return vdivq_f32 (x, y); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return vfmaq_f32 (z, x, y); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return vrndaq_f32 (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return vcvtaq_s32_f32 (x); -} -static inline v_f32_t -v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) -{ - return vbslq_f32 (p, x, y); -} -static inline v_u32_t -v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) -{ - return vbslq_u32 (p, x, y); -} -static inline v_f32_t -v_sqrt_f32 (v_f32_t x) -{ - return vsqrtq_f32 (x); -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -static inline v_s32_t -v_to_s32_f32 (v_f32_t x) -{ - return vcvtq_s32_f32 (x); -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -/* reinterpret as type1 from type2. */ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_s32_t -v_as_s32_f32 (v_f32_t x) -{ - union - { - v_f32_t f; - v_s32_t u; - } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return ( - v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; + return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask)); } +static inline float64x2_t +v_f64 (double x) +{ + return (float64x2_t) V2 (x); +} +static inline uint64x2_t +v_u64 (uint64_t x) +{ + return (uint64x2_t) V2 (x); +} +static inline int64x2_t +v_s64 (int64_t x) +{ + return (int64x2_t) V2 (x); +} + +/* true if any elements of a vector compare result is non-zero. */ static inline int -v_lanes64 (void) -{ - return 2; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return (v_f64_t){x, x}; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return (v_u64_t){x, x}; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return (v_s64_t){x, x}; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x[i]; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - (*x)[i] = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) +v_any_u64 (uint64x2_t x) { /* assume elements in x are either 0 or -1u. 
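     vpaddd_u64 folds the two 64-bit lanes into one scalar sum, which is
     non-zero exactly when at least one lane is set.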
*/ return vpaddd_u64 (x) != 0; } -/* true if all elements of a v_cond result is 1. */ +/* true if all elements of a vector compare result is 1. */ static inline int -v_all_u64 (v_u64_t x) +v_all_u64 (uint64x2_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2; } -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) +static inline float64x2_t +v_lookup_f64 (const double *tab, uint64x2_t idx) { - return x; + return (float64x2_t){ tab[idx[0]], tab[idx[1]] }; } -static inline v_f64_t -v_abs_f64 (v_f64_t x) +static inline uint64x2_t +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) { - return vabsq_f64 (x); + return (uint64x2_t){ tab[idx[0]], tab[idx[1]] }; } -static inline v_u64_t -v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) + +static inline float64x2_t +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) { - return vbslq_u64 (m, x, y); + double p1 = p[1]; + double x1 = x[1]; + if (likely (p[0])) + y[0] = f (x[0]); + if (likely (p1)) + y[1] = f (x1); + return y; } -static inline v_u64_t -v_cagt_f64 (v_f64_t x, v_f64_t y) + +static inline float64x2_t +v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2, + float64x2_t y, uint64x2_t p) { - return vcagtq_f64 (x, y); + double p1 = p[1]; + double x1h = x1[1]; + double x2h = x2[1]; + if (likely (p[0])) + y[0] = f (x1[0], x2[0]); + if (likely (p1)) + y[1] = f (x1h, x2h); + return y; } -static inline v_f64_t -v_div_f64 (v_f64_t x, v_f64_t y) +static inline float64x2_t +v_zerofy_f64 (float64x2_t x, uint64x2_t mask) { - return vdivq_f64 (x, y); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return vfmaq_f64 (z, x, y); -} -static inline v_f64_t -v_min_f64(v_f64_t x, v_f64_t y) { - return vminq_f64(x, y); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return vrndaq_f64 (x); -} -static inline v_f64_t -v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) -{ - return vbslq_f64 (p, x, y); -} -static inline v_f64_t -v_sqrt_f64 (v_f64_t x) -{ - return vsqrtq_f64 (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return vcvtaq_s64_f64 (x); -} -static inline v_u64_t -v_trunc_u64 (v_f64_t x) -{ - return vcvtq_u64_f64 (x); -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -static inline v_s64_t -v_to_s64_f64 (v_f64_t x) -{ - return vcvtq_s64_f64 (x); -} -/* reinterpret as type1 from type2. */ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return (v_f64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return (v_u64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? 
f (x[1]) : y[1]}; -} -static inline v_f64_t -v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, - v_u64_t p) -{ - return (v_f64_t){p[0] ? f (x1[0], x2[0]) : y[0], - p[1] ? f (x1[1], x2[1]) : y[1]}; + return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask)); } + +# endif #endif #endif -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_pow_1u5.c b/contrib/arm-optimized-routines/pl/math/v_pow_1u5.c new file mode 100644 index 000000000000..9053347d4e35 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_pow_1u5.c @@ -0,0 +1,259 @@ +/* + * Double-precision vector pow function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Defines parameters of the approximation and scalar fallback. */ +#include "finite_pow.h" + +#define VecSmallExp v_u64 (SmallExp) +#define VecThresExp v_u64 (ThresExp) + +#define VecSmallPowX v_u64 (SmallPowX) +#define VecThresPowX v_u64 (ThresPowX) +#define VecSmallPowY v_u64 (SmallPowY) +#define VecThresPowY v_u64 (ThresPowY) + +static const struct data +{ + float64x2_t log_poly[7]; + float64x2_t exp_poly[3]; + float64x2_t ln2_hi, ln2_lo; + float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n; +} data = { + /* Coefficients copied from v_pow_log_data.c + relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] + Coefficients are scaled to match the scaling during evaluation. */ + .log_poly = { V2 (-0x1p-1), V2 (0x1.555555555556p-2 * -2), + V2 (-0x1.0000000000006p-2 * -2), V2 (0x1.999999959554ep-3 * 4), + V2 (-0x1.555555529a47ap-3 * 4), V2 (0x1.2495b9b4845e9p-3 * -8), + V2 (-0x1.0002b8b263fc3p-3 * -8) }, + .ln2_hi = V2 (0x1.62e42fefa3800p-1), + .ln2_lo = V2 (0x1.ef35793c76730p-45), + /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549 + (0.550 without fma) if |x| < ln2/512. */ + .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3), + V2 (0x1.5555576a5adcep-5) }, + .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */ + .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */ + .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */ + .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45), +}; + +#define A(i) data.log_poly[i] +#define C(i) data.exp_poly[i] + +/* This version implements an algorithm close to AOR scalar pow but + - does not implement the trick in the exp's specialcase subroutine to avoid + double-rounding, + - does not use a tail in the exponential core computation, + - and pow's exp polynomial order and table bits might differ. + + Maximum measured error is 1.04 ULPs: + _ZGVnN2vv_pow(0x1.024a3e56b3c3p-136, 0x1.87910248b58acp-13) + got 0x1.f71162f473251p-1 + want 0x1.f71162f473252p-1. */ + +static inline float64x2_t +v_masked_lookup_f64 (const double *table, uint64x2_t i) +{ + return (float64x2_t){ + table[(i[0] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)], + table[(i[1] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)] + }; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline float64x2_t +v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) +{ + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
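     The top V_POW_LOG_TABLE_BITS bits of tmp's mantissa select the table
     entry via v_masked_lookup_f64.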
*/
+  uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
+  int64x2_t k
+    = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
+  uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
+  float64x2_t z = vreinterpretq_f64_u64 (iz);
+  float64x2_t kd = vcvtq_f64_s64 (k);
+  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
+  float64x2_t invc = v_masked_lookup_f64 (__v_pow_log_data.invc, tmp);
+  float64x2_t logc = v_masked_lookup_f64 (__v_pow_log_data.logc, tmp);
+  float64x2_t logctail = v_masked_lookup_f64 (__v_pow_log_data.logctail, tmp);
+  /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
+  float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
+  /* k*Ln2 + log(c) + r. */
+  float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
+  float64x2_t t2 = vaddq_f64 (t1, r);
+  float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
+  float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
+  /* Evaluation is optimized assuming superscalar pipelined execution. */
+  float64x2_t ar = vmulq_f64 (A (0), r);
+  float64x2_t ar2 = vmulq_f64 (r, ar);
+  float64x2_t ar3 = vmulq_f64 (r, ar2);
+  /* k*Ln2 + log(c) + r + A[0]*r*r. */
+  float64x2_t hi = vaddq_f64 (t2, ar2);
+  float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
+  float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
+  /* p = log1p(r) - r - A[0]*r*r. */
+  float64x2_t a56 = vfmaq_f64 (A (5), r, A (6));
+  float64x2_t a34 = vfmaq_f64 (A (3), r, A (4));
+  float64x2_t a12 = vfmaq_f64 (A (1), r, A (2));
+  float64x2_t p = vfmaq_f64 (a34, ar2, a56);
+  p = vfmaq_f64 (a12, ar2, p);
+  p = vmulq_f64 (ar3, p);
+  float64x2_t lo
+    = vaddq_f64 (vaddq_f64 (vaddq_f64 (vaddq_f64 (lo1, lo2), lo3), lo4), p);
+  float64x2_t y = vaddq_f64 (hi, lo);
+  *tail = vaddq_f64 (vsubq_f64 (hi, y), lo);
+  return y;
+}
+
+/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
+static inline float64x2_t
+v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
+{
+  /* Fall back to scalar exp_inline for all lanes if any lane
+     contains a value of x s.t. |x| <= 2^-54 or >= 512. */
+  uint64x2_t abstop
+    = vandq_u64 (vshrq_n_u64 (vreinterpretq_u64_f64 (x), 52), v_u64 (0x7ff));
+  uint64x2_t uoflowx
+    = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
+  if (unlikely (v_any_u64 (uoflowx)))
+    return v_call2_f64 (exp_nosignbias, x, xtail, x, v_u64 (-1));
+  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+  /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
+  float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
+  /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+  float64x2_t kd = vaddq_f64 (z, d->shift);
+  uint64x2_t ki = vreinterpretq_u64_f64 (kd);
+  kd = vsubq_f64 (kd, d->shift);
+  float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
+  r = vfmsq_f64 (r, kd, d->ln2_lo_n);
+  /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+  r = vaddq_f64 (r, xtail);
+  /* 2^(k/N) ~= scale. */
+  uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
+  uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
+  /* This is only a valid scale when -1023*N < k < 1024*N. */
+  uint64x2_t sbits = v_lookup_u64 (SBits, idx);
+  sbits = vaddq_u64 (sbits, top);
+  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1).
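     The polynomial below computes tmp ~= exp(r) - 1
     = r + r^2*(C0 + C1*r + C2*r^2), so the final fma returns
     scale + scale*tmp.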
*/ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t tmp = vfmaq_f64 (C (1), r, C (2)); + tmp = vfmaq_f64 (C (0), r, tmp); + tmp = vfmaq_f64 (r, r2, tmp); + float64x2_t scale = vreinterpretq_f64_u64 (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return vfmaq_f64 (scale, scale, tmp); +} + +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + /* Case of x <= 0 is too complicated to be vectorised efficiently here, + fallback to scalar pow for all lanes if any x < 0 detected. */ + if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x)))) + return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1)); + + uint64x2_t vix = vreinterpretq_u64_f64 (x); + uint64x2_t viy = vreinterpretq_u64_f64 (y); + uint64x2_t vtopx = vshrq_n_u64 (vix, 52); + uint64x2_t vtopy = vshrq_n_u64 (viy, 52); + uint64x2_t vabstopx = vandq_u64 (vtopx, v_u64 (0x7ff)); + uint64x2_t vabstopy = vandq_u64 (vtopy, v_u64 (0x7ff)); + + /* Special cases of x or y. */ +#if WANT_SIMD_EXCEPT + /* Small or large. */ + uint64x2_t specialx + = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX); + uint64x2_t specialy + = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY); +#else + /* Inf or nan. */ + uint64x2_t specialx = vcgeq_u64 (vabstopx, v_u64 (0x7ff)); + uint64x2_t specialy = vcgeq_u64 (vabstopy, v_u64 (0x7ff)); + /* The case y==0 does not trigger a special case, since in this case it is + necessary to fix the result only if x is a signalling nan, which already + triggers a special case. We test y==0 directly in the scalar fallback. */ +#endif + uint64x2_t special = vorrq_u64 (specialx, specialy); + /* Fallback to scalar on all lanes if any lane is inf or nan. */ + if (unlikely (v_any_u64 (special))) + return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1)); + + /* Small cases of x: |x| < 0x1p-126. */ + uint64x2_t smallx = vcltq_u64 (vabstopx, VecSmallPowX); + if (unlikely (v_any_u64 (smallx))) + { + /* Update ix if top 12 bits of x are 0. */ + uint64x2_t sub_x = vceqzq_u64 (vtopx); + if (unlikely (v_any_u64 (sub_x))) + { + /* Normalize subnormal x so exponent becomes negative. */ + uint64x2_t vix_norm + = vreinterpretq_u64_f64 (vmulq_f64 (x, v_f64 (0x1p52))); + vix_norm = vandq_u64 (vix_norm, v_u64 (0x7fffffffffffffff)); + vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52)); + vix = vbslq_u64 (sub_x, vix_norm, vix); + } + } + + /* Vector Log(ix, &lo). */ + float64x2_t vlo; + float64x2_t vhi = v_log_inline (vix, &vlo, d); + + /* Vector Exp(y_loghi, y_loglo). */ + float64x2_t vehi = vmulq_f64 (y, vhi); + float64x2_t velo = vmulq_f64 (y, vlo); + float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); + velo = vsubq_f64 (velo, vemi); + return v_exp_inline (vehi, velo, d); +} + +PL_SIG (V, D, 2, pow) +PL_TEST_ULP (V_NAME_D2 (pow), 0.55) +PL_TEST_EXPECT_FENV (V_NAME_D2 (pow), WANT_SIMD_EXCEPT) +/* Wide intervals spanning the whole domain but shared between x and y. 
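   Each interval below is expanded to all four sign combinations of x and y.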
*/ +#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ + PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define EXPAND(str) str##000000000 +#define SHL52(str) EXPAND (str) +V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000) +V_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000) +V_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000) +V_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000) +V_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000) +V_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000) +V_POW_INTERVAL2 (0, inf, 0, inf, 1000) +/* x~1 or y~1. */ +V_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000) +V_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) +V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000) +/* around argmaxs of ULP error. */ +V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) +V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) +/* x is negative, y is odd or even integer, or y is real not integer. */ +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +/* 1.0^y. */ +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) diff --git a/contrib/arm-optimized-routines/math/v_exp_data.c b/contrib/arm-optimized-routines/pl/math/v_pow_exp_data.c similarity index 64% rename from contrib/arm-optimized-routines/math/v_exp_data.c rename to contrib/arm-optimized-routines/pl/math/v_pow_exp_data.c index 30421da81429..5d921ef648a4 100644 --- a/contrib/arm-optimized-routines/math/v_exp_data.c +++ b/contrib/arm-optimized-routines/pl/math/v_pow_exp_data.c @@ -1,147 +1,34 @@ /* - * Lookup table for double-precision e^x vector function. + * Shared data between exp, exp2 and pow. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "v_exp.h" -#if WANT_VMATH +#include "math_config.h" -#define N (1 << V_EXP_TABLE_BITS) +#define N (1 << V_POW_EXP_TABLE_BITS) -/* 2^(j/N), j=0..N. 
*/ -const u64_t __v_exp_data[] = { -#if N == 128 -0x3ff0000000000000, -0x3feff63da9fb3335, -0x3fefec9a3e778061, -0x3fefe315e86e7f85, -0x3fefd9b0d3158574, -0x3fefd06b29ddf6de, -0x3fefc74518759bc8, -0x3fefbe3ecac6f383, -0x3fefb5586cf9890f, -0x3fefac922b7247f7, -0x3fefa3ec32d3d1a2, -0x3fef9b66affed31b, -0x3fef9301d0125b51, -0x3fef8abdc06c31cc, -0x3fef829aaea92de0, -0x3fef7a98c8a58e51, -0x3fef72b83c7d517b, -0x3fef6af9388c8dea, -0x3fef635beb6fcb75, -0x3fef5be084045cd4, -0x3fef54873168b9aa, -0x3fef4d5022fcd91d, -0x3fef463b88628cd6, -0x3fef3f49917ddc96, -0x3fef387a6e756238, -0x3fef31ce4fb2a63f, -0x3fef2b4565e27cdd, -0x3fef24dfe1f56381, -0x3fef1e9df51fdee1, -0x3fef187fd0dad990, -0x3fef1285a6e4030b, -0x3fef0cafa93e2f56, -0x3fef06fe0a31b715, -0x3fef0170fc4cd831, -0x3feefc08b26416ff, -0x3feef6c55f929ff1, -0x3feef1a7373aa9cb, -0x3feeecae6d05d866, -0x3feee7db34e59ff7, -0x3feee32dc313a8e5, -0x3feedea64c123422, -0x3feeda4504ac801c, -0x3feed60a21f72e2a, -0x3feed1f5d950a897, -0x3feece086061892d, -0x3feeca41ed1d0057, -0x3feec6a2b5c13cd0, -0x3feec32af0d7d3de, -0x3feebfdad5362a27, -0x3feebcb299fddd0d, -0x3feeb9b2769d2ca7, -0x3feeb6daa2cf6642, -0x3feeb42b569d4f82, -0x3feeb1a4ca5d920f, -0x3feeaf4736b527da, -0x3feead12d497c7fd, -0x3feeab07dd485429, -0x3feea9268a5946b7, -0x3feea76f15ad2148, -0x3feea5e1b976dc09, -0x3feea47eb03a5585, -0x3feea34634ccc320, -0x3feea23882552225, -0x3feea155d44ca973, -0x3feea09e667f3bcd, -0x3feea012750bdabf, -0x3fee9fb23c651a2f, -0x3fee9f7df9519484, -0x3fee9f75e8ec5f74, -0x3fee9f9a48a58174, -0x3fee9feb564267c9, -0x3feea0694fde5d3f, -0x3feea11473eb0187, -0x3feea1ed0130c132, -0x3feea2f336cf4e62, -0x3feea427543e1a12, -0x3feea589994cce13, -0x3feea71a4623c7ad, -0x3feea8d99b4492ed, -0x3feeaac7d98a6699, -0x3feeace5422aa0db, -0x3feeaf3216b5448c, -0x3feeb1ae99157736, -0x3feeb45b0b91ffc6, -0x3feeb737b0cdc5e5, -0x3feeba44cbc8520f, -0x3feebd829fde4e50, -0x3feec0f170ca07ba, -0x3feec49182a3f090, -0x3feec86319e32323, -0x3feecc667b5de565, -0x3feed09bec4a2d33, -0x3feed503b23e255d, -0x3feed99e1330b358, -0x3feede6b5579fdbf, -0x3feee36bbfd3f37a, -0x3feee89f995ad3ad, -0x3feeee07298db666, -0x3feef3a2b84f15fb, -0x3feef9728de5593a, -0x3feeff76f2fb5e47, -0x3fef05b030a1064a, -0x3fef0c1e904bc1d2, -0x3fef12c25bd71e09, -0x3fef199bdd85529c, -0x3fef20ab5fffd07a, -0x3fef27f12e57d14b, -0x3fef2f6d9406e7b5, -0x3fef3720dcef9069, -0x3fef3f0b555dc3fa, -0x3fef472d4a07897c, -0x3fef4f87080d89f2, -0x3fef5818dcfba487, -0x3fef60e316c98398, -0x3fef69e603db3285, -0x3fef7321f301b460, -0x3fef7c97337b9b5f, -0x3fef864614f5a129, -0x3fef902ee78b3ff6, -0x3fef9a51fbc74c83, -0x3fefa4afa2a490da, -0x3fefaf482d8e67f1, -0x3fefba1bee615a27, -0x3fefc52b376bba97, -0x3fefd0765b6e4540, -0x3fefdbfdad9cbe14, -0x3fefe7c1819e90d8, -0x3feff3c22b8f71f1, -#elif N == 256 +const struct v_pow_exp_data __v_pow_exp_data = { +// exp polynomial coefficients. +.poly = { +// abs error: 1.43*2^-58 +// ulp error: 0.549 (0.550 without fma) +// if |x| < ln2/512 +0x1.fffffffffffd4p-2, +0x1.5555571d6ef9p-3, +0x1.5555576a5adcep-5, +}, +// N/ln2 +.n_over_ln2 = 0x1.71547652b82fep0 * N, +// ln2/N +.ln2_over_n_hi = 0x1.62e42fefc0000p-9, +.ln2_over_n_lo = -0x1.c610ca86c3899p-45, +// Used for rounding to nearest integer without using intrinsics. 
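+// Adding 0x1.8p52 = 1.5*2^52 to x with |x| < 2^51 leaves the rounded integer
+// in the low mantissa bits: (x + 0x1.8p52) - 0x1.8p52 rounds x to nearest.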
+.shift = 0x1.8p52,
+// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
+// sbits[k] = asuint64(H[k]) - (k << 52)/N
+.sbits = {
 0x3ff0000000000000,
 0x3feffb1afa5abcbf,
 0x3feff63da9fb3335,
@@ -398,6 +285,5 @@ const u64_t __v_exp_data[] = {
 0x3fefedba3692d514,
 0x3feff3c22b8f71f1,
 0x3feff9d96b2a23d9,
-#endif
+},
 };
-#endif
diff --git a/contrib/arm-optimized-routines/pl/math/v_pow_log_data.c b/contrib/arm-optimized-routines/pl/math/v_pow_log_data.c
new file mode 100644
index 000000000000..036faa5c97c1
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_pow_log_data.c
@@ -0,0 +1,174 @@
+/*
+ * Data for the log part of pow.
+ *
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << V_POW_LOG_TABLE_BITS)
+
+/* Algorithm:
+
+	x = 2^k z
+	log(x) = k ln2 + log(c) + log(z/c)
+	log(z/c) = poly(z/c - 1)
+
+   where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals
+   and z falls into the ith one, then table entries are computed as
+
+	tab[i].invc = 1/c
+	tab[i].logc = round(0x1p43*log(c))/0x1p43
+	tab[i].logctail = (double)(log(c) - logc)
+
+   where c is chosen near the center of the subinterval such that 1/c has only
+   a few precision bits so z/c - 1 is exactly representable as double:
+
+	1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2
+
+   Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| <
+   0x1p-97, the last few bits of logc are rounded away so k*ln2hi + logc has
+   no rounding error and the interval for z is selected such that near x == 1,
+   where log(x) is tiny, large cancellation error is avoided in
+   logc + poly(z/c - 1). */
const struct v_pow_log_data __v_pow_log_data = {
+  /* relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
+     Coefficients are scaled to match the scaling during evaluation.
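     (Each coefficient carries an extra factor of -2, 4 or -8, matching the
     scaled log_poly initializer in v_pow_1u5.c, so the powers of A[0] = -1/2
     accumulated in ar, ar2 and ar3 cancel out during evaluation.)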
*/ + .poly = { -0x1p-1, -0x1.555555555556p-1, 0x1.0000000000006p-1, + 0x1.999999959554ep-1, -0x1.555555529a47ap-1, -0x1.2495b9b4845e9p0, + 0x1.0002b8b263fc3p0, }, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .invc = { 0x1.6a00000000000p+0, 0x1.6800000000000p+0, 0x1.6600000000000p+0, + 0x1.6400000000000p+0, 0x1.6200000000000p+0, 0x1.6000000000000p+0, + 0x1.5e00000000000p+0, 0x1.5c00000000000p+0, 0x1.5a00000000000p+0, + 0x1.5800000000000p+0, 0x1.5600000000000p+0, 0x1.5600000000000p+0, + 0x1.5400000000000p+0, 0x1.5200000000000p+0, 0x1.5000000000000p+0, + 0x1.4e00000000000p+0, 0x1.4c00000000000p+0, 0x1.4a00000000000p+0, + 0x1.4a00000000000p+0, 0x1.4800000000000p+0, 0x1.4600000000000p+0, + 0x1.4400000000000p+0, 0x1.4200000000000p+0, 0x1.4000000000000p+0, + 0x1.4000000000000p+0, 0x1.3e00000000000p+0, 0x1.3c00000000000p+0, + 0x1.3a00000000000p+0, 0x1.3a00000000000p+0, 0x1.3800000000000p+0, + 0x1.3600000000000p+0, 0x1.3400000000000p+0, 0x1.3400000000000p+0, + 0x1.3200000000000p+0, 0x1.3000000000000p+0, 0x1.3000000000000p+0, + 0x1.2e00000000000p+0, 0x1.2c00000000000p+0, 0x1.2c00000000000p+0, + 0x1.2a00000000000p+0, 0x1.2800000000000p+0, 0x1.2600000000000p+0, + 0x1.2600000000000p+0, 0x1.2400000000000p+0, 0x1.2400000000000p+0, + 0x1.2200000000000p+0, 0x1.2000000000000p+0, 0x1.2000000000000p+0, + 0x1.1e00000000000p+0, 0x1.1c00000000000p+0, 0x1.1c00000000000p+0, + 0x1.1a00000000000p+0, 0x1.1a00000000000p+0, 0x1.1800000000000p+0, + 0x1.1600000000000p+0, 0x1.1600000000000p+0, 0x1.1400000000000p+0, + 0x1.1400000000000p+0, 0x1.1200000000000p+0, 0x1.1000000000000p+0, + 0x1.1000000000000p+0, 0x1.0e00000000000p+0, 0x1.0e00000000000p+0, + 0x1.0c00000000000p+0, 0x1.0c00000000000p+0, 0x1.0a00000000000p+0, + 0x1.0a00000000000p+0, 0x1.0800000000000p+0, 0x1.0800000000000p+0, + 0x1.0600000000000p+0, 0x1.0400000000000p+0, 0x1.0400000000000p+0, + 0x1.0200000000000p+0, 0x1.0200000000000p+0, 0x1.0000000000000p+0, + 0x1.0000000000000p+0, 0x1.fc00000000000p-1, 0x1.f800000000000p-1, + 0x1.f400000000000p-1, 0x1.f000000000000p-1, 0x1.ec00000000000p-1, + 0x1.e800000000000p-1, 0x1.e400000000000p-1, 0x1.e200000000000p-1, + 0x1.de00000000000p-1, 0x1.da00000000000p-1, 0x1.d600000000000p-1, + 0x1.d400000000000p-1, 0x1.d000000000000p-1, 0x1.cc00000000000p-1, + 0x1.ca00000000000p-1, 0x1.c600000000000p-1, 0x1.c400000000000p-1, + 0x1.c000000000000p-1, 0x1.be00000000000p-1, 0x1.ba00000000000p-1, + 0x1.b800000000000p-1, 0x1.b400000000000p-1, 0x1.b200000000000p-1, + 0x1.ae00000000000p-1, 0x1.ac00000000000p-1, 0x1.aa00000000000p-1, + 0x1.a600000000000p-1, 0x1.a400000000000p-1, 0x1.a000000000000p-1, + 0x1.9e00000000000p-1, 0x1.9c00000000000p-1, 0x1.9a00000000000p-1, + 0x1.9600000000000p-1, 0x1.9400000000000p-1, 0x1.9200000000000p-1, + 0x1.9000000000000p-1, 0x1.8c00000000000p-1, 0x1.8a00000000000p-1, + 0x1.8800000000000p-1, 0x1.8600000000000p-1, 0x1.8400000000000p-1, + 0x1.8200000000000p-1, 0x1.7e00000000000p-1, 0x1.7c00000000000p-1, + 0x1.7a00000000000p-1, 0x1.7800000000000p-1, 0x1.7600000000000p-1, + 0x1.7400000000000p-1, 0x1.7200000000000p-1, 0x1.7000000000000p-1, + 0x1.6e00000000000p-1, 0x1.6c00000000000p-1, }, + .logc + = { -0x1.62c82f2b9c800p-2, -0x1.5d1bdbf580800p-2, -0x1.5767717455800p-2, + -0x1.51aad872df800p-2, -0x1.4be5f95777800p-2, -0x1.4618bc21c6000p-2, + -0x1.404308686a800p-2, -0x1.3a64c55694800p-2, -0x1.347dd9a988000p-2, + -0x1.2e8e2bae12000p-2, -0x1.2895a13de8800p-2, -0x1.2895a13de8800p-2, + -0x1.22941fbcf7800p-2, -0x1.1c898c1699800p-2, -0x1.1675cababa800p-2, + -0x1.1058bf9ae4800p-2, 
-0x1.0a324e2739000p-2, -0x1.0402594b4d000p-2, + -0x1.0402594b4d000p-2, -0x1.fb9186d5e4000p-3, -0x1.ef0adcbdc6000p-3, + -0x1.e27076e2af000p-3, -0x1.d5c216b4fc000p-3, -0x1.c8ff7c79aa000p-3, + -0x1.c8ff7c79aa000p-3, -0x1.bc286742d9000p-3, -0x1.af3c94e80c000p-3, + -0x1.a23bc1fe2b000p-3, -0x1.a23bc1fe2b000p-3, -0x1.9525a9cf45000p-3, + -0x1.87fa06520d000p-3, -0x1.7ab890210e000p-3, -0x1.7ab890210e000p-3, + -0x1.6d60fe719d000p-3, -0x1.5ff3070a79000p-3, -0x1.5ff3070a79000p-3, + -0x1.526e5e3a1b000p-3, -0x1.44d2b6ccb8000p-3, -0x1.44d2b6ccb8000p-3, + -0x1.371fc201e9000p-3, -0x1.29552f81ff000p-3, -0x1.1b72ad52f6000p-3, + -0x1.1b72ad52f6000p-3, -0x1.0d77e7cd09000p-3, -0x1.0d77e7cd09000p-3, + -0x1.fec9131dbe000p-4, -0x1.e27076e2b0000p-4, -0x1.e27076e2b0000p-4, + -0x1.c5e548f5bc000p-4, -0x1.a926d3a4ae000p-4, -0x1.a926d3a4ae000p-4, + -0x1.8c345d631a000p-4, -0x1.8c345d631a000p-4, -0x1.6f0d28ae56000p-4, + -0x1.51b073f062000p-4, -0x1.51b073f062000p-4, -0x1.341d7961be000p-4, + -0x1.341d7961be000p-4, -0x1.16536eea38000p-4, -0x1.f0a30c0118000p-5, + -0x1.f0a30c0118000p-5, -0x1.b42dd71198000p-5, -0x1.b42dd71198000p-5, + -0x1.77458f632c000p-5, -0x1.77458f632c000p-5, -0x1.39e87b9fec000p-5, + -0x1.39e87b9fec000p-5, -0x1.f829b0e780000p-6, -0x1.f829b0e780000p-6, + -0x1.7b91b07d58000p-6, -0x1.fc0a8b0fc0000p-7, -0x1.fc0a8b0fc0000p-7, + -0x1.fe02a6b100000p-8, -0x1.fe02a6b100000p-8, 0x0.0000000000000p+0, + 0x0.0000000000000p+0, 0x1.0101575890000p-7, 0x1.0205658938000p-6, + 0x1.8492528c90000p-6, 0x1.0415d89e74000p-5, 0x1.466aed42e0000p-5, + 0x1.894aa149fc000p-5, 0x1.ccb73cdddc000p-5, 0x1.eea31c006c000p-5, + 0x1.1973bd1466000p-4, 0x1.3bdf5a7d1e000p-4, 0x1.5e95a4d97a000p-4, + 0x1.700d30aeac000p-4, 0x1.9335e5d594000p-4, 0x1.b6ac88dad6000p-4, + 0x1.c885801bc4000p-4, 0x1.ec739830a2000p-4, 0x1.fe89139dbe000p-4, + 0x1.1178e8227e000p-3, 0x1.1aa2b7e23f000p-3, 0x1.2d1610c868000p-3, + 0x1.365fcb0159000p-3, 0x1.4913d8333b000p-3, 0x1.527e5e4a1b000p-3, + 0x1.6574ebe8c1000p-3, 0x1.6f0128b757000p-3, 0x1.7898d85445000p-3, + 0x1.8beafeb390000p-3, 0x1.95a5adcf70000p-3, 0x1.a93ed3c8ae000p-3, + 0x1.b31d8575bd000p-3, 0x1.bd087383be000p-3, 0x1.c6ffbc6f01000p-3, + 0x1.db13db0d49000p-3, 0x1.e530effe71000p-3, 0x1.ef5ade4dd0000p-3, + 0x1.f991c6cb3b000p-3, 0x1.07138604d5800p-2, 0x1.0c42d67616000p-2, + 0x1.1178e8227e800p-2, 0x1.16b5ccbacf800p-2, 0x1.1bf99635a6800p-2, + 0x1.214456d0eb800p-2, 0x1.2bef07cdc9000p-2, 0x1.314f1e1d36000p-2, + 0x1.36b6776be1000p-2, 0x1.3c25277333000p-2, 0x1.419b423d5e800p-2, + 0x1.4718dc271c800p-2, 0x1.4c9e09e173000p-2, 0x1.522ae0738a000p-2, + 0x1.57bf753c8d000p-2, 0x1.5d5bddf596000p-2, }, + .logctail + = { 0x1.ab42428375680p-48, -0x1.ca508d8e0f720p-46, -0x1.362a4d5b6506dp-45, + -0x1.684e49eb067d5p-49, -0x1.41b6993293ee0p-47, 0x1.3d82f484c84ccp-46, + 0x1.c42f3ed820b3ap-50, 0x1.0b1c686519460p-45, 0x1.5594dd4c58092p-45, + 0x1.67b1e99b72bd8p-45, 0x1.5ca14b6cfb03fp-46, 0x1.5ca14b6cfb03fp-46, + -0x1.65a242853da76p-46, -0x1.fafbc68e75404p-46, 0x1.f1fc63382a8f0p-46, + -0x1.6a8c4fd055a66p-45, -0x1.c6bee7ef4030ep-47, -0x1.036b89ef42d7fp-48, + -0x1.036b89ef42d7fp-48, 0x1.d572aab993c87p-47, 0x1.b26b79c86af24p-45, + -0x1.72f4f543fff10p-46, 0x1.1ba91bbca681bp-45, 0x1.7794f689f8434p-45, + 0x1.7794f689f8434p-45, 0x1.94eb0318bb78fp-46, 0x1.a4e633fcd9066p-52, + -0x1.58c64dc46c1eap-45, -0x1.58c64dc46c1eap-45, -0x1.ad1d904c1d4e3p-45, + 0x1.bbdbf7fdbfa09p-45, 0x1.bdb9072534a58p-45, 0x1.bdb9072534a58p-45, + -0x1.0e46aa3b2e266p-46, -0x1.e9e439f105039p-46, -0x1.e9e439f105039p-46, + -0x1.0de8b90075b8fp-45, 0x1.70cc16135783cp-46, 
0x1.70cc16135783cp-46, + 0x1.178864d27543ap-48, -0x1.48d301771c408p-45, -0x1.e80a41811a396p-45, + -0x1.e80a41811a396p-45, 0x1.a699688e85bf4p-47, 0x1.a699688e85bf4p-47, + -0x1.575545ca333f2p-45, 0x1.a342c2af0003cp-45, 0x1.a342c2af0003cp-45, + -0x1.d0c57585fbe06p-46, 0x1.53935e85baac8p-45, 0x1.53935e85baac8p-45, + 0x1.37c294d2f5668p-46, 0x1.37c294d2f5668p-46, -0x1.69737c93373dap-45, + 0x1.f025b61c65e57p-46, 0x1.f025b61c65e57p-46, 0x1.c5edaccf913dfp-45, + 0x1.c5edaccf913dfp-45, 0x1.47c5e768fa309p-46, 0x1.d599e83368e91p-45, + 0x1.d599e83368e91p-45, 0x1.c827ae5d6704cp-46, 0x1.c827ae5d6704cp-46, + -0x1.cfc4634f2a1eep-45, -0x1.cfc4634f2a1eep-45, 0x1.502b7f526feaap-48, + 0x1.502b7f526feaap-48, -0x1.980267c7e09e4p-45, -0x1.980267c7e09e4p-45, + -0x1.88d5493faa639p-45, -0x1.f1e7cf6d3a69cp-50, -0x1.f1e7cf6d3a69cp-50, + -0x1.9e23f0dda40e4p-46, -0x1.9e23f0dda40e4p-46, 0x0.0000000000000p+0, + 0x0.0000000000000p+0, -0x1.0c76b999d2be8p-46, -0x1.3dc5b06e2f7d2p-45, + -0x1.aa0ba325a0c34p-45, 0x1.111c05cf1d753p-47, -0x1.c167375bdfd28p-45, + -0x1.97995d05a267dp-46, -0x1.a68f247d82807p-46, -0x1.e113e4fc93b7bp-47, + -0x1.5325d560d9e9bp-45, 0x1.cc85ea5db4ed7p-45, -0x1.c69063c5d1d1ep-45, + 0x1.c1e8da99ded32p-49, 0x1.3115c3abd47dap-45, -0x1.390802bf768e5p-46, + 0x1.646d1c65aacd3p-45, -0x1.dc068afe645e0p-45, -0x1.534d64fa10afdp-45, + 0x1.1ef78ce2d07f2p-45, 0x1.ca78e44389934p-45, 0x1.39d6ccb81b4a1p-47, + 0x1.62fa8234b7289p-51, 0x1.5837954fdb678p-45, 0x1.633e8e5697dc7p-45, + 0x1.9cf8b2c3c2e78p-46, -0x1.5118de59c21e1p-45, -0x1.c661070914305p-46, + -0x1.73d54aae92cd1p-47, 0x1.7f22858a0ff6fp-47, -0x1.8724350562169p-45, + -0x1.c358d4eace1aap-47, -0x1.d4bc4595412b6p-45, -0x1.1ec72c5962bd2p-48, + -0x1.aff2af715b035p-45, 0x1.212276041f430p-51, -0x1.a211565bb8e11p-51, + 0x1.bcbecca0cdf30p-46, 0x1.89cdb16ed4e91p-48, 0x1.7188b163ceae9p-45, + -0x1.c210e63a5f01cp-45, 0x1.b9acdf7a51681p-45, 0x1.ca6ed5147bdb7p-45, + 0x1.a87deba46baeap-47, 0x1.a9cfa4a5004f4p-45, -0x1.8e27ad3213cb8p-45, + 0x1.16ecdb0f177c8p-46, 0x1.83b54b606bd5cp-46, 0x1.8e436ec90e09dp-47, + -0x1.f27ce0967d675p-45, -0x1.e20891b0ad8a4p-45, 0x1.ebe708164c759p-45, + 0x1.fadedee5d40efp-46, -0x1.a0b2a08a465dcp-47, }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_powf_data.c b/contrib/arm-optimized-routines/pl/math/v_powf_data.c new file mode 100644 index 000000000000..ded211924b80 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_powf_data.c @@ -0,0 +1,89 @@ +/* + * Coefficients for single-precision SVE pow(x) function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct v_powf_data __v_powf_data = { + .invc = { 0x1.6489890582816p+0, + 0x1.5cf19b35e3472p+0, + 0x1.55aac0e956d65p+0, + 0x1.4eb0022977e01p+0, + 0x1.47fcccda1dd1fp+0, + 0x1.418ceabab68c1p+0, + 0x1.3b5c788f1edb3p+0, + 0x1.3567de48e9c9ap+0, + 0x1.2fabc80fd19bap+0, + 0x1.2a25200ce536bp+0, + 0x1.24d108e0152e3p+0, + 0x1.1facd8ab2fbe1p+0, + 0x1.1ab614a03efdfp+0, + 0x1.15ea6d03af9ffp+0, + 0x1.1147b994bb776p+0, + 0x1.0ccbf650593aap+0, + 0x1.0875408477302p+0, + 0x1.0441d42a93328p+0, + 0x1p+0, + 0x1.f1d006c855e86p-1, + 0x1.e28c3341aa301p-1, + 0x1.d4bdf9aa64747p-1, + 0x1.c7b45a24e5803p-1, + 0x1.bb5f5eb2ed60ap-1, + 0x1.afb0bff8fe6b4p-1, + 0x1.a49badf7ab1f5p-1, + 0x1.9a14a111fc4c9p-1, + 0x1.901131f5b2fdcp-1, + 0x1.8687f73f6d865p-1, + 0x1.7d7067eb77986p-1, + 0x1.74c2c1cf97b65p-1, + 0x1.6c77f37cff2a1p-1 + }, + .logc = { -0x1.e960f97b22702p+3, + -0x1.c993406cd4db6p+3, + -0x1.aa711d9a7d0f3p+3, + -0x1.8bf37bacdce9bp+3, + -0x1.6e13b3519946ep+3, + -0x1.50cb8281e4089p+3, + -0x1.341504a237e2bp+3, + -0x1.17eaab624ffbbp+3, + -0x1.f88e708f8c853p+2, + -0x1.c24b6da113914p+2, + -0x1.8d02ee397cb1dp+2, + -0x1.58ac1223408b3p+2, + -0x1.253e6fd190e89p+2, + -0x1.e5641882c12ffp+1, + -0x1.81fea712926f7p+1, + -0x1.203e240de64a3p+1, + -0x1.8029b86a78281p0, + -0x1.85d713190fb9p-1, + 0x0p+0, + 0x1.4c1cc07312997p0, + 0x1.5e1848ccec948p+1, + 0x1.04cfcb7f1196fp+2, + 0x1.582813d463c21p+2, + 0x1.a936fa68760ccp+2, + 0x1.f81bc31d6cc4ep+2, + 0x1.2279a09fae6b1p+3, + 0x1.47ec0b6df5526p+3, + 0x1.6c71762280f1p+3, + 0x1.90155070798dap+3, + 0x1.b2e23b1d3068cp+3, + 0x1.d4e21b0daa86ap+3, + 0x1.f61e2a2f67f3fp+3 + }, + .scale = { 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_sincos_3u5.c b/contrib/arm-optimized-routines/pl/math/v_sincos_3u5.c new file mode 100644 index 000000000000..6fc014c120b8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sincos_3u5.c @@ -0,0 +1,57 @@ +/* + * Double-precision vector sincos function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincos declaration. If building + pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to + be linked against the scalar sincosf from math/. */ +#define _GNU_SOURCE +#include +#undef _GNU_SOURCE + +#include "v_math.h" +#include "pl_test.h" +#include "v_sincos_common.h" + +static void VPCS_ATTR NOINLINE +special_case (float64x2_t x, uint64x2_t special, double *out_sin, + double *out_cos) +{ + if (special[0]) + sincos (x[0], out_sin, out_cos); + if (special[1]) + sincos (x[1], out_sin + 1, out_cos + 1); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. 
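+   (In the symbol name _ZGVnN2vl8l8_sincos, following the AArch64 vector
+   function ABI, v marks the vector input and each l8 marks a pointer
+   argument that advances linearly in 8-byte steps.)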
+ Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +VPCS_ATTR void +_ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos) +{ + const struct v_sincos_data *d = ptr_barrier (&v_sincos_data); + uint64x2_t special = check_ge_rangeval (x, d); + + float64x2x2_t sc = v_sincos_inline (x, d); + + vst1q_f64 (out_sin, sc.val[0]); + vst1q_f64 (out_cos, sc.val[1]); + + if (unlikely (v_any_u64 (special))) + special_case (x, special, out_sin, out_cos); +} + +PL_TEST_ULP (_ZGVnN2v_sincos_sin, 2.73) +PL_TEST_ULP (_ZGVnN2v_sincos_cos, 2.73) +#define V_SINCOS_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n) +V_SINCOS_INTERVAL (0, 0x1p23, 500000) +V_SINCOS_INTERVAL (-0, -0x1p23, 500000) +V_SINCOS_INTERVAL (0x1p23, inf, 10000) +V_SINCOS_INTERVAL (-0x1p23, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_sincos_common.h b/contrib/arm-optimized-routines/pl/math/v_sincos_common.h new file mode 100644 index 000000000000..ee7937e0785a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sincos_common.h @@ -0,0 +1,86 @@ +/* + * Core approximation for double-precision vector sincos + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" + +static const struct v_sincos_data +{ + float64x2_t sin_poly[7], cos_poly[6], pio2[3]; + float64x2_t inv_pio2, shift, range_val; +} v_sincos_data = { + .inv_pio2 = V2 (0x1.45f306dc9c882p-1), + .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26), + V2 (0x1.1a62633145c07p-54) }, + .shift = V2 (0x1.8p52), + .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. */ + V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */ + V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10), + V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22), + V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) }, + .range_val = V2 (0x1p23), }; + +static inline uint64x2_t +check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d) +{ + return vcagtq_f64 (x, d->range_val); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +static inline float64x2x2_t +v_sincos_inline (float64x2_t x, const struct v_sincos_data *d) +{ + /* q = nearest integer to 2 * x / pi. */ + float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift); + int64x2_t n = vcvtq_s64_f64 (q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ + float64x2_t r = x; + r = vfmsq_f64 (r, q, d->pio2[0]); + r = vfmsq_f64 (r, q, d->pio2[1]); + r = vfmsq_f64 (r, q, d->pio2[2]); + + float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2; + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly); + s = vfmaq_f64 (r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). 
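     Computed below as 1 + r^2*(-1/2 + r^2*poly_cos(r^2)) with two fused
     multiply-adds.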
*/ + float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly); + c = vfmaq_f64 (v_f64 (-0.5), r2, c); + c = vfmaq_f64 (v_f64 (1), r2, c); + + /* If odd quadrant, swap cos and sin. */ + uint64x2_t swap = vtstq_s64 (n, v_s64 (1)); + float64x2_t ss = vbslq_f64 (swap, c, s); + float64x2_t cc = vbslq_f64 (swap, s, c); + + /* Fix signs according to quadrant. + ss = asdouble(asuint64(ss) ^ ((n & 2) << 62)) + cc = asdouble(asuint64(cc) & (((n + 1) & 2) << 62)). */ + uint64x2_t sin_sign + = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62); + uint64x2_t cos_sign = vshlq_n_u64 ( + vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)), + 62); + ss = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign)); + cc = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign)); + + return (float64x2x2_t){ ss, cc }; +} diff --git a/contrib/arm-optimized-routines/pl/math/v_sincosf_1u8.c b/contrib/arm-optimized-routines/pl/math/v_sincosf_1u8.c new file mode 100644 index 000000000000..bf77afaa14db --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sincosf_1u8.c @@ -0,0 +1,58 @@ +/* + * Single-precision vector sincos function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincosf declaration. If building + pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to + be linked against the scalar sincosf from math/. */ +#define _GNU_SOURCE +#include +#undef _GNU_SOURCE + +#include "v_sincosf_common.h" +#include "v_math.h" +#include "pl_test.h" + +static void VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, float *out_sin, + float *out_cos) +{ + for (int i = 0; i < 4; i++) + if (special[i]) + sincosf (x[i], out_sin + i, out_cos + i); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +VPCS_ATTR void +_ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos) +{ + const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data); + uint32x4_t special = check_ge_rangeval (x, d); + + float32x4x2_t sc = v_sincosf_inline (x, d); + + vst1q_f32 (out_sin, sc.val[0]); + vst1q_f32 (out_cos, sc.val[1]); + + if (unlikely (v_any_u32 (special))) + special_case (x, special, out_sin, out_cos); +} + +PL_TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17) +PL_TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31) +#define V_SINCOSF_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n) +V_SINCOSF_INTERVAL (0, 0x1p20, 500000) +V_SINCOSF_INTERVAL (-0, -0x1p20, 500000) +V_SINCOSF_INTERVAL (0x1p20, inf, 10000) +V_SINCOSF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_sincosf_common.h b/contrib/arm-optimized-routines/pl/math/v_sincosf_common.h new file mode 100644 index 000000000000..8239bd9f0176 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sincosf_common.h @@ -0,0 +1,84 @@ +/* + * Core approximation for single-precision vector sincos + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +const static struct v_sincosf_data +{ + float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val; +} v_sincosf_data = { + .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */ + V4 (-0x1.555546p-3), V4 (0x1.11076p-7), V4 (-0x1.994eb4p-13) }, + .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */ + V4 (0x1.55554ap-5), V4 (-0x1.6c0c1ap-10), V4 (0x1.99e0eep-16) }, + .pio2 = { V4 (0x1.921fb6p+0f), V4 (-0x1.777a5cp-25f), V4 (-0x1.ee59dap-50f) }, + .inv_pio2 = V4 (0x1.45f306p-1f), + .shift = V4 (0x1.8p23), + .range_val = V4 (0x1p20), +}; + +static inline uint32x4_t +check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d) +{ + return vcagtq_f32 (x, d->range_val); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +static inline float32x4x2_t +v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d) +{ + /* n = rint ( x / (pi/2) ). */ + float32x4_t shift = d->shift; + float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2); + q = vsubq_f32 (q, shift); + int32x4_t n = vcvtq_s32_f32 (q); + + /* Reduce x such that r is in [ -pi/4, pi/4 ]. */ + float32x4_t r = x; + r = vfmsq_f32 (r, q, d->pio2[0]); + r = vfmsq_f32 (r, q, d->pio2[1]); + r = vfmsq_f32 (r, q, d->pio2[2]); + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2); + float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]); + s = vfmaq_f32 (d->poly_sin[0], r2, s); + s = vfmaq_f32 (r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */ + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]); + float32x4_t c = vfmaq_f32 (v_f32 (-0.5), r2, d->poly_cos[0]); + c = vfmaq_f32 (c, r4, p); + c = vfmaq_f32 (v_f32 (1), c, r2); + + /* If odd quadrant, swap cos and sin. */ + uint32x4_t swap = vtstq_u32 (vreinterpretq_u32_s32 (n), v_u32 (1)); + float32x4_t ss = vbslq_f32 (swap, c, s); + float32x4_t cc = vbslq_f32 (swap, s, c); + + /* Fix signs according to quadrant. + ss = asfloat(asuint(ss) ^ ((n & 2) << 30)) + cc = asfloat(asuint(cc) & (((n + 1) & 2) << 30)). */ + uint32x4_t sin_sign + = vshlq_n_u32 (vandq_u32 (vreinterpretq_u32_s32 (n), v_u32 (2)), 30); + uint32x4_t cos_sign = vshlq_n_u32 ( + vandq_u32 (vreinterpretq_u32_s32 (vaddq_s32 (n, v_s32 (1))), v_u32 (2)), + 30); + ss = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ss), sin_sign)); + cc = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (cc), cos_sign)); + + return (float32x4x2_t){ ss, cc }; +} diff --git a/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c index 57ec66ecc282..a644f54b4a0f 100644 --- a/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c @@ -6,47 +6,73 @@ */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#define AbsMask 0x7fffffffffffffff -#define Half 0x3fe0000000000000 -#define BigBound \ - 0x4080000000000000 /* 2^9. 
expm1 helper overflows for large input.  */
-#define TinyBound \
-  0x3e50000000000000 /* 2^-26, below which sinh(x) rounds to x.  */
-#define InvLn2 v_f64 (0x1.71547652b82fep0)
-#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
-#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
-#define Shift v_f64 (0x1.8p52)
-#define One 0x3ff0000000000000
-#define C(i) v_f64 (__expm1_poly[i])
-
-#if V_SUPPORTED
-
-static inline v_f64_t
-expm1_inline (v_f64_t x)
+static const struct data
 {
+  float64x2_t poly[11];
+  float64x2_t inv_ln2, m_ln2, shift;
+  uint64x2_t halff;
+  int64x2_t onef;
+#if WANT_SIMD_EXCEPT
+  uint64x2_t tiny_bound, thresh;
+#else
+  uint64x2_t large_bound;
+#endif
+} data = {
+  /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2].  */
+  .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+            V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+            V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+            V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+            V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
+
+  .inv_ln2 = V2 (0x1.71547652b82fep0),
+  .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
+  .shift = V2 (0x1.8p52),
+
+  .halff = V2 (0x3fe0000000000000),
+  .onef = V2 (0x3ff0000000000000),
+#if WANT_SIMD_EXCEPT
+  /* 2^-26, below which sinh(x) rounds to x.  */
+  .tiny_bound = V2 (0x3e50000000000000),
+  /* asuint(large_bound) - asuint(tiny_bound).  */
+  .thresh = V2 (0x0230000000000000),
+#else
+/* 2^9. expm1 helper overflows for large input.  */
+  .large_bound = V2 (0x4080000000000000),
+#endif
+};
+
+static inline float64x2_t
+expm1_inline (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
   /* Reduce argument:
      exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
      where i = round(x / ln2)
      and f = x - i * ln2 (f in [-ln2/2, ln2/2]).  */
-  v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
-  v_s64_t i = v_to_s64_f64 (j);
-  v_f64_t f = v_fma_f64 (j, MLn2hi, x);
-  f = v_fma_f64 (j, MLn2lo, f);
+  float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
+  int64x2_t i = vcvtq_s64_f64 (j);
+  float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
+  f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
 
   /* Approximate expm1(f) using polynomial.  */
-  v_f64_t f2 = f * f, f4 = f2 * f2, f8 = f4 * f4;
-  v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f8, C), f);
+  float64x2_t f2 = vmulq_f64 (f, f);
+  float64x2_t f4 = vmulq_f64 (f2, f2);
+  float64x2_t f8 = vmulq_f64 (f4, f4);
+  float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
 
   /* t = 2^i.  */
-  v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+  float64x2_t t = vreinterpretq_f64_u64 (
+      vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef)));
 
   /* expm1(x) ~= p * t + (t - 1).  */
-  return v_fma_f64 (p, t, t - 1);
+  return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
 }
 
-static NOINLINE VPCS_ATTR v_f64_t
-special_case (v_f64_t x)
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x)
 {
   return v_call_f64 (sinh, x, x, v_u64 (-1));
 }
@@ -54,20 +80,22 @@
 /* Approximation for vector double-precision sinh(x) using expm1.
    sinh(x) = (exp(x) - exp(-x)) / 2.
    The greatest observed error is 2.57 ULP:
-   sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
-                              want 0x1.ab34e59d678d9p-2.  */
-VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x)
+   _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
+                                        want 0x1.ab34e59d678d9p-2.
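The reduction performed by expm1_inline follows from exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 = expm1(f) * 2^i + (2^i - 1), with i = round(x/ln2) and f = x - i*ln2, which is exactly the p * t + (t - 1) recombination at the end of the helper. A scalar sketch of the same steps (illustrative; libm's expm1 stands in for the Estrin polynomial):

#include <math.h>

static double
expm1_reduction_sketch (double x)
{
  double j = rint (x / M_LN2);     /* i, kept in floating point.  */
  double f = x - j * M_LN2;        /* f in [-ln2/2, ln2/2].  */
  double p = expm1 (f);            /* stand-in for the polynomial.  */
  double t = ldexp (1.0, (int) j); /* t = 2^i; built from the exponent
                                      bits in the vector code.  */
  return p * t + (t - 1);
}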
*/
+float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
 {
-  v_u64_t ix = v_as_u64_f64 (x);
-  v_u64_t iax = ix & AbsMask;
-  v_f64_t ax = v_as_f64_u64 (iax);
-  v_u64_t sign = ix & ~AbsMask;
-  v_f64_t halfsign = v_as_f64_u64 (sign | Half);
+  const struct data *d = ptr_barrier (&data);
+
+  float64x2_t ax = vabsq_f64 (x);
+  uint64x2_t sign
+      = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
+  float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff));
 
 #if WANT_SIMD_EXCEPT
-  v_u64_t special = v_cond_u64 ((iax - TinyBound) >= (BigBound - TinyBound));
+  uint64x2_t special = vcgeq_u64 (
+      vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
 #else
-  v_u64_t special = v_cond_u64 (iax >= BigBound);
+  uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound);
 #endif
 
   /* Fall back to scalar variant for all lanes if any of them are special.  */
@@ -77,18 +105,14 @@ VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x)
   /* Up to the point that expm1 overflows, we can use it to calculate sinh
      using a slight rearrangement of the definition of sinh. This allows us to
      retain acceptable accuracy for very small inputs.  */
-  v_f64_t t = expm1_inline (ax);
-  return (t + t / (t + 1)) * halfsign;
+  float64x2_t t = expm1_inline (ax);
+  t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+  return vmulq_f64 (t, halfsign);
 }
 
-VPCS_ALIAS
 PL_SIG (V, D, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (sinh), 2.08)
-PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (sinh), 0, TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (sinh), -0, -TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (sinh), TinyBound, BigBound, 500000)
-PL_TEST_INTERVAL (V_NAME (sinh), -TinyBound, -BigBound, 500000)
-PL_TEST_INTERVAL (V_NAME (sinh), BigBound, inf, 1000)
-PL_TEST_INTERVAL (V_NAME (sinh), -BigBound, -inf, 1000)
-#endif
+PL_TEST_ULP (V_NAME_D1 (sinh), 2.08)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c
index 49cf078d0651..cd8c0f08f784 100644
--- a/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c
+++ b/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c
@@ -9,61 +9,76 @@
 #include "pl_sig.h"
 #include "pl_test.h"
 
-#if V_SUPPORTED
-
 #include "v_expm1f_inline.h"
 
-#define AbsMask 0x7fffffff
-#define Half 0x3f000000
-#define BigBound \
-  0x42b0c0a7 /* 0x1.61814ep+6, above which expm1f helper overflows.  */
-#define TinyBound \
-  0x2fb504f4 /* 0x1.6a09e8p-32, below which expm1f underflows.  */
-
-static NOINLINE VPCS_ATTR v_f32_t
-special_case (v_f32_t x)
+static const struct data
 {
-  return v_call_f32 (sinhf, x, x, v_u32 (-1));
+  struct v_expm1f_data expm1f_consts;
+  uint32x4_t halff;
+#if WANT_SIMD_EXCEPT
+  uint32x4_t tiny_bound, thresh;
+#else
+  uint32x4_t oflow_bound;
+#endif
+} data = {
+  .expm1f_consts = V_EXPM1F_DATA,
+  .halff = V4 (0x3f000000),
+#if WANT_SIMD_EXCEPT
+  /* 0x1.6a09e8p-32, below which expm1f underflows.  */
+  .tiny_bound = V4 (0x2fb504f4),
+  /* asuint(oflow_bound) - asuint(tiny_bound).  */
+  .thresh = V4 (0x12fbbbb3),
+#else
+  /* 0x1.61814ep+6, above which expm1f helper overflows.
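The t + t / (t + 1) expression comes from writing both exponentials in terms of t = expm1(|x|): e^|x| - e^-|x| = (t + 1) - 1/(t + 1) = (t^2 + 2t)/(t + 1) = t + t/(t + 1). Unlike exp(x) - exp(-x), this form does not cancel catastrophically near zero. A scalar sketch (illustrative, without the special-case handling):

#include <math.h>

static double
sinh_sketch (double x)
{
  double t = expm1 (fabs (x));
  double halfsign = copysign (0.5, x); /* sinh is odd.  */
  return (t + t / (t + 1)) * halfsign;
}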
*/
+  .oflow_bound = V4 (0x42b0c0a7),
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+  return v_call_f32 (sinhf, x, y, special);
 }
 
 /* Approximation for vector single-precision sinh(x) using expm1.
    sinh(x) = (exp(x) - exp(-x)) / 2.
    The maximum error is 2.26 ULP:
-   __v_sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4.  */
-VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x)
+   _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+                                  want 0x1.e469e4p-4.  */
+float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
 {
-  v_u32_t ix = v_as_u32_f32 (x);
-  v_u32_t iax = ix & AbsMask;
-  v_f32_t ax = v_as_f32_u32 (iax);
-  v_u32_t sign = ix & ~AbsMask;
-  v_f32_t halfsign = v_as_f32_u32 (sign | Half);
+  const struct data *d = ptr_barrier (&data);
+
+  uint32x4_t ix = vreinterpretq_u32_f32 (x);
+  float32x4_t ax = vabsq_f32 (x);
+  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+  uint32x4_t sign = veorq_u32 (ix, iax);
+  float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
 
 #if WANT_SIMD_EXCEPT
-  v_u32_t special = v_cond_u32 ((iax - TinyBound) >= (BigBound - TinyBound));
+  uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+  ax = v_zerofy_f32 (ax, special);
 #else
-  v_u32_t special = v_cond_u32 (iax >= BigBound);
+  uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
 #endif
 
-  /* Fall back to the scalar variant for all lanes if any of them should trigger
-     an exception.  */
-  if (unlikely (v_any_u32 (special)))
-    return special_case (x);
-
   /* Up to the point that expm1f overflows, we can use it to calculate sinhf
-     using a slight rearrangement of the definition of asinh. This allows us to
-     retain acceptable accuracy for very small inputs.  */
-  v_f32_t t = expm1f_inline (ax);
-  return (t + t / (t + 1)) * halfsign;
+     using a slight rearrangement of the definition of sinh. This allows us
+     to retain acceptable accuracy for very small inputs.  */
+  float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
+  t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+
+  /* Fall back to the scalar variant for any lanes that should trigger an
+     exception.  */
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x, vmulq_f32 (t, halfsign), special);
+
+  return vmulq_f32 (t, halfsign);
 }
 
-VPCS_ALIAS
 PL_SIG (V, F, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (sinhf), 1.76)
-PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (sinhf), 0, TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (sinhf), -0, -TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (sinhf), TinyBound, BigBound, 100000)
-PL_TEST_INTERVAL (V_NAME (sinhf), -TinyBound, -BigBound, 100000)
-PL_TEST_INTERVAL (V_NAME (sinhf), BigBound, inf, 1000)
-PL_TEST_INTERVAL (V_NAME (sinhf), -BigBound, -inf, 1000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (sinh), 1.76)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/v_sinpi_3u1.c b/contrib/arm-optimized-routines/pl/math/v_sinpi_3u1.c
new file mode 100644
index 000000000000..8d2917ff8ecd
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_sinpi_3u1.c
@@ -0,0 +1,86 @@
+/*
+ * Double-precision vector sinpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  float64x2_t poly[10];
+} data = {
+  /* Polynomial coefficients generated using Remez algorithm,
+     see sinpi.sollya for details.  */
+  .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+            V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+            V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+            V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+            V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64).  */
+/* asuint64(0x1p64) - TinyBound.  */
+# define Thresh v_u64 (0x07f0000000000000)
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+  /* Fall back to scalar code.  */
+  y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+  return v_call_f64 (sinpi, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector double-precision sinpi(x).
+   Maximum Error 3.05 ULP:
+   _ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1
+                                        want 0x1.fb295878301cap-1.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+  uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+
+  /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+     to avoid them under/overflowing and throwing exceptions.  */
+  float64x2_t r = v_zerofy_f64 (x, cmp);
+#else
+  float64x2_t r = x;
+#endif
+
+  /* If r is odd, the sign of the result should be inverted.  */
+  uint64x2_t odd
+      = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+  /* r = x - rint(x). Range reduction to -1/2 .. 1/2.  */
+  r = vsubq_f64 (r, vrndaq_f64 (r));
+
+  /* y = sin(r).  */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t r4 = vmulq_f64 (r2, r2);
+  float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+#if WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, odd, cmp);
+#endif
+
+  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+PL_SIG (V, D, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (V_NAME_D1 (sinpi), 3.06)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/v_sinpif_3u.c b/contrib/arm-optimized-routines/pl/math/v_sinpif_3u.c
new file mode 100644
index 000000000000..3d6eeff333f7
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_sinpif_3u.c
@@ -0,0 +1,81 @@
+/*
+ * Single-precision vector sinpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  float32x4_t poly[6];
+} data = {
+  /* Taylor series coefficients for sin(pi * x).
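The reduction used by sinpi above relies on sin(pi * (n + r)) = (-1)^n * sin(pi * r): after r = x - rint(x), only the parity of the discarded integer matters, and it is applied as a sign-bit XOR. A scalar sketch (illustrative; the cast assumes |x| is small enough to fit, as on the non-special path):

#include <math.h>

static double
sinpi_sketch (double x)
{
  double n = round (x);      /* ties away from zero, like vrndaq/vcvtaq.  */
  double r = x - n;          /* r in [-1/2, 1/2].  */
  double y = sin (M_PI * r); /* stand-in for the odd polynomial.  */
  return ((long long) n & 1) ? -y : y;
}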
*/ + .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f), + V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) }, +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f). */ +# define Thresh v_u32 (0x1f000000) /* asuint32(0x1p31f) - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (sinpif, x, y, cmp); +} +#endif + +/* Approximation for vector single-precision sinpi(x) + Maximum Error 3.03 ULP: + _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1 + want 0x1.f7cd5p-1. */ +float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0 + to avoid them under/overflowing and throwing exceptions. */ + float32x4_t r = v_zerofy_f32 (x, cmp); +#else + float32x4_t r = x; +#endif + + /* If r is odd, the sign of the result should be inverted. */ + uint32x4_t odd + = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31); + + /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */ + r = vsubq_f32 (r, vrndaq_f32 (r)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); +#endif + + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +PL_SIG (V, F, 1, sinpi, -0.9, 0.9) +PL_TEST_ULP (V_NAME_F1 (sinpi), 2.54) +PL_TEST_EXPECT_FENV (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c index f87baccc4fd7..c431c8c4889e 100644 --- a/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c @@ -6,62 +6,76 @@ */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +static const struct data +{ + float64x2_t poly[9]; + float64x2_t half_pi, two_over_pi, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t range_val; +#endif +} data = { + /* Coefficients generated using FPMinimax. 
*/ + .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3), + V2 (0x1.ba1ba1bb46414p-5), V2 (0x1.664f47e5b5445p-6), + V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9), + V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11), + V2 (0x1.4e4fd14147622p-12) }, + .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 }, + .two_over_pi = V2 (0x1.45f306dc9c883p-1), + .shift = V2 (0x1.8p52), +#if !WANT_SIMD_EXCEPT + .range_val = V2 (0x1p23), +#endif +}; -#define MHalfPiHi v_f64 (__v_tan_data.neg_half_pi_hi) -#define MHalfPiLo v_f64 (__v_tan_data.neg_half_pi_lo) -#define TwoOverPi v_f64 (0x1.45f306dc9c883p-1) -#define Shift v_f64 (0x1.8p52) -#define AbsMask 0x7fffffffffffffff -#define RangeVal 0x4160000000000000 /* asuint64(2^23). */ +#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */ #define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */ -#define C(i) v_f64 (__v_tan_data.poly[i]) +#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */ /* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x) { return v_call_f64 (tan, x, x, v_u64 (-1)); } /* Vector approximation for double-precision tan. Maximum measured error is 3.48 ULP: - __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 - want -0x1.f6ccd8ecf7deap+37. */ -VPCS_ATTR -v_f64_t V_NAME (tan) (v_f64_t x) + _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 + want -0x1.f6ccd8ecf7deap+37. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x) { - v_u64_t iax = v_as_u64_f64 (x) & AbsMask; - - /* Our argument reduction cannot calculate q with sufficient accuracy for very - large inputs. Fall back to scalar routine for all lanes if any are too - large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny - input to avoid underflow. Note pl does not supply a scalar double-precision - tan, so the fallback will be statically linked from the system libm. */ + const struct data *dat = ptr_barrier (&data); + /* Our argument reduction cannot calculate q with sufficient accuracy for + very large inputs. Fall back to scalar routine for all lanes if any are + too large, or Inf/NaN. If fenv exceptions are expected, also fall back for + tiny input to avoid underflow. */ #if WANT_SIMD_EXCEPT - if (unlikely (v_any_u64 (iax - TinyBound > RangeVal - TinyBound))) -#else - if (unlikely (v_any_u64 (iax > RangeVal))) + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + /* iax - tiny_bound > range_val - tiny_bound. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh)); + if (unlikely (v_any_u64 (special))) + return special_case (x); #endif - return specialcase (x); /* q = nearest integer to 2 * x / pi. */ - v_f64_t q = v_fma_f64 (x, TwoOverPi, Shift) - Shift; - v_s64_t qi = v_to_s64_f64 (q); + float64x2_t q + = vsubq_f64 (vfmaq_f64 (dat->shift, x, dat->two_over_pi), dat->shift); + int64x2_t qi = vcvtq_s64_f64 (q); /* Use q to reduce x to r in [-pi/4, pi/4], by: r = x - q * pi/2, in extended precision. */ - v_f64_t r = x; - r = v_fma_f64 (q, MHalfPiHi, r); - r = v_fma_f64 (q, MHalfPiLo, r); + float64x2_t r = x; + r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0); + r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1); /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle formula. */ - r = r * 0.5; + r = vmulq_n_f64 (r, 0.5); /* Approximate tan(r) using order 8 polynomial. 
tan(x) is odd, so polynomial has the form: @@ -69,34 +83,38 @@ v_f64_t V_NAME (tan) (v_f64_t x) Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... Then compute the approximation by: tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ - v_f64_t r2 = r * r, r4 = r2 * r2, r8 = r4 * r4; - /* Use offset version of Estrin wrapper to evaluate from C1 onwards. */ - v_f64_t p = ESTRIN_7_ (r2, r4, r8, C, 1); - p = v_fma_f64 (p, r2, C (0)); - p = v_fma_f64 (r2, p * r, r); + float64x2_t r2 = vmulq_f64 (r, r), r4 = vmulq_f64 (r2, r2), + r8 = vmulq_f64 (r4, r4); + /* Offset coefficients to evaluate from C1 onwards. */ + float64x2_t p = v_estrin_7_f64 (r2, r4, r8, dat->poly + 1); + p = vfmaq_f64 (dat->poly[0], p, r2); + p = vfmaq_f64 (r, r2, vmulq_f64 (p, r)); /* Recombination uses double-angle formula: tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) and reciprocity around pi/2: tan(x) = 1 / (tan(pi/2 - x)) to assemble result using change-of-sign and conditional selection of - numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */ - v_f64_t n = v_fma_f64 (p, p, v_f64 (-1)); - v_f64_t d = p * 2; + numerator/denominator, dependent on odd/even-ness of q (hence quadrant). + */ + float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p); + float64x2_t d = vaddq_f64 (p, p); - v_u64_t use_recip = v_cond_u64 ((v_as_u64_s64 (qi) & 1) == 0); + uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1)); - return v_sel_f64 (use_recip, -d, n) / v_sel_f64 (use_recip, n, d); +#if !WANT_SIMD_EXCEPT + uint64x2_t special = vcageq_f64 (x, dat->range_val); + if (unlikely (v_any_u64 (special))) + return special_case (x); +#endif + + return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)), + vbslq_f64 (no_recip, d, n)); } -VPCS_ALIAS PL_SIG (V, D, 1, tan, -3.1, 3.1) -PL_TEST_ULP (V_NAME (tan), 2.99) -PL_TEST_EXPECT_FENV (V_NAME (tan), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (tan), 0, TinyBound, 5000) -PL_TEST_INTERVAL (V_NAME (tan), TinyBound, RangeVal, 100000) -PL_TEST_INTERVAL (V_NAME (tan), RangeVal, inf, 5000) -PL_TEST_INTERVAL (V_NAME (tan), -0, -TinyBound, 5000) -PL_TEST_INTERVAL (V_NAME (tan), -TinyBound, -RangeVal, 100000) -PL_TEST_INTERVAL (V_NAME (tan), -RangeVal, -inf, 5000) -#endif +PL_TEST_ULP (V_NAME_D1 (tan), 2.99) +PL_TEST_EXPECT_FENV (V_NAME_D1 (tan), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/v_tan_data.c b/contrib/arm-optimized-routines/pl/math/v_tan_data.c deleted file mode 100644 index 04e25169bd88..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_tan_data.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Coefficients and helpers for double-precision vector tan(x) function. - * - * Copyright (c) 2023, Arm Limited. 
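The reconstruction can be checked directly against the identities quoted above. With q = rint(2x/pi) and p approximating tan((x - q*pi/2)/2), the double-angle formula gives tan(x - q*pi/2) = 2p/(1 - p^2), and for odd q the reciprocity around pi/2 gives tan(x) = -cot(x - q*pi/2) = (p^2 - 1)/(2p). A scalar sketch (illustrative; libm tan stands in for the order-8 polynomial):

#include <math.h>

static double
tan_reconstruct_sketch (double x)
{
  double q = rint (x * (2 / M_PI));
  double r = (x - q * (M_PI / 2)) * 0.5; /* r in [-pi/8, pi/8].  */
  double p = tan (r);                    /* stand-in for the polynomial.  */
  double n = p * p - 1, d = 2 * p;
  /* Even q: tan(x) = -d/n = 2p/(1 - p^2).  Odd q: tan(x) = n/d.  */
  return ((long long) q & 1) ? n / d : -d / n;
}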
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "math_config.h" - -const struct v_tan_data __v_tan_data - = {.neg_half_pi_hi = -0x1.921fb54442d18p0, - .neg_half_pi_lo = -0x1.1a62633145c07p-54, - .poly - = {0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, - 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, - 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, 0x1.4e4fd14147622p-12}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c index 828466b03182..98948b0a9ecf 100644 --- a/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c @@ -6,87 +6,95 @@ */ #include "v_math.h" -#include "estrinf.h" +#include "poly_advsimd_f32.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +static const struct data +{ + float32x4_t poly[6]; + float32x4_t pi_consts; + float32x4_t shift; +#if !WANT_SIMD_EXCEPT + float32x4_t range_val; +#endif +} data = { + /* Coefficients generated using FPMinimax. */ + .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f), + V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) }, + /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */ + .pi_consts + = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f }, + .shift = V4 (0x1.8p+23f), +#if !WANT_SIMD_EXCEPT + .range_val = V4 (0x1p15f), +#endif +}; -/* Constants. */ -#define NegPio2_1 (v_f32 (-0x1.921fb6p+0f)) -#define NegPio2_2 (v_f32 (0x1.777a5cp-25f)) -#define NegPio2_3 (v_f32 (0x1.ee59dap-50f)) -#define InvPio2 (v_f32 (0x1.45f306p-1f)) -#define RangeVal (0x47000000) /* asuint32(0x1p15f). */ -#define TinyBound (0x30000000) /* asuint32 (0x1p-31). */ -#define Shift (v_f32 (0x1.8p+23f)) -#define AbsMask (v_u32 (0x7fffffff)) - -#define poly(i) v_f32 (__tanf_poly_data.poly_tan[i]) +#define RangeVal v_u32 (0x47000000) /* asuint32(0x1p15f). */ +#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f). */ +#define Thresh v_u32 (0x16000000) /* asuint32(RangeVal) - TinyBound. */ /* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) { return v_call_f32 (tanf, x, y, cmp); } /* Use a full Estrin scheme to evaluate polynomial. */ -static inline v_f32_t -eval_poly (v_f32_t z) +static inline float32x4_t +eval_poly (float32x4_t z, const struct data *d) { - v_f32_t z2 = z * z; + float32x4_t z2 = vmulq_f32 (z, z); #if WANT_SIMD_EXCEPT - /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions - are to be triggered correctly, sidestep this by fixing such lanes to 0. */ - v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound); + /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. + If fp exceptions are to be triggered correctly, + sidestep this by fixing such lanes to 0. */ + uint32x4_t will_uflow + = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound); if (unlikely (v_any_u32 (will_uflow))) - z2 = v_sel_f32 (will_uflow, v_f32 (0), z2); + z2 = vbslq_f32 (will_uflow, v_f32 (0), z2); #endif - v_f32_t z4 = z2 * z2; - return ESTRIN_5 (z, z2, z4, poly); + float32x4_t z4 = vmulq_f32 (z2, z2); + return v_estrin_5_f32 (z, z2, z4, d->poly); } -/* Fast implementation of Neon tanf. +/* Fast implementation of AdvSIMD tanf. 
Maximum error is 3.45 ULP: __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 want 0x1.ff9850p-1. */ -VPCS_ATTR -v_f32_t V_NAME (tanf) (v_f32_t x) +float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x) { - v_f32_t special_arg = x; - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & AbsMask; + const struct data *d = ptr_barrier (&data); + float32x4_t special_arg = x; /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast regression. */ #if WANT_SIMD_EXCEPT + uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x)); /* If fp exceptions are to be triggered correctly, also special-case tiny input, as this will load to overflow later. Fix any special lanes to 1 to prevent any exceptions being triggered. */ - v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound); + uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh); if (unlikely (v_any_u32 (special))) - x = v_sel_f32 (special, v_f32 (1.0f), x); + x = vbslq_f32 (special, v_f32 (1.0f), x); #else /* Otherwise, special-case large and special values. */ - v_u32_t special = v_cond_u32 (iax >= RangeVal); + uint32x4_t special = vcageq_f32 (x, d->range_val); #endif /* n = rint(x/(pi/2)). */ - v_f32_t q = v_fma_f32 (InvPio2, x, Shift); - v_f32_t n = q - Shift; - /* n is representable as a signed integer, simply convert it. */ - v_s32_t in = v_round_s32 (n); + float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3); + float32x4_t n = vsubq_f32 (q, d->shift); /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ - v_s32_t alt = in & 1; - v_u32_t pred_alt = (alt != 0); + uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1)); /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */ - v_f32_t r; - r = v_fma_f32 (NegPio2_1, n, x); - r = v_fma_f32 (NegPio2_2, n, r); - r = v_fma_f32 (NegPio2_3, n, r); + float32x4_t r; + r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0); + r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1); + r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2); /* If x lives in an interval, where |tan(x)| - is finite, then use a polynomial approximation of the form @@ -95,37 +103,25 @@ v_f32_t V_NAME (tanf) (v_f32_t x) tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use the same polynomial approximation of tan as above. */ - /* Perform additional reduction if required. */ - v_f32_t z = v_sel_f32 (pred_alt, -r, r); + /* Invert sign of r if odd quadrant. */ + float32x4_t z = vmulq_f32 (r, vbslq_f32 (pred_alt, v_f32 (-1), v_f32 (1))); /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ - v_f32_t z2 = r * r; - v_f32_t p = eval_poly (z2); - v_f32_t y = v_fma_f32 (z * z2, p, z); + float32x4_t z2 = vmulq_f32 (r, r); + float32x4_t p = eval_poly (z2, d); + float32x4_t y = vfmaq_f32 (z, vmulq_f32 (z, z2), p); /* Compute reciprocal and apply if required. */ - v_f32_t inv_y = v_div_f32 (v_f32 (1.0f), y); - y = v_sel_f32 (pred_alt, inv_y, y); - - /* Fast reduction does not handle the x = -0.0 case well, - therefore it is fixed here. 
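In scalar terms the quadrant handling above is: for q = rint(x/(pi/2)) and r = x - q*pi/2, even q leaves tan(x) = tan(r), while odd q uses tan(x) = 1/tan(-r), i.e. the cotangent reciprocity described in the comments. A sketch (illustrative only; the single-constant reduction here is much less accurate than the three-term pi/2 reduction used by the vector code):

#include <math.h>

static float
tanf_quadrant_sketch (float x)
{
  float q = rintf (x * (float) (2 / M_PI));
  float r = x - q * (float) (M_PI / 2);
  int odd = (int) q & 1;
  float z = odd ? -r : r;
  float y = tanf (z); /* stand-in for the polynomial.  */
  return odd ? 1.0f / y : y;
}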
*/ - y = v_sel_f32 (x == v_f32 (-0.0), x, y); + float32x4_t inv_y = vdivq_f32 (v_f32 (1.0f), y); if (unlikely (v_any_u32 (special))) - return specialcase (special_arg, y, special); - return y; + return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special); + return vbslq_f32 (pred_alt, inv_y, y); } -VPCS_ALIAS PL_SIG (V, F, 1, tan, -3.1, 3.1) -PL_TEST_ULP (V_NAME (tanf), 2.96) -PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-23, 0.7, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 0.7, 1.5, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 1.5, 100, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 100, 0x1p17, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p17, inf, 50000) -#endif +PL_TEST_ULP (V_NAME_F1 (tan), 2.96) +PL_TEST_EXPECT_FENV (V_NAME_F1 (tan), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c index c8b6c251d453..5de85c68da2c 100644 --- a/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c @@ -5,90 +5,102 @@ */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +static const struct data +{ + float64x2_t poly[11]; + float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; + uint64x2_t onef; + uint64x2_t thresh, tiny_bound; +} data = { + /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ + .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), + V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), + V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), + V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), + V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, -#define AbsMask v_u64 (0x7fffffffffffffff) -#define InvLn2 v_f64 (0x1.71547652b82fep0) -#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) -#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) -#define Shift v_f64 (0x1.8p52) -#define C(i) v_f64 (__expm1_poly[i]) + .inv_ln2 = V2 (0x1.71547652b82fep0), + .ln2_hi = V2 (-0x1.62e42fefa39efp-1), + .ln2_lo = V2 (-0x1.abc9e3b39803fp-56), + .shift = V2 (0x1.8p52), -#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ -#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */ -#define One v_u64 (0x3ff0000000000000) + .onef = V2 (0x3ff0000000000000), + .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */ + /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ + .thresh = V2 (0x01f241bf835f9d5f), +}; -static inline v_f64_t -expm1_inline (v_f64_t x) +static inline float64x2_t +expm1_inline (float64x2_t x, const struct data *d) { /* Helper routine for calculating exp(x) - 1. Vector port of the helper from the scalar variant of tanh. */ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. 
*/ - v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; - v_s64_t i = v_to_s64_f64 (j); - v_f64_t f = v_fma_f64 (j, MLn2hi, x); - f = v_fma_f64 (j, MLn2lo, f); + float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); + int64x2_t i = vcvtq_s64_f64 (j); + float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi); + f = vfmaq_f64 (f, j, d->ln2_lo); /* Approximate expm1(f) using polynomial. */ - v_f64_t f2 = f * f; - v_f64_t f4 = f2 * f2; - v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t f4 = vmulq_f64 (f2, f2); + float64x2_t p = vfmaq_f64 ( + f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly)); /* t = 2 ^ i. */ - v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + float64x2_t t = vreinterpretq_f64_u64 ( + vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef)); /* expm1(x) = p * t + (t - 1). */ - return v_fma_f64 (p, t, t - 1); + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t); } -static NOINLINE v_f64_t -special_case (v_f64_t x, v_f64_t y, v_u64_t special) +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) { return v_call_f64 (tanh, x, y, special); } /* Vector approximation for double-precision tanh(x), using a simplified - version of expm1. The greatest observed error is 2.75 ULP: - __v_tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 - want -0x1.ba31ba4691ab4p-3. */ -VPCS_ATTR v_f64_t V_NAME (tanh) (v_f64_t x) + version of expm1. The greatest observed error is 2.77 ULP: + _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 + want -0x1.bd6a21a163624p-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t ia = ix & AbsMask; + const struct data *d = ptr_barrier (&data); + + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + float64x2_t u = x; /* Trigger special-cases for tiny, boring and infinity/NaN. */ - v_u64_t special = v_cond_u64 ((ia - TinyBound) > (BoringBound - TinyBound)); - v_f64_t u; - + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh); +#if WANT_SIMD_EXCEPT /* To trigger fp exceptions correctly, set special lanes to a neutral value. They will be fixed up later by the special-case handler. */ if (unlikely (v_any_u64 (special))) - u = v_sel_f64 (special, v_f64 (1), x) * 2; - else - u = x * 2; + u = v_zerofy_f64 (u, special); +#endif + + u = vaddq_f64 (u, u); /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
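The identity is applied through q = expm1(2x): tanh(x) = (e^(2x) - 1)/(e^(2x) + 1) = q/(q + 2), which is the division that follows. A scalar sketch, ignoring the tiny/boring/special-case handling:

#include <math.h>

static double
tanh_sketch (double x)
{
  double q = expm1 (2 * x);
  return q / (q + 2);
}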
*/ - v_f64_t q = expm1_inline (u); - v_f64_t y = q / (q + 2); + float64x2_t q = expm1_inline (u, d); + float64x2_t qp2 = vaddq_f64 (q, v_f64 (2)); if (unlikely (v_any_u64 (special))) - return special_case (x, y, special); - return y; + return special_case (x, vdivq_f64 (q, qp2), special); + return vdivq_f64 (q, qp2); } -VPCS_ALIAS PL_SIG (V, D, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (tanh), 2.26) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (tanh)) -PL_TEST_INTERVAL (V_NAME (tanh), 0, TinyBound, 1000) -PL_TEST_INTERVAL (V_NAME (tanh), -0, -TinyBound, 1000) -PL_TEST_INTERVAL (V_NAME (tanh), TinyBound, BoringBound, 100000) -PL_TEST_INTERVAL (V_NAME (tanh), -TinyBound, -BoringBound, 100000) -PL_TEST_INTERVAL (V_NAME (tanh), BoringBound, inf, 1000) -PL_TEST_INTERVAL (V_NAME (tanh), -BoringBound, -inf, 1000) -#endif +PL_TEST_ULP (V_NAME_D1 (tanh), 2.27) +PL_TEST_EXPECT_FENV (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c index 36166118c0f0..d1cb9fb6eeb3 100644 --- a/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c +++ b/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c @@ -9,61 +9,65 @@ #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - #include "v_expm1f_inline.h" -#define BoringBound \ - 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ - negative). */ -#define AbsMask 0x7fffffff +static const struct data +{ + struct v_expm1f_data expm1f_consts; + uint32x4_t boring_bound, large_bound, onef; +} data = { + .expm1f_consts = V_EXPM1F_DATA, + /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */ + .boring_bound = V4 (0x41102cb3), + .large_bound = V4 (0x7f800000), + .onef = V4 (0x3f800000), +}; -static NOINLINE v_f32_t -special_case (v_f32_t x, v_f32_t y, v_u32_t special) +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { return v_call_f32 (tanhf, x, y, special); } -/* Approximation for single-precision vector tanh(x), using a simplified version - of expm1f. The maximum error is 2.58 ULP: - __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 - want 0x1.f9ba08p-5. */ -VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x) +/* Approximation for single-precision vector tanh(x), using a simplified + version of expm1f. The maximum error is 2.58 ULP: + _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. */ +float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & AbsMask; - v_u32_t sign = ix & ~AbsMask; - v_u32_t is_boring = v_cond_u32 (iax > BoringBound); - v_f32_t boring = v_as_f32_u32 (sign | One); + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t sign = veorq_u32 (ix, iax); + uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound); + float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef)); #if WANT_SIMD_EXCEPT /* If fp exceptions are to be triggered properly, set all special and boring - lanes to 1, which will trigger no exceptions, and fix them up later. 
*/ - v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000)); - ix = v_sel_u32 (is_boring, v_u32 (One), ix); + lanes to 0, which will trigger no exceptions, and fix them up later. */ + uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound), + vcltq_u32 (iax, v_u32 (0x34000000))); + x = v_zerofy_f32 (x, is_boring); if (unlikely (v_any_u32 (special))) - ix = v_sel_u32 (special, v_u32 (One), ix); + x = v_zerofy_f32 (x, special); #else - v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0)); + uint32x4_t special = vcgtq_u32 (iax, d->large_bound); #endif /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ - v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix)); - v_f32_t y = q / (q + 2); - y = v_sel_f32 (is_boring, boring, y); + float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts); + float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); if (unlikely (v_any_u32 (special))) - return special_case (x, y, special); - return y; + return special_case (vreinterpretq_f32_u32 (ix), + vbslq_f32 (is_boring, boring, y), special); + return vbslq_f32 (is_boring, boring, y); } -VPCS_ALIAS PL_SIG (V, F, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (tanhf), 2.09) -PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000) -PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000) -PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000) -PL_TEST_INTERVAL (V_NAME (tanhf), -0x1p-23, -0x1.205966p+3, 100000) -PL_TEST_INTERVAL (V_NAME (tanhf), 0x1.205966p+3, inf, 100) -PL_TEST_INTERVAL (V_NAME (tanhf), -0x1.205966p+3, -inf, 100) -#endif +PL_TEST_ULP (V_NAME_F1 (tanh), 2.09) +PL_TEST_EXPECT_FENV (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c deleted file mode 100644 index 649735b140f3..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_acosh. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_acosh, _ZGVnN2v_acosh) -#include "v_acosh_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c deleted file mode 100644 index 8c5f106992a7..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_acoshf. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_acoshf, _ZGVnN4v_acoshf) -#include "v_acoshf_3u1.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c deleted file mode 100644 index 0d2373b5e4b2..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_asinh. - * - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_asinh, _ZGVnN2v_asinh) -#include "v_asinh_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c deleted file mode 100644 index 6c8927f0875b..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_asinhf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_asinhf, _ZGVnN4v_asinhf) -#include "v_asinhf_2u7.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c deleted file mode 100644 index 925b5b4ef324..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atan2. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atan2, _ZGVnN2vv_atan2) -#include "v_atan2_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c deleted file mode 100644 index 51d33d50f6ef..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atan2f. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atan2f, _ZGVnN4vv_atan2f) -#include "v_atan2f_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c deleted file mode 100644 index ccebce2dc2ed..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atan. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atan, _ZGVnN2v_atan) -#include "v_atan_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c deleted file mode 100644 index b8797276d981..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atanf. - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atanf, _ZGVnN4v_atanf) -#include "v_atanf_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c deleted file mode 100644 index 19429b209b3a..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atanh. - * - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atanh, _ZGVnN2v_atanh) -#include "v_atanh_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c deleted file mode 100644 index 7de226dda054..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atanhf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atanhf, _ZGVnN4v_atanhf) -#include "v_atanhf_3u1.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c deleted file mode 100644 index 4cb0dc8cefb5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cbrt. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt) -#include "v_cbrt_2u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c deleted file mode 100644 index 40a72d8c301e..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cbrtf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_cbrtf, _ZGVnN4v_cbrtf) -#include "v_cbrtf_1u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c deleted file mode 100644 index 9bf7f026447a..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cosh. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_cosh, _ZGVnN2v_cosh) -#include "v_cosh_2u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c deleted file mode 100644 index b149cb34df61..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_coshf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_coshf, _ZGVnN4v_coshf) -#include "v_coshf_2u4.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c b/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c deleted file mode 100644 index 95bd141554e4..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erf. - * - * Copyright (c) 2019-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_erf, _ZGVnN2v_erf) -#include "v_erf_2u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c deleted file mode 100644 index 1cf6546ce715..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erfc. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_erfc, _ZGVnN2v_erfc) -#include "v_erfc_4u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c deleted file mode 100644 index ef5a21d6336c..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erfcf. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_erfcf, _ZGVnN4v_erfcf) -#include "v_erfcf_1u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c deleted file mode 100644 index ee8848ee24ed..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erff. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_erff, _ZGVnN4v_erff) -#include "v_erff_1u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c b/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c deleted file mode 100644 index 52a57feefbff..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erfc. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_exp_tail.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expf.c b/contrib/arm-optimized-routines/pl/math/vn_expf.c deleted file mode 100644 index 83e7f0a2070b..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_expf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) -#include "v_expf.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c deleted file mode 100644 index 35111e2fc221..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expm1. - * - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_expm1, _ZGVnN2v_expm1) -#include "v_expm1_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c deleted file mode 100644 index bea491f4898e..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expm1f. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_expm1f, _ZGVnN4v_expm1f) -#include "v_expm1f_1u6.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c deleted file mode 100644 index 5f32c33e059f..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log10. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log10, _ZGVnN2v_log10) -#include "v_log10_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c deleted file mode 100644 index 2673ef515df7..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log10f. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log10f, _ZGVnN4v_log10f) -#include "v_log10f_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c deleted file mode 100644 index 3f4f8d1bd297..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log1p. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log1p, _ZGVnN2v_log1p) -#include "v_log1p_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c deleted file mode 100644 index a319bc98f491..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log1pf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log1pf, _ZGVnN4v_log1pf) -#include "v_log1pf_2u1.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c b/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c deleted file mode 100644 index a87039204439..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log2. 
- * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log2, _ZGVnN2v_log2) -#include "v_log2_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c deleted file mode 100644 index b4a9cb708bae..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log2f. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f) -#include "v_log2f_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c deleted file mode 100644 index 7c881de21688..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sinh. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_sinh, _ZGVnN2v_sinh) -#include "v_sinh_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c deleted file mode 100644 index 251e73232d01..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sinhf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_sinhf, _ZGVnN4v_sinhf) -#include "v_sinhf_2u3.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c deleted file mode 100644 index a4efb065bc08..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_tan. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_tan, _ZGVnN2v_tan) -#include "v_tan_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c deleted file mode 100644 index a88cb4077b3d..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_tanf. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf) -#include "v_tanf_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c deleted file mode 100644 index cb2746cf22a5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_tanh. - * - * Copyright (c) 2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_tanh, _ZGVnN2v_tanh) -#include "v_tanh_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c deleted file mode 100644 index 47f0a7f57d05..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_tanhf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_tanhf, _ZGVnN4v_tanhf) -#include "v_tanhf_2u6.c" -#endif diff --git a/contrib/arm-optimized-routines/string/aarch64/asmdefs.h b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h index 069b146f4a69..131b95e1fea9 100644 --- a/contrib/arm-optimized-routines/string/aarch64/asmdefs.h +++ b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h @@ -21,6 +21,19 @@ #define FEATURE_1_PAC 2 /* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#ifdef __ILP32__ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 2; \ + .word 4; \ + .word 12; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .text +#else #define GNU_PROPERTY(type, value) \ .section .note.gnu.property, "a"; \ .p2align 3; \ @@ -33,6 +46,7 @@ .word value; \ .word 0; \ .text +#endif /* If set then the GNU Property Note section will be added to mark objects to support BTI and PAC-RET. */ diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S index e6527d0dac2c..9d3027d4d3cd 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S @@ -1,7 +1,7 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -56,11 +56,12 @@ ENTRY (__memcpy_aarch64_simd) PTR_ARG (1) SIZE_ARG (2) add srcend, src, count - add dstend, dstin, count cmp count, 128 b.hi L(copy_long) + add dstend, dstin, count cmp count, 32 b.hi L(copy32_128) + nop /* Small copies: 0..32 bytes. */ cmp count, 16 @@ -71,6 +72,18 @@ ENTRY (__memcpy_aarch64_simd) str B_q, [dstend, -16] ret + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + .p2align 4 /* Copy 8-15 bytes. */ L(copy16): tbz count, 3, L(copy8) @@ -80,7 +93,6 @@ L(copy16): str A_h, [dstend, -8] ret - .p2align 3 /* Copy 4-7 bytes. */ L(copy8): tbz count, 2, L(copy4) @@ -90,6 +102,19 @@ L(copy8): str B_lw, [dstend, -4] ret + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + /* Copy 0..3 bytes using a branchless sequence. */ L(copy4): cbz count, L(copy0) @@ -103,33 +128,11 @@ L(copy4): L(copy0): ret - .p2align 4 - /* Medium copies: 33..128 bytes. 
*/ -L(copy32_128): - ldp A_q, B_q, [src] - ldp C_q, D_q, [srcend, -32] - cmp count, 64 - b.hi L(copy128) - stp A_q, B_q, [dstin] - stp C_q, D_q, [dstend, -32] - ret - - .p2align 4 - /* Copy 65..128 bytes. */ -L(copy128): - ldp E_q, F_q, [src, 32] - cmp count, 96 - b.ls L(copy96) - ldp G_q, H_q, [srcend, -64] - stp G_q, H_q, [dstend, -64] -L(copy96): - stp A_q, B_q, [dstin] - stp E_q, F_q, [dstin, 32] - stp C_q, D_q, [dstend, -32] - ret - + .p2align 3 /* Copy more than 128 bytes. */ L(copy_long): + add dstend, dstin, count + /* Use backwards copy if there is an overlap. */ sub tmp1, dstin, src cmp tmp1, count @@ -166,6 +169,9 @@ L(copy64_from_end): stp A_q, B_q, [dstend, -32] ret + .p2align 4 + nop + /* Large backwards copy for overlapping copies. Copy 16 bytes and then align srcend to 16-byte alignment. */ L(copy_long_backwards): diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S new file mode 100644 index 000000000000..b45c31418717 --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S @@ -0,0 +1,21 @@ +/* + * memcpy using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memcpy_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */ + .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */ + .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! */ + ret + +END (__memcpy_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S new file mode 100644 index 000000000000..6c73017bb16f --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S @@ -0,0 +1,21 @@ +/* + * memmove using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memmove_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */ + .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */ + .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */ + ret + +END (__memmove_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memset-mops.S b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S new file mode 100644 index 000000000000..ec791493bae9 --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S @@ -0,0 +1,20 @@ +/* + * memset using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memset_aarch64_mops) + PTR_ARG (0) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19c10443 /* setp [x3]!, x2!, x1 */ + .inst 0x19c14443 /* setm [x3]!, x2!, x1 */ + .inst 0x19c18443 /* sete [x3]!, x2!, x1 */ + ret + +END (__memset_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/bench/memcpy.c b/contrib/arm-optimized-routines/string/bench/memcpy.c index 1468663e51cd..b628f9b60d96 100644 --- a/contrib/arm-optimized-routines/string/bench/memcpy.c +++ b/contrib/arm-optimized-routines/string/bench/memcpy.c @@ -1,7 +1,7 @@ /* * memcpy benchmark. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -39,6 +39,9 @@ static const struct fun # if __ARM_FEATURE_SVE F(__memcpy_aarch64_sve) # endif +# if WANT_MOPS + F(__memcpy_aarch64_mops) +# endif #elif __arm__ F(__memcpy_arm) #endif diff --git a/contrib/arm-optimized-routines/string/include/stringlib.h b/contrib/arm-optimized-routines/string/include/stringlib.h index f41a46446888..01da7ebfc18d 100644 --- a/contrib/arm-optimized-routines/string/include/stringlib.h +++ b/contrib/arm-optimized-routines/string/include/stringlib.h @@ -1,7 +1,7 @@ /* * Public API. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -52,6 +52,11 @@ size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif +# if WANT_MOPS +void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memset_aarch64_mops (void *, int, size_t); +# endif # if __ARM_FEATURE_MEMORY_TAGGING void *__mtag_tag_region (void *, size_t); void *__mtag_tag_zero_region (void *, size_t); diff --git a/contrib/arm-optimized-routines/string/test/memcpy.c b/contrib/arm-optimized-routines/string/test/memcpy.c index fa15a95b2bda..dc95844bd45a 100644 --- a/contrib/arm-optimized-routines/string/test/memcpy.c +++ b/contrib/arm-optimized-routines/string/test/memcpy.c @@ -1,7 +1,7 @@ /* * memcpy test. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -31,6 +31,9 @@ static const struct fun # if __ARM_FEATURE_SVE F(__memcpy_aarch64_sve, 1) # endif +# if WANT_MOPS + F(__memcpy_aarch64_mops, 1) +# endif #elif __arm__ F(__memcpy_arm, 0) #endif diff --git a/contrib/arm-optimized-routines/string/test/memmove.c b/contrib/arm-optimized-routines/string/test/memmove.c index 5d509c03affa..b85dd1e864ef 100644 --- a/contrib/arm-optimized-routines/string/test/memmove.c +++ b/contrib/arm-optimized-routines/string/test/memmove.c @@ -1,7 +1,7 @@ /* * memmove test. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -31,6 +31,9 @@ static const struct fun # if __ARM_FEATURE_SVE F(__memmove_aarch64_sve, 1) # endif +# if WANT_MOPS + F(__memmove_aarch64_mops, 1) +# endif #endif {0, 0, 0} // clang-format on diff --git a/contrib/arm-optimized-routines/string/test/memset.c b/contrib/arm-optimized-routines/string/test/memset.c index 5543f44bb026..7d09c267ffec 100644 --- a/contrib/arm-optimized-routines/string/test/memset.c +++ b/contrib/arm-optimized-routines/string/test/memset.c @@ -1,7 +1,7 @@ /* * memset test. * - * Copyright (c) 2019-2020, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -25,6 +25,9 @@ static const struct fun F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) +# if WANT_MOPS + F(__memset_aarch64_mops, 1) +# endif #elif __arm__ F(__memset_arm, 0) #endif
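
The __memcpy_aarch64_mops, __memmove_aarch64_mops and __memset_aarch64_mops
entry points added above are only safe on CPUs that implement FEAT_MOPS: the
cpyf*/cpy*/set* sequences are emitted as raw .inst words and are UNDEFINED on
older cores. Below is a minimal caller sketch, not part of this commit;
copy_bytes is a hypothetical helper, and it assumes a Linux kernel recent
enough to define HWCAP2_MOPS in <asm/hwcap.h> and a library built with
WANT_MOPS=1 so that stringlib.h declares the MOPS prototypes.

    /*
     * Hypothetical usage sketch -- not part of this commit.  Assumes
     * <asm/hwcap.h> provides HWCAP2_MOPS and the routines were built
     * with WANT_MOPS=1.
     */
    #include <string.h>
    #include <sys/auxv.h>
    #include <asm/hwcap.h>
    #include "stringlib.h"

    static void *
    copy_bytes (void *dst, const void *src, size_t n)
    {
    #if defined (__aarch64__) && WANT_MOPS && defined (HWCAP2_MOPS)
      /* Use the MOPS variant only when the CPU implements FEAT_MOPS;
         the raw cpyfp/cpyfm/cpyfe encodings trap elsewhere.  */
      if (getauxval (AT_HWCAP2) & HWCAP2_MOPS)
        return __memcpy_aarch64_mops (dst, src, n);
    #endif
      /* Portable fallback.  */
      return memcpy (dst, src, n);
    }

In a real libc this check would typically be done once, via an ifunc resolver
or startup-time dispatch table, rather than by reading the hwcaps on every
call; the per-call test above is only for illustration.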