Update the Arm Optimized Routine library to v24.01

Sponsored by:	Arm Ltd
Andrew Turner 2024-02-29 11:39:12 +00:00
commit 5a02ffc32e
414 changed files with 26650 additions and 10768 deletions

View File

@@ -12,7 +12,7 @@ contribution requirements are documented in README.contributors of
the appropriate subdirectory.
Regular quarterly releases are tagged as vYY.MM, the latest
-release is v23.01.
+release is v24.01.
Source code layout:

View File

@@ -1,6 +1,6 @@
# Example config.mk
#
-# Copyright (c) 2018-2022, Arm Limited.
+# Copyright (c) 2018-2023, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
# Subprojects to build
@@ -59,13 +59,14 @@ math-cflags += -ffp-contract=fast -fno-math-errno
# Use with clang.
#math-cflags += -ffp-contract=fast
-# Disable vector math code
-#math-cflags += -DWANT_VMATH=0
-# Disable/enable SVE vector math code and tests
+# Disable/enable SVE vector math code and tests.
# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
# routines only so that SVE code does not leak into scalar
# routines. It is also necessary to add it for tools (e.g. ulp,
# mathbench)
WANT_SVE_MATH = 0
ifeq ($(WANT_SVE_MATH), 1)
-math-cflags += -march=armv8.2-a+sve
+math-sve-cflags = -march=armv8-a+sve
endif
math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)

View File

@@ -1,12 +1,14 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019-2022, Arm Limited.
+# Copyright (c) 2019-2023, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
S := $(srcdir)/math
B := build/math
math-lib-srcs := $(wildcard $(S)/*.[cS])
math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
math-test-srcs := \
$(S)/test/mathtest.c \
$(S)/test/mathbench.c \
@@ -65,6 +67,8 @@ build/lib/libmathlib.a: $(math-lib-objs)
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
$(math-tools): LDLIBS += $(math-ldlibs) -lm
# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
build/bin/rtest: $(math-host-objs)
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)

View File

@@ -0,0 +1,87 @@
/*
* Double-precision vector cos function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
float64x2_t poly[7];
float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
} data = {
/* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
V2 (-0x1.9e9540300a1p-41) },
.inv_pi = V2 (0x1.45f306dc9c883p-2),
.half_pi = V2 (0x1.921fb54442d18p+0),
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
.shift = V2 (0x1.8p52),
.range_val = V2 (0x1p23)
};
#define C(i) d->poly[i]
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
{
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
return v_call_f64 (cos, x, y, cmp);
}
float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
uint64x2_t odd, cmp;
#if WANT_SIMD_EXCEPT
r = vabsq_f64 (x);
cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
vreinterpretq_u64_f64 (d->range_val));
if (unlikely (v_any_u64 (cmp)))
/* If fenv exceptions are to be triggered correctly, set any special lanes
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by the
special-case handler later. */
r = vbslq_f64 (cmp, v_f64 (1.0), r);
#else
cmp = vcageq_f64 (x, d->range_val);
r = x;
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
n = vsubq_f64 (n, d->shift);
n = vsubq_f64 (n, v_f64 (0.5));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
r = vfmsq_f64 (r, d->pi_2, n);
r = vfmsq_f64 (r, d->pi_3, n);
/* sin(r) poly approx. */
r2 = vmulq_f64 (r, r);
r3 = vmulq_f64 (r2, r);
r4 = vmulq_f64 (r2, r2);
t1 = vfmaq_f64 (C (4), C (5), r2);
t2 = vfmaq_f64 (C (2), C (3), r2);
t3 = vfmaq_f64 (C (0), C (1), r2);
y = vfmaq_f64 (t1, C (6), r4);
y = vfmaq_f64 (t2, y, r4);
y = vfmaq_f64 (t3, y, r4);
y = vfmaq_f64 (r, y, r3);
if (unlikely (v_any_u64 (cmp)))
return special_case (x, y, odd, cmp);
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
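
For illustration only (not part of the commit): the same quadrant logic as scalar code. Plain rint() and a single multiply by pi stand in for the 0x1.8p52 shift trick and the 3-part pi, and cos_sketch is a hypothetical name.

#include <math.h>

/* cos(x) = +/- sin(|x| - n*pi): n is a half-integer, and the parity of
   the rounded quotient m supplies the sign (the "odd" bit above).  */
static double
cos_sketch (double x)
{
  double ax = fabs (x);
  double m = rint ((ax + M_PI / 2) / M_PI);
  double n = m - 0.5;
  double r = ax - n * M_PI;       /* r in [-pi/2, pi/2]  */
  double s = sin (r);             /* the poly above approximates this  */
  return ((long) m & 1) ? -s : s; /* the "odd" sign flip  */
}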

View File

@@ -0,0 +1,82 @@
/*
* Single-precision vector cos function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
float32x4_t poly[4];
float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
V4 (0x1.5b2e76p-19f) },
.pi_1 = V4 (0x1.921fb6p+1f),
.pi_2 = V4 (-0x1.777a5cp-24f),
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
.shift = V4 (0x1.8p+23f),
.half_pi = V4 (0x1.921fb6p0f),
.range_val = V4 (0x1p20f)
};
#define C(i) d->poly[i]
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
{
/* Fall back to scalar code. */
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
return v_call_f32 (cosf, x, y, cmp);
}
float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, r3, y;
uint32x4_t odd, cmp;
#if WANT_SIMD_EXCEPT
r = vabsq_f32 (x);
cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
vreinterpretq_u32_f32 (d->range_val));
if (unlikely (v_any_u32 (cmp)))
/* If fenv exceptions are to be triggered correctly, set any special lanes
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by the
special-case handler later. */
r = vbslq_f32 (cmp, v_f32 (1.0f), r);
#else
cmp = vcageq_f32 (x, d->range_val);
r = x;
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
n = vsubq_f32 (n, d->shift);
n = vsubq_f32 (n, v_f32 (0.5f));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f32 (r, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
/* y = sin(r). */
r2 = vmulq_f32 (r, r);
r3 = vmulq_f32 (r2, r);
y = vfmaq_f32 (C (2), C (3), r2);
y = vfmaq_f32 (C (1), y, r2);
y = vfmaq_f32 (C (0), y, r2);
y = vfmaq_f32 (r, y, r3);
if (unlikely (v_any_u32 (cmp)))
return special_case (x, y, odd, cmp);
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}

View File

@@ -0,0 +1,125 @@
/*
* Double-precision vector e^x function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#define N (1 << V_EXP_TABLE_BITS)
#define IndexMask (N - 1)
const static volatile struct
{
float64x2_t poly[3];
float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
#if !WANT_SIMD_EXCEPT
float64x2_t special_bound, scale_thresh;
#endif
} data = {
/* maxerr: 1.88 +0.5 ulp
rel error: 1.4337*2^-53
abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
.poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
V2 (0x1.55555da646206p-5) },
#if !WANT_SIMD_EXCEPT
.scale_thresh = V2 (163840.0), /* 1280.0 * N. */
.special_bound = V2 (704.0),
#endif
.inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
.ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
.ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
.shift = V2 (0x1.8p+52)
};
#define C(i) data.poly[i]
#define Tab __v_exp_data
#if WANT_SIMD_EXCEPT
# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
{
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
routine for special lanes. */
return v_call_f64 (exp, x, y, cmp);
}
#else
# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
static inline float64x2_t VPCS_ATTR
special_case (float64x2_t s, float64x2_t y, float64x2_t n)
{
/* 2^(n/N) may overflow, break it up into s1*s2. */
uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
float64x2_t s2 = vreinterpretq_f64_u64 (
vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
float64x2_t r1 = vmulq_f64 (s1, s1);
float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
return vbslq_f64 (cmp, r1, r0);
}
#endif
float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
{
float64x2_t n, r, r2, s, y, z;
uint64x2_t cmp, u, e;
#if WANT_SIMD_EXCEPT
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
special_case to fix special lanes later. This is only necessary if fenv
exceptions are to be triggered correctly. */
float64x2_t xm = x;
uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
if (unlikely (v_any_u64 (cmp)))
x = vbslq_f64 (cmp, v_f64 (1), x);
#else
cmp = vcagtq_f64 (x, data.special_bound);
#endif
/* n = round(x/(ln2/N)). */
z = vfmaq_f64 (data.shift, x, data.inv_ln2);
u = vreinterpretq_u64_f64 (z);
n = vsubq_f64 (z, data.shift);
/* r = x - n*ln2/N. */
r = x;
r = vfmsq_f64 (r, data.ln2_hi, n);
r = vfmsq_f64 (r, data.ln2_lo, n);
e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */
r2 = vmulq_f64 (r, r);
y = vfmaq_f64 (C (0), C (1), r);
y = vfmaq_f64 (y, C (2), r2);
y = vfmaq_f64 (r, y, r2);
/* s = 2^(n/N). */
u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
if (unlikely (v_any_u64 (cmp)))
#if WANT_SIMD_EXCEPT
return special_case (xm, vfmaq_f64 (s, y, s), cmp);
#else
return special_case (s, y, n);
#endif
return vfmaq_f64 (s, y, s);
}
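
For illustration only (not part of the commit): a scalar sketch of how the scale s = 2^(n/N) is assembled for N = 128. The rounded quotient n = k*128 + j sits in the low bits of the shifted sum; its bottom 7 bits index the table and the remaining bits, shifted by 52 - 7, land in the exponent field. scale_sketch is a hypothetical name.

#include <math.h>
#include <stdint.h>
#include <string.h>

extern const uint64_t __v_exp_data[]; /* table from v_exp_data.c below  */

static double
scale_sketch (double x) /* assumes |x| well inside the special bound  */
{
  double z = 0x1.8p52 + x * (128.0 / M_LN2); /* shift-based rounding  */
  uint64_t u;
  memcpy (&u, &z, sizeof u); /* low bits now hold n = k*128 + j  */
  uint64_t sbits = __v_exp_data[u & 127] + (u << (52 - 7));
  double s;
  memcpy (&s, &sbits, sizeof s);
  return s; /* s = 2^(n/128)  */
}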

View File

@@ -0,0 +1,113 @@
/*
* Single-precision vector 2^x function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
float32x4_t poly[5];
uint32x4_t exponent_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
#endif
} data = {
/* maxerr: 1.962 ulp. */
.poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
.exponent_bias = V4 (0x3f800000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
#endif
};
#define C(i) d->poly[i]
#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
{
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
routine for special lanes. */
return v_call_f32 (exp2f, x, y, cmp);
}
#else
# define SpecialOffset v_u32 (0x82000000)
# define SpecialBias v_u32 (0x7f000000)
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
float32x4_t r = vbslq_f32 (cmp1, r1, r0);
return vbslq_f32 (cmp2, r2, r);
}
#endif
float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, scale, p, q, poly;
uint32x4_t cmp, e;
#if WANT_SIMD_EXCEPT
/* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
float32x4_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
special_case to fix special lanes later. This is only necessary if fenv
exceptions are to be triggered correctly. */
if (unlikely (v_any_u32 (cmp)))
x = vbslq_f32 (cmp, v_f32 (1), x);
#endif
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
n = vrndaq_f32 (x);
r = vsubq_f32 (x, n);
e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
cmp = vcagtq_f32 (n, d->special_bound);
#endif
r2 = vmulq_f32 (r, r);
p = vfmaq_f32 (C (1), C (0), r);
q = vfmaq_f32 (C (3), C (2), r);
q = vfmaq_f32 (q, p, r2);
p = vmulq_f32 (C (4), r);
poly = vfmaq_f32 (p, q, r2);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
#else
return special_case (poly, n, e, cmp, scale, d);
#endif
return vfmaq_f32 (scale, poly, scale);
}
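
For illustration only (not part of the commit): away from the special cases, the scale above is just the integral part of x moved into the float exponent field. pow2_sketch is a hypothetical name.

#include <stdint.h>
#include <string.h>

static float
pow2_sketch (int n) /* assumes -126 <= n <= 127 (normal range)  */
{
  uint32_t bits = ((uint32_t) n << 23) + 0x3f800000; /* exponent_bias  */
  float s;
  memcpy (&s, &bits, sizeof s);
  return s; /* == 2^n  */
}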

View File

@@ -0,0 +1,72 @@
/*
* Single-precision vector 2^x function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const float Poly[] = {
/* maxerr: 0.878 ulp. */
0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f,
0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
};
#define C0 v_f32 (Poly[0])
#define C1 v_f32 (Poly[1])
#define C2 v_f32 (Poly[2])
#define C3 v_f32 (Poly[3])
#define C4 v_f32 (Poly[4])
#define C5 v_f32 (Poly[5])
#define Shift v_f32 (0x1.8p23f)
#define InvLn2 v_f32 (0x1.715476p+0f)
#define Ln2hi v_f32 (0x1.62e4p-1f)
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
static float32x4_t VPCS_ATTR NOINLINE
specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
{
/* 2^n may overflow, break it up into s1*s2. */
uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
uint32x4_t cmp = absn > v_f32 (192.0f);
float32x4_t r1 = s1 * s1;
float32x4_t r0 = poly * s1 * s2;
return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
| (~cmp & vreinterpretq_u32_f32 (r0)));
}
float32x4_t VPCS_ATTR
_ZGVnN4v_exp2f_1u (float32x4_t x)
{
float32x4_t n, r, scale, poly, absn;
uint32x4_t cmp, e;
/* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
#if 0
float32x4_t z;
z = x + Shift;
n = z - Shift;
r = x - n;
e = vreinterpretq_u32_f32 (z) << 23;
#else
n = vrndaq_f32 (x);
r = x - n;
e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
#endif
scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
absn = vabsq_f32 (n);
cmp = absn > v_f32 (126.0f);
poly = vfmaq_f32 (C1, C0, r);
poly = vfmaq_f32 (C2, poly, r);
poly = vfmaq_f32 (C3, poly, r);
poly = vfmaq_f32 (C4, poly, r);
poly = vfmaq_f32 (C5, poly, r);
poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
if (unlikely (v_any_u32 (cmp)))
return specialcase (poly, n, e, absn);
return scale * poly;
}

View File

@@ -0,0 +1,146 @@
/*
* Lookup table for double-precision e^x vector function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
# define N (1 << V_EXP_TABLE_BITS)
/* 2^(j/N), j=0..N-1, stored with the index pre-subtracted:
tab[j] = asuint64(2^(j/N)) - (j << (52 - log2(N))), so adding the shifted
bits of the rounded quotient in v_exp.c restores the value. */
const uint64_t __v_exp_data[] = {
# if N == 128
0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061,
0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de,
0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f,
0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b,
0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0,
0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea,
0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa,
0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96,
0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd,
0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990,
0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715,
0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1,
0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7,
0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c,
0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d,
0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de,
0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7,
0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f,
0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429,
0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09,
0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225,
0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf,
0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74,
0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f,
0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62,
0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad,
0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db,
0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6,
0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50,
0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323,
0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d,
0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a,
0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb,
0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a,
0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c,
0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5,
0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c,
0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398,
0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f,
0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83,
0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27,
0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14,
0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1,
# elif N == 256
0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
0x3feff9d96b2a23d9,
# endif
};
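
For illustration only (not part of the commit): a sketch that undoes the index bias noted above to recover 2^(j/128) from an entry; tab_value_sketch is a hypothetical name.

#include <stdint.h>
#include <string.h>

extern const uint64_t __v_exp_data[];

static double
tab_value_sketch (int j) /* 0 <= j < 128  */
{
  uint64_t bits = __v_exp_data[j] + ((uint64_t) j << 45); /* undo bias  */
  double v;
  memcpy (&v, &bits, sizeof v);
  return v; /* 2^(j/128) to double precision  */
}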

View File

@@ -0,0 +1,122 @@
/*
* Single-precision vector e^x function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
float32x4_t poly[5];
float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
uint32x4_t exponent_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
#endif
} data = {
/* maxerr: 1.45358 +0.5 ulp. */
.poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
.shift = V4 (0x1.8p23f),
.inv_ln2 = V4 (0x1.715476p+0f),
.ln2_hi = V4 (0x1.62e4p-1f),
.ln2_lo = V4 (0x1.7f7d1cp-20f),
.exponent_bias = V4 (0x3f800000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
#endif
};
#define C(i) d->poly[i]
#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
{
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
routine for special lanes. */
return v_call_f32 (expf, x, y, cmp);
}
#else
# define SpecialOffset v_u32 (0x82000000)
# define SpecialBias v_u32 (0x7f000000)
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
float32x4_t r = vbslq_f32 (cmp1, r1, r0);
return vbslq_f32 (cmp2, r2, r);
}
#endif
float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, scale, p, q, poly, z;
uint32x4_t cmp, e;
#if WANT_SIMD_EXCEPT
/* asuint(x) - TinyBound >= BigBound - TinyBound. */
cmp = vcgeq_u32 (
vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
TinyBound),
SpecialBound);
float32x4_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
the special-case handler to fix special lanes later. This is only necessary
if fenv exceptions are to be triggered correctly. */
if (unlikely (v_any_u32 (cmp)))
x = vbslq_f32 (cmp, v_f32 (1), x);
#endif
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
z = vfmaq_f32 (d->shift, x, d->inv_ln2);
n = vsubq_f32 (z, d->shift);
r = vfmsq_f32 (x, n, d->ln2_hi);
r = vfmsq_f32 (r, n, d->ln2_lo);
e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
cmp = vcagtq_f32 (n, d->special_bound);
#endif
r2 = vmulq_f32 (r, r);
p = vfmaq_f32 (C (1), C (0), r);
q = vfmaq_f32 (C (3), C (2), r);
q = vfmaq_f32 (q, p, r2);
p = vmulq_f32 (C (4), r);
poly = vfmaq_f32 (p, q, r2);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
#else
return special_case (poly, n, e, cmp, scale, d);
#endif
return vfmaq_f32 (scale, poly, scale);
}
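
For illustration only (not part of the commit): the two-part Cody-Waite reduction above as scalar code. Splitting ln2 into a short hi part plus a lo remainder keeps r = x - n*ln2 accurate even though n*ln2 is inexact in a single multiply. reduce_sketch is a hypothetical name.

#include <math.h>

static float
reduce_sketch (float x, float *r)
{
  float n = rintf (x * 0x1.715476p+0f); /* round(x / ln2)  */
  *r = fmaf (n, -0x1.62e4p-1f, x);      /* r = x - n*ln2_hi  */
  *r = fmaf (n, -0x1.7f7d1cp-20f, *r);  /* r -= n*ln2_lo  */
  return n;                             /* x ~= n*ln2 + r  */
}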

View File

@@ -0,0 +1,77 @@
/*
* Single-precision vector e^x function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const float Poly[] = {
/* maxerr: 0.36565 +0.5 ulp. */
0x1.6a6000p-10f,
0x1.12718ep-7f,
0x1.555af0p-5f,
0x1.555430p-3f,
0x1.fffff4p-2f,
};
#define C0 v_f32 (Poly[0])
#define C1 v_f32 (Poly[1])
#define C2 v_f32 (Poly[2])
#define C3 v_f32 (Poly[3])
#define C4 v_f32 (Poly[4])
#define Shift v_f32 (0x1.8p23f)
#define InvLn2 v_f32 (0x1.715476p+0f)
#define Ln2hi v_f32 (0x1.62e4p-1f)
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
static float32x4_t VPCS_ATTR NOINLINE
specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
{
/* 2^n may overflow, break it up into s1*s2. */
uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
uint32x4_t cmp = absn > v_f32 (192.0f);
float32x4_t r1 = s1 * s1;
float32x4_t r0 = poly * s1 * s2;
return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
| (~cmp & vreinterpretq_u32_f32 (r0)));
}
float32x4_t VPCS_ATTR
_ZGVnN4v_expf_1u (float32x4_t x)
{
float32x4_t n, r, scale, poly, absn, z;
uint32x4_t cmp, e;
/* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
#if 1
z = vfmaq_f32 (Shift, x, InvLn2);
n = z - Shift;
r = vfmaq_f32 (x, n, -Ln2hi);
r = vfmaq_f32 (r, n, -Ln2lo);
e = vreinterpretq_u32_f32 (z) << 23;
#else
z = x * InvLn2;
n = vrndaq_f32 (z);
r = vfmaq_f32 (x, n, -Ln2hi);
r = vfmaq_f32 (r, n, -Ln2lo);
e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23;
#endif
scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
absn = vabsq_f32 (n);
cmp = absn > v_f32 (126.0f);
poly = vfmaq_f32 (C1, C0, r);
poly = vfmaq_f32 (C2, poly, r);
poly = vfmaq_f32 (C3, poly, r);
poly = vfmaq_f32 (C4, poly, r);
poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
if (unlikely (v_any_u32 (cmp)))
return specialcase (poly, n, e, absn);
return scale * poly;
}

View File

@@ -0,0 +1,100 @@
/*
* Double-precision vector log(x) function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
uint64x2_t min_norm;
uint32x4_t special_bound;
float64x2_t poly[5];
float64x2_t ln2;
uint64x2_t sign_exp_mask;
} data = {
/* Worst-case error: 1.17 + 0.5 ulp.
Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
.poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
V2 (-0x1.554e550bd501ep-3) },
.ln2 = V2 (0x1.62e42fefa39efp-1),
.min_norm = V2 (0x0010000000000000),
.special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
.sign_exp_mask = V2 (0xfff0000000000000)
};
#define A(i) d->poly[i]
#define N (1 << V_LOG_TABLE_BITS)
#define IndexMask (N - 1)
#define Off v_u64 (0x3fe6900900000000)
struct entry
{
float64x2_t invc;
float64x2_t logc;
};
static inline struct entry
lookup (uint64x2_t i)
{
/* Since N is a power of 2, n % N = n & (N - 1). */
struct entry e;
uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
e.invc = vuzp1q_f64 (e0, e1);
e.logc = vuzp2q_f64 (e0, e1);
return e;
}
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
uint32x2_t cmp)
{
return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
}
float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t z, r, r2, p, y, kd, hi;
uint64x2_t ix, iz, tmp;
uint32x2_t cmp;
int64x2_t k;
struct entry e;
ix = vreinterpretq_u64_f64 (x);
cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
vget_low_u32 (d->special_bound));
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
tmp = vsubq_u64 (ix, Off);
k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
z = vreinterpretq_f64_u64 (iz);
e = lookup (tmp);
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
kd = vcvtq_f64_s64 (k);
/* hi = r + log(c) + k*Ln2. */
hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
r2 = vmulq_f64 (r, r);
y = vfmaq_f64 (A (2), A (3), r);
p = vfmaq_f64 (A (0), A (1), r);
y = vfmaq_f64 (y, A (4), r2);
y = vfmaq_f64 (p, y, r2);
if (unlikely (v_any_u32h (cmp)))
return special_case (x, y, hi, r2, cmp);
return vfmaq_f64 (hi, y, r2);
}
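
For illustration only (not part of the commit): the k/z split above as scalar code. Subtracting Off re-centres the exponent so z = x * 2^-k lands in [Off, 2*Off) ~ [0.70, 1.41), where the table supplies a c close to z and z/c - 1 stays small. split_sketch is a hypothetical name.

#include <stdint.h>
#include <string.h>

static int64_t
split_sketch (double x, double *z) /* assumes x positive and normal  */
{
  uint64_t ix, tmp, iz;
  memcpy (&ix, &x, sizeof ix);
  tmp = ix - 0x3fe6900900000000;        /* Off  */
  int64_t k = (int64_t) tmp >> 52;      /* arithmetic shift  */
  iz = ix - (tmp & 0xfff0000000000000); /* strip 2^k from x  */
  memcpy (z, &iz, sizeof iz);
  return k;
}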

View File

@@ -0,0 +1,156 @@
/*
* Lookup table for double-precision log(x) vector function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#define N (1 << V_LOG_TABLE_BITS)
const struct v_log_data __v_log_data = {
/* Algorithm:
x = 2^k z
log(x) = k ln2 + log(c) + poly(z/c - 1)
where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables:
table[i].invc = 1/c
table[i].logc = (double)log(c)
where c is near the center of the subinterval and is chosen by trying several
floating point invc candidates around 1/center and selecting one for which
the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval
that contains 1 and the previous one got tweaked to avoid cancellation. */
.table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
{ 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
{ 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
{ 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 },
{ 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 },
{ 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 },
{ 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 },
{ 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 },
{ 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 },
{ 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 },
{ 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 },
{ 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 },
{ 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 },
{ 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 },
{ 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 },
{ 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 },
{ 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 },
{ 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 },
{ 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 },
{ 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 },
{ 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 },
{ 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 },
{ 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 },
{ 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 },
{ 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 },
{ 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 },
{ 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 },
{ 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 },
{ 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 },
{ 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 },
{ 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 },
{ 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 },
{ 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 },
{ 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 },
{ 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 },
{ 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 },
{ 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 },
{ 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 },
{ 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 },
{ 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 },
{ 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 },
{ 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 },
{ 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 },
{ 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 },
{ 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 },
{ 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 },
{ 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 },
{ 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 },
{ 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 },
{ 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 },
{ 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 },
{ 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 },
{ 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 },
{ 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 },
{ 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 },
{ 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 },
{ 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 },
{ 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 },
{ 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 },
{ 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 },
{ 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 },
{ 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 },
{ 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 },
{ 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 },
{ 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 },
{ 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 },
{ 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 },
{ 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 },
{ 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 },
{ 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 },
{ 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 },
{ 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 },
{ 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 },
{ 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 },
{ 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 },
{ 1.0, 0.0 },
{ 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 },
{ 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 },
{ 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 },
{ 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 },
{ 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 },
{ 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 },
{ 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 },
{ 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 },
{ 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 },
{ 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 },
{ 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 },
{ 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 },
{ 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 },
{ 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 },
{ 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 },
{ 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 },
{ 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 },
{ 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 },
{ 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 },
{ 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 },
{ 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 },
{ 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 },
{ 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 },
{ 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 },
{ 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 },
{ 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 },
{ 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 },
{ 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 },
{ 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 },
{ 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 },
{ 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 },
{ 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 },
{ 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 },
{ 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 },
{ 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 },
{ 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 },
{ 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 },
{ 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 },
{ 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 },
{ 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 },
{ 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 },
{ 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 },
{ 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 },
{ 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 },
{ 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 },
{ 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 },
{ 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 },
{ 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 },
{ 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 },
{ 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 },
{ 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 },
{ 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } }
};

View File

@@ -0,0 +1,74 @@
/*
* Single-precision vector log function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
uint32x4_t min_norm;
uint16x8_t special_bound;
float32x4_t poly[7];
float32x4_t ln2, tiny_bound;
uint32x4_t off, mantissa_mask;
} data = {
/* 3.34 ulp error. */
.poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
V4 (-0x1.ffffc8p-2f) },
.ln2 = V4 (0x1.62e43p-1f),
.tiny_bound = V4 (0x1p-126),
.min_norm = V4 (0x00800000),
.special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff)
};
#define P(i) d->poly[7 - i]
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
uint16x4_t cmp)
{
/* Fall back to scalar code. */
return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
}
float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, p, q, r, r2, y;
uint32x4_t u;
uint16x4_t cmp;
u = vreinterpretq_u32_f32 (x);
cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
vget_low_u16 (d->special_bound));
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
u = vsubq_u32 (u, d->off);
n = vcvtq_f32_s32 (
vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
u = vandq_u32 (u, d->mantissa_mask);
u = vaddq_u32 (u, d->off);
r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log(1+r) + n*ln2. */
r2 = vmulq_f32 (r, r);
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
p = vfmaq_f32 (P (5), P (6), r);
q = vfmaq_f32 (P (3), P (4), r);
y = vfmaq_f32 (P (1), P (2), r);
p = vfmaq_f32 (p, P (7), r2);
q = vfmaq_f32 (q, p, r2);
y = vfmaq_f32 (y, q, r2);
p = vfmaq_f32 (r, d->ln2, n);
if (unlikely (v_any_u16h (cmp)))
return special_case (x, y, r2, p, cmp);
return vfmaq_f32 (p, y, r2);
}
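
For illustration only (not part of the commit): the table-free single-precision split above as scalar code. The off constant is roughly asuint(2/3), so the rebuilt mantissa 1+r falls in [2/3, 4/3), centred on 1, and a single polynomial covers the whole range. splitf_sketch is a hypothetical name.

#include <stdint.h>
#include <string.h>

static float
splitf_sketch (float x, float *n) /* assumes x positive and normal  */
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  u -= 0x3f2aaaab;                   /* off  */
  *n = (float) ((int32_t) u >> 23);  /* exponent, sign-extended  */
  u = (u & 0x007fffff) + 0x3f2aaaab; /* mantissa back around 1  */
  float m;
  memcpy (&m, &u, sizeof m);
  return m - 1.0f;                   /* r, with 1+r in [2/3, 4/3)  */
}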

View File

@@ -0,0 +1,135 @@
/*
* Vector math abstractions.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _V_MATH_H
#define _V_MATH_H
#if !__aarch64__
# error "Cannot build without AArch64"
#endif
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
#define V_NAME_D1(fun) _ZGVnN2v_##fun
#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
#define V_NAME_D2(fun) _ZGVnN2vv_##fun
#include <stdint.h>
#include "../math_config.h"
#include <arm_neon.h>
/* Shorthand helpers for declaring constants. */
# define V2(X) { X, X }
# define V4(X) { X, X, X, X }
# define V8(X) { X, X, X, X, X, X, X, X }
static inline int
v_any_u16h (uint16x4_t x)
{
return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
}
static inline int
v_lanes32 (void)
{
return 4;
}
static inline float32x4_t
v_f32 (float x)
{
return (float32x4_t) V4 (x);
}
static inline uint32x4_t
v_u32 (uint32_t x)
{
return (uint32x4_t) V4 (x);
}
/* true if any element of a v_cond result is non-zero. */
static inline int
v_any_u32 (uint32x4_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
}
static inline int
v_any_u32h (uint32x2_t x)
{
return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
}
static inline float32x4_t
v_lookup_f32 (const float *tab, uint32x4_t idx)
{
return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
}
static inline uint32x4_t
v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
{
return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
}
static inline float32x4_t
v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
{
return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
}
static inline float32x4_t
v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
float32x4_t y, uint32x4_t p)
{
return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0],
p[1] ? f (x1[1], x2[1]) : y[1],
p[2] ? f (x1[2], x2[2]) : y[2],
p[3] ? f (x1[3], x2[3]) : y[3]};
}
static inline int
v_lanes64 (void)
{
return 2;
}
static inline float64x2_t
v_f64 (double x)
{
return (float64x2_t) V2 (x);
}
static inline uint64x2_t
v_u64 (uint64_t x)
{
return (uint64x2_t) V2 (x);
}
/* true if any element of a v_cond result is non-zero. */
static inline int
v_any_u64 (uint64x2_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_u64 (x) != 0;
}
static inline float64x2_t
v_lookup_f64 (const double *tab, uint64x2_t idx)
{
return (float64x2_t){tab[idx[0]], tab[idx[1]]};
}
static inline uint64x2_t
v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
{
return (uint64x2_t){tab[idx[0]], tab[idx[1]]};
}
static inline float64x2_t
v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
{
double p1 = p[1];
double x1 = x[1];
if (likely (p[0]))
y[0] = f (x[0]);
if (likely (p1))
y[1] = f (x1);
return y;
}
#endif
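
For illustration only (not part of the commit): the V_NAME_* macros produce AArch64 vector-function-ABI names, e.g. V_NAME_F1 (sin) expands to _ZGVnN4v_sinf (AdvSIMD 'n', unmasked 'N', 4 lanes, one vector argument 'v'). The real declarations live in mathlib.h; four_sines is a hypothetical caller.

#include <arm_neon.h>

__attribute__ ((aarch64_vector_pcs)) float32x4_t _ZGVnN4v_sinf (float32x4_t);

static float32x4_t
four_sines (float32x4_t x)
{
  return _ZGVnN4v_sinf (x); /* one call, four results  */
}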

View File

@@ -0,0 +1,22 @@
/*
* Double-precision vector pow function.
*
* Copyright (c) 2020-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
{
float64x2_t z;
for (int lane = 0; lane < v_lanes64 (); lane++)
{
double sx = x[lane];
double sy = y[lane];
double sz = pow (sx, sy);
z[lane] = sz;
}
return z;
}

View File

@@ -0,0 +1,148 @@
/*
* Single-precision vector powf function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#define Min v_u32 (0x00800000)
#define Max v_u32 (0x7f800000)
#define Thresh v_u32 (0x7f000000) /* Max - Min. */
#define MantissaMask v_u32 (0x007fffff)
#define A data.log2_poly
#define C data.exp2f_poly
/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
#define Off v_u32 (0x3f35d000)
#define V_POWF_LOG2_TABLE_BITS 5
#define V_EXP2F_TABLE_BITS 5
#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
static const struct
{
struct
{
double invc, logc;
} log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
double log2_poly[4];
uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
double exp2f_poly[3];
} data = {
.log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
{0x1p+0, 0x0p+0 * Scale},
{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
.log2_poly = { /* rel err: 1.5 * 2^-30. */
-0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale,
-0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,},
.exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
.exp2f_poly = { /* rel err: 1.69 * 2^-34. */
0x1.c6af84b912394p-5 / Scale / Scale / Scale,
0x1.ebfce50fac4f3p-3 / Scale / Scale,
0x1.62e42ff0c52d6p-1 / Scale}};
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
{
return v_call2_f32 (powf, x, y, ret, cmp);
}
float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
{
uint32x4_t u = vreinterpretq_u32_f32 (x);
uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
uint32x4_t tmp = vsubq_u32 (u, Off);
uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
Log2IdxMask);
uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
uint32x4_t iz = vsubq_u32 (u, top);
int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
float32x4_t ret;
for (int lane = 0; lane < 4; lane++)
{
/* Use double precision for each lane. */
double invc = data.log2_tab[i[lane]].invc;
double logc = data.log2_tab[i[lane]].logc;
double z = (double) asfloat (iz[lane]);
/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
double r = __builtin_fma (z, invc, -1.0);
double y0 = logc + (double) k[lane];
/* Polynomial to approximate log1p(r)/ln2. */
double logx = A[0];
logx = r * logx + A[1];
logx = r * logx + A[2];
logx = r * logx + A[3];
logx = r * logx + y0;
double ylogx = y[lane] * logx;
cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff)
>= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47
? 1
: cmp[lane];
/* N*x = k + r with r in [-1/2, 1/2]. */
double kd = round (ylogx);
uint64_t ki = lround (ylogx);
r = ylogx - kd;
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)];
t += ki << (52 - V_EXP2F_TABLE_BITS);
double s = asdouble (t);
double p = C[0];
p = __builtin_fma (p, r, C[1]);
p = __builtin_fma (p, r, C[2]);
p = __builtin_fma (p, s * r, s);
ret[lane] = p;
}
if (unlikely (v_any_u32 (cmp)))
return special_case (x, y, ret, cmp);
return ret;
}
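
For illustration only (not part of the commit): stripped of the table, bit tricks and special cases, the per-lane computation above evaluates powf(x, y) = 2^(y * log2(x)) in double precision, which is what keeps the float result within a few ULP. powf_sketch is a hypothetical name.

#include <math.h>

static float
powf_sketch (float x, float y) /* assumes x positive and finite  */
{
  double logx = log2 ((double) x);
  return (float) exp2 ((double) y * logx);
}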

View File

@@ -0,0 +1,97 @@
/*
* Double-precision vector sin function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
float64x2_t poly[7];
float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
} data = {
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
V2 (-0x1.9e9540300a1p-41) },
.range_val = V2 (0x1p23),
.inv_pi = V2 (0x1.45f306dc9c883p-2),
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
.shift = V2 (0x1.8p52),
};
#if WANT_SIMD_EXCEPT
# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
#endif
#define C(i) d->poly[i]
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
{
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
return v_call_f64 (sin, x, y, cmp);
}
/* Vector (AdvSIMD) sin approximation.
Maximum observed error in [-pi/2, pi/2], where argument is not reduced,
is 2.87 ULP:
_ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1
want 0x1.fffffffa7dc05p-1
Maximum observed error in the entire non-special domain ([-2^23, 2^23])
is 3.22 ULP:
_ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3
want 0x1.ffdcd125c84f8p-3. */
float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
uint64x2_t odd, cmp;
#if WANT_SIMD_EXCEPT
/* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be
triggered correctly, mask any special lanes with a value that is neutral
w.r.t. fenv. These lanes will be fixed by the special-case handler later. */
uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
#else
r = x;
cmp = vcageq_f64 (x, d->range_val);
#endif
/* n = rint(|x|/pi). */
n = vfmaq_f64 (d->shift, d->inv_pi, r);
odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
n = vsubq_f64 (n, d->shift);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
r = vfmsq_f64 (r, d->pi_2, n);
r = vfmsq_f64 (r, d->pi_3, n);
/* sin(r) poly approx. */
r2 = vmulq_f64 (r, r);
r3 = vmulq_f64 (r2, r);
r4 = vmulq_f64 (r2, r2);
t1 = vfmaq_f64 (C (4), C (5), r2);
t2 = vfmaq_f64 (C (2), C (3), r2);
t3 = vfmaq_f64 (C (0), C (1), r2);
y = vfmaq_f64 (t1, C (6), r4);
y = vfmaq_f64 (t2, y, r4);
y = vfmaq_f64 (t3, y, r4);
y = vfmaq_f64 (r, y, r3);
if (unlikely (v_any_u64 (cmp)))
return special_case (x, y, odd, cmp);
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
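
For illustration only (not part of the commit): the sign trick above as scalar code. sin(x) = (-1)^n * sin(x - n*pi), so the low bit of the rounded quotient, moved to bit 63, negates the result with a single XOR. sin_sketch is a hypothetical name.

#include <math.h>
#include <stdint.h>
#include <string.h>

static double
sin_sketch (double x)
{
  double n = rint (x / M_PI);    /* shift trick in the code above  */
  uint64_t odd = (uint64_t) (int64_t) n << 63;
  double s = sin (x - n * M_PI); /* poly after 3-part reduction  */
  uint64_t u;
  memcpy (&u, &s, sizeof u);
  u ^= odd;                      /* conditional negation  */
  memcpy (&s, &u, sizeof s);
  return s;
}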

View File

@@ -0,0 +1,82 @@
/*
* Single-precision vector sin function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
float32x4_t poly[4];
float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
V4 (0x1.5b2e76p-19f) },
.pi_1 = V4 (0x1.921fb6p+1f),
.pi_2 = V4 (-0x1.777a5cp-24f),
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
.shift = V4 (0x1.8p+23f),
.range_val = V4 (0x1p20f)
};
#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
#endif
#define C(i) d->poly[i]
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
{
/* Fall back to scalar code. */
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
return v_call_f32 (sinf, x, y, cmp);
}
float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, y;
uint32x4_t odd, cmp;
#if WANT_SIMD_EXCEPT
uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
/* If fenv exceptions are to be triggered correctly, mask any special lanes
with a value that is neutral w.r.t. fenv. These lanes will be fixed by the
special-case handler later. */
r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
#else
r = x;
cmp = vcageq_f32 (x, d->range_val);
#endif
/* n = rint(|x|/pi) */
n = vfmaq_f32 (d->shift, d->inv_pi, r);
odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
n = vsubq_f32 (n, d->shift);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
r = vfmsq_f32 (r, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
/* y = sin(r) */
r2 = vmulq_f32 (r, r);
y = vfmaq_f32 (C (2), C (3), r2);
y = vfmaq_f32 (C (1), y, r2);
y = vfmaq_f32 (C (0), y, r2);
y = vfmaq_f32 (r, vmulq_f32 (y, r2), r);
if (unlikely (v_any_u32 (cmp)))
return special_case (x, y, odd, cmp);
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}

View File

@@ -0,0 +1,129 @@
/*
* Double-precision 10^x function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#define N (1 << EXP_TABLE_BITS)
#define IndexMask (N - 1)
#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */
#define UFlowBound -0x1.5ep+8 /* -350. */
#define SmallTop 0x3c6 /* top12(0x1p-57). */
#define BigTop 0x407 /* top12(0x1p8). */
#define Thresh 0x41 /* BigTop - SmallTop. */
#define Shift __exp_data.shift
#define C(i) __exp_data.exp10_poly[i]
static double
special_case (uint64_t sbits, double_t tmp, uint64_t ki)
{
double_t scale, y;
if (ki - (1ull << 16) < 0x80000000)
{
/* The exponent of scale might have overflowed by 1. */
sbits -= 1ull << 52;
scale = asdouble (sbits);
y = 2 * (scale + scale * tmp);
return check_oflow (eval_as_double (y));
}
/* k < 0, need special care in the subnormal range. */
sbits += 1022ull << 52;
scale = asdouble (sbits);
y = scale + scale * tmp;
if (y < 1.0)
{
/* Round y to the right precision before scaling it into the subnormal
range to avoid double rounding that can cause 0.5+E/2 ulp error where
E is the worst-case ulp error outside the subnormal range. So this
is only useful if the goal is better than 1 ulp worst-case error. */
double_t lo = scale - y + scale * tmp;
double_t hi = 1.0 + y;
lo = 1.0 - hi + y + lo;
y = eval_as_double (hi + lo) - 1.0;
/* Avoid -0.0 with downward rounding. */
if (WANT_ROUNDING && y == 0.0)
y = 0.0;
/* The underflow exception needs to be signaled explicitly. */
force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
}
y = 0x1p-1022 * y;
return check_uflow (y);
}
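The comment above guards against the classic double-rounding pitfall: rounding the exact result to 53 bits and then again to the narrower significand width p available at that subnormal magnitude need not match a single rounding, i.e. for some exact values y and p < 53

\[
\operatorname{rnd}_{p}\bigl(\operatorname{rnd}_{53}(y)\bigr) \;\ne\; \operatorname{rnd}_{p}(y),
\]

so the hi/lo compensation rounds y to its final precision first and only then applies the 0x1p-1022 scaling.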
/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */
double
exp10 (double x)
{
uint64_t ix = asuint64 (x);
uint32_t abstop = (ix >> 52) & 0x7ff;
if (unlikely (abstop - SmallTop >= Thresh))
{
if (abstop - SmallTop >= 0x80000000)
/* Avoid spurious underflow for tiny x.
Note: 0 is common input. */
return x + 1;
if (abstop == 0x7ff)
return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
if (x >= OFlowBound)
return __math_oflow (0);
if (x < UFlowBound)
return __math_uflow (0);
/* Large x is special-cased below. */
abstop = 0;
}
/* Reduce x: z = x * N / log10(2), k = round(z). */
double_t z = __exp_data.invlog10_2N * x;
double_t kd;
int64_t ki;
#if TOINT_INTRINSICS
kd = roundtoint (z);
ki = converttoint (z);
#else
kd = eval_as_double (z + Shift);
kd -= Shift;
ki = kd;
#endif
/* r = x - k * log10(2)/N, with r in [-log10(2)/2N, log10(2)/2N]. */
double_t r = x;
r = __exp_data.neglog10_2hiN * kd + r;
r = __exp_data.neglog10_2loN * kd + r;
/* exp10(x) = 2^(k/N) * 10^r.
Approximate the two components separately. */
/* s = 2^(k/N), using lookup table. */
uint64_t e = ki << (52 - EXP_TABLE_BITS);
uint64_t i = (ki & IndexMask) * 2;
uint64_t u = __exp_data.tab[i + 1];
uint64_t sbits = u + e;
double_t tail = asdouble (__exp_data.tab[i]);
/* 10^r ~= 1 + r * Poly(r). */
double_t r2 = r * r;
double_t p = C (0) + r * C (1);
double_t y = C (2) + r * C (3);
y = y + r2 * C (4);
y = p + r2 * y;
y = tail + y * r;
if (unlikely (abstop == 0))
return special_case (sbits, y, ki);
/* Assemble components:
y = 10^r * 2^(k/N)
~= (y + 1) * s. */
double_t s = asdouble (sbits);
return eval_as_double (s * y + s);
}
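For reference, the reduction implemented above can be summarised as follows (with N = 2^EXP_TABLE_BITS and P the exp10_poly polynomial):

\[
10^x = 2^{x \log_2 10}, \qquad
k = \operatorname{round}\!\bigl(x \, N \log_2 10\bigr), \qquad
r = x - k\,\frac{\log_{10} 2}{N},
\]
\[
10^x = 2^{k/N} \cdot 10^{r} \;\approx\; 2^{k/N}\,\bigl(1 + r\,P(r)\bigr),
\qquad |r| \le \frac{\log_{10} 2}{2N},
\]

where 2^(k/N) comes from the shared __exp_data.tab lookup and the top bits of k are merged into the exponent field through sbits.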

View File

@ -1,7 +1,7 @@
/*
* Shared data between exp, exp2 and pow.
*
* Copyright (c) 2018, Arm Limited.
* Copyright (c) 2018-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -12,6 +12,7 @@
const struct exp_data __exp_data = {
// N/ln2
.invln2N = 0x1.71547652b82fep0 * N,
.invlog10_2N = 0x1.a934f0979a371p1 * N,
// -ln2/N
#if N == 64
.negln2hiN = -0x1.62e42fefa0000p-7,
@ -26,6 +27,8 @@ const struct exp_data __exp_data = {
.negln2hiN = -0x1.62e42fef80000p-10,
.negln2loN = -0x1.1cf79abc9e3b4p-45,
#endif
.neglog10_2hiN = -0x1.3441350ap-2 / N,
.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
// Used for rounding when !TOINT_INTRINSICS
#if EXP_USE_TOINT_NARROW
.shift = 0x1800000000.8p0,
@ -147,6 +150,24 @@ const struct exp_data __exp_data = {
0x1.3b2ab786ee1dap-7,
#endif
},
.exp10_poly = {
#if EXP10_POLY_WIDE
/* Range is wider if using shift-based reduction: coeffs generated
using Remez in [-log10(2)/128, log10(2)/128]. */
0x1.26bb1bbb55515p1,
0x1.53524c73cd32bp1,
0x1.0470591e1a108p1,
0x1.2bd77b12fe9a8p0,
0x1.14289fef24b78p-1
#else
/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256]. */
0x1.26bb1bbb55516p1,
0x1.53524c73ce9fep1,
0x1.0470591ce4b26p1,
0x1.2bd76577fe684p0,
0x1.1446eeccd0efbp-1
#endif
},
// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
// tab[2*k] = asuint64(T[k])
// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N

View File

@ -1,7 +1,7 @@
/*
* Public API.
*
* Copyright (c) 2015-2020, Arm Limited.
* Copyright (c) 2015-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -18,74 +18,33 @@ float cosf (float);
void sincosf (float, float*, float*);
double exp (double);
double exp10 (double);
double exp2 (double);
double log (double);
double log2 (double);
double pow (double, double);
/* Scalar functions using the vector algorithm with identical result. */
float __s_sinf (float);
float __s_cosf (float);
float __s_expf (float);
float __s_expf_1u (float);
float __s_exp2f (float);
float __s_exp2f_1u (float);
float __s_logf (float);
float __s_powf (float, float);
double __s_sin (double);
double __s_cos (double);
double __s_exp (double);
double __s_log (double);
double __s_pow (double, double);
#if __aarch64__
#if __GNUC__ >= 5
# if __GNUC__ >= 5
typedef __Float32x4_t __f32x4_t;
typedef __Float64x2_t __f64x2_t;
#elif __clang_major__*100+__clang_minor__ >= 305
# elif __clang_major__*100+__clang_minor__ >= 305
typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
#else
#error Unsupported compiler
#endif
# else
# error Unsupported compiler
# endif
/* Vector functions following the base PCS. */
__f32x4_t __v_sinf (__f32x4_t);
__f32x4_t __v_cosf (__f32x4_t);
__f32x4_t __v_expf (__f32x4_t);
__f32x4_t __v_expf_1u (__f32x4_t);
__f32x4_t __v_exp2f (__f32x4_t);
__f32x4_t __v_exp2f_1u (__f32x4_t);
__f32x4_t __v_logf (__f32x4_t);
__f32x4_t __v_powf (__f32x4_t, __f32x4_t);
__f64x2_t __v_sin (__f64x2_t);
__f64x2_t __v_cos (__f64x2_t);
__f64x2_t __v_exp (__f64x2_t);
__f64x2_t __v_log (__f64x2_t);
__f64x2_t __v_pow (__f64x2_t, __f64x2_t);
#if __GNUC__ >= 9 || __clang_major__ >= 8
#define __vpcs __attribute__((__aarch64_vector_pcs__))
/* Vector functions following the vector PCS. */
__vpcs __f32x4_t __vn_sinf (__f32x4_t);
__vpcs __f32x4_t __vn_cosf (__f32x4_t);
__vpcs __f32x4_t __vn_expf (__f32x4_t);
__vpcs __f32x4_t __vn_expf_1u (__f32x4_t);
__vpcs __f32x4_t __vn_exp2f (__f32x4_t);
__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t);
__vpcs __f32x4_t __vn_logf (__f32x4_t);
__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t __vn_sin (__f64x2_t);
__vpcs __f64x2_t __vn_cos (__f64x2_t);
__vpcs __f64x2_t __vn_exp (__f64x2_t);
__vpcs __f64x2_t __vn_log (__f64x2_t);
__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t);
# if __GNUC__ >= 9 || __clang_major__ >= 8
# undef __vpcs
# define __vpcs __attribute__((__aarch64_vector_pcs__))
/* Vector functions following the vector PCS using ABI names. */
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
@ -94,7 +53,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
#endif
# endif
#endif
#endif

View File

@ -1,7 +1,7 @@
/*
* Configuration for math routines.
*
* Copyright (c) 2017-2020, Arm Limited.
* Copyright (c) 2017-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -92,6 +92,46 @@
# define unlikely(x) (x)
#endif
/* Return ptr but hide its value from the compiler so accesses through it
cannot be optimized based on the contents. */
#define ptr_barrier(ptr) \
({ \
__typeof (ptr) __ptr = (ptr); \
__asm("" : "+r"(__ptr)); \
__ptr; \
})
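A hedged usage sketch (the data and function names here are hypothetical): routines take a snapshot of their coefficient table through the barrier, exactly as the vector routines above do, so the compiler must emit real loads for the coefficients instead of constant-folding the known contents.

/* Hypothetical example: d is opaque to the optimiser, so d->c0 and
   d->c1 are materialised with actual loads rather than folded constants.  */
static const struct example_data { double c0, c1; } example_data = { 1.0, 0.5 };

static double
example_eval (double x)
{
  const struct example_data *d = ptr_barrier (&example_data);
  return d->c0 + x * d->c1;
}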
/* Symbol renames to avoid libc conflicts. */
#define __math_oflowf arm_math_oflowf
#define __math_uflowf arm_math_uflowf
#define __math_may_uflowf arm_math_may_uflowf
#define __math_divzerof arm_math_divzerof
#define __math_oflow arm_math_oflow
#define __math_uflow arm_math_uflow
#define __math_may_uflow arm_math_may_uflow
#define __math_divzero arm_math_divzero
#define __math_invalidf arm_math_invalidf
#define __math_invalid arm_math_invalid
#define __math_check_oflow arm_math_check_oflow
#define __math_check_uflow arm_math_check_uflow
#define __math_check_oflowf arm_math_check_oflowf
#define __math_check_uflowf arm_math_check_uflowf
#define __sincosf_table arm_math_sincosf_table
#define __inv_pio4 arm_math_inv_pio4
#define __exp2f_data arm_math_exp2f_data
#define __logf_data arm_math_logf_data
#define __log2f_data arm_math_log2f_data
#define __powf_log2_data arm_math_powf_log2_data
#define __exp_data arm_math_exp_data
#define __log_data arm_math_log_data
#define __log2_data arm_math_log2_data
#define __pow_log_data arm_math_pow_log_data
#define __erff_data arm_math_erff_data
#define __erf_data arm_math_erf_data
#define __v_exp_data arm_math_v_exp_data
#define __v_log_data arm_math_v_log_data
#if HAVE_FAST_ROUND
/* When set, the roundtoint and converttoint functions are provided with
the semantics documented below. */
@ -381,15 +421,22 @@ extern const struct powf_log2_data
#define EXP_USE_TOINT_NARROW 0
#define EXP2_POLY_ORDER 5
#define EXP2_POLY_WIDE 0
/* Wider exp10 polynomial necessary for good precision in non-nearest rounding
and !TOINT_INTRINSICS. */
#define EXP10_POLY_WIDE 0
extern const struct exp_data
{
double invln2N;
double invlog10_2N;
double shift;
double negln2hiN;
double negln2loN;
double neglog10_2hiN;
double neglog10_2loN;
double poly[4]; /* Last four coefficients. */
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
double exp10_poly[5];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
} __exp_data HIDDEN;
@ -459,4 +506,16 @@ extern const struct erf_data
double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
} __erf_data HIDDEN;
#define V_EXP_TABLE_BITS 7
extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
#define V_LOG_TABLE_BITS 7
extern const struct v_log_data
{
struct
{
double invc, logc;
} table[1 << V_LOG_TABLE_BITS];
} __v_log_data HIDDEN;
#endif

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_cos.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_cosf.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_exp.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_exp2f.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_exp2f_1u.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_expf.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_expf_1u.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_log.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_logf.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_pow.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_powf.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_sin.c"

View File

@ -1,6 +0,0 @@
/*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define SCALAR 1
#include "v_sinf.c"

View File

@ -1,7 +1,7 @@
/*
* Microbenchmark for math functions.
*
* Copyright (c) 2018-2022, Arm Limited.
* Copyright (c) 2018-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -15,11 +15,6 @@
#include <math.h>
#include "mathlib.h"
#ifndef WANT_VMATH
/* Enable the build of vector math code. */
# define WANT_VMATH 1
#endif
/* Number of measurements, best result is reported. */
#define MEASURE 60
/* Array size. */
@ -34,8 +29,9 @@ static float Af[N];
static long measurecount = MEASURE;
static long itercount = ITER;
#if __aarch64__ && WANT_VMATH
typedef __f64x2_t v_double;
#ifdef __vpcs
#include <arm_neon.h>
typedef float64x2_t v_double;
#define v_double_len() 2
@ -51,7 +47,7 @@ v_double_dup (double x)
return (v_double){x, x};
}
typedef __f32x4_t v_float;
typedef float32x4_t v_float;
#define v_float_len() 4
@ -66,6 +62,19 @@ v_float_dup (float x)
{
return (v_float){x, x, x, x};
}
#else
/* dummy definitions to make things compile. */
typedef double v_double;
typedef float v_float;
#define v_double_len(x) 1
#define v_double_load(x) (x)[0]
#define v_double_dup(x) (x)
#define v_float_len(x) 1
#define v_float_load(x) (x)[0]
#define v_float_dup(x) (x)
#endif
#if WANT_SVE_MATH
#include <arm_sve.h>
typedef svbool_t sv_bool;
@ -102,17 +111,10 @@ sv_float_dup (float x)
{
return svdup_n_f32(x);
}
#endif
#else
/* dummy definitions to make things compile. */
typedef double v_double;
typedef float v_float;
#define v_double_len(x) 1
#define v_double_load(x) (x)[0]
#define v_double_dup(x) (x)
#define v_float_len(x) 1
#define v_float_load(x) (x)[0]
#define v_float_dup(x) (x)
#define sv_double_len(x) 1
#define sv_float_len(x) 1
#endif
static double
@ -126,20 +128,6 @@ dummyf (float x)
{
return x;
}
#if WANT_VMATH
#if __aarch64__
static v_double
__v_dummy (v_double x)
{
return x;
}
static v_float
__v_dummyf (v_float x)
{
return x;
}
#ifdef __vpcs
__vpcs static v_double
__vn_dummy (v_double x)
@ -166,8 +154,6 @@ __sv_dummyf (sv_float x, sv_bool pg)
return x;
}
#endif
#endif
#endif
#include "test/mathbench_wrappers.h"
@ -183,8 +169,6 @@ static const struct fun
{
double (*d) (double);
float (*f) (float);
v_double (*vd) (v_double);
v_float (*vf) (v_float);
#ifdef __vpcs
__vpcs v_double (*vnd) (v_double);
__vpcs v_float (*vnf) (v_float);
@ -197,18 +181,12 @@ static const struct fun
} funtab[] = {
#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
D (dummy, 1.0, 2.0)
F (dummyf, 1.0, 2.0)
#if WANT_VMATH
#if __aarch64__
VD (__v_dummy, 1.0, 2.0)
VF (__v_dummyf, 1.0, 2.0)
#ifdef __vpcs
VND (__vn_dummy, 1.0, 2.0)
VNF (__vn_dummyf, 1.0, 2.0)
@ -217,14 +195,10 @@ VNF (__vn_dummyf, 1.0, 2.0)
SVD (__sv_dummy, 1.0, 2.0)
SVF (__sv_dummyf, 1.0, 2.0)
#endif
#endif
#endif
#include "test/mathbench_funcs.h"
{0},
#undef F
#undef D
#undef VF
#undef VD
#undef VNF
#undef VND
#undef SVF
@ -327,38 +301,6 @@ runf_latency (float f (float))
prev = f (Af[i] + prev * z);
}
static void
run_v_thruput (v_double f (v_double))
{
for (int i = 0; i < N; i += v_double_len ())
f (v_double_load (A+i));
}
static void
runf_v_thruput (v_float f (v_float))
{
for (int i = 0; i < N; i += v_float_len ())
f (v_float_load (Af+i));
}
static void
run_v_latency (v_double f (v_double))
{
v_double z = v_double_dup (zero);
v_double prev = z;
for (int i = 0; i < N; i += v_double_len ())
prev = f (v_double_load (A+i) + prev * z);
}
static void
runf_v_latency (v_float f (v_float))
{
v_float z = v_float_dup (zero);
v_float prev = z;
for (int i = 0; i < N; i += v_float_len ())
prev = f (v_float_load (Af+i) + prev * z);
}
#ifdef __vpcs
static void
run_vn_thruput (__vpcs v_double f (v_double))
@ -377,19 +319,21 @@ runf_vn_thruput (__vpcs v_float f (v_float))
static void
run_vn_latency (__vpcs v_double f (v_double))
{
v_double z = v_double_dup (zero);
v_double prev = z;
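/* sel is all-zero but read from a volatile, so the compiler cannot see
   that; the vbslq_f64 below then yields the fresh input while still
   depending on prev, which serialises successive calls for the latency
   measurement. The SVE variants further down use svsel the same way. */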
volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
uint64x2_t sel = vsel;
v_double prev = v_double_dup (0);
for (int i = 0; i < N; i += v_double_len ())
prev = f (v_double_load (A+i) + prev * z);
prev = f (vbslq_f64 (sel, prev, v_double_load (A+i)));
}
static void
runf_vn_latency (__vpcs v_float f (v_float))
{
v_float z = v_float_dup (zero);
v_float prev = z;
volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
uint32x4_t sel = vsel;
v_float prev = v_float_dup (0);
for (int i = 0; i < N; i += v_float_len ())
prev = f (v_float_load (Af+i) + prev * z);
prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i)));
}
#endif
@ -411,19 +355,21 @@ runf_sv_thruput (sv_float f (sv_float, sv_bool))
static void
run_sv_latency (sv_double f (sv_double, sv_bool))
{
sv_double z = sv_double_dup (zero);
sv_double prev = z;
volatile sv_bool vsel = svptrue_b64 ();
sv_bool sel = vsel;
sv_double prev = sv_double_dup (0);
for (int i = 0; i < N; i += sv_double_len ())
prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ());
prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ());
}
static void
runf_sv_latency (sv_float f (sv_float, sv_bool))
{
sv_float z = sv_float_dup (zero);
sv_float prev = z;
volatile sv_bool vsel = svptrue_b32 ();
sv_bool sel = vsel;
sv_float prev = sv_float_dup (0);
for (int i = 0; i < N; i += sv_float_len ())
prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ());
prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ());
}
#endif
@ -458,10 +404,10 @@ bench1 (const struct fun *f, int type, double lo, double hi)
const char *s = type == 't' ? "rthruput" : "latency";
int vlen = 1;
if (f->vec && f->prec == 'd')
vlen = v_double_len();
else if (f->vec && f->prec == 'f')
vlen = v_float_len();
if (f->vec == 'n')
vlen = f->prec == 'd' ? v_double_len() : v_float_len();
else if (f->vec == 's')
vlen = f->prec == 'd' ? sv_double_len() : sv_float_len();
if (f->prec == 'd' && type == 't' && f->vec == 0)
TIMEIT (run_thruput, f->fun.d);
@ -471,14 +417,6 @@ bench1 (const struct fun *f, int type, double lo, double hi)
TIMEIT (runf_thruput, f->fun.f);
else if (f->prec == 'f' && type == 'l' && f->vec == 0)
TIMEIT (runf_latency, f->fun.f);
else if (f->prec == 'd' && type == 't' && f->vec == 'v')
TIMEIT (run_v_thruput, f->fun.vd);
else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
TIMEIT (run_v_latency, f->fun.vd);
else if (f->prec == 'f' && type == 't' && f->vec == 'v')
TIMEIT (runf_v_thruput, f->fun.vf);
else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
TIMEIT (runf_v_latency, f->fun.vf);
#ifdef __vpcs
else if (f->prec == 'd' && type == 't' && f->vec == 'n')
TIMEIT (run_vn_thruput, f->fun.vnd);
@ -503,16 +441,18 @@ bench1 (const struct fun *f, int type, double lo, double hi)
if (type == 't')
{
ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
f->name, s,
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
(unsigned long long) dt, lo, hi);
(unsigned long long) dt, lo, hi, vlen);
}
else if (type == 'l')
{
ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
f->name, s,
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
(unsigned long long) dt, lo, hi);
(unsigned long long) dt, lo, hi, vlen);
}
fflush (stdout);
}

View File

@ -1,11 +1,13 @@
/*
* Function entries for mathbench.
*
* Copyright (c) 2022, Arm Limited.
* Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* clang-format off */
D (exp, -9.9, 9.9)
D (exp, 0.5, 1.0)
D (exp10, -9.9, 9.9)
D (exp2, -9.9, 9.9)
D (log, 0.01, 11.1)
D (log, 0.999, 1.001)
@ -42,59 +44,19 @@ F (cosf, 3.3, 33.3)
F (cosf, 100, 1000)
F (cosf, 1e6, 1e32)
F (erff, -4.0, 4.0)
#if WANT_VMATH
D (__s_sin, -3.1, 3.1)
D (__s_cos, -3.1, 3.1)
D (__s_exp, -9.9, 9.9)
D (__s_log, 0.01, 11.1)
{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
F (__s_expf, -9.9, 9.9)
F (__s_expf_1u, -9.9, 9.9)
F (__s_exp2f, -9.9, 9.9)
F (__s_exp2f_1u, -9.9, 9.9)
F (__s_logf, 0.01, 11.1)
{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}},
F (__s_sinf, -3.1, 3.1)
F (__s_cosf, -3.1, 3.1)
#if __aarch64__
VD (__v_sin, -3.1, 3.1)
VD (__v_cos, -3.1, 3.1)
VD (__v_exp, -9.9, 9.9)
VD (__v_log, 0.01, 11.1)
{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
VF (__v_expf, -9.9, 9.9)
VF (__v_expf_1u, -9.9, 9.9)
VF (__v_exp2f, -9.9, 9.9)
VF (__v_exp2f_1u, -9.9, 9.9)
VF (__v_logf, 0.01, 11.1)
{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}},
VF (__v_sinf, -3.1, 3.1)
VF (__v_cosf, -3.1, 3.1)
#ifdef __vpcs
VND (__vn_exp, -9.9, 9.9)
VND (_ZGVnN2v_exp, -9.9, 9.9)
VND (__vn_log, 0.01, 11.1)
VND (_ZGVnN2v_log, 0.01, 11.1)
{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
VND (__vn_sin, -3.1, 3.1)
VND (_ZGVnN2v_sin, -3.1, 3.1)
VND (__vn_cos, -3.1, 3.1)
VND (_ZGVnN2v_cos, -3.1, 3.1)
VNF (__vn_expf, -9.9, 9.9)
VNF (_ZGVnN4v_expf, -9.9, 9.9)
VNF (__vn_expf_1u, -9.9, 9.9)
VNF (__vn_exp2f, -9.9, 9.9)
VNF (_ZGVnN4v_expf_1u, -9.9, 9.9)
VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
VNF (__vn_exp2f_1u, -9.9, 9.9)
VNF (__vn_logf, 0.01, 11.1)
VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9)
VNF (_ZGVnN4v_logf, 0.01, 11.1)
{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}},
{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
VNF (__vn_sinf, -3.1, 3.1)
VNF (_ZGVnN4v_sinf, -3.1, 3.1)
VNF (__vn_cosf, -3.1, 3.1)
VNF (_ZGVnN4v_cosf, -3.1, 3.1)
#endif
#endif
#endif
/* clang-format on */

View File

@ -1,18 +1,11 @@
/*
* Function wrappers for mathbench.
*
* Copyright (c) 2022, Arm Limited.
* Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if WANT_VMATH
#if __aarch64__
#ifdef __vpcs
__vpcs static v_float
xy__vn_powf (v_float x)
{
return __vn_powf (x, x);
}
__vpcs static v_float
xy_Z_powf (v_float x)
@ -20,44 +13,13 @@ xy_Z_powf (v_float x)
return _ZGVnN4vv_powf (x, x);
}
__vpcs static v_double
xy__vn_pow (v_double x)
{
return __vn_pow (x, x);
}
__vpcs static v_double
xy_Z_pow (v_double x)
{
return _ZGVnN2vv_pow (x, x);
}
#endif // __vpcs
static v_float
xy__v_powf (v_float x)
{
return __v_powf (x, x);
}
static v_double
xy__v_pow (v_double x)
{
return __v_pow (x, x);
}
#endif // __aarch64__
static float
xy__s_powf (float x)
{
return __s_powf (x, x);
}
static double
xy__s_pow (double x)
{
return __s_pow (x, x);
}
#endif // WANT_VMATH
#endif
static double
xypow (double x)

View File

@ -1,7 +1,7 @@
/*
* mathtest.c - test rig for mathlib
*
* Copyright (c) 1998-2022, Arm Limited.
* Copyright (c) 1998-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -254,6 +254,7 @@ test_func tfuncs[] = {
TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4),
TFUNC(at_s,rt_s, expm1f, ULPUNIT),
TFUNC(at_d,rt_d, exp10, ULPUNIT),
/* power */
TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4),
@ -1021,6 +1022,7 @@ int runtest(testdetail t) {
DO_DOP(d_arg1,op1r);
DO_DOP(d_arg2,op2r);
s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0];
s_res.i = 0;
/*
* Detect NaNs, infinities and denormals on input, and set a
@ -1155,22 +1157,25 @@ int runtest(testdetail t) {
tresultr[0] = t.resultr[0];
tresultr[1] = t.resultr[1];
resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd];
resulti[0] = resulti[1] = 0;
wres = 2;
break;
case rt_i:
tresultr[0] = t.resultr[0];
resultr[0] = intres;
resulti[0] = 0;
wres = 1;
break;
case rt_s:
case rt_s2:
tresultr[0] = t.resultr[0];
resultr[0] = s_res.i;
resulti[0] = 0;
wres = 1;
break;
default:
puts("unhandled rettype in runtest");
wres = 0;
abort ();
}
if(t.resultc != rc_none) {
int err = 0;

View File

@ -2,7 +2,7 @@
# ULP error check script.
#
# Copyright (c) 2019-2022, Arm Limited.
# Copyright (c) 2019-2023, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
#set -x
@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
L=0.02
t exp10 0 0x1p-47 5000
t exp10 -0 -0x1p-47 5000
t exp10 0x1p-47 1 50000
t exp10 -0x1p-47 -1 50000
t exp10 1 0x1.34413509f79ffp8 50000
t exp10 -1 -0x1.434e6420f4374p8 50000
t exp10 0x1.34413509f79ffp8 inf 5000
t exp10 -0x1.434e6420f4374p8 -inf 5000
L=1.0
Ldir=0.9
t erf 0 0xffff000000000000 10000
@ -143,15 +153,10 @@ Ldir=0.5
done
# vector functions
Ldir=0.5
r='n'
flags="${ULPFLAGS:--q}"
runs=
check __s_exp 1 && runs=1
runv=
check __v_exp 1 && runv=1
runvn=
check __vn_exp 1 && runvn=1
range_exp='
0 0xffff000000000000 10000
@ -177,9 +182,10 @@ range_pow='
'
range_sin='
0 0xffff000000000000 10000
0x1p-4 0x1p4 400000
-0x1p-23 0x1p23 400000
0 0x1p23 500000
-0 -0x1p23 500000
0x1p23 inf 10000
-0x1p23 -inf 10000
'
range_cos="$range_sin"
@ -199,9 +205,10 @@ range_logf='
'
range_sinf='
0 0xffff0000 10000
0x1p-4 0x1p4 300000
-0x1p-9 -0x1p9 300000
0 0x1p20 500000
-0 -0x1p20 500000
0x1p20 inf 10000
-0x1p20 -inf 10000
'
range_cosf="$range_sinf"
@ -229,9 +236,8 @@ L_sinf=1.4
L_cosf=1.4
L_powf=2.1
while read G F R D
while read G F D
do
[ "$R" = 1 ] || continue
case "$G" in \#*) continue ;; esac
eval range="\${range_$G}"
eval L="\${L_$G}"
@ -251,71 +257,23 @@ do
t $D $disable_fenv $F $X
done << EOF
$range
EOF
done << EOF
# group symbol run
exp __s_exp $runs
exp __v_exp $runv
exp __vn_exp $runvn
exp _ZGVnN2v_exp $runvn
log __s_log $runs
log __v_log $runv
log __vn_log $runvn
log _ZGVnN2v_log $runvn
pow __s_pow $runs -f
pow __v_pow $runv -f
pow __vn_pow $runvn -f
pow _ZGVnN2vv_pow $runvn -f
sin __s_sin $runs
sin __v_sin $runv
sin __vn_sin $runvn
sin _ZGVnN2v_sin $runvn
cos __s_cos $runs
cos __v_cos $runv
cos __vn_cos $runvn
cos _ZGVnN2v_cos $runvn
expf __s_expf $runs
expf __v_expf $runv
expf __vn_expf $runvn
expf _ZGVnN4v_expf $runvn
expf_1u __s_expf_1u $runs -f
expf_1u __v_expf_1u $runv -f
expf_1u __vn_expf_1u $runvn -f
exp2f __s_exp2f $runs
exp2f __v_exp2f $runv
exp2f __vn_exp2f $runvn
exp2f _ZGVnN4v_exp2f $runvn
exp2f_1u __s_exp2f_1u $runs -f
exp2f_1u __v_exp2f_1u $runv -f
exp2f_1u __vn_exp2f_1u $runvn -f
logf __s_logf $runs
logf __v_logf $runv
logf __vn_logf $runvn
logf _ZGVnN4v_logf $runvn
sinf __s_sinf $runs
sinf __v_sinf $runv
sinf __vn_sinf $runvn
sinf _ZGVnN4v_sinf $runvn
cosf __s_cosf $runs
cosf __v_cosf $runv
cosf __vn_cosf $runvn
cosf _ZGVnN4v_cosf $runvn
powf __s_powf $runs -f
powf __v_powf $runv -f
powf __vn_powf $runvn -f
powf _ZGVnN4vv_powf $runvn -f
exp _ZGVnN2v_exp
log _ZGVnN2v_log
pow _ZGVnN2vv_pow -f
sin _ZGVnN2v_sin -z
cos _ZGVnN2v_cos
expf _ZGVnN4v_expf
expf_1u _ZGVnN4v_expf_1u -f
exp2f _ZGVnN4v_exp2f
exp2f_1u _ZGVnN4v_exp2f_1u -f
logf _ZGVnN4v_logf
sinf _ZGVnN4v_sinf -z
cosf _ZGVnN4v_cosf
powf _ZGVnN4vv_powf -f
EOF
[ 0 -eq $FAIL ] || {

View File

@ -0,0 +1,15 @@
; Directed test cases for exp10
;
; Copyright (c) 2023, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox
func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0
func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux
func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0
func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0

View File

@ -1,10 +1,11 @@
/*
* ULP error checking tool for math functions.
*
* Copyright (c) 2019-2022, Arm Limited.
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define _GNU_SOURCE
#include <ctype.h>
#include <fenv.h>
#include <float.h>
@ -23,11 +24,6 @@
# include <mpfr.h>
#endif
#ifndef WANT_VMATH
/* Enable the build of vector math code. */
# define WANT_VMATH 1
#endif
static inline uint64_t
asuint64 (double f)
{
@ -212,6 +208,7 @@ struct conf
unsigned long long n;
double softlim;
double errlim;
int ignore_zero_sign;
};
/* A bit of a hack: call vector functions twice with the same
@ -220,7 +217,7 @@ struct conf
static int secondcall;
/* Wrappers for vector functions. */
#if __aarch64__ && WANT_VMATH
#ifdef __vpcs
typedef __f32x4_t v_float;
typedef __f64x2_t v_double;
/* First element of fv and dv may be changed by -c argument. */
@ -264,40 +261,8 @@ static inline double svretd(sv_double vec) {
#endif
#endif
#if WANT_SVE_MATH
long double
dummyl (long double x)
{
return x;
}
double
dummy (double x)
{
return x;
}
static sv_double
__sv_dummy (sv_double x)
{
return x;
}
static sv_float
__sv_dummyf (sv_float x)
{
return x;
}
#endif
#include "test/ulp_wrappers.h"
/* Wrappers for SVE functions. */
#if WANT_SVE_MATH
static double sv_dummy (double x) { return svretd (__sv_dummy (svargd (x))); }
static float sv_dummyf (float x) { return svretf (__sv_dummyf (svargf (x))); }
#endif
struct fun
{
const char *name;
@ -358,10 +323,6 @@ static const struct fun fun[] = {
#define ZVNF2(x) VNF2 (x) ZVF2 (x)
#define ZVND1(x) VND1 (x) ZVD1 (x)
#define ZVND2(x) VND2 (x) ZVD2 (x)
#define SF1(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 1, 1, f1, 0)
#define SF2(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 2, 1, f2, 0)
#define SD1(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 1, 0, d1, 0)
#define SD2(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 2, 0, d2, 0)
/* SVE routines. */
#define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
#define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
@ -374,11 +335,6 @@ static const struct fun fun[] = {
#include "test/ulp_funcs.h"
#if WANT_SVE_MATH
SVD1 (dummy)
SVF1 (dummy)
#endif
#undef F
#undef F1
#undef F2
@ -628,17 +584,18 @@ call_mpfr_d2 (mpfr_t y, const struct fun *f, struct args_d2 a, mpfr_rnd_t r)
static void
usage (void)
{
puts ("./ulp [-q] [-m] [-f] [-r nudz] [-l soft-ulplimit] [-e ulplimit] func "
puts ("./ulp [-q] [-m] [-f] [-r {n|u|d|z}] [-l soft-ulplimit] [-e ulplimit] func "
"lo [hi [x lo2 hi2] [count]]");
puts ("Compares func against a higher precision implementation in [lo; hi].");
puts ("-q: quiet.");
puts ("-m: use mpfr even if faster method is available.");
puts ("-f: disable fenv testing (rounding modes and exceptions).");
#if __aarch64__ && WANT_VMATH
puts ("-f: disable fenv exceptions testing.");
#ifdef __vpcs
puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n"
" This should be different from tested input in other lanes, and non-special \n"
" (i.e. should not trigger fenv exceptions). Default is 1.");
#endif
puts ("-z: ignore sign of 0.");
puts ("Supported func:");
for (const struct fun *f = fun; f->name; f++)
printf ("\t%s\n", f->name);
@ -762,6 +719,7 @@ main (int argc, char *argv[])
conf.fenv = 1;
conf.softlim = 0;
conf.errlim = INFINITY;
conf.ignore_zero_sign = 0;
for (;;)
{
argc--;
@ -801,12 +759,15 @@ main (int argc, char *argv[])
{
argc--;
argv++;
if (argc < 1)
if (argc < 1 || argv[0][1] != '\0')
usage ();
conf.rc = argv[0][0];
}
break;
#if __aarch64__ && WANT_VMATH
case 'z':
conf.ignore_zero_sign = 1;
break;
#ifdef __vpcs
case 'c':
argc--;
argv++;
@ -839,7 +800,19 @@ main (int argc, char *argv[])
if (strcmp (argv[0], f->name) == 0)
break;
if (!f->name)
usage ();
{
#ifndef __vpcs
/* Ignore vector math functions if vector math is not supported. */
if (strncmp (argv[0], "_ZGVnN", 6) == 0)
exit (0);
#endif
#if !WANT_SVE_MATH
if (strncmp (argv[0], "_ZGVsMxv", 8) == 0)
exit (0);
#endif
printf ("math function %s not supported\n", argv[0]);
exit (1);
}
if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG)
conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */
if (!USE_MPFR && conf.mpfr)

View File

@ -1,7 +1,7 @@
/*
* Generic functions for ULP error estimation.
*
* Copyright (c) 2019, Arm Limited.
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -37,7 +37,8 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t)
/* Difference between exact result and closest real number that
gets rounded to got, i.e. error before rounding, for a correctly
rounded result the difference is 0. */
static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r,
int ignore_zero_sign)
{
RT(float) want = p->y;
RT(float) d;
@ -45,10 +46,18 @@ static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
if (RT(asuint) (got) == RT(asuint) (want))
return 0.0;
if (isnan (got) && isnan (want))
/* Ignore sign of NaN. */
return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY;
if (signbit (got) != signbit (want))
/* May have false positives with NaN. */
//return isnan(got) && isnan(want) ? 0 : INFINITY;
return INFINITY;
{
/* Fall through to the ULP calculation when ignoring the sign of zero
and exactly one of want and got is non-zero. */
if (ignore_zero_sign && want == got)
return 0.0;
if (!ignore_zero_sign || (want != 0 && got != 0))
return INFINITY;
}
if (!isfinite (want) || !isfinite (got))
{
if (isnan (got) != isnan (want))
@ -114,8 +123,12 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r,
static inline void T(call_nofenv) (const struct fun *f, struct T(args) a,
int r, RT(float) * y, int *ex)
{
if (r != FE_TONEAREST)
fesetround (r);
*y = T(call) (f, a);
*ex = 0;
if (r != FE_TONEAREST)
fesetround (FE_TONEAREST);
}
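The same save/restore discipline is applied in call_long_nofenv below. As a self-contained sketch (helper name hypothetical), assuming the process starts in the default FE_TONEAREST mode:

/* Run f(x) under rounding mode r, restoring round-to-nearest afterwards
   so subsequent tests are unaffected.  */
#include <fenv.h>

static double
call_in_mode (double (*f) (double), double x, int r)
{
  if (r != FE_TONEAREST)
    fesetround (r);
  double y = f (x);
  if (r != FE_TONEAREST)
    fesetround (FE_TONEAREST);
  return y;
}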
static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a,
@ -155,8 +168,12 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a,
int r, struct RT(ret) * p,
RT(float) ygot, int exgot)
{
if (r != FE_TONEAREST)
fesetround (r);
RT(double) yl = T(call_long) (f, a);
p->y = (RT(float)) yl;
if (r != FE_TONEAREST)
fesetround (FE_TONEAREST);
if (RT(isok_nofenv) (ygot, p->y))
return 1;
p->ulpexp = RT(ulpscale) (p->y);
@ -288,7 +305,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
if (!ok)
{
int print = 0;
double err = RT(ulperr) (ygot, &want, r);
double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign);
double abserr = fabs (err);
// TODO: count errors below accuracy limit.
if (abserr > 0)

View File

@ -1,9 +1,10 @@
/*
* Function entries for ulp.
*
* Copyright (c) 2022, Arm Limited.
* Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* clang-format off */
F1 (sin)
F1 (cos)
F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
@ -15,56 +16,18 @@
F2 (pow)
F1 (erf)
D1 (exp)
D1 (exp10)
D1 (exp2)
D1 (log)
D1 (log2)
D2 (pow)
D1 (erf)
#if WANT_VMATH
F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0)
F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0)
F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0)
F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0)
F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0)
F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0)
F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0)
F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0)
#if __aarch64__
F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1)
F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1)
F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1)
F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1)
F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1)
F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1)
F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1)
#ifdef __vpcs
F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1)
F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1)
F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1)
F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1)
F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1)
F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1)
F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1)
F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1)
@ -74,5 +37,4 @@
F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
#endif
#endif
#endif
/* clang-format on */

View File

@ -1,10 +1,12 @@
/*
* Function wrappers for ulp.
*
* Copyright (c) 2022, Arm Limited.
* Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* clang-format off */
/* Wrappers for sincos. */
static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
@ -16,37 +18,12 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,
#endif
/* Wrappers for vector functions. */
#if __aarch64__ && WANT_VMATH
static float v_sinf(float x) { return __v_sinf(argf(x))[0]; }
static float v_cosf(float x) { return __v_cosf(argf(x))[0]; }
static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
static float v_expf(float x) { return __v_expf(argf(x))[0]; }
static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; }
static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; }
static float v_logf(float x) { return __v_logf(argf(x))[0]; }
static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; }
static double v_sin(double x) { return __v_sin(argd(x))[0]; }
static double v_cos(double x) { return __v_cos(argd(x))[0]; }
static double v_exp(double x) { return __v_exp(argd(x))[0]; }
static double v_log(double x) { return __v_log(argd(x))[0]; }
static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; }
#ifdef __vpcs
static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; }
static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; }
static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; }
static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; }
static float vn_logf(float x) { return __vn_logf(argf(x))[0]; }
static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; }
static double vn_sin(double x) { return __vn_sin(argd(x))[0]; }
static double vn_cos(double x) { return __vn_cos(argd(x))[0]; }
static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
static double vn_log(double x) { return __vn_log(argd(x))[0]; }
static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; }
static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; }
static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; }
static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; }
static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; }
@ -56,4 +33,5 @@ static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
#endif
#endif
/* clang-format on */

View File

@ -0,0 +1,356 @@
/*
* Implementation of the true gamma function (as opposed to lgamma)
* for 128-bit long double.
*
* Copyright (c) 2006-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/*
* This module implements the float128 gamma function under the name
* tgamma128. It's expected to be suitable for integration into system
* maths libraries under the standard name tgammal, if long double is
* 128-bit. Such a library will probably want to check the error
* handling and optimize the initial process of extracting the
* exponent, which is done here by simple and portable (but
* potentially slower) methods.
*/
#include <float.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
/* Only binary128 format is supported. */
#if LDBL_MANT_DIG == 113
#include "tgamma128.h"
#define lenof(x) (sizeof(x)/sizeof(*(x)))
/*
* Helper routine to evaluate a polynomial via Horner's rule
*/
static long double poly(const long double *coeffs, size_t n, long double x)
{
long double result = coeffs[--n];
while (n > 0)
result = (result * x) + coeffs[--n];
return result;
}
/*
* Compute sin(pi*x) / pi, for use in the reflection formula that
* relates gamma(-x) and gamma(x).
*/
static long double sin_pi_x_over_pi(long double x)
{
int quo;
long double fracpart = remquol(x, 0.5L, &quo);
long double sign = 1.0L;
if (quo & 2)
sign = -sign;
quo &= 1;
if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) {
/* For numbers this size, sin(pi*x) is so close to pi*x that
* sin(pi*x)/pi is indistinguishable from x in float128 */
return sign * fracpart;
}
if (quo == 0) {
return sign * sinl(pi*fracpart) / pi;
} else {
return sign * cosl(pi*fracpart) / pi;
}
}
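In formulas: remquol splits x = q/2 + f with |f| <= 1/4, and the branches above compute

\[
\frac{\sin(\pi x)}{\pi}
= \frac{(-1)^{\lfloor q/2 \rfloor}}{\pi} \times
\begin{cases}
\sin(\pi f), & q \text{ even}, \\
\cos(\pi f), & q \text{ odd},
\end{cases}
\]

with the very small even-quadrant case returning sign * fracpart directly, since sin(pi*f)/pi and f are indistinguishable at binary128 precision there.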
/* Return tgamma(x) on the assumption that x >= 8. */
static long double tgamma_large(long double x,
bool negative, long double negadjust)
{
/*
* In this range we compute gamma(x) as x^(x-1/2) * e^-x * K,
* where K is a correction factor computed as a polynomial in 1/x.
*
* (Vaguely inspired by the form of the Lanczos approximation, but
* I tried the Lanczos approximation itself and it suffers badly
* from big cancellation leading to loss of significance.)
*/
long double t = 1/x;
long double p = poly(coeffs_large, lenof(coeffs_large), t);
/*
* To avoid overflow in cases where x^(x-0.5) does overflow
* but gamma(x) does not, we split x^(x-0.5) in half and
* multiply back up _after_ multiplying the shrinking factor
* of exp(-(x-0.5)).
*
* Note that computing x-0.5 and (x-0.5)/2 is exact for the
* relevant range of x, so the only sources of error are pow
* and exp themselves, plus the multiplications.
*/
long double powhalf = powl(x, (x-0.5L)/2.0L);
long double expret = expl(-(x-0.5L));
if (!negative) {
return (expret * powhalf) * powhalf * p;
} else {
/*
* Apply the reflection formula as commented below, but
* carefully: negadjust has magnitude less than 1, so it can
* turn a case where gamma(+x) would overflow into a case
* where gamma(-x) doesn't underflow. Not only that, but the
* FP format has greater range in the tiny domain due to
* denormals. For both reasons, it's not good enough to
* compute the positive result and then adjust it.
*/
long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p);
return ret / powhalf;
}
}
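Spelling out the split the code actually performs (the comment's e^-x form absorbs a factor of sqrt(e) into K; this matches the generator script later in this commit, where the t = 0 value of the approximated function is sqrt(2*pi/e)):

\[
\Gamma(x) \;\approx\; x^{\,x-1/2}\, e^{-(x-1/2)}\, K(1/x),
\qquad K(0) = \sqrt{2\pi/e},
\]

with x^{(x-1/2)/2} computed once as powhalf and multiplied in twice around the decaying exp factor to dodge intermediate overflow.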
/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */
static long double tgamma_tiny(long double x,
bool negative, long double negadjust)
{
/*
* For x near zero, we use a polynomial approximation to
* g = 1/(x*gamma(x)), and then return 1/(g*x).
*/
long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x);
if (!negative)
return 1.0L / (g*x);
else
return g / negadjust;
}
/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */
static long double tgamma_ultratiny(long double x, bool negative,
long double negadjust)
{
/* On this interval, gamma can't even be distinguished from 1/x,
* so we skip the polynomial evaluation in tgamma_tiny, partly to
* save time and partly to avoid the tiny intermediate values
* setting the underflow exception flag. */
if (!negative)
return 1.0L / x;
else
return 1.0L / negadjust;
}
/* Return tgamma(x) on the assumption that 1 <= x <= 2. */
static long double tgamma_central(long double x)
{
/*
* In this central interval, our strategy is to find the
* difference between x and the point where gamma has a minimum,
* and approximate based on that.
*/
/* The difference between the input x and the minimum x. The first
* subtraction is expected to be exact, since x and min_x_hi have
* the same exponent (unless x=2, in which case it will still be
* exact). */
long double t = (x - min_x_hi) - min_x_lo;
/*
* Now use two different polynomials for the intervals [1,m] and
* [m,2].
*/
long double p;
if (t < 0)
p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t);
else
p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t);
return (min_y_lo + p * (t*t)) + min_y_hi;
}
long double tgamma128(long double x)
{
/*
* Start by extracting the number's sign and exponent, and ruling
* out cases of non-normalized numbers.
*
* For an implementation integrated into a system libm, it would
* almost certainly be quicker to do this by direct bitwise access
* to the input float128 value, using whatever is the local idiom
* for knowing its endianness.
*
* Integration into a system libc may also need to worry about
* setting errno, if that's the locally preferred way to report
* math.h errors.
*/
int sign = signbit(x);
int exponent;
switch (fpclassify(x)) {
case FP_NAN:
return x+x; /* propagate QNaN, make SNaN throw an exception */
case FP_ZERO:
return 1/x; /* divide by zero on purpose to indicate a pole */
case FP_INFINITE:
if (sign) {
return x-x; /* gamma(-inf) has indeterminate sign, so provoke an
* IEEE invalid operation exception to indicate that */
}
return x; /* but gamma(+inf) is just +inf with no error */
case FP_SUBNORMAL:
exponent = -16384;
break;
default:
frexpl(x, &exponent);
exponent--;
break;
}
bool negative = false;
long double negadjust = 0.0L;
if (sign) {
/*
* Euler's reflection formula is
*
* gamma(1-x) gamma(x) = pi/sin(pi*x)
*
* pi
* => gamma(x) = --------------------
* gamma(1-x) sin(pi*x)
*
* But computing 1-x is going to lose a lot of accuracy when x
* is very small, so instead we transform using the recurrence
* gamma(t+1)=t gamma(t). Setting t=-x, this gives us
* gamma(1-x) = -x gamma(-x), so we now have
*
* pi
* gamma(x) = ----------------------
* -x gamma(-x) sin(pi*x)
*
* which relates gamma(x) to gamma(-x), which is much nicer,
* since x can be turned into -x without rounding.
*/
negadjust = sin_pi_x_over_pi(x);
negative = true;
x = -x;
/*
* Now the ultimate answer we want is
*
* 1 / (gamma(x) * x * negadjust)
*
* where x is the positive value we've just turned it into.
*
* For some of the cases below, we'll compute gamma(x)
* normally and then compute this adjusted value afterwards.
* But for others, we can implement the reciprocal operation
* in this formula by _avoiding_ an inversion that the
* sub-case was going to do anyway.
*/
if (negadjust == 0) {
/*
* Special case for negative integers. Applying the
* reflection formula would cause division by zero, but
* standards would prefer we treat this error case as an
* invalid operation and return NaN instead. (Possibly
* because otherwise you'd have to decide which sign of
* infinity to return, and unlike the x=0 case, there's no
* sign of zero available to disambiguate.)
*/
return negadjust / negadjust;
}
}
/*
* Split the positive domain into various cases. For cases where
* we do the negative-number adjustment the usual way, we'll leave
* the answer in 'g' and drop out of the if statement.
*/
long double g;
if (exponent >= 11) {
/*
* gamma of any positive value this large overflows, and gamma
* of any negative value underflows.
*/
if (!negative) {
long double huge = 0x1p+12288L;
return huge * huge; /* provoke an overflow */
} else {
long double tiny = 0x1p-12288L;
return tiny * tiny * negadjust; /* underflow, of the right sign */
}
} else if (exponent >= 3) {
/* Negative-number adjustment happens inside here */
return tgamma_large(x, negative, negadjust);
} else if (exponent < -113) {
/* Negative-number adjustment happens inside here */
return tgamma_ultratiny(x, negative, negadjust);
} else if (exponent < -5) {
/* Negative-number adjustment happens inside here */
return tgamma_tiny(x, negative, negadjust);
} else if (exponent == 0) {
g = tgamma_central(x);
} else if (exponent < 0) {
/*
* For x in [1/32,1) we range-reduce upwards to the interval
* [1,2), using the inverse of the normal recurrence formula:
* gamma(x) = gamma(x+1)/x.
*/
g = tgamma_central(1+x) / x;
} else {
/*
* For x in [2,8) we range-reduce downwards to the interval
* [1,2) by repeated application of the recurrence formula.
*
* Actually multiplying (x-1) by (x-2) by (x-3) and so on
* would introduce multiple ULPs of rounding error. We can get
* better accuracy by writing x = (k+1/2) + t, where k is an
* integer and |t|<1/2, and expanding out the obvious factor
* (x-1)(x-2)...(x-k+1) as a polynomial in t.
*/
long double mult;
int i = x;
if (i == 2) { /* x in [2,3) */
mult = (x-1);
} else {
long double t = x - (i + 0.5L);
switch (i) {
/* E.g. for x=3.5+t, we want
* (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */
case 3:
mult = 3.75L+t*(4.0L+t);
break;
case 4:
mult = 13.125L+t*(17.75L+t*(7.5L+t));
break;
case 5:
mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t)));
break;
case 6:
mult = 324.84375L+t*(570.5625L+t*(376.250L+t*(
117.5L+t*(17.5L+t))));
break;
case 7:
mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*(
1140.0L+t*(231.25L+t*(24.0L+t)))));
break;
}
}
g = tgamma_central(x - (i-1)) * mult;
}
if (!negative) {
/* Positive domain: return g unmodified */
return g;
} else {
/* Negative domain: apply the reflection formula as commented above */
return 1.0L / (g * x * negadjust);
}
}
#endif
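A minimal smoke test (hypothetical, not part of the library) using the identity Gamma(n+1) = n!: small factorials are exactly representable in binary128, so the printed differences should be zero or at most a few ulps.

/* Check tgamma128 against small factorials: Gamma(n+1) = n!.  */
#include <stdio.h>

long double tgamma128 (long double x);

int
main (void)
{
  long double fact = 1.0L;
  for (int n = 1; n <= 20; n++)
    {
      fact *= n;
      long double got = tgamma128 (n + 1.0L);
      printf ("n=%2d: tgamma128(n+1) - n! = %.3Lg\n", n, got - fact);
    }
  return 0;
}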

View File

@ -0,0 +1,141 @@
/*
* Polynomial coefficients and other constants for tgamma128.c.
*
* Copyright (c) 2006-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* The largest positive value for which 128-bit tgamma does not overflow. */
static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L;
/* Coefficients of the polynomial used in the tgamma_large() subroutine */
static const long double coeffs_large[] = {
0x1.8535745aa79569579b9eec0f3bbcp+0L,
0x1.0378f83c6fb8f0e51269f2b4a973p-3L,
0x1.59f6a05094f69686c3380f4e2783p-8L,
-0x1.0b291dee952a82764a4859b081a6p-8L,
-0x1.6dd301b2205bf936b5a3eaad0dbbp-12L,
0x1.387a8b5f38dd77e7f139b1021e86p-10L,
0x1.bca46637f65b13750c728cc29e40p-14L,
-0x1.d80401c00aef998c9e303151a51cp-11L,
-0x1.49cb6bb09f935a2053ccc2cf3711p-14L,
0x1.4e950204437dcaf2be77f73a6f45p-10L,
0x1.cb711a2d65f188bf60110934d6bep-14L,
-0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L,
-0x1.0305ab9760cddb0d833e73766836p-12L,
0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L,
0x1.bb4144740ad9290123fdcea684aap-11L,
-0x1.72ab4e88272a229bfafd192450f0p-5L,
0x1.80c70ac6eb3b7a698983d25a62b8p-12L,
0x1.e222791c6743ce3e3cae220fb236p-3L,
0x1.1a2dca1c82a9326c52b465f7cb7ap-2L,
-0x1.9d204fa235a42cd901b123d2ad47p+1L,
0x1.55b56d1158f77ddb1c95fc44ab02p+0L,
0x1.37f900a11dbd892abd7dde533e2dp+5L,
-0x1.2da49f4188dd89cb958369ef2401p+7L,
0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L,
-0x1.61433cebe649098c9611c4c7774ap+7L,
};
/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
static const long double coeffs_tiny[] = {
0x1.0000000000000000000000000000p+0L,
0x1.2788cfc6fb618f49a37c7f0201fep-1L,
-0x1.4fcf4026afa2dceb8490ade22796p-1L,
-0x1.5815e8fa27047c8f42b5d9217244p-5L,
0x1.5512320b43fbe5dfa771333518f7p-3L,
-0x1.59af103c340927bffdd44f954bfcp-5L,
-0x1.3b4af28483e210479657e5543366p-7L,
0x1.d919c527f6070bfce9b29c2ace9cp-8L,
-0x1.317112ce35337def3556a18aa178p-10L,
-0x1.c364fe77a6f27677b985b1fa2e1dp-13L,
0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L,
-0x1.51cf9f090b5dc398ba86305e3634p-16L,
-0x1.4e80f64c04a339740de06ca9fa4ap-20L,
0x1.241ddc2aef2ec20e58b08f2fda17p-20L,
};
/* The location within the interval [1,2] where gamma has a minimum.
* Specified as the sum of two 128-bit values, for extra precision. */
static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L;
static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L;
/* The actual minimum value that gamma takes at that location.
* Again specified as the sum of two 128-bit values. */
static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L;
static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L;
/* Coefficients of the polynomial used in the tgamma_central() subroutine
* for computing gamma on the interval [1,min_x] */
static const long double coeffs_central_neg[] = {
0x1.b6c53f7377b83839c8a292e43b69p-2L,
0x1.0bae9f40c7d09ed76e732045850ap-3L,
0x1.4981175e14d04c3530e51d01c5fep-3L,
0x1.79f77aaf032c948af3a9edbd2061p-4L,
0x1.1e97bd10821095a5b79fbfdfa1a3p-4L,
0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L,
0x1.0b44c2f92982f887b55ec36dfdb0p-5L,
0x1.6df1de1e178ef72ca7bd63d40870p-6L,
0x1.f63f502bde27e81c0f5e13479b43p-7L,
0x1.57fd67d901f40ea011353ad89a0ap-7L,
0x1.d7151376eed187eb753e2273cafcp-8L,
0x1.427162b5c6ff1d904c71ef53e37cp-8L,
0x1.b954b8c3a56cf93e49ef6538928ap-9L,
0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L,
0x1.9d35250d9b9378d9b59df734537ap-10L,
0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L,
0x1.7e0db39bb99cdb52b028d9359380p-11L,
0x1.2164b5e1d364a0b5eaf97c436aa7p-11L,
0x1.27521cf5fd24dcdf43524e6add11p-13L,
0x1.06461d62243bf9a826b42349672fp-10L,
-0x1.2b852abead28209b4e0c756dc46ep-9L,
0x1.be673c11a72c826115ec6d286c14p-8L,
-0x1.fd9ce330c215c31fcd3cb53c42ebp-7L,
0x1.fa362bd2dc68f41abef2d8600acdp-6L,
-0x1.a21585b2f52f8b23855de8e452edp-5L,
0x1.1f234431ed032052fc92e64e0493p-4L,
-0x1.40d332476ca0199c60cdae3f9132p-4L,
0x1.1d45dc665d86012eba2eea199cefp-4L,
-0x1.8491016cdd08dc9be7ade9b5fef3p-5L,
0x1.7e7e2fbc6d49ad484300d6add324p-6L,
-0x1.e63fe3f874a37276a8d7d8b705ecp-8L,
0x1.30a2a73944f8c84998314d69c23fp-10L,
};
/* Coefficients of the polynomial used in the tgamma_central() subroutine
* for computing gamma on the interval [min_x,2] */
static const long double coeffs_central_pos[] = {
0x1.b6c53f7377b83839c8a292e22aa2p-2L,
-0x1.0bae9f40c7d09ed76e72e1c955dep-3L,
0x1.4981175e14d04c3530ee5e1ecebcp-3L,
-0x1.79f77aaf032c948ac983d77f3e07p-4L,
0x1.1e97bd10821095ab7dc94936cc11p-4L,
-0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L,
0x1.0b44c2f929837fafef7b5d9e80f1p-5L,
-0x1.6df1de1e175fe2a51faa25cddbb4p-6L,
0x1.f63f502be57d11aed2cfe90843ffp-7L,
-0x1.57fd67d852f230015b9f64770273p-7L,
0x1.d715138adc07e5fce81077070357p-8L,
-0x1.4271618e9fda8992a667adb15f4fp-8L,
0x1.b954d15d9eb772e80fdd760672d7p-9L,
-0x1.2dfe391241d3cb79c8c15182843dp-9L,
0x1.9d44396fcd48451c3ba924cee814p-10L,
-0x1.1ac195fb99739e341589e39803e6p-10L,
0x1.82e46127b68f002770826e25f146p-11L,
-0x1.089dacd90d9f41493119ac178359p-11L,
0x1.6993c007b20394a057d21f3d37f8p-12L,
-0x1.ec43a709f4446560c099dec8e31bp-13L,
0x1.4ba36322f4074e9add9450f003cap-13L,
-0x1.b3f83a977965ca1b7937bf5b34cap-14L,
0x1.10af346abc09cb25a6d9fe810b6ep-14L,
-0x1.38d8ea1188f242f50203edc395bdp-15L,
0x1.39add987a948ec56f62b721a4475p-16L,
-0x1.02a4e141f286c8a967e2df9bc9adp-17L,
0x1.433b50af22425f546e87113062d7p-19L,
-0x1.0c7b73cb0013f00aafc103e8e382p-21L,
0x1.b852de313ec38da2297f6deaa6b4p-25L,
};
/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
*/
static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L;

View File

@ -0,0 +1,212 @@
# -*- julia -*-
#
# Generate tgamma128.h, containing polynomials and constants used by
# tgamma128.c.
#
# Copyright (c) 2006-2023, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
# This Julia program depends on the 'Remez' and 'SpecialFunctions'
# library packages. To install them, run this at the interactive Julia
# prompt:
#
# import Pkg; Pkg.add(["Remez", "SpecialFunctions"])
#
# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04).
import Printf
import Remez
import SpecialFunctions
# Round a BigFloat to 128-bit long double and format it as a C99 hex
# float literal.
function quadhex(x)
sign = " "
if x < 0
sign = "-"
x = -x
end
exponent = BigInt(floor(log2(x)))
exponent = max(exponent, -16382)
@assert(exponent <= 16383) # else overflow
x /= BigFloat(2)^exponent
@assert(1 <= x < 2)
x *= BigFloat(2)^112
mantissa = BigInt(round(x))
mantstr = string(mantissa, base=16, pad=29)
return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end],
exponent)
end
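# For example, quadhex(BigFloat(1)) produces " 0x1.0000000000000000000000000000p+0L"
# (one leading digit plus 28 hex fraction digits, i.e. the 112 fraction bits
# of the IEEE binary128 format).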
# Round a BigFloat to 128-bit long double and return it still as a
# BigFloat.
function quadval(x, round=0)
sign = +1
if x.sign < 0
sign = -1
x = -x
end
exponent = BigInt(floor(log2(x)))
exponent = max(exponent, -16382)
@assert(exponent <= 16383) # else overflow
x /= BigFloat(2)^exponent
@assert(1 <= x < 2)
x *= BigFloat(2)^112
if round < 0
mantissa = floor(x)
elseif round > 0
mantissa = ceil(x)
else
mantissa = Base.round(x) # the 'round' parameter shadows the builtin here
end
return sign * mantissa * BigFloat(2)^(exponent - 112)
end
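# round=-1 (round toward zero) is what the hi/lo splits below rely on:
# x - quadval(x, -1) is then a small non-negative remainder that supplies
# the 'lo' half of a double-quad representation.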
# Output an array of BigFloats as a C array declaration.
function dumparray(a, name)
println("static const long double ", name, "[] = {")
for x in a
println(" ", quadhex(x), ",")
end
println("};")
end
print("/*
* Polynomial coefficients and other constants for tgamma128.c.
*
* Copyright (c) 2006,2009,2023 Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
")
Base.MPFR.setprecision(512)
e = exp(BigFloat(1))
print("
/* The largest positive value for which 128-bit tgamma does not overflow. */
")
lo = BigFloat("1000")
hi = BigFloat("2000")
while true
global lo
global hi
global max_x
mid = (lo + hi) / 2
if mid == lo || mid == hi
max_x = mid
break
end
if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2))
lo = mid
else
hi = mid
end
end
max_x = quadval(max_x, -1)
println("static const long double max_x = ", quadhex(max_x), ";")
print("
/* Coefficients of the polynomial used in the tgamma_large() subroutine */
")
N, D, E, X = Remez.ratfn_minimax(
x -> x==0 ? sqrt(BigFloat(2)*pi/e) :
exp(SpecialFunctions.logabsgamma(1/x)[1] +
(1/x-0.5)*(1+log(x))),
(0, 1/BigFloat(8)),
24, 0,
(x, y) -> 1/y
)
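# The trailing (x, y) -> 1/y argument is Remez.jl's error-weighting function:
# dividing by the true value makes ratfn_minimax minimise relative rather
# than absolute error.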
dumparray(N, "coeffs_large")
print("
/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
")
N, D, E, X = Remez.ratfn_minimax(
x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)),
(0, 1/BigFloat(32)),
13, 0,
)
dumparray(N, "coeffs_tiny")
print("
/* The location within the interval [1,2] where gamma has a minimum.
* Specified as the sum of two 128-bit values, for extra precision. */
")
lo = BigFloat("1.4")
hi = BigFloat("1.5")
while true
global lo
global hi
global min_x
mid = (lo + hi) / 2
if mid == lo || mid == hi
min_x = mid
break
end
if SpecialFunctions.digamma(mid) < 0
lo = mid
else
hi = mid
end
end
min_x_hi = quadval(min_x, -1)
println("static const long double min_x_hi = ", quadhex(min_x_hi), ";")
println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";")
print("
/* The actual minimum value that gamma takes at that location.
* Again specified as the sum of two 128-bit values. */
")
min_y = SpecialFunctions.gamma(min_x)
min_y_hi = quadval(min_y, -1)
println("static const long double min_y_hi = ", quadhex(min_y_hi), ";")
println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";")
function taylor_bodge(x)
# Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2.
# Used in the Remez calls below for x values very near the origin, to avoid
# significance loss problems when trying to compute it directly via that
# formula (even in MPFR's extra precision).
return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506"))))
end
print("
/* Coefficients of the polynomial used in the tgamma_central() subroutine
* for computing gamma on the interval [1,min_x] */
")
N, D, E, X = Remez.ratfn_minimax(
x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) :
(SpecialFunctions.gamma(min_x - x) - min_y) / (x*x),
(0, min_x - 1),
31, 0,
(x, y) -> x^2,
)
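# The (x, y) -> x^2 weight cancels the 1/x^2 in the target function, so the
# minimax error is effectively measured on gamma(min_x - x) - min_y itself.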
dumparray(N, "coeffs_central_neg")
print("
/* Coefficients of the polynomial used in the tgamma_central() subroutine
* for computing gamma on the interval [min_x,2] */
")
N, D, E, X = Remez.ratfn_minimax(
x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) :
(SpecialFunctions.gamma(min_x + x) - min_y) / (x*x),
(0, 2 - min_x),
28, 0,
(x, y) -> x^2,
)
dumparray(N, "coeffs_central_pos")
print("
/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
*/
")
println("static const long double pi = ", quadhex(BigFloat(pi)), ";")

View File

@ -1,95 +0,0 @@
/*
* Double-precision vector cos function.
*
* Copyright (c) 2019-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static const double Poly[] = {
/* worst-case error is 3.5 ulp.
abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
-0x1.9f4a9c8b21dc9p-41,
0x1.60e88a10163f2p-33,
-0x1.ae6361b7254e7p-26,
0x1.71de382e8d62bp-19,
-0x1.a01a019aeb4ffp-13,
0x1.111111110b25ep-7,
-0x1.55555555554c3p-3,
};
#define C7 v_f64 (Poly[0])
#define C6 v_f64 (Poly[1])
#define C5 v_f64 (Poly[2])
#define C4 v_f64 (Poly[3])
#define C3 v_f64 (Poly[4])
#define C2 v_f64 (Poly[5])
#define C1 v_f64 (Poly[6])
#define InvPi v_f64 (0x1.45f306dc9c883p-2)
#define HalfPi v_f64 (0x1.921fb54442d18p+0)
#define Pi1 v_f64 (0x1.921fb54442d18p+1)
#define Pi2 v_f64 (0x1.1a62633145c06p-53)
#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
#define Shift v_f64 (0x1.8p52)
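/* Shift = 1.5 * 2^52: adding it forces the nearest integer into a double's
low mantissa bits, giving a branch-free round-to-nearest (see n below). */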
#define RangeVal v_f64 (0x1p23)
#define AbsMask v_u64 (0x7fffffffffffffff)
VPCS_ATTR
__attribute__ ((noinline)) static v_f64_t
specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
{
return v_call_f64 (cos, x, y, cmp);
}
VPCS_ATTR
v_f64_t
V_NAME(cos) (v_f64_t x)
{
v_f64_t n, r, r2, y;
v_u64_t odd, cmp;
r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
#if WANT_SIMD_EXCEPT
if (unlikely (v_any_u64 (cmp)))
/* If fenv exceptions are to be triggered correctly, set any special lanes
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
specialcase later. */
r = v_sel_f64 (cmp, v_f64 (1.0), r);
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
n = v_fma_f64 (InvPi, r + HalfPi, Shift);
odd = v_as_u64_f64 (n) << 63;
n -= Shift;
n -= v_f64 (0.5);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = v_fma_f64 (-Pi1, n, r);
r = v_fma_f64 (-Pi2, n, r);
r = v_fma_f64 (-Pi3, n, r);
/* sin(r) poly approx. */
r2 = r * r;
y = v_fma_f64 (C7, r2, C6);
y = v_fma_f64 (y, r2, C5);
y = v_fma_f64 (y, r2, C4);
y = v_fma_f64 (y, r2, C3);
y = v_fma_f64 (y, r2, C2);
y = v_fma_f64 (y, r2, C1);
y = v_fma_f64 (y * r2, r, r);
/* sign. */
y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd);
if (unlikely (v_any_u64 (cmp)))
return specialcase (x, y, cmp);
return y;
}
VPCS_ALIAS
#endif

View File

@ -1,84 +0,0 @@
/*
* Single-precision vector cos function.
*
* Copyright (c) 2019-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static const float Poly[] = {
/* 1.886 ulp error */
0x1.5b2e76p-19f,
-0x1.9f42eap-13f,
0x1.110df4p-7f,
-0x1.555548p-3f,
};
#define Pi1 v_f32 (0x1.921fb6p+1f)
#define Pi2 v_f32 (-0x1.777a5cp-24f)
#define Pi3 v_f32 (-0x1.ee59dap-49f)
#define A3 v_f32 (Poly[3])
#define A5 v_f32 (Poly[2])
#define A7 v_f32 (Poly[1])
#define A9 v_f32 (Poly[0])
#define RangeVal v_f32 (0x1p20f)
#define InvPi v_f32 (0x1.45f306p-2f)
#define Shift v_f32 (0x1.8p+23f)
#define AbsMask v_u32 (0x7fffffff)
#define HalfPi v_f32 (0x1.921fb6p0f)
VPCS_ATTR
static v_f32_t
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
{
/* Fall back to scalar code. */
return v_call_f32 (cosf, x, y, cmp);
}
VPCS_ATTR
v_f32_t
V_NAME(cosf) (v_f32_t x)
{
v_f32_t n, r, r2, y;
v_u32_t odd, cmp;
r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
#if WANT_SIMD_EXCEPT
if (unlikely (v_any_u32 (cmp)))
/* If fenv exceptions are to be triggered correctly, set any special lanes
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
specialcase later. */
r = v_sel_f32 (cmp, v_f32 (1.0f), r);
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5 */
n = v_fma_f32 (InvPi, r + HalfPi, Shift);
odd = v_as_u32_f32 (n) << 31;
n -= Shift;
n -= v_f32 (0.5f);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
r = v_fma_f32 (-Pi1, n, r);
r = v_fma_f32 (-Pi2, n, r);
r = v_fma_f32 (-Pi3, n, r);
/* y = sin(r) */
r2 = r * r;
y = v_fma_f32 (A9, r2, A7);
y = v_fma_f32 (y, r2, A5);
y = v_fma_f32 (y, r2, A3);
y = v_fma_f32 (y * r2, r, r);
/* sign fix */
y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd);
if (unlikely (v_any_u32 (cmp)))
return specialcase (x, y, cmp);
return y;
}
VPCS_ALIAS
#endif

View File

@ -1,128 +0,0 @@
/*
* Double-precision vector e^x function.
*
* Copyright (c) 2019-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
#include "v_exp.h"
#if V_EXP_TABLE_BITS == 7
/* maxerr: 1.88 +0.5 ulp
rel error: 1.4337*2^-53
abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
#define C1 v_f64 (0x1.ffffffffffd43p-2)
#define C2 v_f64 (0x1.55555c75adbb2p-3)
#define C3 v_f64 (0x1.55555da646206p-5)
#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */
#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */
#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63)
#elif V_EXP_TABLE_BITS == 8
/* maxerr: 0.54 +0.5 ulp
rel error: 1.4318*2^-58
abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */
#define C1 v_f64 (0x1.fffffffffffd4p-2)
#define C2 v_f64 (0x1.5555571d6b68cp-3)
#define C3 v_f64 (0x1.5555576a59599p-5)
#define InvLn2 v_f64 (0x1.71547652b82fep8)
#define Ln2hi v_f64 (0x1.62e42fefa39efp-9)
#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64)
#endif
#define N (1 << V_EXP_TABLE_BITS)
#define Tab __v_exp_data
#define IndexMask v_u64 (N - 1)
#define Shift v_f64 (0x1.8p+52)
#if WANT_SIMD_EXCEPT
#define TinyBound 0x200 /* top12 (asuint64 (0x1p-511)). */
#define BigBound 0x408 /* top12 (asuint64 (0x1p9)). */
VPCS_ATTR static NOINLINE v_f64_t
specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
{
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
routine for special lanes. */
return v_call_f64 (exp, x, y, cmp);
}
#else
#define Thres v_f64 (704.0)
VPCS_ATTR
static v_f64_t
specialcase (v_f64_t s, v_f64_t y, v_f64_t n)
{
v_f64_t absn = v_abs_f64 (n);
/* 2^(n/N) may overflow, break it up into s1*s2. */
v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000);
v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b);
v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b);
v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N));
v_f64_t r1 = s1 * s1;
v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1;
return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
}
#endif
VPCS_ATTR
v_f64_t
V_NAME(exp) (v_f64_t x)
{
v_f64_t n, r, r2, s, y, z;
v_u64_t cmp, u, e, i;
#if WANT_SIMD_EXCEPT
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
specialcase to fix special lanes later. This is only necessary if fenv
exceptions are to be triggered correctly. */
v_f64_t xm = x;
cmp = v_cond_u64 ((v_as_u64_f64 (v_abs_f64 (x)) >> 52) - TinyBound
>= BigBound - TinyBound);
if (unlikely (v_any_u64 (cmp)))
x = v_sel_f64 (cmp, v_f64 (1), x);
#else
cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
#endif
/* n = round(x/(ln2/N)). */
z = v_fma_f64 (x, InvLn2, Shift);
u = v_as_u64_f64 (z);
n = z - Shift;
/* r = x - n*ln2/N. */
r = x;
r = v_fma_f64 (-Ln2hi, n, r);
r = v_fma_f64 (-Ln2lo, n, r);
e = u << (52 - V_EXP_TABLE_BITS);
i = u & IndexMask;
/* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
r2 = r * r;
y = v_fma_f64 (C2, r, C1);
y = v_fma_f64 (C3, r2, y);
y = v_fma_f64 (y, r2, r);
/* s = 2^(n/N). */
u = v_lookup_u64 (Tab, i);
s = v_as_f64_u64 (u + e);
if (unlikely (v_any_u64 (cmp)))
#if WANT_SIMD_EXCEPT
return specialcase (xm, v_fma_f64 (y, s, s), cmp);
#else
return specialcase (s, y, n);
#endif
return v_fma_f64 (y, s, s);
}
VPCS_ALIAS
#endif

View File

@ -1,14 +0,0 @@
/*
* Declarations for double-precision e^x vector function.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#if WANT_VMATH
#define V_EXP_TABLE_BITS 7
extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
#endif

View File

@ -1,117 +0,0 @@
/*
* Single-precision vector 2^x function.
*
* Copyright (c) 2019-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static const float Poly[] = {
/* maxerr: 1.962 ulp. */
0x1.59977ap-10f,
0x1.3ce9e4p-7f,
0x1.c6bd32p-5f,
0x1.ebf9bcp-3f,
0x1.62e422p-1f,
};
#define C0 v_f32 (Poly[0])
#define C1 v_f32 (Poly[1])
#define C2 v_f32 (Poly[2])
#define C3 v_f32 (Poly[3])
#define C4 v_f32 (Poly[4])
#define Shift v_f32 (0x1.8p23f)
#if WANT_SIMD_EXCEPT
#define TinyBound 0x20000000 /* asuint (0x1p-63). */
#define BigBound 0x42800000 /* asuint (0x1p6). */
VPCS_ATTR
static NOINLINE v_f32_t
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
{
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
routine for special lanes. */
return v_call_f32 (exp2f, x, y, cmp);
}
#else
VPCS_ATTR
static v_f32_t
specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
{
/* 2^n may overflow, break it up into s1*s2. */
v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
v_f32_t s2 = v_as_f32_u32 (e - b);
v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
v_u32_t r2 = v_as_u32_f32 (s1 * s1);
v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
}
#endif
VPCS_ATTR
v_f32_t
V_NAME(exp2f) (v_f32_t x)
{
v_f32_t n, r, r2, scale, p, q, poly;
v_u32_t cmp, e;
#if WANT_SIMD_EXCEPT
cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound
>= BigBound - TinyBound);
v_f32_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
specialcase to fix special lanes later. This is only necessary if fenv
exceptions are to be triggered correctly. */
if (unlikely (v_any_u32 (cmp)))
x = v_sel_f32 (cmp, v_f32 (1), x);
#endif
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
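/* e.g. x = 2.75 splits as n = 3 and r = -0.25, so exp2(x) = 2^3 * 2^-0.25. */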
#if 0
v_f32_t z;
z = x + Shift;
n = z - Shift;
r = x - n;
e = v_as_u32_f32 (z) << 23;
#else
n = v_round_f32 (x);
r = x - n;
e = v_as_u32_s32 (v_round_s32 (x)) << 23;
#endif
scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
#if !WANT_SIMD_EXCEPT
v_f32_t absn = v_abs_f32 (n);
cmp = v_cond_u32 (absn > v_f32 (126.0f));
#endif
r2 = r * r;
p = v_fma_f32 (C0, r, C1);
q = v_fma_f32 (C2, r, C3);
q = v_fma_f32 (p, r2, q);
p = C4 * r;
poly = v_fma_f32 (q, r2, p);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp);
#else
return specialcase (poly, n, e, absn, cmp, scale);
#endif
return v_fma_f32 (poly, scale, scale);
}
VPCS_ALIAS
#endif

View File

@ -1,75 +0,0 @@
/*
* Single-precision vector 2^x function.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static const float Poly[] = {
/* maxerr: 0.878 ulp. */
0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
};
#define C0 v_f32 (Poly[0])
#define C1 v_f32 (Poly[1])
#define C2 v_f32 (Poly[2])
#define C3 v_f32 (Poly[3])
#define C4 v_f32 (Poly[4])
#define C5 v_f32 (Poly[5])
#define Shift v_f32 (0x1.8p23f)
#define InvLn2 v_f32 (0x1.715476p+0f)
#define Ln2hi v_f32 (0x1.62e4p-1f)
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
VPCS_ATTR
static v_f32_t
specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
{
/* 2^n may overflow, break it up into s1*s2. */
v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
v_f32_t s2 = v_as_f32_u32 (e - b);
v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
v_f32_t r1 = s1 * s1;
v_f32_t r0 = poly * s1 * s2;
return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
}
VPCS_ATTR
v_f32_t
V_NAME(exp2f_1u) (v_f32_t x)
{
v_f32_t n, r, scale, poly, absn;
v_u32_t cmp, e;
/* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
#if 0
v_f32_t z;
z = x + Shift;
n = z - Shift;
r = x - n;
e = v_as_u32_f32 (z) << 23;
#else
n = v_round_f32 (x);
r = x - n;
e = v_as_u32_s32 (v_round_s32 (x)) << 23;
#endif
scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
absn = v_abs_f32 (n);
cmp = v_cond_u32 (absn > v_f32 (126.0f));
poly = v_fma_f32 (C0, r, C1);
poly = v_fma_f32 (poly, r, C2);
poly = v_fma_f32 (poly, r, C3);
poly = v_fma_f32 (poly, r, C4);
poly = v_fma_f32 (poly, r, C5);
poly = v_fma_f32 (poly, r, v_f32 (1.0f));
if (unlikely (v_any_u32 (cmp)))
return specialcase (poly, n, e, absn);
return scale * poly;
}
#endif

View File

@ -1,122 +0,0 @@
/*
* Single-precision vector e^x function.
*
* Copyright (c) 2019-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static const float Poly[] = {
/* maxerr: 1.45358 +0.5 ulp. */
0x1.0e4020p-7f,
0x1.573e2ep-5f,
0x1.555e66p-3f,
0x1.fffdb6p-2f,
0x1.ffffecp-1f,
};
#define C0 v_f32 (Poly[0])
#define C1 v_f32 (Poly[1])
#define C2 v_f32 (Poly[2])
#define C3 v_f32 (Poly[3])
#define C4 v_f32 (Poly[4])
#define Shift v_f32 (0x1.8p23f)
#define InvLn2 v_f32 (0x1.715476p+0f)
#define Ln2hi v_f32 (0x1.62e4p-1f)
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
#if WANT_SIMD_EXCEPT
#define TinyBound 0x20000000 /* asuint (0x1p-63). */
#define BigBound 0x42800000 /* asuint (0x1p6). */
VPCS_ATTR
static NOINLINE v_f32_t
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
{
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
routine for special lanes. */
return v_call_f32 (expf, x, y, cmp);
}
#else
VPCS_ATTR
static v_f32_t
specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
{
/* 2^n may overflow, break it up into s1*s2. */
v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
v_f32_t s2 = v_as_f32_u32 (e - b);
v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
v_u32_t r2 = v_as_u32_f32 (s1 * s1);
v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
}
#endif
VPCS_ATTR
v_f32_t
V_NAME(expf) (v_f32_t x)
{
v_f32_t n, r, r2, scale, p, q, poly, z;
v_u32_t cmp, e;
#if WANT_SIMD_EXCEPT
cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound
>= BigBound - TinyBound);
v_f32_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
specialcase to fix special lanes later. This is only necessary if fenv
exceptions are to be triggered correctly. */
if (unlikely (v_any_u32 (cmp)))
x = v_sel_f32 (cmp, v_f32 (1), x);
#endif
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
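/* e.g. x = 1 gives n = round(1/ln2) = 1 and r = 1 - ln2 ~= 0.307. */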
#if 1
z = v_fma_f32 (x, InvLn2, Shift);
n = z - Shift;
r = v_fma_f32 (n, -Ln2hi, x);
r = v_fma_f32 (n, -Ln2lo, r);
e = v_as_u32_f32 (z) << 23;
#else
z = x * InvLn2;
n = v_round_f32 (z);
r = v_fma_f32 (n, -Ln2hi, x);
r = v_fma_f32 (n, -Ln2lo, r);
e = v_as_u32_s32 (v_round_s32 (z)) << 23;
#endif
scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
#if !WANT_SIMD_EXCEPT
v_f32_t absn = v_abs_f32 (n);
cmp = v_cond_u32 (absn > v_f32 (126.0f));
#endif
r2 = r * r;
p = v_fma_f32 (C0, r, C1);
q = v_fma_f32 (C2, r, C3);
q = v_fma_f32 (p, r2, q);
p = C4 * r;
poly = v_fma_f32 (q, r2, p);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp);
#else
return specialcase (poly, n, e, absn, cmp, scale);
#endif
return v_fma_f32 (poly, scale, scale);
}
VPCS_ALIAS
#endif

View File

@ -1,80 +0,0 @@
/*
* Single-precision vector e^x function.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static const float Poly[] = {
/* maxerr: 0.36565 +0.5 ulp. */
0x1.6a6000p-10f,
0x1.12718ep-7f,
0x1.555af0p-5f,
0x1.555430p-3f,
0x1.fffff4p-2f,
};
#define C0 v_f32 (Poly[0])
#define C1 v_f32 (Poly[1])
#define C2 v_f32 (Poly[2])
#define C3 v_f32 (Poly[3])
#define C4 v_f32 (Poly[4])
#define Shift v_f32 (0x1.8p23f)
#define InvLn2 v_f32 (0x1.715476p+0f)
#define Ln2hi v_f32 (0x1.62e4p-1f)
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
VPCS_ATTR
static v_f32_t
specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
{
/* 2^n may overflow, break it up into s1*s2. */
v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
v_f32_t s2 = v_as_f32_u32 (e - b);
v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
v_f32_t r1 = s1 * s1;
v_f32_t r0 = poly * s1 * s2;
return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
}
VPCS_ATTR
v_f32_t
V_NAME(expf_1u) (v_f32_t x)
{
v_f32_t n, r, scale, poly, absn, z;
v_u32_t cmp, e;
/* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
#if 1
z = v_fma_f32 (x, InvLn2, Shift);
n = z - Shift;
r = v_fma_f32 (n, -Ln2hi, x);
r = v_fma_f32 (n, -Ln2lo, r);
e = v_as_u32_f32 (z) << 23;
#else
z = x * InvLn2;
n = v_round_f32 (z);
r = v_fma_f32 (n, -Ln2hi, x);
r = v_fma_f32 (n, -Ln2lo, r);
e = v_as_u32_s32 (v_round_s32 (z)) << 23;
#endif
scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
absn = v_abs_f32 (n);
cmp = v_cond_u32 (absn > v_f32 (126.0f));
poly = v_fma_f32 (C0, r, C1);
poly = v_fma_f32 (poly, r, C2);
poly = v_fma_f32 (poly, r, C3);
poly = v_fma_f32 (poly, r, C4);
poly = v_fma_f32 (poly, r, v_f32 (1.0f));
poly = v_fma_f32 (poly, r, v_f32 (1.0f));
if (unlikely (v_any_u32 (cmp)))
return specialcase (poly, n, e, absn);
return scale * poly;
}
#endif

View File

@ -1,104 +0,0 @@
/*
* Double-precision vector log(x) function.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "v_log.h"
#if V_SUPPORTED
/* Worst-case error: 1.17 + 0.5 ulp. */
static const f64_t Poly[] = {
/* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
-0x1.ffffffffffff7p-2,
0x1.55555555170d4p-2,
-0x1.0000000399c27p-2,
0x1.999b2e90e94cap-3,
-0x1.554e550bd501ep-3,
};
#define A0 v_f64 (Poly[0])
#define A1 v_f64 (Poly[1])
#define A2 v_f64 (Poly[2])
#define A3 v_f64 (Poly[3])
#define A4 v_f64 (Poly[4])
#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
#define N (1 << V_LOG_TABLE_BITS)
#define OFF v_u64 (0x3fe6900900000000)
struct entry
{
v_f64_t invc;
v_f64_t logc;
};
static inline struct entry
lookup (v_u64_t i)
{
struct entry e;
#ifdef SCALAR
e.invc = __v_log_data[i].invc;
e.logc = __v_log_data[i].logc;
#else
e.invc[0] = __v_log_data[i[0]].invc;
e.logc[0] = __v_log_data[i[0]].logc;
e.invc[1] = __v_log_data[i[1]].invc;
e.logc[1] = __v_log_data[i[1]].logc;
#endif
return e;
}
VPCS_ATTR
__attribute__ ((noinline)) static v_f64_t
specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
{
return v_call_f64 (log, x, y, cmp);
}
VPCS_ATTR
v_f64_t
V_NAME(log) (v_f64_t x)
{
v_f64_t z, r, r2, p, y, kd, hi;
v_u64_t ix, iz, tmp, top, i, cmp;
v_s64_t k;
struct entry e;
ix = v_as_u64_f64 (x);
top = ix >> 48;
cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010));
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
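/* With V_LOG_TABLE_BITS = 7 there are N = 128 subintervals: i below takes
the top 7 fraction bits of tmp to select the subinterval containing z. */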
tmp = ix - OFF;
i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N;
k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */
iz = ix - (tmp & v_u64 (0xfffULL << 52));
z = v_as_f64_u64 (iz);
e = lookup (i);
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
kd = v_to_f64_s64 (k);
/* hi = r + log(c) + k*Ln2. */
hi = v_fma_f64 (kd, Ln2, e.logc + r);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
r2 = r * r;
y = v_fma_f64 (A3, r, A2);
p = v_fma_f64 (A1, r, A0);
y = v_fma_f64 (A4, r2, y);
y = v_fma_f64 (y, r2, p);
y = v_fma_f64 (y, r2, hi);
if (unlikely (v_any_u64 (cmp)))
return specialcase (x, y, cmp);
return y;
}
VPCS_ALIAS
#endif

View File

@ -1,18 +0,0 @@
/*
* Declarations for double-precision log(x) vector function.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#if WANT_VMATH
#define V_LOG_TABLE_BITS 7
extern const struct v_log_data
{
f64_t invc;
f64_t logc;
} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN;
#endif

View File

@ -1,158 +0,0 @@
/*
* Lookup table for double-precision log(x) vector function.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_log.h"
#if WANT_VMATH
#define N (1 << V_LOG_TABLE_BITS)
/* Algorithm:
x = 2^k z
log(x) = k ln2 + log(c) + poly(z/c - 1)
where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128)
and log(c) and 1/c for the ith subinterval comes from a lookup table:
tab[i].invc = 1/c
tab[i].logc = (double)log(c)
where c is near the center of the subinterval and is chosen by trying several
floating point invc candidates around 1/center and selecting one for which
the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval
that contains 1 and the previous one got tweaked to avoid cancellation. */
const struct v_log_data __v_log_data[N] = {
{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2},
{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2},
{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2},
{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2},
{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2},
{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2},
{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2},
{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2},
{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2},
{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2},
{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2},
{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2},
{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2},
{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2},
{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2},
{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2},
{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2},
{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2},
{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2},
{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3},
{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3},
{0x1.446f12b278001p+0, -0x1.e52e160484698p-3},
{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3},
{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3},
{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3},
{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3},
{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3},
{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3},
{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3},
{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3},
{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3},
{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3},
{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3},
{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3},
{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3},
{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3},
{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3},
{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3},
{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3},
{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3},
{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3},
{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3},
{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3},
{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3},
{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3},
{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4},
{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4},
{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4},
{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4},
{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4},
{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4},
{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4},
{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4},
{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4},
{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4},
{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4},
{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4},
{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4},
{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4},
{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4},
{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5},
{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5},
{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5},
{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5},
{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5},
{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5},
{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5},
{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5},
{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6},
{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6},
{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6},
{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6},
{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7},
{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7},
{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9},
{1.0, 0.0},
{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8},
{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7},
{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6},
{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6},
{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5},
{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5},
{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5},
{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5},
{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4},
{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4},
{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4},
{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4},
{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4},
{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4},
{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4},
{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4},
{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4},
{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3},
{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3},
{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3},
{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3},
{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3},
{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3},
{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3},
{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3},
{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3},
{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3},
{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3},
{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3},
{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3},
{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3},
{0x1.9998e1480b618p-1, 0x1.c903161240163p-3},
{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3},
{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3},
{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3},
{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3},
{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2},
{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2},
{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2},
{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2},
{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2},
{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2},
{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2},
{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2},
{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2},
{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2},
{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2},
{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2},
{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2},
{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2},
{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2},
{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2},
};
#endif

View File

@ -1,73 +0,0 @@
/*
* Single-precision vector log function.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static const float Poly[] = {
/* 3.34 ulp error */
-0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f,
-0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f,
};
#define P7 v_f32 (Poly[0])
#define P6 v_f32 (Poly[1])
#define P5 v_f32 (Poly[2])
#define P4 v_f32 (Poly[3])
#define P3 v_f32 (Poly[4])
#define P2 v_f32 (Poly[5])
#define P1 v_f32 (Poly[6])
#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */
#define Min v_u32 (0x00800000)
#define Max v_u32 (0x7f800000)
#define Mask v_u32 (0x007fffff)
#define Off v_u32 (0x3f2aaaab) /* 0.666667 */
VPCS_ATTR
__attribute__ ((noinline)) static v_f32_t
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
{
/* Fall back to scalar code. */
return v_call_f32 (logf, x, y, cmp);
}
VPCS_ATTR
v_f32_t
V_NAME(logf) (v_f32_t x)
{
v_f32_t n, p, q, r, r2, y;
v_u32_t u, cmp;
u = v_as_u32_f32 (x);
cmp = v_cond_u32 (u - Min >= Max - Min);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */
u -= Off;
n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */
u &= Mask;
u += Off;
r = v_as_f32_u32 (u) - v_f32 (1.0f);
/* y = log(1+r) + n*ln2. */
r2 = r * r;
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
p = v_fma_f32 (P6, r, P5);
q = v_fma_f32 (P4, r, P3);
y = v_fma_f32 (P2, r, P1);
p = v_fma_f32 (P7, r2, p);
q = v_fma_f32 (p, r2, q);
y = v_fma_f32 (q, r2, y);
p = v_fma_f32 (Ln2, n, r);
y = v_fma_f32 (y, r2, p);
if (unlikely (v_any_u32 (cmp)))
return specialcase (x, y, cmp);
return y;
}
VPCS_ALIAS
#endif

View File

@ -1,661 +0,0 @@
/*
* Vector math abstractions.
*
* Copyright (c) 2019-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _V_MATH_H
#define _V_MATH_H
#ifndef WANT_VMATH
/* Enable the build of vector math code. */
# define WANT_VMATH 1
#endif
#if WANT_VMATH
/* The goal of this header is to allow vector and scalar
builds of the same algorithm; the provided intrinsic
wrappers are also vector-length agnostic, so they can
be implemented for SVE too (or other SIMD architectures),
and then the code should work on those targets too. */
#if SCALAR
#define V_NAME(x) __s_##x
#elif VPCS && __aarch64__
#define V_NAME(x) __vn_##x
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
#else
#define V_NAME(x) __v_##x
#endif
#ifndef VPCS_ATTR
#define VPCS_ATTR
#endif
#ifndef VPCS_ALIAS
#define VPCS_ALIAS
#endif
#include <stdint.h>
#include "math_config.h"
typedef float f32_t;
typedef uint32_t u32_t;
typedef int32_t s32_t;
typedef double f64_t;
typedef uint64_t u64_t;
typedef int64_t s64_t;
/* reinterpret as type1 from type2. */
static inline u32_t
as_u32_f32 (f32_t x)
{
union { f32_t f; u32_t u; } r = {x};
return r.u;
}
static inline f32_t
as_f32_u32 (u32_t x)
{
union { u32_t u; f32_t f; } r = {x};
return r.f;
}
static inline s32_t
as_s32_u32 (u32_t x)
{
union { u32_t u; s32_t i; } r = {x};
return r.i;
}
static inline u32_t
as_u32_s32 (s32_t x)
{
union { s32_t i; u32_t u; } r = {x};
return r.u;
}
static inline u64_t
as_u64_f64 (f64_t x)
{
union { f64_t f; u64_t u; } r = {x};
return r.u;
}
static inline f64_t
as_f64_u64 (u64_t x)
{
union { u64_t u; f64_t f; } r = {x};
return r.f;
}
static inline s64_t
as_s64_u64 (u64_t x)
{
union { u64_t u; s64_t i; } r = {x};
return r.i;
}
static inline u64_t
as_u64_s64 (s64_t x)
{
union { s64_t i; u64_t u; } r = {x};
return r.u;
}
#if SCALAR
#define V_SUPPORTED 1
typedef f32_t v_f32_t;
typedef u32_t v_u32_t;
typedef s32_t v_s32_t;
typedef f64_t v_f64_t;
typedef u64_t v_u64_t;
typedef s64_t v_s64_t;
static inline int
v_lanes32 (void)
{
return 1;
}
static inline v_f32_t
v_f32 (f32_t x)
{
return x;
}
static inline v_u32_t
v_u32 (u32_t x)
{
return x;
}
static inline v_s32_t
v_s32 (s32_t x)
{
return x;
}
static inline f32_t
v_get_f32 (v_f32_t x, int i)
{
return x;
}
static inline u32_t
v_get_u32 (v_u32_t x, int i)
{
return x;
}
static inline s32_t
v_get_s32 (v_s32_t x, int i)
{
return x;
}
static inline void
v_set_f32 (v_f32_t *x, int i, f32_t v)
{
*x = v;
}
static inline void
v_set_u32 (v_u32_t *x, int i, u32_t v)
{
*x = v;
}
static inline void
v_set_s32 (v_s32_t *x, int i, s32_t v)
{
*x = v;
}
/* true if any element of a v_cond result is non-zero. */
static inline int
v_any_u32 (v_u32_t x)
{
return x != 0;
}
/* to wrap the result of relational operators. */
static inline v_u32_t
v_cond_u32 (v_u32_t x)
{
return x ? -1 : 0;
}
static inline v_f32_t
v_abs_f32 (v_f32_t x)
{
return __builtin_fabsf (x);
}
static inline v_f32_t
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
{
return __builtin_fmaf (x, y, z);
}
static inline v_f32_t
v_round_f32 (v_f32_t x)
{
return __builtin_roundf (x);
}
static inline v_s32_t
v_round_s32 (v_f32_t x)
{
return __builtin_lroundf (x); /* relies on -fno-math-errno. */
}
static inline v_f32_t
v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
{
return p ? x : y;
}
/* convert to type1 from type2. */
static inline v_f32_t
v_to_f32_s32 (v_s32_t x)
{
return x;
}
static inline v_f32_t
v_to_f32_u32 (v_u32_t x)
{
return x;
}
/* reinterpret as type1 from type2. */
static inline v_u32_t
v_as_u32_f32 (v_f32_t x)
{
union { v_f32_t f; v_u32_t u; } r = {x};
return r.u;
}
static inline v_f32_t
v_as_f32_u32 (v_u32_t x)
{
union { v_u32_t u; v_f32_t f; } r = {x};
return r.f;
}
static inline v_s32_t
v_as_s32_u32 (v_u32_t x)
{
union { v_u32_t u; v_s32_t i; } r = {x};
return r.i;
}
static inline v_u32_t
v_as_u32_s32 (v_s32_t x)
{
union { v_s32_t i; v_u32_t u; } r = {x};
return r.u;
}
static inline v_f32_t
v_lookup_f32 (const f32_t *tab, v_u32_t idx)
{
return tab[idx];
}
static inline v_u32_t
v_lookup_u32 (const u32_t *tab, v_u32_t idx)
{
return tab[idx];
}
static inline v_f32_t
v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
{
return f (x);
}
static inline v_f32_t
v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
v_u32_t p)
{
return f (x1, x2);
}
static inline int
v_lanes64 (void)
{
return 1;
}
static inline v_f64_t
v_f64 (f64_t x)
{
return x;
}
static inline v_u64_t
v_u64 (u64_t x)
{
return x;
}
static inline v_s64_t
v_s64 (s64_t x)
{
return x;
}
static inline f64_t
v_get_f64 (v_f64_t x, int i)
{
return x;
}
static inline void
v_set_f64 (v_f64_t *x, int i, f64_t v)
{
*x = v;
}
/* true if any element of a v_cond result is non-zero. */
static inline int
v_any_u64 (v_u64_t x)
{
return x != 0;
}
/* to wrap the result of relational operators. */
static inline v_u64_t
v_cond_u64 (v_u64_t x)
{
return x ? -1 : 0;
}
static inline v_f64_t
v_abs_f64 (v_f64_t x)
{
return __builtin_fabs (x);
}
static inline v_f64_t
v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
{
return __builtin_fma (x, y, z);
}
static inline v_f64_t
v_round_f64 (v_f64_t x)
{
return __builtin_round (x);
}
static inline v_s64_t
v_round_s64 (v_f64_t x)
{
return __builtin_lround (x); /* relies on -fno-math-errno. */
}
static inline v_f64_t
v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
{
return p ? x : y;
}
/* convert to type1 from type2. */
static inline v_f64_t
v_to_f64_s64 (v_s64_t x)
{
return x;
}
static inline v_f64_t
v_to_f64_u64 (v_u64_t x)
{
return x;
}
/* reinterpret as type1 from type2. */
static inline v_u64_t
v_as_u64_f64 (v_f64_t x)
{
union { v_f64_t f; v_u64_t u; } r = {x};
return r.u;
}
static inline v_f64_t
v_as_f64_u64 (v_u64_t x)
{
union { v_u64_t u; v_f64_t f; } r = {x};
return r.f;
}
static inline v_s64_t
v_as_s64_u64 (v_u64_t x)
{
union { v_u64_t u; v_s64_t i; } r = {x};
return r.i;
}
static inline v_u64_t
v_as_u64_s64 (v_s64_t x)
{
union { v_s64_t i; v_u64_t u; } r = {x};
return r.u;
}
static inline v_f64_t
v_lookup_f64 (const f64_t *tab, v_u64_t idx)
{
return tab[idx];
}
static inline v_u64_t
v_lookup_u64 (const u64_t *tab, v_u64_t idx)
{
return tab[idx];
}
static inline v_f64_t
v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
{
return f (x);
}
#elif __aarch64__
#define V_SUPPORTED 1
#include <arm_neon.h>
typedef float32x4_t v_f32_t;
typedef uint32x4_t v_u32_t;
typedef int32x4_t v_s32_t;
typedef float64x2_t v_f64_t;
typedef uint64x2_t v_u64_t;
typedef int64x2_t v_s64_t;
static inline int
v_lanes32 (void)
{
return 4;
}
static inline v_f32_t
v_f32 (f32_t x)
{
return (v_f32_t){x, x, x, x};
}
static inline v_u32_t
v_u32 (u32_t x)
{
return (v_u32_t){x, x, x, x};
}
static inline v_s32_t
v_s32 (s32_t x)
{
return (v_s32_t){x, x, x, x};
}
static inline f32_t
v_get_f32 (v_f32_t x, int i)
{
return x[i];
}
static inline u32_t
v_get_u32 (v_u32_t x, int i)
{
return x[i];
}
static inline s32_t
v_get_s32 (v_s32_t x, int i)
{
return x[i];
}
static inline void
v_set_f32 (v_f32_t *x, int i, f32_t v)
{
(*x)[i] = v;
}
static inline void
v_set_u32 (v_u32_t *x, int i, u32_t v)
{
(*x)[i] = v;
}
static inline void
v_set_s32 (v_s32_t *x, int i, s32_t v)
{
(*x)[i] = v;
}
/* true if any element of a v_cond result is non-zero. */
static inline int
v_any_u32 (v_u32_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
}
/* to wrap the result of relational operators. */
static inline v_u32_t
v_cond_u32 (v_u32_t x)
{
return x;
}
static inline v_f32_t
v_abs_f32 (v_f32_t x)
{
return vabsq_f32 (x);
}
static inline v_f32_t
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
{
return vfmaq_f32 (z, x, y);
}
static inline v_f32_t
v_round_f32 (v_f32_t x)
{
return vrndaq_f32 (x);
}
static inline v_s32_t
v_round_s32 (v_f32_t x)
{
return vcvtaq_s32_f32 (x);
}
static inline v_f32_t
v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
{
return vbslq_f32 (p, x, y);
}
/* convert to type1 from type2. */
static inline v_f32_t
v_to_f32_s32 (v_s32_t x)
{
return (v_f32_t){x[0], x[1], x[2], x[3]};
}
static inline v_f32_t
v_to_f32_u32 (v_u32_t x)
{
return (v_f32_t){x[0], x[1], x[2], x[3]};
}
/* reinterpret as type1 from type2. */
static inline v_u32_t
v_as_u32_f32 (v_f32_t x)
{
union { v_f32_t f; v_u32_t u; } r = {x};
return r.u;
}
static inline v_f32_t
v_as_f32_u32 (v_u32_t x)
{
union { v_u32_t u; v_f32_t f; } r = {x};
return r.f;
}
static inline v_s32_t
v_as_s32_u32 (v_u32_t x)
{
union { v_u32_t u; v_s32_t i; } r = {x};
return r.i;
}
static inline v_u32_t
v_as_u32_s32 (v_s32_t x)
{
union { v_s32_t i; v_u32_t u; } r = {x};
return r.u;
}
static inline v_f32_t
v_lookup_f32 (const f32_t *tab, v_u32_t idx)
{
return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
}
static inline v_u32_t
v_lookup_u32 (const u32_t *tab, v_u32_t idx)
{
return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
}
static inline v_f32_t
v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
{
return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
}
static inline v_f32_t
v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
v_u32_t p)
{
return (
v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
}
static inline int
v_lanes64 (void)
{
return 2;
}
static inline v_f64_t
v_f64 (f64_t x)
{
return (v_f64_t){x, x};
}
static inline v_u64_t
v_u64 (u64_t x)
{
return (v_u64_t){x, x};
}
static inline v_s64_t
v_s64 (s64_t x)
{
return (v_s64_t){x, x};
}
static inline f64_t
v_get_f64 (v_f64_t x, int i)
{
return x[i];
}
static inline void
v_set_f64 (v_f64_t *x, int i, f64_t v)
{
(*x)[i] = v;
}
/* true if any element of a v_cond result is non-zero. */
static inline int
v_any_u64 (v_u64_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_u64 (x) != 0;
}
/* to wrap the result of relational operators. */
static inline v_u64_t
v_cond_u64 (v_u64_t x)
{
return x;
}
static inline v_f64_t
v_abs_f64 (v_f64_t x)
{
return vabsq_f64 (x);
}
static inline v_f64_t
v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
{
return vfmaq_f64 (z, x, y);
}
static inline v_f64_t
v_round_f64 (v_f64_t x)
{
return vrndaq_f64 (x);
}
static inline v_s64_t
v_round_s64 (v_f64_t x)
{
return vcvtaq_s64_f64 (x);
}
static inline v_f64_t
v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
{
return vbslq_f64 (p, x, y);
}
/* convert to type1 from type2. */
static inline v_f64_t
v_to_f64_s64 (v_s64_t x)
{
return (v_f64_t){x[0], x[1]};
}
static inline v_f64_t
v_to_f64_u64 (v_u64_t x)
{
return (v_f64_t){x[0], x[1]};
}
/* reinterpret as type1 from type2. */
static inline v_u64_t
v_as_u64_f64 (v_f64_t x)
{
union { v_f64_t f; v_u64_t u; } r = {x};
return r.u;
}
static inline v_f64_t
v_as_f64_u64 (v_u64_t x)
{
union { v_u64_t u; v_f64_t f; } r = {x};
return r.f;
}
static inline v_s64_t
v_as_s64_u64 (v_u64_t x)
{
union { v_u64_t u; v_s64_t i; } r = {x};
return r.i;
}
static inline v_u64_t
v_as_u64_s64 (v_s64_t x)
{
union { v_s64_t i; v_u64_t u; } r = {x};
return r.u;
}
static inline v_f64_t
v_lookup_f64 (const f64_t *tab, v_u64_t idx)
{
return (v_f64_t){tab[idx[0]], tab[idx[1]]};
}
static inline v_u64_t
v_lookup_u64 (const u64_t *tab, v_u64_t idx)
{
return (v_u64_t){tab[idx[0]], tab[idx[1]]};
}
static inline v_f64_t
v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
{
return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
}
#endif
#endif
#endif

View File

@ -1,27 +0,0 @@
/*
* Double-precision vector pow function.
*
* Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
VPCS_ATTR
v_f64_t
V_NAME(pow) (v_f64_t x, v_f64_t y)
{
v_f64_t z;
for (int lane = 0; lane < v_lanes64 (); lane++)
{
f64_t sx = v_get_f64 (x, lane);
f64_t sy = v_get_f64 (y, lane);
f64_t sz = pow (sx, sy);
v_set_f64 (&z, lane, sz);
}
return z;
}
VPCS_ALIAS
#endif

View File

@ -1,235 +0,0 @@
/*
* Single-precision vector powf function.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
#define Min v_u32 (0x00800000)
#define Max v_u32 (0x7f800000)
#define SBITS 5
#define Tlog v__powf_log2_data.tab
#define Texp v__exp2f_data.tab
#define A v__powf_log2_data.poly
#define C v__exp2f_data.poly
#define LOGDEG 4
#if LOGDEG == 5
/* 1.01 ulp */
#define OFF v_u32 (0x3f330000)
#define TBITS 4
#elif LOGDEG == 4
/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */
#define OFF v_u32 (0x3f35d000)
#define TBITS 5
#endif
#define V_EXP2F_TABLE_BITS SBITS
#define V_EXP2F_POLY_ORDER 3
struct v_exp2f_data
{
uint64_t tab[1 << V_EXP2F_TABLE_BITS];
double poly[V_EXP2F_POLY_ORDER];
};
#define V_POWF_LOG2_TABLE_BITS TBITS
#define V_POWF_LOG2_POLY_ORDER LOGDEG
#define SCALE ((double) (1 << SBITS))
struct v_powf_log2_data
{
struct
{
double invc, logc;
} tab[1 << V_POWF_LOG2_TABLE_BITS];
double poly[V_POWF_LOG2_POLY_ORDER];
};
static const struct v_powf_log2_data v__powf_log2_data = {
#if LOGDEG == 5
.tab = {
{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE },
{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE },
{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE },
{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE },
{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE },
{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE },
{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE },
{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE },
{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE },
{ 0x1p+0, 0x0p+0 * SCALE },
{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE },
{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE },
{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE },
{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE },
{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE },
{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE },
},
/* rel err: 1.46 * 2^-32 */
.poly = {
0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE,
0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE,
0x1.71547652ab82bp0 * SCALE,
}
#elif LOGDEG == 4
.tab = {
{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE},
{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE},
{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE},
{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE},
{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE},
{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE},
{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE},
{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE},
{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE},
{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE},
{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE},
{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE},
{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE},
{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE},
{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE},
{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE},
{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE},
{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE},
{0x1p+0, 0x0p+0 * SCALE},
{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE},
{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE},
{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE},
{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE},
{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE},
{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE},
{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE},
{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE},
{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE},
{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE},
{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE},
{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE},
{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE},
},
/* rel err: 1.5 * 2^-30 */
.poly = {
-0x1.6ff5daa3b3d7cp-2 * SCALE,
0x1.ec81d03c01aebp-2 * SCALE,
-0x1.71547bb43f101p-1 * SCALE,
0x1.7154764a815cbp0 * SCALE,
}
#endif
};
static const struct v_exp2f_data v__exp2f_data = {
.tab = {
0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
},
/* rel err: 1.69 * 2^-34 */
.poly = {
0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE
},
};
VPCS_ATTR
__attribute__ ((noinline)) static v_f32_t
specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp)
{
return v_call2_f32 (powf, x, y, ret, cmp);
}
VPCS_ATTR
v_f32_t
V_NAME(powf) (v_f32_t x, v_f32_t y)
{
v_u32_t u, tmp, cmp, i, top, iz;
v_s32_t k;
v_f32_t ret;
u = v_as_u32_f32 (x);
cmp = v_cond_u32 (u - Min >= Max - Min);
tmp = u - OFF;
i = (tmp >> (23 - TBITS)) % (1 << TBITS);
top = tmp & 0xff800000;
iz = u - top;
k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */
for (int lane = 0; lane < v_lanes32 (); lane++)
{
uint32_t si, siz;
int32_t sk;
float sy;
/* Use double precision for each lane. */
double invc, logc, z, r, p, y0, logx, ylogx, kd, s;
uint64_t ki, t;
si = v_get_u32 (i, lane);
siz = v_get_u32 (iz, lane);
sk = v_get_s32 (k, lane);
sy = v_get_f32 (y, lane);
invc = Tlog[si].invc;
logc = Tlog[si].logc;
z = (double) as_f32_u32 (siz);
/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
r = __builtin_fma (z, invc, -1.0);
y0 = logc + (double) sk;
/* Polynomial to approximate log1p(r)/ln2. */
#if LOGDEG == 5
logx = A[0];
logx = r * logx + A[1];
logx = r * logx + A[2];
logx = r * logx + A[3];
logx = r * logx + A[4];
logx = r * logx + y0;
#elif LOGDEG == 4
logx = A[0];
logx = r * logx + A[1];
logx = r * logx + A[2];
logx = r * logx + A[3];
logx = r * logx + y0;
#endif
ylogx = sy * logx;
v_set_u32 (&cmp, lane,
(as_u64_f64 (ylogx) >> 47 & 0xffff)
>= as_u64_f64 (126.0 * (1 << SBITS)) >> 47
? 1
: v_get_u32 (cmp, lane));
/* N*x = k + r with r in [-1/2, 1/2] */
#if TOINT_INTRINSICS
kd = roundtoint (ylogx); /* k */
ki = converttoint (ylogx);
#else
# define SHIFT 0x1.8p52
kd = eval_as_double (ylogx + SHIFT);
ki = asuint64 (kd);
kd -= SHIFT;
#endif
r = ylogx - kd;
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
t = Texp[ki % (1 << SBITS)];
t += ki << (52 - SBITS);
s = as_f64_u64 (t);
p = C[0];
p = __builtin_fma (p, r, C[1]);
p = __builtin_fma (p, r, C[2]);
p = __builtin_fma (p, s * r, s);
v_set_f32 (&ret, lane, p);
}
if (unlikely (v_any_u32 (cmp)))
return specialcase (x, y, ret, cmp);
return ret;
}
VPCS_ALIAS
#endif

View File

@ -1,103 +0,0 @@
/*
* Double-precision vector sin function.
*
* Copyright (c) 2019-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static const double Poly[] = {
/* worst-case error is 3.5 ulp.
abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
-0x1.9f4a9c8b21dc9p-41,
0x1.60e88a10163f2p-33,
-0x1.ae6361b7254e7p-26,
0x1.71de382e8d62bp-19,
-0x1.a01a019aeb4ffp-13,
0x1.111111110b25ep-7,
-0x1.55555555554c3p-3,
};
#define C7 v_f64 (Poly[0])
#define C6 v_f64 (Poly[1])
#define C5 v_f64 (Poly[2])
#define C4 v_f64 (Poly[3])
#define C3 v_f64 (Poly[4])
#define C2 v_f64 (Poly[5])
#define C1 v_f64 (Poly[6])
#define InvPi v_f64 (0x1.45f306dc9c883p-2)
#define Pi1 v_f64 (0x1.921fb54442d18p+1)
#define Pi2 v_f64 (0x1.1a62633145c06p-53)
#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
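/* pi split into three pieces (extended Cody-Waite range reduction): r = |x| - n*pi
is computed with three FMAs below so pi's representation error stays negligible. */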
#define Shift v_f64 (0x1.8p52)
#define AbsMask v_u64 (0x7fffffffffffffff)
#if WANT_SIMD_EXCEPT
#define TinyBound 0x202 /* top12 (asuint64 (0x1p-509)). */
#define Thresh 0x214 /* top12 (asuint64 (RangeVal)) - TinyBound. */
#else
#define RangeVal v_f64 (0x1p23)
#endif
VPCS_ATTR
__attribute__ ((noinline)) static v_f64_t
specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
{
return v_call_f64 (sin, x, y, cmp);
}
VPCS_ATTR
v_f64_t
V_NAME(sin) (v_f64_t x)
{
v_f64_t n, r, r2, y;
v_u64_t sign, odd, cmp, ir;
ir = v_as_u64_f64 (x) & AbsMask;
r = v_as_f64_u64 (ir);
sign = v_as_u64_f64 (x) & ~AbsMask;
#if WANT_SIMD_EXCEPT
/* Detect |x| <= 0x1p-509 or |x| >= RangeVal. If fenv exceptions are to be
triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
fenv). These lanes will be fixed by specialcase later. */
cmp = v_cond_u64 ((ir >> 52) - TinyBound >= Thresh);
if (unlikely (v_any_u64 (cmp)))
r = v_sel_f64 (cmp, v_f64 (1), r);
#else
cmp = v_cond_u64 (ir >= v_as_u64_f64 (RangeVal));
#endif
/* n = rint(|x|/pi). */
n = v_fma_f64 (InvPi, r, Shift);
odd = v_as_u64_f64 (n) << 63;
n -= Shift;
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = v_fma_f64 (-Pi1, n, r);
r = v_fma_f64 (-Pi2, n, r);
r = v_fma_f64 (-Pi3, n, r);
/* sin(r) poly approx. */
r2 = r * r;
y = v_fma_f64 (C7, r2, C6);
y = v_fma_f64 (y, r2, C5);
y = v_fma_f64 (y, r2, C4);
y = v_fma_f64 (y, r2, C3);
y = v_fma_f64 (y, r2, C2);
y = v_fma_f64 (y, r2, C1);
y = v_fma_f64 (y * r2, r, r);
/* sign. */
y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd);
if (unlikely (v_any_u64 (cmp)))
return specialcase (x, y, cmp);
return y;
}
VPCS_ALIAS
#endif

View File

@ -1,88 +0,0 @@
/*
* Single-precision vector sin function.
*
* Copyright (c) 2019-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static const float Poly[] = {
/* 1.886 ulp error */
0x1.5b2e76p-19f,
-0x1.9f42eap-13f,
0x1.110df4p-7f,
-0x1.555548p-3f,
};
#define Pi1 v_f32 (0x1.921fb6p+1f)
#define Pi2 v_f32 (-0x1.777a5cp-24f)
#define Pi3 v_f32 (-0x1.ee59dap-49f)
#define A3 v_f32 (Poly[3])
#define A5 v_f32 (Poly[2])
#define A7 v_f32 (Poly[1])
#define A9 v_f32 (Poly[0])
#define RangeVal v_f32 (0x1p20f)
#define TinyBound v_f32 (0x1p-61f)
#define InvPi v_f32 (0x1.45f306p-2f)
#define Shift v_f32 (0x1.8p+23f)
#define AbsMask v_u32 (0x7fffffff)
VPCS_ATTR
static v_f32_t
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
{
/* Fall back to scalar code. */
return v_call_f32 (sinf, x, y, cmp);
}
VPCS_ATTR
v_f32_t
V_NAME(sinf) (v_f32_t x)
{
v_f32_t n, r, r2, y;
v_u32_t sign, odd, cmp, ir;
ir = v_as_u32_f32 (x) & AbsMask;
r = v_as_f32_u32 (ir);
sign = v_as_u32_f32 (x) & ~AbsMask;
#if WANT_SIMD_EXCEPT
cmp = v_cond_u32 ((ir - v_as_u32_f32 (TinyBound)
>= v_as_u32_f32 (RangeVal) - v_as_u32_f32 (TinyBound)));
if (unlikely (v_any_u32 (cmp)))
/* If fenv exceptions are to be triggered correctly, set any special lanes
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
specialcase later. */
r = v_sel_f32 (cmp, v_f32 (1), r);
#else
cmp = v_cond_u32 (ir >= v_as_u32_f32 (RangeVal));
#endif
/* n = rint(|x|/pi) */
n = v_fma_f32 (InvPi, r, Shift);
odd = v_as_u32_f32 (n) << 31;
n -= Shift;
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
r = v_fma_f32 (-Pi1, n, r);
r = v_fma_f32 (-Pi2, n, r);
r = v_fma_f32 (-Pi3, n, r);
/* y = sin(r) */
r2 = r * r;
y = v_fma_f32 (A9, r2, A7);
y = v_fma_f32 (y, r2, A5);
y = v_fma_f32 (y, r2, A3);
y = v_fma_f32 (y * r2, r, r);
/* sign fix */
y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd);
if (unlikely (v_any_u32 (cmp)))
return specialcase (x, y, cmp);
return y;
}
VPCS_ALIAS
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_cos.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos)
#include "v_cos.c"
#endif
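
The _ZGVnN2v_cos alias follows the AArch64 vector function ABI name mangling: _ZGV, then n for AdvSIMD, N for unmasked, 2 for two lanes, v for one vector argument. A hypothetical direct caller, for illustration only — in practice the compiler emits such calls itself when vectorising loops over cos():

#include <arm_neon.h>

/* Declare the vector-PCS symbol provided by the alias above. */
__attribute__ ((aarch64_vector_pcs)) float64x2_t _ZGVnN2v_cos (float64x2_t);

float64x2_t
cos_lanes (float64x2_t x)
{
  return _ZGVnN2v_cos (x);
}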

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_cosf.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf)
#include "v_cosf.c"
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_exp.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp)
#include "v_exp.c"
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_exp2f.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f)
#include "v_exp2f.c"
#endif

View File

@ -1,11 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_exp2f_1u.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#include "v_exp2f_1u.c"
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_expf.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
#include "v_expf.c"
#endif

View File

@ -1,11 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_expf_1u.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#include "v_expf_1u.c"
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_log.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log)
#include "v_log.c"
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_logf.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf)
#include "v_logf.c"
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_pow.
*
* Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow)
#include "v_pow.c"
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_powf.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf)
#include "v_powf.c"
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_sin.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin)
#include "v_sin.c"
#endif

View File

@ -1,12 +0,0 @@
/*
* AdvSIMD vector PCS variant of __v_sinf.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#ifdef __vpcs
#define VPCS 1
#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf)
#include "v_sinf.c"
#endif

View File

@ -1,13 +1,18 @@
# Makefile fragment - requires GNU make
#
# Copyright (c) 2019-2023, Arm Limited.
# Copyright (c) 2019-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
PLM := $(srcdir)/pl/math
AOR := $(srcdir)/math
B := build/pl/math
math-lib-srcs := $(wildcard $(PLM)/*.[cS])
pl-lib-srcs := $(wildcard $(PLM)/*.[cS])
ifeq ($(WANT_SVE_MATH), 0)
pl-lib-srcs := $(filter-out $(PLM)/sv_%, $(pl-lib-srcs))
endif
math-test-srcs := \
$(AOR)/test/mathtest.c \
$(AOR)/test/mathbench.c \
@ -15,10 +20,10 @@ math-test-srcs := \
math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS])
math-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h))
math-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h))
pl-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h))
pl-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h))
math-libs := \
pl-libs := \
build/pl/lib/libmathlib.so \
build/pl/lib/libmathlib.a \
@ -32,37 +37,39 @@ math-tools := \
math-host-tools := \
build/pl/bin/rtest \
math-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(math-lib-srcs)))
pl-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(pl-lib-srcs)))
math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs)))
math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
math-target-objs := $(math-lib-objs) $(math-test-objs)
math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs)
pl-target-objs := $(pl-lib-objs) $(math-test-objs)
pl-objs := $(pl-target-objs) $(pl-target-objs:%.o=%.os) $(math-host-objs)
pl/math-files := \
$(math-objs) \
$(math-libs) \
$(pl-objs) \
$(pl-libs) \
$(math-tools) \
$(math-host-tools) \
$(math-includes) \
$(math-test-includes) \
$(pl-includes) \
$(pl-test-includes) \
all-pl/math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
all-pl/math: $(pl-libs) $(math-tools) $(pl-includes) $(pl-test-includes)
$(math-objs): $(math-includes) $(math-test-includes)
$(math-objs): CFLAGS_PL += $(math-cflags)
$(pl-objs): $(pl-includes) $(pl-test-includes)
$(pl-objs): CFLAGS_PL += $(math-cflags)
$(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno
$(math-host-objs): CC = $(HOST_CC)
$(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS)
build/pl/include/test/ulp_funcs_gen.h: $(math-lib-srcs)
$(B)/sv_%: CFLAGS_PL += $(math-sve-cflags)
build/pl/include/test/ulp_funcs_gen.h: $(pl-lib-srcs)
# Replace PL_SIG
cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@
build/pl/include/test/mathbench_funcs_gen.h: $(math-lib-srcs)
build/pl/include/test/mathbench_funcs_gen.h: $(pl-lib-srcs)
# Replace PL_SIG macros with mathbench func entries
cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@
build/pl/include/test/ulp_wrappers_gen.h: $(math-lib-srcs)
build/pl/include/test/ulp_wrappers_gen.h: $(pl-lib-srcs)
# Replace PL_SIG macros with ULP wrapper declarations
cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@
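
These generator rules share one trick: grep the PL_SIG invocations out of the sources, then re-preprocess them with a substitute macro definition so that token pasting produces the generated entries. A hypothetical trace for the first rule, using the signature of a scalar double routine:

/* Extracted source line:
     PL_SIG (S, D, 1, acos, -1.0, 1.0)
   Preprocessed with -DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f) this pastes
   v, t and a onto _Z and emits:
     _ZSD1 (acos)
   so ulp_funcs_gen.h ends up with one such macro call per routine, which
   the including test harness defines to the entry it needs. */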
@ -72,16 +79,18 @@ $(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test
$(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h
$(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test
build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os)
build/pl/lib/libmathlib.so: $(pl-lib-objs:%.o=%.os)
$(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^
build/pl/lib/libmathlib.a: $(math-lib-objs)
build/pl/lib/libmathlib.a: $(pl-lib-objs)
rm -f $@
$(AR) rc $@ $^
$(RANLIB) $@
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
$(math-tools): LDLIBS += $(math-ldlibs) -lm
# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
$(math-tools): CFLAGS_PL += $(math-sve-cflags)
# Some targets to build pl/math/test from math/test sources
build/pl/math/test/%.o: $(srcdir)/math/test/%.S
@ -145,12 +154,11 @@ check-pl/math-rtest: $(math-host-tools) $(math-tools)
ulp-input-dir=$(B)/test/inputs
math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs)))
math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs)))
math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs)))
math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(math-lib-srcs)))
math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(pl-lib-srcs)))
math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(pl-lib-srcs)))
math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(pl-lib-srcs)))
ulp-inputs = $(math-lib-lims) $(math-lib-aliases) $(math-lib-fenvs) $(math-lib-itvs)
ulp-inputs = $(math-lib-lims) $(math-lib-fenvs) $(math-lib-itvs)
$(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags)
@ -158,10 +166,6 @@ $(ulp-input-dir)/%.ulp: $(PLM)/%.c
mkdir -p $(@D)
$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@
$(ulp-input-dir)/%.alias: $(PLM)/%.c
mkdir -p $(@D)
$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@
$(ulp-input-dir)/%.fenv: $(PLM)/%.c
mkdir -p $(@D)
$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@
@ -174,38 +178,21 @@ ulp-lims := $(ulp-input-dir)/limits
$(ulp-lims): $(math-lib-lims)
cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@
ulp-aliases := $(ulp-input-dir)/aliases
$(ulp-aliases): $(math-lib-aliases)
cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@
fenv-exps := $(ulp-input-dir)/fenv
$(fenv-exps): $(math-lib-fenvs)
cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@
ulp-itvs-noalias := $(ulp-input-dir)/itvs_noalias
$(ulp-itvs-noalias): $(math-lib-itvs)
cat $^ > $@
rename-aliases := $(ulp-input-dir)/rename_alias.sed
$(rename-aliases): $(ulp-aliases)
# Build sed script for replacing aliases from generated alias file
cat $< | awk '{ print "s/ " $$1 " / " $$2 " /g" }' > $@
ulp-itvs-alias := $(ulp-input-dir)/itvs_alias
$(ulp-itvs-alias): $(ulp-itvs-noalias) $(rename-aliases)
cat $< | sed -f $(rename-aliases) > $@
ulp-itvs := $(ulp-input-dir)/intervals
$(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias)
$(ulp-itvs): $(math-lib-itvs)
cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@
check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs)
check-pl/math-ulp: $(math-tools) $(ulp-lims) $(fenv-exps) $(ulp-itvs)
WANT_SVE_MATH=$(WANT_SVE_MATH) \
ULPFLAGS="$(math-ulpflags)" \
LIMITS=../../../$(ulp-lims) \
ALIASES=../../../$(ulp-aliases) \
INTERVALS=../../../$(ulp-itvs) \
FENV=../../../$(fenv-exps) \
FUNC=$(func) \
build/pl/bin/runulp.sh $(EMULATOR)
check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp
@ -220,8 +207,8 @@ $(DESTDIR)$(includedir)/pl/%: build/pl/include/%
$(INSTALL) -m 644 -D $< $@
install-pl/math: \
$(math-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
$(math-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)
$(pl-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
$(pl-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)
clean-pl/math:
rm -f $(pl/math-files)

View File

@ -0,0 +1,100 @@
/*
* Double-precision acos(x) function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "poly_scalar_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
#define AbsMask (0x7fffffffffffffff)
#define Half (0x3fe0000000000000)
#define One (0x3ff0000000000000)
#define PiOver2 (0x1.921fb54442d18p+0)
#define Pi (0x1.921fb54442d18p+1)
#define Small (0x3c90000000000000) /* 2^-53. */
#define Small16 (0x3c90)
#define QNaN (0x7ff8)
/* Fast implementation of double-precision acos(x) based on polynomial
approximation of double-precision asin(x).
For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct
rounding.
For |x| in [Small, 0.5], use the trigonometric identity
acos(x) = pi/2 - asin(x)
and use an order 11 polynomial P such that the final approximation of asin is
an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
The largest observed error in this region is 1.18 ulps,
acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
want 0x1.0d54d1985c069p+0.
For |x| in [0.5, 1.0], use the following expansion of acos(x) near x = 1
acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z))
where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the
approximation of asin near 0.
The largest observed error in this region is 1.52 ulps,
acos(0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
want 0x1.edbbedf8a7d6cp-1.
For x in [-1.0, -0.5], use this other identity to deduce the negative inputs
from their absolute value: acos(x) = pi - acos(-x). */
double
acos (double x)
{
uint64_t ix = asuint64 (x);
uint64_t ia = ix & AbsMask;
uint64_t ia16 = ia >> 48;
double ax = asdouble (ia);
uint64_t sign = ix & ~AbsMask;
/* Special values and invalid range. */
if (unlikely (ia16 == QNaN))
return x;
if (ia > One)
return __math_invalid (x);
if (ia16 < Small16)
return PiOver2 - x;
/* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
z2 = x ^ 2 and z = |x| , if |x| < 0.5
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5);
double z = ax < 0.5 ? ax : sqrt (z2);
/* Use a single polynomial approximation P for both intervals. */
double z4 = z2 * z2;
double z8 = z4 * z4;
double z16 = z8 * z8;
double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly);
/* Finalize polynomial: z + z * z2 * P(z2). */
p = fma (z * z2, p, z);
/* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
= pi - 2 Q(|x|), for -1.0 < x <= -0.5
= 2 Q(|x|) , for -0.5 < x < 0.0. */
if (ax < 0.5)
return PiOver2 - asdouble (asuint64 (p) | sign);
return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p;
}
PL_SIG (S, D, 1, acos, -1.0, 1.0)
PL_TEST_ULP (acos, 1.02)
PL_TEST_INTERVAL (acos, 0, Small, 5000)
PL_TEST_INTERVAL (acos, Small, 0.5, 50000)
PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000)
PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000)
PL_TEST_INTERVAL (acos, -0, -inf, 20000)

View File

@ -0,0 +1,99 @@
/*
* Single-precision acos(x) function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#define AbsMask (0x7fffffff)
#define Half (0x3f000000)
#define One (0x3f800000)
#define PiOver2f (0x1.921fb6p+0f)
#define Pif (0x1.921fb6p+1f)
#define Small (0x32800000) /* 2^-26. */
#define Small12 (0x328)
#define QNaN (0x7fc)
/* Fast implementation of single-precision acos(x) based on polynomial
approximation of single-precision asin(x).
For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
rounding.
For |x| in [Small, 0.5], use the trigonometric identity
acos(x) = pi/2 - asin(x)
and use an order 4 polynomial P such that the final approximation of asin is
an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
The largest observed error in this region is 1.16 ulps,
acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0 want 0x1.0c27f6p+0.
For |x| in [0.5, 1.0], use the following expansion of acos(x) near x = 1
acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z))
where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the
approximation of asin near 0.
The largest observed error in this region is 1.32 ulps,
acosf(0x1.15ba56p-1) got 0x1.feb33p-1 want 0x1.feb32ep-1.
For x in [-1.0, -0.5], use this other identity to deduce the negative inputs
from their absolute value.
acos(x) = pi - acos(-x)
The largest observed error in this region is 1.28 ulps,
acosf(-0x1.002072p-1) got 0x1.0c1e84p+1 want 0x1.0c1e82p+1. */
float
acosf (float x)
{
uint32_t ix = asuint (x);
uint32_t ia = ix & AbsMask;
uint32_t ia12 = ia >> 20;
float ax = asfloat (ia);
uint32_t sign = ix & ~AbsMask;
/* Special values and invalid range. */
if (unlikely (ia12 == QNaN))
return x;
if (ia > One)
return __math_invalidf (x);
if (ia12 < Small12)
return PiOver2f - x;
/* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
z2 = x ^ 2 and z = |x| , if |x| < 0.5
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f);
float z = ax < 0.5 ? ax : sqrtf (z2);
/* Use a single polynomial approximation P for both intervals. */
float p = horner_4_f32 (z2, __asinf_poly);
/* Finalize polynomial: z + z * z2 * P(z2). */
p = fmaf (z * z2, p, z);
/* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
= pi - 2 Q(|x|), for -1.0 < x <= -0.5
= 2 Q(|x|) , for -0.5 < x < 0.0. */
if (ax < 0.5)
return PiOver2f - asfloat (asuint (p) | sign);
return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p;
}
PL_SIG (S, F, 1, acos, -1.0, 1.0)
PL_TEST_ULP (acosf, 0.82)
PL_TEST_INTERVAL (acosf, 0, Small, 5000)
PL_TEST_INTERVAL (acosf, Small, 0.5, 50000)
PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
PL_TEST_INTERVAL (acosf, -0, -inf, 20000)

View File

@ -0,0 +1,106 @@
/*
* Double-precision asin(x) function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f64.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#define AbsMask (0x7fffffffffffffff)
#define Half (0x3fe0000000000000)
#define One (0x3ff0000000000000)
#define PiOver2 (0x1.921fb54442d18p+0)
#define Small (0x3e50000000000000) /* 2^-26. */
#define Small16 (0x3e50)
#define QNaN (0x7ff8)
/* Fast implementation of double-precision asin(x) based on polynomial
approximation.
For x < Small, approximate asin(x) by x. Small = 2^-26 for correct rounding.
For x in [Small, 0.5], use an order 11 polynomial P such that the final
approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
The largest observed error in this region is 1.01 ulps,
asin(0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2
want 0x1.ed78525a927eep-2.
No cheap approximation can be obtained near x = 1, since the function is not
continuously differentiable at x = 1.
For x in [0.5, 1.0], we use a method based on a trigonometric identity
asin(x) = pi/2 - acos(x)
and a generalized power series expansion of acos(y) near y=1, that reads as
acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1)
The Taylor series of asin(z) near z = 0, reads as
asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...).
Therefore, (1) can be written in terms of P(y/2) or even asin(y/2)
acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2))
Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and
asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
The largest observed error in this region is 2.69 ulps,
asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
want 0x1.110d7e85fdd53p-1. */
double
asin (double x)
{
uint64_t ix = asuint64 (x);
uint64_t ia = ix & AbsMask;
uint64_t ia16 = ia >> 48;
double ax = asdouble (ia);
uint64_t sign = ix & ~AbsMask;
/* Special values and invalid range. */
if (unlikely (ia16 == QNaN))
return x;
if (ia > One)
return __math_invalid (x);
if (ia16 < Small16)
return x;
/* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
z2 = x ^ 2 and z = |x| , if |x| < 0.5
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5);
double z = ax < 0.5 ? ax : sqrt (z2);
/* Use a single polynomial approximation P for both intervals. */
double z4 = z2 * z2;
double z8 = z4 * z4;
double z16 = z8 * z8;
double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly);
/* Finalize polynomial: z + z * z2 * P(z2). */
p = fma (z * z2, p, z);
/* asin(|x|) = Q(|x|) , for |x| < 0.5
= pi/2 - 2 Q(|x|), for |x| >= 0.5. */
double y = ax < 0.5 ? p : fma (-2.0, p, PiOver2);
/* Copy sign. */
return asdouble (asuint64 (y) | sign);
}
PL_SIG (S, D, 1, asin, -1.0, 1.0)
PL_TEST_ULP (asin, 2.19)
PL_TEST_INTERVAL (asin, 0, Small, 5000)
PL_TEST_INTERVAL (asin, Small, 0.5, 50000)
PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000)
PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000)
PL_TEST_INTERVAL (asin, -0, -inf, 20000)

View File

@ -0,0 +1,19 @@
/*
* Coefficients for single-precision asin(x) function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
/* Approximate asin(x) directly in [0x1p-106, 0.25]. See tools/asin.sollya
for how these coefficients were generated. */
const double __asin_poly[] = {
/* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5,
0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8,
0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6,
};

View File

@ -0,0 +1,100 @@
/*
* Single-precision asin(x) function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#define AbsMask (0x7fffffff)
#define Half (0x3f000000)
#define One (0x3f800000)
#define PiOver2f (0x1.921fb6p+0f)
#define Small (0x39800000) /* 2^-12. */
#define Small12 (0x398)
#define QNaN (0x7fc)
/* Fast implementation of single-precision asin(x) based on polynomial
approximation.
For x < Small, approximate asin(x) by x. Small = 2^-12 for correct rounding.
For x in [Small, 0.5], use order 4 polynomial P such that the final
approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
The largest observed error in this region is 0.83 ulps,
asinf(0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
No cheap approximation can be obtained near x = 1, since the function is not
continuously differentiable at x = 1.
For x in [0.5, 1.0], we use a method based on a trigonometric identity
asin(x) = pi/2 - acos(x)
and a generalized power series expansion of acos(y) near y=1, that reads as
acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1)
The Taylor series of asin(z) near z = 0, reads as
asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...).
Therefore, (1) can be written in terms of P(y/2) or even asin(y/2)
acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2))
Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and
asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
The largest observed error in this region is 2.41 ulps,
asinf(0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
float
asinf (float x)
{
uint32_t ix = asuint (x);
uint32_t ia = ix & AbsMask;
uint32_t ia12 = ia >> 20;
float ax = asfloat (ia);
uint32_t sign = ix & ~AbsMask;
/* Special values and invalid range. */
if (unlikely (ia12 == QNaN))
return x;
if (ia > One)
return __math_invalidf (x);
if (ia12 < Small12)
return x;
/* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
z2 = x ^ 2 and z = |x| , if |x| < 0.5
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f);
float z = ax < 0.5 ? ax : sqrtf (z2);
/* Use a single polynomial approximation P for both intervals. */
float p = horner_4_f32 (z2, __asinf_poly);
/* Finalize polynomial: z + z * z2 * P(z2). */
p = fmaf (z * z2, p, z);
/* asin(|x|) = Q(|x|) , for |x| < 0.5
= pi/2 - 2 Q(|x|), for |x| >= 0.5. */
float y = ax < 0.5 ? p : fmaf (-2.0f, p, PiOver2f);
/* Copy sign. */
return asfloat (asuint (y) | sign);
}
PL_SIG (S, F, 1, asin, -1.0, 1.0)
PL_TEST_ULP (asinf, 1.91)
PL_TEST_INTERVAL (asinf, 0, Small, 5000)
PL_TEST_INTERVAL (asinf, Small, 0.5, 50000)
PL_TEST_INTERVAL (asinf, 0.5, 1.0, 50000)
PL_TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (asinf, 0x1p11, inf, 20000)
PL_TEST_INTERVAL (asinf, -0, -inf, 20000)

View File

@ -0,0 +1,16 @@
/*
* Coefficients for single-precision asin(x) function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
/* Approximate asinf(x) directly in [0x1p-24, 0.25]. See tools/asinf.sollya
for how these coefficients were generated. */
const float __asinf_poly[] = {
/* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
[0x1p-24, 0x1p-2], order 4, rel error: 0x1.00a23bbp-29. */
0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, 0x1.3af7d8p-5,
};

View File

@ -4,7 +4,7 @@
* Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "estrin.h"
#include "poly_scalar_f64.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@ -60,8 +60,7 @@ asinh (double x)
double z2 = x2 * x2;
double z4 = z2 * z2;
double z8 = z4 * z4;
#define C(i) __asinh_data.poly[i]
double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
double p = estrin_17_f64 (x2, z2, z4, z8, z8 * z8, __asinh_data.poly);
double y = fma (p, x2 * ax, ax);
return asdouble (asuint64 (y) | sign);
}

View File

@ -5,7 +5,7 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "estrinf.h"
#include "poly_scalar_f32.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@ -16,8 +16,6 @@
#define One (0x3f8)
#define ExpM12 (0x398)
#define C(i) __asinhf_data.coeffs[i]
float
optr_aor_log_f32 (float);
@ -57,7 +55,7 @@ asinhf (float x)
if (ia12 < One)
{
float x2 = ax * ax;
float p = ESTRIN_7 (ax, x2, x2 * x2, C);
float p = estrin_7_f32 (ax, x2, x2 * x2, __asinhf_data.coeffs);
float y = fmaf (x2, p, ax);
return asfloat (asuint (y) | sign);
}

View File

@ -1,49 +1,33 @@
/*
* Double-precision polynomial evaluation function for scalar and vector atan(x)
* and atan2(y,x).
* Double-precision polynomial evaluation function for scalar
* atan(x) and atan2(y,x).
*
* Copyright (c) 2021-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "estrin.h"
#if V_SUPPORTED
#include "v_math.h"
#define DBL_T v_f64_t
#define P(i) v_f64 (__atan_poly_data.poly[i])
#else
#define DBL_T double
#define P(i) __atan_poly_data.poly[i]
#endif
#include "poly_scalar_f64.h"
/* Polynomial used in fast atan(x) and atan2(y,x) implementations
The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */
static inline DBL_T
eval_poly (DBL_T z, DBL_T az, DBL_T shift)
static inline double
eval_poly (double z, double az, double shift)
{
/* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
full scheme to avoid underflow in x^16. */
DBL_T z2 = z * z;
DBL_T x2 = z2 * z2;
DBL_T x4 = x2 * x2;
DBL_T x8 = x4 * x4;
DBL_T y
= FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P));
double z2 = z * z;
double x2 = z2 * z2;
double x4 = x2 * x2;
double x8 = x4 * x4;
double y = fma (estrin_11_f64 (z2, x2, x4, x8, __atan_poly_data.poly + 8),
x8, estrin_7_f64 (z2, x2, x4, __atan_poly_data.poly));
/* Finalize. y = shift + z + z^3 * P(z^2). */
y = FMA (y, z2 * az, az);
y = fma (y, z2 * az, az);
y = y + shift;
return y;
}
#undef DBL_T
#undef FMA
#undef P
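
eval_poly now relies on the fixed-degree Estrin helpers from poly_scalar_f64.h, which this diff does not show. A minimal sketch of what a degree-7 Estrin evaluation is assumed to look like (hypothetical estrin_7 name, coefficients in ascending order) — pairing coefficients keeps the highest explicit power at z^4, which is why the split scheme above never needs to form x^16:

#include <math.h>

static inline double
estrin_7 (double z, double z2, double z4, const double *c)
{
  /* ((c0 + c1 z) + (c2 + c3 z) z2) + ((c4 + c5 z) + (c6 + c7 z) z2) z4. */
  double p01 = fma (c[1], z, c[0]);
  double p23 = fma (c[3], z, c[2]);
  double p45 = fma (c[5], z, c[4]);
  double p67 = fma (c[7], z, c[6]);
  double p03 = fma (p23, z2, p01);
  double p47 = fma (p67, z2, p45);
  return fma (p47, z4, p03);
}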

View File

@ -66,11 +66,7 @@ atanf (float x)
PL_SIG (S, F, 1, atan, -10.0, 10.0)
PL_TEST_ULP (atanf, 2.38)
PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000)
PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000)
PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000)
PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000)
PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000)
PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000)
PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000)
PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000)
PL_TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000)
PL_TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000)
PL_TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000)
PL_TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000)

View File

@ -1,5 +1,5 @@
/*
* Single-precision polynomial evaluation function for scalar and vector
* Single-precision polynomial evaluation function for scalar
* atan(x) and atan2(y,x).
*
* Copyright (c) 2021-2023, Arm Limited.
@ -10,26 +10,12 @@
#define PL_MATH_ATANF_COMMON_H
#include "math_config.h"
#include "estrinf.h"
#if V_SUPPORTED
#include "v_math.h"
#define FLT_T v_f32_t
#define P(i) v_f32 (__atanf_poly_data.poly[i])
#else
#define FLT_T float
#define P(i) __atanf_poly_data.poly[i]
#endif
#include "poly_scalar_f32.h"
/* Polynomial used in fast atanf(x) and atan2f(y,x) implementations
The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */
static inline FLT_T
eval_poly (FLT_T z, FLT_T az, FLT_T shift)
static inline float
eval_poly (float z, float az, float shift)
{
/* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
a standard implementation using z8 creates spurious underflow
@ -37,15 +23,16 @@ eval_poly (FLT_T z, FLT_T az, FLT_T shift)
Therefore, we split the last fma into a mul and an fma.
Horner and single-level Estrin have higher errors that exceed the
threshold. */
FLT_T z2 = z * z;
FLT_T z4 = z2 * z2;
float z2 = z * z;
float z4 = z2 * z2;
/* Then assemble polynomial. */
FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P));
float y = fmaf (
z4, z4 * pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly + 4),
pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly));
/* Finalize:
y = shift + z * P(z^2). */
return FMA (y, z2 * az, az) + shift;
return fmaf (y, z2 * az, az) + shift;
}
#endif // PL_MATH_ATANF_COMMON_H

View File

@ -6,7 +6,7 @@
*/
#include "math_config.h"
#include "estrin.h"
#include "poly_scalar_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
@ -20,7 +20,6 @@
#define OneTop12 0x3ff
#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */
#define BottomMask 0xffffffff
#define C(i) __log1p_data.coeffs[i]
static inline double
log1p_inline (double x)
@ -46,7 +45,8 @@ log1p_inline (double x)
double f2 = f * f;
double f4 = f2 * f2;
double f8 = f4 * f4;
double p = fma (f, ESTRIN_18 (f, f2, f4, f8, f8 * f8, C) * f, f);
double p = fma (
f, estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs) * f, f);
/* Recombine log1p(x) = k*log2 + log1p(f) + c/m. */
double kd = k;
@ -78,9 +78,6 @@ atanh (double x)
PL_SIG (S, D, 1, atanh, -1.0, 1.0)
PL_TEST_ULP (atanh, 3.00)
PL_TEST_INTERVAL (atanh, 0, 0x1p-23, 10000)
PL_TEST_INTERVAL (atanh, -0, -0x1p-23, 10000)
PL_TEST_INTERVAL (atanh, 0x1p-23, 1, 90000)
PL_TEST_INTERVAL (atanh, -0x1p-23, -1, 90000)
PL_TEST_INTERVAL (atanh, 1, inf, 100)
PL_TEST_INTERVAL (atanh, -1, -inf, 100)
PL_TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000)
PL_TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000)
PL_TEST_SYM_INTERVAL (atanh, 1, inf, 100)

View File

@ -15,7 +15,8 @@
#define One 0x3f800000
#define Four 0x40800000
#define Ln2 0x1.62e43p-1f
#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
/* asuint(0x1p-12), below which atanhf(x) rounds to x. */
#define TinyBound 0x39800000
#define C(i) __log1pf_data.coeffs[i]
@ -80,9 +81,6 @@ atanhf (float x)
PL_SIG (S, F, 1, atanh, -1.0, 1.0)
PL_TEST_ULP (atanhf, 2.59)
PL_TEST_INTERVAL (atanhf, 0, 0x1p-12, 500)
PL_TEST_INTERVAL (atanhf, 0x1p-12, 1, 200000)
PL_TEST_INTERVAL (atanhf, 1, inf, 1000)
PL_TEST_INTERVAL (atanhf, -0, -0x1p-12, 500)
PL_TEST_INTERVAL (atanhf, -0x1p-12, -1, 200000)
PL_TEST_INTERVAL (atanhf, -1, -inf, 1000)
PL_TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500)
PL_TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000)
PL_TEST_SYM_INTERVAL (atanhf, 1, inf, 1000)

View File

@ -31,7 +31,7 @@ cbrt (double x)
uint64_t iax = ix & AbsMask;
uint64_t sign = ix & ~AbsMask;
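/* Note: 0x7ff0000000000000 is asuint64 (inf); the previous constant,
0x7f80000000000000, is a large finite value, so infinities fell through
this early return. */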
if (unlikely (iax == 0 || iax == 0x7f80000000000000))
if (unlikely (iax == 0 || iax == 0x7ff0000000000000))
return x;
/* |x| = m * 2^e, where m is in [0.5, 1.0].
@ -66,5 +66,4 @@ cbrt (double x)
}
PL_TEST_ULP (cbrt, 1.30)
PL_TEST_INTERVAL (cbrt, 0, inf, 1000000)
PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000)
PL_TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000)

View File

@ -5,7 +5,7 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "estrinf.h"
#include "poly_scalar_f32.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@ -14,7 +14,6 @@
#define SignMask 0x80000000
#define TwoThirds 0x1.555556p-1f
#define C(i) __cbrtf_data.poly[i]
#define T(i) __cbrtf_data.table[i]
/* Approximation for single-precision cbrt(x), using low-order polynomial and
@ -41,7 +40,8 @@ cbrtf (float x)
/* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
the less accurate the next stage of the algorithm needs to be. An order-4
polynomial is enough for one Newton iteration. */
float p = ESTRIN_3 (m, m * m, C);
float p = pairwise_poly_3_f32 (m, m * m, __cbrtf_data.poly);
/* One iteration of Newton's method for t^3 = m:
t' = t - (t^3 - m) / (3 t^2) = (2/3) t + m / (3 t^2). */
float m_by_3 = m / 3;
float a = fmaf (TwoThirds, p, m_by_3 / (p * p));
@ -63,5 +63,4 @@ cbrtf (float x)
PL_SIG (S, F, 1, cbrt, -10.0, 10.0)
PL_TEST_ULP (cbrtf, 1.03)
PL_TEST_INTERVAL (cbrtf, 0, inf, 1000000)
PL_TEST_INTERVAL (cbrtf, -0, -inf, 1000000)
PL_TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000)

View File

@ -58,9 +58,6 @@ cosh (double x)
PL_SIG (S, D, 1, cosh, -10.0, 10.0)
PL_TEST_ULP (cosh, 1.43)
PL_TEST_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000)
PL_TEST_INTERVAL (cosh, -0, -0x1.61da04cbafe44p+9, 100000)
PL_TEST_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000)
PL_TEST_INTERVAL (cosh, -0x1.61da04cbafe44p+9, -0x1p10, 1000)
PL_TEST_INTERVAL (cosh, 0x1p10, inf, 100)
PL_TEST_INTERVAL (cosh, -0x1p10, -inf, 100)
PL_TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000)
PL_TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000)
PL_TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100)

View File

@ -63,9 +63,6 @@ coshf (float x)
PL_SIG (S, F, 1, cosh, -10.0, 10.0)
PL_TEST_ULP (coshf, 1.89)
PL_TEST_INTERVAL (coshf, 0, 0x1p-63, 100)
PL_TEST_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000)
PL_TEST_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000)
PL_TEST_INTERVAL (coshf, -0, -0x1p-63, 100)
PL_TEST_INTERVAL (coshf, -0, -0x1.5a92d8p+6, 80000)
PL_TEST_INTERVAL (coshf, -0x1.5a92d8p+6, -inf, 2000)
PL_TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100)
PL_TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000)
PL_TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000)

View File

@ -0,0 +1,89 @@
/*
* Double-precision scalar cospi function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "poly_scalar_f64.h"
/* Taylor series coefficients for sin(pi * x).
The C2 coefficient (originally ~= 5.16771278) has been split into two parts:
C2_hi = 4, C2_lo = C2 - C2_hi (~= 1.16771278).
This change in magnitude reduces floating point rounding errors.
C2_hi is then reintroduced after the polynomial approximation. */
static const double poly[]
= { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1,
-0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21,
-0x1.012a9870eeb7dp-25 };
#define Shift 0x1.8p+52
/* Approximation for scalar double-precision cospi(x).
Maximum error: 3.13 ULP:
cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1
want 0x1.fffffffffd16ep-1. */
double
cospi (double x)
{
if (isinf (x))
return __math_invalid (x);
double ax = asdouble (asuint64 (x) & ~0x8000000000000000);
/* Edge cases for when cospi should be exactly 1 (integers).
0x1p53 is the limit for double precision to store any fractional part. */
if (ax >= 0x1p53)
return 1;
/* If x is an integer, return +-1, based upon whether x is odd. */
uint64_t m = (uint64_t) ax;
if (m == ax)
return (m & 1) ? -1 : 1;
/* For very small inputs, squaring r causes underflow.
Values below this threshold can be approximated via
cospi(x) ~= 1. */
if (ax < 0x1p-63)
return 1;
/* Any non-integer value >= 0x1p51 is an integer + 0.5.
These values should return exactly 0. */
if (ax >= 0x1p51)
return 0;
/* n = rint(|x|). */
double n = ax + Shift;
uint64_t sign = asuint64 (n) << 63;
n = n - Shift;
/* We know that cospi(x) = sinpi(0.5 - x)
range reduction and offset into sinpi range -1/2 .. 1/2
r = 0.5 - |x - rint(x)|. */
double r = 0.5 - fabs (ax - n);
/* y = sin(r). */
double r2 = r * r;
double y = horner_9_f64 (r2, poly);
y = y * r;
/* Reintroduce C2_hi. */
y = fma (-4 * r2, r, y);
/* As all values are reduced to -1/2 .. 1/2, the computed result is always
positive; the sign must therefore be introduced based upon whether x rounds
to an odd or even integer. */
return asdouble (asuint64 (y) ^ sign);
}
PL_SIG (S, D, 1, cospi, -0.9, 0.9)
PL_TEST_ULP (cospi, 2.63)
PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000)
PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000)
PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000)
PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000)
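
A hypothetical sanity check for the exact cases handled above (assumes the cospi declaration from mathlib.h, as included by this file):

#include <assert.h>
#include "mathlib.h"

static void
cospi_exact_cases (void)
{
  assert (cospi (4.0) == 1.0);    /* Even integer. */
  assert (cospi (5.0) == -1.0);   /* Odd integer. */
  assert (cospi (2.5) == 0.0);    /* Integer + 0.5. */
  assert (cospi (0x1p53) == 1.0); /* All doubles >= 0x1p53 are even. */
}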

View File

@ -0,0 +1,84 @@
/*
* Single-precision scalar cospi function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
/* Taylor series coefficients for sin(pi * x). */
#define C0 0x1.921fb6p1f
#define C1 -0x1.4abbcep2f
#define C2 0x1.466bc6p1f
#define C3 -0x1.32d2ccp-1f
#define C4 0x1.50783p-4f
#define C5 -0x1.e30750p-8f
#define Shift 0x1.0p+23f
/* Approximation for scalar single-precision cospi(x) - cospif.
Maximum error: 2.64 ULP:
cospif(0x1.37e844p-4) got 0x1.f16b3p-1
want 0x1.f16b2ap-1. */
float
cospif (float x)
{
if (isinf (x))
return __math_invalidf (x);
float ax = asfloat (asuint (x) & ~0x80000000);
/* Edge cases for when cospif should be exactly +/- 1 (integers).
0x1p23 is the limit for single precision to store any fractional part. */
if (ax >= 0x1p24f)
return 1;
uint32_t m = roundf (ax);
if (m == ax)
return (m & 1) ? -1 : 1;
/* Any non-integer value >= 0x1p22f is an integer + 0.5.
These values should return exactly 0. */
if (ax >= 0x1p22f)
return 0;
/* For very small inputs, squaring r causes underflow.
Values below this threshold can be approximated via cospi(x) ~= 1 -
(pi*x). */
if (ax < 0x1p-31f)
return 1 - (C0 * x);
/* n = rint(|x|). */
float n = ax + Shift;
uint32_t sign = asuint (n) << 31;
n = n - Shift;
/* We know that cospi(x) = sinpi(0.5 - x)
range reduction and offset into sinpi range -1/2 .. 1/2
r = 0.5 - |x - rint(x)|. */
float r = 0.5f - fabs (ax - n);
/* y = sin(pi * r). */
float r2 = r * r;
float y = fmaf (C5, r2, C4);
y = fmaf (y, r2, C3);
y = fmaf (y, r2, C2);
y = fmaf (y, r2, C1);
y = fmaf (y, r2, C0);
/* As all values are reduced to -1/2 .. 1/2, the computed result is always
positive; the sign must therefore be introduced based upon whether x rounds
to an odd or even integer. */
return asfloat (asuint (y * r) ^ sign);
}
PL_SIG (S, F, 1, cospi, -0.9, 0.9)
PL_TEST_ULP (cospif, 2.15)
PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000)
PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000)
PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000)
PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000)

View File

@ -0,0 +1,102 @@
/*
* Double-precision erf(x) function.
*
* Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
#define Shift 0x1p45
/* Polynomial coefficients. */
#define OneThird 0x1.5555555555555p-2
#define TwoThird 0x1.5555555555555p-1
#define TwoOverFifteen 0x1.1111111111111p-3
#define TwoOverFive 0x1.999999999999ap-2
#define Tenth 0x1.999999999999ap-4
#define TwoOverNine 0x1.c71c71c71c71cp-3
#define TwoOverFortyFive 0x1.6c16c16c16c17p-5
#define Sixth 0x1.555555555555p-3
/* Fast erf approximation based on series expansion near x rounded to
nearest multiple of 1/128.
Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
erf(x) ~ erf(r)
+ scale * d * [
+ 1
- r d
+ 1/3 (2 r^2 - 1) d^2
- 1/6 (r (2 r^2 - 3)) d^3
+ 1/30 (4 r^4 - 12 r^2 + 3) d^4
- 1/90 (4 r^4 - 20 r^2 + 15) d^5
]
Maximum measured error: 2.29 ULP
erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
want -0x1.20dd59132ebafp-8. */
double
erf (double x)
{
/* Get absolute value and sign. */
uint64_t ix = asuint64 (x);
uint64_t ia = ix & 0x7fffffffffffffff;
uint64_t sign = ix & ~0x7fffffffffffffff;
/* |x| < 0x1p-508. Triggers exceptions. */
if (unlikely (ia < 0x2030000000000000))
return fma (TwoOverSqrtPiMinusOne, x, x);
if (ia < 0x4017f80000000000) /* |x| < 6 - 1 / 128 = 5.9921875. */
{
/* Set r to multiple of 1/128 nearest to |x|. */
double a = asdouble (ia);
double z = a + Shift;
uint64_t i = asuint64 (z) - asuint64 (Shift);
double r = z - Shift;
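/* With Shift = 0x1p45 the addition quantizes a to 52 - 45 = 7 fractional
bits, i.e. the nearest multiple of 1/128, and subtracting asuint64 (Shift)
leaves i = 128 * r in the low bits. */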
/* Lookup erf(r) and scale(r) in table.
Set erf(r) to 0 and scale to 2/sqrt(pi) for |x| <= 0x1.cp-9. */
double erfr = __erf_data.tab[i].erf;
double scale = __erf_data.tab[i].scale;
/* erf(x) ~ erf(r) + scale * d * poly (d, r). */
double d = a - r;
double r2 = r * r;
double d2 = d * d;
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
double p1 = -r;
double p2 = fma (TwoThird, r2, -OneThird);
double p3 = -r * fma (OneThird, r2, -0.5);
double p4 = fma (fma (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth);
double p5
= -r * fma (fma (TwoOverFortyFive, r2, -TwoOverNine), r2, Sixth);
double p34 = fma (p4, d, p3);
double p12 = fma (p2, d, p1);
double y = fma (p5, d2, p34);
y = fma (y, d2, p12);
y = fma (fma (y, d2, d), scale, erfr);
return asdouble (asuint64 (y) | sign);
}
/* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */
if (unlikely (ia >= 0x7ff0000000000000))
return (1.0 - (double) (sign >> 62)) + 1.0 / x;
/* Boring domain (|x| >= 6.0). */
return asdouble (sign | asuint64 (1.0));
}
PL_SIG (S, D, 1, erf, -6.0, 6.0)
PL_TEST_ULP (erf, 1.79)
PL_TEST_SYM_INTERVAL (erf, 0, 5.9921875, 40000)
PL_TEST_SYM_INTERVAL (erf, 5.9921875, inf, 40000)
PL_TEST_SYM_INTERVAL (erf, 0, inf, 40000)

Some files were not shown because too many files have changed in this diff.