AK: Add FloatingPoint.h

This is a set of functions that allow you to convert between arbitrary
IEEE 754 floating point types, as long as they can be represented
within 64 bits. Conversion methods between floats and doubles are
provided, as well as a generic `float_to_float()`.

Example usage:

  #include <AK/FloatingPoint.h>

  double val = 1.234;
  auto weird_f16 =
      convert_from_native_double<FloatingPointBits<0, 6, 10>>(val);

Signed and unsigned floats are supported, and both NaN and +/-Inf are
handled correctly. Values that do not fit in the target floating point
type are clamped.
This commit is contained in:
Jelle Raaijmakers 2022-02-05 02:23:27 +01:00 committed by Andreas Kling
parent f1cd6453ae
commit 8483064b59
3 changed files with 279 additions and 0 deletions

193
AK/FloatingPoint.h Normal file
View file

@ -0,0 +1,193 @@
/*
* Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/BitCast.h>
#include <AK/Concepts.h>
#include <AK/Types.h>
namespace AK {
template<size_t S, size_t E, size_t M>
requires(S <= 1 && E >= 1 && M >= 1 && (S + E + M) <= 64) class FloatingPointBits final {
public:
static const size_t signbit = S;
static const size_t exponentbits = E;
static const size_t mantissabits = M;
template<typename T>
requires(IsIntegral<T>&& IsUnsigned<T> && sizeof(T) <= 8) constexpr FloatingPointBits(T bits)
: m_bits(bits)
{
}
constexpr FloatingPointBits(double value)
: m_bits(bit_cast<u64>(value))
{
}
constexpr FloatingPointBits(float value)
: m_bits(bit_cast<u32>(value))
{
}
double as_double() const requires(S == 1 && E == 11 && M == 52) { return bit_cast<double>(m_bits); }
float as_float() const requires(S == 1 && E == 8 && M == 23) { return bit_cast<float>(static_cast<u32>(m_bits)); }
u64 bits() const { return m_bits; }
private:
u64 m_bits;
};
typedef FloatingPointBits<1, 8, 23> SingleFloatingPointBits;
typedef FloatingPointBits<1, 11, 52> DoubleFloatingPointBits;
/**
* Convert between two IEEE 754 floating point types in any arrangement of sign, exponent and mantissa bits.
*/
template<typename To, typename From>
constexpr To float_to_float(From const input)
{
constexpr u64 from_exponent_nonnumber = (1ull << From::exponentbits) - 1;
constexpr u64 from_exponent_bias = (1ull << (From::exponentbits - 1)) - 1;
constexpr u64 to_exponent_nonnumber = (1ull << To::exponentbits) - 1;
constexpr u64 to_exponent_bias = (1ull << (To::exponentbits - 1)) - 1;
constexpr u64 to_exponent_max = (1ull << To::exponentbits) - 2;
// Deconstruct input bits to float components
u64 from_sign = (input.bits() >> (From::exponentbits + From::mantissabits)) & From::signbit;
u64 from_exponent = (input.bits() >> From::mantissabits) & ((1ull << From::exponentbits) - 1);
u64 from_mantissa = input.bits() & ((1ull << From::mantissabits) - 1);
u64 to_sign = from_sign & To::signbit;
u64 to_exponent;
u64 to_mantissa;
auto target_value = [&to_sign, &to_exponent, &to_mantissa]() {
return To((to_sign << (To::exponentbits + To::mantissabits)) | (to_exponent << To::mantissabits) | to_mantissa);
};
auto shift_mantissa = [](u64 mantissa) -> u64 {
if constexpr (From::mantissabits < To::mantissabits)
return mantissa << (To::mantissabits - From::mantissabits);
else
return mantissa >> (From::mantissabits - To::mantissabits);
};
// If target is unsigned and source is negative, clamp to 0 or keep NaN
if constexpr (To::signbit == 0) {
if (from_sign == 1) {
if (from_exponent == from_exponent_nonnumber && from_mantissa > 0) {
to_exponent = to_exponent_nonnumber;
to_mantissa = 1;
} else {
to_exponent = 0;
to_mantissa = 0;
}
return target_value();
}
}
// If the source floating point is denormalized;
if (from_exponent == 0) {
// If the source mantissa is 0, the value is +/-0
if (from_mantissa == 0) {
to_exponent = 0;
to_mantissa = 0;
return target_value();
}
// If the source has more exponent bits than the target, then the largest possible
// source mantissa still cannot be represented in the target denormalized value.
if constexpr (From::exponentbits > To::exponentbits) {
to_exponent = 0;
to_mantissa = 0;
return target_value();
}
// If the source and target have the same number of exponent bits, we only need to
// shift the mantissa.
if constexpr (From::exponentbits == To::exponentbits) {
to_exponent = 0;
to_mantissa = shift_mantissa(from_mantissa);
return target_value();
}
// The target has more exponent bits, so our denormalized value can be represented
// as a normalized value in the target floating point. Normalized values have an
// implicit leading 1, so we shift the mantissa left until we find our explicit
// leading 1 which is then dropped.
int adjust_exponent = -1;
to_mantissa = from_mantissa;
do {
++adjust_exponent;
to_mantissa <<= 1;
} while ((to_mantissa & (1ull << From::mantissabits)) == 0);
to_exponent = to_exponent_bias - from_exponent_bias - adjust_exponent;
// Drop the most significant bit from the mantissa
to_mantissa &= (1ull << From::mantissabits) - 1;
to_mantissa = shift_mantissa(to_mantissa);
return target_value();
}
// If the source is NaN or +/-Inf, keep it that way
if (from_exponent == from_exponent_nonnumber) {
to_exponent = to_exponent_nonnumber;
to_mantissa = (from_mantissa == 0) ? 0 : 1;
return target_value();
}
// Determine the target exponent
to_exponent = to_exponent_bias - from_exponent_bias + from_exponent;
// If the calculated exponent exceeds the target's capacity, clamp both the exponent and the
// mantissa to their maximum values.
if (to_exponent > to_exponent_max) {
to_exponent = to_exponent_max;
to_mantissa = (1ull << To::mantissabits) - 1;
return target_value();
}
// If the new exponent is less than 1, we can only represent this value as a denormalized number
if (to_exponent < 1) {
to_exponent = 0;
// Add a leading 1 and shift the mantissa right
int adjust_exponent = 1 - to_exponent_bias - from_exponent + from_exponent_bias;
to_mantissa = ((1ull << From::mantissabits) | from_mantissa) >> adjust_exponent;
to_mantissa = shift_mantissa(to_mantissa);
return target_value();
}
// New exponent fits; shift the mantissa to fit as well
to_mantissa = shift_mantissa(from_mantissa);
return target_value();
}
template<typename O>
constexpr O convert_from_native_double(double input) { return float_to_float<O>(DoubleFloatingPointBits(input)); }
template<typename O>
constexpr O convert_from_native_float(float input) { return float_to_float<O>(SingleFloatingPointBits(input)); }
template<typename I>
constexpr double convert_to_native_double(I input) { return float_to_float<DoubleFloatingPointBits>(input).as_double(); }
template<typename I>
constexpr float convert_to_native_float(I input) { return float_to_float<SingleFloatingPointBits>(input).as_float(); }
}
using AK::DoubleFloatingPointBits;
using AK::FloatingPointBits;
using AK::SingleFloatingPointBits;
using AK::convert_from_native_double;
using AK::convert_from_native_float;
using AK::convert_to_native_double;
using AK::convert_to_native_float;
using AK::float_to_float;

View file

@ -26,6 +26,7 @@ set(AK_TEST_SOURCES
TestEnumBits.cpp
TestFind.cpp
TestFixedArray.cpp
TestFloatingPoint.cpp
TestFormat.cpp
TestGenericLexer.cpp
TestHashFunctions.cpp

View file

@ -0,0 +1,85 @@
/*
* Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/FloatingPoint.h>
#include <LibTest/TestCase.h>
#include <math.h>
TEST_CASE(f16_1_5_10_to_native_float)
{
auto within_approximate = [](u16 lhs, float rhs) -> bool {
auto f32_lhs = convert_to_native_float(FloatingPointBits<1, 5, 10>(lhs));
return fabsf(f32_lhs - rhs) <= 0.00001f;
};
EXPECT(within_approximate(0x0000, 0.f));
EXPECT(within_approximate(0x03FF, 0.000061f));
EXPECT(within_approximate(0x3CEF, 1.23339f));
EXPECT(within_approximate(0xBC00, -1.f));
EXPECT(within_approximate(0xA266, -0.0125f));
float result;
result = convert_to_native_float(FloatingPointBits<1, 5, 10>(0xFC01u));
EXPECT(isnan(result));
result = convert_to_native_float(FloatingPointBits<1, 5, 10>(0x7C00u));
EXPECT(isinf(result));
}
TEST_CASE(float_to_double_roundtrips)
{
auto roundtrip = [](float floatvalue1) {
auto doublevalue = convert_from_native_float<DoubleFloatingPointBits>(floatvalue1).as_double();
auto floatbits = convert_from_native_double<SingleFloatingPointBits>(doublevalue);
auto floatvalue2 = convert_to_native_float(floatbits);
EXPECT_APPROXIMATE(floatvalue1, floatvalue2);
};
roundtrip(-1.0f);
roundtrip(-0.1f);
roundtrip(0.0f);
roundtrip(0.000001f);
roundtrip(0.1f);
roundtrip(1.0f);
roundtrip(3.141592f);
roundtrip(16777216.0f);
roundtrip(33554432.0f);
roundtrip(1 / 0.0f);
roundtrip(1 / -0.0f);
roundtrip(0 / 0.0f);
}
TEST_CASE(normalize_denormalize)
{
// Go from denormalized float to normalized double
auto denormalized_float = 6.709679e-39f;
auto denormalized_float_bits = SingleFloatingPointBits(denormalized_float);
auto normalized_double = convert_to_native_double(denormalized_float_bits);
EXPECT_APPROXIMATE(denormalized_float, normalized_double);
// Go back from normalized double to denormalized float
auto normalized_double_bits = DoubleFloatingPointBits(normalized_double);
auto reconstructed_denormalized_float = convert_to_native_float(normalized_double_bits);
EXPECT_APPROXIMATE(denormalized_float, reconstructed_denormalized_float);
}
TEST_CASE(large_exponent)
{
// Make sure we support at least 62 bits of exponent
auto large_exponent_float = convert_from_native_double<FloatingPointBits<1, 62, 1>>(1.0);
auto converted_double = convert_to_native_double(large_exponent_float);
EXPECT_APPROXIMATE(converted_double, 1.0);
}
TEST_CASE(large_mantissa)
{
// Make sure we support at least 62 bits of mantissa
auto large_exponent_float = convert_from_native_double<FloatingPointBits<1, 1, 62>>(1.0);
auto converted_double = convert_to_native_double(large_exponent_float);
EXPECT_APPROXIMATE(converted_double, 1.0);
}