Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions cpp/src/arrow/util/float16.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,22 +185,51 @@ static_assert(sizeof(Float16) == sizeof(uint16_t));
} // namespace util
} // namespace arrow

// TODO: Not complete
template <>
class std::numeric_limits<arrow::util::Float16> {
using T = arrow::util::Float16;

public:
static constexpr bool is_specialized = true;
static constexpr bool is_signed = true;
static constexpr bool is_integer = false;
static constexpr bool is_exact = false;
static constexpr bool has_infinity = true;
static constexpr bool has_quiet_NaN = true;
static constexpr bool has_signaling_NaN = true;
static constexpr std::float_denorm_style has_denorm = std::denorm_present;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be deprecated in C++23, so perhaps we shouldn't bother?

static constexpr bool has_denorm_loss = false;
static constexpr bool is_iec559 = true;
static constexpr bool is_bounded = true;
static constexpr bool is_modulo = false;
static constexpr int radix = 2;

// Float16 has 10 explicit mantissa bits + 1 implicit bit = 11 bits precision
static constexpr int digits = 11;
// Number of decimal digits that can be represented: floor(10 * log10(2))
static constexpr int digits10 = 3;
// Number of decimal digits to fully represent the type: ceil(11 * log10(2) + 1)
static constexpr int max_digits10 = 5;

// Exponent range: bias = 15, min subnormal exponent = -14, min normal = -13
static constexpr int min_exponent = -13;
static constexpr int min_exponent10 = -4;
// Max exponent before infinity: field value 30 -> 30 - 15 + 1 = 16
static constexpr int max_exponent = 16;
static constexpr int max_exponent10 = 4;

static constexpr bool traps = false;
static constexpr bool tinyness_before = false;
static constexpr std::float_round_style round_style = std::round_to_nearest;

static constexpr T min() { return T::FromBits(0b0000010000000000); }
static constexpr T max() { return T::FromBits(0b0111101111111111); }
static constexpr T lowest() { return -max(); }
static constexpr T epsilon() { return T::FromBits(0b0001010000000000); } // 2^-10
static constexpr T round_error() { return T::FromBits(0b0011100000000000); } // 0.5
static constexpr T denorm_min() { return T::FromBits(0b0000000000000001); }

static constexpr T infinity() { return T::FromBits(0b0111110000000000); }

static constexpr T quiet_NaN() { return T::FromBits(0b0111111111111111); }
static constexpr T signaling_NaN() { return T::FromBits(0b0111110000000001); }
};
64 changes: 64 additions & 0 deletions cpp/src/arrow/util/float16_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -372,5 +372,69 @@ TEST(Float16Test, FromBytes) {
ASSERT_EQ(Float16::FromBigEndian(bytes.data()), Float16::FromBits(0x1cd0));
}

// Exercises the std::numeric_limits<Float16> specialization: the boolean traits
// are compared against std::numeric_limits<float>, while the precision/exponent
// constants and special values are checked against their expected values.
TEST(Float16Test, NumericLimits) {
  using F16 = std::numeric_limits<Float16>;
  using F32 = std::numeric_limits<float>;

  // Boolean traits - should match standard float
  ASSERT_EQ(F16::is_specialized, F32::is_specialized);
  ASSERT_EQ(F16::is_signed, F32::is_signed);
  ASSERT_EQ(F16::is_integer, F32::is_integer);
  ASSERT_EQ(F16::is_exact, F32::is_exact);
  ASSERT_EQ(F16::has_infinity, F32::has_infinity);
  ASSERT_EQ(F16::has_quiet_NaN, F32::has_quiet_NaN);
  ASSERT_EQ(F16::has_signaling_NaN, F32::has_signaling_NaN);
  ASSERT_EQ(F16::has_denorm, F32::has_denorm);
  ASSERT_EQ(F16::has_denorm_loss, F32::has_denorm_loss);
  ASSERT_EQ(F16::is_iec559, F32::is_iec559);
  ASSERT_EQ(F16::is_bounded, F32::is_bounded);
  ASSERT_EQ(F16::is_modulo, F32::is_modulo);
  ASSERT_EQ(F16::radix, F32::radix);

  // Precision and exponent range of the 16-bit format
  ASSERT_EQ(F16::digits, 11);
  ASSERT_EQ(F16::digits10, 3);
  ASSERT_EQ(F16::max_digits10, 5);
  ASSERT_EQ(F16::min_exponent, -13);
  ASSERT_EQ(F16::max_exponent, 16);
  ASSERT_EQ(F16::min_exponent10, -4);
  ASSERT_EQ(F16::max_exponent10, 4);

  // Largest finite value, and the very next bit pattern must be +infinity
  ASSERT_FLOAT_EQ(F16::max().ToFloat(), 65504.0f);
  const auto past_max = static_cast<uint16_t>(F16::max().bits() + 1);
  ASSERT_TRUE(Float16::FromBits(past_max).is_infinity());

  ASSERT_EQ(F16::lowest(), -F16::max());                     // Most negative = -max
  ASSERT_FLOAT_EQ(F16::epsilon().ToFloat(), 0.0009765625f);  // 2^-10

  // round_error is 0.5, same as for float
  ASSERT_FLOAT_EQ(F16::round_error().ToFloat(), 0.5f);
  ASSERT_FLOAT_EQ(F16::round_error().ToFloat(), F32::round_error());

  // Infinities and NaNs
  ASSERT_TRUE(F16::infinity().is_infinity());
  ASSERT_FALSE(F16::infinity().signbit());
  ASSERT_TRUE((-F16::infinity()).is_infinity());
  ASSERT_TRUE((-F16::infinity()).signbit());
  ASSERT_TRUE(F16::quiet_NaN().is_nan());
  ASSERT_TRUE(F16::signaling_NaN().is_nan());

  // min() is smallest positive normal, denorm_min() is smallest subnormal
  const auto zero = Float16::FromBits(0);
  ASSERT_TRUE(F16::min().is_finite());
  ASSERT_FALSE(F16::min().signbit());
  ASSERT_TRUE(F16::denorm_min().is_finite());
  ASSERT_FALSE(F16::denorm_min().signbit());
  ASSERT_TRUE(F16::min() > zero);
  ASSERT_TRUE(F16::denorm_min() > zero);
  ASSERT_TRUE(F16::denorm_min() < F16::min());
  // NOTE(review): also asserting that denorm_min / 2 converts to zero was
  // suggested; that relies on how the float -> Float16 conversion breaks ties,
  // so confirm the rounding behaviour before adding it.

  // epsilon is exactly one ULP at 1.0: 1 + epsilon is the next value after 1
  const auto one = Float16(1.0f);
  const auto one_plus_epsilon = Float16(one.ToFloat() + F16::epsilon().ToFloat());
  ASSERT_NE(one, one_plus_epsilon);
  const auto next_after_one = static_cast<uint16_t>(one.bits() + 1);
  ASSERT_EQ(one_plus_epsilon, Float16::FromBits(next_after_one));
  // NOTE(review): asserting 1 + (epsilon / 2) == 1 was also suggested; it is an
  // exact tie between two representable values, so it likewise depends on the
  // conversion's tie-breaking - confirm before adding.
}

} // namespace
} // namespace arrow::util
Loading