add __float128 support for Midpoint754

boostorg · ryanelandt · Aug 5, 2023 · Aug 5, 2023 · Aug 5, 2023 · Aug 7, 2023
commit 21dc82bdc678b1d857c18794c8c3b7ed60374d30
diff --git a/include/boost/math/tools/roots.hpp b/include/boost/math/tools/roots.hpp
@@ -25,6 +25,11 @@
 #include <boost/math/tools/toms748_solve.hpp>
 #include <boost/math/policies/error_handling.hpp>
 
+#ifdef BOOST_HAS_FLOAT128
+#include <boost/multiprecision/float128.hpp>
+using boost::multiprecision::float128;
+#endif
+
 namespace boost {
 namespace math {
 namespace tools {
@@ -273,6 +278,9 @@ namespace Bisection {
       static_assert(std::is_unsigned<U>::value, "U must be an unsigned integer type.");
       static_assert(sizeof(T) == sizeof(U), "Type and uint size must be the same.");
 
+      // The sign bit is 1, all other bits are 0
+      static constexpr U sign_mask() { return U(1) << (sizeof(U) * 8 - 1); }
+
       // Convert float to bits
       static U float_to_uint(T x) {
          U bits;
@@ -287,47 +295,73 @@ namespace Bisection {
          return x;
       }
 
+      // Avoids linking against libquadmath for __float128
+      static T fabs_imp(T x) {
+         U bits_x = float_to_uint(x);
+         bits_x = bits_x & ~sign_mask();  // Zero the sign bit
+         return uint_to_float(bits_x);
+      }
+
    public:
       static T solve(T x, T y) {
-         using std::fabs;
-
          // Sort so that y has the larger magnitude
-         if (fabs(y) < fabs(x)) {
+         if (fabs_imp(y) < fabs_imp(x)) {
             std::swap(x, y);
          }
-
-         const T x_mag = std::fabs(x);
-         const T y_mag = std::fabs(y);
-         const T sign_x = sign(x);
-         const T sign_y = sign(y);
 
-         // Convert the magnitudes to bits
-         U bits_mag_x = float_to_uint(x_mag);
-         U bits_mag_y = float_to_uint(y_mag);
+         const U bits_x = float_to_uint(x);
+         const U bits_y = float_to_uint(y);
+
+         const U sign_x = bits_x & sign_mask();
+         const U sign_y = bits_y & sign_mask();
+
+         const U mag_x = bits_x & ~sign_mask();
+         const U mag_y = bits_y & ~sign_mask();
 
          // Calculate the average magnitude in bits
-         U bits_mag = (sign_x == sign_y) ? (bits_mag_y + bits_mag_x) : (bits_mag_y - bits_mag_x);
+         U bits_mag = (sign_x == sign_y) ? (mag_y + mag_x) : (mag_y - mag_x);
          bits_mag = bits_mag >> 1;  // Divide by 2
 
-         // Reconstruct upl_mean from average magnitude and sign of y
-         return uint_to_float(bits_mag) * sign_y;
+         bits_mag = bits_mag | sign_y;  // Make the sign of bits_mag match the sign of y
+
+         return uint_to_float(bits_mag);
+      }
+
+      // NOTE: boost::multiprecision::float128 is cast to __float128
+      template <typename V>
+      static V solve(V x, V y) { return solve(static_cast<T>(x), static_cast<T>(y)); }
+
+      // Must evaluate to true in order to bisect correctly with infinity
+      // Ideally this should be a static assert.
+      static bool is_one_plus_max_bits_inf() {
+         const U bits_max = float_to_uint((std::numeric_limits<T>::max)());
+         const U bits_one_plus_max = bits_max + 1;
+         return uint_to_float(bits_one_plus_max) == std::numeric_limits<T>::infinity();
       }
+
+      using type_float = T;  // Used for unit tests
    };  // class Midpoint754
 
 
    template <typename T>
    class MidpointNon754 {
    private:
-      // NOTE: The Midpoint754 solver is faster than this solver and should be used when possible.
-      //       To use the Midpoint754 solver, two criteria must be satisfied:
-      //         1. The type T must conform to the IEEE 754 standard and
-      //         2. There must exist a corresponding unsigned integer type with the same number of bits.
-      //       Currently, this limits the use of the Midpoint754 solver to `float` and `double`. The
-      //       lack of a 128 bit unsigned integer type prevents the Midpoint754 solver being used for
-      //       `long double`.
+      // NOTE: The Midpoint754 solver should be used when possible because it is faster
+      //       than this solver. The two criteria below must be satifsied to use the Midpoint754
+      //       solver:
+      //           1. The type T must conform to the IEEE 754 standard and
+      //           2. The following sequence of steps must produce `numeric_limits<T>::infinity`.
+      //              Start with `numeric_limits<T>::max()`. Then reinterpret this value as an
+      //              unsigned integer. Next add one to this unsigned integer. Finally reinterpret
+      //              the result as type T. This result must equal infinity.
+      //       The above two criteria are true for the following datatypes: `float`, `double`,
+      //       and `__float128`. The 80 bit variation of `long double` does not satisfy the
+      //       second criteria.
       static_assert(!std::is_same<T, float>::value, "Need to use Midpoint754 solver when T is float");
       static_assert(!std::is_same<T, double>::value, "Need to use Midpoint754 solver when T is double");
-      // static_assert(!std::is_same<T, long double>::value -- (See NOTE above)
+#if defined(BOOST_HAS_INT128) && defined(BOOST_HAS_FLOAT128)
+      static_assert(!std::is_same<T, __float128>::value, "Need to use Midpoint754 solver when T is __float128");
+#endif
 
    public:
       static T solve(T x, T y) {
@@ -422,6 +456,9 @@ namespace Bisection {
             const U denorm_min = std::numeric_limits<U>::denorm_min();
             if (denorm_min != 0) { return denorm_min; }
 
+            // NOTE: the two lines below can be removed when
+            //       https://github.com/boostorg/multiprecision/pull/562
+            //       gets merged.
             const U min = (std::numeric_limits<U>::min)();
             if (min != 0) { return min; }
 
@@ -467,16 +504,32 @@ namespace Bisection {
       }
    }; // class MidpointNon754
 
-   template <typename T>                                                
-   static T calc_midpoint(T x, T y) {
-      return MidpointNon754<T>::solve(x, y);
-   }
-   static float calc_midpoint(float x, float y) {
-      return Midpoint754<float, std::uint32_t>::solve(x, y);
-   }
-   static double calc_midpoint(double x, double y) {
-      return Midpoint754<double, std::uint64_t>::solve(x, y);
-   }
+   // The purposes of this class is to not cause compiler warnings from unused functions.
+   class CalcMidpoint {
+   public:
+      template <typename T>
+      static MidpointNon754<T> get_solver(T) {
+         return MidpointNon754<T>();
+      }
+      static Midpoint754<float, std::uint32_t> get_solver(float) {
+         return Midpoint754<float, std::uint32_t>();
+      }
+      static Midpoint754<double, std::uint64_t> get_solver(double) {
+         return Midpoint754<double, std::uint64_t>();
+      }
+#if defined(BOOST_HAS_INT128) && defined(BOOST_HAS_FLOAT128)
+      static Midpoint754<__float128, boost::uint128_type> get_solver(__float128) {
+         return Midpoint754<__float128, boost::uint128_type>();
+      }
+      static Midpoint754<__float128, boost::uint128_type> get_solver(boost::multiprecision::float128) {
+         return Midpoint754<__float128, boost::uint128_type>();
+      }
+#endif
+      template <typename T>
+      static T calc_midpoint(T x, T y) {
+         return get_solver(x).solve(x, y);
+      }
+   };
 
 }  // namespace Bisection
 }  // namespace detail
@@ -549,7 +602,7 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t&
       {
          // Last two steps haven't converged.
          const T x_other = (delta > 0) ? min : max;
-         delta = result - detail::Bisection::calc_midpoint(result, x_other);
+         delta = result - detail::Bisection::CalcMidpoint::calc_midpoint(result, x_other);
          // reset delta1/2 so we don't take this branch next time round:
          delta1 = 3 * delta;
          delta2 = 3 * delta;

diff --git a/test/test_roots.cpp b/test/test_roots.cpp
@@ -27,6 +27,10 @@
 #include <boost/multiprecision/cpp_complex.hpp>
 #include <boost/math/concepts/real_concept.hpp>
 
+#ifdef BOOST_HAS_FLOAT128
+#include <boost/multiprecision/float128.hpp>
+#endif
+
 #if __has_include(<stdfloat>)
 #  include <stdfloat>
 #endif
@@ -655,93 +659,124 @@ void test_failures()
 #endif
 }
 
+// Creates a vector of test values that include: zero, denorm_min, min, epsilon, max,
+// and infinity. Also includes powers of 2 ranging from 2^-15 to 2^+15 by powers of 3.
 template <typename T>
-void test_bisect() {
-   // Creates a vector of test values that include: zero, denorm_min, min, epsilon, max,
-   // and infinity. Also includes powers of 2 ranging from 2^-15 to 2^+15 by powers of 3.
-   const auto fn_create_test_ladder = [](){
-      std::vector<T> v;
-
-      const auto fn_push_back_x_scaled = [&v](const T& x) {
-         const auto fn_add_if_non_zero_and_finite = [&v](const T& x) {
-            if (x != 0 && boost::math::isfinite(x)) {  // Avoids most duplications
-               v.push_back(x);
-            }
-         };
-
-         const T two_thirds = T(2) / T(3);
-         const T four_thirds = T(4) / T(3);
-
-         v.push_back(x);
-         fn_add_if_non_zero_and_finite(x * T(0.5));
-         fn_add_if_non_zero_and_finite(x * two_thirds);
-         fn_add_if_non_zero_and_finite(x * four_thirds);
-         fn_add_if_non_zero_and_finite(x * T(2.0));
-      };
-
-      const auto fn_push_back_pm = [&](std::vector<T>& v, const T& x) {
-         fn_push_back_x_scaled( x);
-         fn_push_back_x_scaled(-x);
-      };
-
-      fn_push_back_pm(v, T(0.0));
-      fn_push_back_pm(v, std::numeric_limits<T>::denorm_min());
-      fn_push_back_pm(v, (std::numeric_limits<T>::min)());
-      fn_push_back_pm(v, std::numeric_limits<T>::epsilon());
-      const int test_exp_range = 5;
-      for (int i = -test_exp_range; i < (test_exp_range + 1); ++i) {
-         const int exponent = i * 3;
-         const T x = std::ldexp(1.0, exponent);
-         fn_push_back_pm(v, x);
-      }
-      fn_push_back_pm(v, (std::numeric_limits<T>::max)());
+std::vector<T> create_test_ladder() {
+   std::vector<T> v;
+
+   const auto fn_push_back_x_scaled = [&v](const T& x) {
+      const T two_thirds = T(2) / T(3);
+      const T four_thirds = T(4) / T(3);
+
+      v.push_back(x);
+      v.push_back(x * T(0.5));
+      v.push_back(x * two_thirds);
+      v.push_back(x * four_thirds);
+      v.push_back(x * T(2.0));
+   };
 
-      // Real_concept doesn't have infinity
-      if (std::numeric_limits<T>::has_infinity) {
-         fn_push_back_pm(v, std::numeric_limits<T>::infinity());
-      }
+   const auto fn_push_back_pm = [&](std::vector<T>& v, const T& x) {
+      fn_push_back_x_scaled( x);
+      fn_push_back_x_scaled(-x);
+   };
 
-      std::sort(v.begin(), v.end());
+   fn_push_back_pm(v, std::numeric_limits<T>::denorm_min());
+   fn_push_back_pm(v, (std::numeric_limits<T>::min)());
+   fn_push_back_pm(v, std::numeric_limits<T>::epsilon());
 
-      const int num_zeros = std::count(v.begin(), v.end(), T(0.0));
-      BOOST_CHECK_LE(2, num_zeros);  // Positive and negative zero 
+   const int test_exp_range = 5;
+   for (int i = -test_exp_range; i < (test_exp_range + 1); ++i) {
+      const int exponent = i * 3;
+      const T x = std::ldexp(1.0, exponent);
+      fn_push_back_pm(v, x);
+   }
+   fn_push_back_pm(v, (std::numeric_limits<T>::max)());
+   fn_push_back_pm(v, std::numeric_limits<T>::infinity());
 
-      return v;
-   };
+   // Take unique elements of v
+   std::sort(v.begin(), v.end());
+   v.erase(std::unique(v.begin(), v.end()), v.end());
+
+   // Add both positive and negative zero
+   v.push_back(T(-0.0));
+   v.push_back(T(+0.0));
+
+   const int num_zeros = std::count(v.begin(), v.end(), T(0.0));
+   BOOST_CHECK_LE(2, num_zeros);  // Check for both positive and negative zero 
 
+   return v;
+};
+
+template <typename T, typename S>
+void test_bisect_non(S solver) {
    // Test for all combinations from the ladder
-   auto v = fn_create_test_ladder();
+   auto v = create_test_ladder<T>();
+
    for (const T& x_i: v) {
       for (const T& x_j: v) {
-         const T x_mid_ij = boost::math::tools::detail::Bisection::calc_midpoint(x_i, x_j);
+         const T x_mid_ij = solver.solve(x_i, x_j);
 
          const T x_lo = (std::min)(x_i, x_j);
          const T x_hi = (std::max)(x_i, x_j);
 
-         BOOST_CHECK_LE(x_lo, x_mid_ij);
-         BOOST_CHECK_LE(x_mid_ij, x_hi);
+         BOOST_CHECK(x_lo <= x_mid_ij);  // NOTE: BOOST_CHECK_LE(x_lo, x_mid_ij) fails to link
+         BOOST_CHECK(x_mid_ij <= x_hi);  // NOTE: BOOST_CHECK_LE(x_mid_ij, x_hi) fails to link
       }
    }
 }
 
+template <typename T>
+void test_bisect_non() {
+   const auto solver = boost::math::tools::detail::Bisection::CalcMidpoint::get_solver(T(1));  // Input value unused
+   test_bisect_non<T,decltype(solver)>(solver);
+}
+
+template <typename S>
+void test_bisect_754(S solver) {
+   BOOST_MATH_ASSERT(solver.is_one_plus_max_bits_inf());  // Catch infinity misbehavior for 80 bit `long double`
+                                                          // before it causes downstream failures
+   test_bisect_non<typename S::type_float,S>(solver);
+}
+
+template <typename T>
+void test_bisect_754() {
+   const auto solver = boost::math::tools::detail::Bisection::CalcMidpoint::get_solver(T(1));  // Input value unused
+   test_bisect_754(solver);
+}
+
+void test_long_double_fail() {
+   const auto solver = boost::math::tools::detail::Bisection::Midpoint754<long double,boost::uint128_type>();
+   BOOST_CHECK(!solver.is_one_plus_max_bits_inf());
+}
+
 void test_bisect_all_cases() {
-   test_bisect<float>();
-   test_bisect<double>();
-   test_bisect<long double>();
-   test_bisect<boost::math::concepts::real_concept>();
-   test_bisect<boost::multiprecision::cpp_bin_float_50>();
-   test_bisect<boost::multiprecision::cpp_bin_float_100>();
-   test_bisect<boost::multiprecision::cpp_dec_float_50>();
-   test_bisect<boost::multiprecision::cpp_dec_float_100>();
+   test_bisect_754<float>();
+   test_bisect_754<double>();
+   test_bisect_non<long double>();
+   test_bisect_non<boost::math::concepts::real_concept>();
+   test_bisect_non<boost::multiprecision::cpp_bin_float_50>();
+   test_bisect_non<boost::multiprecision::cpp_bin_float_100>();
+   test_bisect_non<boost::multiprecision::cpp_dec_float_50>();
+   test_bisect_non<boost::multiprecision::cpp_dec_float_100>();
+
+#if defined(BOOST_HAS_FLOAT128) && defined(BOOST_HAS_INT128)
+   test_bisect_754<boost::multiprecision::float128>();
+   test_bisect_754<__float128>();
+#endif 
 
 #if __has_include(<stdfloat>)
 #ifdef __STDCPP_FLOAT32_T__
-   test_bisect<std::float32_t>();
+   test_bisect_754<std::float32_t>();
 #endif
 #ifdef __STDCPP_FLOAT64_T__
-   test_bisect<std::float64_t>();
+   test_bisect_754<std::float64_t>();
 #endif 
 #endif
+
+#if LDBL_MANT_DIG == 64 && LDBL_MAX_EXP == 16384 && defined(BOOST_HAS_INT128)
+   test_long_double_fail();
+#endif
 }
 
 void test_count() {