inc/libm_inlines_amd.h - open64_libacml_mv - Git at Google


 /*
 *  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
 *
 *  This file is part of libacml_mv.
 *
 *  libacml_mv is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  libacml_mv is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with libacml_mv.  If not, see
 *  <http://www.gnu.org/licenses/>.
 *
 */


 #ifndef LIBM_INLINES_AMD_H_INCLUDED
 #define LIBM_INLINES_AMD_H_INCLUDED 1

 #include "libm_util_amd.h"
 #include <math.h>

 #ifdef WINDOWS
 #define inline __inline
 #include "emmintrin.h"
 #endif

 /* Compile-time verification that type long is the same size
    as type double (i.e. we are really on a 64-bit machine) */
 void check_long_against_double_size(int machine_is_64_bit[(sizeof(long long) == sizeof(double))?1:-1]);

 /* Set defines for inline functions calling other inlines */
 #if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \
     defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \
     defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \
     defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \
     defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \
     defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \
     (defined(WINDOWS) && (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)))
 #undef USE_RAISE_FPSW_FLAGS
 #define USE_RAISE_FPSW_FLAGS 1
 #endif

 #if defined(USE_SPLITDOUBLE)
 /* Splits double x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0.
    Assumes that x is not zero, denormal, infinity or NaN, but these conditions
    are not checked */
 static inline void splitDouble(double x, int *e, double *m)
 {
   unsigned long long ux, uy;
   GET_BITS_DP64(x, ux);
   uy = ux;
   ux &= EXPBITS_DP64;
   ux >>= EXPSHIFTBITS_DP64;
   *e = (int)ux - EXPBIAS_DP64 + 1;
   uy = (uy & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64;
   PUT_BITS_DP64(uy, x);
   *m = x;
 }
 #endif /* USE_SPLITDOUBLE */


 #if defined(USE_SPLITDOUBLE_2)
 /* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0.
    Assumes that x is not zero, denormal, infinity or NaN, but these conditions
    are not checked. Also assumes EXPBIAS_DP is odd. With this
    assumption, e will be even on exit. */
 static inline void splitDouble_2(double x, int *e, double *m)
 {
   unsigned long long ux, vx;
   GET_BITS_DP64(x, ux);
   vx = ux;
   ux &= EXPBITS_DP64;
   ux >>= EXPSHIFTBITS_DP64;
   if (ux & 1)
     {
       /* The exponent is odd */
       vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64;
       PUT_BITS_DP64(vx, x);
       *m = x;
       *e = ux - EXPBIAS_DP64;
     }
   else
     {
       /* The exponent is even */
       vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64;
       PUT_BITS_DP64(vx, x);
       *m = x;
       *e = ux - EXPBIAS_DP64 - 1;
     }
 }
 #endif /* USE_SPLITDOUBLE_2 */


 #if defined(USE_SPLITFLOAT)
 /* Splits float x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0.
    Assumes that x is not zero, denormal, infinity or NaN, but these conditions
    are not checked */
 static inline void splitFloat(float x, int *e, float *m)
 {
   unsigned int ux, uy;
   GET_BITS_SP32(x, ux);
   uy = ux;
   ux &= EXPBITS_SP32;
   ux >>= EXPSHIFTBITS_SP32;
   *e = (int)ux - EXPBIAS_SP32 + 1;
   uy = (uy & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32;
   PUT_BITS_SP32(uy, x);
   *m = x;
 }
 #endif /* USE_SPLITFLOAT */


 #if defined(USE_SCALEDOUBLE_1)
 /* Scales the double x by 2.0**n.
    Assumes EMIN <= n <= EMAX, though this condition is not checked. */
 static inline double scaleDouble_1(double x, int n)
 {
   double t;
   /* Construct the number t = 2.0**n */
   PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t);
   return x*t;
 }
 #endif /* USE_SCALEDOUBLE_1 */


 #if defined(USE_SCALEDOUBLE_2)
 /* Scales the double x by 2.0**n.
    Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
 static inline double scaleDouble_2(double x, int n)
 {
   double t1, t2;
   int n1, n2;
   n1 = n / 2;
   n2 = n - n1;
   /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
   PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
   PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
   return (x*t1)*t2;
 }
 #endif /* USE_SCALEDOUBLE_2 */


 #if defined(USE_SCALEDOUBLE_3)
 /* Scales the double x by 2.0**n.
    Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */
 static inline double scaleDouble_3(double x, int n)
 {
   double t1, t2, t3;
   int n1, n2, n3;
   n1 = n / 3;
   n2 = (n - n1) / 2;
   n3 = n - n1 - n2;
   /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */
   PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
   PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
   PUT_BITS_DP64(((long long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3);
   return ((x*t1)*t2)*t3;
 }
 #endif /* USE_SCALEDOUBLE_3 */


 #if defined(USE_SCALEFLOAT_1)
 /* Scales the float x by 2.0**n.
    Assumes EMIN <= n <= EMAX, though this condition is not checked. */
 static inline float scaleFloat_1(float x, int n)
 {
   float t;
   /* Construct the number t = 2.0**n */
   PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t);
   return x*t;
 }
 #endif /* USE_SCALEFLOAT_1 */


 #if defined(USE_SCALEFLOAT_2)
 /* Scales the float x by 2.0**n.
    Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
 static inline float scaleFloat_2(float x, int n)
 {
   float t1, t2;
   int n1, n2;
   n1 = n / 2;
   n2 = n - n1;
   /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
   PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1);
   PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2);
   return (x*t1)*t2;
 }
 #endif /* USE_SCALEFLOAT_2 */


 #if defined(USE_SCALEFLOAT_3)
 /* Scales the float x by 2.0**n.
    Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */
 static inline float scaleFloat_3(float x, int n)
 {
   float t1, t2, t3;
   int n1, n2, n3;
   n1 = n / 3;
   n2 = (n - n1) / 2;
   n3 = n - n1 - n2;
   /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */
   PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1);
   PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2);
   PUT_BITS_SP32((n3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t3);
   return ((x*t1)*t2)*t3;
 }
 #endif /* USE_SCALEFLOAT_3 */

 #if defined(USE_SETPRECISIONDOUBLE)
 unsigned int setPrecisionDouble(void)
 {
   unsigned int cw, cwold = 0;
   /* There is no precision control on Hammer */
   return cwold;
 }
 #endif /* USE_SETPRECISIONDOUBLE */

 #if defined(USE_RESTOREPRECISION)
 void restorePrecision(unsigned int cwold)
 {
 #if defined(WINDOWS)
   /* There is no precision control on Hammer */
 #elif defined(linux)
   /* There is no precision control on Hammer */
 #else
 #error Unknown machine
 #endif
   return;
 }
 #endif /* USE_RESTOREPRECISION */


 #if defined(USE_CLEAR_FPSW_FLAGS)
 /* Clears floating-point status flags. The argument should be
    the bitwise or of the flags to be cleared, from the
    list above, e.g.
      clear_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID);
  */
 static inline void clear_fpsw_flags(int flags)
 {
 #if defined(WINDOWS)
   unsigned int cw = _mm_getcsr();
   cw &= (~flags);
   _mm_setcsr(cw);
 #elif defined(linux)
   unsigned int cw;
   /* Get the current floating-point control/status word */
   asm volatile ("STMXCSR %0" : "=m" (cw));
   cw &= (~flags);
   asm volatile ("LDMXCSR %0" : : "m" (cw));
 #else
 #error Unknown machine
 #endif
 }
 #endif /* USE_CLEAR_FPSW_FLAGS */


 #if defined(USE_RAISE_FPSW_FLAGS)
 /* Raises floating-point status flags. The argument should be
    the bitwise or of the flags to be raised, from the
    list above, e.g.
      raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID);
  */
 static inline void raise_fpsw_flags(int flags)
 {
 #if defined(WINDOWS)
   _mm_setcsr(_mm_getcsr() | flags);
 #elif defined(linux)
   unsigned int cw;
   /* Get the current floating-point control/status word */
   asm volatile ("STMXCSR %0" : "=m" (cw));
   cw |= flags;
   asm volatile ("LDMXCSR %0" : : "m" (cw));
 #else
 #error Unknown machine
 #endif
 }
 #endif /* USE_RAISE_FPSW_FLAGS */


 #if defined(USE_GET_FPSW_INLINE)
 /* Return the current floating-point status word */
 static inline unsigned int get_fpsw_inline(void)
 {
 #if defined(WINDOWS)
   return _mm_getcsr();
 #elif defined(linux)
   unsigned int sw;
   asm volatile ("STMXCSR %0" : "=m" (sw));
   return sw;
 #else
 #error Unknown machine
 #endif
 }
 #endif /* USE_GET_FPSW_INLINE */

 #if defined(USE_SET_FPSW_INLINE)
 /* Set the floating-point status word */
 static inline void set_fpsw_inline(unsigned int sw)
 {
 #if defined(WINDOWS)
   _mm_setcsr(sw);
 #elif defined(linux)
   /* Set the current floating-point control/status word */
   asm volatile ("LDMXCSR %0" : : "m" (sw));
 #else
 #error Unknown machine
 #endif
 }
 #endif /* USE_SET_FPSW_INLINE */

 #if defined(USE_CLEAR_FPSW_INLINE)
 /* Clear all exceptions from the floating-point status word */
 static inline void clear_fpsw_inline(void)
 {
 #if defined(WINDOWS)
   unsigned int cw;
   cw = _mm_getcsr();
   cw &= ~(AMD_F_INEXACT | AMD_F_UNDERFLOW | AMD_F_OVERFLOW |
           AMD_F_DIVBYZERO | AMD_F_INVALID);
   _mm_setcsr(cw);
 #elif defined(linux)
   unsigned int cw;
   /* Get the current floating-point control/status word */
   asm volatile ("STMXCSR %0" : "=m" (cw));
   cw &= ~(AMD_F_INEXACT | AMD_F_UNDERFLOW | AMD_F_OVERFLOW |
           AMD_F_DIVBYZERO | AMD_F_INVALID);
   asm volatile ("LDMXCSR %0" : : "m" (cw));
 #else
 #error Unknown machine
 #endif
 }
 #endif /* USE_CLEAR_FPSW_INLINE */


 #if defined(USE_VAL_WITH_FLAGS)
 /* Returns a double value after raising the given flags,
   e.g.  val_with_flags(x, AMD_F_INEXACT);
  */
 static inline double val_with_flags(double val, int flags)
 {
   raise_fpsw_flags(flags);
   return val;
 }
 #endif /* USE_VAL_WITH_FLAGS */

 #if defined(USE_VALF_WITH_FLAGS)
 /* Returns a float value after raising the given flags,
   e.g.  valf_with_flags(x, AMD_F_INEXACT);
  */
 static inline float valf_with_flags(float val, int flags)
 {
   raise_fpsw_flags(flags);
   return val;
 }
 #endif /* USE_VALF_WITH_FLAGS */


 #if defined(USE_ZERO_WITH_FLAGS)
 /* Returns a double +zero after raising the given flags,
   e.g.  zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
  */
 static inline double zero_with_flags(int flags)
 {
   raise_fpsw_flags(flags);
   return 0.0;
 }
 #endif /* USE_ZERO_WITH_FLAGS */


 #if defined(USE_ZEROF_WITH_FLAGS)
 /* Returns a float +zero after raising the given flags,
   e.g.  zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
  */
 static inline float zerof_with_flags(int flags)
 {
   raise_fpsw_flags(flags);
   return 0.0F;
 }
 #endif /* USE_ZEROF_WITH_FLAGS */


 #if defined(USE_NAN_WITH_FLAGS)
 /* Returns a double quiet +nan after raising the given flags,
    e.g.  nan_with_flags(AMD_F_INVALID);
 */
 static inline double nan_with_flags(int flags)
 {
   double z;
   raise_fpsw_flags(flags);
   PUT_BITS_DP64(0x7ff8000000000000, z);
   return z;
 }
 #endif /* USE_NAN_WITH_FLAGS */

 #if defined(USE_NANF_WITH_FLAGS)
 /* Returns a float quiet +nan after raising the given flags,
    e.g.  nanf_with_flags(AMD_F_INVALID);
 */
 static inline float nanf_with_flags(int flags)
 {
   float z;
   raise_fpsw_flags(flags);
   PUT_BITS_SP32(0x7fc00000, z);
   return z;
 }
 #endif /* USE_NANF_WITH_FLAGS */


 #if defined(USE_INDEFINITE_WITH_FLAGS)
 /* Returns a double indefinite after raising the given flags,
    e.g.  indefinite_with_flags(AMD_F_INVALID);
 */
 static inline double indefinite_with_flags(int flags)
 {
   double z;
   raise_fpsw_flags(flags);
   PUT_BITS_DP64(0xfff8000000000000, z);
   return z;
 }
 #endif /* USE_INDEFINITE_WITH_FLAGS */

 #if defined(USE_INDEFINITEF_WITH_FLAGS)
 /* Returns a float quiet +indefinite after raising the given flags,
    e.g.  indefinitef_with_flags(AMD_F_INVALID);
 */
 static inline float indefinitef_with_flags(int flags)
 {
   float z;
   raise_fpsw_flags(flags);
   PUT_BITS_SP32(0xffc00000, z);
   return z;
 }
 #endif /* USE_INDEFINITEF_WITH_FLAGS */


 #ifdef USE_INFINITY_WITH_FLAGS
 /* Returns a positive double infinity after raising the given flags,
    e.g.  infinity_with_flags(AMD_F_OVERFLOW);
 */
 static inline double infinity_with_flags(int flags)
 {
   double z;
   raise_fpsw_flags(flags);
   PUT_BITS_DP64((unsigned long long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, z);
   return z;
 }
 #endif /* USE_INFINITY_WITH_FLAGS */

 #ifdef USE_INFINITYF_WITH_FLAGS
 /* Returns a positive float infinity after raising the given flags,
    e.g.  infinityf_with_flags(AMD_F_OVERFLOW);
 */
 static inline float infinityf_with_flags(int flags)
 {
   float z;
   raise_fpsw_flags(flags);
   PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, z);
   return z;
 }
 #endif /* USE_INFINITYF_WITH_FLAGS */


 #if defined(USE_SPLITEXP)
 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
    Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
    abs(x) > large/(ln(base)) (where large is the largest representable
    floating point number) should be handled separately instead of calling
    this function. This function is called by exp_amd, exp2_amd, exp10_amd,
    cosh_amd and sinh_amd. */
 static inline void splitexp(double x, double logbase,
                             double thirtytwo_by_logbaseof2,
                             double logbaseof2_by_32_lead,
                             double logbaseof2_by_32_trail,
                             int *m, double *z1, double *z2)
 {
   double q, r, r1, r2, f1, f2;
   int n, j;

 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
    leading and trailing parts respectively of precomputed
    values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
    two_to_jby32_lead_table contains the first 25 bits of precision,
    and two_to_jby32_trail_table contains a further 53 bits precision. */

   static const double two_to_jby32_lead_table[32] = {
     1.00000000000000000000e+00,   /* 0x3ff0000000000000 */
     1.02189713716506958008e+00,   /* 0x3ff059b0d0000000 */
     1.04427373409271240234e+00,   /* 0x3ff0b55860000000 */
     1.06714040040969848633e+00,   /* 0x3ff11301d0000000 */
     1.09050768613815307617e+00,   /* 0x3ff172b830000000 */
     1.11438673734664916992e+00,   /* 0x3ff1d48730000000 */
     1.13878858089447021484e+00,   /* 0x3ff2387a60000000 */
     1.16372483968734741211e+00,   /* 0x3ff29e9df0000000 */
     1.18920707702636718750e+00,   /* 0x3ff306fe00000000 */
     1.21524733304977416992e+00,   /* 0x3ff371a730000000 */
     1.24185776710510253906e+00,   /* 0x3ff3dea640000000 */
     1.26905095577239990234e+00,   /* 0x3ff44e0860000000 */
     1.29683953523635864258e+00,   /* 0x3ff4bfdad0000000 */
     1.32523661851882934570e+00,   /* 0x3ff5342b50000000 */
     1.35425549745559692383e+00,   /* 0x3ff5ab07d0000000 */
     1.38390988111495971680e+00,   /* 0x3ff6247eb0000000 */
     1.41421353816986083984e+00,   /* 0x3ff6a09e60000000 */
     1.44518077373504638672e+00,   /* 0x3ff71f75e0000000 */
     1.47682613134384155273e+00,   /* 0x3ff7a11470000000 */
     1.50916439294815063477e+00,   /* 0x3ff8258990000000 */
     1.54221081733703613281e+00,   /* 0x3ff8ace540000000 */
     1.57598084211349487305e+00,   /* 0x3ff93737b0000000 */
     1.61049032211303710938e+00,   /* 0x3ff9c49180000000 */
     1.64575546979904174805e+00,   /* 0x3ffa5503b0000000 */
     1.68179279565811157227e+00,   /* 0x3ffae89f90000000 */
     1.71861928701400756836e+00,   /* 0x3ffb7f76f0000000 */
     1.75625211000442504883e+00,   /* 0x3ffc199bd0000000 */
     1.79470902681350708008e+00,   /* 0x3ffcb720d0000000 */
     1.83400803804397583008e+00,   /* 0x3ffd5818d0000000 */
     1.87416762113571166992e+00,   /* 0x3ffdfc9730000000 */
     1.91520655155181884766e+00,   /* 0x3ffea4afa0000000 */
     1.95714408159255981445e+00};  /* 0x3fff507650000000 */

   static const double two_to_jby32_trail_table[32] = {
     0.00000000000000000000e+00,   /* 0x0000000000000000 */
     1.14890470981563546737e-08,   /* 0x3e48ac2ba1d73e2a */
     4.83347014379782142328e-08,   /* 0x3e69f3121ec53172 */
     2.67125131841396124714e-10,   /* 0x3df25b50a4ebbf1b */
     4.65271045830351350190e-08,   /* 0x3e68faa2f5b9bef9 */
     5.24924336638693782574e-09,   /* 0x3e368b9aa7805b80 */
     5.38622214388600821910e-08,   /* 0x3e6ceac470cd83f6 */
     1.90902301017041969782e-08,   /* 0x3e547f7b84b09745 */
     3.79763538792174980894e-08,   /* 0x3e64636e2a5bd1ab */
     2.69306947081946450986e-08,   /* 0x3e5ceaa72a9c5154 */
     4.49683815095311756138e-08,   /* 0x3e682468446b6824 */
     1.41933332021066904914e-09,   /* 0x3e18624b40c4dbd0 */
     1.94146510233556266402e-08,   /* 0x3e54d8a89c750e5e */
     2.46409119489264118569e-08,   /* 0x3e5a753e077c2a0f */
     4.94812958044698886494e-08,   /* 0x3e6a90a852b19260 */
     8.48872238075784476136e-10,   /* 0x3e0d2ac258f87d03 */
     2.42032342089579394887e-08,   /* 0x3e59fcef32422cbf */
     3.32420002333182569170e-08,   /* 0x3e61d8bee7ba46e2 */
     1.45956577586525322754e-08,   /* 0x3e4f580c36bea881 */
     3.46452721050003920866e-08,   /* 0x3e62999c25159f11 */
     8.07090469079979051284e-09,   /* 0x3e415506dadd3e2a */
     2.99439161340839520436e-09,   /* 0x3e29b8bc9e8a0388 */
     9.83621719880452147153e-09,   /* 0x3e451f8480e3e236 */
     8.35492309647188080486e-09,   /* 0x3e41f12ae45a1224 */
     3.48493175137966283582e-08,   /* 0x3e62b5a75abd0e6a */
     1.11084703472699692902e-08,   /* 0x3e47daf237553d84 */
     5.03688744342840346564e-08,   /* 0x3e6b0aa538444196 */
     4.81896001063495806249e-08,   /* 0x3e69df20d22a0798 */
     4.83653666334089557746e-08,   /* 0x3e69f7490e4bb40b */
     1.29745882314081237628e-08,   /* 0x3e4bdcdaf5cb4656 */
     9.84532844621636118964e-09,   /* 0x3e452486cc2c7b9d */
     4.25828404545651943883e-08};  /* 0x3e66dc8a80ce9f09 */

     /*
       Step 1. Reduce the argument.

       To perform argument reduction, we find the integer n such that
       x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
       n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
       remainder by x - n*logbaseof2/32. The calculation of n is
       straightforward whereas the computation of x - n*logbaseof2/32
       must be carried out carefully.
       logbaseof2/32 is so represented in two pieces that
       (1) logbaseof2/32 is known to extra precision, (2) the product
       of n and the leading piece is a model number and is hence
       calculated without error, and (3) the subtraction of the value
       obtained in (2) from x is a model number and is hence again
       obtained without error.
     */

     r = x * thirtytwo_by_logbaseof2;
     /* Set n = nearest integer to r */
     /* This is faster on Hammer */
     if (r > 0)
       n = (int)(r + 0.5);
     else
       n = (int)(r - 0.5);

     r1 = x - n * logbaseof2_by_32_lead;
     r2 =   - n * logbaseof2_by_32_trail;

     /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
     /* j = n % 32;
        if (j < 0) j += 32; */
     j = n & 0x0000001f;

     f1 = two_to_jby32_lead_table[j];
     f2 = two_to_jby32_trail_table[j];

     *m = (n - j) / 32;

     /* Step 2. The following is the core approximation. We approximate
        exp(r1+r2)-1 by a polynomial. */

     r1 *= logbase; r2 *= logbase;

     r = r1 + r2;
     q = r1 + (r2 +
               r*r*( 5.00000000000000008883e-01 +
                       r*( 1.66666666665260878863e-01 +
                       r*( 4.16666666662260795726e-02 +
                       r*( 8.33336798434219616221e-03 +
                       r*( 1.38889490863777199667e-03 ))))));

     /* Step 3. Function value reconstruction.
        We now reconstruct the exponential of the input argument
        so that exp(x) = 2**m * (z1 + z2).
        The order of the computation below must be strictly observed. */

     *z1 = f1;
     *z2 = f2 + ((f1 + f2) * q);
 }
 #endif /* USE_SPLITEXP */


 #if defined(USE_SPLITEXPF)
 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
    Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
    abs(x) > large/(ln(base)) (where large is the largest representable
    floating point number) should be handled separately instead of calling
    this function. This function is called by exp_amd, exp2_amd, exp10_amd,
    cosh_amd and sinh_amd. */
 static inline void splitexpf(float x, float logbase,
                              float thirtytwo_by_logbaseof2,
                              float logbaseof2_by_32_lead,
                              float logbaseof2_by_32_trail,
                              int *m, float *z1, float *z2)
 {
   float q, r, r1, r2, f1, f2;
   int n, j;

 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
    leading and trailing parts respectively of precomputed
    values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
    two_to_jby32_lead_table contains the first 10 bits of precision,
    and two_to_jby32_trail_table contains a further 24 bits precision. */

   static const float two_to_jby32_lead_table[32] = {
     1.0000000000E+00F,  /* 0x3F800000 */
     1.0214843750E+00F,  /* 0x3F82C000 */
     1.0429687500E+00F,  /* 0x3F858000 */
     1.0664062500E+00F,  /* 0x3F888000 */
     1.0898437500E+00F,  /* 0x3F8B8000 */
     1.1132812500E+00F,  /* 0x3F8E8000 */
     1.1386718750E+00F,  /* 0x3F91C000 */
     1.1621093750E+00F,  /* 0x3F94C000 */
     1.1875000000E+00F,  /* 0x3F980000 */
     1.2148437500E+00F,  /* 0x3F9B8000 */
     1.2402343750E+00F,  /* 0x3F9EC000 */
     1.2675781250E+00F,  /* 0x3FA24000 */
     1.2949218750E+00F,  /* 0x3FA5C000 */
     1.3242187500E+00F,  /* 0x3FA98000 */
     1.3535156250E+00F,  /* 0x3FAD4000 */
     1.3828125000E+00F,  /* 0x3FB10000 */
     1.4140625000E+00F,  /* 0x3FB50000 */
     1.4433593750E+00F,  /* 0x3FB8C000 */
     1.4765625000E+00F,  /* 0x3FBD0000 */
     1.5078125000E+00F,  /* 0x3FC10000 */
     1.5410156250E+00F,  /* 0x3FC54000 */
     1.5742187500E+00F,  /* 0x3FC98000 */
     1.6093750000E+00F,  /* 0x3FCE0000 */
     1.6445312500E+00F,  /* 0x3FD28000 */
     1.6816406250E+00F,  /* 0x3FD74000 */
     1.7167968750E+00F,  /* 0x3FDBC000 */
     1.7558593750E+00F,  /* 0x3FE0C000 */
     1.7929687500E+00F,  /* 0x3FE58000 */
     1.8339843750E+00F,  /* 0x3FEAC000 */
     1.8730468750E+00F,  /* 0x3FEFC000 */
     1.9140625000E+00F,  /* 0x3FF50000 */
     1.9570312500E+00F}; /* 0x3FFA8000 */

   static const float two_to_jby32_trail_table[32] = {
     0.0000000000E+00F,  /* 0x00000000 */
     4.1277357377E-04F,  /* 0x39D86988 */
     1.3050324051E-03F,  /* 0x3AAB0D9F */
     7.3415064253E-04F,  /* 0x3A407404 */
     6.6398258787E-04F,  /* 0x3A2E0F1E */
     1.1054925853E-03F,  /* 0x3A90E62D */
     1.1675967835E-04F,  /* 0x38F4DCE0 */
     1.6154836630E-03F,  /* 0x3AD3BEA3 */
     1.7071149778E-03F,  /* 0x3ADFC146 */
     4.0360994171E-04F,  /* 0x39D39B9C */
     1.6234370414E-03F,  /* 0x3AD4C982 */
     1.4728321694E-03F,  /* 0x3AC10C0C */
     1.9176795613E-03F,  /* 0x3AFB5AA6 */
     1.0178930825E-03F,  /* 0x3A856AD3 */
     7.3992193211E-04F,  /* 0x3A41F752 */
     1.0973819299E-03F,  /* 0x3A8FD607 */
     1.5106226783E-04F,  /* 0x391E6678 */
     1.8214319134E-03F,  /* 0x3AEEBD1D */
     2.6364589576E-04F,  /* 0x398A39F4 */
     1.3519275235E-03F,  /* 0x3AB13329 */
     1.1952003697E-03F,  /* 0x3A9CA845 */
     1.7620950239E-03F,  /* 0x3AE6F619 */
     1.1153318919E-03F,  /* 0x3A923054 */
     1.2242280645E-03F,  /* 0x3AA07647 */
     1.5220546629E-04F,  /* 0x391F9958 */
     1.8224230735E-03F,  /* 0x3AEEDE5F */
     3.9278529584E-04F,  /* 0x39CDEEC0 */
     1.7403248930E-03F,  /* 0x3AE41B9D */
     2.3711356334E-05F,  /* 0x37C6E7C0 */
     1.1207590578E-03F,  /* 0x3A92E66F */
     1.1440613307E-03F,  /* 0x3A95F454 */
     1.1287408415E-04F}; /* 0x38ECB6D0 */

     /*
       Step 1. Reduce the argument.

       To perform argument reduction, we find the integer n such that
       x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
       n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
       remainder by x - n*logbaseof2/32. The calculation of n is
       straightforward whereas the computation of x - n*logbaseof2/32
       must be carried out carefully.
       logbaseof2/32 is so represented in two pieces that
       (1) logbaseof2/32 is known to extra precision, (2) the product
       of n and the leading piece is a model number and is hence
       calculated without error, and (3) the subtraction of the value
       obtained in (2) from x is a model number and is hence again
       obtained without error.
     */

     r = x * thirtytwo_by_logbaseof2;
     /* Set n = nearest integer to r */
     /* This is faster on Hammer */
     if (r > 0)
       n = (int)(r + 0.5F);
     else
       n = (int)(r - 0.5F);

     r1 = x - n * logbaseof2_by_32_lead;
     r2 =   - n * logbaseof2_by_32_trail;

     /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
     /* j = n % 32;
        if (j < 0) j += 32; */
     j = n & 0x0000001f;

     f1 = two_to_jby32_lead_table[j];
     f2 = two_to_jby32_trail_table[j];

     *m = (n - j) / 32;

     /* Step 2. The following is the core approximation. We approximate
        exp(r1+r2)-1 by a polynomial. */

     r1 *= logbase; r2 *= logbase;

     r = r1 + r2;
     q = r1 + (r2 +
               r*r*( 5.00000000000000008883e-01F +
                       r*( 1.66666666665260878863e-01F )));

     /* Step 3. Function value reconstruction.
        We now reconstruct the exponential of the input argument
        so that exp(x) = 2**m * (z1 + z2).
        The order of the computation below must be strictly observed. */

     *z1 = f1;
     *z2 = f2 + ((f1 + f2) * q);
 }
 #endif /* SPLITEXPF */


 #if defined(USE_SCALEUPDOUBLE1024)
 /* Scales up a double (normal or denormal) whose bit pattern is given
    as ux by 2**1024. There are no checks that the input number is
    scalable by that amount. */
 static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur)
 {
   unsigned long long uy;
   double y;

   if ((ux & EXPBITS_DP64) == 0)
     {
       /* ux is denormalised */
       PUT_BITS_DP64(ux | 0x4010000000000000, y);
       if (ux & SIGNBIT_DP64)
         y += 4.0;
       else
         y -= 4.0;
       GET_BITS_DP64(y, uy);
     }
   else
     /* ux is normal */
     uy = ux + 0x4000000000000000;

   *ur = uy;
   return;
 }

 #endif /* SCALEUPDOUBLE1024 */


 #if defined(USE_SCALEDOWNDOUBLE)
 /* Scales down a double whose bit pattern is given as ux by 2**k.
    There are no checks that the input number is scalable by that amount. */
 static inline void scaleDownDouble(unsigned long long ux, int k,
                                    unsigned long long *ur)
 {
   unsigned long long uy, uk, ax, xsign;
   int n, shift;
   xsign = ux & SIGNBIT_DP64;
   ax = ux & ~SIGNBIT_DP64;
   n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k;
   if (n > 0)
     {
       uk = (unsigned long long)n << EXPSHIFTBITS_DP64;
       uy = (ax & ~EXPBITS_DP64) | uk;
     }
   else
     {
       uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000;
       shift = (1 - n);
       if (shift > MANTLENGTH_DP64 + 1)
         /* Sigh. Shifting works mod 64 so be careful not to shift too much */
         uy = 0;
       else
         {
           /* Make sure we round the result */
           uy >>= shift - 1;
           uy = (uy >> 1) + (uy & 1);
         }
     }
   *ur = uy | xsign;
 }

 #endif /* SCALEDOWNDOUBLE */


 #if defined(USE_SCALEUPFLOAT128)
 /* Scales up a float (normal or denormal) whose bit pattern is given
    as ux by 2**128. There are no checks that the input number is
    scalable by that amount. */
 static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur)
 {
   unsigned int uy;
   float y;

   if ((ux & EXPBITS_SP32) == 0)
     {
       /* ux is denormalised */
       PUT_BITS_SP32(ux | 0x40800000, y);
       /* Compensate for the implicit bit just added */
       if (ux & SIGNBIT_SP32)
         y += 4.0F;
       else
         y -= 4.0F;
       GET_BITS_SP32(y, uy);
     }
   else
     /* ux is normal */
     uy = ux + 0x40000000;
   *ur = uy;
 }
 #endif /* SCALEUPFLOAT128 */


 #if defined(USE_SCALEDOWNFLOAT)
 /* Scales down a float whose bit pattern is given as ux by 2**k.
    There are no checks that the input number is scalable by that amount. */
 static inline void scaleDownFloat(unsigned int ux, int k,
                                   unsigned int *ur)
 {
   unsigned int uy, uk, ax, xsign;
   int n, shift;

   xsign = ux & SIGNBIT_SP32;
   ax = ux & ~SIGNBIT_SP32;
   n = ((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k;
   if (n > 0)
     {
       uk = (unsigned int)n << EXPSHIFTBITS_SP32;
       uy = (ax & ~EXPBITS_SP32) | uk;
     }
   else
     {
       uy = (ax & ~EXPBITS_SP32) | 0x00800000;
       shift = (1 - n);
       if (shift > MANTLENGTH_SP32 + 1)
         /* Sigh. Shifting works mod 32 so be careful not to shift too much */
         uy = 0;
       else
         {
           /* Make sure we round the result */
           uy >>= shift - 1;
           uy = (uy >> 1) + (uy & 1);
         }
     }
   *ur = uy | xsign;
 }
 #endif /* SCALEDOWNFLOAT */


 #if defined(USE_SQRT_AMD_INLINE)
 static inline double sqrt_amd_inline(double x)
 {
   /*
      Computes the square root of x.

      The calculation is carried out in three steps.

      Step 1. Reduction.
      The input argument is scaled to the interval [1, 4) by
      computing
                x = 2^e * y, where y in [1,4).
      Furthermore y is decomposed as y = c + t where
                c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.

      Step 2. Approximation.
      An approximation q = sqrt(1 + (t/c)) - 1  is obtained
      from a basic series expansion using precomputed values
      stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl.

      Step 3. Reconstruction.
      The value of sqrt(x) is reconstructed via
        sqrt(x) = 2^(e/2) * sqrt(y)
                = 2^(e/2) * sqrt(c) * sqrt(y/c)
                = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
                = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
     */

   unsigned long long ux, ax, u;
   double r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
   int e, denorm = 0, index;

 /* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain
    leading and trailing parts respectively of precomputed
    values of sqrt(j/32), for j = 32, 33, ..., 128.
    rt_jby32_lead_table_dbl contains the first 21 bits of precision,
    and rt_jby32_trail_table_dbl contains a further 53 bits precision. */

   static const double rt_jby32_lead_table_dbl[97] = {
     1.00000000000000000000e+00,   /* 0x3ff0000000000000 */
     1.01550388336181640625e+00,   /* 0x3ff03f8100000000 */
     1.03077602386474609375e+00,   /* 0x3ff07e0f00000000 */
     1.04582500457763671875e+00,   /* 0x3ff0bbb300000000 */
     1.06065940856933593750e+00,   /* 0x3ff0f87600000000 */
     1.07528972625732421875e+00,   /* 0x3ff1346300000000 */
     1.08972454071044921875e+00,   /* 0x3ff16f8300000000 */
     1.10396957397460937500e+00,   /* 0x3ff1a9dc00000000 */
     1.11803340911865234375e+00,   /* 0x3ff1e37700000000 */
     1.13192272186279296875e+00,   /* 0x3ff21c5b00000000 */
     1.14564323425292968750e+00,   /* 0x3ff2548e00000000 */
     1.15920162200927734375e+00,   /* 0x3ff28c1700000000 */
     1.17260360717773437500e+00,   /* 0x3ff2c2fc00000000 */
     1.18585395812988281250e+00,   /* 0x3ff2f94200000000 */
     1.19895744323730468750e+00,   /* 0x3ff32eee00000000 */
     1.21191978454589843750e+00,   /* 0x3ff3640600000000 */
     1.22474479675292968750e+00,   /* 0x3ff3988e00000000 */
     1.23743629455566406250e+00,   /* 0x3ff3cc8a00000000 */
     1.25000000000000000000e+00,   /* 0x3ff4000000000000 */
     1.26243782043457031250e+00,   /* 0x3ff432f200000000 */
     1.27475452423095703125e+00,   /* 0x3ff4656500000000 */
     1.28695297241210937500e+00,   /* 0x3ff4975c00000000 */
     1.29903793334960937500e+00,   /* 0x3ff4c8dc00000000 */
     1.31101036071777343750e+00,   /* 0x3ff4f9e600000000 */
     1.32287502288818359375e+00,   /* 0x3ff52a7f00000000 */
     1.33463478088378906250e+00,   /* 0x3ff55aaa00000000 */
     1.34629058837890625000e+00,   /* 0x3ff58a6800000000 */
     1.35784721374511718750e+00,   /* 0x3ff5b9be00000000 */
     1.36930561065673828125e+00,   /* 0x3ff5e8ad00000000 */
     1.38066959381103515625e+00,   /* 0x3ff6173900000000 */
     1.39194107055664062500e+00,   /* 0x3ff6456400000000 */
     1.40312099456787109375e+00,   /* 0x3ff6732f00000000 */
     1.41421318054199218750e+00,   /* 0x3ff6a09e00000000 */
     1.42521858215332031250e+00,   /* 0x3ff6cdb200000000 */
     1.43614006042480468750e+00,   /* 0x3ff6fa6e00000000 */
     1.44697952270507812500e+00,   /* 0x3ff726d400000000 */
     1.45773792266845703125e+00,   /* 0x3ff752e500000000 */
     1.46841716766357421875e+00,   /* 0x3ff77ea300000000 */
     1.47901916503906250000e+00,   /* 0x3ff7aa1000000000 */
     1.48954677581787109375e+00,   /* 0x3ff7d52f00000000 */
     1.50000000000000000000e+00,   /* 0x3ff8000000000000 */
     1.51038074493408203125e+00,   /* 0x3ff82a8500000000 */
     1.52068996429443359375e+00,   /* 0x3ff854bf00000000 */
     1.53093051910400390625e+00,   /* 0x3ff87eb100000000 */
     1.54110336303710937500e+00,   /* 0x3ff8a85c00000000 */
     1.55120849609375000000e+00,   /* 0x3ff8d1c000000000 */
     1.56124877929687500000e+00,   /* 0x3ff8fae000000000 */
     1.57122516632080078125e+00,   /* 0x3ff923bd00000000 */
     1.58113861083984375000e+00,   /* 0x3ff94c5800000000 */
     1.59099006652832031250e+00,   /* 0x3ff974b200000000 */
     1.60078048706054687500e+00,   /* 0x3ff99ccc00000000 */
     1.61051177978515625000e+00,   /* 0x3ff9c4a800000000 */
     1.62018489837646484375e+00,   /* 0x3ff9ec4700000000 */
     1.62979984283447265625e+00,   /* 0x3ffa13a900000000 */
     1.63935947418212890625e+00,   /* 0x3ffa3ad100000000 */
     1.64886283874511718750e+00,   /* 0x3ffa61be00000000 */
     1.65831184387207031250e+00,   /* 0x3ffa887200000000 */
     1.66770744323730468750e+00,   /* 0x3ffaaeee00000000 */
     1.67705059051513671875e+00,   /* 0x3ffad53300000000 */
     1.68634128570556640625e+00,   /* 0x3ffafb4100000000 */
     1.69558238983154296875e+00,   /* 0x3ffb211b00000000 */
     1.70477199554443359375e+00,   /* 0x3ffb46bf00000000 */
     1.71391296386718750000e+00,   /* 0x3ffb6c3000000000 */
     1.72300529479980468750e+00,   /* 0x3ffb916e00000000 */
     1.73204994201660156250e+00,   /* 0x3ffbb67a00000000 */
     1.74104785919189453125e+00,   /* 0x3ffbdb5500000000 */
     1.75000000000000000000e+00,   /* 0x3ffc000000000000 */
     1.75890541076660156250e+00,   /* 0x3ffc247a00000000 */
     1.76776695251464843750e+00,   /* 0x3ffc48c600000000 */
     1.77658367156982421875e+00,   /* 0x3ffc6ce300000000 */
     1.78535652160644531250e+00,   /* 0x3ffc90d200000000 */
     1.79408740997314453125e+00,   /* 0x3ffcb49500000000 */
     1.80277538299560546875e+00,   /* 0x3ffcd82b00000000 */
     1.81142139434814453125e+00,   /* 0x3ffcfb9500000000 */
     1.82002735137939453125e+00,   /* 0x3ffd1ed500000000 */
     1.82859230041503906250e+00,   /* 0x3ffd41ea00000000 */
     1.83711719512939453125e+00,   /* 0x3ffd64d500000000 */
     1.84560203552246093750e+00,   /* 0x3ffd879600000000 */
     1.85404872894287109375e+00,   /* 0x3ffdaa2f00000000 */
     1.86245727539062500000e+00,   /* 0x3ffdcca000000000 */
     1.87082862854003906250e+00,   /* 0x3ffdeeea00000000 */
     1.87916183471679687500e+00,   /* 0x3ffe110c00000000 */
     1.88745784759521484375e+00,   /* 0x3ffe330700000000 */
     1.89571857452392578125e+00,   /* 0x3ffe54dd00000000 */
     1.90394306182861328125e+00,   /* 0x3ffe768d00000000 */
     1.91213226318359375000e+00,   /* 0x3ffe981800000000 */
     1.92028617858886718750e+00,   /* 0x3ffeb97e00000000 */
     1.92840576171875000000e+00,   /* 0x3ffedac000000000 */
     1.93649101257324218750e+00,   /* 0x3ffefbde00000000 */
     1.94454288482666015625e+00,   /* 0x3fff1cd900000000 */
     1.95256233215332031250e+00,   /* 0x3fff3db200000000 */
     1.96054744720458984375e+00,   /* 0x3fff5e6700000000 */
     1.96850109100341796875e+00,   /* 0x3fff7efb00000000 */
     1.97642326354980468750e+00,   /* 0x3fff9f6e00000000 */
     1.98431301116943359375e+00,   /* 0x3fffbfbf00000000 */
     1.99217128753662109375e+00,   /* 0x3fffdfef00000000 */
     2.00000000000000000000e+00};  /* 0x4000000000000000 */

   static const double rt_jby32_trail_table_dbl[97] = {
     0.00000000000000000000e+00,   /* 0x0000000000000000 */
     9.17217678638807524014e-07,   /* 0x3eaec6d70177881c */
     3.82539669043705364790e-07,   /* 0x3e99abfb41bd6b24 */
     2.85899577162227138140e-08,   /* 0x3e5eb2bf6bab55a2 */
     7.63210485349101216659e-07,   /* 0x3ea99bed9b2d8d0c */
     9.32123004127716212874e-07,   /* 0x3eaf46e029c1b296 */
     1.95174719169309219157e-07,   /* 0x3e8a3226fc42f30c */
     5.34316371481845492427e-07,   /* 0x3ea1edbe20701d73 */
     5.79631242504454563052e-07,   /* 0x3ea372fe94f82be7 */
     4.20404384109571705948e-07,   /* 0x3e9c367e08e7bb06 */
     6.89486030314147010716e-07,   /* 0x3ea722a3d0a66608 */
     6.89927685625314560328e-07,   /* 0x3ea7266f067ca1d6 */
     3.32778123013641425828e-07,   /* 0x3e965515a9b34850 */
     1.64433259436999584387e-07,   /* 0x3e8611e23ef6c1bd */
     4.37590875197899335723e-07,   /* 0x3e9d5dc1059ed8e7 */
     1.79808183816018617413e-07,   /* 0x3e88222982d0e4f4 */
     7.46386593615986477624e-08,   /* 0x3e7409212e7d0322 */
     5.72520794105201454728e-07,   /* 0x3ea335ea8a5fcf39 */
     0.00000000000000000000e+00,   /* 0x0000000000000000 */
     2.96860689431670420344e-07,   /* 0x3e93ec071e938bfe */
     3.54167239176257065345e-07,   /* 0x3e97c48bfd9862c6 */
     7.95211265664474710063e-07,   /* 0x3eaaaed010f74671 */
     1.72327048595145565621e-07,   /* 0x3e87211cbfeb62e0 */
     6.99494915996239297020e-07,   /* 0x3ea7789d9660e72d */
     6.32644111701500844315e-07,   /* 0x3ea53a5f1d36f1cf */
     6.20124838851440463844e-10,   /* 0x3e054eacff2057dc */
     6.13404719757812629969e-07,   /* 0x3ea4951b3e6a83cc */
     3.47654909777986407387e-07,   /* 0x3e9754aa76884c66 */
     7.83106177002392475763e-07,   /* 0x3eaa46d4b1de1074 */
     5.33337372440526357008e-07,   /* 0x3ea1e55548f92635 */
     2.01508648555298681765e-08,   /* 0x3e55a3070dd17788 */
     5.25472356925843939587e-07,   /* 0x3ea1a1c5eedb0801 */
     3.81831102861301692797e-07,   /* 0x3e999fcef32422cc */
     6.99220602161420018738e-07,   /* 0x3ea776425d6b0199 */
     6.01209702477462624811e-07,   /* 0x3ea42c5a1e0191a2 */
     9.01437000591944740554e-08,   /* 0x3e7832a0bdff1327 */
     5.10428680864685379950e-08,   /* 0x3e6b674743636676 */
     3.47895267104621031421e-07,   /* 0x3e9758cb90d2f714 */
     7.80735841510641848628e-07,   /* 0x3eaa3278459cde25 */
     1.35158752025506517690e-07,   /* 0x3e822404f4a103ee */
     0.00000000000000000000e+00,   /* 0x0000000000000000 */
     1.76523947728535489812e-09,   /* 0x3e1e539af6892ac5 */
     6.68280121328499932183e-07,   /* 0x3ea66c7b872c9cd0 */
     5.70135482405123276616e-07,   /* 0x3ea3216d2f43887d */
     1.37705134737562525897e-07,   /* 0x3e827b832cbedc0e */
     7.09655107074516613672e-07,   /* 0x3ea7cfe41579091d */
     7.20302724551461693011e-07,   /* 0x3ea82b5a713c490a */
     4.69926266058212796694e-07,   /* 0x3e9f8945932d872e */
     2.19244345915999437026e-07,   /* 0x3e8d6d2da9490251 */
     1.91141411617401877927e-07,   /* 0x3e89a791a3114e4a */
     5.72297665296622053774e-07,   /* 0x3ea333ffe005988d */
     5.61055484436830560103e-07,   /* 0x3ea2d36e0ed49ab1 */
     2.76225500213991506100e-07,   /* 0x3e92898498f55f9e */
     7.58466189522395692908e-07,   /* 0x3ea9732cca1032a3 */
     1.56893371256836029827e-07,   /* 0x3e850ed0b02a22d2 */
     4.06038997708867066507e-07,   /* 0x3e9b3fb265b1e40a */
     5.51305629612057435809e-07,   /* 0x3ea27fade682d1de */
     5.64778487026561123207e-07,   /* 0x3ea2f36906f707ba */
     3.92609705553556897517e-07,   /* 0x3e9a58fbbee883b6 */
     9.09698438776943827802e-07,   /* 0x3eae864005bca6d7 */
     1.05949774066016139743e-07,   /* 0x3e7c70d02300f263 */
     7.16578798392844784244e-07,   /* 0x3ea80b5d712d8e3e */
     6.86233073531233972561e-07,   /* 0x3ea706b27cc7d390 */
     7.99211473033494452908e-07,   /* 0x3eaad12c9d849a97 */
     8.65552275731027456121e-07,   /* 0x3ead0b09954e764b */
     6.75456120386058448618e-07,   /* 0x3ea6aa1fb7826cbd */
     0.00000000000000000000e+00,   /* 0x0000000000000000 */
     4.99167184520462138743e-07,   /* 0x3ea0bfd03f46763c */
     4.51720373502110930296e-10,   /* 0x3dff0abfb4adfb9e */
     1.28874162718371367439e-07,   /* 0x3e814c151f991b2e */
     5.85529267186999798656e-07,   /* 0x3ea3a5a879b09292 */
     1.01827770937125531924e-07,   /* 0x3e7b558d173f9796 */
     2.54736389177809626508e-07,   /* 0x3e9118567cd83fb8 */
     6.98925535290464831294e-07,   /* 0x3ea773b981896751 */
     1.20940735036524314513e-07,   /* 0x3e803b7df49f48a8 */
     5.43759351196479689657e-08,   /* 0x3e6d315f22491900 */
     1.11957989042397958409e-07,   /* 0x3e7e0db1c5bb84b2 */
     8.47006714134442661218e-07,   /* 0x3eac6bbb7644ff76 */
     8.92831044643427836228e-07,   /* 0x3eadf55c3afec01f */
     7.77828292464916501663e-07,   /* 0x3eaa197e81034da3 */
     6.48469316302918797451e-08,   /* 0x3e71683f4920555d */
     2.12579816658859849140e-07,   /* 0x3e8c882fd78bb0b0 */
     7.61222472580559138435e-07,   /* 0x3ea98ad9eb7b83ec */
     2.86488961857314189607e-07,   /* 0x3e9339d7c7777273 */
     2.14637363790165363515e-07,   /* 0x3e8ccee237cae6fe */
     5.44137005612605847831e-08,   /* 0x3e6d368fe324a146 */
     2.58378284856442408413e-07,   /* 0x3e9156e7b6d99b45 */
     3.15848939061134843091e-07,   /* 0x3e95323e5310b5c1 */
     6.60530466255089632309e-07,   /* 0x3ea629e9db362f5d */
     7.63436345535852301127e-07,   /* 0x3ea99dde4728d7ec */
     8.68233432860324345268e-08,   /* 0x3e774e746878544d */
     9.45465175398023087082e-07,   /* 0x3eafb97be873a87d */
     8.77499534786171267246e-07,   /* 0x3ead71a9e23c2f63 */
     2.74055432394999316135e-07,   /* 0x3e92643c89cda173 */
     4.72129009349126213532e-07,   /* 0x3e9faf1d57a4d56c */
     8.93777032327078947306e-07,   /* 0x3eadfd7c7ab7b282 */
     0.00000000000000000000e+00};  /* 0x0000000000000000 */


   /* Handle special arguments first */

   GET_BITS_DP64(x, ux);
   ax = ux & (~SIGNBIT_DP64);

   if(ax >= 0x7ff0000000000000)
     {
       /* x is either NaN or infinity */
       if (ux & MANTBITS_DP64)
         /* x is NaN */
         return x + x; /* Raise invalid if it is a signalling NaN */
       else if (ux & SIGNBIT_DP64)
         /* x is negative infinity */
         return nan_with_flags(AMD_F_INVALID);
       else
         /* x is positive infinity */
         return x;
     }
   else if (ux & SIGNBIT_DP64)
     {
       /* x is negative. */
       if (ux == SIGNBIT_DP64)
         /* Handle negative zero first */
         return x;
       else
         return nan_with_flags(AMD_F_INVALID);
     }
   else if (ux <= 0x000fffffffffffff)
     {
       /* x is denormalised or zero */
       if (ux == 0)
         /* x is zero */
         return x;
       else
         {
           /* x is denormalised; scale it up */
           /* Normalize x by increasing the exponent by 60
              and subtracting a correction to account for the implicit
              bit. This replaces a slow denormalized
              multiplication by a fast normal subtraction. */
           static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
           denorm = 1;
           GET_BITS_DP64(x, ux);
           PUT_BITS_DP64(ux | 0x03d0000000000000, x);
           x -= corr;
           GET_BITS_DP64(x, ux);
         }
     }

   /* Main algorithm */

   /*
      Find y and e such that x = 2^e * y, where y in [1,4).
      This is done using an in-lined variant of splitDouble,
      which also ensures that e is even.
    */
   y = x;
   ux &= EXPBITS_DP64;
   ux >>= EXPSHIFTBITS_DP64;
   if (ux & 1)
     {
       GET_BITS_DP64(y, u);
       u &= (SIGNBIT_DP64 | MANTBITS_DP64);
       u |= ONEEXPBITS_DP64;
       PUT_BITS_DP64(u, y);
       e = ux - EXPBIAS_DP64;
     }
   else
     {
       GET_BITS_DP64(y, u);
       u &= (SIGNBIT_DP64 | MANTBITS_DP64);
       u |= TWOEXPBITS_DP64;
       PUT_BITS_DP64(u, y);
       e = ux - EXPBIAS_DP64 - 1;
     }


   /* Find the index of the sub-interval of [1,4) in which y lies. */

   index = (int)(32.0*y+0.5);

   /* Look up the table values and compute c and r = c/t */

   rtc_lead = rt_jby32_lead_table_dbl[index-32];
   rtc_trail = rt_jby32_trail_table_dbl[index-32];
   c = 0.03125*index;
   r = (y - c)/c;

   /*
     Find q = sqrt(1+r) - 1.
     From one step of Newton on (q+1)^2 = 1+r
   */

   p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01));
   twop = p + p;
   q = p - (p*p + (twop - r))/(twop + 2.0);

   /* Reconstruction */

   rtc = rtc_lead + rtc_trail;
   e >>= 1; /* e = e/2 */
   z = rtc_lead + (rtc*q+rtc_trail);

   if (denorm)
     {
       /* Scale by 2**(e-30) */
       PUT_BITS_DP64(((long long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
       z *= r;
     }
   else
     {
       /* Scale by 2**e */
       PUT_BITS_DP64(((long long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
       z *= r;
     }

   return z;

 }
 #endif /* SQRT_AMD_INLINE */

 #if defined(USE_SQRTF_AMD_INLINE)

 static inline float sqrtf_amd_inline(float x)
 {
   /*
      Computes the square root of x.

      The calculation is carried out in three steps.

      Step 1. Reduction.
      The input argument is scaled to the interval [1, 4) by
      computing
                x = 2^e * y, where y in [1,4).
      Furthermore y is decomposed as y = c + t where
                c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.

      Step 2. Approximation.
      An approximation q = sqrt(1 + (t/c)) - 1  is obtained
      from a basic series expansion using precomputed values
      stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float.

      Step 3. Reconstruction.
      The value of sqrt(x) is reconstructed via
        sqrt(x) = 2^(e/2) * sqrt(y)
                = 2^(e/2) * sqrt(c) * sqrt(y/c)
                = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
                = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
     */

   unsigned int ux, ax, u;
   float r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
   int e, denorm = 0, index;

 /* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain
    leading and trailing parts respectively of precomputed
    values of sqrt(j/32), for j = 32, 33, ..., 128.
    rt_jby32_lead_table_float contains the first 13 bits of precision,
    and rt_jby32_trail_table_float contains a further 24 bits precision. */

 static const float rt_jby32_lead_table_float[97] = {
     1.00000000000000000000e+00F,   /* 0x3f800000 */
     1.01538085937500000000e+00F,   /* 0x3f81f800 */
     1.03076171875000000000e+00F,   /* 0x3f83f000 */
     1.04565429687500000000e+00F,   /* 0x3f85d800 */
     1.06054687500000000000e+00F,   /* 0x3f87c000 */
     1.07519531250000000000e+00F,   /* 0x3f89a000 */
     1.08959960937500000000e+00F,   /* 0x3f8b7800 */
     1.10375976562500000000e+00F,   /* 0x3f8d4800 */
     1.11791992187500000000e+00F,   /* 0x3f8f1800 */
     1.13183593750000000000e+00F,   /* 0x3f90e000 */
     1.14550781250000000000e+00F,   /* 0x3f92a000 */
     1.15917968750000000000e+00F,   /* 0x3f946000 */
     1.17236328125000000000e+00F,   /* 0x3f961000 */
     1.18579101562500000000e+00F,   /* 0x3f97c800 */
     1.19873046875000000000e+00F,   /* 0x3f997000 */
     1.21191406250000000000e+00F,   /* 0x3f9b2000 */
     1.22460937500000000000e+00F,   /* 0x3f9cc000 */
     1.23730468750000000000e+00F,   /* 0x3f9e6000 */
     1.25000000000000000000e+00F,   /* 0x3fa00000 */
     1.26220703125000000000e+00F,   /* 0x3fa19000 */
     1.27465820312500000000e+00F,   /* 0x3fa32800 */
     1.28686523437500000000e+00F,   /* 0x3fa4b800 */
     1.29882812500000000000e+00F,   /* 0x3fa64000 */
     1.31079101562500000000e+00F,   /* 0x3fa7c800 */
     1.32275390625000000000e+00F,   /* 0x3fa95000 */
     1.33447265625000000000e+00F,   /* 0x3faad000 */
     1.34619140625000000000e+00F,   /* 0x3fac5000 */
     1.35766601562500000000e+00F,   /* 0x3fadc800 */
     1.36914062500000000000e+00F,   /* 0x3faf4000 */
     1.38061523437500000000e+00F,   /* 0x3fb0b800 */
     1.39184570312500000000e+00F,   /* 0x3fb22800 */
     1.40307617187500000000e+00F,   /* 0x3fb39800 */
     1.41406250000000000000e+00F,   /* 0x3fb50000 */
     1.42504882812500000000e+00F,   /* 0x3fb66800 */
     1.43603515625000000000e+00F,   /* 0x3fb7d000 */
     1.44677734375000000000e+00F,   /* 0x3fb93000 */
     1.45751953125000000000e+00F,   /* 0x3fba9000 */
     1.46826171875000000000e+00F,   /* 0x3fbbf000 */
     1.47900390625000000000e+00F,   /* 0x3fbd5000 */
     1.48950195312500000000e+00F,   /* 0x3fbea800 */
     1.50000000000000000000e+00F,   /* 0x3fc00000 */
     1.51025390625000000000e+00F,   /* 0x3fc15000 */
     1.52050781250000000000e+00F,   /* 0x3fc2a000 */
     1.53076171875000000000e+00F,   /* 0x3fc3f000 */
     1.54101562500000000000e+00F,   /* 0x3fc54000 */
     1.55102539062500000000e+00F,   /* 0x3fc68800 */
     1.56103515625000000000e+00F,   /* 0x3fc7d000 */
     1.57104492187500000000e+00F,   /* 0x3fc91800 */
     1.58105468750000000000e+00F,   /* 0x3fca6000 */
     1.59082031250000000000e+00F,   /* 0x3fcba000 */
     1.60058593750000000000e+00F,   /* 0x3fcce000 */
     1.61035156250000000000e+00F,   /* 0x3fce2000 */
     1.62011718750000000000e+00F,   /* 0x3fcf6000 */
     1.62963867187500000000e+00F,   /* 0x3fd09800 */
     1.63916015625000000000e+00F,   /* 0x3fd1d000 */
     1.64868164062500000000e+00F,   /* 0x3fd30800 */
     1.65820312500000000000e+00F,   /* 0x3fd44000 */
     1.66748046875000000000e+00F,   /* 0x3fd57000 */
     1.67700195312500000000e+00F,   /* 0x3fd6a800 */
     1.68627929687500000000e+00F,   /* 0x3fd7d800 */
     1.69555664062500000000e+00F,   /* 0x3fd90800 */
     1.70458984375000000000e+00F,   /* 0x3fda3000 */
     1.71386718750000000000e+00F,   /* 0x3fdb6000 */
     1.72290039062500000000e+00F,   /* 0x3fdc8800 */
     1.73193359375000000000e+00F,   /* 0x3fddb000 */
     1.74096679687500000000e+00F,   /* 0x3fded800 */
     1.75000000000000000000e+00F,   /* 0x3fe00000 */
     1.75878906250000000000e+00F,   /* 0x3fe12000 */
     1.76757812500000000000e+00F,   /* 0x3fe24000 */
     1.77636718750000000000e+00F,   /* 0x3fe36000 */
     1.78515625000000000000e+00F,   /* 0x3fe48000 */
     1.79394531250000000000e+00F,   /* 0x3fe5a000 */
     1.80273437500000000000e+00F,   /* 0x3fe6c000 */
     1.81127929687500000000e+00F,   /* 0x3fe7d800 */
     1.81982421875000000000e+00F,   /* 0x3fe8f000 */
     1.82836914062500000000e+00F,   /* 0x3fea0800 */
     1.83691406250000000000e+00F,   /* 0x3feb2000 */
     1.84545898437500000000e+00F,   /* 0x3fec3800 */
     1.85400390625000000000e+00F,   /* 0x3fed5000 */
     1.86230468750000000000e+00F,   /* 0x3fee6000 */
     1.87060546875000000000e+00F,   /* 0x3fef7000 */
     1.87915039062500000000e+00F,   /* 0x3ff08800 */
     1.88745117187500000000e+00F,   /* 0x3ff19800 */
     1.89550781250000000000e+00F,   /* 0x3ff2a000 */
     1.90380859375000000000e+00F,   /* 0x3ff3b000 */
     1.91210937500000000000e+00F,   /* 0x3ff4c000 */
     1.92016601562500000000e+00F,   /* 0x3ff5c800 */
     1.92822265625000000000e+00F,   /* 0x3ff6d000 */
     1.93627929687500000000e+00F,   /* 0x3ff7d800 */
     1.94433593750000000000e+00F,   /* 0x3ff8e000 */
     1.95239257812500000000e+00F,   /* 0x3ff9e800 */
     1.96044921875000000000e+00F,   /* 0x3ffaf000 */
     1.96826171875000000000e+00F,   /* 0x3ffbf000 */
     1.97631835937500000000e+00F,   /* 0x3ffcf800 */
     1.98413085937500000000e+00F,   /* 0x3ffdf800 */
     1.99194335937500000000e+00F,   /* 0x3ffef800 */
     2.00000000000000000000e+00F};  /* 0x40000000 */

 static const float rt_jby32_trail_table_float[97] = {
     0.00000000000000000000e+00F,   /* 0x00000000 */
     1.23941208585165441036e-04F,   /* 0x3901f637 */
     1.46876545841223560274e-05F,   /* 0x37766aff */
     1.70736297150142490864e-04F,   /* 0x393307ad */
     1.13296780909877270460e-04F,   /* 0x38ed99bf */
     9.53458802541717886925e-05F,   /* 0x38c7f46e */
     1.25126505736261606216e-04F,   /* 0x39033464 */
     2.10342666832730174065e-04F,   /* 0x395c8f6e */
     1.14066875539720058441e-04F,   /* 0x38ef3730 */
     8.72047676239162683487e-05F,   /* 0x38b6e1b4 */
     1.36111237225122749805e-04F,   /* 0x390eb915 */
     2.26244374061934649944e-05F,   /* 0x37bdc99c */
     2.40658700931817293167e-04F,   /* 0x397c5954 */
     6.31069415248930454254e-05F,   /* 0x38845848 */
     2.27412077947519719601e-04F,   /* 0x396e7577 */
     5.90185391047270968556e-06F,   /* 0x36c6088a */
     1.35496389702893793583e-04F,   /* 0x390e1409 */
     1.32179571664892137051e-04F,   /* 0x390a99af */
     0.00000000000000000000e+00F,   /* 0x00000000 */
     2.31086043640971183777e-04F,   /* 0x39724fb0 */
     9.66752704698592424393e-05F,   /* 0x38cabe24 */
     8.85332483449019491673e-05F,   /* 0x38b9aaed */
     2.09980673389509320259e-04F,   /* 0x395c2e42 */
     2.20044588786549866199e-04F,   /* 0x3966bbc5 */
     1.21749282698146998882e-04F,   /* 0x38ff53a6 */
     1.62125259521417319775e-04F,   /* 0x392a002b */
     9.97955357888713479042e-05F,   /* 0x38d14952 */
     1.81545779923908412457e-04F,   /* 0x393e5d53 */
     1.65768768056295812130e-04F,   /* 0x392dd237 */
     5.48927710042335093021e-05F,   /* 0x38663caa */
     9.53875860432162880898e-05F,   /* 0x38c80ad2 */
     4.53481625299900770187e-05F,   /* 0x383e3438 */
     1.51062369695864617825e-04F,   /* 0x391e667f */
     1.70453247847035527229e-04F,   /* 0x3932bbb2 */
     1.05505387182347476482e-04F,   /* 0x38dd42c6 */
     2.02269104192964732647e-04F,   /* 0x39541833 */
     2.18442466575652360916e-04F,   /* 0x39650db4 */
     1.55796806211583316326e-04F,   /* 0x39235d63 */
     1.60395247803535312414e-05F,   /* 0x37868c9e */
     4.49578510597348213196e-05F,   /* 0x383c9120 */
     0.00000000000000000000e+00F,   /* 0x00000000 */
     1.26840444863773882389e-04F,   /* 0x39050079 */
     1.82820076588541269302e-04F,   /* 0x393fb364 */
     1.69370483490638434887e-04F,   /* 0x3931990b */
     8.78757418831810355186e-05F,   /* 0x38b849ee */
     1.83815121999941766262e-04F,   /* 0x3940be7f */
     2.14343352126888930798e-04F,   /* 0x3960c15b */
     1.80714370799250900745e-04F,   /* 0x393d7e25 */
     8.41425862745381891727e-05F,   /* 0x38b075b5 */
     1.69945167726837098598e-04F,   /* 0x3932334f */
     1.95121858268976211548e-04F,   /* 0x394c99a0 */
     1.60778334247879683971e-04F,   /* 0x3928969b */
     6.79871009197086095810e-05F,   /* 0x388e944c */
     1.61929419846273958683e-04F,   /* 0x3929cb99 */
     1.99474830878898501396e-04F,   /* 0x39512a1e */
     1.81604162207804620266e-04F,   /* 0x393e6cff */
     1.09270178654696792364e-04F,   /* 0x38e527fb */
     2.27539261686615645885e-04F,   /* 0x396e979b */
     4.90300008095800876617e-05F,   /* 0x384da590 */
     6.28985289949923753738e-05F,   /* 0x3883e864 */
     2.58551553997676819563e-05F,   /* 0x37d8e386 */
     1.82868374395184218884e-04F,   /* 0x393fc05b */
     4.64625991298817098141e-05F,   /* 0x3842e0d6 */
     1.05703387816902250051e-04F,   /* 0x38ddad13 */
     1.17213814519345760345e-04F,   /* 0x38f5d0b0 */
     8.17377731436863541603e-05F,   /* 0x38ab6aa2 */
     0.00000000000000000000e+00F,   /* 0x00000000 */
     1.16847433673683553934e-04F,   /* 0x38f50bfd */
     1.88827965757809579372e-04F,   /* 0x3946001f */
     2.16612941585481166840e-04F,   /* 0x39632298 */
     2.00857131858356297016e-04F,   /* 0x39529d2d */
     1.42199307447299361229e-04F,   /* 0x39151b56 */
     4.12627305195201188326e-05F,   /* 0x382d1185 */
     1.42796401632949709892e-04F,   /* 0x3915bb9e */
     2.03253570361994206905e-04F,   /* 0x39552077 */
     2.23214170546270906925e-04F,   /* 0x396a0e99 */
     2.03244591830298304558e-04F,   /* 0x39551e0e */
     1.43898156238719820976e-04F,   /* 0x3916e35e */
     4.57155256299301981926e-05F,   /* 0x383fbeac */
     1.53365719597786664963e-04F,   /* 0x3920d0cc */
     2.23224633373320102692e-04F,   /* 0x396a1168 */
     1.16566716314991936088e-05F,   /* 0x37439106 */
     7.43694272387074306607e-06F,   /* 0x36f98ada */
     2.11048507480882108212e-04F,   /* 0x395d4ce7 */
     1.34682719362899661064e-04F,   /* 0x390d399e */
     2.29425968427676707506e-05F,   /* 0x37c074da */
     1.20421340398024767637e-04F,   /* 0x38fc8ab7 */
     1.83421318070031702518e-04F,   /* 0x394054c9 */
     2.12376224226318299770e-04F,   /* 0x395eb14f */
     2.07710763788782060146e-04F,   /* 0x3959ccef */
     1.69840845046564936638e-04F,   /* 0x3932174e */
     9.91739216260612010956e-05F,   /* 0x38cffb98 */
     2.40249748458154499531e-04F,   /* 0x397beb8d */
     1.05178231024183332920e-04F,   /* 0x38dc9322 */
     1.82623916771262884140e-04F,   /* 0x393f7ebc */
     2.28821940254420042038e-04F,   /* 0x396fefec */
     0.00000000000000000000e+00F};  /* 0x00000000 */


 /* Handle special arguments first */

   GET_BITS_SP32(x, ux);
   ax = ux & (~SIGNBIT_SP32);

   if(ax >= 0x7f800000)
     {
       /* x is either NaN or infinity */
       if (ux & MANTBITS_SP32)
         /* x is NaN */
         return x + x; /* Raise invalid if it is a signalling NaN */
       else if (ux & SIGNBIT_SP32)
         return nanf_with_flags(AMD_F_INVALID);
       else
         /* x is positive infinity */
         return x;
     }
   else if (ux & SIGNBIT_SP32)
     {
       /* x is negative. */
       if (x == 0.0F)
         /* Handle negative zero first */
         return x;
       else
         return nanf_with_flags(AMD_F_INVALID);
     }
   else if (ux <= 0x007fffff)
     {
       /* x is denormalised or zero */
       if (ux == 0)
         /* x is zero */
         return x;
       else
         {
           /* x is denormalised; scale it up */
           /* Normalize x by increasing the exponent by 26
              and subtracting a correction to account for the implicit
              bit. This replaces a slow denormalized
              multiplication by a fast normal subtraction. */
           static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */
           denorm = 1;
           GET_BITS_SP32(x, ux);
           PUT_BITS_SP32(ux | 0x0d800000, x);
           x -= corr;
           GET_BITS_SP32(x, ux);
         }
     }

   /* Main algorithm */

   /*
      Find y and e such that x = 2^e * y, where y in [1,4).
      This is done using an in-lined variant of splitFloat,
      which also ensures that e is even.
    */
   y = x;
   ux &= EXPBITS_SP32;
   ux >>= EXPSHIFTBITS_SP32;
   if (ux & 1)
     {
       GET_BITS_SP32(y, u);
       u &= (SIGNBIT_SP32 | MANTBITS_SP32);
       u |= ONEEXPBITS_SP32;
       PUT_BITS_SP32(u, y);
       e = ux - EXPBIAS_SP32;
     }
   else
     {
       GET_BITS_SP32(y, u);
       u &= (SIGNBIT_SP32 | MANTBITS_SP32);
       u |= TWOEXPBITS_SP32;
       PUT_BITS_SP32(u, y);
       e = ux - EXPBIAS_SP32 - 1;
     }

   /* Find the index of the sub-interval of [1,4) in which y lies. */

   index = (int)(32.0F*y+0.5);

   /* Look up the table values and compute c and r = c/t */

   rtc_lead = rt_jby32_lead_table_float[index-32];
   rtc_trail = rt_jby32_trail_table_float[index-32];
   c = 0.03125F*index;
   r = (y - c)/c;

   /*
   Find q = sqrt(1+r) - 1.
   From one step of Newton on (q+1)^2 = 1+r
   */

   p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F));
   twop = p + p;
   q = p - (p*p + (twop - r))/(twop + 2.0);

   /* Reconstruction */

   rtc = rtc_lead + rtc_trail;
   e >>= 1; /* e = e/2 */
   z = rtc_lead + (rtc*q+rtc_trail);

   if (denorm)
     {
       /* Scale by 2**(e-13) */
       PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
       z *= r;
     }
   else
     {
       /* Scale by 2**e */
       PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
       z *= r;
     }

   return z;

 }
 #endif /* SQRTF_AMD_INLINE */

 #ifdef USE_LOG_KERNEL_AMD
 static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2)
 {

   int expadjust;
   double r, z1, z2, correction, f, f1, f2, q, u, v, poly;
   int index;

   /*
     Computes natural log(x). Algorithm based on:
     Ping-Tak Peter Tang
     "Table-driven implementation of the logarithm function in IEEE
     floating-point arithmetic"
     ACM Transactions on Mathematical Software (TOMS)
     Volume 16, Issue 4 (December 1990)
   */

 /* Arrays ln_lead_table and ln_tail_table contain
    leading and trailing parts respectively of precomputed
    values of natural log(1+i/64), for i = 0, 1, ..., 64.
    ln_lead_table contains the first 24 bits of precision,
    and ln_tail_table contains a further 53 bits precision. */

   static const double ln_lead_table[65] = {
     0.00000000000000000000e+00,   /* 0x0000000000000000 */
     1.55041813850402832031e-02,   /* 0x3f8fc0a800000000 */
     3.07716131210327148438e-02,   /* 0x3f9f829800000000 */
     4.58095073699951171875e-02,   /* 0x3fa7745800000000 */
     6.06245994567871093750e-02,   /* 0x3faf0a3000000000 */
     7.52233862876892089844e-02,   /* 0x3fb341d700000000 */
     8.96121263504028320312e-02,   /* 0x3fb6f0d200000000 */
     1.03796780109405517578e-01,   /* 0x3fba926d00000000 */
     1.17783010005950927734e-01,   /* 0x3fbe270700000000 */
     1.31576299667358398438e-01,   /* 0x3fc0d77e00000000 */
     1.45181953907012939453e-01,   /* 0x3fc2955280000000 */
     1.58604979515075683594e-01,   /* 0x3fc44d2b00000000 */
     1.71850204467773437500e-01,   /* 0x3fc5ff3000000000 */
     1.84922337532043457031e-01,   /* 0x3fc7ab8900000000 */
     1.97825729846954345703e-01,   /* 0x3fc9525a80000000 */
     2.10564732551574707031e-01,   /* 0x3fcaf3c900000000 */
     2.23143517971038818359e-01,   /* 0x3fcc8ff780000000 */
     2.35566020011901855469e-01,   /* 0x3fce270700000000 */
     2.47836112976074218750e-01,   /* 0x3fcfb91800000000 */
     2.59957492351531982422e-01,   /* 0x3fd0a324c0000000 */
     2.71933674812316894531e-01,   /* 0x3fd1675c80000000 */
     2.83768117427825927734e-01,   /* 0x3fd22941c0000000 */
     2.95464158058166503906e-01,   /* 0x3fd2e8e280000000 */
     3.07025015354156494141e-01,   /* 0x3fd3a64c40000000 */
     3.18453729152679443359e-01,   /* 0x3fd4618bc0000000 */
     3.29753279685974121094e-01,   /* 0x3fd51aad80000000 */
     3.40926527976989746094e-01,   /* 0x3fd5d1bd80000000 */
     3.51976394653320312500e-01,   /* 0x3fd686c800000000 */
     3.62905442714691162109e-01,   /* 0x3fd739d7c0000000 */
     3.73716354370117187500e-01,   /* 0x3fd7eaf800000000 */
     3.84411692619323730469e-01,   /* 0x3fd89a3380000000 */
     3.94993782043457031250e-01,   /* 0x3fd9479400000000 */
     4.05465066432952880859e-01,   /* 0x3fd9f323c0000000 */
     4.15827870368957519531e-01,   /* 0x3fda9cec80000000 */
     4.26084339618682861328e-01,   /* 0x3fdb44f740000000 */
     4.36236739158630371094e-01,   /* 0x3fdbeb4d80000000 */
     4.46287095546722412109e-01,   /* 0x3fdc8ff7c0000000 */
     4.56237375736236572266e-01,   /* 0x3fdd32fe40000000 */
     4.66089725494384765625e-01,   /* 0x3fddd46a00000000 */
     4.75845873355865478516e-01,   /* 0x3fde744240000000 */
     4.85507786273956298828e-01,   /* 0x3fdf128f40000000 */
     4.95077252388000488281e-01,   /* 0x3fdfaf5880000000 */
     5.04556000232696533203e-01,   /* 0x3fe02552a0000000 */
     5.13945698738098144531e-01,   /* 0x3fe0723e40000000 */
     5.23248136043548583984e-01,   /* 0x3fe0be72e0000000 */
     5.32464742660522460938e-01,   /* 0x3fe109f380000000 */
     5.41597247123718261719e-01,   /* 0x3fe154c3c0000000 */
     5.50647079944610595703e-01,   /* 0x3fe19ee6a0000000 */
     5.59615731239318847656e-01,   /* 0x3fe1e85f40000000 */
     5.68504691123962402344e-01,   /* 0x3fe23130c0000000 */
     5.77315330505371093750e-01,   /* 0x3fe2795e00000000 */
     5.86049020290374755859e-01,   /* 0x3fe2c0e9e0000000 */
     5.94707071781158447266e-01,   /* 0x3fe307d720000000 */
     6.03290796279907226562e-01,   /* 0x3fe34e2880000000 */
     6.11801505088806152344e-01,   /* 0x3fe393e0c0000000 */
     6.20240390300750732422e-01,   /* 0x3fe3d90260000000 */
     6.28608644008636474609e-01,   /* 0x3fe41d8fe0000000 */
     6.36907458305358886719e-01,   /* 0x3fe4618bc0000000 */
     6.45137906074523925781e-01,   /* 0x3fe4a4f840000000 */
     6.53301239013671875000e-01,   /* 0x3fe4e7d800000000 */
     6.61398470401763916016e-01,   /* 0x3fe52a2d20000000 */
     6.69430613517761230469e-01,   /* 0x3fe56bf9c0000000 */
     6.77398800849914550781e-01,   /* 0x3fe5ad4040000000 */
     6.85303986072540283203e-01,   /* 0x3fe5ee02a0000000 */
     6.93147122859954833984e-01};  /* 0x3fe62e42e0000000 */

   static const double ln_tail_table[65] = {
     0.00000000000000000000e+00,   /* 0x0000000000000000 */
     5.15092497094772879206e-09,   /* 0x3e361f807c79f3db */
     4.55457209735272790188e-08,   /* 0x3e6873c1980267c8 */
     2.86612990859791781788e-08,   /* 0x3e5ec65b9f88c69e */
     2.23596477332056055352e-08,   /* 0x3e58022c54cc2f99 */
     3.49498983167142274770e-08,   /* 0x3e62c37a3a125330 */
     3.23392843005887000414e-08,   /* 0x3e615cad69737c93 */
     1.35722380472479366661e-08,   /* 0x3e4d256ab1b285e9 */
     2.56504325268044191098e-08,   /* 0x3e5b8abcb97a7aa2 */
     5.81213608741512136843e-08,   /* 0x3e6f34239659a5dc */
     5.59374849578288093334e-08,   /* 0x3e6e07fd48d30177 */
     5.06615629004996189970e-08,   /* 0x3e6b32df4799f4f6 */
     5.24588857848400955725e-08,   /* 0x3e6c29e4f4f21cf8 */
     9.61968535632653505972e-10,   /* 0x3e1086c848df1b59 */
     1.34829655346594463137e-08,   /* 0x3e4cf456b4764130 */
     3.65557749306383026498e-08,   /* 0x3e63a02ffcb63398 */
     3.33431709374069198903e-08,   /* 0x3e61e6a6886b0976 */
     5.13008650536088382197e-08,   /* 0x3e6b8abcb97a7aa2 */
     5.09285070380306053751e-08,   /* 0x3e6b578f8aa35552 */
     3.20853940845502057341e-08,   /* 0x3e6139c871afb9fc */
     4.06713248643004200446e-08,   /* 0x3e65d5d30701ce64 */
     5.57028186706125221168e-08,   /* 0x3e6de7bcb2d12142 */
     5.48356693724804282546e-08,   /* 0x3e6d708e984e1664 */
     1.99407553679345001938e-08,   /* 0x3e556945e9c72f36 */
     1.96585517245087232086e-09,   /* 0x3e20e2f613e85bda */
     6.68649386072067321503e-09,   /* 0x3e3cb7e0b42724f6 */
     5.89936034642113390002e-08,   /* 0x3e6fac04e52846c7 */
     2.85038578721554472484e-08,   /* 0x3e5e9b14aec442be */
     5.09746772910284482606e-08,   /* 0x3e6b5de8034e7126 */
     5.54234668933210171467e-08,   /* 0x3e6dc157e1b259d3 */
     6.29100830926604004874e-09,   /* 0x3e3b05096ad69c62 */
     2.61974119468563937716e-08,   /* 0x3e5c2116faba4cdd */
     4.16752115011186398935e-08,   /* 0x3e665fcc25f95b47 */
     2.47747534460820790327e-08,   /* 0x3e5a9a08498d4850 */
     5.56922172017964209793e-08,   /* 0x3e6de647b1465f77 */
     2.76162876992552906035e-08,   /* 0x3e5da71b7bf7861d */
     7.08169709942321478061e-09,   /* 0x3e3e6a6886b09760 */
     5.77453510221151779025e-08,   /* 0x3e6f0075eab0ef64 */
     4.43021445893361960146e-09,   /* 0x3e33071282fb989b */
     3.15140984357495864573e-08,   /* 0x3e60eb43c3f1bed2 */
     2.95077445089736670973e-08,   /* 0x3e5faf06ecb35c84 */
     1.44098510263167149349e-08,   /* 0x3e4ef1e63db35f68 */
     1.05196987538551827693e-08,   /* 0x3e469743fb1a71a5 */
     5.23641361722697546261e-08,   /* 0x3e6c1cdf404e5796 */
     7.72099925253243069458e-09,   /* 0x3e4094aa0ada625e */
     5.62089493829364197156e-08,   /* 0x3e6e2d4c96fde3ec */
     3.53090261098577946927e-08,   /* 0x3e62f4d5e9a98f34 */
     3.80080516835568242269e-08,   /* 0x3e6467c96ecc5cbe */
     5.66961038386146408282e-08,   /* 0x3e6e7040d03dec5a */
     4.42287063097349852717e-08,   /* 0x3e67bebf4282de36 */
     3.45294525105681104660e-08,   /* 0x3e6289b11aeb783f */
     2.47132034530447431509e-08,   /* 0x3e5a891d1772f538 */
     3.59655343422487209774e-08,   /* 0x3e634f10be1fb591 */
     5.51581770357780862071e-08,   /* 0x3e6d9ce1d316eb93 */
     3.60171867511861372793e-08,   /* 0x3e63562a19a9c442 */
     1.94511067964296180547e-08,   /* 0x3e54e2adf548084c */
     1.54137376631349347838e-08,   /* 0x3e508ce55cc8c97a */
     3.93171034490174464173e-09,   /* 0x3e30e2f613e85bda */
     5.52990607758839766440e-08,   /* 0x3e6db03ebb0227bf */
     3.29990737637586136511e-08,   /* 0x3e61b75bb09cb098 */
     1.18436010922446096216e-08,   /* 0x3e496f16abb9df22 */
     4.04248680368301346709e-08,   /* 0x3e65b3f399411c62 */
     2.27418915900284316293e-08,   /* 0x3e586b3e59f65355 */
     1.70263791333409206020e-08,   /* 0x3e52482ceae1ac12 */
     5.76999904754328540596e-08};  /* 0x3e6efa39ef35793c */

   /* Approximating polynomial coefficients for x near 1.0 */
   static const double
     ca_1 = 8.33333333333317923934e-02,  /* 0x3fb55555555554e6 */
     ca_2 = 1.25000000037717509602e-02,  /* 0x3f89999999bac6d4 */
     ca_3 = 2.23213998791944806202e-03,  /* 0x3f62492307f1519f */
     ca_4 = 4.34887777707614552256e-04;  /* 0x3f3c8034c85dfff0 */

   /* Approximating polynomial coefficients for other x */
   static const double
     cb_1 = 8.33333333333333593622e-02,  /* 0x3fb5555555555557 */
     cb_2 = 1.24999999978138668903e-02,  /* 0x3f89999999865ede */
     cb_3 = 2.23219810758559851206e-03;  /* 0x3f6249423bd94741 */

   static const unsigned long long
     log_thresh1 = 0x3fee0faa00000000,
     log_thresh2 = 0x3ff1082c00000000;

   /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
      log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */
   if (ux >= log_thresh1 && ux <= log_thresh2)
     {
       /* Arguments close to 1.0 are handled separately to maintain
          accuracy.

          The approximation in this region exploits the identity
              log( 1 + r ) = log( 1 + u/2 )  /  log( 1 - u/2 ), where
              u  = 2r / (2+r).
          Note that the right hand side has an odd Taylor series expansion
          which converges much faster than the Taylor series expansion of
          log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by
              u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1).

          One subtlety is that since u cannot be calculated from
          r exactly, the rounding error in the first u should be
          avoided if possible. To accomplish this, we observe that
                        u  =  r  -  r*r/(2+r).
          Since x (=1+r) is the input argument, and thus presumed exact,
          the formula above approximates u accurately because
                        u  =  r  -  correction,
          and the magnitude of "correction" (of the order of r*r)
          is small.
          With these observations, we will approximate log( 1 + r ) by
             r + (  (A1*u^3 + ... + An*u^(2n+1)) - correction ).

          We approximate log(1+r) by an odd polynomial in u, where
                   u = 2r/(2+r) = r - r*r/(2+r).
       */
       r = x - 1.0;
       u = r / (2.0 + r);
       correction = r * u;
       u = u + u;
       v = u * u;
       z1 = r;
       z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction);
       *r1 = z1;
       *r2 = z2;
       *xexp = 0;
     }
   else
     {
       /*
         First, we decompose the argument x to the form
         x  =  2**M  *  (F1  +  F2),
         where  1 <= F1+F2 < 2, M has the value of an integer,
         F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128.

         Second, we approximate log( 1 + F2/F1 ) by an odd polynomial
         in U, where U  =  2 F2 / (2 F2 + F1).
         Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ).
         The core approximation calculates
         Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U   -   1.
         Note that  log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ),
         thus, Poly =  2 arctanh( U/2 ) / U  -  1.

         It is not hard to see that
           log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ).
         Hence, we return Z1 = log(F1), and  Z2 = log( 1 + F2/F1).
         The values of log(F1) are calculated beforehand and stored
         in the program.
       */

       f = x;
       if (ux < IMPBIT_DP64)
         {
           /* The input argument x is denormalized */
           /* Normalize f by increasing the exponent by 60
              and subtracting a correction to account for the implicit
              bit. This replaces a slow denormalized
              multiplication by a fast normal subtraction. */
           static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
           GET_BITS_DP64(f, ux);
           ux |= 0x03d0000000000000;
           PUT_BITS_DP64(ux, f);
           f -= corr;
           GET_BITS_DP64(f, ux);
           expadjust = 60;
         }
       else
         expadjust = 0;

       /* Store the exponent of x in xexp and put
          f into the range [0.5,1) */
       *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust;
       PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f);

       /* Now  x = 2**xexp  * f,  1/2 <= f < 1. */

       /* Set index to be the nearest integer to 128*f */
       r = 128.0 * f;
       index = (int)(r + 0.5);

       z1 = ln_lead_table[index-64];
       q = ln_tail_table[index-64];
       f1 = index * 0.0078125; /* 0.0078125 = 1/128 */
       f2 = f - f1;
       /* At this point, x = 2**xexp * ( f1  +  f2 ) where
          f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */

       /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */
       /* u = f2 / (f1 + 0.5 * f2); */
       u = f2 / (f1 + 0.5 * f2);

       /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1).
          The core approximation calculates
          poly = [log(1 + u/2) - log(1 - u/2)]/u  -  1  */
       v = u * u;
       poly = (v * (cb_1 + v * (cb_2 + v * cb_3)));
       z2 = q + (u + u * poly);
       *r1 = z1;
       *r2 = z2;
     }
   return;
 }
 #endif /* USE_LOG_KERNEL_AMD */

 #if defined(USE_REMAINDER_PIBY2F_INLINE)
 /* Define this to get debugging print statements activated */
 #define DEBUGGING_PRINT
 #undef DEBUGGING_PRINT


 #ifdef DEBUGGING_PRINT
 #include <stdio.h>
 char *d2b(long long d, int bitsper, int point)
 {
   static char buff[200];
   int i, j;
   j = bitsper;
   if (point >= 0 && point <= bitsper)
     j++;
   buff[j] = '\0';
   for (i = bitsper - 1; i >= 0; i--)
     {
       j--;
       if (d % 2 == 1)
         buff[j] = '1';
       else
         buff[j] = '0';
       if (i == point)
         {
           j--;
           buff[j] = '.';
         }
       d /= 2;
     }
   return buff;
 }
 #endif

 /* Given positive argument x, reduce it to the range [-pi/4,pi/4] using
    extra precision, and return the result in r.
    Return value "region" tells how many lots of pi/2 were subtracted
    from x to put it in the range [-pi/4,pi/4], mod 4. */
 static inline void __remainder_piby2f_inline(unsigned long long ux, double *r, int *region)
 {

       /* This method simulates multi-precision floating-point
          arithmetic and is accurate for all 1 <= x < infinity */
 #define bitsper 36
       unsigned long long res[10];
       unsigned long long u, carry, mask, mant, nextbits;
       int first, last, i, rexp, xexp, resexp, ltb, determ, bc;
       double dx;
       static const double
         piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
 #ifdef WINDOWS
       static unsigned long long pibits[] =
       {
         0LL,
         5215LL, 13000023176LL, 11362338026LL, 67174558139LL,
         34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL,
         19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL
       };
 #else
       static unsigned long long pibits[] =
       {
         0L,
         5215L, 13000023176L, 11362338026L, 67174558139L,
         34819822259L, 10612056195L, 67816420731L, 57840157550L,
         19558516809L, 50025467026L, 25186875954L, 18152700886L
       };
 #endif

 #ifdef DEBUGGING_PRINT
       printf("On entry, x = %25.20e = %s\n", x, double2hex(&x));
 #endif

       xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
       ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29;

 #ifdef DEBUGGING_PRINT
       printf("ux = %s\n", d2b(ux, 64, -1));
 #endif

       /* Now ux is the mantissa bit pattern of x as a long integer */
       mask = 1;
       mask = (mask << bitsper) - 1;

       /* Set first and last to the positions of the first
          and last chunks of 2/pi that we need */
       first = xexp / bitsper;
       resexp = xexp - first * bitsper;
       /* 120 is the theoretical maximum number of bits (actually
          115 for IEEE single precision) that we need to extract
          from the middle of 2/pi to compute the reduced argument
          accurately enough for our purposes */
       last = first + 120 / bitsper;

 #ifdef DEBUGGING_PRINT
       printf("first = %d, last = %d\n", first, last);
 #endif

       /* Do a long multiplication of the bits of 2/pi by the
          integer mantissa */
       /* Unroll the loop. This is only correct because we know
          that bitsper is fixed as 36. */
       res[4] = 0;
       u = pibits[last] * ux;
       res[3] = u & mask;
       carry = u >> bitsper;
       u = pibits[last - 1] * ux + carry;
       res[2] = u & mask;
       carry = u >> bitsper;
       u = pibits[last - 2] * ux + carry;
       res[1] = u & mask;
       carry = u >> bitsper;
       u = pibits[first] * ux + carry;
       res[0] = u & mask;

 #ifdef DEBUGGING_PRINT
       printf("resexp = %d\n", resexp);
       printf("Significant part of x * 2/pi with binary"
              " point in correct place:\n");
       for (i = 0; i <= last - first; i++)
         {
           if (i > 0 && i % 5 == 0)
             printf("\n ");
           if (i == 1)
             printf("%s ", d2b(res[i], bitsper, resexp));
           else
             printf("%s ", d2b(res[i], bitsper, -1));
         }
       printf("\n");
 #endif

       /* Reconstruct the result */
       ltb = (int)((((res[0] << bitsper) | res[1])
                    >> (bitsper - 1 - resexp)) & 7);

       /* determ says whether the fractional part is >= 0.5 */
       determ = ltb & 1;

 #ifdef DEBUGGING_PRINT
       printf("ltb = %d (last two bits before binary point"
              " and first bit after)\n", ltb);
       printf("determ = %d (1 means need to negate because the fractional\n"
              "            part of x * 2/pi is greater than 0.5)\n", determ);
 #endif

       i = 1;
       if (determ)
         {
           /* The mantissa is >= 0.5. We want to subtract it
              from 1.0 by negating all the bits */
           *region = ((ltb >> 1) + 1) & 3;
           mant = 1;
           mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
           while (mant < 0x0000000000010000)
             {
               i++;
               mant = (mant << bitsper) | (~(res[i]) & mask);
             }
           nextbits = (~(res[i+1]) & mask);
         }
       else
         {
           *region = (ltb >> 1);
           mant = 1;
           mant = res[1] & ((mant << (bitsper - resexp)) - 1);
           while (mant < 0x0000000000010000)
             {
               i++;
               mant = (mant << bitsper) | res[i];
             }
           nextbits = res[i+1];
         }

 #ifdef DEBUGGING_PRINT
       printf("First bits of mant = %s\n", d2b(mant, bitsper, -1));
 #endif

       /* Normalize the mantissa. The shift value 6 here, determined by
          trial and error, seems to give optimal speed. */
       bc = 0;
       while (mant < 0x0000400000000000LL)
         {
           bc += 6;
           mant <<= 6;
         }
       while (mant < 0x0010000000000000LL)
         {
           bc++;
           mant <<= 1;
         }
       mant |= nextbits >> (bitsper - bc);

       rexp = 52 + resexp - bc - i * bitsper;

 #ifdef DEBUGGING_PRINT
       printf("Normalised mantissa = 0x%016lx\n", mant);
       printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp);
 #endif

       /* Put the result exponent rexp onto the mantissa pattern */
       u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
       ux = (mant & MANTBITS_DP64) | u;
       if (determ)
         /* If we negated the mantissa we negate x too */
         ux |= SIGNBIT_DP64;
       PUT_BITS_DP64(ux, dx);

 #ifdef DEBUGGING_PRINT
       printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx));
 #endif

       /* x is a double precision version of the fractional part of
          x * 2 / pi. Multiply x by pi/2 in double precision
          to get the reduced argument r. */
       *r = dx * piby2;

 #ifdef DEBUGGING_PRINT
       printf(" r = frac(x*2/pi) * pi/2:\n");
       printf(" r = %25.20e = %s\n", *r, double2hex(r));
       printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n",
              *region);
 #endif
 }
 #endif /* USE_REMAINDER_PIBY2F_INLINE */

 #if defined(WINDOWS)
 #if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)
 #include <errno.h>
 #endif

 #if defined(USE_HANDLE_ERROR)
 /* Define the Microsoft specific error handling routines */
 static __declspec(noinline) double handle_error(const char *name,
                                                 unsigned long long value,
                                                 int type, int flags, int error,
                                                 double arg1, double arg2)
 {
   double z;
   struct _exception exception_data;
   exception_data.type = type;
   exception_data.name = (char*)name;
   exception_data.arg1 = arg1;
   exception_data.arg2 = arg2;
   PUT_BITS_DP64(value, z);
   exception_data.retval = z;
   raise_fpsw_flags(flags);
   if (!_matherr(&exception_data))
     {
       errno = error;
     }
   return exception_data.retval;
 }
 #endif /* USE_HANDLE_ERROR */

 #if defined(USE_HANDLE_ERRORF)
 static __declspec(noinline) float handle_errorf(const char *name,
                                                 unsigned int value,
                                                 int type, int flags, int error,
                                                 float arg1, float arg2)
 {
   float z;
   struct _exception exception_data;
   exception_data.type = type;
   exception_data.name = (char*)name;
   exception_data.arg1 = (double)arg1;
   exception_data.arg2 = (double)arg2;
   PUT_BITS_SP32(value, z);
   exception_data.retval = z;
   raise_fpsw_flags(flags);
   if (!_matherr(&exception_data))
     {
       errno = error;
     }
   return (float)exception_data.retval;
 }
 #endif /* USE_HANDLE_ERRORF */
 #endif /* WINDOWS */

 #endif /* LIBM_INLINES_AMD_H_INCLUDED */