blob: a2e387aee2be10c149d96dd4a9310f2bbed1846d [file] [log] [blame]
/*
* Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
*
* This file is part of libacml_mv.
*
* libacml_mv is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* libacml_mv is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with libacml_mv. If not, see
* <http://www.gnu.org/licenses/>.
*
*/
#ifndef LIBM_INLINES_AMD_H_INCLUDED
#define LIBM_INLINES_AMD_H_INCLUDED 1
#include "libm_util_amd.h"
#include <math.h>
#ifdef WINDOWS
#define inline __inline
#include "emmintrin.h"
#endif
/* Compile-time verification that type long is the same size
as type double (i.e. we are really on a 64-bit machine) */
void check_long_against_double_size(int machine_is_64_bit[(sizeof(long long) == sizeof(double))?1:-1]);
/* Set defines for inline functions calling other inlines */
#if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \
defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \
defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \
defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \
defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \
defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \
(defined(WINDOWS) && (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)))
#undef USE_RAISE_FPSW_FLAGS
#define USE_RAISE_FPSW_FLAGS 1
#endif
#if defined(USE_SPLITDOUBLE)
/* Splits double x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0.
Assumes that x is not zero, denormal, infinity or NaN, but these conditions
are not checked */
static inline void splitDouble(double x, int *e, double *m)
{
unsigned long long ux, uy;
GET_BITS_DP64(x, ux);
uy = ux;
ux &= EXPBITS_DP64;
ux >>= EXPSHIFTBITS_DP64;
*e = (int)ux - EXPBIAS_DP64 + 1;
uy = (uy & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64;
PUT_BITS_DP64(uy, x);
*m = x;
}
#endif /* USE_SPLITDOUBLE */
#if defined(USE_SPLITDOUBLE_2)
/* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0.
Assumes that x is not zero, denormal, infinity or NaN, but these conditions
are not checked. Also assumes EXPBIAS_DP is odd. With this
assumption, e will be even on exit. */
static inline void splitDouble_2(double x, int *e, double *m)
{
unsigned long long ux, vx;
GET_BITS_DP64(x, ux);
vx = ux;
ux &= EXPBITS_DP64;
ux >>= EXPSHIFTBITS_DP64;
if (ux & 1)
{
/* The exponent is odd */
vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64;
PUT_BITS_DP64(vx, x);
*m = x;
*e = ux - EXPBIAS_DP64;
}
else
{
/* The exponent is even */
vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64;
PUT_BITS_DP64(vx, x);
*m = x;
*e = ux - EXPBIAS_DP64 - 1;
}
}
#endif /* USE_SPLITDOUBLE_2 */
#if defined(USE_SPLITFLOAT)
/* Splits float x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0.
Assumes that x is not zero, denormal, infinity or NaN, but these conditions
are not checked */
static inline void splitFloat(float x, int *e, float *m)
{
unsigned int ux, uy;
GET_BITS_SP32(x, ux);
uy = ux;
ux &= EXPBITS_SP32;
ux >>= EXPSHIFTBITS_SP32;
*e = (int)ux - EXPBIAS_SP32 + 1;
uy = (uy & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32;
PUT_BITS_SP32(uy, x);
*m = x;
}
#endif /* USE_SPLITFLOAT */
#if defined(USE_SCALEDOUBLE_1)
/* Scales the double x by 2.0**n.
Assumes EMIN <= n <= EMAX, though this condition is not checked. */
static inline double scaleDouble_1(double x, int n)
{
double t;
/* Construct the number t = 2.0**n */
PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t);
return x*t;
}
#endif /* USE_SCALEDOUBLE_1 */
#if defined(USE_SCALEDOUBLE_2)
/* Scales the double x by 2.0**n.
Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
static inline double scaleDouble_2(double x, int n)
{
double t1, t2;
int n1, n2;
n1 = n / 2;
n2 = n - n1;
/* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
return (x*t1)*t2;
}
#endif /* USE_SCALEDOUBLE_2 */
#if defined(USE_SCALEDOUBLE_3)
/* Scales the double x by 2.0**n.
Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */
static inline double scaleDouble_3(double x, int n)
{
double t1, t2, t3;
int n1, n2, n3;
n1 = n / 3;
n2 = (n - n1) / 2;
n3 = n - n1 - n2;
/* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */
PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
PUT_BITS_DP64(((long long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3);
return ((x*t1)*t2)*t3;
}
#endif /* USE_SCALEDOUBLE_3 */
#if defined(USE_SCALEFLOAT_1)
/* Scales the float x by 2.0**n.
Assumes EMIN <= n <= EMAX, though this condition is not checked. */
static inline float scaleFloat_1(float x, int n)
{
float t;
/* Construct the number t = 2.0**n */
PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t);
return x*t;
}
#endif /* USE_SCALEFLOAT_1 */
#if defined(USE_SCALEFLOAT_2)
/* Scales the float x by 2.0**n.
Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
static inline float scaleFloat_2(float x, int n)
{
float t1, t2;
int n1, n2;
n1 = n / 2;
n2 = n - n1;
/* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1);
PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2);
return (x*t1)*t2;
}
#endif /* USE_SCALEFLOAT_2 */
#if defined(USE_SCALEFLOAT_3)
/* Scales the float x by 2.0**n.
Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */
static inline float scaleFloat_3(float x, int n)
{
float t1, t2, t3;
int n1, n2, n3;
n1 = n / 3;
n2 = (n - n1) / 2;
n3 = n - n1 - n2;
/* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */
PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1);
PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2);
PUT_BITS_SP32((n3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t3);
return ((x*t1)*t2)*t3;
}
#endif /* USE_SCALEFLOAT_3 */
#if defined(USE_SETPRECISIONDOUBLE)
unsigned int setPrecisionDouble(void)
{
unsigned int cw, cwold = 0;
/* There is no precision control on Hammer */
return cwold;
}
#endif /* USE_SETPRECISIONDOUBLE */
#if defined(USE_RESTOREPRECISION)
void restorePrecision(unsigned int cwold)
{
#if defined(WINDOWS)
/* There is no precision control on Hammer */
#elif defined(linux)
/* There is no precision control on Hammer */
#else
#error Unknown machine
#endif
return;
}
#endif /* USE_RESTOREPRECISION */
#if defined(USE_CLEAR_FPSW_FLAGS)
/* Clears floating-point status flags. The argument should be
the bitwise or of the flags to be cleared, from the
list above, e.g.
clear_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID);
*/
static inline void clear_fpsw_flags(int flags)
{
#if defined(WINDOWS)
unsigned int cw = _mm_getcsr();
cw &= (~flags);
_mm_setcsr(cw);
#elif defined(linux)
unsigned int cw;
/* Get the current floating-point control/status word */
asm volatile ("STMXCSR %0" : "=m" (cw));
cw &= (~flags);
asm volatile ("LDMXCSR %0" : : "m" (cw));
#else
#error Unknown machine
#endif
}
#endif /* USE_CLEAR_FPSW_FLAGS */
#if defined(USE_RAISE_FPSW_FLAGS)
/* Raises floating-point status flags. The argument should be
the bitwise or of the flags to be raised, from the
list above, e.g.
raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID);
*/
static inline void raise_fpsw_flags(int flags)
{
#if defined(WINDOWS)
_mm_setcsr(_mm_getcsr() | flags);
#elif defined(linux)
unsigned int cw;
/* Get the current floating-point control/status word */
asm volatile ("STMXCSR %0" : "=m" (cw));
cw |= flags;
asm volatile ("LDMXCSR %0" : : "m" (cw));
#else
#error Unknown machine
#endif
}
#endif /* USE_RAISE_FPSW_FLAGS */
#if defined(USE_GET_FPSW_INLINE)
/* Return the current floating-point status word */
static inline unsigned int get_fpsw_inline(void)
{
#if defined(WINDOWS)
return _mm_getcsr();
#elif defined(linux)
unsigned int sw;
asm volatile ("STMXCSR %0" : "=m" (sw));
return sw;
#else
#error Unknown machine
#endif
}
#endif /* USE_GET_FPSW_INLINE */
#if defined(USE_SET_FPSW_INLINE)
/* Set the floating-point status word */
static inline void set_fpsw_inline(unsigned int sw)
{
#if defined(WINDOWS)
_mm_setcsr(sw);
#elif defined(linux)
/* Set the current floating-point control/status word */
asm volatile ("LDMXCSR %0" : : "m" (sw));
#else
#error Unknown machine
#endif
}
#endif /* USE_SET_FPSW_INLINE */
#if defined(USE_CLEAR_FPSW_INLINE)
/* Clear all exceptions from the floating-point status word */
static inline void clear_fpsw_inline(void)
{
#if defined(WINDOWS)
unsigned int cw;
cw = _mm_getcsr();
cw &= ~(AMD_F_INEXACT | AMD_F_UNDERFLOW | AMD_F_OVERFLOW |
AMD_F_DIVBYZERO | AMD_F_INVALID);
_mm_setcsr(cw);
#elif defined(linux)
unsigned int cw;
/* Get the current floating-point control/status word */
asm volatile ("STMXCSR %0" : "=m" (cw));
cw &= ~(AMD_F_INEXACT | AMD_F_UNDERFLOW | AMD_F_OVERFLOW |
AMD_F_DIVBYZERO | AMD_F_INVALID);
asm volatile ("LDMXCSR %0" : : "m" (cw));
#else
#error Unknown machine
#endif
}
#endif /* USE_CLEAR_FPSW_INLINE */
#if defined(USE_VAL_WITH_FLAGS)
/* Returns a double value after raising the given flags,
e.g. val_with_flags(x, AMD_F_INEXACT);
*/
static inline double val_with_flags(double val, int flags)
{
raise_fpsw_flags(flags);
return val;
}
#endif /* USE_VAL_WITH_FLAGS */
#if defined(USE_VALF_WITH_FLAGS)
/* Returns a float value after raising the given flags,
e.g. valf_with_flags(x, AMD_F_INEXACT);
*/
static inline float valf_with_flags(float val, int flags)
{
raise_fpsw_flags(flags);
return val;
}
#endif /* USE_VALF_WITH_FLAGS */
#if defined(USE_ZERO_WITH_FLAGS)
/* Returns a double +zero after raising the given flags,
e.g. zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
*/
static inline double zero_with_flags(int flags)
{
raise_fpsw_flags(flags);
return 0.0;
}
#endif /* USE_ZERO_WITH_FLAGS */
#if defined(USE_ZEROF_WITH_FLAGS)
/* Returns a float +zero after raising the given flags,
e.g. zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
*/
static inline float zerof_with_flags(int flags)
{
raise_fpsw_flags(flags);
return 0.0F;
}
#endif /* USE_ZEROF_WITH_FLAGS */
#if defined(USE_NAN_WITH_FLAGS)
/* Returns a double quiet +nan after raising the given flags,
e.g. nan_with_flags(AMD_F_INVALID);
*/
static inline double nan_with_flags(int flags)
{
double z;
raise_fpsw_flags(flags);
PUT_BITS_DP64(0x7ff8000000000000, z);
return z;
}
#endif /* USE_NAN_WITH_FLAGS */
#if defined(USE_NANF_WITH_FLAGS)
/* Returns a float quiet +nan after raising the given flags,
e.g. nanf_with_flags(AMD_F_INVALID);
*/
static inline float nanf_with_flags(int flags)
{
float z;
raise_fpsw_flags(flags);
PUT_BITS_SP32(0x7fc00000, z);
return z;
}
#endif /* USE_NANF_WITH_FLAGS */
#if defined(USE_INDEFINITE_WITH_FLAGS)
/* Returns a double indefinite after raising the given flags,
e.g. indefinite_with_flags(AMD_F_INVALID);
*/
static inline double indefinite_with_flags(int flags)
{
double z;
raise_fpsw_flags(flags);
PUT_BITS_DP64(0xfff8000000000000, z);
return z;
}
#endif /* USE_INDEFINITE_WITH_FLAGS */
#if defined(USE_INDEFINITEF_WITH_FLAGS)
/* Returns a float quiet +indefinite after raising the given flags,
e.g. indefinitef_with_flags(AMD_F_INVALID);
*/
static inline float indefinitef_with_flags(int flags)
{
float z;
raise_fpsw_flags(flags);
PUT_BITS_SP32(0xffc00000, z);
return z;
}
#endif /* USE_INDEFINITEF_WITH_FLAGS */
#ifdef USE_INFINITY_WITH_FLAGS
/* Returns a positive double infinity after raising the given flags,
e.g. infinity_with_flags(AMD_F_OVERFLOW);
*/
static inline double infinity_with_flags(int flags)
{
double z;
raise_fpsw_flags(flags);
PUT_BITS_DP64((unsigned long long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, z);
return z;
}
#endif /* USE_INFINITY_WITH_FLAGS */
#ifdef USE_INFINITYF_WITH_FLAGS
/* Returns a positive float infinity after raising the given flags,
e.g. infinityf_with_flags(AMD_F_OVERFLOW);
*/
static inline float infinityf_with_flags(int flags)
{
float z;
raise_fpsw_flags(flags);
PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, z);
return z;
}
#endif /* USE_INFINITYF_WITH_FLAGS */
#if defined(USE_SPLITEXP)
/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
abs(x) > large/(ln(base)) (where large is the largest representable
floating point number) should be handled separately instead of calling
this function. This function is called by exp_amd, exp2_amd, exp10_amd,
cosh_amd and sinh_amd. */
static inline void splitexp(double x, double logbase,
double thirtytwo_by_logbaseof2,
double logbaseof2_by_32_lead,
double logbaseof2_by_32_trail,
int *m, double *z1, double *z2)
{
double q, r, r1, r2, f1, f2;
int n, j;
/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
leading and trailing parts respectively of precomputed
values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
two_to_jby32_lead_table contains the first 25 bits of precision,
and two_to_jby32_trail_table contains a further 53 bits precision. */
static const double two_to_jby32_lead_table[32] = {
1.00000000000000000000e+00, /* 0x3ff0000000000000 */
1.02189713716506958008e+00, /* 0x3ff059b0d0000000 */
1.04427373409271240234e+00, /* 0x3ff0b55860000000 */
1.06714040040969848633e+00, /* 0x3ff11301d0000000 */
1.09050768613815307617e+00, /* 0x3ff172b830000000 */
1.11438673734664916992e+00, /* 0x3ff1d48730000000 */
1.13878858089447021484e+00, /* 0x3ff2387a60000000 */
1.16372483968734741211e+00, /* 0x3ff29e9df0000000 */
1.18920707702636718750e+00, /* 0x3ff306fe00000000 */
1.21524733304977416992e+00, /* 0x3ff371a730000000 */
1.24185776710510253906e+00, /* 0x3ff3dea640000000 */
1.26905095577239990234e+00, /* 0x3ff44e0860000000 */
1.29683953523635864258e+00, /* 0x3ff4bfdad0000000 */
1.32523661851882934570e+00, /* 0x3ff5342b50000000 */
1.35425549745559692383e+00, /* 0x3ff5ab07d0000000 */
1.38390988111495971680e+00, /* 0x3ff6247eb0000000 */
1.41421353816986083984e+00, /* 0x3ff6a09e60000000 */
1.44518077373504638672e+00, /* 0x3ff71f75e0000000 */
1.47682613134384155273e+00, /* 0x3ff7a11470000000 */
1.50916439294815063477e+00, /* 0x3ff8258990000000 */
1.54221081733703613281e+00, /* 0x3ff8ace540000000 */
1.57598084211349487305e+00, /* 0x3ff93737b0000000 */
1.61049032211303710938e+00, /* 0x3ff9c49180000000 */
1.64575546979904174805e+00, /* 0x3ffa5503b0000000 */
1.68179279565811157227e+00, /* 0x3ffae89f90000000 */
1.71861928701400756836e+00, /* 0x3ffb7f76f0000000 */
1.75625211000442504883e+00, /* 0x3ffc199bd0000000 */
1.79470902681350708008e+00, /* 0x3ffcb720d0000000 */
1.83400803804397583008e+00, /* 0x3ffd5818d0000000 */
1.87416762113571166992e+00, /* 0x3ffdfc9730000000 */
1.91520655155181884766e+00, /* 0x3ffea4afa0000000 */
1.95714408159255981445e+00}; /* 0x3fff507650000000 */
static const double two_to_jby32_trail_table[32] = {
0.00000000000000000000e+00, /* 0x0000000000000000 */
1.14890470981563546737e-08, /* 0x3e48ac2ba1d73e2a */
4.83347014379782142328e-08, /* 0x3e69f3121ec53172 */
2.67125131841396124714e-10, /* 0x3df25b50a4ebbf1b */
4.65271045830351350190e-08, /* 0x3e68faa2f5b9bef9 */
5.24924336638693782574e-09, /* 0x3e368b9aa7805b80 */
5.38622214388600821910e-08, /* 0x3e6ceac470cd83f6 */
1.90902301017041969782e-08, /* 0x3e547f7b84b09745 */
3.79763538792174980894e-08, /* 0x3e64636e2a5bd1ab */
2.69306947081946450986e-08, /* 0x3e5ceaa72a9c5154 */
4.49683815095311756138e-08, /* 0x3e682468446b6824 */
1.41933332021066904914e-09, /* 0x3e18624b40c4dbd0 */
1.94146510233556266402e-08, /* 0x3e54d8a89c750e5e */
2.46409119489264118569e-08, /* 0x3e5a753e077c2a0f */
4.94812958044698886494e-08, /* 0x3e6a90a852b19260 */
8.48872238075784476136e-10, /* 0x3e0d2ac258f87d03 */
2.42032342089579394887e-08, /* 0x3e59fcef32422cbf */
3.32420002333182569170e-08, /* 0x3e61d8bee7ba46e2 */
1.45956577586525322754e-08, /* 0x3e4f580c36bea881 */
3.46452721050003920866e-08, /* 0x3e62999c25159f11 */
8.07090469079979051284e-09, /* 0x3e415506dadd3e2a */
2.99439161340839520436e-09, /* 0x3e29b8bc9e8a0388 */
9.83621719880452147153e-09, /* 0x3e451f8480e3e236 */
8.35492309647188080486e-09, /* 0x3e41f12ae45a1224 */
3.48493175137966283582e-08, /* 0x3e62b5a75abd0e6a */
1.11084703472699692902e-08, /* 0x3e47daf237553d84 */
5.03688744342840346564e-08, /* 0x3e6b0aa538444196 */
4.81896001063495806249e-08, /* 0x3e69df20d22a0798 */
4.83653666334089557746e-08, /* 0x3e69f7490e4bb40b */
1.29745882314081237628e-08, /* 0x3e4bdcdaf5cb4656 */
9.84532844621636118964e-09, /* 0x3e452486cc2c7b9d */
4.25828404545651943883e-08}; /* 0x3e66dc8a80ce9f09 */
/*
Step 1. Reduce the argument.
To perform argument reduction, we find the integer n such that
x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
remainder by x - n*logbaseof2/32. The calculation of n is
straightforward whereas the computation of x - n*logbaseof2/32
must be carried out carefully.
logbaseof2/32 is so represented in two pieces that
(1) logbaseof2/32 is known to extra precision, (2) the product
of n and the leading piece is a model number and is hence
calculated without error, and (3) the subtraction of the value
obtained in (2) from x is a model number and is hence again
obtained without error.
*/
r = x * thirtytwo_by_logbaseof2;
/* Set n = nearest integer to r */
/* This is faster on Hammer */
if (r > 0)
n = (int)(r + 0.5);
else
n = (int)(r - 0.5);
r1 = x - n * logbaseof2_by_32_lead;
r2 = - n * logbaseof2_by_32_trail;
/* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */
/* j = n % 32;
if (j < 0) j += 32; */
j = n & 0x0000001f;
f1 = two_to_jby32_lead_table[j];
f2 = two_to_jby32_trail_table[j];
*m = (n - j) / 32;
/* Step 2. The following is the core approximation. We approximate
exp(r1+r2)-1 by a polynomial. */
r1 *= logbase; r2 *= logbase;
r = r1 + r2;
q = r1 + (r2 +
r*r*( 5.00000000000000008883e-01 +
r*( 1.66666666665260878863e-01 +
r*( 4.16666666662260795726e-02 +
r*( 8.33336798434219616221e-03 +
r*( 1.38889490863777199667e-03 ))))));
/* Step 3. Function value reconstruction.
We now reconstruct the exponential of the input argument
so that exp(x) = 2**m * (z1 + z2).
The order of the computation below must be strictly observed. */
*z1 = f1;
*z2 = f2 + ((f1 + f2) * q);
}
#endif /* USE_SPLITEXP */
#if defined(USE_SPLITEXPF)
/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
abs(x) > large/(ln(base)) (where large is the largest representable
floating point number) should be handled separately instead of calling
this function. This function is called by exp_amd, exp2_amd, exp10_amd,
cosh_amd and sinh_amd. */
static inline void splitexpf(float x, float logbase,
float thirtytwo_by_logbaseof2,
float logbaseof2_by_32_lead,
float logbaseof2_by_32_trail,
int *m, float *z1, float *z2)
{
float q, r, r1, r2, f1, f2;
int n, j;
/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
leading and trailing parts respectively of precomputed
values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
two_to_jby32_lead_table contains the first 10 bits of precision,
and two_to_jby32_trail_table contains a further 24 bits precision. */
static const float two_to_jby32_lead_table[32] = {
1.0000000000E+00F, /* 0x3F800000 */
1.0214843750E+00F, /* 0x3F82C000 */
1.0429687500E+00F, /* 0x3F858000 */
1.0664062500E+00F, /* 0x3F888000 */
1.0898437500E+00F, /* 0x3F8B8000 */
1.1132812500E+00F, /* 0x3F8E8000 */
1.1386718750E+00F, /* 0x3F91C000 */
1.1621093750E+00F, /* 0x3F94C000 */
1.1875000000E+00F, /* 0x3F980000 */
1.2148437500E+00F, /* 0x3F9B8000 */
1.2402343750E+00F, /* 0x3F9EC000 */
1.2675781250E+00F, /* 0x3FA24000 */
1.2949218750E+00F, /* 0x3FA5C000 */
1.3242187500E+00F, /* 0x3FA98000 */
1.3535156250E+00F, /* 0x3FAD4000 */
1.3828125000E+00F, /* 0x3FB10000 */
1.4140625000E+00F, /* 0x3FB50000 */
1.4433593750E+00F, /* 0x3FB8C000 */
1.4765625000E+00F, /* 0x3FBD0000 */
1.5078125000E+00F, /* 0x3FC10000 */
1.5410156250E+00F, /* 0x3FC54000 */
1.5742187500E+00F, /* 0x3FC98000 */
1.6093750000E+00F, /* 0x3FCE0000 */
1.6445312500E+00F, /* 0x3FD28000 */
1.6816406250E+00F, /* 0x3FD74000 */
1.7167968750E+00F, /* 0x3FDBC000 */
1.7558593750E+00F, /* 0x3FE0C000 */
1.7929687500E+00F, /* 0x3FE58000 */
1.8339843750E+00F, /* 0x3FEAC000 */
1.8730468750E+00F, /* 0x3FEFC000 */
1.9140625000E+00F, /* 0x3FF50000 */
1.9570312500E+00F}; /* 0x3FFA8000 */
static const float two_to_jby32_trail_table[32] = {
0.0000000000E+00F, /* 0x00000000 */
4.1277357377E-04F, /* 0x39D86988 */
1.3050324051E-03F, /* 0x3AAB0D9F */
7.3415064253E-04F, /* 0x3A407404 */
6.6398258787E-04F, /* 0x3A2E0F1E */
1.1054925853E-03F, /* 0x3A90E62D */
1.1675967835E-04F, /* 0x38F4DCE0 */
1.6154836630E-03F, /* 0x3AD3BEA3 */
1.7071149778E-03F, /* 0x3ADFC146 */
4.0360994171E-04F, /* 0x39D39B9C */
1.6234370414E-03F, /* 0x3AD4C982 */
1.4728321694E-03F, /* 0x3AC10C0C */
1.9176795613E-03F, /* 0x3AFB5AA6 */
1.0178930825E-03F, /* 0x3A856AD3 */
7.3992193211E-04F, /* 0x3A41F752 */
1.0973819299E-03F, /* 0x3A8FD607 */
1.5106226783E-04F, /* 0x391E6678 */
1.8214319134E-03F, /* 0x3AEEBD1D */
2.6364589576E-04F, /* 0x398A39F4 */
1.3519275235E-03F, /* 0x3AB13329 */
1.1952003697E-03F, /* 0x3A9CA845 */
1.7620950239E-03F, /* 0x3AE6F619 */
1.1153318919E-03F, /* 0x3A923054 */
1.2242280645E-03F, /* 0x3AA07647 */
1.5220546629E-04F, /* 0x391F9958 */
1.8224230735E-03F, /* 0x3AEEDE5F */
3.9278529584E-04F, /* 0x39CDEEC0 */
1.7403248930E-03F, /* 0x3AE41B9D */
2.3711356334E-05F, /* 0x37C6E7C0 */
1.1207590578E-03F, /* 0x3A92E66F */
1.1440613307E-03F, /* 0x3A95F454 */
1.1287408415E-04F}; /* 0x38ECB6D0 */
/*
Step 1. Reduce the argument.
To perform argument reduction, we find the integer n such that
x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
remainder by x - n*logbaseof2/32. The calculation of n is
straightforward whereas the computation of x - n*logbaseof2/32
must be carried out carefully.
logbaseof2/32 is so represented in two pieces that
(1) logbaseof2/32 is known to extra precision, (2) the product
of n and the leading piece is a model number and is hence
calculated without error, and (3) the subtraction of the value
obtained in (2) from x is a model number and is hence again
obtained without error.
*/
r = x * thirtytwo_by_logbaseof2;
/* Set n = nearest integer to r */
/* This is faster on Hammer */
if (r > 0)
n = (int)(r + 0.5F);
else
n = (int)(r - 0.5F);
r1 = x - n * logbaseof2_by_32_lead;
r2 = - n * logbaseof2_by_32_trail;
/* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */
/* j = n % 32;
if (j < 0) j += 32; */
j = n & 0x0000001f;
f1 = two_to_jby32_lead_table[j];
f2 = two_to_jby32_trail_table[j];
*m = (n - j) / 32;
/* Step 2. The following is the core approximation. We approximate
exp(r1+r2)-1 by a polynomial. */
r1 *= logbase; r2 *= logbase;
r = r1 + r2;
q = r1 + (r2 +
r*r*( 5.00000000000000008883e-01F +
r*( 1.66666666665260878863e-01F )));
/* Step 3. Function value reconstruction.
We now reconstruct the exponential of the input argument
so that exp(x) = 2**m * (z1 + z2).
The order of the computation below must be strictly observed. */
*z1 = f1;
*z2 = f2 + ((f1 + f2) * q);
}
#endif /* SPLITEXPF */
#if defined(USE_SCALEUPDOUBLE1024)
/* Scales up a double (normal or denormal) whose bit pattern is given
as ux by 2**1024. There are no checks that the input number is
scalable by that amount. */
static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur)
{
unsigned long long uy;
double y;
if ((ux & EXPBITS_DP64) == 0)
{
/* ux is denormalised */
PUT_BITS_DP64(ux | 0x4010000000000000, y);
if (ux & SIGNBIT_DP64)
y += 4.0;
else
y -= 4.0;
GET_BITS_DP64(y, uy);
}
else
/* ux is normal */
uy = ux + 0x4000000000000000;
*ur = uy;
return;
}
#endif /* SCALEUPDOUBLE1024 */
#if defined(USE_SCALEDOWNDOUBLE)
/* Scales down a double whose bit pattern is given as ux by 2**k.
There are no checks that the input number is scalable by that amount. */
static inline void scaleDownDouble(unsigned long long ux, int k,
unsigned long long *ur)
{
unsigned long long uy, uk, ax, xsign;
int n, shift;
xsign = ux & SIGNBIT_DP64;
ax = ux & ~SIGNBIT_DP64;
n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k;
if (n > 0)
{
uk = (unsigned long long)n << EXPSHIFTBITS_DP64;
uy = (ax & ~EXPBITS_DP64) | uk;
}
else
{
uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000;
shift = (1 - n);
if (shift > MANTLENGTH_DP64 + 1)
/* Sigh. Shifting works mod 64 so be careful not to shift too much */
uy = 0;
else
{
/* Make sure we round the result */
uy >>= shift - 1;
uy = (uy >> 1) + (uy & 1);
}
}
*ur = uy | xsign;
}
#endif /* SCALEDOWNDOUBLE */
#if defined(USE_SCALEUPFLOAT128)
/* Scales up a float (normal or denormal) whose bit pattern is given
as ux by 2**128. There are no checks that the input number is
scalable by that amount. */
static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur)
{
unsigned int uy;
float y;
if ((ux & EXPBITS_SP32) == 0)
{
/* ux is denormalised */
PUT_BITS_SP32(ux | 0x40800000, y);
/* Compensate for the implicit bit just added */
if (ux & SIGNBIT_SP32)
y += 4.0F;
else
y -= 4.0F;
GET_BITS_SP32(y, uy);
}
else
/* ux is normal */
uy = ux + 0x40000000;
*ur = uy;
}
#endif /* SCALEUPFLOAT128 */
#if defined(USE_SCALEDOWNFLOAT)
/* Scales down a float whose bit pattern is given as ux by 2**k.
There are no checks that the input number is scalable by that amount. */
static inline void scaleDownFloat(unsigned int ux, int k,
unsigned int *ur)
{
unsigned int uy, uk, ax, xsign;
int n, shift;
xsign = ux & SIGNBIT_SP32;
ax = ux & ~SIGNBIT_SP32;
n = ((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k;
if (n > 0)
{
uk = (unsigned int)n << EXPSHIFTBITS_SP32;
uy = (ax & ~EXPBITS_SP32) | uk;
}
else
{
uy = (ax & ~EXPBITS_SP32) | 0x00800000;
shift = (1 - n);
if (shift > MANTLENGTH_SP32 + 1)
/* Sigh. Shifting works mod 32 so be careful not to shift too much */
uy = 0;
else
{
/* Make sure we round the result */
uy >>= shift - 1;
uy = (uy >> 1) + (uy & 1);
}
}
*ur = uy | xsign;
}
#endif /* SCALEDOWNFLOAT */
#if defined(USE_SQRT_AMD_INLINE)
static inline double sqrt_amd_inline(double x)
{
/*
Computes the square root of x.
The calculation is carried out in three steps.
Step 1. Reduction.
The input argument is scaled to the interval [1, 4) by
computing
x = 2^e * y, where y in [1,4).
Furthermore y is decomposed as y = c + t where
c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.
Step 2. Approximation.
An approximation q = sqrt(1 + (t/c)) - 1 is obtained
from a basic series expansion using precomputed values
stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl.
Step 3. Reconstruction.
The value of sqrt(x) is reconstructed via
sqrt(x) = 2^(e/2) * sqrt(y)
= 2^(e/2) * sqrt(c) * sqrt(y/c)
= 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
= 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
*/
unsigned long long ux, ax, u;
double r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
int e, denorm = 0, index;
/* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain
leading and trailing parts respectively of precomputed
values of sqrt(j/32), for j = 32, 33, ..., 128.
rt_jby32_lead_table_dbl contains the first 21 bits of precision,
and rt_jby32_trail_table_dbl contains a further 53 bits precision. */
static const double rt_jby32_lead_table_dbl[97] = {
1.00000000000000000000e+00, /* 0x3ff0000000000000 */
1.01550388336181640625e+00, /* 0x3ff03f8100000000 */
1.03077602386474609375e+00, /* 0x3ff07e0f00000000 */
1.04582500457763671875e+00, /* 0x3ff0bbb300000000 */
1.06065940856933593750e+00, /* 0x3ff0f87600000000 */
1.07528972625732421875e+00, /* 0x3ff1346300000000 */
1.08972454071044921875e+00, /* 0x3ff16f8300000000 */
1.10396957397460937500e+00, /* 0x3ff1a9dc00000000 */
1.11803340911865234375e+00, /* 0x3ff1e37700000000 */
1.13192272186279296875e+00, /* 0x3ff21c5b00000000 */
1.14564323425292968750e+00, /* 0x3ff2548e00000000 */
1.15920162200927734375e+00, /* 0x3ff28c1700000000 */
1.17260360717773437500e+00, /* 0x3ff2c2fc00000000 */
1.18585395812988281250e+00, /* 0x3ff2f94200000000 */
1.19895744323730468750e+00, /* 0x3ff32eee00000000 */
1.21191978454589843750e+00, /* 0x3ff3640600000000 */
1.22474479675292968750e+00, /* 0x3ff3988e00000000 */
1.23743629455566406250e+00, /* 0x3ff3cc8a00000000 */
1.25000000000000000000e+00, /* 0x3ff4000000000000 */
1.26243782043457031250e+00, /* 0x3ff432f200000000 */
1.27475452423095703125e+00, /* 0x3ff4656500000000 */
1.28695297241210937500e+00, /* 0x3ff4975c00000000 */
1.29903793334960937500e+00, /* 0x3ff4c8dc00000000 */
1.31101036071777343750e+00, /* 0x3ff4f9e600000000 */
1.32287502288818359375e+00, /* 0x3ff52a7f00000000 */
1.33463478088378906250e+00, /* 0x3ff55aaa00000000 */
1.34629058837890625000e+00, /* 0x3ff58a6800000000 */
1.35784721374511718750e+00, /* 0x3ff5b9be00000000 */
1.36930561065673828125e+00, /* 0x3ff5e8ad00000000 */
1.38066959381103515625e+00, /* 0x3ff6173900000000 */
1.39194107055664062500e+00, /* 0x3ff6456400000000 */
1.40312099456787109375e+00, /* 0x3ff6732f00000000 */
1.41421318054199218750e+00, /* 0x3ff6a09e00000000 */
1.42521858215332031250e+00, /* 0x3ff6cdb200000000 */
1.43614006042480468750e+00, /* 0x3ff6fa6e00000000 */
1.44697952270507812500e+00, /* 0x3ff726d400000000 */
1.45773792266845703125e+00, /* 0x3ff752e500000000 */
1.46841716766357421875e+00, /* 0x3ff77ea300000000 */
1.47901916503906250000e+00, /* 0x3ff7aa1000000000 */
1.48954677581787109375e+00, /* 0x3ff7d52f00000000 */
1.50000000000000000000e+00, /* 0x3ff8000000000000 */
1.51038074493408203125e+00, /* 0x3ff82a8500000000 */
1.52068996429443359375e+00, /* 0x3ff854bf00000000 */
1.53093051910400390625e+00, /* 0x3ff87eb100000000 */
1.54110336303710937500e+00, /* 0x3ff8a85c00000000 */
1.55120849609375000000e+00, /* 0x3ff8d1c000000000 */
1.56124877929687500000e+00, /* 0x3ff8fae000000000 */
1.57122516632080078125e+00, /* 0x3ff923bd00000000 */
1.58113861083984375000e+00, /* 0x3ff94c5800000000 */
1.59099006652832031250e+00, /* 0x3ff974b200000000 */
1.60078048706054687500e+00, /* 0x3ff99ccc00000000 */
1.61051177978515625000e+00, /* 0x3ff9c4a800000000 */
1.62018489837646484375e+00, /* 0x3ff9ec4700000000 */
1.62979984283447265625e+00, /* 0x3ffa13a900000000 */
1.63935947418212890625e+00, /* 0x3ffa3ad100000000 */
1.64886283874511718750e+00, /* 0x3ffa61be00000000 */
1.65831184387207031250e+00, /* 0x3ffa887200000000 */
1.66770744323730468750e+00, /* 0x3ffaaeee00000000 */
1.67705059051513671875e+00, /* 0x3ffad53300000000 */
1.68634128570556640625e+00, /* 0x3ffafb4100000000 */
1.69558238983154296875e+00, /* 0x3ffb211b00000000 */
1.70477199554443359375e+00, /* 0x3ffb46bf00000000 */
1.71391296386718750000e+00, /* 0x3ffb6c3000000000 */
1.72300529479980468750e+00, /* 0x3ffb916e00000000 */
1.73204994201660156250e+00, /* 0x3ffbb67a00000000 */
1.74104785919189453125e+00, /* 0x3ffbdb5500000000 */
1.75000000000000000000e+00, /* 0x3ffc000000000000 */
1.75890541076660156250e+00, /* 0x3ffc247a00000000 */
1.76776695251464843750e+00, /* 0x3ffc48c600000000 */
1.77658367156982421875e+00, /* 0x3ffc6ce300000000 */
1.78535652160644531250e+00, /* 0x3ffc90d200000000 */
1.79408740997314453125e+00, /* 0x3ffcb49500000000 */
1.80277538299560546875e+00, /* 0x3ffcd82b00000000 */
1.81142139434814453125e+00, /* 0x3ffcfb9500000000 */
1.82002735137939453125e+00, /* 0x3ffd1ed500000000 */
1.82859230041503906250e+00, /* 0x3ffd41ea00000000 */
1.83711719512939453125e+00, /* 0x3ffd64d500000000 */
1.84560203552246093750e+00, /* 0x3ffd879600000000 */
1.85404872894287109375e+00, /* 0x3ffdaa2f00000000 */
1.86245727539062500000e+00, /* 0x3ffdcca000000000 */
1.87082862854003906250e+00, /* 0x3ffdeeea00000000 */
1.87916183471679687500e+00, /* 0x3ffe110c00000000 */
1.88745784759521484375e+00, /* 0x3ffe330700000000 */
1.89571857452392578125e+00, /* 0x3ffe54dd00000000 */
1.90394306182861328125e+00, /* 0x3ffe768d00000000 */
1.91213226318359375000e+00, /* 0x3ffe981800000000 */
1.92028617858886718750e+00, /* 0x3ffeb97e00000000 */
1.92840576171875000000e+00, /* 0x3ffedac000000000 */
1.93649101257324218750e+00, /* 0x3ffefbde00000000 */
1.94454288482666015625e+00, /* 0x3fff1cd900000000 */
1.95256233215332031250e+00, /* 0x3fff3db200000000 */
1.96054744720458984375e+00, /* 0x3fff5e6700000000 */
1.96850109100341796875e+00, /* 0x3fff7efb00000000 */
1.97642326354980468750e+00, /* 0x3fff9f6e00000000 */
1.98431301116943359375e+00, /* 0x3fffbfbf00000000 */
1.99217128753662109375e+00, /* 0x3fffdfef00000000 */
2.00000000000000000000e+00}; /* 0x4000000000000000 */
static const double rt_jby32_trail_table_dbl[97] = {
0.00000000000000000000e+00, /* 0x0000000000000000 */
9.17217678638807524014e-07, /* 0x3eaec6d70177881c */
3.82539669043705364790e-07, /* 0x3e99abfb41bd6b24 */
2.85899577162227138140e-08, /* 0x3e5eb2bf6bab55a2 */
7.63210485349101216659e-07, /* 0x3ea99bed9b2d8d0c */
9.32123004127716212874e-07, /* 0x3eaf46e029c1b296 */
1.95174719169309219157e-07, /* 0x3e8a3226fc42f30c */
5.34316371481845492427e-07, /* 0x3ea1edbe20701d73 */
5.79631242504454563052e-07, /* 0x3ea372fe94f82be7 */
4.20404384109571705948e-07, /* 0x3e9c367e08e7bb06 */
6.89486030314147010716e-07, /* 0x3ea722a3d0a66608 */
6.89927685625314560328e-07, /* 0x3ea7266f067ca1d6 */
3.32778123013641425828e-07, /* 0x3e965515a9b34850 */
1.64433259436999584387e-07, /* 0x3e8611e23ef6c1bd */
4.37590875197899335723e-07, /* 0x3e9d5dc1059ed8e7 */
1.79808183816018617413e-07, /* 0x3e88222982d0e4f4 */
7.46386593615986477624e-08, /* 0x3e7409212e7d0322 */
5.72520794105201454728e-07, /* 0x3ea335ea8a5fcf39 */
0.00000000000000000000e+00, /* 0x0000000000000000 */
2.96860689431670420344e-07, /* 0x3e93ec071e938bfe */
3.54167239176257065345e-07, /* 0x3e97c48bfd9862c6 */
7.95211265664474710063e-07, /* 0x3eaaaed010f74671 */
1.72327048595145565621e-07, /* 0x3e87211cbfeb62e0 */
6.99494915996239297020e-07, /* 0x3ea7789d9660e72d */
6.32644111701500844315e-07, /* 0x3ea53a5f1d36f1cf */
6.20124838851440463844e-10, /* 0x3e054eacff2057dc */
6.13404719757812629969e-07, /* 0x3ea4951b3e6a83cc */
3.47654909777986407387e-07, /* 0x3e9754aa76884c66 */
7.83106177002392475763e-07, /* 0x3eaa46d4b1de1074 */
5.33337372440526357008e-07, /* 0x3ea1e55548f92635 */
2.01508648555298681765e-08, /* 0x3e55a3070dd17788 */
5.25472356925843939587e-07, /* 0x3ea1a1c5eedb0801 */
3.81831102861301692797e-07, /* 0x3e999fcef32422cc */
6.99220602161420018738e-07, /* 0x3ea776425d6b0199 */
6.01209702477462624811e-07, /* 0x3ea42c5a1e0191a2 */
9.01437000591944740554e-08, /* 0x3e7832a0bdff1327 */
5.10428680864685379950e-08, /* 0x3e6b674743636676 */
3.47895267104621031421e-07, /* 0x3e9758cb90d2f714 */
7.80735841510641848628e-07, /* 0x3eaa3278459cde25 */
1.35158752025506517690e-07, /* 0x3e822404f4a103ee */
0.00000000000000000000e+00, /* 0x0000000000000000 */
1.76523947728535489812e-09, /* 0x3e1e539af6892ac5 */
6.68280121328499932183e-07, /* 0x3ea66c7b872c9cd0 */
5.70135482405123276616e-07, /* 0x3ea3216d2f43887d */
1.37705134737562525897e-07, /* 0x3e827b832cbedc0e */
7.09655107074516613672e-07, /* 0x3ea7cfe41579091d */
7.20302724551461693011e-07, /* 0x3ea82b5a713c490a */
4.69926266058212796694e-07, /* 0x3e9f8945932d872e */
2.19244345915999437026e-07, /* 0x3e8d6d2da9490251 */
1.91141411617401877927e-07, /* 0x3e89a791a3114e4a */
5.72297665296622053774e-07, /* 0x3ea333ffe005988d */
5.61055484436830560103e-07, /* 0x3ea2d36e0ed49ab1 */
2.76225500213991506100e-07, /* 0x3e92898498f55f9e */
7.58466189522395692908e-07, /* 0x3ea9732cca1032a3 */
1.56893371256836029827e-07, /* 0x3e850ed0b02a22d2 */
4.06038997708867066507e-07, /* 0x3e9b3fb265b1e40a */
5.51305629612057435809e-07, /* 0x3ea27fade682d1de */
5.64778487026561123207e-07, /* 0x3ea2f36906f707ba */
3.92609705553556897517e-07, /* 0x3e9a58fbbee883b6 */
9.09698438776943827802e-07, /* 0x3eae864005bca6d7 */
1.05949774066016139743e-07, /* 0x3e7c70d02300f263 */
7.16578798392844784244e-07, /* 0x3ea80b5d712d8e3e */
6.86233073531233972561e-07, /* 0x3ea706b27cc7d390 */
7.99211473033494452908e-07, /* 0x3eaad12c9d849a97 */
8.65552275731027456121e-07, /* 0x3ead0b09954e764b */
6.75456120386058448618e-07, /* 0x3ea6aa1fb7826cbd */
0.00000000000000000000e+00, /* 0x0000000000000000 */
4.99167184520462138743e-07, /* 0x3ea0bfd03f46763c */
4.51720373502110930296e-10, /* 0x3dff0abfb4adfb9e */
1.28874162718371367439e-07, /* 0x3e814c151f991b2e */
5.85529267186999798656e-07, /* 0x3ea3a5a879b09292 */
1.01827770937125531924e-07, /* 0x3e7b558d173f9796 */
2.54736389177809626508e-07, /* 0x3e9118567cd83fb8 */
6.98925535290464831294e-07, /* 0x3ea773b981896751 */
1.20940735036524314513e-07, /* 0x3e803b7df49f48a8 */
5.43759351196479689657e-08, /* 0x3e6d315f22491900 */
1.11957989042397958409e-07, /* 0x3e7e0db1c5bb84b2 */
8.47006714134442661218e-07, /* 0x3eac6bbb7644ff76 */
8.92831044643427836228e-07, /* 0x3eadf55c3afec01f */
7.77828292464916501663e-07, /* 0x3eaa197e81034da3 */
6.48469316302918797451e-08, /* 0x3e71683f4920555d */
2.12579816658859849140e-07, /* 0x3e8c882fd78bb0b0 */
7.61222472580559138435e-07, /* 0x3ea98ad9eb7b83ec */
2.86488961857314189607e-07, /* 0x3e9339d7c7777273 */
2.14637363790165363515e-07, /* 0x3e8ccee237cae6fe */
5.44137005612605847831e-08, /* 0x3e6d368fe324a146 */
2.58378284856442408413e-07, /* 0x3e9156e7b6d99b45 */
3.15848939061134843091e-07, /* 0x3e95323e5310b5c1 */
6.60530466255089632309e-07, /* 0x3ea629e9db362f5d */
7.63436345535852301127e-07, /* 0x3ea99dde4728d7ec */
8.68233432860324345268e-08, /* 0x3e774e746878544d */
9.45465175398023087082e-07, /* 0x3eafb97be873a87d */
8.77499534786171267246e-07, /* 0x3ead71a9e23c2f63 */
2.74055432394999316135e-07, /* 0x3e92643c89cda173 */
4.72129009349126213532e-07, /* 0x3e9faf1d57a4d56c */
8.93777032327078947306e-07, /* 0x3eadfd7c7ab7b282 */
0.00000000000000000000e+00}; /* 0x0000000000000000 */
/* Handle special arguments first */
GET_BITS_DP64(x, ux);
ax = ux & (~SIGNBIT_DP64);
if(ax >= 0x7ff0000000000000)
{
/* x is either NaN or infinity */
if (ux & MANTBITS_DP64)
/* x is NaN */
return x + x; /* Raise invalid if it is a signalling NaN */
else if (ux & SIGNBIT_DP64)
/* x is negative infinity */
return nan_with_flags(AMD_F_INVALID);
else
/* x is positive infinity */
return x;
}
else if (ux & SIGNBIT_DP64)
{
/* x is negative. */
if (ux == SIGNBIT_DP64)
/* Handle negative zero first */
return x;
else
return nan_with_flags(AMD_F_INVALID);
}
else if (ux <= 0x000fffffffffffff)
{
/* x is denormalised or zero */
if (ux == 0)
/* x is zero */
return x;
else
{
/* x is denormalised; scale it up */
/* Normalize x by increasing the exponent by 60
and subtracting a correction to account for the implicit
bit. This replaces a slow denormalized
multiplication by a fast normal subtraction. */
static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
denorm = 1;
GET_BITS_DP64(x, ux);
PUT_BITS_DP64(ux | 0x03d0000000000000, x);
x -= corr;
GET_BITS_DP64(x, ux);
}
}
/* Main algorithm */
/*
Find y and e such that x = 2^e * y, where y in [1,4).
This is done using an in-lined variant of splitDouble,
which also ensures that e is even.
*/
y = x;
ux &= EXPBITS_DP64;
ux >>= EXPSHIFTBITS_DP64;
if (ux & 1)
{
GET_BITS_DP64(y, u);
u &= (SIGNBIT_DP64 | MANTBITS_DP64);
u |= ONEEXPBITS_DP64;
PUT_BITS_DP64(u, y);
e = ux - EXPBIAS_DP64;
}
else
{
GET_BITS_DP64(y, u);
u &= (SIGNBIT_DP64 | MANTBITS_DP64);
u |= TWOEXPBITS_DP64;
PUT_BITS_DP64(u, y);
e = ux - EXPBIAS_DP64 - 1;
}
/* Find the index of the sub-interval of [1,4) in which y lies. */
index = (int)(32.0*y+0.5);
/* Look up the table values and compute c and r = c/t */
rtc_lead = rt_jby32_lead_table_dbl[index-32];
rtc_trail = rt_jby32_trail_table_dbl[index-32];
c = 0.03125*index;
r = (y - c)/c;
/*
Find q = sqrt(1+r) - 1.
From one step of Newton on (q+1)^2 = 1+r
*/
p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01));
twop = p + p;
q = p - (p*p + (twop - r))/(twop + 2.0);
/* Reconstruction */
rtc = rtc_lead + rtc_trail;
e >>= 1; /* e = e/2 */
z = rtc_lead + (rtc*q+rtc_trail);
if (denorm)
{
/* Scale by 2**(e-30) */
PUT_BITS_DP64(((long long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
z *= r;
}
else
{
/* Scale by 2**e */
PUT_BITS_DP64(((long long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
z *= r;
}
return z;
}
#endif /* SQRT_AMD_INLINE */
#if defined(USE_SQRTF_AMD_INLINE)
static inline float sqrtf_amd_inline(float x)
{
/*
Computes the square root of x.
The calculation is carried out in three steps.
Step 1. Reduction.
The input argument is scaled to the interval [1, 4) by
computing
x = 2^e * y, where y in [1,4).
Furthermore y is decomposed as y = c + t where
c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.
Step 2. Approximation.
An approximation q = sqrt(1 + (t/c)) - 1 is obtained
from a basic series expansion using precomputed values
stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float.
Step 3. Reconstruction.
The value of sqrt(x) is reconstructed via
sqrt(x) = 2^(e/2) * sqrt(y)
= 2^(e/2) * sqrt(c) * sqrt(y/c)
= 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
= 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
*/
unsigned int ux, ax, u;
float r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
int e, denorm = 0, index;
/* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain
leading and trailing parts respectively of precomputed
values of sqrt(j/32), for j = 32, 33, ..., 128.
rt_jby32_lead_table_float contains the first 13 bits of precision,
and rt_jby32_trail_table_float contains a further 24 bits precision. */
static const float rt_jby32_lead_table_float[97] = {
1.00000000000000000000e+00F, /* 0x3f800000 */
1.01538085937500000000e+00F, /* 0x3f81f800 */
1.03076171875000000000e+00F, /* 0x3f83f000 */
1.04565429687500000000e+00F, /* 0x3f85d800 */
1.06054687500000000000e+00F, /* 0x3f87c000 */
1.07519531250000000000e+00F, /* 0x3f89a000 */
1.08959960937500000000e+00F, /* 0x3f8b7800 */
1.10375976562500000000e+00F, /* 0x3f8d4800 */
1.11791992187500000000e+00F, /* 0x3f8f1800 */
1.13183593750000000000e+00F, /* 0x3f90e000 */
1.14550781250000000000e+00F, /* 0x3f92a000 */
1.15917968750000000000e+00F, /* 0x3f946000 */
1.17236328125000000000e+00F, /* 0x3f961000 */
1.18579101562500000000e+00F, /* 0x3f97c800 */
1.19873046875000000000e+00F, /* 0x3f997000 */
1.21191406250000000000e+00F, /* 0x3f9b2000 */
1.22460937500000000000e+00F, /* 0x3f9cc000 */
1.23730468750000000000e+00F, /* 0x3f9e6000 */
1.25000000000000000000e+00F, /* 0x3fa00000 */
1.26220703125000000000e+00F, /* 0x3fa19000 */
1.27465820312500000000e+00F, /* 0x3fa32800 */
1.28686523437500000000e+00F, /* 0x3fa4b800 */
1.29882812500000000000e+00F, /* 0x3fa64000 */
1.31079101562500000000e+00F, /* 0x3fa7c800 */
1.32275390625000000000e+00F, /* 0x3fa95000 */
1.33447265625000000000e+00F, /* 0x3faad000 */
1.34619140625000000000e+00F, /* 0x3fac5000 */
1.35766601562500000000e+00F, /* 0x3fadc800 */
1.36914062500000000000e+00F, /* 0x3faf4000 */
1.38061523437500000000e+00F, /* 0x3fb0b800 */
1.39184570312500000000e+00F, /* 0x3fb22800 */
1.40307617187500000000e+00F, /* 0x3fb39800 */
1.41406250000000000000e+00F, /* 0x3fb50000 */
1.42504882812500000000e+00F, /* 0x3fb66800 */
1.43603515625000000000e+00F, /* 0x3fb7d000 */
1.44677734375000000000e+00F, /* 0x3fb93000 */
1.45751953125000000000e+00F, /* 0x3fba9000 */
1.46826171875000000000e+00F, /* 0x3fbbf000 */
1.47900390625000000000e+00F, /* 0x3fbd5000 */
1.48950195312500000000e+00F, /* 0x3fbea800 */
1.50000000000000000000e+00F, /* 0x3fc00000 */
1.51025390625000000000e+00F, /* 0x3fc15000 */
1.52050781250000000000e+00F, /* 0x3fc2a000 */
1.53076171875000000000e+00F, /* 0x3fc3f000 */
1.54101562500000000000e+00F, /* 0x3fc54000 */
1.55102539062500000000e+00F, /* 0x3fc68800 */
1.56103515625000000000e+00F, /* 0x3fc7d000 */
1.57104492187500000000e+00F, /* 0x3fc91800 */
1.58105468750000000000e+00F, /* 0x3fca6000 */
1.59082031250000000000e+00F, /* 0x3fcba000 */
1.60058593750000000000e+00F, /* 0x3fcce000 */
1.61035156250000000000e+00F, /* 0x3fce2000 */
1.62011718750000000000e+00F, /* 0x3fcf6000 */
1.62963867187500000000e+00F, /* 0x3fd09800 */
1.63916015625000000000e+00F, /* 0x3fd1d000 */
1.64868164062500000000e+00F, /* 0x3fd30800 */
1.65820312500000000000e+00F, /* 0x3fd44000 */
1.66748046875000000000e+00F, /* 0x3fd57000 */
1.67700195312500000000e+00F, /* 0x3fd6a800 */
1.68627929687500000000e+00F, /* 0x3fd7d800 */
1.69555664062500000000e+00F, /* 0x3fd90800 */
1.70458984375000000000e+00F, /* 0x3fda3000 */
1.71386718750000000000e+00F, /* 0x3fdb6000 */
1.72290039062500000000e+00F, /* 0x3fdc8800 */
1.73193359375000000000e+00F, /* 0x3fddb000 */
1.74096679687500000000e+00F, /* 0x3fded800 */
1.75000000000000000000e+00F, /* 0x3fe00000 */
1.75878906250000000000e+00F, /* 0x3fe12000 */
1.76757812500000000000e+00F, /* 0x3fe24000 */
1.77636718750000000000e+00F, /* 0x3fe36000 */
1.78515625000000000000e+00F, /* 0x3fe48000 */
1.79394531250000000000e+00F, /* 0x3fe5a000 */
1.80273437500000000000e+00F, /* 0x3fe6c000 */
1.81127929687500000000e+00F, /* 0x3fe7d800 */
1.81982421875000000000e+00F, /* 0x3fe8f000 */
1.82836914062500000000e+00F, /* 0x3fea0800 */
1.83691406250000000000e+00F, /* 0x3feb2000 */
1.84545898437500000000e+00F, /* 0x3fec3800 */
1.85400390625000000000e+00F, /* 0x3fed5000 */
1.86230468750000000000e+00F, /* 0x3fee6000 */
1.87060546875000000000e+00F, /* 0x3fef7000 */
1.87915039062500000000e+00F, /* 0x3ff08800 */
1.88745117187500000000e+00F, /* 0x3ff19800 */
1.89550781250000000000e+00F, /* 0x3ff2a000 */
1.90380859375000000000e+00F, /* 0x3ff3b000 */
1.91210937500000000000e+00F, /* 0x3ff4c000 */
1.92016601562500000000e+00F, /* 0x3ff5c800 */
1.92822265625000000000e+00F, /* 0x3ff6d000 */
1.93627929687500000000e+00F, /* 0x3ff7d800 */
1.94433593750000000000e+00F, /* 0x3ff8e000 */
1.95239257812500000000e+00F, /* 0x3ff9e800 */
1.96044921875000000000e+00F, /* 0x3ffaf000 */
1.96826171875000000000e+00F, /* 0x3ffbf000 */
1.97631835937500000000e+00F, /* 0x3ffcf800 */
1.98413085937500000000e+00F, /* 0x3ffdf800 */
1.99194335937500000000e+00F, /* 0x3ffef800 */
2.00000000000000000000e+00F}; /* 0x40000000 */
static const float rt_jby32_trail_table_float[97] = {
0.00000000000000000000e+00F, /* 0x00000000 */
1.23941208585165441036e-04F, /* 0x3901f637 */
1.46876545841223560274e-05F, /* 0x37766aff */
1.70736297150142490864e-04F, /* 0x393307ad */
1.13296780909877270460e-04F, /* 0x38ed99bf */
9.53458802541717886925e-05F, /* 0x38c7f46e */
1.25126505736261606216e-04F, /* 0x39033464 */
2.10342666832730174065e-04F, /* 0x395c8f6e */
1.14066875539720058441e-04F, /* 0x38ef3730 */
8.72047676239162683487e-05F, /* 0x38b6e1b4 */
1.36111237225122749805e-04F, /* 0x390eb915 */
2.26244374061934649944e-05F, /* 0x37bdc99c */
2.40658700931817293167e-04F, /* 0x397c5954 */
6.31069415248930454254e-05F, /* 0x38845848 */
2.27412077947519719601e-04F, /* 0x396e7577 */
5.90185391047270968556e-06F, /* 0x36c6088a */
1.35496389702893793583e-04F, /* 0x390e1409 */
1.32179571664892137051e-04F, /* 0x390a99af */
0.00000000000000000000e+00F, /* 0x00000000 */
2.31086043640971183777e-04F, /* 0x39724fb0 */
9.66752704698592424393e-05F, /* 0x38cabe24 */
8.85332483449019491673e-05F, /* 0x38b9aaed */
2.09980673389509320259e-04F, /* 0x395c2e42 */
2.20044588786549866199e-04F, /* 0x3966bbc5 */
1.21749282698146998882e-04F, /* 0x38ff53a6 */
1.62125259521417319775e-04F, /* 0x392a002b */
9.97955357888713479042e-05F, /* 0x38d14952 */
1.81545779923908412457e-04F, /* 0x393e5d53 */
1.65768768056295812130e-04F, /* 0x392dd237 */
5.48927710042335093021e-05F, /* 0x38663caa */
9.53875860432162880898e-05F, /* 0x38c80ad2 */
4.53481625299900770187e-05F, /* 0x383e3438 */
1.51062369695864617825e-04F, /* 0x391e667f */
1.70453247847035527229e-04F, /* 0x3932bbb2 */
1.05505387182347476482e-04F, /* 0x38dd42c6 */
2.02269104192964732647e-04F, /* 0x39541833 */
2.18442466575652360916e-04F, /* 0x39650db4 */
1.55796806211583316326e-04F, /* 0x39235d63 */
1.60395247803535312414e-05F, /* 0x37868c9e */
4.49578510597348213196e-05F, /* 0x383c9120 */
0.00000000000000000000e+00F, /* 0x00000000 */
1.26840444863773882389e-04F, /* 0x39050079 */
1.82820076588541269302e-04F, /* 0x393fb364 */
1.69370483490638434887e-04F, /* 0x3931990b */
8.78757418831810355186e-05F, /* 0x38b849ee */
1.83815121999941766262e-04F, /* 0x3940be7f */
2.14343352126888930798e-04F, /* 0x3960c15b */
1.80714370799250900745e-04F, /* 0x393d7e25 */
8.41425862745381891727e-05F, /* 0x38b075b5 */
1.69945167726837098598e-04F, /* 0x3932334f */
1.95121858268976211548e-04F, /* 0x394c99a0 */
1.60778334247879683971e-04F, /* 0x3928969b */
6.79871009197086095810e-05F, /* 0x388e944c */
1.61929419846273958683e-04F, /* 0x3929cb99 */
1.99474830878898501396e-04F, /* 0x39512a1e */
1.81604162207804620266e-04F, /* 0x393e6cff */
1.09270178654696792364e-04F, /* 0x38e527fb */
2.27539261686615645885e-04F, /* 0x396e979b */
4.90300008095800876617e-05F, /* 0x384da590 */
6.28985289949923753738e-05F, /* 0x3883e864 */
2.58551553997676819563e-05F, /* 0x37d8e386 */
1.82868374395184218884e-04F, /* 0x393fc05b */
4.64625991298817098141e-05F, /* 0x3842e0d6 */
1.05703387816902250051e-04F, /* 0x38ddad13 */
1.17213814519345760345e-04F, /* 0x38f5d0b0 */
8.17377731436863541603e-05F, /* 0x38ab6aa2 */
0.00000000000000000000e+00F, /* 0x00000000 */
1.16847433673683553934e-04F, /* 0x38f50bfd */
1.88827965757809579372e-04F, /* 0x3946001f */
2.16612941585481166840e-04F, /* 0x39632298 */
2.00857131858356297016e-04F, /* 0x39529d2d */
1.42199307447299361229e-04F, /* 0x39151b56 */
4.12627305195201188326e-05F, /* 0x382d1185 */
1.42796401632949709892e-04F, /* 0x3915bb9e */
2.03253570361994206905e-04F, /* 0x39552077 */
2.23214170546270906925e-04F, /* 0x396a0e99 */
2.03244591830298304558e-04F, /* 0x39551e0e */
1.43898156238719820976e-04F, /* 0x3916e35e */
4.57155256299301981926e-05F, /* 0x383fbeac */
1.53365719597786664963e-04F, /* 0x3920d0cc */
2.23224633373320102692e-04F, /* 0x396a1168 */
1.16566716314991936088e-05F, /* 0x37439106 */
7.43694272387074306607e-06F, /* 0x36f98ada */
2.11048507480882108212e-04F, /* 0x395d4ce7 */
1.34682719362899661064e-04F, /* 0x390d399e */
2.29425968427676707506e-05F, /* 0x37c074da */
1.20421340398024767637e-04F, /* 0x38fc8ab7 */
1.83421318070031702518e-04F, /* 0x394054c9 */
2.12376224226318299770e-04F, /* 0x395eb14f */
2.07710763788782060146e-04F, /* 0x3959ccef */
1.69840845046564936638e-04F, /* 0x3932174e */
9.91739216260612010956e-05F, /* 0x38cffb98 */
2.40249748458154499531e-04F, /* 0x397beb8d */
1.05178231024183332920e-04F, /* 0x38dc9322 */
1.82623916771262884140e-04F, /* 0x393f7ebc */
2.28821940254420042038e-04F, /* 0x396fefec */
0.00000000000000000000e+00F}; /* 0x00000000 */
/* Handle special arguments first */
GET_BITS_SP32(x, ux);
ax = ux & (~SIGNBIT_SP32);
if(ax >= 0x7f800000)
{
/* x is either NaN or infinity */
if (ux & MANTBITS_SP32)
/* x is NaN */
return x + x; /* Raise invalid if it is a signalling NaN */
else if (ux & SIGNBIT_SP32)
return nanf_with_flags(AMD_F_INVALID);
else
/* x is positive infinity */
return x;
}
else if (ux & SIGNBIT_SP32)
{
/* x is negative. */
if (x == 0.0F)
/* Handle negative zero first */
return x;
else
return nanf_with_flags(AMD_F_INVALID);
}
else if (ux <= 0x007fffff)
{
/* x is denormalised or zero */
if (ux == 0)
/* x is zero */
return x;
else
{
/* x is denormalised; scale it up */
/* Normalize x by increasing the exponent by 26
and subtracting a correction to account for the implicit
bit. This replaces a slow denormalized
multiplication by a fast normal subtraction. */
static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */
denorm = 1;
GET_BITS_SP32(x, ux);
PUT_BITS_SP32(ux | 0x0d800000, x);
x -= corr;
GET_BITS_SP32(x, ux);
}
}
/* Main algorithm */
/*
Find y and e such that x = 2^e * y, where y in [1,4).
This is done using an in-lined variant of splitFloat,
which also ensures that e is even.
*/
y = x;
ux &= EXPBITS_SP32;
ux >>= EXPSHIFTBITS_SP32;
if (ux & 1)
{
GET_BITS_SP32(y, u);
u &= (SIGNBIT_SP32 | MANTBITS_SP32);
u |= ONEEXPBITS_SP32;
PUT_BITS_SP32(u, y);
e = ux - EXPBIAS_SP32;
}
else
{
GET_BITS_SP32(y, u);
u &= (SIGNBIT_SP32 | MANTBITS_SP32);
u |= TWOEXPBITS_SP32;
PUT_BITS_SP32(u, y);
e = ux - EXPBIAS_SP32 - 1;
}
/* Find the index of the sub-interval of [1,4) in which y lies. */
index = (int)(32.0F*y+0.5);
/* Look up the table values and compute c and r = c/t */
rtc_lead = rt_jby32_lead_table_float[index-32];
rtc_trail = rt_jby32_trail_table_float[index-32];
c = 0.03125F*index;
r = (y - c)/c;
/*
Find q = sqrt(1+r) - 1.
From one step of Newton on (q+1)^2 = 1+r
*/
p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F));
twop = p + p;
q = p - (p*p + (twop - r))/(twop + 2.0);
/* Reconstruction */
rtc = rtc_lead + rtc_trail;
e >>= 1; /* e = e/2 */
z = rtc_lead + (rtc*q+rtc_trail);
if (denorm)
{
/* Scale by 2**(e-13) */
PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
z *= r;
}
else
{
/* Scale by 2**e */
PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
z *= r;
}
return z;
}
#endif /* SQRTF_AMD_INLINE */
#ifdef USE_LOG_KERNEL_AMD
static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2)
{
int expadjust;
double r, z1, z2, correction, f, f1, f2, q, u, v, poly;
int index;
/*
Computes natural log(x). Algorithm based on:
Ping-Tak Peter Tang
"Table-driven implementation of the logarithm function in IEEE
floating-point arithmetic"
ACM Transactions on Mathematical Software (TOMS)
Volume 16, Issue 4 (December 1990)
*/
/* Arrays ln_lead_table and ln_tail_table contain
leading and trailing parts respectively of precomputed
values of natural log(1+i/64), for i = 0, 1, ..., 64.
ln_lead_table contains the first 24 bits of precision,
and ln_tail_table contains a further 53 bits precision. */
static const double ln_lead_table[65] = {
0.00000000000000000000e+00, /* 0x0000000000000000 */
1.55041813850402832031e-02, /* 0x3f8fc0a800000000 */
3.07716131210327148438e-02, /* 0x3f9f829800000000 */
4.58095073699951171875e-02, /* 0x3fa7745800000000 */
6.06245994567871093750e-02, /* 0x3faf0a3000000000 */
7.52233862876892089844e-02, /* 0x3fb341d700000000 */
8.96121263504028320312e-02, /* 0x3fb6f0d200000000 */
1.03796780109405517578e-01, /* 0x3fba926d00000000 */
1.17783010005950927734e-01, /* 0x3fbe270700000000 */
1.31576299667358398438e-01, /* 0x3fc0d77e00000000 */
1.45181953907012939453e-01, /* 0x3fc2955280000000 */
1.58604979515075683594e-01, /* 0x3fc44d2b00000000 */
1.71850204467773437500e-01, /* 0x3fc5ff3000000000 */
1.84922337532043457031e-01, /* 0x3fc7ab8900000000 */
1.97825729846954345703e-01, /* 0x3fc9525a80000000 */
2.10564732551574707031e-01, /* 0x3fcaf3c900000000 */
2.23143517971038818359e-01, /* 0x3fcc8ff780000000 */
2.35566020011901855469e-01, /* 0x3fce270700000000 */
2.47836112976074218750e-01, /* 0x3fcfb91800000000 */
2.59957492351531982422e-01, /* 0x3fd0a324c0000000 */
2.71933674812316894531e-01, /* 0x3fd1675c80000000 */
2.83768117427825927734e-01, /* 0x3fd22941c0000000 */
2.95464158058166503906e-01, /* 0x3fd2e8e280000000 */
3.07025015354156494141e-01, /* 0x3fd3a64c40000000 */
3.18453729152679443359e-01, /* 0x3fd4618bc0000000 */
3.29753279685974121094e-01, /* 0x3fd51aad80000000 */
3.40926527976989746094e-01, /* 0x3fd5d1bd80000000 */
3.51976394653320312500e-01, /* 0x3fd686c800000000 */
3.62905442714691162109e-01, /* 0x3fd739d7c0000000 */
3.73716354370117187500e-01, /* 0x3fd7eaf800000000 */
3.84411692619323730469e-01, /* 0x3fd89a3380000000 */
3.94993782043457031250e-01, /* 0x3fd9479400000000 */
4.05465066432952880859e-01, /* 0x3fd9f323c0000000 */
4.15827870368957519531e-01, /* 0x3fda9cec80000000 */
4.26084339618682861328e-01, /* 0x3fdb44f740000000 */
4.36236739158630371094e-01, /* 0x3fdbeb4d80000000 */
4.46287095546722412109e-01, /* 0x3fdc8ff7c0000000 */
4.56237375736236572266e-01, /* 0x3fdd32fe40000000 */
4.66089725494384765625e-01, /* 0x3fddd46a00000000 */
4.75845873355865478516e-01, /* 0x3fde744240000000 */
4.85507786273956298828e-01, /* 0x3fdf128f40000000 */
4.95077252388000488281e-01, /* 0x3fdfaf5880000000 */
5.04556000232696533203e-01, /* 0x3fe02552a0000000 */
5.13945698738098144531e-01, /* 0x3fe0723e40000000 */
5.23248136043548583984e-01, /* 0x3fe0be72e0000000 */
5.32464742660522460938e-01, /* 0x3fe109f380000000 */
5.41597247123718261719e-01, /* 0x3fe154c3c0000000 */
5.50647079944610595703e-01, /* 0x3fe19ee6a0000000 */
5.59615731239318847656e-01, /* 0x3fe1e85f40000000 */
5.68504691123962402344e-01, /* 0x3fe23130c0000000 */
5.77315330505371093750e-01, /* 0x3fe2795e00000000 */
5.86049020290374755859e-01, /* 0x3fe2c0e9e0000000 */
5.94707071781158447266e-01, /* 0x3fe307d720000000 */
6.03290796279907226562e-01, /* 0x3fe34e2880000000 */
6.11801505088806152344e-01, /* 0x3fe393e0c0000000 */
6.20240390300750732422e-01, /* 0x3fe3d90260000000 */
6.28608644008636474609e-01, /* 0x3fe41d8fe0000000 */
6.36907458305358886719e-01, /* 0x3fe4618bc0000000 */
6.45137906074523925781e-01, /* 0x3fe4a4f840000000 */
6.53301239013671875000e-01, /* 0x3fe4e7d800000000 */
6.61398470401763916016e-01, /* 0x3fe52a2d20000000 */
6.69430613517761230469e-01, /* 0x3fe56bf9c0000000 */
6.77398800849914550781e-01, /* 0x3fe5ad4040000000 */
6.85303986072540283203e-01, /* 0x3fe5ee02a0000000 */
6.93147122859954833984e-01}; /* 0x3fe62e42e0000000 */
static const double ln_tail_table[65] = {
0.00000000000000000000e+00, /* 0x0000000000000000 */
5.15092497094772879206e-09, /* 0x3e361f807c79f3db */
4.55457209735272790188e-08, /* 0x3e6873c1980267c8 */
2.86612990859791781788e-08, /* 0x3e5ec65b9f88c69e */
2.23596477332056055352e-08, /* 0x3e58022c54cc2f99 */
3.49498983167142274770e-08, /* 0x3e62c37a3a125330 */
3.23392843005887000414e-08, /* 0x3e615cad69737c93 */
1.35722380472479366661e-08, /* 0x3e4d256ab1b285e9 */
2.56504325268044191098e-08, /* 0x3e5b8abcb97a7aa2 */
5.81213608741512136843e-08, /* 0x3e6f34239659a5dc */
5.59374849578288093334e-08, /* 0x3e6e07fd48d30177 */
5.06615629004996189970e-08, /* 0x3e6b32df4799f4f6 */
5.24588857848400955725e-08, /* 0x3e6c29e4f4f21cf8 */
9.61968535632653505972e-10, /* 0x3e1086c848df1b59 */
1.34829655346594463137e-08, /* 0x3e4cf456b4764130 */
3.65557749306383026498e-08, /* 0x3e63a02ffcb63398 */
3.33431709374069198903e-08, /* 0x3e61e6a6886b0976 */
5.13008650536088382197e-08, /* 0x3e6b8abcb97a7aa2 */
5.09285070380306053751e-08, /* 0x3e6b578f8aa35552 */
3.20853940845502057341e-08, /* 0x3e6139c871afb9fc */
4.06713248643004200446e-08, /* 0x3e65d5d30701ce64 */
5.57028186706125221168e-08, /* 0x3e6de7bcb2d12142 */
5.48356693724804282546e-08, /* 0x3e6d708e984e1664 */
1.99407553679345001938e-08, /* 0x3e556945e9c72f36 */
1.96585517245087232086e-09, /* 0x3e20e2f613e85bda */
6.68649386072067321503e-09, /* 0x3e3cb7e0b42724f6 */
5.89936034642113390002e-08, /* 0x3e6fac04e52846c7 */
2.85038578721554472484e-08, /* 0x3e5e9b14aec442be */
5.09746772910284482606e-08, /* 0x3e6b5de8034e7126 */
5.54234668933210171467e-08, /* 0x3e6dc157e1b259d3 */
6.29100830926604004874e-09, /* 0x3e3b05096ad69c62 */
2.61974119468563937716e-08, /* 0x3e5c2116faba4cdd */
4.16752115011186398935e-08, /* 0x3e665fcc25f95b47 */
2.47747534460820790327e-08, /* 0x3e5a9a08498d4850 */
5.56922172017964209793e-08, /* 0x3e6de647b1465f77 */
2.76162876992552906035e-08, /* 0x3e5da71b7bf7861d */
7.08169709942321478061e-09, /* 0x3e3e6a6886b09760 */
5.77453510221151779025e-08, /* 0x3e6f0075eab0ef64 */
4.43021445893361960146e-09, /* 0x3e33071282fb989b */
3.15140984357495864573e-08, /* 0x3e60eb43c3f1bed2 */
2.95077445089736670973e-08, /* 0x3e5faf06ecb35c84 */
1.44098510263167149349e-08, /* 0x3e4ef1e63db35f68 */
1.05196987538551827693e-08, /* 0x3e469743fb1a71a5 */
5.23641361722697546261e-08, /* 0x3e6c1cdf404e5796 */
7.72099925253243069458e-09, /* 0x3e4094aa0ada625e */
5.62089493829364197156e-08, /* 0x3e6e2d4c96fde3ec */
3.53090261098577946927e-08, /* 0x3e62f4d5e9a98f34 */
3.80080516835568242269e-08, /* 0x3e6467c96ecc5cbe */
5.66961038386146408282e-08, /* 0x3e6e7040d03dec5a */
4.42287063097349852717e-08, /* 0x3e67bebf4282de36 */
3.45294525105681104660e-08, /* 0x3e6289b11aeb783f */
2.47132034530447431509e-08, /* 0x3e5a891d1772f538 */
3.59655343422487209774e-08, /* 0x3e634f10be1fb591 */
5.51581770357780862071e-08, /* 0x3e6d9ce1d316eb93 */
3.60171867511861372793e-08, /* 0x3e63562a19a9c442 */
1.94511067964296180547e-08, /* 0x3e54e2adf548084c */
1.54137376631349347838e-08, /* 0x3e508ce55cc8c97a */
3.93171034490174464173e-09, /* 0x3e30e2f613e85bda */
5.52990607758839766440e-08, /* 0x3e6db03ebb0227bf */
3.29990737637586136511e-08, /* 0x3e61b75bb09cb098 */
1.18436010922446096216e-08, /* 0x3e496f16abb9df22 */
4.04248680368301346709e-08, /* 0x3e65b3f399411c62 */
2.27418915900284316293e-08, /* 0x3e586b3e59f65355 */
1.70263791333409206020e-08, /* 0x3e52482ceae1ac12 */
5.76999904754328540596e-08}; /* 0x3e6efa39ef35793c */
/* Approximating polynomial coefficients for x near 1.0 */
static const double
ca_1 = 8.33333333333317923934e-02, /* 0x3fb55555555554e6 */
ca_2 = 1.25000000037717509602e-02, /* 0x3f89999999bac6d4 */
ca_3 = 2.23213998791944806202e-03, /* 0x3f62492307f1519f */
ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */
/* Approximating polynomial coefficients for other x */
static const double
cb_1 = 8.33333333333333593622e-02, /* 0x3fb5555555555557 */
cb_2 = 1.24999999978138668903e-02, /* 0x3f89999999865ede */
cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */
static const unsigned long long
log_thresh1 = 0x3fee0faa00000000,
log_thresh2 = 0x3ff1082c00000000;
/* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */
if (ux >= log_thresh1 && ux <= log_thresh2)
{
/* Arguments close to 1.0 are handled separately to maintain
accuracy.
The approximation in this region exploits the identity
log( 1 + r ) = log( 1 + u/2 ) / log( 1 - u/2 ), where
u = 2r / (2+r).
Note that the right hand side has an odd Taylor series expansion
which converges much faster than the Taylor series expansion of
log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by
u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1).
One subtlety is that since u cannot be calculated from
r exactly, the rounding error in the first u should be
avoided if possible. To accomplish this, we observe that
u = r - r*r/(2+r).
Since x (=1+r) is the input argument, and thus presumed exact,
the formula above approximates u accurately because
u = r - correction,
and the magnitude of "correction" (of the order of r*r)
is small.
With these observations, we will approximate log( 1 + r ) by
r + ( (A1*u^3 + ... + An*u^(2n+1)) - correction ).
We approximate log(1+r) by an odd polynomial in u, where
u = 2r/(2+r) = r - r*r/(2+r).
*/
r = x - 1.0;
u = r / (2.0 + r);
correction = r * u;
u = u + u;
v = u * u;
z1 = r;
z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction);
*r1 = z1;
*r2 = z2;
*xexp = 0;
}
else
{
/*
First, we decompose the argument x to the form
x = 2**M * (F1 + F2),
where 1 <= F1+F2 < 2, M has the value of an integer,
F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128.
Second, we approximate log( 1 + F2/F1 ) by an odd polynomial
in U, where U = 2 F2 / (2 F2 + F1).
Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ).
The core approximation calculates
Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U - 1.
Note that log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ),
thus, Poly = 2 arctanh( U/2 ) / U - 1.
It is not hard to see that
log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ).
Hence, we return Z1 = log(F1), and Z2 = log( 1 + F2/F1).
The values of log(F1) are calculated beforehand and stored
in the program.
*/
f = x;
if (ux < IMPBIT_DP64)
{
/* The input argument x is denormalized */
/* Normalize f by increasing the exponent by 60
and subtracting a correction to account for the implicit
bit. This replaces a slow denormalized
multiplication by a fast normal subtraction. */
static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
GET_BITS_DP64(f, ux);
ux |= 0x03d0000000000000;
PUT_BITS_DP64(ux, f);
f -= corr;
GET_BITS_DP64(f, ux);
expadjust = 60;
}
else
expadjust = 0;
/* Store the exponent of x in xexp and put
f into the range [0.5,1) */
*xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust;
PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f);
/* Now x = 2**xexp * f, 1/2 <= f < 1. */
/* Set index to be the nearest integer to 128*f */
r = 128.0 * f;
index = (int)(r + 0.5);
z1 = ln_lead_table[index-64];
q = ln_tail_table[index-64];
f1 = index * 0.0078125; /* 0.0078125 = 1/128 */
f2 = f - f1;
/* At this point, x = 2**xexp * ( f1 + f2 ) where
f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */
/* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */
/* u = f2 / (f1 + 0.5 * f2); */
u = f2 / (f1 + 0.5 * f2);
/* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1).
The core approximation calculates
poly = [log(1 + u/2) - log(1 - u/2)]/u - 1 */
v = u * u;
poly = (v * (cb_1 + v * (cb_2 + v * cb_3)));
z2 = q + (u + u * poly);
*r1 = z1;
*r2 = z2;
}
return;
}
#endif /* USE_LOG_KERNEL_AMD */
#if defined(USE_REMAINDER_PIBY2F_INLINE)
/* Define this to get debugging print statements activated */
#define DEBUGGING_PRINT
#undef DEBUGGING_PRINT
#ifdef DEBUGGING_PRINT
#include <stdio.h>
char *d2b(long long d, int bitsper, int point)
{
static char buff[200];
int i, j;
j = bitsper;
if (point >= 0 && point <= bitsper)
j++;
buff[j] = '\0';
for (i = bitsper - 1; i >= 0; i--)
{
j--;
if (d % 2 == 1)
buff[j] = '1';
else
buff[j] = '0';
if (i == point)
{
j--;
buff[j] = '.';
}
d /= 2;
}
return buff;
}
#endif
/* Given positive argument x, reduce it to the range [-pi/4,pi/4] using
extra precision, and return the result in r.
Return value "region" tells how many lots of pi/2 were subtracted
from x to put it in the range [-pi/4,pi/4], mod 4. */
static inline void __remainder_piby2f_inline(unsigned long long ux, double *r, int *region)
{
/* This method simulates multi-precision floating-point
arithmetic and is accurate for all 1 <= x < infinity */
#define bitsper 36
unsigned long long res[10];
unsigned long long u, carry, mask, mant, nextbits;
int first, last, i, rexp, xexp, resexp, ltb, determ, bc;
double dx;
static const double
piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
#ifdef WINDOWS
static unsigned long long pibits[] =
{
0LL,
5215LL, 13000023176LL, 11362338026LL, 67174558139LL,
34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL,
19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL
};
#else
static unsigned long long pibits[] =
{
0L,
5215L, 13000023176L, 11362338026L, 67174558139L,
34819822259L, 10612056195L, 67816420731L, 57840157550L,
19558516809L, 50025467026L, 25186875954L, 18152700886L
};
#endif
#ifdef DEBUGGING_PRINT
printf("On entry, x = %25.20e = %s\n", x, double2hex(&x));
#endif
xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29;
#ifdef DEBUGGING_PRINT
printf("ux = %s\n", d2b(ux, 64, -1));
#endif
/* Now ux is the mantissa bit pattern of x as a long integer */
mask = 1;
mask = (mask << bitsper) - 1;
/* Set first and last to the positions of the first
and last chunks of 2/pi that we need */
first = xexp / bitsper;
resexp = xexp - first * bitsper;
/* 120 is the theoretical maximum number of bits (actually
115 for IEEE single precision) that we need to extract
from the middle of 2/pi to compute the reduced argument
accurately enough for our purposes */
last = first + 120 / bitsper;
#ifdef DEBUGGING_PRINT
printf("first = %d, last = %d\n", first, last);
#endif
/* Do a long multiplication of the bits of 2/pi by the
integer mantissa */
/* Unroll the loop. This is only correct because we know
that bitsper is fixed as 36. */
res[4] = 0;
u = pibits[last] * ux;
res[3] = u & mask;
carry = u >> bitsper;
u = pibits[last - 1] * ux + carry;
res[2] = u & mask;
carry = u >> bitsper;
u = pibits[last - 2] * ux + carry;
res[1] = u & mask;
carry = u >> bitsper;
u = pibits[first] * ux + carry;
res[0] = u & mask;
#ifdef DEBUGGING_PRINT
printf("resexp = %d\n", resexp);
printf("Significant part of x * 2/pi with binary"
" point in correct place:\n");
for (i = 0; i <= last - first; i++)
{
if (i > 0 && i % 5 == 0)
printf("\n ");
if (i == 1)
printf("%s ", d2b(res[i], bitsper, resexp));
else
printf("%s ", d2b(res[i], bitsper, -1));
}
printf("\n");
#endif
/* Reconstruct the result */
ltb = (int)((((res[0] << bitsper) | res[1])
>> (bitsper - 1 - resexp)) & 7);
/* determ says whether the fractional part is >= 0.5 */
determ = ltb & 1;
#ifdef DEBUGGING_PRINT
printf("ltb = %d (last two bits before binary point"
" and first bit after)\n", ltb);
printf("determ = %d (1 means need to negate because the fractional\n"
" part of x * 2/pi is greater than 0.5)\n", determ);
#endif
i = 1;
if (determ)
{
/* The mantissa is >= 0.5. We want to subtract it
from 1.0 by negating all the bits */
*region = ((ltb >> 1) + 1) & 3;
mant = 1;
mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
while (mant < 0x0000000000010000)
{
i++;
mant = (mant << bitsper) | (~(res[i]) & mask);
}
nextbits = (~(res[i+1]) & mask);
}
else
{
*region = (ltb >> 1);
mant = 1;
mant = res[1] & ((mant << (bitsper - resexp)) - 1);
while (mant < 0x0000000000010000)
{
i++;
mant = (mant << bitsper) | res[i];
}
nextbits = res[i+1];
}
#ifdef DEBUGGING_PRINT
printf("First bits of mant = %s\n", d2b(mant, bitsper, -1));
#endif
/* Normalize the mantissa. The shift value 6 here, determined by
trial and error, seems to give optimal speed. */
bc = 0;
while (mant < 0x0000400000000000LL)
{
bc += 6;
mant <<= 6;
}
while (mant < 0x0010000000000000LL)
{
bc++;
mant <<= 1;
}
mant |= nextbits >> (bitsper - bc);
rexp = 52 + resexp - bc - i * bitsper;
#ifdef DEBUGGING_PRINT
printf("Normalised mantissa = 0x%016lx\n", mant);
printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp);
#endif
/* Put the result exponent rexp onto the mantissa pattern */
u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
ux = (mant & MANTBITS_DP64) | u;
if (determ)
/* If we negated the mantissa we negate x too */
ux |= SIGNBIT_DP64;
PUT_BITS_DP64(ux, dx);
#ifdef DEBUGGING_PRINT
printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx));
#endif
/* x is a double precision version of the fractional part of
x * 2 / pi. Multiply x by pi/2 in double precision
to get the reduced argument r. */
*r = dx * piby2;
#ifdef DEBUGGING_PRINT
printf(" r = frac(x*2/pi) * pi/2:\n");
printf(" r = %25.20e = %s\n", *r, double2hex(r));
printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n",
*region);
#endif
}
#endif /* USE_REMAINDER_PIBY2F_INLINE */
#if defined(WINDOWS)
#if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)
#include <errno.h>
#endif
#if defined(USE_HANDLE_ERROR)
/* Define the Microsoft specific error handling routines */
static __declspec(noinline) double handle_error(const char *name,
unsigned long long value,
int type, int flags, int error,
double arg1, double arg2)
{
double z;
struct _exception exception_data;
exception_data.type = type;
exception_data.name = (char*)name;
exception_data.arg1 = arg1;
exception_data.arg2 = arg2;
PUT_BITS_DP64(value, z);
exception_data.retval = z;
raise_fpsw_flags(flags);
if (!_matherr(&exception_data))
{
errno = error;
}
return exception_data.retval;
}
#endif /* USE_HANDLE_ERROR */
#if defined(USE_HANDLE_ERRORF)
static __declspec(noinline) float handle_errorf(const char *name,
unsigned int value,
int type, int flags, int error,
float arg1, float arg2)
{
float z;
struct _exception exception_data;
exception_data.type = type;
exception_data.name = (char*)name;
exception_data.arg1 = (double)arg1;
exception_data.arg2 = (double)arg2;
PUT_BITS_SP32(value, z);
exception_data.retval = z;
raise_fpsw_flags(flags);
if (!_matherr(&exception_data))
{
errno = error;
}
return (float)exception_data.retval;
}
#endif /* USE_HANDLE_ERRORF */
#endif /* WINDOWS */
#endif /* LIBM_INLINES_AMD_H_INCLUDED */