| |
| /* |
| * Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| * |
| * This file is part of libacml_mv. |
| * |
| * libacml_mv is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * libacml_mv is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with libacml_mv. If not, see |
| * <http://www.gnu.org/licenses/>. |
| * |
| */ |
| |
| |
| #ifndef LIBM_INLINES_AMD_H_INCLUDED |
| #define LIBM_INLINES_AMD_H_INCLUDED 1 |
| |
| #include "libm_util_amd.h" |
| #include <math.h> |
| |
| #ifdef WINDOWS |
| #define inline __inline |
| #include "emmintrin.h" |
| #endif |
| |
| /* Compile-time verification that type long is the same size |
| as type double (i.e. we are really on a 64-bit machine) */ |
| void check_long_against_double_size(int machine_is_64_bit[(sizeof(long long) == sizeof(double))?1:-1]); |
| |
| /* Set defines for inline functions calling other inlines */ |
| #if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \ |
| defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \ |
| defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \ |
| defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \ |
| defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \ |
| defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \ |
| (defined(WINDOWS) && (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF))) |
| #undef USE_RAISE_FPSW_FLAGS |
| #define USE_RAISE_FPSW_FLAGS 1 |
| #endif |
| |
| #if defined(USE_SPLITDOUBLE) |
| /* Splits double x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. |
| Assumes that x is not zero, denormal, infinity or NaN, but these conditions |
| are not checked */ |
| static inline void splitDouble(double x, int *e, double *m) |
| { |
| unsigned long long ux, uy; |
| GET_BITS_DP64(x, ux); |
| uy = ux; |
| ux &= EXPBITS_DP64; |
| ux >>= EXPSHIFTBITS_DP64; |
| *e = (int)ux - EXPBIAS_DP64 + 1; |
| uy = (uy & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64; |
| PUT_BITS_DP64(uy, x); |
| *m = x; |
| } |
| #endif /* USE_SPLITDOUBLE */ |
| |
| |
| #if defined(USE_SPLITDOUBLE_2) |
| /* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0. |
| Assumes that x is not zero, denormal, infinity or NaN, but these conditions |
| are not checked. Also assumes EXPBIAS_DP is odd. With this |
| assumption, e will be even on exit. */ |
| static inline void splitDouble_2(double x, int *e, double *m) |
| { |
| unsigned long long ux, vx; |
| GET_BITS_DP64(x, ux); |
| vx = ux; |
| ux &= EXPBITS_DP64; |
| ux >>= EXPSHIFTBITS_DP64; |
| if (ux & 1) |
| { |
| /* The exponent is odd */ |
| vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64; |
| PUT_BITS_DP64(vx, x); |
| *m = x; |
| *e = ux - EXPBIAS_DP64; |
| } |
| else |
| { |
| /* The exponent is even */ |
| vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64; |
| PUT_BITS_DP64(vx, x); |
| *m = x; |
| *e = ux - EXPBIAS_DP64 - 1; |
| } |
| } |
| #endif /* USE_SPLITDOUBLE_2 */ |
| |
| |
| #if defined(USE_SPLITFLOAT) |
| /* Splits float x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. |
| Assumes that x is not zero, denormal, infinity or NaN, but these conditions |
| are not checked */ |
| static inline void splitFloat(float x, int *e, float *m) |
| { |
| unsigned int ux, uy; |
| GET_BITS_SP32(x, ux); |
| uy = ux; |
| ux &= EXPBITS_SP32; |
| ux >>= EXPSHIFTBITS_SP32; |
| *e = (int)ux - EXPBIAS_SP32 + 1; |
| uy = (uy & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32; |
| PUT_BITS_SP32(uy, x); |
| *m = x; |
| } |
| #endif /* USE_SPLITFLOAT */ |
| |
| |
| #if defined(USE_SCALEDOUBLE_1) |
| /* Scales the double x by 2.0**n. |
| Assumes EMIN <= n <= EMAX, though this condition is not checked. */ |
| static inline double scaleDouble_1(double x, int n) |
| { |
| double t; |
| /* Construct the number t = 2.0**n */ |
| PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t); |
| return x*t; |
| } |
| #endif /* USE_SCALEDOUBLE_1 */ |
| |
| |
| #if defined(USE_SCALEDOUBLE_2) |
| /* Scales the double x by 2.0**n. |
| Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ |
| static inline double scaleDouble_2(double x, int n) |
| { |
| double t1, t2; |
| int n1, n2; |
| n1 = n / 2; |
| n2 = n - n1; |
| /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ |
| PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); |
| PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); |
| return (x*t1)*t2; |
| } |
| #endif /* USE_SCALEDOUBLE_2 */ |
| |
| |
| #if defined(USE_SCALEDOUBLE_3) |
| /* Scales the double x by 2.0**n. |
| Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ |
| static inline double scaleDouble_3(double x, int n) |
| { |
| double t1, t2, t3; |
| int n1, n2, n3; |
| n1 = n / 3; |
| n2 = (n - n1) / 2; |
| n3 = n - n1 - n2; |
| /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ |
| PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); |
| PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); |
| PUT_BITS_DP64(((long long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3); |
| return ((x*t1)*t2)*t3; |
| } |
| #endif /* USE_SCALEDOUBLE_3 */ |
| |
| |
| #if defined(USE_SCALEFLOAT_1) |
| /* Scales the float x by 2.0**n. |
| Assumes EMIN <= n <= EMAX, though this condition is not checked. */ |
| static inline float scaleFloat_1(float x, int n) |
| { |
| float t; |
| /* Construct the number t = 2.0**n */ |
| PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t); |
| return x*t; |
| } |
| #endif /* USE_SCALEFLOAT_1 */ |
| |
| |
| #if defined(USE_SCALEFLOAT_2) |
| /* Scales the float x by 2.0**n. |
| Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ |
| static inline float scaleFloat_2(float x, int n) |
| { |
| float t1, t2; |
| int n1, n2; |
| n1 = n / 2; |
| n2 = n - n1; |
| /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ |
| PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); |
| PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); |
| return (x*t1)*t2; |
| } |
| #endif /* USE_SCALEFLOAT_2 */ |
| |
| |
| #if defined(USE_SCALEFLOAT_3) |
| /* Scales the float x by 2.0**n. |
| Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ |
| static inline float scaleFloat_3(float x, int n) |
| { |
| float t1, t2, t3; |
| int n1, n2, n3; |
| n1 = n / 3; |
| n2 = (n - n1) / 2; |
| n3 = n - n1 - n2; |
| /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ |
| PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); |
| PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); |
| PUT_BITS_SP32((n3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t3); |
| return ((x*t1)*t2)*t3; |
| } |
| #endif /* USE_SCALEFLOAT_3 */ |
| |
| #if defined(USE_SETPRECISIONDOUBLE) |
| unsigned int setPrecisionDouble(void) |
| { |
| unsigned int cw, cwold = 0; |
| /* There is no precision control on Hammer */ |
| return cwold; |
| } |
| #endif /* USE_SETPRECISIONDOUBLE */ |
| |
| #if defined(USE_RESTOREPRECISION) |
| void restorePrecision(unsigned int cwold) |
| { |
| #if defined(WINDOWS) |
| /* There is no precision control on Hammer */ |
| #elif defined(linux) |
| /* There is no precision control on Hammer */ |
| #else |
| #error Unknown machine |
| #endif |
| return; |
| } |
| #endif /* USE_RESTOREPRECISION */ |
| |
| |
| #if defined(USE_CLEAR_FPSW_FLAGS) |
| /* Clears floating-point status flags. The argument should be |
| the bitwise or of the flags to be cleared, from the |
| list above, e.g. |
| clear_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID); |
| */ |
| static inline void clear_fpsw_flags(int flags) |
| { |
| #if defined(WINDOWS) |
| unsigned int cw = _mm_getcsr(); |
| cw &= (~flags); |
| _mm_setcsr(cw); |
| #elif defined(linux) |
| unsigned int cw; |
| /* Get the current floating-point control/status word */ |
| asm volatile ("STMXCSR %0" : "=m" (cw)); |
| cw &= (~flags); |
| asm volatile ("LDMXCSR %0" : : "m" (cw)); |
| #else |
| #error Unknown machine |
| #endif |
| } |
| #endif /* USE_CLEAR_FPSW_FLAGS */ |
| |
| |
| #if defined(USE_RAISE_FPSW_FLAGS) |
| /* Raises floating-point status flags. The argument should be |
| the bitwise or of the flags to be raised, from the |
| list above, e.g. |
| raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID); |
| */ |
| static inline void raise_fpsw_flags(int flags) |
| { |
| #if defined(WINDOWS) |
| _mm_setcsr(_mm_getcsr() | flags); |
| #elif defined(linux) |
| unsigned int cw; |
| /* Get the current floating-point control/status word */ |
| asm volatile ("STMXCSR %0" : "=m" (cw)); |
| cw |= flags; |
| asm volatile ("LDMXCSR %0" : : "m" (cw)); |
| #else |
| #error Unknown machine |
| #endif |
| } |
| #endif /* USE_RAISE_FPSW_FLAGS */ |
| |
| |
| #if defined(USE_GET_FPSW_INLINE) |
| /* Return the current floating-point status word */ |
| static inline unsigned int get_fpsw_inline(void) |
| { |
| #if defined(WINDOWS) |
| return _mm_getcsr(); |
| #elif defined(linux) |
| unsigned int sw; |
| asm volatile ("STMXCSR %0" : "=m" (sw)); |
| return sw; |
| #else |
| #error Unknown machine |
| #endif |
| } |
| #endif /* USE_GET_FPSW_INLINE */ |
| |
| #if defined(USE_SET_FPSW_INLINE) |
| /* Set the floating-point status word */ |
| static inline void set_fpsw_inline(unsigned int sw) |
| { |
| #if defined(WINDOWS) |
| _mm_setcsr(sw); |
| #elif defined(linux) |
| /* Set the current floating-point control/status word */ |
| asm volatile ("LDMXCSR %0" : : "m" (sw)); |
| #else |
| #error Unknown machine |
| #endif |
| } |
| #endif /* USE_SET_FPSW_INLINE */ |
| |
| #if defined(USE_CLEAR_FPSW_INLINE) |
| /* Clear all exceptions from the floating-point status word */ |
| static inline void clear_fpsw_inline(void) |
| { |
| #if defined(WINDOWS) |
| unsigned int cw; |
| cw = _mm_getcsr(); |
| cw &= ~(AMD_F_INEXACT | AMD_F_UNDERFLOW | AMD_F_OVERFLOW | |
| AMD_F_DIVBYZERO | AMD_F_INVALID); |
| _mm_setcsr(cw); |
| #elif defined(linux) |
| unsigned int cw; |
| /* Get the current floating-point control/status word */ |
| asm volatile ("STMXCSR %0" : "=m" (cw)); |
| cw &= ~(AMD_F_INEXACT | AMD_F_UNDERFLOW | AMD_F_OVERFLOW | |
| AMD_F_DIVBYZERO | AMD_F_INVALID); |
| asm volatile ("LDMXCSR %0" : : "m" (cw)); |
| #else |
| #error Unknown machine |
| #endif |
| } |
| #endif /* USE_CLEAR_FPSW_INLINE */ |
| |
| |
| #if defined(USE_VAL_WITH_FLAGS) |
| /* Returns a double value after raising the given flags, |
| e.g. val_with_flags(x, AMD_F_INEXACT); |
| */ |
| static inline double val_with_flags(double val, int flags) |
| { |
| raise_fpsw_flags(flags); |
| return val; |
| } |
| #endif /* USE_VAL_WITH_FLAGS */ |
| |
| #if defined(USE_VALF_WITH_FLAGS) |
| /* Returns a float value after raising the given flags, |
| e.g. valf_with_flags(x, AMD_F_INEXACT); |
| */ |
| static inline float valf_with_flags(float val, int flags) |
| { |
| raise_fpsw_flags(flags); |
| return val; |
| } |
| #endif /* USE_VALF_WITH_FLAGS */ |
| |
| |
| #if defined(USE_ZERO_WITH_FLAGS) |
| /* Returns a double +zero after raising the given flags, |
| e.g. zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID); |
| */ |
| static inline double zero_with_flags(int flags) |
| { |
| raise_fpsw_flags(flags); |
| return 0.0; |
| } |
| #endif /* USE_ZERO_WITH_FLAGS */ |
| |
| |
| #if defined(USE_ZEROF_WITH_FLAGS) |
| /* Returns a float +zero after raising the given flags, |
| e.g. zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID); |
| */ |
| static inline float zerof_with_flags(int flags) |
| { |
| raise_fpsw_flags(flags); |
| return 0.0F; |
| } |
| #endif /* USE_ZEROF_WITH_FLAGS */ |
| |
| |
| #if defined(USE_NAN_WITH_FLAGS) |
| /* Returns a double quiet +nan after raising the given flags, |
| e.g. nan_with_flags(AMD_F_INVALID); |
| */ |
| static inline double nan_with_flags(int flags) |
| { |
| double z; |
| raise_fpsw_flags(flags); |
| PUT_BITS_DP64(0x7ff8000000000000, z); |
| return z; |
| } |
| #endif /* USE_NAN_WITH_FLAGS */ |
| |
| #if defined(USE_NANF_WITH_FLAGS) |
| /* Returns a float quiet +nan after raising the given flags, |
| e.g. nanf_with_flags(AMD_F_INVALID); |
| */ |
| static inline float nanf_with_flags(int flags) |
| { |
| float z; |
| raise_fpsw_flags(flags); |
| PUT_BITS_SP32(0x7fc00000, z); |
| return z; |
| } |
| #endif /* USE_NANF_WITH_FLAGS */ |
| |
| |
| #if defined(USE_INDEFINITE_WITH_FLAGS) |
| /* Returns a double indefinite after raising the given flags, |
| e.g. indefinite_with_flags(AMD_F_INVALID); |
| */ |
| static inline double indefinite_with_flags(int flags) |
| { |
| double z; |
| raise_fpsw_flags(flags); |
| PUT_BITS_DP64(0xfff8000000000000, z); |
| return z; |
| } |
| #endif /* USE_INDEFINITE_WITH_FLAGS */ |
| |
| #if defined(USE_INDEFINITEF_WITH_FLAGS) |
| /* Returns a float quiet +indefinite after raising the given flags, |
| e.g. indefinitef_with_flags(AMD_F_INVALID); |
| */ |
| static inline float indefinitef_with_flags(int flags) |
| { |
| float z; |
| raise_fpsw_flags(flags); |
| PUT_BITS_SP32(0xffc00000, z); |
| return z; |
| } |
| #endif /* USE_INDEFINITEF_WITH_FLAGS */ |
| |
| |
| #ifdef USE_INFINITY_WITH_FLAGS |
| /* Returns a positive double infinity after raising the given flags, |
| e.g. infinity_with_flags(AMD_F_OVERFLOW); |
| */ |
| static inline double infinity_with_flags(int flags) |
| { |
| double z; |
| raise_fpsw_flags(flags); |
| PUT_BITS_DP64((unsigned long long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, z); |
| return z; |
| } |
| #endif /* USE_INFINITY_WITH_FLAGS */ |
| |
| #ifdef USE_INFINITYF_WITH_FLAGS |
| /* Returns a positive float infinity after raising the given flags, |
| e.g. infinityf_with_flags(AMD_F_OVERFLOW); |
| */ |
| static inline float infinityf_with_flags(int flags) |
| { |
| float z; |
| raise_fpsw_flags(flags); |
| PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, z); |
| return z; |
| } |
| #endif /* USE_INFINITYF_WITH_FLAGS */ |
| |
| |
| #if defined(USE_SPLITEXP) |
| /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). |
| Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments |
| abs(x) > large/(ln(base)) (where large is the largest representable |
| floating point number) should be handled separately instead of calling |
| this function. This function is called by exp_amd, exp2_amd, exp10_amd, |
| cosh_amd and sinh_amd. */ |
| static inline void splitexp(double x, double logbase, |
| double thirtytwo_by_logbaseof2, |
| double logbaseof2_by_32_lead, |
| double logbaseof2_by_32_trail, |
| int *m, double *z1, double *z2) |
| { |
| double q, r, r1, r2, f1, f2; |
| int n, j; |
| |
| /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain |
| leading and trailing parts respectively of precomputed |
| values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. |
| two_to_jby32_lead_table contains the first 25 bits of precision, |
| and two_to_jby32_trail_table contains a further 53 bits precision. */ |
| |
| static const double two_to_jby32_lead_table[32] = { |
| 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ |
| 1.02189713716506958008e+00, /* 0x3ff059b0d0000000 */ |
| 1.04427373409271240234e+00, /* 0x3ff0b55860000000 */ |
| 1.06714040040969848633e+00, /* 0x3ff11301d0000000 */ |
| 1.09050768613815307617e+00, /* 0x3ff172b830000000 */ |
| 1.11438673734664916992e+00, /* 0x3ff1d48730000000 */ |
| 1.13878858089447021484e+00, /* 0x3ff2387a60000000 */ |
| 1.16372483968734741211e+00, /* 0x3ff29e9df0000000 */ |
| 1.18920707702636718750e+00, /* 0x3ff306fe00000000 */ |
| 1.21524733304977416992e+00, /* 0x3ff371a730000000 */ |
| 1.24185776710510253906e+00, /* 0x3ff3dea640000000 */ |
| 1.26905095577239990234e+00, /* 0x3ff44e0860000000 */ |
| 1.29683953523635864258e+00, /* 0x3ff4bfdad0000000 */ |
| 1.32523661851882934570e+00, /* 0x3ff5342b50000000 */ |
| 1.35425549745559692383e+00, /* 0x3ff5ab07d0000000 */ |
| 1.38390988111495971680e+00, /* 0x3ff6247eb0000000 */ |
| 1.41421353816986083984e+00, /* 0x3ff6a09e60000000 */ |
| 1.44518077373504638672e+00, /* 0x3ff71f75e0000000 */ |
| 1.47682613134384155273e+00, /* 0x3ff7a11470000000 */ |
| 1.50916439294815063477e+00, /* 0x3ff8258990000000 */ |
| 1.54221081733703613281e+00, /* 0x3ff8ace540000000 */ |
| 1.57598084211349487305e+00, /* 0x3ff93737b0000000 */ |
| 1.61049032211303710938e+00, /* 0x3ff9c49180000000 */ |
| 1.64575546979904174805e+00, /* 0x3ffa5503b0000000 */ |
| 1.68179279565811157227e+00, /* 0x3ffae89f90000000 */ |
| 1.71861928701400756836e+00, /* 0x3ffb7f76f0000000 */ |
| 1.75625211000442504883e+00, /* 0x3ffc199bd0000000 */ |
| 1.79470902681350708008e+00, /* 0x3ffcb720d0000000 */ |
| 1.83400803804397583008e+00, /* 0x3ffd5818d0000000 */ |
| 1.87416762113571166992e+00, /* 0x3ffdfc9730000000 */ |
| 1.91520655155181884766e+00, /* 0x3ffea4afa0000000 */ |
| 1.95714408159255981445e+00}; /* 0x3fff507650000000 */ |
| |
| static const double two_to_jby32_trail_table[32] = { |
| 0.00000000000000000000e+00, /* 0x0000000000000000 */ |
| 1.14890470981563546737e-08, /* 0x3e48ac2ba1d73e2a */ |
| 4.83347014379782142328e-08, /* 0x3e69f3121ec53172 */ |
| 2.67125131841396124714e-10, /* 0x3df25b50a4ebbf1b */ |
| 4.65271045830351350190e-08, /* 0x3e68faa2f5b9bef9 */ |
| 5.24924336638693782574e-09, /* 0x3e368b9aa7805b80 */ |
| 5.38622214388600821910e-08, /* 0x3e6ceac470cd83f6 */ |
| 1.90902301017041969782e-08, /* 0x3e547f7b84b09745 */ |
| 3.79763538792174980894e-08, /* 0x3e64636e2a5bd1ab */ |
| 2.69306947081946450986e-08, /* 0x3e5ceaa72a9c5154 */ |
| 4.49683815095311756138e-08, /* 0x3e682468446b6824 */ |
| 1.41933332021066904914e-09, /* 0x3e18624b40c4dbd0 */ |
| 1.94146510233556266402e-08, /* 0x3e54d8a89c750e5e */ |
| 2.46409119489264118569e-08, /* 0x3e5a753e077c2a0f */ |
| 4.94812958044698886494e-08, /* 0x3e6a90a852b19260 */ |
| 8.48872238075784476136e-10, /* 0x3e0d2ac258f87d03 */ |
| 2.42032342089579394887e-08, /* 0x3e59fcef32422cbf */ |
| 3.32420002333182569170e-08, /* 0x3e61d8bee7ba46e2 */ |
| 1.45956577586525322754e-08, /* 0x3e4f580c36bea881 */ |
| 3.46452721050003920866e-08, /* 0x3e62999c25159f11 */ |
| 8.07090469079979051284e-09, /* 0x3e415506dadd3e2a */ |
| 2.99439161340839520436e-09, /* 0x3e29b8bc9e8a0388 */ |
| 9.83621719880452147153e-09, /* 0x3e451f8480e3e236 */ |
| 8.35492309647188080486e-09, /* 0x3e41f12ae45a1224 */ |
| 3.48493175137966283582e-08, /* 0x3e62b5a75abd0e6a */ |
| 1.11084703472699692902e-08, /* 0x3e47daf237553d84 */ |
| 5.03688744342840346564e-08, /* 0x3e6b0aa538444196 */ |
| 4.81896001063495806249e-08, /* 0x3e69df20d22a0798 */ |
| 4.83653666334089557746e-08, /* 0x3e69f7490e4bb40b */ |
| 1.29745882314081237628e-08, /* 0x3e4bdcdaf5cb4656 */ |
| 9.84532844621636118964e-09, /* 0x3e452486cc2c7b9d */ |
| 4.25828404545651943883e-08}; /* 0x3e66dc8a80ce9f09 */ |
| |
| /* |
| Step 1. Reduce the argument. |
| |
| To perform argument reduction, we find the integer n such that |
| x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. |
| n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and |
| remainder by x - n*logbaseof2/32. The calculation of n is |
| straightforward whereas the computation of x - n*logbaseof2/32 |
| must be carried out carefully. |
| logbaseof2/32 is so represented in two pieces that |
| (1) logbaseof2/32 is known to extra precision, (2) the product |
| of n and the leading piece is a model number and is hence |
| calculated without error, and (3) the subtraction of the value |
| obtained in (2) from x is a model number and is hence again |
| obtained without error. |
| */ |
| |
| r = x * thirtytwo_by_logbaseof2; |
| /* Set n = nearest integer to r */ |
| /* This is faster on Hammer */ |
| if (r > 0) |
| n = (int)(r + 0.5); |
| else |
| n = (int)(r - 0.5); |
| |
| r1 = x - n * logbaseof2_by_32_lead; |
| r2 = - n * logbaseof2_by_32_trail; |
| |
| /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ |
| /* j = n % 32; |
| if (j < 0) j += 32; */ |
| j = n & 0x0000001f; |
| |
| f1 = two_to_jby32_lead_table[j]; |
| f2 = two_to_jby32_trail_table[j]; |
| |
| *m = (n - j) / 32; |
| |
| /* Step 2. The following is the core approximation. We approximate |
| exp(r1+r2)-1 by a polynomial. */ |
| |
| r1 *= logbase; r2 *= logbase; |
| |
| r = r1 + r2; |
| q = r1 + (r2 + |
| r*r*( 5.00000000000000008883e-01 + |
| r*( 1.66666666665260878863e-01 + |
| r*( 4.16666666662260795726e-02 + |
| r*( 8.33336798434219616221e-03 + |
| r*( 1.38889490863777199667e-03 )))))); |
| |
| /* Step 3. Function value reconstruction. |
| We now reconstruct the exponential of the input argument |
| so that exp(x) = 2**m * (z1 + z2). |
| The order of the computation below must be strictly observed. */ |
| |
| *z1 = f1; |
| *z2 = f2 + ((f1 + f2) * q); |
| } |
| #endif /* USE_SPLITEXP */ |
| |
| |
| #if defined(USE_SPLITEXPF) |
| /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). |
| Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments |
| abs(x) > large/(ln(base)) (where large is the largest representable |
| floating point number) should be handled separately instead of calling |
| this function. This function is called by exp_amd, exp2_amd, exp10_amd, |
| cosh_amd and sinh_amd. */ |
| static inline void splitexpf(float x, float logbase, |
| float thirtytwo_by_logbaseof2, |
| float logbaseof2_by_32_lead, |
| float logbaseof2_by_32_trail, |
| int *m, float *z1, float *z2) |
| { |
| float q, r, r1, r2, f1, f2; |
| int n, j; |
| |
| /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain |
| leading and trailing parts respectively of precomputed |
| values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. |
| two_to_jby32_lead_table contains the first 10 bits of precision, |
| and two_to_jby32_trail_table contains a further 24 bits precision. */ |
| |
| static const float two_to_jby32_lead_table[32] = { |
| 1.0000000000E+00F, /* 0x3F800000 */ |
| 1.0214843750E+00F, /* 0x3F82C000 */ |
| 1.0429687500E+00F, /* 0x3F858000 */ |
| 1.0664062500E+00F, /* 0x3F888000 */ |
| 1.0898437500E+00F, /* 0x3F8B8000 */ |
| 1.1132812500E+00F, /* 0x3F8E8000 */ |
| 1.1386718750E+00F, /* 0x3F91C000 */ |
| 1.1621093750E+00F, /* 0x3F94C000 */ |
| 1.1875000000E+00F, /* 0x3F980000 */ |
| 1.2148437500E+00F, /* 0x3F9B8000 */ |
| 1.2402343750E+00F, /* 0x3F9EC000 */ |
| 1.2675781250E+00F, /* 0x3FA24000 */ |
| 1.2949218750E+00F, /* 0x3FA5C000 */ |
| 1.3242187500E+00F, /* 0x3FA98000 */ |
| 1.3535156250E+00F, /* 0x3FAD4000 */ |
| 1.3828125000E+00F, /* 0x3FB10000 */ |
| 1.4140625000E+00F, /* 0x3FB50000 */ |
| 1.4433593750E+00F, /* 0x3FB8C000 */ |
| 1.4765625000E+00F, /* 0x3FBD0000 */ |
| 1.5078125000E+00F, /* 0x3FC10000 */ |
| 1.5410156250E+00F, /* 0x3FC54000 */ |
| 1.5742187500E+00F, /* 0x3FC98000 */ |
| 1.6093750000E+00F, /* 0x3FCE0000 */ |
| 1.6445312500E+00F, /* 0x3FD28000 */ |
| 1.6816406250E+00F, /* 0x3FD74000 */ |
| 1.7167968750E+00F, /* 0x3FDBC000 */ |
| 1.7558593750E+00F, /* 0x3FE0C000 */ |
| 1.7929687500E+00F, /* 0x3FE58000 */ |
| 1.8339843750E+00F, /* 0x3FEAC000 */ |
| 1.8730468750E+00F, /* 0x3FEFC000 */ |
| 1.9140625000E+00F, /* 0x3FF50000 */ |
| 1.9570312500E+00F}; /* 0x3FFA8000 */ |
| |
| static const float two_to_jby32_trail_table[32] = { |
| 0.0000000000E+00F, /* 0x00000000 */ |
| 4.1277357377E-04F, /* 0x39D86988 */ |
| 1.3050324051E-03F, /* 0x3AAB0D9F */ |
| 7.3415064253E-04F, /* 0x3A407404 */ |
| 6.6398258787E-04F, /* 0x3A2E0F1E */ |
| 1.1054925853E-03F, /* 0x3A90E62D */ |
| 1.1675967835E-04F, /* 0x38F4DCE0 */ |
| 1.6154836630E-03F, /* 0x3AD3BEA3 */ |
| 1.7071149778E-03F, /* 0x3ADFC146 */ |
| 4.0360994171E-04F, /* 0x39D39B9C */ |
| 1.6234370414E-03F, /* 0x3AD4C982 */ |
| 1.4728321694E-03F, /* 0x3AC10C0C */ |
| 1.9176795613E-03F, /* 0x3AFB5AA6 */ |
| 1.0178930825E-03F, /* 0x3A856AD3 */ |
| 7.3992193211E-04F, /* 0x3A41F752 */ |
| 1.0973819299E-03F, /* 0x3A8FD607 */ |
| 1.5106226783E-04F, /* 0x391E6678 */ |
| 1.8214319134E-03F, /* 0x3AEEBD1D */ |
| 2.6364589576E-04F, /* 0x398A39F4 */ |
| 1.3519275235E-03F, /* 0x3AB13329 */ |
| 1.1952003697E-03F, /* 0x3A9CA845 */ |
| 1.7620950239E-03F, /* 0x3AE6F619 */ |
| 1.1153318919E-03F, /* 0x3A923054 */ |
| 1.2242280645E-03F, /* 0x3AA07647 */ |
| 1.5220546629E-04F, /* 0x391F9958 */ |
| 1.8224230735E-03F, /* 0x3AEEDE5F */ |
| 3.9278529584E-04F, /* 0x39CDEEC0 */ |
| 1.7403248930E-03F, /* 0x3AE41B9D */ |
| 2.3711356334E-05F, /* 0x37C6E7C0 */ |
| 1.1207590578E-03F, /* 0x3A92E66F */ |
| 1.1440613307E-03F, /* 0x3A95F454 */ |
| 1.1287408415E-04F}; /* 0x38ECB6D0 */ |
| |
| /* |
| Step 1. Reduce the argument. |
| |
| To perform argument reduction, we find the integer n such that |
| x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. |
| n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and |
| remainder by x - n*logbaseof2/32. The calculation of n is |
| straightforward whereas the computation of x - n*logbaseof2/32 |
| must be carried out carefully. |
| logbaseof2/32 is so represented in two pieces that |
| (1) logbaseof2/32 is known to extra precision, (2) the product |
| of n and the leading piece is a model number and is hence |
| calculated without error, and (3) the subtraction of the value |
| obtained in (2) from x is a model number and is hence again |
| obtained without error. |
| */ |
| |
| r = x * thirtytwo_by_logbaseof2; |
| /* Set n = nearest integer to r */ |
| /* This is faster on Hammer */ |
| if (r > 0) |
| n = (int)(r + 0.5F); |
| else |
| n = (int)(r - 0.5F); |
| |
| r1 = x - n * logbaseof2_by_32_lead; |
| r2 = - n * logbaseof2_by_32_trail; |
| |
| /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ |
| /* j = n % 32; |
| if (j < 0) j += 32; */ |
| j = n & 0x0000001f; |
| |
| f1 = two_to_jby32_lead_table[j]; |
| f2 = two_to_jby32_trail_table[j]; |
| |
| *m = (n - j) / 32; |
| |
| /* Step 2. The following is the core approximation. We approximate |
| exp(r1+r2)-1 by a polynomial. */ |
| |
| r1 *= logbase; r2 *= logbase; |
| |
| r = r1 + r2; |
| q = r1 + (r2 + |
| r*r*( 5.00000000000000008883e-01F + |
| r*( 1.66666666665260878863e-01F ))); |
| |
| /* Step 3. Function value reconstruction. |
| We now reconstruct the exponential of the input argument |
| so that exp(x) = 2**m * (z1 + z2). |
| The order of the computation below must be strictly observed. */ |
| |
| *z1 = f1; |
| *z2 = f2 + ((f1 + f2) * q); |
| } |
| #endif /* SPLITEXPF */ |
| |
| |
| #if defined(USE_SCALEUPDOUBLE1024) |
| /* Scales up a double (normal or denormal) whose bit pattern is given |
| as ux by 2**1024. There are no checks that the input number is |
| scalable by that amount. */ |
| static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur) |
| { |
| unsigned long long uy; |
| double y; |
| |
| if ((ux & EXPBITS_DP64) == 0) |
| { |
| /* ux is denormalised */ |
| PUT_BITS_DP64(ux | 0x4010000000000000, y); |
| if (ux & SIGNBIT_DP64) |
| y += 4.0; |
| else |
| y -= 4.0; |
| GET_BITS_DP64(y, uy); |
| } |
| else |
| /* ux is normal */ |
| uy = ux + 0x4000000000000000; |
| |
| *ur = uy; |
| return; |
| } |
| |
| #endif /* SCALEUPDOUBLE1024 */ |
| |
| |
| #if defined(USE_SCALEDOWNDOUBLE) |
| /* Scales down a double whose bit pattern is given as ux by 2**k. |
| There are no checks that the input number is scalable by that amount. */ |
| static inline void scaleDownDouble(unsigned long long ux, int k, |
| unsigned long long *ur) |
| { |
| unsigned long long uy, uk, ax, xsign; |
| int n, shift; |
| xsign = ux & SIGNBIT_DP64; |
| ax = ux & ~SIGNBIT_DP64; |
| n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k; |
| if (n > 0) |
| { |
| uk = (unsigned long long)n << EXPSHIFTBITS_DP64; |
| uy = (ax & ~EXPBITS_DP64) | uk; |
| } |
| else |
| { |
| uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000; |
| shift = (1 - n); |
| if (shift > MANTLENGTH_DP64 + 1) |
| /* Sigh. Shifting works mod 64 so be careful not to shift too much */ |
| uy = 0; |
| else |
| { |
| /* Make sure we round the result */ |
| uy >>= shift - 1; |
| uy = (uy >> 1) + (uy & 1); |
| } |
| } |
| *ur = uy | xsign; |
| } |
| |
| #endif /* SCALEDOWNDOUBLE */ |
| |
| |
| #if defined(USE_SCALEUPFLOAT128) |
| /* Scales up a float (normal or denormal) whose bit pattern is given |
| as ux by 2**128. There are no checks that the input number is |
| scalable by that amount. */ |
| static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur) |
| { |
| unsigned int uy; |
| float y; |
| |
| if ((ux & EXPBITS_SP32) == 0) |
| { |
| /* ux is denormalised */ |
| PUT_BITS_SP32(ux | 0x40800000, y); |
| /* Compensate for the implicit bit just added */ |
| if (ux & SIGNBIT_SP32) |
| y += 4.0F; |
| else |
| y -= 4.0F; |
| GET_BITS_SP32(y, uy); |
| } |
| else |
| /* ux is normal */ |
| uy = ux + 0x40000000; |
| *ur = uy; |
| } |
| #endif /* SCALEUPFLOAT128 */ |
| |
| |
| #if defined(USE_SCALEDOWNFLOAT) |
| /* Scales down a float whose bit pattern is given as ux by 2**k. |
| There are no checks that the input number is scalable by that amount. */ |
| static inline void scaleDownFloat(unsigned int ux, int k, |
| unsigned int *ur) |
| { |
| unsigned int uy, uk, ax, xsign; |
| int n, shift; |
| |
| xsign = ux & SIGNBIT_SP32; |
| ax = ux & ~SIGNBIT_SP32; |
| n = ((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k; |
| if (n > 0) |
| { |
| uk = (unsigned int)n << EXPSHIFTBITS_SP32; |
| uy = (ax & ~EXPBITS_SP32) | uk; |
| } |
| else |
| { |
| uy = (ax & ~EXPBITS_SP32) | 0x00800000; |
| shift = (1 - n); |
| if (shift > MANTLENGTH_SP32 + 1) |
| /* Sigh. Shifting works mod 32 so be careful not to shift too much */ |
| uy = 0; |
| else |
| { |
| /* Make sure we round the result */ |
| uy >>= shift - 1; |
| uy = (uy >> 1) + (uy & 1); |
| } |
| } |
| *ur = uy | xsign; |
| } |
| #endif /* SCALEDOWNFLOAT */ |
| |
| |
| #if defined(USE_SQRT_AMD_INLINE) |
| static inline double sqrt_amd_inline(double x) |
| { |
| /* |
| Computes the square root of x. |
| |
| The calculation is carried out in three steps. |
| |
| Step 1. Reduction. |
| The input argument is scaled to the interval [1, 4) by |
| computing |
| x = 2^e * y, where y in [1,4). |
| Furthermore y is decomposed as y = c + t where |
| c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. |
| |
| Step 2. Approximation. |
| An approximation q = sqrt(1 + (t/c)) - 1 is obtained |
| from a basic series expansion using precomputed values |
| stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl. |
| |
| Step 3. Reconstruction. |
| The value of sqrt(x) is reconstructed via |
| sqrt(x) = 2^(e/2) * sqrt(y) |
| = 2^(e/2) * sqrt(c) * sqrt(y/c) |
| = 2^(e/2) * sqrt(c) * sqrt(1 + t/c) |
| = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] |
| */ |
| |
| unsigned long long ux, ax, u; |
| double r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; |
| int e, denorm = 0, index; |
| |
| /* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain |
| leading and trailing parts respectively of precomputed |
| values of sqrt(j/32), for j = 32, 33, ..., 128. |
| rt_jby32_lead_table_dbl contains the first 21 bits of precision, |
| and rt_jby32_trail_table_dbl contains a further 53 bits precision. */ |
| |
| static const double rt_jby32_lead_table_dbl[97] = { |
| 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ |
| 1.01550388336181640625e+00, /* 0x3ff03f8100000000 */ |
| 1.03077602386474609375e+00, /* 0x3ff07e0f00000000 */ |
| 1.04582500457763671875e+00, /* 0x3ff0bbb300000000 */ |
| 1.06065940856933593750e+00, /* 0x3ff0f87600000000 */ |
| 1.07528972625732421875e+00, /* 0x3ff1346300000000 */ |
| 1.08972454071044921875e+00, /* 0x3ff16f8300000000 */ |
| 1.10396957397460937500e+00, /* 0x3ff1a9dc00000000 */ |
| 1.11803340911865234375e+00, /* 0x3ff1e37700000000 */ |
| 1.13192272186279296875e+00, /* 0x3ff21c5b00000000 */ |
| 1.14564323425292968750e+00, /* 0x3ff2548e00000000 */ |
| 1.15920162200927734375e+00, /* 0x3ff28c1700000000 */ |
| 1.17260360717773437500e+00, /* 0x3ff2c2fc00000000 */ |
| 1.18585395812988281250e+00, /* 0x3ff2f94200000000 */ |
| 1.19895744323730468750e+00, /* 0x3ff32eee00000000 */ |
| 1.21191978454589843750e+00, /* 0x3ff3640600000000 */ |
| 1.22474479675292968750e+00, /* 0x3ff3988e00000000 */ |
| 1.23743629455566406250e+00, /* 0x3ff3cc8a00000000 */ |
| 1.25000000000000000000e+00, /* 0x3ff4000000000000 */ |
| 1.26243782043457031250e+00, /* 0x3ff432f200000000 */ |
| 1.27475452423095703125e+00, /* 0x3ff4656500000000 */ |
| 1.28695297241210937500e+00, /* 0x3ff4975c00000000 */ |
| 1.29903793334960937500e+00, /* 0x3ff4c8dc00000000 */ |
| 1.31101036071777343750e+00, /* 0x3ff4f9e600000000 */ |
| 1.32287502288818359375e+00, /* 0x3ff52a7f00000000 */ |
| 1.33463478088378906250e+00, /* 0x3ff55aaa00000000 */ |
| 1.34629058837890625000e+00, /* 0x3ff58a6800000000 */ |
| 1.35784721374511718750e+00, /* 0x3ff5b9be00000000 */ |
| 1.36930561065673828125e+00, /* 0x3ff5e8ad00000000 */ |
| 1.38066959381103515625e+00, /* 0x3ff6173900000000 */ |
| 1.39194107055664062500e+00, /* 0x3ff6456400000000 */ |
| 1.40312099456787109375e+00, /* 0x3ff6732f00000000 */ |
| 1.41421318054199218750e+00, /* 0x3ff6a09e00000000 */ |
| 1.42521858215332031250e+00, /* 0x3ff6cdb200000000 */ |
| 1.43614006042480468750e+00, /* 0x3ff6fa6e00000000 */ |
| 1.44697952270507812500e+00, /* 0x3ff726d400000000 */ |
| 1.45773792266845703125e+00, /* 0x3ff752e500000000 */ |
| 1.46841716766357421875e+00, /* 0x3ff77ea300000000 */ |
| 1.47901916503906250000e+00, /* 0x3ff7aa1000000000 */ |
| 1.48954677581787109375e+00, /* 0x3ff7d52f00000000 */ |
| 1.50000000000000000000e+00, /* 0x3ff8000000000000 */ |
| 1.51038074493408203125e+00, /* 0x3ff82a8500000000 */ |
| 1.52068996429443359375e+00, /* 0x3ff854bf00000000 */ |
| 1.53093051910400390625e+00, /* 0x3ff87eb100000000 */ |
| 1.54110336303710937500e+00, /* 0x3ff8a85c00000000 */ |
| 1.55120849609375000000e+00, /* 0x3ff8d1c000000000 */ |
| 1.56124877929687500000e+00, /* 0x3ff8fae000000000 */ |
| 1.57122516632080078125e+00, /* 0x3ff923bd00000000 */ |
| 1.58113861083984375000e+00, /* 0x3ff94c5800000000 */ |
| 1.59099006652832031250e+00, /* 0x3ff974b200000000 */ |
| 1.60078048706054687500e+00, /* 0x3ff99ccc00000000 */ |
| 1.61051177978515625000e+00, /* 0x3ff9c4a800000000 */ |
| 1.62018489837646484375e+00, /* 0x3ff9ec4700000000 */ |
| 1.62979984283447265625e+00, /* 0x3ffa13a900000000 */ |
| 1.63935947418212890625e+00, /* 0x3ffa3ad100000000 */ |
| 1.64886283874511718750e+00, /* 0x3ffa61be00000000 */ |
| 1.65831184387207031250e+00, /* 0x3ffa887200000000 */ |
| 1.66770744323730468750e+00, /* 0x3ffaaeee00000000 */ |
| 1.67705059051513671875e+00, /* 0x3ffad53300000000 */ |
| 1.68634128570556640625e+00, /* 0x3ffafb4100000000 */ |
| 1.69558238983154296875e+00, /* 0x3ffb211b00000000 */ |
| 1.70477199554443359375e+00, /* 0x3ffb46bf00000000 */ |
| 1.71391296386718750000e+00, /* 0x3ffb6c3000000000 */ |
| 1.72300529479980468750e+00, /* 0x3ffb916e00000000 */ |
| 1.73204994201660156250e+00, /* 0x3ffbb67a00000000 */ |
| 1.74104785919189453125e+00, /* 0x3ffbdb5500000000 */ |
| 1.75000000000000000000e+00, /* 0x3ffc000000000000 */ |
| 1.75890541076660156250e+00, /* 0x3ffc247a00000000 */ |
| 1.76776695251464843750e+00, /* 0x3ffc48c600000000 */ |
| 1.77658367156982421875e+00, /* 0x3ffc6ce300000000 */ |
| 1.78535652160644531250e+00, /* 0x3ffc90d200000000 */ |
| 1.79408740997314453125e+00, /* 0x3ffcb49500000000 */ |
| 1.80277538299560546875e+00, /* 0x3ffcd82b00000000 */ |
| 1.81142139434814453125e+00, /* 0x3ffcfb9500000000 */ |
| 1.82002735137939453125e+00, /* 0x3ffd1ed500000000 */ |
| 1.82859230041503906250e+00, /* 0x3ffd41ea00000000 */ |
| 1.83711719512939453125e+00, /* 0x3ffd64d500000000 */ |
| 1.84560203552246093750e+00, /* 0x3ffd879600000000 */ |
| 1.85404872894287109375e+00, /* 0x3ffdaa2f00000000 */ |
| 1.86245727539062500000e+00, /* 0x3ffdcca000000000 */ |
| 1.87082862854003906250e+00, /* 0x3ffdeeea00000000 */ |
| 1.87916183471679687500e+00, /* 0x3ffe110c00000000 */ |
| 1.88745784759521484375e+00, /* 0x3ffe330700000000 */ |
| 1.89571857452392578125e+00, /* 0x3ffe54dd00000000 */ |
| 1.90394306182861328125e+00, /* 0x3ffe768d00000000 */ |
| 1.91213226318359375000e+00, /* 0x3ffe981800000000 */ |
| 1.92028617858886718750e+00, /* 0x3ffeb97e00000000 */ |
| 1.92840576171875000000e+00, /* 0x3ffedac000000000 */ |
| 1.93649101257324218750e+00, /* 0x3ffefbde00000000 */ |
| 1.94454288482666015625e+00, /* 0x3fff1cd900000000 */ |
| 1.95256233215332031250e+00, /* 0x3fff3db200000000 */ |
| 1.96054744720458984375e+00, /* 0x3fff5e6700000000 */ |
| 1.96850109100341796875e+00, /* 0x3fff7efb00000000 */ |
| 1.97642326354980468750e+00, /* 0x3fff9f6e00000000 */ |
| 1.98431301116943359375e+00, /* 0x3fffbfbf00000000 */ |
| 1.99217128753662109375e+00, /* 0x3fffdfef00000000 */ |
| 2.00000000000000000000e+00}; /* 0x4000000000000000 */ |
| |
| static const double rt_jby32_trail_table_dbl[97] = { |
| 0.00000000000000000000e+00, /* 0x0000000000000000 */ |
| 9.17217678638807524014e-07, /* 0x3eaec6d70177881c */ |
| 3.82539669043705364790e-07, /* 0x3e99abfb41bd6b24 */ |
| 2.85899577162227138140e-08, /* 0x3e5eb2bf6bab55a2 */ |
| 7.63210485349101216659e-07, /* 0x3ea99bed9b2d8d0c */ |
| 9.32123004127716212874e-07, /* 0x3eaf46e029c1b296 */ |
| 1.95174719169309219157e-07, /* 0x3e8a3226fc42f30c */ |
| 5.34316371481845492427e-07, /* 0x3ea1edbe20701d73 */ |
| 5.79631242504454563052e-07, /* 0x3ea372fe94f82be7 */ |
| 4.20404384109571705948e-07, /* 0x3e9c367e08e7bb06 */ |
| 6.89486030314147010716e-07, /* 0x3ea722a3d0a66608 */ |
| 6.89927685625314560328e-07, /* 0x3ea7266f067ca1d6 */ |
| 3.32778123013641425828e-07, /* 0x3e965515a9b34850 */ |
| 1.64433259436999584387e-07, /* 0x3e8611e23ef6c1bd */ |
| 4.37590875197899335723e-07, /* 0x3e9d5dc1059ed8e7 */ |
| 1.79808183816018617413e-07, /* 0x3e88222982d0e4f4 */ |
| 7.46386593615986477624e-08, /* 0x3e7409212e7d0322 */ |
| 5.72520794105201454728e-07, /* 0x3ea335ea8a5fcf39 */ |
| 0.00000000000000000000e+00, /* 0x0000000000000000 */ |
| 2.96860689431670420344e-07, /* 0x3e93ec071e938bfe */ |
| 3.54167239176257065345e-07, /* 0x3e97c48bfd9862c6 */ |
| 7.95211265664474710063e-07, /* 0x3eaaaed010f74671 */ |
| 1.72327048595145565621e-07, /* 0x3e87211cbfeb62e0 */ |
| 6.99494915996239297020e-07, /* 0x3ea7789d9660e72d */ |
| 6.32644111701500844315e-07, /* 0x3ea53a5f1d36f1cf */ |
| 6.20124838851440463844e-10, /* 0x3e054eacff2057dc */ |
| 6.13404719757812629969e-07, /* 0x3ea4951b3e6a83cc */ |
| 3.47654909777986407387e-07, /* 0x3e9754aa76884c66 */ |
| 7.83106177002392475763e-07, /* 0x3eaa46d4b1de1074 */ |
| 5.33337372440526357008e-07, /* 0x3ea1e55548f92635 */ |
| 2.01508648555298681765e-08, /* 0x3e55a3070dd17788 */ |
| 5.25472356925843939587e-07, /* 0x3ea1a1c5eedb0801 */ |
| 3.81831102861301692797e-07, /* 0x3e999fcef32422cc */ |
| 6.99220602161420018738e-07, /* 0x3ea776425d6b0199 */ |
| 6.01209702477462624811e-07, /* 0x3ea42c5a1e0191a2 */ |
| 9.01437000591944740554e-08, /* 0x3e7832a0bdff1327 */ |
| 5.10428680864685379950e-08, /* 0x3e6b674743636676 */ |
| 3.47895267104621031421e-07, /* 0x3e9758cb90d2f714 */ |
| 7.80735841510641848628e-07, /* 0x3eaa3278459cde25 */ |
| 1.35158752025506517690e-07, /* 0x3e822404f4a103ee */ |
| 0.00000000000000000000e+00, /* 0x0000000000000000 */ |
| 1.76523947728535489812e-09, /* 0x3e1e539af6892ac5 */ |
| 6.68280121328499932183e-07, /* 0x3ea66c7b872c9cd0 */ |
| 5.70135482405123276616e-07, /* 0x3ea3216d2f43887d */ |
| 1.37705134737562525897e-07, /* 0x3e827b832cbedc0e */ |
| 7.09655107074516613672e-07, /* 0x3ea7cfe41579091d */ |
| 7.20302724551461693011e-07, /* 0x3ea82b5a713c490a */ |
| 4.69926266058212796694e-07, /* 0x3e9f8945932d872e */ |
| 2.19244345915999437026e-07, /* 0x3e8d6d2da9490251 */ |
| 1.91141411617401877927e-07, /* 0x3e89a791a3114e4a */ |
| 5.72297665296622053774e-07, /* 0x3ea333ffe005988d */ |
| 5.61055484436830560103e-07, /* 0x3ea2d36e0ed49ab1 */ |
| 2.76225500213991506100e-07, /* 0x3e92898498f55f9e */ |
| 7.58466189522395692908e-07, /* 0x3ea9732cca1032a3 */ |
| 1.56893371256836029827e-07, /* 0x3e850ed0b02a22d2 */ |
| 4.06038997708867066507e-07, /* 0x3e9b3fb265b1e40a */ |
| 5.51305629612057435809e-07, /* 0x3ea27fade682d1de */ |
| 5.64778487026561123207e-07, /* 0x3ea2f36906f707ba */ |
| 3.92609705553556897517e-07, /* 0x3e9a58fbbee883b6 */ |
| 9.09698438776943827802e-07, /* 0x3eae864005bca6d7 */ |
| 1.05949774066016139743e-07, /* 0x3e7c70d02300f263 */ |
| 7.16578798392844784244e-07, /* 0x3ea80b5d712d8e3e */ |
| 6.86233073531233972561e-07, /* 0x3ea706b27cc7d390 */ |
| 7.99211473033494452908e-07, /* 0x3eaad12c9d849a97 */ |
| 8.65552275731027456121e-07, /* 0x3ead0b09954e764b */ |
| 6.75456120386058448618e-07, /* 0x3ea6aa1fb7826cbd */ |
| 0.00000000000000000000e+00, /* 0x0000000000000000 */ |
| 4.99167184520462138743e-07, /* 0x3ea0bfd03f46763c */ |
| 4.51720373502110930296e-10, /* 0x3dff0abfb4adfb9e */ |
| 1.28874162718371367439e-07, /* 0x3e814c151f991b2e */ |
| 5.85529267186999798656e-07, /* 0x3ea3a5a879b09292 */ |
| 1.01827770937125531924e-07, /* 0x3e7b558d173f9796 */ |
| 2.54736389177809626508e-07, /* 0x3e9118567cd83fb8 */ |
| 6.98925535290464831294e-07, /* 0x3ea773b981896751 */ |
| 1.20940735036524314513e-07, /* 0x3e803b7df49f48a8 */ |
| 5.43759351196479689657e-08, /* 0x3e6d315f22491900 */ |
| 1.11957989042397958409e-07, /* 0x3e7e0db1c5bb84b2 */ |
| 8.47006714134442661218e-07, /* 0x3eac6bbb7644ff76 */ |
| 8.92831044643427836228e-07, /* 0x3eadf55c3afec01f */ |
| 7.77828292464916501663e-07, /* 0x3eaa197e81034da3 */ |
| 6.48469316302918797451e-08, /* 0x3e71683f4920555d */ |
| 2.12579816658859849140e-07, /* 0x3e8c882fd78bb0b0 */ |
| 7.61222472580559138435e-07, /* 0x3ea98ad9eb7b83ec */ |
| 2.86488961857314189607e-07, /* 0x3e9339d7c7777273 */ |
| 2.14637363790165363515e-07, /* 0x3e8ccee237cae6fe */ |
| 5.44137005612605847831e-08, /* 0x3e6d368fe324a146 */ |
| 2.58378284856442408413e-07, /* 0x3e9156e7b6d99b45 */ |
| 3.15848939061134843091e-07, /* 0x3e95323e5310b5c1 */ |
| 6.60530466255089632309e-07, /* 0x3ea629e9db362f5d */ |
| 7.63436345535852301127e-07, /* 0x3ea99dde4728d7ec */ |
| 8.68233432860324345268e-08, /* 0x3e774e746878544d */ |
| 9.45465175398023087082e-07, /* 0x3eafb97be873a87d */ |
| 8.77499534786171267246e-07, /* 0x3ead71a9e23c2f63 */ |
| 2.74055432394999316135e-07, /* 0x3e92643c89cda173 */ |
| 4.72129009349126213532e-07, /* 0x3e9faf1d57a4d56c */ |
| 8.93777032327078947306e-07, /* 0x3eadfd7c7ab7b282 */ |
| 0.00000000000000000000e+00}; /* 0x0000000000000000 */ |
| |
| |
| /* Handle special arguments first */ |
| |
| GET_BITS_DP64(x, ux); |
| ax = ux & (~SIGNBIT_DP64); |
| |
| if(ax >= 0x7ff0000000000000) |
| { |
| /* x is either NaN or infinity */ |
| if (ux & MANTBITS_DP64) |
| /* x is NaN */ |
| return x + x; /* Raise invalid if it is a signalling NaN */ |
| else if (ux & SIGNBIT_DP64) |
| /* x is negative infinity */ |
| return nan_with_flags(AMD_F_INVALID); |
| else |
| /* x is positive infinity */ |
| return x; |
| } |
| else if (ux & SIGNBIT_DP64) |
| { |
| /* x is negative. */ |
| if (ux == SIGNBIT_DP64) |
| /* Handle negative zero first */ |
| return x; |
| else |
| return nan_with_flags(AMD_F_INVALID); |
| } |
| else if (ux <= 0x000fffffffffffff) |
| { |
| /* x is denormalised or zero */ |
| if (ux == 0) |
| /* x is zero */ |
| return x; |
| else |
| { |
| /* x is denormalised; scale it up */ |
| /* Normalize x by increasing the exponent by 60 |
| and subtracting a correction to account for the implicit |
| bit. This replaces a slow denormalized |
| multiplication by a fast normal subtraction. */ |
| static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ |
| denorm = 1; |
| GET_BITS_DP64(x, ux); |
| PUT_BITS_DP64(ux | 0x03d0000000000000, x); |
| x -= corr; |
| GET_BITS_DP64(x, ux); |
| } |
| } |
| |
| /* Main algorithm */ |
| |
| /* |
| Find y and e such that x = 2^e * y, where y in [1,4). |
| This is done using an in-lined variant of splitDouble, |
| which also ensures that e is even. |
| */ |
| y = x; |
| ux &= EXPBITS_DP64; |
| ux >>= EXPSHIFTBITS_DP64; |
| if (ux & 1) |
| { |
| GET_BITS_DP64(y, u); |
| u &= (SIGNBIT_DP64 | MANTBITS_DP64); |
| u |= ONEEXPBITS_DP64; |
| PUT_BITS_DP64(u, y); |
| e = ux - EXPBIAS_DP64; |
| } |
| else |
| { |
| GET_BITS_DP64(y, u); |
| u &= (SIGNBIT_DP64 | MANTBITS_DP64); |
| u |= TWOEXPBITS_DP64; |
| PUT_BITS_DP64(u, y); |
| e = ux - EXPBIAS_DP64 - 1; |
| } |
| |
| |
| /* Find the index of the sub-interval of [1,4) in which y lies. */ |
| |
| index = (int)(32.0*y+0.5); |
| |
| /* Look up the table values and compute c and r = c/t */ |
| |
| rtc_lead = rt_jby32_lead_table_dbl[index-32]; |
| rtc_trail = rt_jby32_trail_table_dbl[index-32]; |
| c = 0.03125*index; |
| r = (y - c)/c; |
| |
| /* |
| Find q = sqrt(1+r) - 1. |
| From one step of Newton on (q+1)^2 = 1+r |
| */ |
| |
| p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01)); |
| twop = p + p; |
| q = p - (p*p + (twop - r))/(twop + 2.0); |
| |
| /* Reconstruction */ |
| |
| rtc = rtc_lead + rtc_trail; |
| e >>= 1; /* e = e/2 */ |
| z = rtc_lead + (rtc*q+rtc_trail); |
| |
| if (denorm) |
| { |
| /* Scale by 2**(e-30) */ |
| PUT_BITS_DP64(((long long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); |
| z *= r; |
| } |
| else |
| { |
| /* Scale by 2**e */ |
| PUT_BITS_DP64(((long long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); |
| z *= r; |
| } |
| |
| return z; |
| |
| } |
| #endif /* SQRT_AMD_INLINE */ |
| |
| #if defined(USE_SQRTF_AMD_INLINE) |
| |
| static inline float sqrtf_amd_inline(float x) |
| { |
| /* |
| Computes the square root of x. |
| |
| The calculation is carried out in three steps. |
| |
| Step 1. Reduction. |
| The input argument is scaled to the interval [1, 4) by |
| computing |
| x = 2^e * y, where y in [1,4). |
| Furthermore y is decomposed as y = c + t where |
| c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. |
| |
| Step 2. Approximation. |
| An approximation q = sqrt(1 + (t/c)) - 1 is obtained |
| from a basic series expansion using precomputed values |
| stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float. |
| |
| Step 3. Reconstruction. |
| The value of sqrt(x) is reconstructed via |
| sqrt(x) = 2^(e/2) * sqrt(y) |
| = 2^(e/2) * sqrt(c) * sqrt(y/c) |
| = 2^(e/2) * sqrt(c) * sqrt(1 + t/c) |
| = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] |
| */ |
| |
| unsigned int ux, ax, u; |
| float r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; |
| int e, denorm = 0, index; |
| |
| /* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain |
| leading and trailing parts respectively of precomputed |
| values of sqrt(j/32), for j = 32, 33, ..., 128. |
| rt_jby32_lead_table_float contains the first 13 bits of precision, |
| and rt_jby32_trail_table_float contains a further 24 bits precision. */ |
| |
| static const float rt_jby32_lead_table_float[97] = { |
| 1.00000000000000000000e+00F, /* 0x3f800000 */ |
| 1.01538085937500000000e+00F, /* 0x3f81f800 */ |
| 1.03076171875000000000e+00F, /* 0x3f83f000 */ |
| 1.04565429687500000000e+00F, /* 0x3f85d800 */ |
| 1.06054687500000000000e+00F, /* 0x3f87c000 */ |
| 1.07519531250000000000e+00F, /* 0x3f89a000 */ |
| 1.08959960937500000000e+00F, /* 0x3f8b7800 */ |
| 1.10375976562500000000e+00F, /* 0x3f8d4800 */ |
| 1.11791992187500000000e+00F, /* 0x3f8f1800 */ |
| 1.13183593750000000000e+00F, /* 0x3f90e000 */ |
| 1.14550781250000000000e+00F, /* 0x3f92a000 */ |
| 1.15917968750000000000e+00F, /* 0x3f946000 */ |
| 1.17236328125000000000e+00F, /* 0x3f961000 */ |
| 1.18579101562500000000e+00F, /* 0x3f97c800 */ |
| 1.19873046875000000000e+00F, /* 0x3f997000 */ |
| 1.21191406250000000000e+00F, /* 0x3f9b2000 */ |
| 1.22460937500000000000e+00F, /* 0x3f9cc000 */ |
| 1.23730468750000000000e+00F, /* 0x3f9e6000 */ |
| 1.25000000000000000000e+00F, /* 0x3fa00000 */ |
| 1.26220703125000000000e+00F, /* 0x3fa19000 */ |
| 1.27465820312500000000e+00F, /* 0x3fa32800 */ |
| 1.28686523437500000000e+00F, /* 0x3fa4b800 */ |
| 1.29882812500000000000e+00F, /* 0x3fa64000 */ |
| 1.31079101562500000000e+00F, /* 0x3fa7c800 */ |
| 1.32275390625000000000e+00F, /* 0x3fa95000 */ |
| 1.33447265625000000000e+00F, /* 0x3faad000 */ |
| 1.34619140625000000000e+00F, /* 0x3fac5000 */ |
| 1.35766601562500000000e+00F, /* 0x3fadc800 */ |
| 1.36914062500000000000e+00F, /* 0x3faf4000 */ |
| 1.38061523437500000000e+00F, /* 0x3fb0b800 */ |
| 1.39184570312500000000e+00F, /* 0x3fb22800 */ |
| 1.40307617187500000000e+00F, /* 0x3fb39800 */ |
| 1.41406250000000000000e+00F, /* 0x3fb50000 */ |
| 1.42504882812500000000e+00F, /* 0x3fb66800 */ |
| 1.43603515625000000000e+00F, /* 0x3fb7d000 */ |
| 1.44677734375000000000e+00F, /* 0x3fb93000 */ |
| 1.45751953125000000000e+00F, /* 0x3fba9000 */ |
| 1.46826171875000000000e+00F, /* 0x3fbbf000 */ |
| 1.47900390625000000000e+00F, /* 0x3fbd5000 */ |
| 1.48950195312500000000e+00F, /* 0x3fbea800 */ |
| 1.50000000000000000000e+00F, /* 0x3fc00000 */ |
| 1.51025390625000000000e+00F, /* 0x3fc15000 */ |
| 1.52050781250000000000e+00F, /* 0x3fc2a000 */ |
| 1.53076171875000000000e+00F, /* 0x3fc3f000 */ |
| 1.54101562500000000000e+00F, /* 0x3fc54000 */ |
| 1.55102539062500000000e+00F, /* 0x3fc68800 */ |
| 1.56103515625000000000e+00F, /* 0x3fc7d000 */ |
| 1.57104492187500000000e+00F, /* 0x3fc91800 */ |
| 1.58105468750000000000e+00F, /* 0x3fca6000 */ |
| 1.59082031250000000000e+00F, /* 0x3fcba000 */ |
| 1.60058593750000000000e+00F, /* 0x3fcce000 */ |
| 1.61035156250000000000e+00F, /* 0x3fce2000 */ |
| 1.62011718750000000000e+00F, /* 0x3fcf6000 */ |
| 1.62963867187500000000e+00F, /* 0x3fd09800 */ |
| 1.63916015625000000000e+00F, /* 0x3fd1d000 */ |
| 1.64868164062500000000e+00F, /* 0x3fd30800 */ |
| 1.65820312500000000000e+00F, /* 0x3fd44000 */ |
| 1.66748046875000000000e+00F, /* 0x3fd57000 */ |
| 1.67700195312500000000e+00F, /* 0x3fd6a800 */ |
| 1.68627929687500000000e+00F, /* 0x3fd7d800 */ |
| 1.69555664062500000000e+00F, /* 0x3fd90800 */ |
| 1.70458984375000000000e+00F, /* 0x3fda3000 */ |
| 1.71386718750000000000e+00F, /* 0x3fdb6000 */ |
| 1.72290039062500000000e+00F, /* 0x3fdc8800 */ |
| 1.73193359375000000000e+00F, /* 0x3fddb000 */ |
| 1.74096679687500000000e+00F, /* 0x3fded800 */ |
| 1.75000000000000000000e+00F, /* 0x3fe00000 */ |
| 1.75878906250000000000e+00F, /* 0x3fe12000 */ |
| 1.76757812500000000000e+00F, /* 0x3fe24000 */ |
| 1.77636718750000000000e+00F, /* 0x3fe36000 */ |
| 1.78515625000000000000e+00F, /* 0x3fe48000 */ |
| 1.79394531250000000000e+00F, /* 0x3fe5a000 */ |
| 1.80273437500000000000e+00F, /* 0x3fe6c000 */ |
| 1.81127929687500000000e+00F, /* 0x3fe7d800 */ |
| 1.81982421875000000000e+00F, /* 0x3fe8f000 */ |
| 1.82836914062500000000e+00F, /* 0x3fea0800 */ |
| 1.83691406250000000000e+00F, /* 0x3feb2000 */ |
| 1.84545898437500000000e+00F, /* 0x3fec3800 */ |
| 1.85400390625000000000e+00F, /* 0x3fed5000 */ |
| 1.86230468750000000000e+00F, /* 0x3fee6000 */ |
| 1.87060546875000000000e+00F, /* 0x3fef7000 */ |
| 1.87915039062500000000e+00F, /* 0x3ff08800 */ |
| 1.88745117187500000000e+00F, /* 0x3ff19800 */ |
| 1.89550781250000000000e+00F, /* 0x3ff2a000 */ |
| 1.90380859375000000000e+00F, /* 0x3ff3b000 */ |
| 1.91210937500000000000e+00F, /* 0x3ff4c000 */ |
| 1.92016601562500000000e+00F, /* 0x3ff5c800 */ |
| 1.92822265625000000000e+00F, /* 0x3ff6d000 */ |
| 1.93627929687500000000e+00F, /* 0x3ff7d800 */ |
| 1.94433593750000000000e+00F, /* 0x3ff8e000 */ |
| 1.95239257812500000000e+00F, /* 0x3ff9e800 */ |
| 1.96044921875000000000e+00F, /* 0x3ffaf000 */ |
| 1.96826171875000000000e+00F, /* 0x3ffbf000 */ |
| 1.97631835937500000000e+00F, /* 0x3ffcf800 */ |
| 1.98413085937500000000e+00F, /* 0x3ffdf800 */ |
| 1.99194335937500000000e+00F, /* 0x3ffef800 */ |
| 2.00000000000000000000e+00F}; /* 0x40000000 */ |
| |
| static const float rt_jby32_trail_table_float[97] = { |
| 0.00000000000000000000e+00F, /* 0x00000000 */ |
| 1.23941208585165441036e-04F, /* 0x3901f637 */ |
| 1.46876545841223560274e-05F, /* 0x37766aff */ |
| 1.70736297150142490864e-04F, /* 0x393307ad */ |
| 1.13296780909877270460e-04F, /* 0x38ed99bf */ |
| 9.53458802541717886925e-05F, /* 0x38c7f46e */ |
| 1.25126505736261606216e-04F, /* 0x39033464 */ |
| 2.10342666832730174065e-04F, /* 0x395c8f6e */ |
| 1.14066875539720058441e-04F, /* 0x38ef3730 */ |
| 8.72047676239162683487e-05F, /* 0x38b6e1b4 */ |
| 1.36111237225122749805e-04F, /* 0x390eb915 */ |
| 2.26244374061934649944e-05F, /* 0x37bdc99c */ |
| 2.40658700931817293167e-04F, /* 0x397c5954 */ |
| 6.31069415248930454254e-05F, /* 0x38845848 */ |
| 2.27412077947519719601e-04F, /* 0x396e7577 */ |
| 5.90185391047270968556e-06F, /* 0x36c6088a */ |
| 1.35496389702893793583e-04F, /* 0x390e1409 */ |
| 1.32179571664892137051e-04F, /* 0x390a99af */ |
| 0.00000000000000000000e+00F, /* 0x00000000 */ |
| 2.31086043640971183777e-04F, /* 0x39724fb0 */ |
| 9.66752704698592424393e-05F, /* 0x38cabe24 */ |
| 8.85332483449019491673e-05F, /* 0x38b9aaed */ |
| 2.09980673389509320259e-04F, /* 0x395c2e42 */ |
| 2.20044588786549866199e-04F, /* 0x3966bbc5 */ |
| 1.21749282698146998882e-04F, /* 0x38ff53a6 */ |
| 1.62125259521417319775e-04F, /* 0x392a002b */ |
| 9.97955357888713479042e-05F, /* 0x38d14952 */ |
| 1.81545779923908412457e-04F, /* 0x393e5d53 */ |
| 1.65768768056295812130e-04F, /* 0x392dd237 */ |
| 5.48927710042335093021e-05F, /* 0x38663caa */ |
| 9.53875860432162880898e-05F, /* 0x38c80ad2 */ |
| 4.53481625299900770187e-05F, /* 0x383e3438 */ |
| 1.51062369695864617825e-04F, /* 0x391e667f */ |
| 1.70453247847035527229e-04F, /* 0x3932bbb2 */ |
| 1.05505387182347476482e-04F, /* 0x38dd42c6 */ |
| 2.02269104192964732647e-04F, /* 0x39541833 */ |
| 2.18442466575652360916e-04F, /* 0x39650db4 */ |
| 1.55796806211583316326e-04F, /* 0x39235d63 */ |
| 1.60395247803535312414e-05F, /* 0x37868c9e */ |
| 4.49578510597348213196e-05F, /* 0x383c9120 */ |
| 0.00000000000000000000e+00F, /* 0x00000000 */ |
| 1.26840444863773882389e-04F, /* 0x39050079 */ |
| 1.82820076588541269302e-04F, /* 0x393fb364 */ |
| 1.69370483490638434887e-04F, /* 0x3931990b */ |
| 8.78757418831810355186e-05F, /* 0x38b849ee */ |
| 1.83815121999941766262e-04F, /* 0x3940be7f */ |
| 2.14343352126888930798e-04F, /* 0x3960c15b */ |
| 1.80714370799250900745e-04F, /* 0x393d7e25 */ |
| 8.41425862745381891727e-05F, /* 0x38b075b5 */ |
| 1.69945167726837098598e-04F, /* 0x3932334f */ |
| 1.95121858268976211548e-04F, /* 0x394c99a0 */ |
| 1.60778334247879683971e-04F, /* 0x3928969b */ |
| 6.79871009197086095810e-05F, /* 0x388e944c */ |
| 1.61929419846273958683e-04F, /* 0x3929cb99 */ |
| 1.99474830878898501396e-04F, /* 0x39512a1e */ |
| 1.81604162207804620266e-04F, /* 0x393e6cff */ |
| 1.09270178654696792364e-04F, /* 0x38e527fb */ |
| 2.27539261686615645885e-04F, /* 0x396e979b */ |
| 4.90300008095800876617e-05F, /* 0x384da590 */ |
| 6.28985289949923753738e-05F, /* 0x3883e864 */ |
| 2.58551553997676819563e-05F, /* 0x37d8e386 */ |
| 1.82868374395184218884e-04F, /* 0x393fc05b */ |
| 4.64625991298817098141e-05F, /* 0x3842e0d6 */ |
| 1.05703387816902250051e-04F, /* 0x38ddad13 */ |
| 1.17213814519345760345e-04F, /* 0x38f5d0b0 */ |
| 8.17377731436863541603e-05F, /* 0x38ab6aa2 */ |
| 0.00000000000000000000e+00F, /* 0x00000000 */ |
| 1.16847433673683553934e-04F, /* 0x38f50bfd */ |
| 1.88827965757809579372e-04F, /* 0x3946001f */ |
| 2.16612941585481166840e-04F, /* 0x39632298 */ |
| 2.00857131858356297016e-04F, /* 0x39529d2d */ |
| 1.42199307447299361229e-04F, /* 0x39151b56 */ |
| 4.12627305195201188326e-05F, /* 0x382d1185 */ |
| 1.42796401632949709892e-04F, /* 0x3915bb9e */ |
| 2.03253570361994206905e-04F, /* 0x39552077 */ |
| 2.23214170546270906925e-04F, /* 0x396a0e99 */ |
| 2.03244591830298304558e-04F, /* 0x39551e0e */ |
| 1.43898156238719820976e-04F, /* 0x3916e35e */ |
| 4.57155256299301981926e-05F, /* 0x383fbeac */ |
| 1.53365719597786664963e-04F, /* 0x3920d0cc */ |
| 2.23224633373320102692e-04F, /* 0x396a1168 */ |
| 1.16566716314991936088e-05F, /* 0x37439106 */ |
| 7.43694272387074306607e-06F, /* 0x36f98ada */ |
| 2.11048507480882108212e-04F, /* 0x395d4ce7 */ |
| 1.34682719362899661064e-04F, /* 0x390d399e */ |
| 2.29425968427676707506e-05F, /* 0x37c074da */ |
| 1.20421340398024767637e-04F, /* 0x38fc8ab7 */ |
| 1.83421318070031702518e-04F, /* 0x394054c9 */ |
| 2.12376224226318299770e-04F, /* 0x395eb14f */ |
| 2.07710763788782060146e-04F, /* 0x3959ccef */ |
| 1.69840845046564936638e-04F, /* 0x3932174e */ |
| 9.91739216260612010956e-05F, /* 0x38cffb98 */ |
| 2.40249748458154499531e-04F, /* 0x397beb8d */ |
| 1.05178231024183332920e-04F, /* 0x38dc9322 */ |
| 1.82623916771262884140e-04F, /* 0x393f7ebc */ |
| 2.28821940254420042038e-04F, /* 0x396fefec */ |
| 0.00000000000000000000e+00F}; /* 0x00000000 */ |
| |
| |
| /* Handle special arguments first */ |
| |
| GET_BITS_SP32(x, ux); |
| ax = ux & (~SIGNBIT_SP32); |
| |
| if(ax >= 0x7f800000) |
| { |
| /* x is either NaN or infinity */ |
| if (ux & MANTBITS_SP32) |
| /* x is NaN */ |
| return x + x; /* Raise invalid if it is a signalling NaN */ |
| else if (ux & SIGNBIT_SP32) |
| return nanf_with_flags(AMD_F_INVALID); |
| else |
| /* x is positive infinity */ |
| return x; |
| } |
| else if (ux & SIGNBIT_SP32) |
| { |
| /* x is negative. */ |
| if (x == 0.0F) |
| /* Handle negative zero first */ |
| return x; |
| else |
| return nanf_with_flags(AMD_F_INVALID); |
| } |
| else if (ux <= 0x007fffff) |
| { |
| /* x is denormalised or zero */ |
| if (ux == 0) |
| /* x is zero */ |
| return x; |
| else |
| { |
| /* x is denormalised; scale it up */ |
| /* Normalize x by increasing the exponent by 26 |
| and subtracting a correction to account for the implicit |
| bit. This replaces a slow denormalized |
| multiplication by a fast normal subtraction. */ |
| static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */ |
| denorm = 1; |
| GET_BITS_SP32(x, ux); |
| PUT_BITS_SP32(ux | 0x0d800000, x); |
| x -= corr; |
| GET_BITS_SP32(x, ux); |
| } |
| } |
| |
| /* Main algorithm */ |
| |
| /* |
| Find y and e such that x = 2^e * y, where y in [1,4). |
| This is done using an in-lined variant of splitFloat, |
| which also ensures that e is even. |
| */ |
| y = x; |
| ux &= EXPBITS_SP32; |
| ux >>= EXPSHIFTBITS_SP32; |
| if (ux & 1) |
| { |
| GET_BITS_SP32(y, u); |
| u &= (SIGNBIT_SP32 | MANTBITS_SP32); |
| u |= ONEEXPBITS_SP32; |
| PUT_BITS_SP32(u, y); |
| e = ux - EXPBIAS_SP32; |
| } |
| else |
| { |
| GET_BITS_SP32(y, u); |
| u &= (SIGNBIT_SP32 | MANTBITS_SP32); |
| u |= TWOEXPBITS_SP32; |
| PUT_BITS_SP32(u, y); |
| e = ux - EXPBIAS_SP32 - 1; |
| } |
| |
| /* Find the index of the sub-interval of [1,4) in which y lies. */ |
| |
| index = (int)(32.0F*y+0.5); |
| |
| /* Look up the table values and compute c and r = c/t */ |
| |
| rtc_lead = rt_jby32_lead_table_float[index-32]; |
| rtc_trail = rt_jby32_trail_table_float[index-32]; |
| c = 0.03125F*index; |
| r = (y - c)/c; |
| |
| /* |
| Find q = sqrt(1+r) - 1. |
| From one step of Newton on (q+1)^2 = 1+r |
| */ |
| |
| p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F)); |
| twop = p + p; |
| q = p - (p*p + (twop - r))/(twop + 2.0); |
| |
| /* Reconstruction */ |
| |
| rtc = rtc_lead + rtc_trail; |
| e >>= 1; /* e = e/2 */ |
| z = rtc_lead + (rtc*q+rtc_trail); |
| |
| if (denorm) |
| { |
| /* Scale by 2**(e-13) */ |
| PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); |
| z *= r; |
| } |
| else |
| { |
| /* Scale by 2**e */ |
| PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); |
| z *= r; |
| } |
| |
| return z; |
| |
| } |
| #endif /* SQRTF_AMD_INLINE */ |
| |
| #ifdef USE_LOG_KERNEL_AMD |
| static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2) |
| { |
| |
| int expadjust; |
| double r, z1, z2, correction, f, f1, f2, q, u, v, poly; |
| int index; |
| |
| /* |
| Computes natural log(x). Algorithm based on: |
| Ping-Tak Peter Tang |
| "Table-driven implementation of the logarithm function in IEEE |
| floating-point arithmetic" |
| ACM Transactions on Mathematical Software (TOMS) |
| Volume 16, Issue 4 (December 1990) |
| */ |
| |
| /* Arrays ln_lead_table and ln_tail_table contain |
| leading and trailing parts respectively of precomputed |
| values of natural log(1+i/64), for i = 0, 1, ..., 64. |
| ln_lead_table contains the first 24 bits of precision, |
| and ln_tail_table contains a further 53 bits precision. */ |
| |
| static const double ln_lead_table[65] = { |
| 0.00000000000000000000e+00, /* 0x0000000000000000 */ |
| 1.55041813850402832031e-02, /* 0x3f8fc0a800000000 */ |
| 3.07716131210327148438e-02, /* 0x3f9f829800000000 */ |
| 4.58095073699951171875e-02, /* 0x3fa7745800000000 */ |
| 6.06245994567871093750e-02, /* 0x3faf0a3000000000 */ |
| 7.52233862876892089844e-02, /* 0x3fb341d700000000 */ |
| 8.96121263504028320312e-02, /* 0x3fb6f0d200000000 */ |
| 1.03796780109405517578e-01, /* 0x3fba926d00000000 */ |
| 1.17783010005950927734e-01, /* 0x3fbe270700000000 */ |
| 1.31576299667358398438e-01, /* 0x3fc0d77e00000000 */ |
| 1.45181953907012939453e-01, /* 0x3fc2955280000000 */ |
| 1.58604979515075683594e-01, /* 0x3fc44d2b00000000 */ |
| 1.71850204467773437500e-01, /* 0x3fc5ff3000000000 */ |
| 1.84922337532043457031e-01, /* 0x3fc7ab8900000000 */ |
| 1.97825729846954345703e-01, /* 0x3fc9525a80000000 */ |
| 2.10564732551574707031e-01, /* 0x3fcaf3c900000000 */ |
| 2.23143517971038818359e-01, /* 0x3fcc8ff780000000 */ |
| 2.35566020011901855469e-01, /* 0x3fce270700000000 */ |
| 2.47836112976074218750e-01, /* 0x3fcfb91800000000 */ |
| 2.59957492351531982422e-01, /* 0x3fd0a324c0000000 */ |
| 2.71933674812316894531e-01, /* 0x3fd1675c80000000 */ |
| 2.83768117427825927734e-01, /* 0x3fd22941c0000000 */ |
| 2.95464158058166503906e-01, /* 0x3fd2e8e280000000 */ |
| 3.07025015354156494141e-01, /* 0x3fd3a64c40000000 */ |
| 3.18453729152679443359e-01, /* 0x3fd4618bc0000000 */ |
| 3.29753279685974121094e-01, /* 0x3fd51aad80000000 */ |
| 3.40926527976989746094e-01, /* 0x3fd5d1bd80000000 */ |
| 3.51976394653320312500e-01, /* 0x3fd686c800000000 */ |
| 3.62905442714691162109e-01, /* 0x3fd739d7c0000000 */ |
| 3.73716354370117187500e-01, /* 0x3fd7eaf800000000 */ |
| 3.84411692619323730469e-01, /* 0x3fd89a3380000000 */ |
| 3.94993782043457031250e-01, /* 0x3fd9479400000000 */ |
| 4.05465066432952880859e-01, /* 0x3fd9f323c0000000 */ |
| 4.15827870368957519531e-01, /* 0x3fda9cec80000000 */ |
| 4.26084339618682861328e-01, /* 0x3fdb44f740000000 */ |
| 4.36236739158630371094e-01, /* 0x3fdbeb4d80000000 */ |
| 4.46287095546722412109e-01, /* 0x3fdc8ff7c0000000 */ |
| 4.56237375736236572266e-01, /* 0x3fdd32fe40000000 */ |
| 4.66089725494384765625e-01, /* 0x3fddd46a00000000 */ |
| 4.75845873355865478516e-01, /* 0x3fde744240000000 */ |
| 4.85507786273956298828e-01, /* 0x3fdf128f40000000 */ |
| 4.95077252388000488281e-01, /* 0x3fdfaf5880000000 */ |
| 5.04556000232696533203e-01, /* 0x3fe02552a0000000 */ |
| 5.13945698738098144531e-01, /* 0x3fe0723e40000000 */ |
| 5.23248136043548583984e-01, /* 0x3fe0be72e0000000 */ |
| 5.32464742660522460938e-01, /* 0x3fe109f380000000 */ |
| 5.41597247123718261719e-01, /* 0x3fe154c3c0000000 */ |
| 5.50647079944610595703e-01, /* 0x3fe19ee6a0000000 */ |
| 5.59615731239318847656e-01, /* 0x3fe1e85f40000000 */ |
| 5.68504691123962402344e-01, /* 0x3fe23130c0000000 */ |
| 5.77315330505371093750e-01, /* 0x3fe2795e00000000 */ |
| 5.86049020290374755859e-01, /* 0x3fe2c0e9e0000000 */ |
| 5.94707071781158447266e-01, /* 0x3fe307d720000000 */ |
| 6.03290796279907226562e-01, /* 0x3fe34e2880000000 */ |
| 6.11801505088806152344e-01, /* 0x3fe393e0c0000000 */ |
| 6.20240390300750732422e-01, /* 0x3fe3d90260000000 */ |
| 6.28608644008636474609e-01, /* 0x3fe41d8fe0000000 */ |
| 6.36907458305358886719e-01, /* 0x3fe4618bc0000000 */ |
| 6.45137906074523925781e-01, /* 0x3fe4a4f840000000 */ |
| 6.53301239013671875000e-01, /* 0x3fe4e7d800000000 */ |
| 6.61398470401763916016e-01, /* 0x3fe52a2d20000000 */ |
| 6.69430613517761230469e-01, /* 0x3fe56bf9c0000000 */ |
| 6.77398800849914550781e-01, /* 0x3fe5ad4040000000 */ |
| 6.85303986072540283203e-01, /* 0x3fe5ee02a0000000 */ |
| 6.93147122859954833984e-01}; /* 0x3fe62e42e0000000 */ |
| |
| static const double ln_tail_table[65] = { |
| 0.00000000000000000000e+00, /* 0x0000000000000000 */ |
| 5.15092497094772879206e-09, /* 0x3e361f807c79f3db */ |
| 4.55457209735272790188e-08, /* 0x3e6873c1980267c8 */ |
| 2.86612990859791781788e-08, /* 0x3e5ec65b9f88c69e */ |
| 2.23596477332056055352e-08, /* 0x3e58022c54cc2f99 */ |
| 3.49498983167142274770e-08, /* 0x3e62c37a3a125330 */ |
| 3.23392843005887000414e-08, /* 0x3e615cad69737c93 */ |
| 1.35722380472479366661e-08, /* 0x3e4d256ab1b285e9 */ |
| 2.56504325268044191098e-08, /* 0x3e5b8abcb97a7aa2 */ |
| 5.81213608741512136843e-08, /* 0x3e6f34239659a5dc */ |
| 5.59374849578288093334e-08, /* 0x3e6e07fd48d30177 */ |
| 5.06615629004996189970e-08, /* 0x3e6b32df4799f4f6 */ |
| 5.24588857848400955725e-08, /* 0x3e6c29e4f4f21cf8 */ |
| 9.61968535632653505972e-10, /* 0x3e1086c848df1b59 */ |
| 1.34829655346594463137e-08, /* 0x3e4cf456b4764130 */ |
| 3.65557749306383026498e-08, /* 0x3e63a02ffcb63398 */ |
| 3.33431709374069198903e-08, /* 0x3e61e6a6886b0976 */ |
| 5.13008650536088382197e-08, /* 0x3e6b8abcb97a7aa2 */ |
| 5.09285070380306053751e-08, /* 0x3e6b578f8aa35552 */ |
| 3.20853940845502057341e-08, /* 0x3e6139c871afb9fc */ |
| 4.06713248643004200446e-08, /* 0x3e65d5d30701ce64 */ |
| 5.57028186706125221168e-08, /* 0x3e6de7bcb2d12142 */ |
| 5.48356693724804282546e-08, /* 0x3e6d708e984e1664 */ |
| 1.99407553679345001938e-08, /* 0x3e556945e9c72f36 */ |
| 1.96585517245087232086e-09, /* 0x3e20e2f613e85bda */ |
| 6.68649386072067321503e-09, /* 0x3e3cb7e0b42724f6 */ |
| 5.89936034642113390002e-08, /* 0x3e6fac04e52846c7 */ |
| 2.85038578721554472484e-08, /* 0x3e5e9b14aec442be */ |
| 5.09746772910284482606e-08, /* 0x3e6b5de8034e7126 */ |
| 5.54234668933210171467e-08, /* 0x3e6dc157e1b259d3 */ |
| 6.29100830926604004874e-09, /* 0x3e3b05096ad69c62 */ |
| 2.61974119468563937716e-08, /* 0x3e5c2116faba4cdd */ |
| 4.16752115011186398935e-08, /* 0x3e665fcc25f95b47 */ |
| 2.47747534460820790327e-08, /* 0x3e5a9a08498d4850 */ |
| 5.56922172017964209793e-08, /* 0x3e6de647b1465f77 */ |
| 2.76162876992552906035e-08, /* 0x3e5da71b7bf7861d */ |
| 7.08169709942321478061e-09, /* 0x3e3e6a6886b09760 */ |
| 5.77453510221151779025e-08, /* 0x3e6f0075eab0ef64 */ |
| 4.43021445893361960146e-09, /* 0x3e33071282fb989b */ |
| 3.15140984357495864573e-08, /* 0x3e60eb43c3f1bed2 */ |
| 2.95077445089736670973e-08, /* 0x3e5faf06ecb35c84 */ |
| 1.44098510263167149349e-08, /* 0x3e4ef1e63db35f68 */ |
| 1.05196987538551827693e-08, /* 0x3e469743fb1a71a5 */ |
| 5.23641361722697546261e-08, /* 0x3e6c1cdf404e5796 */ |
| 7.72099925253243069458e-09, /* 0x3e4094aa0ada625e */ |
| 5.62089493829364197156e-08, /* 0x3e6e2d4c96fde3ec */ |
| 3.53090261098577946927e-08, /* 0x3e62f4d5e9a98f34 */ |
| 3.80080516835568242269e-08, /* 0x3e6467c96ecc5cbe */ |
| 5.66961038386146408282e-08, /* 0x3e6e7040d03dec5a */ |
| 4.42287063097349852717e-08, /* 0x3e67bebf4282de36 */ |
| 3.45294525105681104660e-08, /* 0x3e6289b11aeb783f */ |
| 2.47132034530447431509e-08, /* 0x3e5a891d1772f538 */ |
| 3.59655343422487209774e-08, /* 0x3e634f10be1fb591 */ |
| 5.51581770357780862071e-08, /* 0x3e6d9ce1d316eb93 */ |
| 3.60171867511861372793e-08, /* 0x3e63562a19a9c442 */ |
| 1.94511067964296180547e-08, /* 0x3e54e2adf548084c */ |
| 1.54137376631349347838e-08, /* 0x3e508ce55cc8c97a */ |
| 3.93171034490174464173e-09, /* 0x3e30e2f613e85bda */ |
| 5.52990607758839766440e-08, /* 0x3e6db03ebb0227bf */ |
| 3.29990737637586136511e-08, /* 0x3e61b75bb09cb098 */ |
| 1.18436010922446096216e-08, /* 0x3e496f16abb9df22 */ |
| 4.04248680368301346709e-08, /* 0x3e65b3f399411c62 */ |
| 2.27418915900284316293e-08, /* 0x3e586b3e59f65355 */ |
| 1.70263791333409206020e-08, /* 0x3e52482ceae1ac12 */ |
| 5.76999904754328540596e-08}; /* 0x3e6efa39ef35793c */ |
| |
| /* Approximating polynomial coefficients for x near 1.0 */ |
| static const double |
| ca_1 = 8.33333333333317923934e-02, /* 0x3fb55555555554e6 */ |
| ca_2 = 1.25000000037717509602e-02, /* 0x3f89999999bac6d4 */ |
| ca_3 = 2.23213998791944806202e-03, /* 0x3f62492307f1519f */ |
| ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */ |
| |
| /* Approximating polynomial coefficients for other x */ |
| static const double |
| cb_1 = 8.33333333333333593622e-02, /* 0x3fb5555555555557 */ |
| cb_2 = 1.24999999978138668903e-02, /* 0x3f89999999865ede */ |
| cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */ |
| |
| static const unsigned long long |
| log_thresh1 = 0x3fee0faa00000000, |
| log_thresh2 = 0x3ff1082c00000000; |
| |
| /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000 |
| log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */ |
| if (ux >= log_thresh1 && ux <= log_thresh2) |
| { |
| /* Arguments close to 1.0 are handled separately to maintain |
| accuracy. |
| |
| The approximation in this region exploits the identity |
| log( 1 + r ) = log( 1 + u/2 ) / log( 1 - u/2 ), where |
| u = 2r / (2+r). |
| Note that the right hand side has an odd Taylor series expansion |
| which converges much faster than the Taylor series expansion of |
| log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by |
| u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1). |
| |
| One subtlety is that since u cannot be calculated from |
| r exactly, the rounding error in the first u should be |
| avoided if possible. To accomplish this, we observe that |
| u = r - r*r/(2+r). |
| Since x (=1+r) is the input argument, and thus presumed exact, |
| the formula above approximates u accurately because |
| u = r - correction, |
| and the magnitude of "correction" (of the order of r*r) |
| is small. |
| With these observations, we will approximate log( 1 + r ) by |
| r + ( (A1*u^3 + ... + An*u^(2n+1)) - correction ). |
| |
| We approximate log(1+r) by an odd polynomial in u, where |
| u = 2r/(2+r) = r - r*r/(2+r). |
| */ |
| r = x - 1.0; |
| u = r / (2.0 + r); |
| correction = r * u; |
| u = u + u; |
| v = u * u; |
| z1 = r; |
| z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction); |
| *r1 = z1; |
| *r2 = z2; |
| *xexp = 0; |
| } |
| else |
| { |
| /* |
| First, we decompose the argument x to the form |
| x = 2**M * (F1 + F2), |
| where 1 <= F1+F2 < 2, M has the value of an integer, |
| F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128. |
| |
| Second, we approximate log( 1 + F2/F1 ) by an odd polynomial |
| in U, where U = 2 F2 / (2 F2 + F1). |
| Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ). |
| The core approximation calculates |
| Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U - 1. |
| Note that log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ), |
| thus, Poly = 2 arctanh( U/2 ) / U - 1. |
| |
| It is not hard to see that |
| log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ). |
| Hence, we return Z1 = log(F1), and Z2 = log( 1 + F2/F1). |
| The values of log(F1) are calculated beforehand and stored |
| in the program. |
| */ |
| |
| f = x; |
| if (ux < IMPBIT_DP64) |
| { |
| /* The input argument x is denormalized */ |
| /* Normalize f by increasing the exponent by 60 |
| and subtracting a correction to account for the implicit |
| bit. This replaces a slow denormalized |
| multiplication by a fast normal subtraction. */ |
| static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ |
| GET_BITS_DP64(f, ux); |
| ux |= 0x03d0000000000000; |
| PUT_BITS_DP64(ux, f); |
| f -= corr; |
| GET_BITS_DP64(f, ux); |
| expadjust = 60; |
| } |
| else |
| expadjust = 0; |
| |
| /* Store the exponent of x in xexp and put |
| f into the range [0.5,1) */ |
| *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust; |
| PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f); |
| |
| /* Now x = 2**xexp * f, 1/2 <= f < 1. */ |
| |
| /* Set index to be the nearest integer to 128*f */ |
| r = 128.0 * f; |
| index = (int)(r + 0.5); |
| |
| z1 = ln_lead_table[index-64]; |
| q = ln_tail_table[index-64]; |
| f1 = index * 0.0078125; /* 0.0078125 = 1/128 */ |
| f2 = f - f1; |
| /* At this point, x = 2**xexp * ( f1 + f2 ) where |
| f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */ |
| |
| /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */ |
| /* u = f2 / (f1 + 0.5 * f2); */ |
| u = f2 / (f1 + 0.5 * f2); |
| |
| /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1). |
| The core approximation calculates |
| poly = [log(1 + u/2) - log(1 - u/2)]/u - 1 */ |
| v = u * u; |
| poly = (v * (cb_1 + v * (cb_2 + v * cb_3))); |
| z2 = q + (u + u * poly); |
| *r1 = z1; |
| *r2 = z2; |
| } |
| return; |
| } |
| #endif /* USE_LOG_KERNEL_AMD */ |
| |
| #if defined(USE_REMAINDER_PIBY2F_INLINE) |
| /* Define this to get debugging print statements activated */ |
| #define DEBUGGING_PRINT |
| #undef DEBUGGING_PRINT |
| |
| |
| #ifdef DEBUGGING_PRINT |
| #include <stdio.h> |
| char *d2b(long long d, int bitsper, int point) |
| { |
| static char buff[200]; |
| int i, j; |
| j = bitsper; |
| if (point >= 0 && point <= bitsper) |
| j++; |
| buff[j] = '\0'; |
| for (i = bitsper - 1; i >= 0; i--) |
| { |
| j--; |
| if (d % 2 == 1) |
| buff[j] = '1'; |
| else |
| buff[j] = '0'; |
| if (i == point) |
| { |
| j--; |
| buff[j] = '.'; |
| } |
| d /= 2; |
| } |
| return buff; |
| } |
| #endif |
| |
| /* Given positive argument x, reduce it to the range [-pi/4,pi/4] using |
| extra precision, and return the result in r. |
| Return value "region" tells how many lots of pi/2 were subtracted |
| from x to put it in the range [-pi/4,pi/4], mod 4. */ |
| static inline void __remainder_piby2f_inline(unsigned long long ux, double *r, int *region) |
| { |
| |
| /* This method simulates multi-precision floating-point |
| arithmetic and is accurate for all 1 <= x < infinity */ |
| #define bitsper 36 |
| unsigned long long res[10]; |
| unsigned long long u, carry, mask, mant, nextbits; |
| int first, last, i, rexp, xexp, resexp, ltb, determ, bc; |
| double dx; |
| static const double |
| piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */ |
| #ifdef WINDOWS |
| static unsigned long long pibits[] = |
| { |
| 0LL, |
| 5215LL, 13000023176LL, 11362338026LL, 67174558139LL, |
| 34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL, |
| 19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL |
| }; |
| #else |
| static unsigned long long pibits[] = |
| { |
| 0L, |
| 5215L, 13000023176L, 11362338026L, 67174558139L, |
| 34819822259L, 10612056195L, 67816420731L, 57840157550L, |
| 19558516809L, 50025467026L, 25186875954L, 18152700886L |
| }; |
| #endif |
| |
| #ifdef DEBUGGING_PRINT |
| printf("On entry, x = %25.20e = %s\n", x, double2hex(&x)); |
| #endif |
| |
| xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); |
| ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29; |
| |
| #ifdef DEBUGGING_PRINT |
| printf("ux = %s\n", d2b(ux, 64, -1)); |
| #endif |
| |
| /* Now ux is the mantissa bit pattern of x as a long integer */ |
| mask = 1; |
| mask = (mask << bitsper) - 1; |
| |
| /* Set first and last to the positions of the first |
| and last chunks of 2/pi that we need */ |
| first = xexp / bitsper; |
| resexp = xexp - first * bitsper; |
| /* 120 is the theoretical maximum number of bits (actually |
| 115 for IEEE single precision) that we need to extract |
| from the middle of 2/pi to compute the reduced argument |
| accurately enough for our purposes */ |
| last = first + 120 / bitsper; |
| |
| #ifdef DEBUGGING_PRINT |
| printf("first = %d, last = %d\n", first, last); |
| #endif |
| |
| /* Do a long multiplication of the bits of 2/pi by the |
| integer mantissa */ |
| /* Unroll the loop. This is only correct because we know |
| that bitsper is fixed as 36. */ |
| res[4] = 0; |
| u = pibits[last] * ux; |
| res[3] = u & mask; |
| carry = u >> bitsper; |
| u = pibits[last - 1] * ux + carry; |
| res[2] = u & mask; |
| carry = u >> bitsper; |
| u = pibits[last - 2] * ux + carry; |
| res[1] = u & mask; |
| carry = u >> bitsper; |
| u = pibits[first] * ux + carry; |
| res[0] = u & mask; |
| |
| #ifdef DEBUGGING_PRINT |
| printf("resexp = %d\n", resexp); |
| printf("Significant part of x * 2/pi with binary" |
| " point in correct place:\n"); |
| for (i = 0; i <= last - first; i++) |
| { |
| if (i > 0 && i % 5 == 0) |
| printf("\n "); |
| if (i == 1) |
| printf("%s ", d2b(res[i], bitsper, resexp)); |
| else |
| printf("%s ", d2b(res[i], bitsper, -1)); |
| } |
| printf("\n"); |
| #endif |
| |
| /* Reconstruct the result */ |
| ltb = (int)((((res[0] << bitsper) | res[1]) |
| >> (bitsper - 1 - resexp)) & 7); |
| |
| /* determ says whether the fractional part is >= 0.5 */ |
| determ = ltb & 1; |
| |
| #ifdef DEBUGGING_PRINT |
| printf("ltb = %d (last two bits before binary point" |
| " and first bit after)\n", ltb); |
| printf("determ = %d (1 means need to negate because the fractional\n" |
| " part of x * 2/pi is greater than 0.5)\n", determ); |
| #endif |
| |
| i = 1; |
| if (determ) |
| { |
| /* The mantissa is >= 0.5. We want to subtract it |
| from 1.0 by negating all the bits */ |
| *region = ((ltb >> 1) + 1) & 3; |
| mant = 1; |
| mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1); |
| while (mant < 0x0000000000010000) |
| { |
| i++; |
| mant = (mant << bitsper) | (~(res[i]) & mask); |
| } |
| nextbits = (~(res[i+1]) & mask); |
| } |
| else |
| { |
| *region = (ltb >> 1); |
| mant = 1; |
| mant = res[1] & ((mant << (bitsper - resexp)) - 1); |
| while (mant < 0x0000000000010000) |
| { |
| i++; |
| mant = (mant << bitsper) | res[i]; |
| } |
| nextbits = res[i+1]; |
| } |
| |
| #ifdef DEBUGGING_PRINT |
| printf("First bits of mant = %s\n", d2b(mant, bitsper, -1)); |
| #endif |
| |
| /* Normalize the mantissa. The shift value 6 here, determined by |
| trial and error, seems to give optimal speed. */ |
| bc = 0; |
| while (mant < 0x0000400000000000LL) |
| { |
| bc += 6; |
| mant <<= 6; |
| } |
| while (mant < 0x0010000000000000LL) |
| { |
| bc++; |
| mant <<= 1; |
| } |
| mant |= nextbits >> (bitsper - bc); |
| |
| rexp = 52 + resexp - bc - i * bitsper; |
| |
| #ifdef DEBUGGING_PRINT |
| printf("Normalised mantissa = 0x%016lx\n", mant); |
| printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp); |
| #endif |
| |
| /* Put the result exponent rexp onto the mantissa pattern */ |
| u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64; |
| ux = (mant & MANTBITS_DP64) | u; |
| if (determ) |
| /* If we negated the mantissa we negate x too */ |
| ux |= SIGNBIT_DP64; |
| PUT_BITS_DP64(ux, dx); |
| |
| #ifdef DEBUGGING_PRINT |
| printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx)); |
| #endif |
| |
| /* x is a double precision version of the fractional part of |
| x * 2 / pi. Multiply x by pi/2 in double precision |
| to get the reduced argument r. */ |
| *r = dx * piby2; |
| |
| #ifdef DEBUGGING_PRINT |
| printf(" r = frac(x*2/pi) * pi/2:\n"); |
| printf(" r = %25.20e = %s\n", *r, double2hex(r)); |
| printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n", |
| *region); |
| #endif |
| } |
| #endif /* USE_REMAINDER_PIBY2F_INLINE */ |
| |
| #if defined(WINDOWS) |
| #if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF) |
| #include <errno.h> |
| #endif |
| |
| #if defined(USE_HANDLE_ERROR) |
| /* Define the Microsoft specific error handling routines */ |
| static __declspec(noinline) double handle_error(const char *name, |
| unsigned long long value, |
| int type, int flags, int error, |
| double arg1, double arg2) |
| { |
| double z; |
| struct _exception exception_data; |
| exception_data.type = type; |
| exception_data.name = (char*)name; |
| exception_data.arg1 = arg1; |
| exception_data.arg2 = arg2; |
| PUT_BITS_DP64(value, z); |
| exception_data.retval = z; |
| raise_fpsw_flags(flags); |
| if (!_matherr(&exception_data)) |
| { |
| errno = error; |
| } |
| return exception_data.retval; |
| } |
| #endif /* USE_HANDLE_ERROR */ |
| |
| #if defined(USE_HANDLE_ERRORF) |
| static __declspec(noinline) float handle_errorf(const char *name, |
| unsigned int value, |
| int type, int flags, int error, |
| float arg1, float arg2) |
| { |
| float z; |
| struct _exception exception_data; |
| exception_data.type = type; |
| exception_data.name = (char*)name; |
| exception_data.arg1 = (double)arg1; |
| exception_data.arg2 = (double)arg2; |
| PUT_BITS_SP32(value, z); |
| exception_data.retval = z; |
| raise_fpsw_flags(flags); |
| if (!_matherr(&exception_data)) |
| { |
| errno = error; |
| } |
| return (float)exception_data.retval; |
| } |
| #endif /* USE_HANDLE_ERRORF */ |
| #endif /* WINDOWS */ |
| |
| #endif /* LIBM_INLINES_AMD_H_INCLUDED */ |