| /** |
| * This file has no copyright assigned and is placed in the Public Domain. |
| * This file is part of the mingw-w64 runtime package. |
| * No warranty is given; refer to the file DISCLAIMER.PD within this package. |
| */ |
| long double fmal(long double x, long double y, long double z); |
| |
| #if defined(_ARM_) || defined(__arm__) || defined(_ARM64_) || defined(__aarch64__) |
| |
| double fma(double x, double y, double z); |
| |
| /* On ARM `long double` is 64 bits. And ARM has hardware FMA. */ |
| long double fmal(long double x, long double y, long double z){ |
| return fma(x, y, z); |
| } |
| |
| #elif defined(_AMD64_) || defined(__x86_64__) || defined(_X86_) || defined(__i386__) |
| |
| /** |
| * x87-specific software-emulated FMA by LH_Mouse (lh_mouse at 126 dot com). |
| * This file is donated to the mingw-w64 project. |
| * Note: This file requires C99 support to compile. |
| */ |
| |
| #include <math.h> |
| #include <stdint.h> |
| #include <limits.h> |
| #include <stdbool.h> |
| |
| /* https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format */ |
| typedef union x87reg_ { |
| struct __attribute__((__packed__)) { |
| union { |
| uint64_t f64; |
| struct { |
| uint32_t flo; |
| uint32_t fhi; |
| }; |
| }; |
| uint16_t exp : 15; |
| uint16_t sgn : 1; |
| }; |
| long double f; |
| } x87reg; |
| |
| static inline void break_down(x87reg *restrict lo, x87reg *restrict hi, long double x){ |
| hi->f = x; |
| const uint32_t flo = hi->flo; |
| const long exp = hi->exp; |
| const bool sgn = hi->sgn; |
| /* Erase low-order significant bits. `hi->f` now has only 32 significant bits. */ |
| hi->flo = 0; |
| |
| if(flo == 0){ |
| /* If the low-order significant bits are all zeroes, return zero in `lo->f`. */ |
| lo->f64 = 0; |
| lo->exp = 0; |
| } else { |
| /* How many bits should we shift to normalize the floating point value? */ |
| const long shn = __builtin_clzl(flo) - (sizeof(long) - sizeof(uint32_t)) * CHAR_BIT + 32; |
| #if 0 /* Naive implementation */ |
| if(shn < exp){ |
| /* `x` can be normalized, normalize it. */ |
| lo->f64 = (uint64_t)flo << shn; |
| lo->exp = (exp - shn) & 0x7FFF; |
| } else { |
| /* Otherwise, go with a denormal number. */ |
| if(exp > 0){ |
| /* Denormalize the source normal number. */ |
| lo->f64 = (uint64_t)flo << (exp - 1); |
| } else { |
| /* Leave the source denormal number as is. */ |
| lo->f64 = flo; |
| } |
| lo->exp = 0; |
| } |
| #else /* Optimal implementation */ |
| const long mask = (shn - exp) >> 31; /* mask = (shn < exp) ? -1 : 0 */ |
| long expm1 = exp - 1; |
| expm1 &= ~(expm1 >> 31); /* expm1 = (exp - 1 >= 0) ? (exp - 1) : 0 */ |
| lo->f64 = (uint64_t)flo << (((shn ^ expm1) & mask) ^ expm1); |
| /* f64 = flo << ((shn < exp) ? shn : expm1) */ |
| lo->exp = (exp - shn) & mask; /* exp = (shn < exp) ? (exp - shn) : 0 */ |
| #endif |
| } |
| lo->sgn = sgn; |
| } |
| static inline long double fpu_fma(long double x, long double y, long double z){ |
| /* |
| POSIX-2013: |
| 1. If x or y are NaN, a NaN shall be returned. |
| 2. If x multiplied by y is an exact infinity and z is also an infinity |
| but with the opposite sign, a domain error shall occur, and either a NaN |
| (if supported), or an implementation-defined value shall be returned. |
| 3. If one of x and y is infinite, the other is zero, and z is not a NaN, |
| a domain error shall occur, and either a NaN (if supported), or an |
| implementation-defined value shall be returned. |
| 4. If one of x and y is infinite, the other is zero, and z is a NaN, a NaN |
| shall be returned and a domain error may occur. |
| 5. If x* y is not 0*Inf nor Inf*0 and z is a NaN, a NaN shall be returned. |
| */ |
| if(__fpclassifyl(x) == FP_NAN){ |
| return x; /* Handle case 1. */ |
| } |
| if(__fpclassifyl(y) == FP_NAN){ |
| return y; /* Handle case 1. */ |
| } |
| /* Handle case 2, 3 and 4 universally. Thanks to x87 a NaN is generated |
| if an INF is multiplied with zero, saving us a huge amount of work. */ |
| const long double xy = x * y; |
| if(__fpclassifyl(xy) == FP_NAN){ |
| return xy; /* Handle case 2, 3 and 4. */ |
| } |
| if(__fpclassifyl(z) == FP_NAN){ |
| return z; /* Handle case 5. */ |
| } |
| /* Check whether the result is finite. */ |
| const long double xyz = xy + z; |
| const int cxyz = __fpclassifyl(xyz); |
| if((cxyz == FP_NAN) || (cxyz == FP_INFINITE)){ |
| return xyz; /* If this naive check doesn't yield a finite value, the FMA isn't |
| likely to return one either. Forward the value as is. */ |
| } |
| |
| long double ret; |
| x87reg xlo, xhi, ylo, yhi; |
| break_down(&xlo, &xhi, x); |
| break_down(&ylo, &yhi, y); |
| /* The order of these four statements is essential. Don't move them around. */ |
| ret = z; |
| ret += xhi.f * yhi.f; /* The most significant item comes first. */ |
| ret += xhi.f * ylo.f + xlo.f * yhi.f; /* They are equally significant. */ |
| ret += xlo.f * ylo.f; /* The least significant item comes last. */ |
| return ret; |
| } |
| |
| long double fmal(long double x, long double y, long double z){ |
| return fpu_fma(x, y, z); |
| } |
| |
| #else |
| |
| #error Please add FMA implementation for this platform. |
| |
| #endif |