| |
| /* |
| * Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| * |
| * This file is part of libacml_mv. |
| * |
| * libacml_mv is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * libacml_mv is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with libacml_mv. If not, see |
| * <http://www.gnu.org/licenses/>. |
| * |
| */ |
| |
| |
| #include "../inc/libm_amd.h" |
| #include "../inc/libm_util_amd.h" |
| |
| #define USE_NAN_WITH_FLAGS |
| #define USE_HANDLE_ERROR |
| #define USE_LOG_KERNEL_AMD |
| #include "../inc/libm_inlines_amd.h" |
| #undef USE_NAN_WITH_FLAGS |
| #undef USE_HANDLE_ERROR |
| #undef USE_LOG_KERNEL_AMD |
| |
| #include "../inc/libm_errno_amd.h" |
| |
| #ifndef WINDOWS |
| /* Deal with errno for out-of-range argument */ |
| static inline double retval_errno_edom(double x) |
| { |
| struct exception exc; |
| exc.arg1 = x; |
| exc.arg2 = x; |
| exc.type = DOMAIN; |
| exc.name = (char *)"acosh"; |
| if (_LIB_VERSION == _SVID_) |
| exc.retval = -HUGE; |
| else |
| exc.retval = nan_with_flags(AMD_F_INVALID); |
| if (_LIB_VERSION == _POSIX_) |
| __set_errno(EDOM); |
| else if (!matherr(&exc)) |
| { |
| if(_LIB_VERSION == _SVID_) |
| (void)fputs("acosh: DOMAIN error\n", stderr); |
| __set_errno(EDOM); |
| } |
| return exc.retval; |
| } |
| #endif |
| |
| #undef _FUNCNAME |
| #define _FUNCNAME "acosh" |
| double FN_PROTOTYPE(acosh)(double x) |
| { |
| |
| unsigned long long ux; |
| double r, rarg, r1, r2; |
| int xexp; |
| |
| static const unsigned long long |
| recrteps = 0x4196a09e667f3bcd; /* 1/sqrt(eps) = 9.49062656242515593767e+07 */ |
| /* log2_lead and log2_tail sum to an extra-precise version |
| of log(2) */ |
| |
| static const double |
| log2_lead = 6.93147122859954833984e-01, /* 0x3fe62e42e0000000 */ |
| log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */ |
| |
| |
| GET_BITS_DP64(x, ux); |
| |
| if ((ux & EXPBITS_DP64) == EXPBITS_DP64) |
| { |
| /* x is either NaN or infinity */ |
| if (ux & MANTBITS_DP64) |
| { |
| /* x is NaN */ |
| #ifdef WINDOWS |
| return handle_error(_FUNCNAME, ux|0x0008000000000000, _DOMAIN, |
| AMD_F_INVALID, EDOM, x, 0.0); |
| #else |
| return x + x; /* Raise invalid if it is a signalling NaN */ |
| #endif |
| } |
| else |
| { |
| /* x is infinity */ |
| if (ux & SIGNBIT_DP64) |
| /* x is negative infinity. Return a NaN. */ |
| #ifdef WINDOWS |
| return handle_error(_FUNCNAME, INDEFBITPATT_DP64, _DOMAIN, |
| AMD_F_INVALID, EDOM, x, 0.0); |
| #else |
| return retval_errno_edom(x); |
| #endif |
| else |
| /* Return positive infinity with no signal */ |
| return x; |
| } |
| } |
| else if ((ux & SIGNBIT_DP64) || (ux <= 0x3ff0000000000000)) |
| { |
| /* x <= 1.0 */ |
| if (ux == 0x3ff0000000000000) |
| { |
| /* x = 1.0; return zero. */ |
| return 0.0; |
| } |
| else |
| { |
| /* x is less than 1.0. Return a NaN. */ |
| #ifdef WINDOWS |
| return handle_error(_FUNCNAME, INDEFBITPATT_DP64, _DOMAIN, |
| AMD_F_INVALID, EDOM, x, 0.0); |
| #else |
| return retval_errno_edom(x); |
| #endif |
| } |
| } |
| |
| |
| if (ux > recrteps) |
| { |
| /* Arguments greater than 1/sqrt(epsilon) in magnitude are |
| approximated by acosh(x) = ln(2) + ln(x) */ |
| /* log_kernel_amd(x) returns xexp, r1, r2 such that |
| log(x) = xexp*log(2) + r1 + r2 */ |
| log_kernel_amd64(x, ux, &xexp, &r1, &r2); |
| /* Add (xexp+1) * log(2) to z1,z2 to get the result acosh(x). |
| The computed r1 is not subject to rounding error because |
| (xexp+1) has at most 10 significant bits, log(2) has 24 significant |
| bits, and r1 has up to 24 bits; and the exponents of r1 |
| and r2 differ by at most 6. */ |
| r1 = ((xexp+1) * log2_lead + r1); |
| r2 = ((xexp+1) * log2_tail + r2); |
| return r1 + r2; |
| } |
| else if (ux >= 0x4060000000000000) |
| { |
| /* 128.0 <= x <= 1/sqrt(epsilon) */ |
| /* acosh for these arguments is approximated by |
| acosh(x) = ln(x + sqrt(x*x-1)) */ |
| rarg = x*x-1.0; |
| /* Use assembly instruction to compute r = sqrt(rarg); */ |
| ASMSQRT(rarg,r); |
| r += x; |
| GET_BITS_DP64(r, ux); |
| log_kernel_amd64(r, ux, &xexp, &r1, &r2); |
| r1 = (xexp * log2_lead + r1); |
| r2 = (xexp * log2_tail + r2); |
| return r1 + r2; |
| } |
| else |
| { |
| /* 1.0 < x <= 128.0 */ |
| double u1, u2, v1, v2, w1, w2, hx, tx, t, r, s, p1, p2, a1, a2, c1, c2, |
| poly; |
| if (ux >= 0x3ff8000000000000) |
| { |
| /* 1.5 <= x <= 128.0 */ |
| /* We use minimax polynomials, |
| based on Abramowitz and Stegun 4.6.32 series |
| expansion for acosh(x), with the log(2x) and 1/(2.2.x^2) |
| terms removed. We compensate for these two terms later. |
| */ |
| t = x*x; |
| if (ux >= 0x4040000000000000) |
| { |
| /* [3,2] for 32.0 <= x <= 128.0 */ |
| poly = |
| (0.45995704464157438175e-9 + |
| (-0.89080839823528631030e-9 + |
| (-0.10370522395596168095e-27 + |
| 0.35255386405811106347e-32 * t) * t) * t) / |
| (0.21941191335882074014e-8 + |
| (-0.10185073058358334569e-7 + |
| 0.95019562478430648685e-8 * t) * t); |
| } |
| else if (ux >= 0x4020000000000000) |
| { |
| /* [3,3] for 8.0 <= x <= 32.0 */ |
| poly = |
| (-0.54903656589072526589e-10 + |
| (0.27646792387218569776e-9 + |
| (-0.26912957240626571979e-9 - |
| 0.86712268396736384286e-29 * t) * t) * t) / |
| (-0.24327683788655520643e-9 + |
| (0.20633757212593175571e-8 + |
| (-0.45438330985257552677e-8 + |
| 0.28707154390001678580e-8 * t) * t) * t); |
| } |
| else if (ux >= 0x4010000000000000) |
| { |
| /* [4,3] for 4.0 <= x <= 8.0 */ |
| poly = |
| (-0.20827370596738166108e-6 + |
| (0.10232136919220422622e-5 + |
| (-0.98094503424623656701e-6 + |
| (-0.11615338819596146799e-18 + |
| 0.44511847799282297160e-21 * t) * t) * t) * t) / |
| (-0.92579451630913718588e-6 + |
| (0.76997374707496606639e-5 + |
| (-0.16727286999128481170e-4 + |
| 0.10463413698762590251e-4 * t) * t) * t); |
| } |
| else if (ux >= 0x4000000000000000) |
| { |
| /* [5,5] for 2.0 <= x <= 4.0 */ |
| poly = |
| (-0.122195030526902362060e-7 + |
| (0.157894522814328933143e-6 + |
| (-0.579951798420930466109e-6 + |
| (0.803568881125803647331e-6 + |
| (-0.373906657221148667374e-6 - |
| 0.317856399083678204443e-21 * t) * t) * t) * t) * t) / |
| (-0.516260096352477148831e-7 + |
| (0.894662592315345689981e-6 + |
| (-0.475662774453078218581e-5 + |
| (0.107249291567405130310e-4 + |
| (-0.107871445525891289759e-4 + |
| 0.398833767702587224253e-5 * t) * t) * t) * t) * t); |
| } |
| else if (ux >= 0x3ffc000000000000) |
| { |
| /* [5,4] for 1.75 <= x <= 2.0 */ |
| poly = |
| (0.1437926821253825186e-3 + |
| (-0.1034078230246627213e-2 + |
| (0.2015310005461823437e-2 + |
| (-0.1159685218876828075e-2 + |
| (-0.9267353551307245327e-11 + |
| 0.2880267770324388034e-12 * t) * t) * t) * t) * t) / |
| (0.6305521447028109891e-3 + |
| (-0.6816525887775002944e-2 + |
| (0.2228081831550003651e-1 + |
| (-0.2836886105406603318e-1 + |
| 0.1236997707206036752e-1 * t) * t) * t) * t); |
| } |
| else |
| { |
| /* [5,4] for 1.5 <= x <= 1.75 */ |
| poly = |
| ( 0.7471936607751750826e-3 + |
| (-0.4849405284371905506e-2 + |
| (0.8823068059778393019e-2 + |
| (-0.4825395461288629075e-2 + |
| (-0.1001984320956564344e-8 + |
| 0.4299919281586749374e-10 * t) * t) * t) * t) * t) / |
| (0.3322359141239411478e-2 + |
| (-0.3293525930397077675e-1 + |
| (0.1011351440424239210e0 + |
| (-0.1227083591622587079e0 + |
| 0.5147099404383426080e-1 * t) * t) * t) * t); |
| } |
| GET_BITS_DP64(x, ux); |
| log_kernel_amd64(x, ux, &xexp, &r1, &r2); |
| r1 = ((xexp+1) * log2_lead + r1); |
| r2 = ((xexp+1) * log2_tail + r2); |
| /* Now (r1,r2) sum to log(2x). Subtract the term |
| 1/(2.2.x^2) = 0.25/t, and add poly/t, carefully |
| to maintain precision. (Note that we add poly/t |
| rather than poly because of the *x factor used |
| when generating the minimax polynomial) */ |
| v2 = (poly-0.25)/t; |
| r = v2 + r1; |
| s = ((r1 - r) + v2) + r2; |
| v1 = r + s; |
| return v1 + ((r - v1) + s); |
| } |
| |
| /* Here 1.0 <= x <= 1.5. It is hard to maintain accuracy here so |
| we have to go to great lengths to do so. */ |
| |
| /* We compute the value |
| t = x - 1.0 + sqrt(2.0*(x - 1.0) + (x - 1.0)*(x - 1.0)) |
| using simulated quad precision. */ |
| t = x - 1.0; |
| u1 = t * 2.0; |
| |
| /* dekker_mul12(t,t,&v1,&v2); */ |
| GET_BITS_DP64(t, ux); |
| ux &= 0xfffffffff8000000; |
| PUT_BITS_DP64(ux, hx); |
| tx = t - hx; |
| v1 = t * t; |
| v2 = (((hx * hx - v1) + hx * tx) + tx * hx) + tx * tx; |
| |
| /* dekker_add2(u1,0.0,v1,v2,&w1,&w2); */ |
| r = u1 + v1; |
| s = (((u1 - r) + v1) + v2); |
| w1 = r + s; |
| w2 = (r - w1) + s; |
| |
| /* dekker_sqrt2(w1,w2,&u1,&u2); */ |
| ASMSQRT(w1,p1); |
| GET_BITS_DP64(p1, ux); |
| ux &= 0xfffffffff8000000; |
| PUT_BITS_DP64(ux, c1); |
| c2 = p1 - c1; |
| a1 = p1 * p1; |
| a2 = (((c1 * c1 - a1) + c1 * c2) + c2 * c1) + c2 * c2; |
| p2 = (((w1 - a1) - a2) + w2) * 0.5 / p1; |
| u1 = p1 + p2; |
| u2 = (p1 - u1) + p2; |
| |
| /* dekker_add2(u1,u2,t,0.0,&v1,&v2); */ |
| r = u1 + t; |
| s = (((u1 - r) + t)) + u2; |
| r1 = r + s; |
| r2 = (r - r1) + s; |
| t = r1 + r2; |
| |
| /* Check for x close to 1.0. */ |
| if (x < 1.13) |
| { |
| /* Here 1.0 <= x < 1.13 implies r <= 0.656. In this region |
| we need to take extra care to maintain precision. |
| We have t = r1 + r2 = (x - 1.0 + sqrt(x*x-1.0)) |
| to more than basic precision. We use the Taylor series |
| for log(1+x), with terms after the O(x*x) term |
| approximated by a [6,6] minimax polynomial. */ |
| double b1, b2, c1, c2, e1, e2, q1, q2, c, cc, hr1, tr1, hpoly, tpoly, hq1, tq1, hr2, tr2; |
| poly = |
| (0.30893760556597282162e-21 + |
| (0.10513858797132174471e0 + |
| (0.27834538302122012381e0 + |
| (0.27223638654807468186e0 + |
| (0.12038958198848174570e0 + |
| (0.23357202004546870613e-1 + |
| (0.15208417992520237648e-2 + |
| 0.72741030690878441996e-7 * t) * t) * t) * t) * t) * t) * t) / |
| (0.31541576391396523486e0 + |
| (0.10715979719991342022e1 + |
| (0.14311581802952004012e1 + |
| (0.94928647994421895988e0 + |
| (0.32396235926176348977e0 + |
| (0.52566134756985833588e-1 + |
| 0.30477895574211444963e-2 * t) * t) * t) * t) * t) * t); |
| |
| /* Now we can compute the result r = acosh(x) = log1p(t) |
| using the formula t - 0.5*t*t + poly*t*t. Since t is |
| represented as r1+r2, the formula becomes |
| r = r1+r2 - 0.5*(r1+r2)*(r1+r2) + poly*(r1+r2)*(r1+r2). |
| Expanding out, we get |
| r = r1 + r2 - (0.5 + poly)*(r1*r1 + 2*r1*r2 + r2*r2) |
| and ignoring negligible quantities we get |
| r = r1 + r2 - 0.5*r1*r1 + r1*r2 + poly*t*t |
| */ |
| if (x < 1.06) |
| { |
| double b, c, e; |
| b = r1*r2; |
| c = 0.5*r1*r1; |
| e = poly*t*t; |
| /* N.B. the order of additions and subtractions is important */ |
| r = (((r2 - b) + e) - c) + r1; |
| return r; |
| } |
| else |
| { |
| /* For 1.06 <= x <= 1.13 we must evaluate in extended precision |
| to reach about 1 ulp accuracy (in this range the simple code |
| above only manages about 1.5 ulp accuracy) */ |
| |
| /* Split poly, r1 and r2 into head and tail sections */ |
| GET_BITS_DP64(poly, ux); |
| ux &= 0xfffffffff8000000; |
| PUT_BITS_DP64(ux,hpoly); |
| tpoly = poly - hpoly; |
| GET_BITS_DP64(r1,ux); |
| ux &= 0xfffffffff8000000; |
| PUT_BITS_DP64(ux,hr1); |
| tr1 = r1 - hr1; |
| GET_BITS_DP64(r2, ux); |
| ux &= 0xfffffffff8000000; |
| PUT_BITS_DP64(ux,hr2); |
| tr2 = r2 - hr2; |
| |
| /* e = poly*t*t */ |
| c = poly * r1; |
| cc = (((hpoly * hr1 - c) + hpoly * tr1) + tpoly * hr1) + tpoly * tr1; |
| cc = poly * r2 + cc; |
| q1 = c + cc; |
| q2 = (c - q1) + cc; |
| GET_BITS_DP64(q1, ux); |
| ux &= 0xfffffffff8000000; |
| PUT_BITS_DP64(ux,hq1); |
| tq1 = q1 - hq1; |
| c = q1 * r1; |
| cc = (((hq1 * hr1 - c) + hq1 * tr1) + tq1 * hr1) + tq1 * tr1; |
| cc = q1 * r2 + q2 * r1 + cc; |
| e1 = c + cc; |
| e2 = (c - e1) + cc; |
| |
| /* b = r1*r2 */ |
| b1 = r1 * r2; |
| b2 = (((hr1 * hr2 - b1) + hr1 * tr2) + tr1 * hr2) + tr1 * tr2; |
| |
| /* c = 0.5*r1*r1 */ |
| c1 = (0.5*r1) * r1; |
| c2 = (((0.5*hr1 * hr1 - c1) + 0.5*hr1 * tr1) + 0.5*tr1 * hr1) + 0.5*tr1 * tr1; |
| |
| /* v = a + d - b */ |
| r = r1 - b1; |
| s = (((r1 - r) - b1) - b2) + r2; |
| v1 = r + s; |
| v2 = (r - v1) + s; |
| |
| /* w = (a + d - b) - c */ |
| r = v1 - c1; |
| s = (((v1 - r) - c1) - c2) + v2; |
| w1 = r + s; |
| w2 = (r - w1) + s; |
| |
| /* u = ((a + d - b) - c) + e */ |
| r = w1 + e1; |
| s = (((w1 - r) + e1) + e2) + w2; |
| u1 = r + s; |
| u2 = (r - u1) + s; |
| |
| /* The result r = acosh(x) */ |
| r = u1 + u2; |
| |
| return r; |
| } |
| } |
| else |
| { |
| /* For arguments 1.13 <= x <= 1.5 the log1p function |
| is good enough */ |
| return FN_PROTOTYPE(log1p)(t); |
| } |
| } |
| } |
| |
| weak_alias (__acosh, acosh) |