blob: 3f6676f2ebc748325ef377de9526508330ec55e4 [file] [log] [blame]
/*
* Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
*
* This file is part of libacml_mv.
*
* libacml_mv is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* libacml_mv is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with libacml_mv. If not, see
* <http://www.gnu.org/licenses/>.
*
*/
#define EXPBITS_DP64 0x7ff0000000000000
#define EXPSHIFTBITS_DP64 52
#define EXPBIAS_DP64 1023
#define MANTBITS_DP64 0x000fffffffffffff
#define IMPBIT_DP64 0x0010000000000000
#define SIGNBIT_DP64 0x8000000000000000
#define GET_BITS_DP64(x, ux) \
{ \
volatile union {double d; unsigned long long i;} _bitsy; \
_bitsy.d = (x); \
ux = _bitsy.i; \
}
#define PUT_BITS_DP64(ux, x) \
{ \
volatile union {double d; unsigned long long i;} _bitsy; \
_bitsy.i = (ux); \
x = _bitsy.d; \
}
/* Define this to get debugging print statements activated */
#define DEBUGGING_PRINT
#undef DEBUGGING_PRINT
#ifdef DEBUGGING_PRINT
#include <stdio.h>
char *d2b(int d, int bitsper, int point)
{
static char buff[50];
int i, j;
j = bitsper;
if (point >= 0 && point <= bitsper)
j++;
buff[j] = '\0';
for (i = bitsper - 1; i >= 0; i--)
{
j--;
if (d % 2 == 1)
buff[j] = '1';
else
buff[j] = '0';
if (i == point)
{
j--;
buff[j] = '.';
}
d /= 2;
}
return buff;
}
#endif
/* Given positive argument x, reduce it to the range [-pi/4,pi/4] using
extra precision, and return the result in r, rr.
Return value "region" tells how many lots of pi/2 were subtracted
from x to put it in the range [-pi/4,pi/4], mod 4. */
void __amd_remainder_piby2(double x, double *r, double *rr, int *region)
{
/* This method simulates multi-precision floating-point
arithmetic and is accurate for all 1 <= x < infinity */
static const double
piby2_lead = 1.57079632679489655800e+00, /* 0x3ff921fb54442d18 */
piby2_part1 = 1.57079631090164184570e+00, /* 0x3ff921fb50000000 */
piby2_part2 = 1.58932547122958567343e-08, /* 0x3e5110b460000000 */
piby2_part3 = 6.12323399573676480327e-17; /* 0x3c91a62633145c06 */
const int bitsper = 10;
unsigned long long res[500];
unsigned long long ux, u, carry, mask, mant, highbitsrr;
int first, last, i, rexp, xexp, resexp, ltb, determ;
double xx, t;
static unsigned long long pibits[] =
{
0, 0, 0, 0, 0, 0,
162, 998, 54, 915, 580, 84, 671, 777, 855, 839,
851, 311, 448, 877, 553, 358, 316, 270, 260, 127,
593, 398, 701, 942, 965, 390, 882, 283, 570, 265,
221, 184, 6, 292, 750, 642, 465, 584, 463, 903,
491, 114, 786, 617, 830, 930, 35, 381, 302, 749,
72, 314, 412, 448, 619, 279, 894, 260, 921, 117,
569, 525, 307, 637, 156, 529, 504, 751, 505, 160,
945, 1022, 151, 1023, 480, 358, 15, 956, 753, 98,
858, 41, 721, 987, 310, 507, 242, 498, 777, 733,
244, 399, 870, 633, 510, 651, 373, 158, 940, 506,
997, 965, 947, 833, 825, 990, 165, 164, 746, 431,
949, 1004, 287, 565, 464, 533, 515, 193, 111, 798
};
GET_BITS_DP64(x, ux);
#ifdef DEBUGGING_PRINT
printf("On entry, x = %25.20e = %s\n", x, double2hex(&x));
#endif
xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
ux = (ux & MANTBITS_DP64) | IMPBIT_DP64;
/* Now ux is the mantissa bit pattern of x as a long integer */
carry = 0;
mask = 1;
mask = (mask << bitsper) - 1;
/* Set first and last to the positions of the first
and last chunks of 2/pi that we need */
first = xexp / bitsper;
resexp = xexp - first * bitsper;
/* 180 is the theoretical maximum number of bits (actually
175 for IEEE double precision) that we need to extract
from the middle of 2/pi to compute the reduced argument
accurately enough for our purposes */
last = first + 180 / bitsper;
/* Do a long multiplication of the bits of 2/pi by the
integer mantissa */
/* Unroll the loop. This is only correct because we know
that bitsper is fixed as 10. */
res[19] = 0;
u = pibits[last] * ux;
res[18] = u & mask;
carry = u >> bitsper;
u = pibits[last-1] * ux + carry;
res[17] = u & mask;
carry = u >> bitsper;
u = pibits[last-2] * ux + carry;
res[16] = u & mask;
carry = u >> bitsper;
u = pibits[last-3] * ux + carry;
res[15] = u & mask;
carry = u >> bitsper;
u = pibits[last-4] * ux + carry;
res[14] = u & mask;
carry = u >> bitsper;
u = pibits[last-5] * ux + carry;
res[13] = u & mask;
carry = u >> bitsper;
u = pibits[last-6] * ux + carry;
res[12] = u & mask;
carry = u >> bitsper;
u = pibits[last-7] * ux + carry;
res[11] = u & mask;
carry = u >> bitsper;
u = pibits[last-8] * ux + carry;
res[10] = u & mask;
carry = u >> bitsper;
u = pibits[last-9] * ux + carry;
res[9] = u & mask;
carry = u >> bitsper;
u = pibits[last-10] * ux + carry;
res[8] = u & mask;
carry = u >> bitsper;
u = pibits[last-11] * ux + carry;
res[7] = u & mask;
carry = u >> bitsper;
u = pibits[last-12] * ux + carry;
res[6] = u & mask;
carry = u >> bitsper;
u = pibits[last-13] * ux + carry;
res[5] = u & mask;
carry = u >> bitsper;
u = pibits[last-14] * ux + carry;
res[4] = u & mask;
carry = u >> bitsper;
u = pibits[last-15] * ux + carry;
res[3] = u & mask;
carry = u >> bitsper;
u = pibits[last-16] * ux + carry;
res[2] = u & mask;
carry = u >> bitsper;
u = pibits[last-17] * ux + carry;
res[1] = u & mask;
carry = u >> bitsper;
u = pibits[last-18] * ux + carry;
res[0] = u & mask;
#ifdef DEBUGGING_PRINT
printf("resexp = %d\n", resexp);
printf("Significant part of x * 2/pi with binary"
" point in correct place:\n");
for (i = 0; i <= last - first; i++)
{
if (i > 0 && i % 5 == 0)
printf("\n ");
if (i == 1)
printf("%s ", d2b((int)res[i], bitsper, resexp));
else
printf("%s ", d2b((int)res[i], bitsper, -1));
}
printf("\n");
#endif
/* Reconstruct the result */
ltb = (int)((((res[0] << bitsper) | res[1])
>> (bitsper - 1 - resexp)) & 7);
/* determ says whether the fractional part is >= 0.5 */
determ = ltb & 1;
#ifdef DEBUGGING_PRINT
printf("ltb = %d (last two bits before binary point"
" and first bit after)\n", ltb);
printf("determ = %d (1 means need to negate because the fractional\n"
" part of x * 2/pi is greater than 0.5)\n", determ);
#endif
i = 1;
if (determ)
{
/* The mantissa is >= 0.5. We want to subtract it
from 1.0 by negating all the bits */
*region = ((ltb >> 1) + 1) & 3;
mant = 1;
mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
while (mant < 0x0020000000000000)
{
i++;
mant = (mant << bitsper) | (~(res[i]) & mask);
}
highbitsrr = ~(res[i + 1]) << (64 - bitsper);
}
else
{
*region = (ltb >> 1);
mant = 1;
mant = res[1] & ((mant << (bitsper - resexp)) - 1);
while (mant < 0x0020000000000000)
{
i++;
mant = (mant << bitsper) | res[i];
}
highbitsrr = res[i + 1] << (64 - bitsper);
}
rexp = 52 + resexp - i * bitsper;
while (mant >= 0x0020000000000000)
{
rexp++;
highbitsrr = (highbitsrr >> 1) | ((mant & 1) << 63);
mant >>= 1;
}
#ifdef DEBUGGING_PRINT
printf("Normalised mantissa = 0x%016lx\n", mant);
printf("High bits of rest of mantissa = 0x%016lx\n", highbitsrr);
printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp);
#endif
/* Put the result exponent rexp onto the mantissa pattern */
u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
ux = (mant & MANTBITS_DP64) | u;
if (determ)
/* If we negated the mantissa we negate x too */
ux |= SIGNBIT_DP64;
PUT_BITS_DP64(ux, x);
/* Create the bit pattern for rr */
highbitsrr >>= 12; /* Note this is shifted one place too far */
u = ((unsigned long long)rexp + EXPBIAS_DP64 - 53) << EXPSHIFTBITS_DP64;
PUT_BITS_DP64(u, t);
u |= highbitsrr;
PUT_BITS_DP64(u, xx);
/* Subtract the implicit bit we accidentally added */
xx -= t;
/* Set the correct sign, and double to account for the
"one place too far" shift */
if (determ)
xx *= -2.0;
else
xx *= 2.0;
#ifdef DEBUGGING_PRINT
printf("(lead part of x*2/pi) = %25.20e = %s\n", x, double2hex(&x));
printf("(tail part of x*2/pi) = %25.20e = %s\n", xx, double2hex(&xx));
#endif
/* (x,xx) is an extra-precise version of the fractional part of
x * 2 / pi. Multiply (x,xx) by pi/2 in extra precision
to get the reduced argument (r,rr). */
{
double hx, tx, c, cc;
/* Split x into hx (head) and tx (tail) */
GET_BITS_DP64(x, ux);
ux &= 0xfffffffff8000000;
PUT_BITS_DP64(ux, hx);
tx = x - hx;
c = piby2_lead * x;
cc = ((((piby2_part1 * hx - c) + piby2_part1 * tx) +
piby2_part2 * hx) + piby2_part2 * tx) +
(piby2_lead * xx + piby2_part3 * x);
*r = c + cc;
*rr = (c - *r) + cc;
}
#ifdef DEBUGGING_PRINT
printf(" (r,rr) = lead and tail parts of frac(x*2/pi) * pi/2:\n");
printf(" r = %25.20e = %s\n", *r, double2hex(r));
printf("rr = %25.20e = %s\n", *rr, double2hex(rr));
printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n",
*region);
#endif
return;
}