src/gas/sincos.S - open64_libacml_mv - Git at Google


 #
 #  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
 #
 #  This file is part of libacml_mv.
 #
 #  libacml_mv is free software; you can redistribute it and/or
 #  modify it under the terms of the GNU Lesser General Public
 #  License as published by the Free Software Foundation; either
 #  version 2.1 of the License, or (at your option) any later version.
 #
 #  libacml_mv is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 #  Lesser General Public License for more details.
 #
 #  You should have received a copy of the GNU Lesser General Public
 #  License along with libacml_mv.  If not, see
 #  <http://www.gnu.org/licenses/>.
 #
 #


 #
 # An implementation of the sincos function.
 #
 # Prototype:
 #
 #   void sincos(double x, double* sinr, double* cosr);
 #
 #   Computes sincos
 #   It will provide proper C99 return values,
 #   but may not raise floating point status bits properly.
 #   Based on the NAG C implementation.
 #
 #
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif

 .data
 .align 16
 .L__real_7fffffffffffffff: .quad 0x07fffffffffffffff  #Sign bit zero
                         .quad 0                       # for alignment
 .L__real_3ff0000000000000: .quad 0x03ff0000000000000  # 1.0
                         .quad 0
 .L__real_3fe0000000000000: .quad 0x03fe0000000000000  # 0.5
                         .quad 0
 .L__real_3fc5555555555555: .quad 0x03fc5555555555555  # 0.166666666666
                         .quad 0
 .L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883  # twobypi
                         .quad 0
 .L__real_3ff921fb54400000: .quad 0x03ff921fb54400000  # piby2_1
                         .quad 0
 .L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331  # piby2_1tail
                         .quad 0
 .L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000  # piby2_2
                         .quad 0
 .L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073  # piby2_2tail
                         .quad 0
 .L__real_fffffffff8000000: .quad 0x0fffffffff8000000  # mask for stripping head and tail
                         .quad 0
 .L__real_411E848000000000: .quad 0x415312d000000000   # 5e6 0x0411E848000000000  # 5e5
                         .quad 0

 .align 16
 .Lsincosarray:
     .quad    0x0bfc5555555555555                      # -0.16666666666666666    s1
     .quad    0x03fa5555555555555                      # 0.041666666666666664    c1
     .quad    0x03f81111111110bb3                      # 0.00833333333333095     s2
     .quad    0x0bf56c16c16c16967                      # -0.0013888888888887398  c2
     .quad    0x0bf2a01a019e83e5c                      # -0.00019841269836761127 s3
     .quad    0x03efa01a019f4ec90                      # 2.4801587298767041E-05  c3
     .quad    0x03ec71de3796cde01                      # 2.7557316103728802E-06  s4
     .quad    0x0be927e4fa17f65f6                      # -2.7557317272344188E-07 c4
     .quad    0x0be5ae600b42fdfa7                      # -2.5051132068021698E-08 s5
     .quad    0x03e21eeb69037ab78                      # 2.0876146382232963E-09  c6
     .quad    0x03de5e0b2f9a43bb8                      # 1.5918144304485914E-10  s6
     .quad    0x0bda907db46cc5e42                      # -1.1382639806794487E-11 c7

 .align 16
 .Lcossinarray:
     .quad    0x03fa5555555555555                      # 0.0416667        c1
     .quad    0x0bfc5555555555555                      # -0.166667        s1
     .quad    0x0bf56c16c16c16967
     .quad    0x03f81111111110bb3                      # 0.00833333       s2
     .quad    0x03efa01a019f4ec90
     .quad    0x0bf2a01a019e83e5c                      # -0.000198413     s3
     .quad    0x0be927e4fa17f65f6
     .quad    0x03ec71de3796cde01                      # 2.75573e-006     s4
     .quad    0x03e21eeb69037ab78
     .quad    0x0be5ae600b42fdfa7                      # -2.50511e-008    s5
     .quad    0x0bda907db46cc5e42
     .quad    0x03de5e0b2f9a43bb8                      # 1.59181e-010     s6

 .text
 .align 16
 .p2align 4,,15

 #include "fn_macros.h"
 #define fname FN_PROTOTYPE(sincos)
 #define fname_special _sincos_special@PLT

 # define local variable storage offsets
 .equ    p_temp,    0x30                               # temporary for get/put bits operation
 .equ    p_temp1,   0x40                               # temporary for get/put bits operation
 .equ    r,         0x50                               # pointer to r for amd_remainder_piby2
 .equ    rr,        0x60                               # pointer to rr for amd_remainder_piby2
 .equ    region,    0x70                               # pointer to region for amd_remainder_piby2
 .equ   stack_size, 0x98

 .globl fname
 .type  fname,@function

 fname:
     sub     $stack_size, %rsp
     xorpd   %xmm2,%xmm2                               # zeroed out for later use

 # GET_BITS_DP64(x, ux);
 # get the input value to an integer register.
     movsd   %xmm0,p_temp(%rsp)
     mov     p_temp(%rsp),%rcx                         # rcx is ux

 ##  if NaN or inf
     mov     $0x07ff0000000000000,%rax
     mov     %rax,%r10
     and     %rcx,%r10
     cmp     %rax,%r10
     jz      .Lsincos_naninf

 #  ax = (ux & ~SIGNBIT_DP64);
     mov     $0x07fffffffffffffff,%r10
     and     %rcx,%r10                # r10 is ax

 ##  if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
     mov     $0x03fe921fb54442d18,%rax
     cmp     %rax,%r10
     jg      .Lsincos_reduce

 ##      if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
     mov     $0x03f20000000000000,%rax
     cmp     %rax,%r10
     jge     .Lsincos_small

 ##          if (ax < 0x3e40000000000000) /* abs(x) < 2.0^(-27) */
     mov     $0x03e40000000000000,%rax
     cmp     %rax,%r10
     jge     .Lsincos_smaller

                             # sin = x;
     movsd   .L__real_3ff0000000000000(%rip),%xmm1     # cos = 1.0;
     jmp     .Lsincos_cleanup

 ##          else
 .align 32
 .Lsincos_smaller:
 #              sin = x - x^3 * 0.1666666666666666666;
 #              cos = 1.0 - x*x*0.5;

     movsd   %xmm0,%xmm2
     movsd   .L__real_3fc5555555555555(%rip),%xmm4     # 0.1666666666666666666
     mulsd   %xmm2,%xmm2                               # x^2
     movsd   .L__real_3ff0000000000000(%rip),%xmm1     # 1.0
     movsd   %xmm2,%xmm3                               # copy of x^2

     mulsd   %xmm0,%xmm2                               # x^3
     mulsd   .L__real_3fe0000000000000(%rip),%xmm3     # 0.5 * x^2
     mulsd   %xmm4,%xmm2                               # x^3 * 0.1666666666666666666
     subsd   %xmm2,%xmm0                               # x - x^3 * 0.1666666666666666666, sin
     subsd   %xmm3,%xmm1                               # 1 - 0.5 * x^2, cos

     jmp     .Lsincos_cleanup


 ##      else

 .align 16
 .Lsincos_small:
 #          sin = sin_piby4(x, 0.0);
     movsd   .L__real_3fe0000000000000(%rip),%xmm5     # .5

 #  x2 = r * r;
     movsd   %xmm0,%xmm2
     mulsd   %xmm0,%xmm2                               # x2
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # region 0 or 2     - do a sin calculation
 #  zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6))));

     movlhps %xmm2,%xmm2
     movapd  .Lsincosarray+0x50(%rip),%xmm3            # s6
     movapd  %xmm2,%xmm1                               # move for x4
     movdqa  .Lsincosarray+0x20(%rip),%xmm5            # s3
     mulpd   %xmm2,%xmm3                               # x2s6
     addpd   .Lsincosarray+0x40(%rip),%xmm3            # s5+x2s6
     mulpd   %xmm2,%xmm5                               # x2s3
     movapd  %xmm4,p_temp(%rsp)                        # rr move to to memory
     mulpd   %xmm2,%xmm1                               # x4
     mulpd   %xmm2,%xmm3                               # x2(s5+x2s6)
     addpd   .Lsincosarray+0x10(%rip),%xmm5            # s2+x2s3
     movapd  %xmm1,%xmm4                               # move for x6
     addpd   .Lsincosarray+0x30(%rip),%xmm3            # s4 + x2(s5+x2s6)
     mulpd   %xmm2,%xmm5                               # x2(s2+x2s3)
     mulpd   %xmm2,%xmm4                               # x6
     addpd   .Lsincosarray(%rip),%xmm5                 # s1+x2(s2+x2s3)
     mulpd   %xmm4,%xmm3                               # x6(s4 + x2(s5+x2s6))

     movsd   %xmm2,%xmm4                               # xmm4 = x2 for 0.5x2 for cos
                                                       # xmm2 contains x2 for x3 for sin
     addpd   %xmm5,%xmm3                               # zs in lower and zc upper

     mulsd   %xmm0,%xmm2                               # xmm2=x3 for sin

     movhlps %xmm3,%xmm5                               # Copy z, xmm5 = cos , xmm3 = sin

     mulsd   .L__real_3fe0000000000000(%rip),%xmm4     # xmm4=r=0.5*x2 for cos term
     mulsd   %xmm2,%xmm3                               # sin *x3
     mulsd   %xmm1,%xmm5                               # cos *x4
     movsd   .L__real_3ff0000000000000(%rip),%xmm2     # 1.0
     subsd   %xmm4,%xmm2                               # t=1.0-r
     movsd   .L__real_3ff0000000000000(%rip),%xmm1     # 1.0
     subsd   %xmm2,%xmm1                               # 1 - t
     subsd   %xmm4,%xmm1                               # (1-t) -r
     addsd   %xmm5,%xmm1                               # ((1-t) -r) + cos
     addsd   %xmm3,%xmm0                               # xmm0= sin+x, final sin term
     addsd   %xmm2,%xmm1                               # xmm1 = t +{ ((1-t) -r) + cos}, final cos term

     jmp     .Lsincos_cleanup
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .align 16
 .Lsincos_reduce:
 # change rdx to rcx and r8 to r9
 # rcx= ux, r10 = ax
 # %r9,%rax are free

 #  xneg = (ax != ux);
     cmp     %r10,%rcx
     mov     $0,%r11d

 ##  if (xneg) x = -x;
     jz      .LPositive
     mov     $1,%r11d
     subsd   %xmm0,%xmm2
     movsd   %xmm2,%xmm0

 # rcx= ux, r10 = ax, r11= Sign
 # %r9,%rax are free
 # change rdx to rcx and r8 to r9

 .align 16
 .LPositive:
 ##  if (x < 5.0e5)
     cmp     .L__real_411E848000000000(%rip),%r10
     jae     .Lsincos_reduce_precise

 # reduce  the argument to be in a range from -pi/4 to +pi/4
 # by subtracting multiples of pi/2
     movsd   %xmm0,%xmm2
     movsd   .L__real_3fe45f306dc9c883(%rip),%xmm3     # twobypi
     movsd   %xmm0,%xmm4
     movsd   .L__real_3fe0000000000000(%rip),%xmm5     # .5
     mulsd   %xmm3,%xmm2

 #/* How many pi/2 is x a multiple of? */
 #       xexp  = ax >> EXPSHIFTBITS_DP64;
     shr     $52,%r10                                  # >>EXPSHIFTBITS_DP64

 #       npi2  = (int)(x * twobypi + 0.5);
     addsd   %xmm5,%xmm2                               # npi2

     movsd   .L__real_3ff921fb54400000(%rip),%xmm3     # piby2_1
     cvttpd2dq    %xmm2,%xmm0                          # convert to integer
     movsd   .L__real_3dd0b4611a626331(%rip),%xmm1     # piby2_1tail
     cvtdq2pd    %xmm0,%xmm2                           # and back to float.

 #       /* Subtract the multiple from x to get an extra-precision remainder */
 #       rhead  = x - npi2 * piby2_1;
     mulsd   %xmm2,%xmm3
     subsd   %xmm3,%xmm4                               # rhead

 #       rtail  = npi2 * piby2_1tail;
     mulsd   %xmm2,%xmm1
     movd    %xmm0,%eax


 #       GET_BITS_DP64(rhead-rtail, uy);                   ; originally only rhead
     movsd   %xmm4,%xmm0
     subsd   %xmm1,%xmm0

     movsd   .L__real_3dd0b4611a600000(%rip),%xmm3     # piby2_2
     movsd   %xmm0,p_temp(%rsp)
     movsd   .L__real_3ba3198a2e037073(%rip),%xmm5     # piby2_2tail
     mov     %eax,%ecx
     mov     p_temp(%rsp),%r9                          # rcx is rhead-rtail

 #    xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
 #       expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
     shl     $1,%r9                                    # strip any sign bit
     shr     $53,%r9                                   # >> EXPSHIFTBITS_DP64 +1
     sub     %r9,%r10                                  # expdiff

 ##       if (expdiff > 15)
     cmp     $15,%r10
     jle     .Lexpdiff15

 #          /* The remainder is pretty small compared with x, which
 #             implies that x is a near multiple of pi/2
 #             (x matches the multiple to at least 15 bits) */

 #          t  = rhead;
     movsd   %xmm4,%xmm1

 #          rtail  = npi2 * piby2_2;
     mulsd   %xmm2,%xmm3

 #          rhead  = t - rtail;
     mulsd   %xmm2,%xmm5                               # npi2 * piby2_2tail
     subsd   %xmm3,%xmm4                               # rhead

 #          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
     subsd   %xmm4,%xmm1                               # t - rhead
     subsd   %xmm3,%xmm1                               # -rtail
     subsd   %xmm1,%xmm5                               # rtail

 #      r = rhead - rtail;
     movsd   %xmm4,%xmm0

 #HARSHA
 #xmm1=rtail
     movsd   %xmm5,%xmm1
     subsd   %xmm5,%xmm0

 #    xmm0=r, xmm4=rhead, xmm1=rtail
 .Lexpdiff15:
 #      region = npi2 & 3;

     subsd   %xmm0,%xmm4                               # rhead-r
     subsd   %xmm1,%xmm4                               # rr = (rhead-r) - rtail

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ## if the input was close to a pi/2 multiple
 # The original NAG code missed this trick.  If the input is very close to n*pi/2 after
 # reduction,
 # then the sin is ~ 1.0 , to within 53 bits, when r is < 2^-27.  We already
 # have x at this point, so we can skip the sin polynomials.

     cmp     $0x03f2,%r9                               # if r  small.
     jge     .Lcossin_piby4                            # use taylor series if not
     cmp     $0x03de,%r9                               # if r really small.
     jle     .Lr_small                                 # then sin(r) = 0

     movsd   %xmm0,%xmm2
     mulsd   %xmm2,%xmm2                               # x^2

 ##      if region is 0 or 2 do a sin calc.
     and     $1,%ecx
     jnz     .Lregion13

 # region 0 or 2 do a sincos calculation
 # use simply polynomial
 #              sin=x - x*x*x*0.166666666666666666;
     movsd   .L__real_3fc5555555555555(%rip),%xmm3     # 0.166666666
     mulsd   %xmm0,%xmm3                               # * x
     mulsd   %xmm2,%xmm3                               # * x^2
     subsd   %xmm3,%xmm0                               # xs
 #              cos=1.0 - x*x*0.5;
     movsd   .L__real_3ff0000000000000(%rip),%xmm1     # 1.0
     mulsd   .L__real_3fe0000000000000(%rip),%xmm2     # 0.5 *x^2
     subsd   %xmm2,%xmm1                               # xc

     jmp     .Ladjust_region

 .align 16
 .Lregion13:
 # region 1 or 3 do a cossin calculation
 # use simply polynomial
 #              sin=x - x*x*x*0.166666666666666666;
     movsd   %xmm0,%xmm1

     movsd   .L__real_3fc5555555555555(%rip),%xmm3     # 0.166666666
     mulsd   %xmm0,%xmm3                               # 0.166666666* x
     mulsd   %xmm2,%xmm3                               # 0.166666666* x * x^2
     subsd   %xmm3,%xmm1                               # xs
 #              cos=1.0 - x*x*0.5;
     movsd   .L__real_3ff0000000000000(%rip),%xmm0     # 1.0
     mulsd   .L__real_3fe0000000000000(%rip),%xmm2     # 0.5 *x^2
     subsd   %xmm2,%xmm0                               # xc

     jmp     .Ladjust_region

 .align 16
 .Lr_small:
 ##      if region is 0 or 2    do a sincos calc.
     movsd   .L__real_3ff0000000000000(%rip),%xmm1     # cos(r) is a 1
     and     $1,%ecx
     jz      .Ladjust_region

 ##      if region is 1 or 3    do a cossin calc.
     movsd   %xmm0,%xmm1                               # sin(r) is r
     movsd   .L__real_3ff0000000000000(%rip),%xmm0     # cos(r) is a 1
     jmp     .Ladjust_region

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .align 16
 .Lsincos_reduce_precise:
 #      // Reduce x into range [-pi/4,pi/4]
 #      __amd_remainder_piby2(x, &r, &rr, &region);

     mov     %rdi, p_temp1(%rsp)
     mov     %rsi, p_temp1+8(%rsp)
     mov     %r11,p_temp(%rsp)

     lea     region(%rsp),%rdx
     lea     rr(%rsp),%rsi
     lea     r(%rsp),%rdi

     call    __amd_remainder_piby2@PLT

     mov     p_temp1(%rsp), %rdi
     mov     p_temp1+8(%rsp), %rsi
     mov     p_temp(%rsp),%r11

     movsd   r(%rsp),%xmm0                             # x
     movsd   rr(%rsp),%xmm4                            # xx
     mov     region(%rsp),%eax                         # region to classify for sin/cos calc
     mov     %eax,%ecx                                 # region to get sign

 # xmm0 = x, xmm4 = xx, r8d = 1, eax= region
 .align 16
 .Lcossin_piby4:
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # perform taylor series to calc sinx, sinx
 #  x2 = r * r;
 #xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path
 #instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path

     movsd   %xmm0,%xmm2
     mulsd   %xmm0,%xmm2                               #x2

 ##      if region is 0 or 2    do a sincos calc.
     and     $1,%ecx
     jz      .Lsincos02

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # region 1 or 3     - do a cossin calculation
 #  zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));


     movlhps %xmm2,%xmm2

     movapd  .Lcossinarray+0x50(%rip),%xmm3            # s6
     movapd  %xmm2,%xmm1                               # move for x4
     movdqa  .Lcossinarray+0x20(%rip),%xmm5            # s3
     mulpd   %xmm2,%xmm3                               # x2s6
     addpd   .Lcossinarray+0x40(%rip),%xmm3            # s5+x2s6
     mulpd   %xmm2,%xmm5                               # x2s3
     movsd   %xmm4,p_temp(%rsp)                        # rr move to to memory
     mulpd   %xmm2,%xmm1                               # x4
     mulpd   %xmm2,%xmm3                               # x2(s5+x2s6)
     addpd   .Lcossinarray+0x10(%rip),%xmm5            # s2+x2s3
     movapd  %xmm1,%xmm4                               # move for x6
     addpd   .Lcossinarray+0x30(%rip),%xmm3            # s4 + x2(s5+x2s6)
     mulpd   %xmm2,%xmm5                               # x2(s2+x2s3)
     mulpd   %xmm2,%xmm4                               # x6
     addpd   .Lcossinarray(%rip),%xmm5                 # s1+x2(s2+x2s3)
     mulpd   %xmm4,%xmm3                               # x6(s4 + x2(s5+x2s6))

     movsd   %xmm2,%xmm4                               # xmm4 = x2 for 0.5x2 cos
                                                       # xmm2 contains x2 for x3 sin

     addpd   %xmm5,%xmm3                               # zc in lower and zs in upper

     mulsd   %xmm0,%xmm2                               # xmm2=x3 for the sin term

     movhlps %xmm3,%xmm5                               # Copy z, xmm5 = sin, xmm3 = cos
     mulsd   .L__real_3fe0000000000000(%rip),%xmm4     # xmm4=r=0.5*x2 for cos term

     mulsd   %xmm2,%xmm5                               # sin *x3
     mulsd   %xmm1,%xmm3                               # cos *x4
     movsd   %xmm0,p_temp1(%rsp)                       # store x
     movsd   %xmm0,%xmm1

     movsd   .L__real_3ff0000000000000(%rip),%xmm2     # 1.0
     subsd   %xmm4,%xmm2                               # t=1.0-r

     movsd   .L__real_3ff0000000000000(%rip),%xmm0     # 1.0
     subsd   %xmm2,%xmm0                               # 1 - t

     mulsd   p_temp(%rsp),%xmm1                        # x*xx
     subsd   %xmm4,%xmm0                               # (1-t) -r
     subsd   %xmm1,%xmm0                               # ((1-t) -r) - x *xx

     mulsd   p_temp(%rsp),%xmm4                        # 0.5*x2*xx

     addsd   %xmm3,%xmm0                               # (((1-t) -r) - x *xx) + cos

     subsd   %xmm4,%xmm5                               # sin - 0.5*x2*xx

     addsd   %xmm2,%xmm0                               # xmm0 = t +{ (((1-t) -r) - x *xx) + cos}, final cos term

     addsd   p_temp(%rsp),%xmm5                        # sin + xx
     movsd   p_temp1(%rsp),%xmm1                       # load x
     addsd   %xmm5,%xmm1                               # xmm1= sin+x, final sin term

     jmp     .Ladjust_region

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .align 16
 .Lsincos02:
 # region 0 or 2 do a sincos calculation
     movlhps %xmm2,%xmm2

     movapd  .Lsincosarray+0x50(%rip),%xmm3            # s6
     movapd  %xmm2,%xmm1                               # move for x4
     movdqa  .Lsincosarray+0x20(%rip),%xmm5            # s3
     mulpd   %xmm2,%xmm3                               # x2s6
     addpd   .Lsincosarray+0x40(%rip),%xmm3            # s5+x2s6
     mulpd   %xmm2,%xmm5                               # x2s3
     movsd   %xmm4,p_temp(%rsp)                        # rr move to to memory
     mulpd   %xmm2,%xmm1                               # x4
     mulpd   %xmm2,%xmm3                               # x2(s5+x2s6)
     addpd   .Lsincosarray+0x10(%rip),%xmm5            # s2+x2s3
     movapd  %xmm1,%xmm4                               # move for x6
     addpd   .Lsincosarray+0x30(%rip),%xmm3            # s4 + x2(s5+x2s6)
     mulpd   %xmm2,%xmm5                               # x2(s2+x2s3)
     mulpd   %xmm2,%xmm4                               # x6
     addpd   .Lsincosarray(%rip),%xmm5                 # s1+x2(s2+x2s3)
     mulpd   %xmm4,%xmm3                               # x6(s4 + x2(s5+x2s6))

     movsd   %xmm2,%xmm4                               # xmm4 = x2 for 0.5x2 for cos
                                                       # xmm2 contains x2 for x3 for sin

     addpd   %xmm5,%xmm3                               # zs in lower and zc in upper

     mulsd   %xmm0,%xmm2                               # xmm2=x3 for sin

     movhlps %xmm3,%xmm5                               # Copy z, xmm5 = cos , xmm3 = sin

     mulsd   .L__real_3fe0000000000000(%rip),%xmm4     # xmm4=r=0.5*x2 for cos term

     mulsd   %xmm2,%xmm3                               # sin *x3
     mulsd   %xmm1,%xmm5                               # cos *x4

     movsd   .L__real_3ff0000000000000(%rip),%xmm2     # 1.0
     subsd   %xmm4,%xmm2                               # t=1.0-r

     movsd   .L__real_3ff0000000000000(%rip),%xmm1     # 1.0
     subsd   %xmm2,%xmm1                               # 1 - t

     movsd   %xmm0,p_temp1(%rsp)                       # store x
     mulsd   p_temp(%rsp),%xmm0                        # x*xx

     subsd   %xmm4,%xmm1                               # (1-t) -r
     subsd   %xmm0,%xmm1                               # ((1-t) -r) - x *xx

     mulsd   p_temp(%rsp),%xmm4                        # 0.5*x2*xx

     addsd   %xmm5,%xmm1                               # (((1-t) -r) - x *xx) + cos

     subsd   %xmm4,%xmm3                               # sin - 0.5*x2*xx

     addsd   %xmm2,%xmm1                               # xmm1 = t +{ (((1-t) -r) - x *xx) + cos}, final cos term

     addsd   p_temp(%rsp),%xmm3                        # sin + xx
     movsd   p_temp1(%rsp),%xmm0                       # load x
     addsd   %xmm3,%xmm0                               # xmm0= sin+x, final sin term

     jmp     .Ladjust_region

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #      switch (region)
 .align 16
 .Ladjust_region:        # positive or negative for sin return val in xmm0

     mov     %eax,%r9d

     shr     $1,%eax
     mov     %eax,%ecx
     and     %r11d,%eax

     not     %ecx
     not     %r11d
     and     %r11d,%ecx

     or      %ecx,%eax
     and     $1,%eax
     jnz     .Lcos_sign

 ## if the original region 0, 1 and arg is negative, then we negate the result.
 ## if the original region 2, 3 and arg is positive, then we negate the result.
     movsd   %xmm0,%xmm2
     xorpd   %xmm0,%xmm0
     subsd   %xmm2,%xmm0

 .Lcos_sign:            # positive or negative for cos return val in xmm1
     add     $1,%r9
     and     $2,%r9d
     jz      .Lsincos_cleanup
 ## if the original region 1 or 2 then we negate the result.
     movsd   %xmm1,%xmm2
     xorpd   %xmm1,%xmm1
     subsd   %xmm2,%xmm1

 #.align 16
 .Lsincos_cleanup:
     movsd   %xmm0, (%rdi)                             # save the sin
     movsd   %xmm1, (%rsi)                             # save the cos

     add     $stack_size,%rsp
     ret

 .align 16
 .Lsincos_naninf:
    call     fname_special
    add      $stack_size, %rsp
    ret

	#
	# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
	#
	# This file is part of libacml_mv.
	#
	# libacml_mv is free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.
	#
	# libacml_mv is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public
	# License along with libacml_mv. If not, see
	# <http://www.gnu.org/licenses/>.
	#
	#


	#
	# An implementation of the sincos function.
	#
	# Prototype:
	#
	# void sincos(double x, double* sinr, double* cosr);
	#
	# Computes sincos
	# It will provide proper C99 return values,
	# but may not raise floating point status bits properly.
	# Based on the NAG C implementation.
	#
	#
	#ifdef __ELF__
	.section .note.GNU-stack,"",@progbits
	#endif

	.data
	.align 16
	.L__real_7fffffffffffffff: .quad 0x07fffffffffffffff #Sign bit zero
	.quad 0 # for alignment
	.L__real_3ff0000000000000: .quad 0x03ff0000000000000 # 1.0
	.quad 0
	.L__real_3fe0000000000000: .quad 0x03fe0000000000000 # 0.5
	.quad 0
	.L__real_3fc5555555555555: .quad 0x03fc5555555555555 # 0.166666666666
	.quad 0
	.L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883 # twobypi
	.quad 0
	.L__real_3ff921fb54400000: .quad 0x03ff921fb54400000 # piby2_1
	.quad 0
	.L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331 # piby2_1tail
	.quad 0
	.L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000 # piby2_2
	.quad 0
	.L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073 # piby2_2tail
	.quad 0
	.L__real_fffffffff8000000: .quad 0x0fffffffff8000000 # mask for stripping head and tail
	.quad 0
	.L__real_411E848000000000: .quad 0x415312d000000000 # 5e6 0x0411E848000000000 # 5e5
	.quad 0

	.align 16
	.Lsincosarray:
	.quad 0x0bfc5555555555555 # -0.16666666666666666 s1
	.quad 0x03fa5555555555555 # 0.041666666666666664 c1
	.quad 0x03f81111111110bb3 # 0.00833333333333095 s2
	.quad 0x0bf56c16c16c16967 # -0.0013888888888887398 c2
	.quad 0x0bf2a01a019e83e5c # -0.00019841269836761127 s3
	.quad 0x03efa01a019f4ec90 # 2.4801587298767041E-05 c3
	.quad 0x03ec71de3796cde01 # 2.7557316103728802E-06 s4
	.quad 0x0be927e4fa17f65f6 # -2.7557317272344188E-07 c4
	.quad 0x0be5ae600b42fdfa7 # -2.5051132068021698E-08 s5
	.quad 0x03e21eeb69037ab78 # 2.0876146382232963E-09 c6
	.quad 0x03de5e0b2f9a43bb8 # 1.5918144304485914E-10 s6
	.quad 0x0bda907db46cc5e42 # -1.1382639806794487E-11 c7

	.align 16
	.Lcossinarray:
	.quad 0x03fa5555555555555 # 0.0416667 c1
	.quad 0x0bfc5555555555555 # -0.166667 s1
	.quad 0x0bf56c16c16c16967
	.quad 0x03f81111111110bb3 # 0.00833333 s2
	.quad 0x03efa01a019f4ec90
	.quad 0x0bf2a01a019e83e5c # -0.000198413 s3
	.quad 0x0be927e4fa17f65f6
	.quad 0x03ec71de3796cde01 # 2.75573e-006 s4
	.quad 0x03e21eeb69037ab78
	.quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
	.quad 0x0bda907db46cc5e42
	.quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6

	.text
	.align 16
	.p2align 4,,15

	#include "fn_macros.h"
	#define fname FN_PROTOTYPE(sincos)
	#define fname_special _sincos_special@PLT

	# define local variable storage offsets
	.equ p_temp, 0x30 # temporary for get/put bits operation
	.equ p_temp1, 0x40 # temporary for get/put bits operation
	.equ r, 0x50 # pointer to r for amd_remainder_piby2
	.equ rr, 0x60 # pointer to rr for amd_remainder_piby2
	.equ region, 0x70 # pointer to region for amd_remainder_piby2
	.equ stack_size, 0x98

	.globl fname
	.type fname,@function

	fname:
	sub $stack_size, %rsp
	xorpd %xmm2,%xmm2 # zeroed out for later use

	# GET_BITS_DP64(x, ux);
	# get the input value to an integer register.
	movsd %xmm0,p_temp(%rsp)
	mov p_temp(%rsp),%rcx # rcx is ux

	## if NaN or inf
	mov $0x07ff0000000000000,%rax
	mov %rax,%r10
	and %rcx,%r10
	cmp %rax,%r10
	jz .Lsincos_naninf

	# ax = (ux & ~SIGNBIT_DP64);
	mov $0x07fffffffffffffff,%r10
	and %rcx,%r10 # r10 is ax

	## if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
	mov $0x03fe921fb54442d18,%rax
	cmp %rax,%r10
	jg .Lsincos_reduce

	## if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
	mov $0x03f20000000000000,%rax
	cmp %rax,%r10
	jge .Lsincos_small

	## if (ax < 0x3e40000000000000) /* abs(x) < 2.0^(-27) */
	mov $0x03e40000000000000,%rax
	cmp %rax,%r10
	jge .Lsincos_smaller

	# sin = x;
	movsd .L__real_3ff0000000000000(%rip),%xmm1 # cos = 1.0;
	jmp .Lsincos_cleanup

	## else
	.align 32
	.Lsincos_smaller:
	# sin = x - x^3 * 0.1666666666666666666;
	# cos = 1.0 - xx0.5;

	movsd %xmm0,%xmm2
	movsd .L__real_3fc5555555555555(%rip),%xmm4 # 0.1666666666666666666
	mulsd %xmm2,%xmm2 # x^2
	movsd .L__real_3ff0000000000000(%rip),%xmm1 # 1.0
	movsd %xmm2,%xmm3 # copy of x^2

	mulsd %xmm0,%xmm2 # x^3
	mulsd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5 * x^2
	mulsd %xmm4,%xmm2 # x^3 * 0.1666666666666666666
	subsd %xmm2,%xmm0 # x - x^3 * 0.1666666666666666666, sin
	subsd %xmm3,%xmm1 # 1 - 0.5 * x^2, cos

	jmp .Lsincos_cleanup


	## else

	.align 16
	.Lsincos_small:
	# sin = sin_piby4(x, 0.0);
	movsd .L__real_3fe0000000000000(%rip),%xmm5 # .5

	# x2 = r * r;
	movsd %xmm0,%xmm2
	mulsd %xmm0,%xmm2 # x2
	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	# region 0 or 2 - do a sin calculation
	# zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6))));

	movlhps %xmm2,%xmm2
	movapd .Lsincosarray+0x50(%rip),%xmm3 # s6
	movapd %xmm2,%xmm1 # move for x4
	movdqa .Lsincosarray+0x20(%rip),%xmm5 # s3
	mulpd %xmm2,%xmm3 # x2s6
	addpd .Lsincosarray+0x40(%rip),%xmm3 # s5+x2s6
	mulpd %xmm2,%xmm5 # x2s3
	movapd %xmm4,p_temp(%rsp) # rr move to to memory
	mulpd %xmm2,%xmm1 # x4
	mulpd %xmm2,%xmm3 # x2(s5+x2s6)
	addpd .Lsincosarray+0x10(%rip),%xmm5 # s2+x2s3
	movapd %xmm1,%xmm4 # move for x6
	addpd .Lsincosarray+0x30(%rip),%xmm3 # s4 + x2(s5+x2s6)
	mulpd %xmm2,%xmm5 # x2(s2+x2s3)
	mulpd %xmm2,%xmm4 # x6
	addpd .Lsincosarray(%rip),%xmm5 # s1+x2(s2+x2s3)
	mulpd %xmm4,%xmm3 # x6(s4 + x2(s5+x2s6))

	movsd %xmm2,%xmm4 # xmm4 = x2 for 0.5x2 for cos
	# xmm2 contains x2 for x3 for sin
	addpd %xmm5,%xmm3 # zs in lower and zc upper

	mulsd %xmm0,%xmm2 # xmm2=x3 for sin

	movhlps %xmm3,%xmm5 # Copy z, xmm5 = cos , xmm3 = sin

	mulsd .L__real_3fe0000000000000(%rip),%xmm4 # xmm4=r=0.5*x2 for cos term
	mulsd %xmm2,%xmm3 # sin *x3
	mulsd %xmm1,%xmm5 # cos *x4
	movsd .L__real_3ff0000000000000(%rip),%xmm2 # 1.0
	subsd %xmm4,%xmm2 # t=1.0-r
	movsd .L__real_3ff0000000000000(%rip),%xmm1 # 1.0
	subsd %xmm2,%xmm1 # 1 - t
	subsd %xmm4,%xmm1 # (1-t) -r
	addsd %xmm5,%xmm1 # ((1-t) -r) + cos
	addsd %xmm3,%xmm0 # xmm0= sin+x, final sin term
	addsd %xmm2,%xmm1 # xmm1 = t +{ ((1-t) -r) + cos}, final cos term

	jmp .Lsincos_cleanup
	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	.align 16
	.Lsincos_reduce:
	# change rdx to rcx and r8 to r9
	# rcx= ux, r10 = ax
	# %r9,%rax are free

	# xneg = (ax != ux);
	cmp %r10,%rcx
	mov $0,%r11d

	## if (xneg) x = -x;
	jz .LPositive
	mov $1,%r11d
	subsd %xmm0,%xmm2
	movsd %xmm2,%xmm0

	# rcx= ux, r10 = ax, r11= Sign
	# %r9,%rax are free
	# change rdx to rcx and r8 to r9

	.align 16
	.LPositive:
	## if (x < 5.0e5)
	cmp .L__real_411E848000000000(%rip),%r10
	jae .Lsincos_reduce_precise

	# reduce the argument to be in a range from -pi/4 to +pi/4
	# by subtracting multiples of pi/2
	movsd %xmm0,%xmm2
	movsd .L__real_3fe45f306dc9c883(%rip),%xmm3 # twobypi
	movsd %xmm0,%xmm4
	movsd .L__real_3fe0000000000000(%rip),%xmm5 # .5
	mulsd %xmm3,%xmm2

	#/* How many pi/2 is x a multiple of? */
	# xexp = ax >> EXPSHIFTBITS_DP64;
	shr $52,%r10 # >>EXPSHIFTBITS_DP64

	# npi2 = (int)(x * twobypi + 0.5);
	addsd %xmm5,%xmm2 # npi2

	movsd .L__real_3ff921fb54400000(%rip),%xmm3 # piby2_1
	cvttpd2dq %xmm2,%xmm0 # convert to integer
	movsd .L__real_3dd0b4611a626331(%rip),%xmm1 # piby2_1tail
	cvtdq2pd %xmm0,%xmm2 # and back to float.

	# /* Subtract the multiple from x to get an extra-precision remainder */
	# rhead = x - npi2 * piby2_1;
	mulsd %xmm2,%xmm3
	subsd %xmm3,%xmm4 # rhead

	# rtail = npi2 * piby2_1tail;
	mulsd %xmm2,%xmm1
	movd %xmm0,%eax


	# GET_BITS_DP64(rhead-rtail, uy); ; originally only rhead
	movsd %xmm4,%xmm0
	subsd %xmm1,%xmm0

	movsd .L__real_3dd0b4611a600000(%rip),%xmm3 # piby2_2
	movsd %xmm0,p_temp(%rsp)
	movsd .L__real_3ba3198a2e037073(%rip),%xmm5 # piby2_2tail
	mov %eax,%ecx
	mov p_temp(%rsp),%r9 # rcx is rhead-rtail

	# xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
	# expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
	shl $1,%r9 # strip any sign bit
	shr $53,%r9 # >> EXPSHIFTBITS_DP64 +1
	sub %r9,%r10 # expdiff

	## if (expdiff > 15)
	cmp $15,%r10
	jle .Lexpdiff15

	# /* The remainder is pretty small compared with x, which
	# implies that x is a near multiple of pi/2
	# (x matches the multiple to at least 15 bits) */

	# t = rhead;
	movsd %xmm4,%xmm1

	# rtail = npi2 * piby2_2;
	mulsd %xmm2,%xmm3

	# rhead = t - rtail;
	mulsd %xmm2,%xmm5 # npi2 * piby2_2tail
	subsd %xmm3,%xmm4 # rhead

	# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
	subsd %xmm4,%xmm1 # t - rhead
	subsd %xmm3,%xmm1 # -rtail
	subsd %xmm1,%xmm5 # rtail

	# r = rhead - rtail;
	movsd %xmm4,%xmm0

	#HARSHA
	#xmm1=rtail
	movsd %xmm5,%xmm1
	subsd %xmm5,%xmm0

	# xmm0=r, xmm4=rhead, xmm1=rtail
	.Lexpdiff15:
	# region = npi2 & 3;

	subsd %xmm0,%xmm4 # rhead-r
	subsd %xmm1,%xmm4 # rr = (rhead-r) - rtail

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	## if the input was close to a pi/2 multiple
	# The original NAG code missed this trick. If the input is very close to n*pi/2 after
	# reduction,
	# then the sin is ~ 1.0 , to within 53 bits, when r is < 2^-27. We already
	# have x at this point, so we can skip the sin polynomials.

	cmp $0x03f2,%r9 # if r small.
	jge .Lcossin_piby4 # use taylor series if not
	cmp $0x03de,%r9 # if r really small.
	jle .Lr_small # then sin(r) = 0

	movsd %xmm0,%xmm2
	mulsd %xmm2,%xmm2 # x^2

	## if region is 0 or 2 do a sin calc.
	and $1,%ecx
	jnz .Lregion13

	# region 0 or 2 do a sincos calculation
	# use simply polynomial
	# sin=x - xxx*0.166666666666666666;
	movsd .L__real_3fc5555555555555(%rip),%xmm3 # 0.166666666
	mulsd %xmm0,%xmm3 # * x
	mulsd %xmm2,%xmm3 # * x^2
	subsd %xmm3,%xmm0 # xs
	# cos=1.0 - xx0.5;
	movsd .L__real_3ff0000000000000(%rip),%xmm1 # 1.0
	mulsd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5 *x^2
	subsd %xmm2,%xmm1 # xc

	jmp .Ladjust_region

	.align 16
	.Lregion13:
	# region 1 or 3 do a cossin calculation
	# use simply polynomial
	# sin=x - xxx*0.166666666666666666;
	movsd %xmm0,%xmm1

	movsd .L__real_3fc5555555555555(%rip),%xmm3 # 0.166666666
	mulsd %xmm0,%xmm3 # 0.166666666* x
	mulsd %xmm2,%xmm3 # 0.166666666* x * x^2
	subsd %xmm3,%xmm1 # xs
	# cos=1.0 - xx0.5;
	movsd .L__real_3ff0000000000000(%rip),%xmm0 # 1.0
	mulsd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5 *x^2
	subsd %xmm2,%xmm0 # xc

	jmp .Ladjust_region

	.align 16
	.Lr_small:
	## if region is 0 or 2 do a sincos calc.
	movsd .L__real_3ff0000000000000(%rip),%xmm1 # cos(r) is a 1
	and $1,%ecx
	jz .Ladjust_region

	## if region is 1 or 3 do a cossin calc.
	movsd %xmm0,%xmm1 # sin(r) is r
	movsd .L__real_3ff0000000000000(%rip),%xmm0 # cos(r) is a 1
	jmp .Ladjust_region

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	.align 16
	.Lsincos_reduce_precise:
	# // Reduce x into range [-pi/4,pi/4]
	# __amd_remainder_piby2(x, &r, &rr, &region);

	mov %rdi, p_temp1(%rsp)
	mov %rsi, p_temp1+8(%rsp)
	mov %r11,p_temp(%rsp)

	lea region(%rsp),%rdx
	lea rr(%rsp),%rsi
	lea r(%rsp),%rdi

	call __amd_remainder_piby2@PLT

	mov p_temp1(%rsp), %rdi
	mov p_temp1+8(%rsp), %rsi
	mov p_temp(%rsp),%r11

	movsd r(%rsp),%xmm0 # x
	movsd rr(%rsp),%xmm4 # xx
	mov region(%rsp),%eax # region to classify for sin/cos calc
	mov %eax,%ecx # region to get sign

	# xmm0 = x, xmm4 = xx, r8d = 1, eax= region
	.align 16
	.Lcossin_piby4:
	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	# perform taylor series to calc sinx, sinx
	# x2 = r * r;
	#xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path
	#instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path

	movsd %xmm0,%xmm2
	mulsd %xmm0,%xmm2 #x2

	## if region is 0 or 2 do a sincos calc.
	and $1,%ecx
	jz .Lsincos02

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	# region 1 or 3 - do a cossin calculation
	# zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));


	movlhps %xmm2,%xmm2

	movapd .Lcossinarray+0x50(%rip),%xmm3 # s6
	movapd %xmm2,%xmm1 # move for x4
	movdqa .Lcossinarray+0x20(%rip),%xmm5 # s3
	mulpd %xmm2,%xmm3 # x2s6
	addpd .Lcossinarray+0x40(%rip),%xmm3 # s5+x2s6
	mulpd %xmm2,%xmm5 # x2s3
	movsd %xmm4,p_temp(%rsp) # rr move to to memory
	mulpd %xmm2,%xmm1 # x4
	mulpd %xmm2,%xmm3 # x2(s5+x2s6)
	addpd .Lcossinarray+0x10(%rip),%xmm5 # s2+x2s3
	movapd %xmm1,%xmm4 # move for x6
	addpd .Lcossinarray+0x30(%rip),%xmm3 # s4 + x2(s5+x2s6)
	mulpd %xmm2,%xmm5 # x2(s2+x2s3)
	mulpd %xmm2,%xmm4 # x6
	addpd .Lcossinarray(%rip),%xmm5 # s1+x2(s2+x2s3)
	mulpd %xmm4,%xmm3 # x6(s4 + x2(s5+x2s6))

	movsd %xmm2,%xmm4 # xmm4 = x2 for 0.5x2 cos
	# xmm2 contains x2 for x3 sin

	addpd %xmm5,%xmm3 # zc in lower and zs in upper

	mulsd %xmm0,%xmm2 # xmm2=x3 for the sin term

	movhlps %xmm3,%xmm5 # Copy z, xmm5 = sin, xmm3 = cos
	mulsd .L__real_3fe0000000000000(%rip),%xmm4 # xmm4=r=0.5*x2 for cos term

	mulsd %xmm2,%xmm5 # sin *x3
	mulsd %xmm1,%xmm3 # cos *x4
	movsd %xmm0,p_temp1(%rsp) # store x
	movsd %xmm0,%xmm1

	movsd .L__real_3ff0000000000000(%rip),%xmm2 # 1.0
	subsd %xmm4,%xmm2 # t=1.0-r

	movsd .L__real_3ff0000000000000(%rip),%xmm0 # 1.0
	subsd %xmm2,%xmm0 # 1 - t

	mulsd p_temp(%rsp),%xmm1 # x*xx
	subsd %xmm4,%xmm0 # (1-t) -r
	subsd %xmm1,%xmm0 # ((1-t) -r) - x *xx

	mulsd p_temp(%rsp),%xmm4 # 0.5x2xx

	addsd %xmm3,%xmm0 # (((1-t) -r) - x *xx) + cos

	subsd %xmm4,%xmm5 # sin - 0.5x2xx

	addsd %xmm2,%xmm0 # xmm0 = t +{ (((1-t) -r) - x *xx) + cos}, final cos term

	addsd p_temp(%rsp),%xmm5 # sin + xx
	movsd p_temp1(%rsp),%xmm1 # load x
	addsd %xmm5,%xmm1 # xmm1= sin+x, final sin term

	jmp .Ladjust_region

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	.align 16
	.Lsincos02:
	# region 0 or 2 do a sincos calculation
	movlhps %xmm2,%xmm2

	movapd .Lsincosarray+0x50(%rip),%xmm3 # s6
	movapd %xmm2,%xmm1 # move for x4
	movdqa .Lsincosarray+0x20(%rip),%xmm5 # s3
	mulpd %xmm2,%xmm3 # x2s6
	addpd .Lsincosarray+0x40(%rip),%xmm3 # s5+x2s6
	mulpd %xmm2,%xmm5 # x2s3
	movsd %xmm4,p_temp(%rsp) # rr move to to memory
	mulpd %xmm2,%xmm1 # x4
	mulpd %xmm2,%xmm3 # x2(s5+x2s6)
	addpd .Lsincosarray+0x10(%rip),%xmm5 # s2+x2s3
	movapd %xmm1,%xmm4 # move for x6
	addpd .Lsincosarray+0x30(%rip),%xmm3 # s4 + x2(s5+x2s6)
	mulpd %xmm2,%xmm5 # x2(s2+x2s3)
	mulpd %xmm2,%xmm4 # x6
	addpd .Lsincosarray(%rip),%xmm5 # s1+x2(s2+x2s3)
	mulpd %xmm4,%xmm3 # x6(s4 + x2(s5+x2s6))

	movsd %xmm2,%xmm4 # xmm4 = x2 for 0.5x2 for cos
	# xmm2 contains x2 for x3 for sin

	addpd %xmm5,%xmm3 # zs in lower and zc in upper

	mulsd %xmm0,%xmm2 # xmm2=x3 for sin

	movhlps %xmm3,%xmm5 # Copy z, xmm5 = cos , xmm3 = sin

	mulsd .L__real_3fe0000000000000(%rip),%xmm4 # xmm4=r=0.5*x2 for cos term

	mulsd %xmm2,%xmm3 # sin *x3
	mulsd %xmm1,%xmm5 # cos *x4

	movsd .L__real_3ff0000000000000(%rip),%xmm2 # 1.0
	subsd %xmm4,%xmm2 # t=1.0-r

	movsd .L__real_3ff0000000000000(%rip),%xmm1 # 1.0
	subsd %xmm2,%xmm1 # 1 - t

	movsd %xmm0,p_temp1(%rsp) # store x
	mulsd p_temp(%rsp),%xmm0 # x*xx

	subsd %xmm4,%xmm1 # (1-t) -r
	subsd %xmm0,%xmm1 # ((1-t) -r) - x *xx

	mulsd p_temp(%rsp),%xmm4 # 0.5x2xx

	addsd %xmm5,%xmm1 # (((1-t) -r) - x *xx) + cos

	subsd %xmm4,%xmm3 # sin - 0.5x2xx

	addsd %xmm2,%xmm1 # xmm1 = t +{ (((1-t) -r) - x *xx) + cos}, final cos term

	addsd p_temp(%rsp),%xmm3 # sin + xx
	movsd p_temp1(%rsp),%xmm0 # load x
	addsd %xmm3,%xmm0 # xmm0= sin+x, final sin term

	jmp .Ladjust_region

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	# switch (region)
	.align 16
	.Ladjust_region: # positive or negative for sin return val in xmm0

	mov %eax,%r9d

	shr $1,%eax
	mov %eax,%ecx
	and %r11d,%eax

	not %ecx
	not %r11d
	and %r11d,%ecx

	or %ecx,%eax
	and $1,%eax
	jnz .Lcos_sign

	## if the original region 0, 1 and arg is negative, then we negate the result.
	## if the original region 2, 3 and arg is positive, then we negate the result.
	movsd %xmm0,%xmm2
	xorpd %xmm0,%xmm0
	subsd %xmm2,%xmm0

	.Lcos_sign: # positive or negative for cos return val in xmm1
	add $1,%r9
	and $2,%r9d
	jz .Lsincos_cleanup
	## if the original region 1 or 2 then we negate the result.
	movsd %xmm1,%xmm2
	xorpd %xmm1,%xmm1
	subsd %xmm2,%xmm1

	#.align 16
	.Lsincos_cleanup:
	movsd %xmm0, (%rdi) # save the sin
	movsd %xmm1, (%rsi) # save the cos

	add $stack_size,%rsp
	ret

	.align 16
	.Lsincos_naninf:
	call fname_special
	add $stack_size, %rsp
	ret