 # src/gas/sinf.S - open64_libacml_mv - Git at Google


 #
 #  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
 #
 #  This file is part of libacml_mv.
 #
 #  libacml_mv is free software; you can redistribute it and/or
 #  modify it under the terms of the GNU Lesser General Public
 #  License as published by the Free Software Foundation; either
 #  version 2.1 of the License, or (at your option) any later version.
 #
 #  libacml_mv is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 #  Lesser General Public License for more details.
 #
 #  You should have received a copy of the GNU Lesser General Public
 #  License along with libacml_mv.  If not, see
 #  <http://www.gnu.org/licenses/>.
 #
 #


 #
 # An implementation of the sinf function.
 #
 # Prototype:
 #
 #     float sinf(float x);
 #
 #   Computes sinf(x).
 #   It will provide proper C99 return values,
 #   but may not raise floating point status bits properly.
 #   Based on the NAG C implementation.
 #
 #
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif

 # Double-precision constants for the sinf code in this file.  Each value
 # occupies the low quadword of a 16-byte slot; the second quadword of the
 # slot is zero padding.
 .data
 .align 32
 .L__real_3ff0000000000000: .quad 0x3ff0000000000000, 0   # 1.0
 .L__real_3fe0000000000000: .quad 0x3fe0000000000000, 0   # 0.5
 .L__real_3fc5555555555555: .quad 0x3fc5555555555555, 0   # 0.166666666666
 .L__real_3fe45f306dc9c883: .quad 0x3fe45f306dc9c883, 0   # twobypi
 .L__real_3ff921fb54400000: .quad 0x3ff921fb54400000, 0   # piby2_1
 .L__real_3dd0b4611a626331: .quad 0x3dd0b4611a626331, 0   # piby2_1tail
 .L__real_3dd0b4611a600000: .quad 0x3dd0b4611a600000, 0   # piby2_2
 .L__real_3ba3198a2e037073: .quad 0x3ba3198a2e037073, 0   # piby2_2tail
 # The next label is named after the bit pattern of 5.0e5 (0x411E848000000000),
 # but the value actually stored is 5.0e6.
 .L__real_411E848000000000: .quad 0x415312d000000000, 0   # 5.0e6

 # Cosine series coefficients, one per 16-byte slot (c0 at +0x00 ... c4 at +0x40).
 .align 32
 .Lcosfarray:
    .quad   0xbfe0000000000000, 0                      # c0 = -0.5
    .quad   0x3fa5555555555555, 0                      # c1 =  0.0416667
    .quad   0xbf56c16c16c16c16, 0                      # c2 = -0.00138889
    .quad   0x3efa01a01a01a019, 0                      # c3 =  2.48016e-005
    .quad   0xbe927e4fb7789f5c, 0                      # c4 = -2.75573e-007

 # Sine series coefficients, one per 16-byte slot (s1 at +0x00 ... s4 at +0x30).
 .align 32
 .Lsinfarray:
    .quad   0xbfc5555555555555, 0                      # s1 = -0.166667
    .quad   0x3f81111111111111, 0                      # s2 =  0.00833333
    .quad   0xbf2a01a01a01a01a, 0                      # s3 = -0.000198413
    .quad   0x3ec71de3a556c734, 0                      # s4 =  2.75573e-006

 # Code section setup, symbol names and stack frame layout for sinf.
 .text
 .align 32
 .p2align 4,,15

 # fname is the exported symbol; FN_PROTOTYPE comes from fn_macros.h, which
 # is not visible here.  fname_special is the PLT reference used for the
 # NaN/infinity path.
 #include "fn_macros.h"
 #define fname FN_PROTOTYPE(sinf)
 #define fname_special _sinf_special@PLT

 # Offsets below are relative to rsp after the prologue has subtracted
 # stack_size.
 # define local variable storage offsets
 .equ   p_temp,     0x30                               # scratch slot: moves bits between xmm and integer registers, and holds r11 across the helper call
 .equ   p_temp1,    0x40                               # second scratch slot; not referenced by the code visible in this file
 .equ   r,          0x50                               # storage for r; its address is passed to __amd_remainder_piby2d2f
 .equ   region,     0x60                               # storage for region; its address is passed to __amd_remainder_piby2d2f
 .equ   stack_size, 0x88                               # frame size; 0x88 = 8 mod 16, so a System V entry rsp (8 mod 16) becomes 16-byte aligned

 .globl fname
 .type  fname,@function

 #-----------------------------------------------------------------------
 # float sinf(float x) -- entry point and paths for abs(x) <= pi/4
 # In:   xmm0 = x (single precision)
 # Out:  xmm0 = result (single precision), produced at .Lsinf_cleanup
 # Register roles established here and relied on by the later sections:
 #   xmm0 = x widened to double
 #   xmm2 = 0.0 (until a path overwrites it)
 #   rdx  = ux, the bit pattern of x as a double
 #   r10  = ax, ux with the sign bit cleared
 #   r8d  = 1, mask for the low region bit
 # Dispatch on ax:
 #   NaN or infinity    -> .Lsinf_naninf
 #   ax >  pi/4         -> .Lsinf_reduce
 #   ax >= 2^-7         -> .Lsinf_small    (four-term polynomial)
 #   ax >= 2^-13        -> .Lsinf_smaller  (x - x^3/6)
 #   otherwise          -> result is x itself
 #-----------------------------------------------------------------------
 fname:
    sub      $stack_size, %rsp
    xorpd    %xmm2, %xmm2                              # zeroed out for later use

 ##  if NaN or inf
 # All eight single-precision exponent bits set means NaN or infinity.
    movd     %xmm0, %edx
    mov      $0x07f800000, %eax
    mov      %eax, %r10d
    and      %edx, %r10d
    cmp      %eax, %r10d
    jz       .Lsinf_naninf

 # GET_BITS_DP64(x, ux);
 # get the input value to an integer register.
    cvtss2sd %xmm0, %xmm0         # convert input to double.
    movsd    %xmm0,p_temp(%rsp)   # get the input value to an integer register.

    mov   p_temp(%rsp), %rdx      # rdx is ux

 #  ax = (ux & ~SIGNBIT_DP64);
    mov      $0x07fffffffffffffff, %r10
    and      %rdx, %r10            # r10 is ax
    mov      $1, %r8d            # for determining region later on

 ##  if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
 # ax has its sign bit cleared, so the signed compares below order the bit
 # patterns the same way as the magnitudes.
    mov      $0x03fe921fb54442d18, %rax
    cmp      %rax, %r10
    jg       .Lsinf_reduce

 ##      if (ax < 0x3f80000000000000) /* abs(x) < 2.0^(-7) */
 # Branch taken when ax is NOT below 2^-7: use the full polynomial.
    mov      $0x3f80000000000000, %rax
    cmp      %rax, %r10
    jge      .Lsinf_small

 ##          if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
 # Branch taken when ax is NOT below 2^-13: use x - x^3/6.
    mov      $0x3f20000000000000, %rax
    cmp      %rax, %r10
    jge      .Lsinf_smaller

 #                  sinf = x;
    jmp      .Lsinf_cleanup         # done

 ##          else

 .Lsinf_smaller:
 #              sinf = x - x^3 * 0.1666666666666666666;
    movsd    %xmm0, %xmm2
    movsd    .L__real_3fc5555555555555(%rip), %xmm4   # 0.1666666666666666666
    mulsd    %xmm2, %xmm2            # x^2
    mulsd    %xmm0, %xmm2            # x^3
    mulsd    %xmm4, %xmm2            # x^3 * 0.1666666666666666666
    subsd    %xmm2, %xmm0            # x - x^3 * 0.1666666666666666666
    jmp      .Lsinf_cleanup

 .Lsinf_small:
 # Here 2^-7 <= abs(x) <= pi/4.  No reduction was done, so the sign of x is
 # carried by x itself and the code goes straight to .Lsinf_cleanup.
    movsd    %xmm0, %xmm2
    mulsd    %xmm0, %xmm2            # x2

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # region 0 or 2    - do a sinf calculation
 #  zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
    movsd    .Lsinfarray+0x30(%rip), %xmm1   # s4
    mulsd    %xmm2, %xmm1                 # s4x2
    movsd    %xmm2, %xmm4                    # move for x4
    movsd    .Lsinfarray+0x10(%rip), %xmm5   # s2
    mulsd    %xmm2, %xmm4                 # x4
    movsd    %xmm0, %xmm3                     # move for x3
    mulsd    %xmm2, %xmm5                     # s2x2
    mulsd    %xmm2, %xmm3                     # x3
    addsd    .Lsinfarray+0x20(%rip), %xmm1   # s3+s4x2
    mulsd    %xmm4, %xmm1                    # s3x4+s4x6
    addsd    .Lsinfarray(%rip), %xmm5           # s1+s2x2
    addsd    %xmm5, %xmm1                    # s1+s2x2+s3x4+s4x6
    mulsd    %xmm3, %xmm1                    # x3(s1+s2x2+s3x4+s4x6)
    addsd    %xmm1, %xmm0                    # x + x3(s1+s2x2+s3x4+s4x6)
    jmp      .Lsinf_cleanup

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .align 32
 .Lsinf_reduce:
 # Argument reduction for abs(x) > pi/4.
 # In:   xmm0 = x as a double, xmm2 = 0.0, rdx = ux, r10 = ax, r8d = 1
 # Out:  xmm0 = reduced argument r (or an already finished magnitude),
 #       eax  = npi2 (only its two low bits are examined afterwards),
 #       r11d = xneg (1 when the original x was negative)

 #  xneg = (ax != ux);
    cmp      %r10, %rdx
    mov      $0, %r11d               # mov does not touch the flags, so the jz below still tests the cmp

 ##  if (xneg) x = -x;
    jz       .L50e5
    mov      $1, %r11d
    subsd    %xmm0, %xmm2            # xmm2 = 0.0 - x
    movsd    %xmm2, %xmm0

 .L50e5:
 ##  if (x < 5.0e5)
 # NOTE: the label name and the pseudo-code above say 5.0e5, but the value
 # stored at this label in the data section is 0x415312d000000000, which is
 # 5.0e6.  Both operands are bit patterns of non-negative doubles, so the
 # unsigned integer compare orders them the same way as their values.
    cmp      .L__real_411E848000000000(%rip), %r10
    jae      .Lsinf_reduce_precise

 # reduce  the argument to be in a range from -pi/4 to +pi/4
 # by subtracting multiples of pi/2
    movsd    %xmm0, %xmm2
    movsd    .L__real_3fe45f306dc9c883(%rip), %xmm3            # twobypi
    movsd    %xmm0, %xmm4
    movsd    .L__real_3fe0000000000000(%rip), %xmm5            # .5
    mulsd    %xmm3, %xmm2

 #/* How many pi/2 is x a multiple of? */
 #      xexp  = ax >> EXPSHIFTBITS_DP64;
    mov      %r10, %r9
    shr      $52, %r9                  #>>EXPSHIFTBITS_DP64

 #        npi2  = (int)(x * twobypi + 0.5);
    addsd    %xmm5, %xmm2                  # npi2

    movsd    .L__real_3ff921fb54400000(%rip), %xmm3         # piby2_1
    cvttpd2dq   %xmm2, %xmm0               # convert to integer
    movsd    .L__real_3dd0b4611a626331(%rip), %xmm1         # piby2_1tail
    cvtdq2pd   %xmm0, %xmm2               # and back to double.

 #      /* Subtract the multiple from x to get an extra-precision remainder */
 #      rhead  = x - npi2 * piby2_1;
    mulsd    %xmm2, %xmm3
    subsd    %xmm3, %xmm4                  # rhead

 #      rtail  = npi2 * piby2_1tail;
    mulsd    %xmm2, %xmm1
    movd     %xmm0, %eax                   # eax = npi2 as an integer

 #      GET_BITS_DP64(rhead-rtail, uy);               ; originally only rhead
    movsd    %xmm4, %xmm0
    subsd    %xmm1, %xmm0

    movsd    .L__real_3dd0b4611a600000(%rip), %xmm3      # piby2_2
    movsd    %xmm0,p_temp(%rsp)
    movsd    .L__real_3ba3198a2e037073(%rip), %xmm5      # piby2_2tail
    mov      p_temp(%rsp), %rcx         # rcx is rhead-rtail

 #   xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
 #      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
    shl      $1, %rcx               # strip any sign bit
    shr      $53, %rcx               #>> EXPSHIFTBITS_DP64 +1
    sub      %rcx, %r9               #expdiff

 ##      if (expdiff > 15)
    cmp      $15, %r9
    jle      .Lexpdiff15

 #          /* The remainder is pretty small compared with x, which
 #             implies that x is a near multiple of pi/2
 #             (x matches the multiple to at least 15 bits) */

 #          t  = rhead;
    movsd    %xmm4, %xmm1

 #          rtail  = npi2 * piby2_2;
    mulsd    %xmm2, %xmm3

 #          rhead  = t - rtail;
    mulsd    %xmm2, %xmm5            # npi2 * piby2_2tail
    subsd    %xmm3, %xmm4            # rhead

 #          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
    subsd    %xmm4, %xmm1            # t - rhead
    subsd    %xmm3, %xmm1            # -rtail
    subsd    %xmm1, %xmm5            #rtail

 #      r = rhead - rtail;
    movsd    %xmm4, %xmm0

 #HARSHA
 #xmm1=rtail
    movsd    %xmm5, %xmm1
    subsd    %xmm5, %xmm0

 #   xmm0=r, xmm4=rhead, xmm1=rtail
 .Lexpdiff15:
 #      region = npi2 & 3;
 # No need rr for float case

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ## if the input was close to a pi/2 multiple
 # The original NAG code missed this trick.  If the input is very close to n*pi/2 after
 # reduction,
 # then the sinf is ~ 1.0 , to within 15 bits, when r is < 2^-13.  We already
 # have x at this point, so we can skip the sinf polynomials.
 #
 # rcx is the biased exponent of the first-stage r; it is not recomputed
 # when the second-stage refinement above has run.
 #   rcx >= 0x3f2 (abs(r) >= 2^-13)        -> full polynomials at .Lsinf_piby4
 #   rcx <= 0x3de (abs(r) <  2^-32)        -> .Lr_small: sin(r) = r, cos(r) = 1
 #   otherwise                             -> two-term series below

    cmp      $0x03f2, %rcx            ## if r  small.
    jge      .Lsinf_piby4            # use taylor series if not
    cmp      $0x03de, %rcx            ## if r really small.
    jle      .Lr_small               # then sinf(r) = 0

    movsd    %xmm0, %xmm2
    mulsd    %xmm2, %xmm2            #x^2

 ##      if region is 0 or 2 do a sinf calc.
    and      %eax, %r8d              # r8d = low bit of npi2; nonzero selects the cosine form
    jnz      .Lcosfregion

 # region 0 or 2 do a sinf calculation
 # use simply polynomial
 #              x - x*x*x*0.166666666666666666;
    movsd    .L__real_3fc5555555555555(%rip), %xmm3         #
    mulsd    %xmm0, %xmm3                   # * x
    mulsd    %xmm2, %xmm3                  # * x^2
    subsd    %xmm3, %xmm0                   # xs
    jmp      .Ladjust_region

 .align 32
 .Lcosfregion:
 # region 1 or 3 do a cosf calculation
 # use simply polynomial
 #              1.0 - x*x*0.5;
    movsd    .L__real_3ff0000000000000(%rip), %xmm0         # 1.0
    mulsd    .L__real_3fe0000000000000(%rip), %xmm2         # 0.5 *x^2
    subsd    %xmm2, %xmm0                  # xc
    jmp      .Ladjust_region

 .align 32
 .Lr_small:
 ##      if region is 1 or 3   do a cosf calc.
 # Even region: the result magnitude is r itself, already in xmm0.
    and      %eax, %r8d
    jz       .Ladjust_region

 # odd
    movsd    .L__real_3ff0000000000000(%rip), %xmm0         # cosf(r) is a 1
    jmp      .Ladjust_region

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .Lsinf_reduce_precise:
 #      // Reduce x into range [-pi/4,pi/4]
 #      __amd_remainder_piby2d2f(x, &r, &region);
 #
 # Large-argument path, taken when ax is at or above the threshold constant.
 # In:   xmm0 = abs(x) as a double, r11d = xneg
 # Out:  xmm0 = r (reduced argument), eax = region, r8d = 1, r11d = xneg
 # Falls through into .Lsinf_piby4.

    mov      %r11,p_temp(%rsp)       # r11 is caller-saved: keep xneg in the frame across the call
    lea      region(%rsp), %rdx      # third argument: &region
    lea      r(%rsp), %rsi           # second argument: &r
    movd     %xmm0, %rdi             # first argument: bit pattern of abs(x)
    sub      $0x20, %rsp             # extra scratch; a multiple of 16, so stack alignment is unchanged

    call     __amd_remainder_piby2d2f@PLT

    add      $0x20, %rsp
    mov      p_temp(%rsp), %r11      # restore xneg
    mov      $1, %r8d            # for determining region later on
 # The helper hands r back through memory only, and xmm0 is not preserved
 # across a call, so r has to be reloaded into xmm0, the register that
 # .Lsinf_piby4 reads.  (This load used to target xmm1, which .Lsinf_piby4
 # overwrites before ever reading it.)
    movsd    r(%rsp), %xmm0      # xmm0 = r
    mov      region(%rsp), %eax   #//region

 # xmm0 = r, r8d = 1, eax = region, r11d = xneg
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # perform taylor series to calc sinfx, cosfx
 .Lsinf_piby4:
 # Full-precision polynomial evaluation on the reduced argument.
 # In:   xmm0 = r, eax = region (or npi2), r8d = 1
 # Out:  xmm0 = sin(r) for an even region, cos(r) for an odd region.
 # Both branches end at .Ladjust_region, which applies the sign.
 #  x2 = r * r;
    movsd    %xmm0, %xmm2
    mulsd    %xmm0, %xmm2                  #x2

 ##      if region is 0 or 2   do a sinf calc.
    and      %eax, %r8d                    # r8d = low bit of region; nonzero selects cosine
    jnz      .Lcosfregion2

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # region 0 or 2 do a sinf calculation
 #  zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
    movsd    .Lsinfarray+0x30(%rip), %xmm1   # s4
    mulsd    %xmm2, %xmm1                 # s4x2
    movsd    %xmm2, %xmm4                    # move for x4
    mulsd    %xmm2, %xmm4                 # x4
    movsd    .Lsinfarray+0x10(%rip), %xmm5   # s2
    mulsd    %xmm2, %xmm5                     # s2x2
    movsd    %xmm0, %xmm3                     # move for x3
    mulsd    %xmm2, %xmm3                     # x3
    addsd    .Lsinfarray+0x20(%rip), %xmm1   # s3+s4x2
    mulsd    %xmm4, %xmm1                    # s3x4+s4x6
    addsd    .Lsinfarray(%rip), %xmm5           # s1+s2x2
    addsd    %xmm5, %xmm1                    # s1+s2x2+s3x4+s4x6
    mulsd    %xmm3, %xmm1                    # x3(s1+s2x2+s3x4+s4x6)
    addsd    %xmm1, %xmm0                    # x + x3(s1+s2x2+s3x4+s4x6)

    jmp      .Ladjust_region

 .align 32
 .Lcosfregion2:
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # region 1 or 3    - do a cosf calculation
 #    zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
 # xmm0 is reused for the result here; r itself is no longer needed once x2
 # is in xmm2.
    movsd    .Lcosfarray+0x40(%rip), %xmm1    # c4
    movsd    %xmm2, %xmm4                     # move for x4
    mulsd    %xmm2, %xmm1                     # c4x2
    movsd    .Lcosfarray+0x20(%rip), %xmm3    # c2
    mulsd    %xmm2, %xmm4                     # x4
    movsd    .Lcosfarray(%rip), %xmm0         # c0
    mulsd    %xmm2, %xmm3                     # c2x2
    mulsd    %xmm2, %xmm0                     # c0x2 (=-0.5x2)
    addsd    .Lcosfarray+0x30(%rip), %xmm1      # c3+c4x2
    mulsd    %xmm4, %xmm1                     # c3x4 + c4x6
    addsd    .Lcosfarray+0x10(%rip), %xmm3      # c1+c2x2
    addsd    %xmm3, %xmm1                     # c1 + c2x2 + c3x4 + c4x6
    mulsd    %xmm4, %xmm1                  # c1x4 + c2x6 + c3x8 + c4x10
    addsd    .L__real_3ff0000000000000(%rip), %xmm0 # 1 - 0.5x2
    addsd    %xmm1, %xmm0                     # 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
 # Execution continues into .Ladjust_region, which follows this code.

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 .align 32
 .Ladjust_region:
 # Sign selection for the reduced paths.
 # In:  xmm0 = result magnitude as a double,
 #      eax  = region (or npi2; only bits 0 and 1 matter),
 #      r11d = xneg (0 or 1).
 # The result is left alone when bit 1 of region equals xneg and is negated
 # when the two differ, i.e. regions 0,1 with a negative argument or
 # regions 2,3 with a positive argument.
    shrl     $1, %eax                # bit 0 of eax = bit 1 of region
    xorl     %r11d, %eax             # bit 0 = (region bit 1) XOR xneg
    testl    $1, %eax
    jz       .Lsinf_cleanup          # bits agree: keep the sign

 # Bits differ: result = 0.0 - result.
    movsd    %xmm0, %xmm2
    xorpd    %xmm0, %xmm0
    subsd    %xmm2, %xmm0

 .align 32
 .Lsinf_cleanup:
 # Common exit: narrow the double result to single precision, release the
 # frame and return.
    cvtsd2ss %xmm0, %xmm0
    addq     $stack_size, %rsp
    ret

 .align 32
 .Lsinf_naninf:
 # NaN or infinity input.  xmm0 still holds the untouched single-precision
 # x, which is handed to the special-case routine; whatever that routine
 # returns is passed back to the caller.
    call     fname_special
    addq     $stack_size, %rsp
    ret

	#
	# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
	#
	# This file is part of libacml_mv.
	#
	# libacml_mv is free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.
	#
	# libacml_mv is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public
	# License along with libacml_mv. If not, see
	# <http://www.gnu.org/licenses/>.
	#
	#


	#
	# An implementation of the sinf function.
	#
	# Prototype:
	#
	# float sinf(float x);
	#
	# Computes sinf(x).
	# It will provide proper C99 return values,
	# but may not raise floating point status bits properly.
	# Based on the NAG C implementation.
	#
	#
	#ifdef __ELF__
	.section .note.GNU-stack,"",@progbits
	#endif

	# Double-precision constants for the sinf code in this file.  Each value
	# occupies the low quadword of a 16-byte slot; the second quadword of the
	# slot is zero padding.
	.data
	.align 32
	.L__real_3ff0000000000000: .quad 0x3ff0000000000000, 0 # 1.0
	.L__real_3fe0000000000000: .quad 0x3fe0000000000000, 0 # 0.5
	.L__real_3fc5555555555555: .quad 0x3fc5555555555555, 0 # 0.166666666666
	.L__real_3fe45f306dc9c883: .quad 0x3fe45f306dc9c883, 0 # twobypi
	.L__real_3ff921fb54400000: .quad 0x3ff921fb54400000, 0 # piby2_1
	.L__real_3dd0b4611a626331: .quad 0x3dd0b4611a626331, 0 # piby2_1tail
	.L__real_3dd0b4611a600000: .quad 0x3dd0b4611a600000, 0 # piby2_2
	.L__real_3ba3198a2e037073: .quad 0x3ba3198a2e037073, 0 # piby2_2tail
	# The next label is named after the bit pattern of 5.0e5 (0x411E848000000000),
	# but the value actually stored is 5.0e6.
	.L__real_411E848000000000: .quad 0x415312d000000000, 0 # 5.0e6

	# Cosine series coefficients, one per 16-byte slot (c0 at +0x00 ... c4 at +0x40).
	.align 32
	.Lcosfarray:
	.quad 0xbfe0000000000000, 0 # c0 = -0.5
	.quad 0x3fa5555555555555, 0 # c1 =  0.0416667
	.quad 0xbf56c16c16c16c16, 0 # c2 = -0.00138889
	.quad 0x3efa01a01a01a019, 0 # c3 =  2.48016e-005
	.quad 0xbe927e4fb7789f5c, 0 # c4 = -2.75573e-007

	# Sine series coefficients, one per 16-byte slot (s1 at +0x00 ... s4 at +0x30).
	.align 32
	.Lsinfarray:
	.quad 0xbfc5555555555555, 0 # s1 = -0.166667
	.quad 0x3f81111111111111, 0 # s2 =  0.00833333
	.quad 0xbf2a01a01a01a01a, 0 # s3 = -0.000198413
	.quad 0x3ec71de3a556c734, 0 # s4 =  2.75573e-006

	# Code section setup, symbol names and stack frame layout for sinf.
	.text
	.align 32
	.p2align 4,,15

	# fname is the exported symbol; FN_PROTOTYPE comes from fn_macros.h, which
	# is not visible here.  fname_special is the PLT reference used for the
	# NaN/infinity path.
	#include "fn_macros.h"
	#define fname FN_PROTOTYPE(sinf)
	#define fname_special _sinf_special@PLT

	# Offsets below are relative to rsp after the prologue has subtracted
	# stack_size.
	# define local variable storage offsets
	.equ p_temp, 0x30 # scratch slot: moves bits between xmm and integer registers, and holds r11 across the helper call
	.equ p_temp1, 0x40 # second scratch slot; not referenced by the code visible in this file
	.equ r, 0x50 # storage for r; its address is passed to __amd_remainder_piby2d2f
	.equ region, 0x60 # storage for region; its address is passed to __amd_remainder_piby2d2f
	.equ stack_size, 0x88 # frame size; 0x88 = 8 mod 16, so a System V entry rsp (8 mod 16) becomes 16-byte aligned

	.globl fname
	.type fname,@function

	#-----------------------------------------------------------------------
	# float sinf(float x) -- entry point and paths for abs(x) <= pi/4
	# In:   xmm0 = x (single precision)
	# Out:  xmm0 = result (single precision), produced at .Lsinf_cleanup
	# Register roles established here and relied on by the later sections:
	#   xmm0 = x widened to double
	#   xmm2 = 0.0 (until a path overwrites it)
	#   rdx  = ux, the bit pattern of x as a double
	#   r10  = ax, ux with the sign bit cleared
	#   r8d  = 1, mask for the low region bit
	# Dispatch on ax:
	#   NaN or infinity    -> .Lsinf_naninf
	#   ax >  pi/4         -> .Lsinf_reduce
	#   ax >= 2^-7         -> .Lsinf_small    (four-term polynomial)
	#   ax >= 2^-13        -> .Lsinf_smaller  (x - x^3/6)
	#   otherwise          -> result is x itself
	#-----------------------------------------------------------------------
	fname:
	sub $stack_size, %rsp
	xorpd %xmm2, %xmm2 # zeroed out for later use

	## if NaN or inf
	# All eight single-precision exponent bits set means NaN or infinity.
	movd %xmm0, %edx
	mov $0x07f800000, %eax
	mov %eax, %r10d
	and %edx, %r10d
	cmp %eax, %r10d
	jz .Lsinf_naninf

	# GET_BITS_DP64(x, ux);
	# get the input value to an integer register.
	cvtss2sd %xmm0, %xmm0 # convert input to double.
	movsd %xmm0,p_temp(%rsp) # get the input value to an integer register.

	mov p_temp(%rsp), %rdx # rdx is ux

	# ax = (ux & ~SIGNBIT_DP64);
	mov $0x07fffffffffffffff, %r10
	and %rdx, %r10 # r10 is ax
	mov $1, %r8d # for determining region later on

	## if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
	# ax has its sign bit cleared, so the signed compares below order the bit
	# patterns the same way as the magnitudes.
	mov $0x03fe921fb54442d18, %rax
	cmp %rax, %r10
	jg .Lsinf_reduce

	## if (ax < 0x3f80000000000000) /* abs(x) < 2.0^(-7) */
	# Branch taken when ax is NOT below 2^-7: use the full polynomial.
	mov $0x3f80000000000000, %rax
	cmp %rax, %r10
	jge .Lsinf_small

	## if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
	# Branch taken when ax is NOT below 2^-13: use x - x^3/6.
	mov $0x3f20000000000000, %rax
	cmp %rax, %r10
	jge .Lsinf_smaller

	# sinf = x;
	jmp .Lsinf_cleanup # done

	## else

	.Lsinf_smaller:
	# sinf = x - x^3 * 0.1666666666666666666;
	movsd %xmm0, %xmm2
	movsd .L__real_3fc5555555555555(%rip), %xmm4 # 0.1666666666666666666
	mulsd %xmm2, %xmm2 # x^2
	mulsd %xmm0, %xmm2 # x^3
	mulsd %xmm4, %xmm2 # x^3 * 0.1666666666666666666
	subsd %xmm2, %xmm0 # x - x^3 * 0.1666666666666666666
	jmp .Lsinf_cleanup

	.Lsinf_small:
	# Here 2^-7 <= abs(x) <= pi/4.  No reduction was done, so the sign of x is
	# carried by x itself and the code goes straight to .Lsinf_cleanup.
	movsd %xmm0, %xmm2
	mulsd %xmm0, %xmm2 # x2

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	# region 0 or 2 - do a sinf calculation
	# zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
	movsd .Lsinfarray+0x30(%rip), %xmm1 # s4
	mulsd %xmm2, %xmm1 # s4x2
	movsd %xmm2, %xmm4 # move for x4
	movsd .Lsinfarray+0x10(%rip), %xmm5 # s2
	mulsd %xmm2, %xmm4 # x4
	movsd %xmm0, %xmm3 # move for x3
	mulsd %xmm2, %xmm5 # s2x2
	mulsd %xmm2, %xmm3 # x3
	addsd .Lsinfarray+0x20(%rip), %xmm1 # s3+s4x2
	mulsd %xmm4, %xmm1 # s3x4+s4x6
	addsd .Lsinfarray(%rip), %xmm5 # s1+s2x2
	addsd %xmm5, %xmm1 # s1+s2x2+s3x4+s4x6
	mulsd %xmm3, %xmm1 # x3(s1+s2x2+s3x4+s4x6)
	addsd %xmm1, %xmm0 # x + x3(s1+s2x2+s3x4+s4x6)
	jmp .Lsinf_cleanup

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	.align 32
	.Lsinf_reduce:
	# Argument reduction for abs(x) > pi/4.
	# In:   xmm0 = x as a double, xmm2 = 0.0, rdx = ux, r10 = ax, r8d = 1
	# Out:  xmm0 = reduced argument r (or an already finished magnitude),
	#       eax  = npi2 (only its two low bits are examined afterwards),
	#       r11d = xneg (1 when the original x was negative)

	# xneg = (ax != ux);
	cmp %r10, %rdx
	mov $0, %r11d # mov does not touch the flags, so the jz below still tests the cmp

	## if (xneg) x = -x;
	jz .L50e5
	mov $1, %r11d
	subsd %xmm0, %xmm2 # xmm2 = 0.0 - x
	movsd %xmm2, %xmm0

	.L50e5:
	## if (x < 5.0e5)
	# NOTE: the label name and the pseudo-code above say 5.0e5, but the value
	# stored at this label in the data section is 0x415312d000000000, which is
	# 5.0e6.  Both operands are bit patterns of non-negative doubles, so the
	# unsigned integer compare orders them the same way as their values.
	cmp .L__real_411E848000000000(%rip), %r10
	jae .Lsinf_reduce_precise

	# reduce the argument to be in a range from -pi/4 to +pi/4
	# by subtracting multiples of pi/2
	movsd %xmm0, %xmm2
	movsd .L__real_3fe45f306dc9c883(%rip), %xmm3 # twobypi
	movsd %xmm0, %xmm4
	movsd .L__real_3fe0000000000000(%rip), %xmm5 # .5
	mulsd %xmm3, %xmm2

	#/* How many pi/2 is x a multiple of? */
	# xexp = ax >> EXPSHIFTBITS_DP64;
	mov %r10, %r9
	shr $52, %r9 #>>EXPSHIFTBITS_DP64

	# npi2 = (int)(x * twobypi + 0.5);
	addsd %xmm5, %xmm2 # npi2

	movsd .L__real_3ff921fb54400000(%rip), %xmm3 # piby2_1
	cvttpd2dq %xmm2, %xmm0 # convert to integer
	movsd .L__real_3dd0b4611a626331(%rip), %xmm1 # piby2_1tail
	cvtdq2pd %xmm0, %xmm2 # and back to double.

	# /* Subtract the multiple from x to get an extra-precision remainder */
	# rhead = x - npi2 * piby2_1;
	mulsd %xmm2, %xmm3
	subsd %xmm3, %xmm4 # rhead

	# rtail = npi2 * piby2_1tail;
	mulsd %xmm2, %xmm1
	movd %xmm0, %eax # eax = npi2 as an integer

	# GET_BITS_DP64(rhead-rtail, uy); ; originally only rhead
	movsd %xmm4, %xmm0
	subsd %xmm1, %xmm0

	movsd .L__real_3dd0b4611a600000(%rip), %xmm3 # piby2_2
	movsd %xmm0,p_temp(%rsp)
	movsd .L__real_3ba3198a2e037073(%rip), %xmm5 # piby2_2tail
	mov p_temp(%rsp), %rcx # rcx is rhead-rtail

	# xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
	# expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
	shl $1, %rcx # strip any sign bit
	shr $53, %rcx #>> EXPSHIFTBITS_DP64 +1
	sub %rcx, %r9 #expdiff

	## if (expdiff > 15)
	cmp $15, %r9
	jle .Lexpdiff15

	# /* The remainder is pretty small compared with x, which
	# implies that x is a near multiple of pi/2
	# (x matches the multiple to at least 15 bits) */

	# t = rhead;
	movsd %xmm4, %xmm1

	# rtail = npi2 * piby2_2;
	mulsd %xmm2, %xmm3

	# rhead = t - rtail;
	mulsd %xmm2, %xmm5 # npi2 * piby2_2tail
	subsd %xmm3, %xmm4 # rhead

	# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
	subsd %xmm4, %xmm1 # t - rhead
	subsd %xmm3, %xmm1 # -rtail
	subsd %xmm1, %xmm5 #rtail

	# r = rhead - rtail;
	movsd %xmm4, %xmm0

	#HARSHA
	#xmm1=rtail
	movsd %xmm5, %xmm1
	subsd %xmm5, %xmm0

	# xmm0=r, xmm4=rhead, xmm1=rtail
	.Lexpdiff15:
	# region = npi2 & 3;
	# No need rr for float case

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	## if the input was close to a pi/2 multiple
	# The original NAG code missed this trick. If the input is very close to n*pi/2 after
	# reduction,
	# then the sinf is ~ 1.0 , to within 15 bits, when r is < 2^-13. We already
	# have x at this point, so we can skip the sinf polynomials.
	#
	# rcx is the biased exponent of the first-stage r; it is not recomputed
	# when the second-stage refinement above has run.
	#   rcx >= 0x3f2 (abs(r) >= 2^-13)        -> full polynomials at .Lsinf_piby4
	#   rcx <= 0x3de (abs(r) <  2^-32)        -> .Lr_small: sin(r) = r, cos(r) = 1
	#   otherwise                             -> two-term series below

	cmp $0x03f2, %rcx ## if r small.
	jge .Lsinf_piby4 # use taylor series if not
	cmp $0x03de, %rcx ## if r really small.
	jle .Lr_small # then sinf(r) = 0

	movsd %xmm0, %xmm2
	mulsd %xmm2, %xmm2 #x^2

	## if region is 0 or 2 do a sinf calc.
	and %eax, %r8d # r8d = low bit of npi2; nonzero selects the cosine form
	jnz .Lcosfregion

	# region 0 or 2 do a sinf calculation
	# use simply polynomial
	# x - x*x*x*0.166666666666666666;
	movsd .L__real_3fc5555555555555(%rip), %xmm3 #
	mulsd %xmm0, %xmm3 # * x
	mulsd %xmm2, %xmm3 # * x^2
	subsd %xmm3, %xmm0 # xs
	jmp .Ladjust_region

	.align 32
	.Lcosfregion:
	# region 1 or 3 do a cosf calculation
	# use simply polynomial
	# xc = 1.0 - x*x*0.5;
	movsd .L__real_3ff0000000000000(%rip), %xmm0 # 1.0
	mulsd .L__real_3fe0000000000000(%rip), %xmm2 # 0.5 *x^2
	subsd %xmm2, %xmm0 # xc
	jmp .Ladjust_region

	.align 32
	.Lr_small:
	## if region is 1 or 3 do a cosf calc.
	# Even region: the result magnitude is r itself, already in xmm0.
	and %eax, %r8d
	jz .Ladjust_region

	# odd
	movsd .L__real_3ff0000000000000(%rip), %xmm0 # cosf(r) is a 1
	jmp .Ladjust_region

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	.Lsinf_reduce_precise:
	# // Reduce x into range [-pi/4,pi/4]
	# __amd_remainder_piby2d2f(x, &r, &region);
	#
	# Large-argument path, taken when ax is at or above the threshold constant.
	# In:   xmm0 = abs(x) as a double, r11d = xneg
	# Out:  xmm0 = r (reduced argument), eax = region, r8d = 1, r11d = xneg
	# Falls through into .Lsinf_piby4.

	mov %r11,p_temp(%rsp) # r11 is caller-saved: keep xneg in the frame across the call
	lea region(%rsp), %rdx # third argument: &region
	lea r(%rsp), %rsi # second argument: &r
	movd %xmm0, %rdi # first argument: bit pattern of abs(x)
	sub $0x20, %rsp # extra scratch; a multiple of 16, so stack alignment is unchanged

	call __amd_remainder_piby2d2f@PLT

	add $0x20, %rsp
	mov p_temp(%rsp), %r11 # restore xneg
	mov $1, %r8d # for determining region later on
	# The helper hands r back through memory only, and xmm0 is not preserved
	# across a call, so r has to be reloaded into xmm0, the register that
	# .Lsinf_piby4 reads.  (This load used to target xmm1, which .Lsinf_piby4
	# overwrites before ever reading it.)
	movsd r(%rsp), %xmm0 # xmm0 = r
	mov region(%rsp), %eax #//region

	# xmm0 = r, r8d = 1, eax = region, r11d = xneg
	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	# perform taylor series to calc sinfx, cosfx
	.Lsinf_piby4:
	# Full-precision polynomial evaluation on the reduced argument.
	# In:   xmm0 = r, eax = region (or npi2), r8d = 1
	# Out:  xmm0 = sin(r) for an even region, cos(r) for an odd region.
	# Both branches end at .Ladjust_region, which applies the sign.
	# x2 = r * r;
	movsd %xmm0, %xmm2
	mulsd %xmm0, %xmm2 #x2

	## if region is 0 or 2 do a sinf calc.
	and %eax, %r8d # r8d = low bit of region; nonzero selects cosine
	jnz .Lcosfregion2

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	# region 0 or 2 do a sinf calculation
	# zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
	movsd .Lsinfarray+0x30(%rip), %xmm1 # s4
	mulsd %xmm2, %xmm1 # s4x2
	movsd %xmm2, %xmm4 # move for x4
	mulsd %xmm2, %xmm4 # x4
	movsd .Lsinfarray+0x10(%rip), %xmm5 # s2
	mulsd %xmm2, %xmm5 # s2x2
	movsd %xmm0, %xmm3 # move for x3
	mulsd %xmm2, %xmm3 # x3
	addsd .Lsinfarray+0x20(%rip), %xmm1 # s3+s4x2
	mulsd %xmm4, %xmm1 # s3x4+s4x6
	addsd .Lsinfarray(%rip), %xmm5 # s1+s2x2
	addsd %xmm5, %xmm1 # s1+s2x2+s3x4+s4x6
	mulsd %xmm3, %xmm1 # x3(s1+s2x2+s3x4+s4x6)
	addsd %xmm1, %xmm0 # x + x3(s1+s2x2+s3x4+s4x6)

	jmp .Ladjust_region

	.align 32
	.Lcosfregion2:
	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	# region 1 or 3 - do a cosf calculation
	# zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
	# xmm0 is reused for the result here; r itself is no longer needed once x2
	# is in xmm2.
	movsd .Lcosfarray+0x40(%rip), %xmm1 # c4
	movsd %xmm2, %xmm4 # move for x4
	mulsd %xmm2, %xmm1 # c4x2
	movsd .Lcosfarray+0x20(%rip), %xmm3 # c2
	mulsd %xmm2, %xmm4 # x4
	movsd .Lcosfarray(%rip), %xmm0 # c0
	mulsd %xmm2, %xmm3 # c2x2
	mulsd %xmm2, %xmm0 # c0x2 (=-0.5x2)
	addsd .Lcosfarray+0x30(%rip), %xmm1 # c3+c4x2
	mulsd %xmm4, %xmm1 # c3x4 + c4x6
	addsd .Lcosfarray+0x10(%rip), %xmm3 # c1+c2x2
	addsd %xmm3, %xmm1 # c1 + c2x2 + c3x4 + c4x6
	mulsd %xmm4, %xmm1 # c1x4 + c2x6 + c3x8 + c4x10
	addsd .L__real_3ff0000000000000(%rip), %xmm0 # 1 - 0.5x2
	addsd %xmm1, %xmm0 # 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
	# Execution continues into .Ladjust_region, which follows this code.

	#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	.align 32
	.Ladjust_region:
	# Sign selection for the reduced paths.
	# In:  xmm0 = result magnitude as a double,
	#      eax  = region (or npi2; only bits 0 and 1 matter),
	#      r11d = xneg (0 or 1).
	# The result is left alone when bit 1 of region equals xneg and is negated
	# when the two differ, i.e. regions 0,1 with a negative argument or
	# regions 2,3 with a positive argument.
	shrl $1, %eax # bit 0 of eax = bit 1 of region
	xorl %r11d, %eax # bit 0 = (region bit 1) XOR xneg
	testl $1, %eax
	jz .Lsinf_cleanup # bits agree: keep the sign

	# Bits differ: result = 0.0 - result.
	movsd %xmm0, %xmm2
	xorpd %xmm0, %xmm0
	subsd %xmm2, %xmm0

	.align 32
	.Lsinf_cleanup:
	# Common exit: narrow the double result to single precision, release the
	# frame and return.
	cvtsd2ss %xmm0, %xmm0
	addq $stack_size, %rsp
	ret

	.align 32
	.Lsinf_naninf:
	# NaN or infinity input.  xmm0 still holds the untouched single-precision
	# x, which is handed to the special-case routine; whatever that routine
	# returns is passed back to the caller.
	call fname_special
	addq $stack_size, %rsp
	ret