 # src/gas/vrd2sincos.S - open64_libacml_mv - Git at Google


 #
 #  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
 #
 #  This file is part of libacml_mv.
 #
 #  libacml_mv is free software; you can redistribute it and/or
 #  modify it under the terms of the GNU Lesser General Public
 #  License as published by the Free Software Foundation; either
 #  version 2.1 of the License, or (at your option) any later version.
 #
 #  libacml_mv is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 #  Lesser General Public License for more details.
 #
 #  You should have received a copy of the GNU Lesser General Public
 #  License along with libacml_mv.  If not, see
 #  <http://www.gnu.org/licenses/>.
 #
 #


 #
 # A vector implementation of the libm sincos function.
 #
 # Prototype:
 #
 #     void __vrd2_sincos(__m128d x, __m128d* ys, __m128d* yc);
 #
 #   Computes Sine and Cosine of both elements of x; the two sines are
 #   stored to *ys and the two cosines to *yc.
 #   It will provide proper C99 return values,
 #   but may not raise floating point status bits properly.
 #   Based on the NAG C implementation.
 #   Author: Harsha Jagasia
 #   Email:  harsha.jagasia@amd.com


 # On ELF targets emit an empty .note.GNU-stack section (conventionally the
 # marker that this object does not need an executable stack).
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif


 # Read-only constants used by __vrd2_sincos.  Each label names a 16-byte
 # (two-quadword) value that is used either as an aligned 128-bit SSE memory
 # operand or read as a 64-bit integer mask.  The code in this file only ever
 # loads these values, so they are placed in .rodata instead of the writable
 # .data section.
 .section .rodata
 .align 16
 .L__real_7fffffffffffffff: 	.quad 0x07fffffffffffffff	# every bit except the sign bit (fabs mask)
 				.quad 0x07fffffffffffffff
 .L__real_3ff0000000000000: 	.quad 0x03ff0000000000000	# 1.0
 				.quad 0x03ff0000000000000
 .L__real_v2p__27:		.quad 0x03e40000000000000	# 2^-27
 				.quad 0x03e40000000000000
 .L__real_3fe0000000000000: 	.quad 0x03fe0000000000000	# 0.5
 				.quad 0x03fe0000000000000
 .L__real_3fc5555555555555: 	.quad 0x03fc5555555555555	# 0.166666666666
 				.quad 0x03fc5555555555555
 .L__real_3fe45f306dc9c883: 	.quad 0x03fe45f306dc9c883	# twobypi (2/pi)
 				.quad 0x03fe45f306dc9c883
 .L__real_3ff921fb54400000: 	.quad 0x03ff921fb54400000	# piby2_1 (leading part of pi/2)
 				.quad 0x03ff921fb54400000
 .L__real_3dd0b4611a626331: 	.quad 0x03dd0b4611a626331	# piby2_1tail
 				.quad 0x03dd0b4611a626331
 .L__real_3dd0b4611a600000: 	.quad 0x03dd0b4611a600000	# piby2_2 (second part of pi/2)
 				.quad 0x03dd0b4611a600000
 .L__real_3ba3198a2e037073: 	.quad 0x03ba3198a2e037073	# piby2_2tail
 				.quad 0x03ba3198a2e037073
 .L__real_fffffffff8000000: 	.quad 0x0fffffffff8000000	# mask that clears the low 27 bits (head/tail split)
 				.quad 0x0fffffffff8000000
 .L__real_8000000000000000:	.quad 0x08000000000000000	# sign bit only (-0.0)
 				.quad 0x08000000000000000
 # Integer masks operating on the packed region/sign word, which keeps the
 # value for the lower element in bits 0..31 and for the upper element in
 # bits 32..63 of one quadword.
 .L__reald_one_one:		.quad 0x00000000100000001	# 1 in each 32-bit half
 				.quad 0
 .L__reald_two_two:		.quad 0x00000000200000002	# 2 in each 32-bit half
 				.quad 0
 .L__reald_one_zero:		.quad 0x00000000100000000	# 1 in the upper half only (bit 32)
 				.quad 0
 .L__reald_zero_one:		.quad 0x00000000000000001	# 1 in the lower half only (bit 0)
 				.quad 0
 .L__reald_two_zero:		.quad 0x00000000200000000	# 2 in the upper half only
 				.quad 0
 .L__realq_one_one:		.quad 0x00000000000000001	# integer 1 in each quadword
 				.quad 0x00000000000000001
 .L__realq_two_two:		.quad 0x00000000000000002	# integer 2 in each quadword
 				.quad 0x00000000000000002
 .L__real_1_x_mask:		.quad 0x0ffffffffffffffff	# low quadword all ones,
 				.quad 0x03ff0000000000000	# high quadword 1.0
 .L__real_zero:			.quad 0x00000000000000000	# all bits zero
 				.quad 0x00000000000000000
 .L__real_one:			.quad 0x00000000000000001	# integer 1 in each quadword (NOT the double 1.0)
 				.quad 0x00000000000000001
 .L__real_ffffffffffffffff: 	.quad 0x0ffffffffffffffff	# all bits one
 				.quad 0x0ffffffffffffffff
 # Masks applied to the packed sign word when one element is NaN/Inf:
 # each keeps one 32-bit half and zeroes the other.
 .L__real_naninf_upper_sign_mask:	.quad 0x000000000ffffffff	# keep lower half, clear the upper element's sign
 					.quad 0x000000000ffffffff
 .L__real_naninf_lower_sign_mask:	.quad 0x0ffffffff00000000	# keep upper half, clear the lower element's sign
 					.quad 0x0ffffffff00000000

 # Polynomial coefficient tables.
 # .Lcosarray and .Lsinarray store every coefficient twice (one copy per
 # packed-double lane), so coefficient k lives at byte offset 0x10*(k-1) and
 # can be used directly as a 128-bit operand.
 # .Lsincosarray and .Lcossinarray interleave the two sets: the low quadword
 # of each 16-byte entry holds the sin coefficient for .Lsincosarray and the
 # cos coefficient for .Lcossinarray.
 .Lcosarray:
 	.quad	0x03fa5555555555555		#  0.0416667		   	c1
 	.quad	0x03fa5555555555555
 	.quad	0x0bf56c16c16c16967		# -0.00138889	   		c2
 	.quad	0x0bf56c16c16c16967
 	.quad	0x03efa01a019f4ec90		#  2.48016e-005			c3
 	.quad	0x03efa01a019f4ec90
 	.quad	0x0be927e4fa17f65f6		# -2.75573e-007			c4
 	.quad	0x0be927e4fa17f65f6
 	.quad	0x03e21eeb69037ab78		#  2.08761e-009			c5
 	.quad	0x03e21eeb69037ab78
 	.quad	0x0bda907db46cc5e42		# -1.13826e-011	   		c6
 	.quad	0x0bda907db46cc5e42
 .Lsinarray:
 	.quad	0x0bfc5555555555555		# -0.166667	   		s1
 	.quad	0x0bfc5555555555555
 	.quad	0x03f81111111110bb3		#  0.00833333	   		s2
 	.quad	0x03f81111111110bb3
 	.quad	0x0bf2a01a019e83e5c		# -0.000198413			s3
 	.quad	0x0bf2a01a019e83e5c
 	.quad	0x03ec71de3796cde01		#  2.75573e-006			s4
 	.quad	0x03ec71de3796cde01
 	.quad	0x0be5ae600b42fdfa7		# -2.50511e-008			s5
 	.quad	0x0be5ae600b42fdfa7
 	.quad	0x03de5e0b2f9a43bb8		#  1.59181e-010	   		s6
 	.quad	0x03de5e0b2f9a43bb8
 .Lsincosarray:
 	.quad	0x0bfc5555555555555		# -0.166667	   		s1
 	.quad	0x03fa5555555555555		#  0.0416667		   	c1
 	.quad	0x03f81111111110bb3		#  0.00833333	   		s2
 	.quad	0x0bf56c16c16c16967		# -0.00138889			c2
 	.quad	0x0bf2a01a019e83e5c		# -0.000198413			s3
 	.quad	0x03efa01a019f4ec90		#  2.48016e-005			c3
 	.quad	0x03ec71de3796cde01		#  2.75573e-006			s4
 	.quad	0x0be927e4fa17f65f6		# -2.75573e-007			c4
 	.quad	0x0be5ae600b42fdfa7		# -2.50511e-008			s5
 	.quad	0x03e21eeb69037ab78		#  2.08761e-009			c5
 	.quad	0x03de5e0b2f9a43bb8		#  1.59181e-010	   		s6
 	.quad	0x0bda907db46cc5e42		# -1.13826e-011			c6
 .Lcossinarray:
 	.quad	0x03fa5555555555555		#  0.0416667		   	c1
 	.quad	0x0bfc5555555555555		# -0.166667	   		s1
 	.quad	0x0bf56c16c16c16967		# -0.00138889			c2
 	.quad	0x03f81111111110bb3		#  0.00833333	   		s2
 	.quad	0x03efa01a019f4ec90		#  2.48016e-005			c3
 	.quad	0x0bf2a01a019e83e5c		# -0.000198413			s3
 	.quad	0x0be927e4fa17f65f6		# -2.75573e-007			c4
 	.quad	0x03ec71de3796cde01		#  2.75573e-006			s4
 	.quad	0x03e21eeb69037ab78		#  2.08761e-009			c5
 	.quad	0x0be5ae600b42fdfa7		# -2.50511e-008			s5
 	.quad	0x0bda907db46cc5e42		# -1.13826e-011			c6
 	.quad	0x03de5e0b2f9a43bb8		#  1.59181e-010	   		s6


 .text
 .align 16
 .p2align 4,,15

 # Stack frame layout: byte offsets from %rsp once the prologue has reserved
 # the 0x1C8-byte frame.  Slots are spaced 16 bytes apart so that each one
 # can be accessed with aligned 128-bit moves.
 #
 # p_temp2 used to be defined twice (0x20 here and 0x01B0 at the end of the
 # table).  Since .equ may be redefined, the later value was the one in
 # effect for every use; only that definition is kept.  Offset 0x20 is
 # therefore an unused slot.
 .equ	p_temp,		0x00		# scratch slot
 .equ	p_temp1,	0x10		# scratch slot

 # Register save slots; none of them is referenced by the code shown in
 # this file.
 .equ	save_xmm6,	0x30
 .equ	save_xmm7,	0x40
 .equ	save_xmm8,	0x50
 .equ	save_xmm9,	0x60
 .equ	save_xmm10,	0x70
 .equ	save_xmm11,	0x80
 .equ	save_xmm12,	0x90
 .equ	save_xmm13,	0x0A0
 .equ	save_xmm14,	0x0B0
 .equ	save_xmm15,	0x0C0

 .equ	save_rdi,	0x0D0
 .equ	save_rsi,	0x0E0

 # Outputs of the argument reduction, passed by address to
 # __amd_remainder_piby2: element 0 at the base offset, element 1 at +8
 # (+4 for region, which holds two 32-bit integers).
 .equ	r,		0x0F0		# reduced arguments r
 .equ	rr,		0x0100		# tails rr of the reduced arguments
 .equ	region,		0x0110		# region numbers (two 32-bit ints)

 .equ	p_original,	0x0120		# original x as passed in
 .equ	p_mask,		0x0130		# not referenced by the code shown here
 .equ	p_sign,		0x0140		# sign-bit words XORed into the sin results
 .equ	p_sign1,	0x0150		# sign-bit words XORed into the cos results
 .equ	p_x,		0x0160		#x
 .equ	p_xx,		0x0170		#xx
 .equ	p_x2,		0x0180		#x2
 .equ	p_sin,		0x0190		# saved address for the sin results (from %rdi)
 .equ	p_cos,		0x01A0		# saved address for the cos results (from %rsi)
 .equ	p_temp2,	0x01B0		# scratch slot

 # __vrd2_sincos entry point.
 # Inputs as used by the code below:
 #   %xmm0 = two packed doubles x
 #   %rdi  = address that receives the two sin results
 #   %rsi  = address that receives the two cos results
 # This prologue reserves the frame, computes |x|, copies the bit patterns
 # of |x| into %rax (lower element) and %rcx (upper element), builds a packed
 # sign word in %r11 and preloads constants for the range checks that follow.
 .globl __vrd2_sincos
     .type   __vrd2_sincos,@function
 __vrd2_sincos:
 	sub		$0x1C8,%rsp			# reserve the frame described by the .equ table

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #STARTMAIN

 movdqa 	%xmm0,%xmm6				# xmm6 = signed x, used for the sign extraction below
 movdqa  %xmm0, p_original(%rsp)			# keep the original x for the NaN/Inf paths

 andpd 	.L__real_7fffffffffffffff(%rip),%xmm0		# xmm0 = |x|

 mov	%rdi, p_sin(%rsp)		# save address for sin return
 mov	%rsi, p_cos(%rsp)		# save address for cos return

 movd	%xmm0,%rax				# rax = bits of |lower x|
 movhpd	%xmm0, p_temp+8(%rsp)			# spill the upper element ...
 mov    	p_temp+8(%rsp),%rcx			# ... rcx = bits of |upper x|
 movdqa	%xmm0,%xmm8				# xmm8 = |x|

 # Signed 32-bit compare |x| > x: only the high dword of a negative element
 # differs from its absolute value, so dwords 1 and 3 become all-ones for a
 # negative lower/upper element and every other dword becomes zero.
 pcmpgtd		%xmm6,%xmm8
 movdqa		%xmm8,%xmm6
 psrldq		$4,%xmm8			# lower flag -> dword 0, upper flag -> dword 2
 psrldq		$8,%xmm6			# upper flag -> dword 1

 mov 	$0x3FE921FB54442D18,%rdx			#piby4
 mov	$0x411E848000000000,%r10			#5e5
 movapd	.L__real_3fe0000000000000(%rip),%xmm4			#0.5 for later use

 por	%xmm6,%xmm8				# low quadword = {lower flag, upper flag}
 movd	%xmm8,%r11				# r11 = packed sign word: bits 0..31 lower, bits 32..63 upper

 movapd	%xmm0,%xmm2				# xmm2 = |x|
 movapd	%xmm0,%xmm6				# xmm6 = |x|

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Range dispatch.  The bit patterns of |x| are compared as unsigned integers
 # against the bit pattern of 5e5 held in r10; for non-negative doubles this
 # orders the same way as the values, and NaN/Inf patterns also compare above
 # the threshold.  No pi/4 test is made at this point despite the label name.
 .align 16
 .Leither_or_both_arg_gt_than_piby4:

 	cmp	%r10,%rax				#is lower arg >= 5e5
 	jae	.Llower_or_both_arg_gt_5e5
 	cmp	%r10,%rcx				#is upper arg >= 5e5
 	jae	.Lupper_arg_gt_5e5

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .Lboth_arg_lt_than_5e5:
 # Both elements are below 5e5: packed argument reduction.
 # On entry: xmm0 = xmm2 = xmm6 = |x|, xmm4 = 0.5, r11 = packed sign word.
 # On exit:  xmm0 = r, xmm2 = r*r, xmm6 = rr (tail of r),
 #           r8 = low bit of each region (bit 0 lower, bit 32 upper),
 #           p_sign / p_sign1 = sign-bit words for the sin / cos results.

 	mulpd	.L__real_3fe45f306dc9c883(%rip),%xmm2		# * twobypi
 	addpd	%xmm4,%xmm2					# +0.5, npi2
 	movapd	.L__real_3ff921fb54400000(%rip),%xmm0		# piby2_1
 	cvttpd2dq	%xmm2,%xmm4				# truncate to two 32-bit integers (region numbers)
 	movapd	.L__real_3dd0b4611a600000(%rip),%xmm8		# piby2_2
 	cvtdq2pd	%xmm4,%xmm2				# and back to double.


 #      /* Subtract the multiple from x to get an extra-precision remainder */

 	movd	%xmm4,%r8						# r8 = both regions: lower in bits 0..31, upper in bits 32..63

 	mov 	.L__reald_one_zero(%rip),%rdx			# mask selecting bit 32 (the upper element)
 	mov	%r8,%r10					# copy used for the sin sign
 	mov	%r8,%rcx					# copy used for the cos sign

 #      rhead  = x - npi2 * piby2_1;
        mulpd	%xmm2,%xmm0						# npi2 * piby2_1;

 #      rtail  = npi2 * piby2_2;
        mulpd	%xmm2,%xmm8						# rtail

 #      rhead  = x - npi2 * piby2_1;
        subpd	%xmm0,%xmm6						# rhead  = x - npi2 * piby2_1;

 #      t  = rhead;
        movapd	%xmm6,%xmm0						# t

 #      rhead  = t - rtail;
        subpd	%xmm8,%xmm0						# rhead

 #      rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulpd	.L__real_3ba3198a2e037073(%rip),%xmm2		# npi2 * piby2_2tail
        subpd	%xmm0,%xmm6						# t-rhead
        subpd	%xmm6,%xmm8						# - ((t - rhead) - rtail)
        addpd	%xmm2,%xmm8						# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # xmm4  = npi2 (int), xmm0 =rhead, xmm8 =rtail

 	and	.L__reald_one_one(%rip),%r8			#odd/even region for cos/sin

 # Sign of the sin result = (sign of x) XOR (bit 1 of region), computed as
 # (~A & B) | (A & ~B) with A = r11 (sign word) and B = region >> 1.
 	shr	$1,%r10						#~AB+A~B, A is sign and B is upper bit of region
 	mov	%r10,%rax
 	not 	%r11						#ADDED TO CHANGE THE LOGIC
 	and	%r11,%r10					# ~A & B
 	not	%rax
 	not	%r11						# restore A
 	and	%r11,%rax					# A & ~B
 	or	%rax,%r10					# A XOR B
 	and	.L__reald_one_one(%rip),%r10				#(~AB+A~B)&1
 	mov	%r10,%r11
 	and	%rdx,%r11				#mask out the lower sign bit leaving the upper sign bit
 	shl	$63,%r10				#shift lower sign bit left by 63 bits
 	shl	$31,%r11				#shift upper sign bit left by 31 bits
 	mov 	 %r10,p_sign(%rsp)		#write out lower sign bit
 	mov 	 %r11,p_sign+8(%rsp)		#write out upper sign bit

 # p_sign = sin sign words, xmm0 =rhead, xmm8 =rtail

 	movapd	%xmm0,%xmm6						# rhead
 	subpd	%xmm8,%xmm0						# r = rhead - rtail

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # xmm0 = r, xmm6 =rhead, xmm8 =rtail

 	subpd	%xmm0,%xmm6				#rr=rhead-r
 	movapd	%xmm0,%xmm2				#move r for r2
 	mulpd	%xmm0,%xmm2				#r2
 	subpd	%xmm8,%xmm6				#rr=(rhead-r) -rtail

 	mov 	.L__reald_one_zero(%rip),%r9		# mask selecting bit 32 (the upper element)


 # Sign of the cos result = bit 1 of (region + 1), for each element.
 	add	.L__reald_one_one(%rip),%rcx
 	and	.L__reald_two_two(%rip),%rcx
 	shr	$1,%rcx

 	mov	%rcx,%rdx
 	and	%r9,%rdx				#mask out the lower sign bit leaving the upper sign bit
 	shl	$63,%rcx				#shift lower sign bit left by 63 bits
 	shl	$31,%rdx				#shift upper sign bit left by 31 bits
 	mov 	 %rcx,p_sign1(%rsp)		#write out lower sign bit
 	mov 	 %rdx,p_sign1+8(%rsp)		#write out upper sign bit

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 # Polynomial dispatch.  r8 holds the low bit of each region number (bit 0
 # for the lower element, bit 32 for the upper element).  r8 == 0 means both
 # regions are even, which is handled by the fall-through path; any other
 # value is examined further at .Lvrd2_not_sin_piby4.
 # Inputs for all four paths: xmm0 = r, xmm2 = r*r, xmm6 = rr.
 .align 16
 .L__vrd2_sincos_approximate:
 	test	%r8,%r8					# ZF set iff both regions are even
 	jnz	.Lvrd2_not_sin_piby4

 .Lvrd2_sin_piby4:
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Both regions even.
 # In:  xmm0 = r (called x below), xmm2 = x2 = r*r, xmm6 = rr (called xx below)
 # Out: xmm5 = sin polynomial (goes to the sin output in the cleanup code),
 #      xmm4 = cos polynomial (goes to the cos output).
 # The cos and sin polynomials are evaluated side by side; with
 # zc = c1 + x2*(c2 + x2*c3) + x6*(c4 + x2*(c5 + x2*c6)) and zs likewise:
 #   cos = t + ((((1 - t) - 0.5*x2) - x*xx) + x4*zc),  t = 1 - 0.5*x2
 #   sin = x + (xx + (x3*zs - 0.5*x2*xx))

 	movdqa	.Lcosarray+0x50(%rip),%xmm4			# c6
 	movdqa	.Lsinarray+0x50(%rip),%xmm5			# s6
 	movapd	.Lcosarray+0x20(%rip),%xmm8			# c3
 	movapd	.Lsinarray+0x20(%rip),%xmm9			# s3

 	movapd	%xmm2,%xmm10					# x2
 	movapd	%xmm2,%xmm11					# x2

 	mulpd	%xmm2,%xmm4					# c6*x2
 	mulpd	%xmm2,%xmm5					# s6*x2
 	mulpd	%xmm2,%xmm8					# c3*x2
 	mulpd	%xmm2,%xmm9					# s3*x2

 	mulpd	.L__real_3fe0000000000000(%rip),%xmm10	# r = 0.5 *x2
 	movapd	 %xmm2,p_temp(%rsp)			# store x2

 	addpd	.Lcosarray+0x40(%rip),%xmm4			# c5+x2c6
 	addpd	.Lsinarray+0x40(%rip),%xmm5			# s5+x2s6
 	movapd	 %xmm10,p_temp2(%rsp)			# store r
 	addpd	.Lcosarray+0x10(%rip),%xmm8			# c2+x2C3
 	addpd	.Lsinarray+0x10(%rip),%xmm9			# s2+x2S3

 	subpd	.L__real_3ff0000000000000(%rip),%xmm10	# -t=r-1.0
 	mulpd	%xmm2,%xmm11					# x4

 	mulpd	%xmm2,%xmm4					# x2(c5+x2c6)
 	mulpd	%xmm2,%xmm5					# x2(s5+x2s6)
 	movapd	 %xmm10,p_temp1(%rsp) 			# store -t
 	movapd	%xmm11,%xmm3					# Keep x4
 	mulpd	%xmm2,%xmm8					# x2(c2+x2C3)
 	mulpd	%xmm2,%xmm9					# x2(s2+x2S3)

 	addpd   .L__real_3ff0000000000000(%rip),%xmm10	# 1 + (-t)
 	mulpd	%xmm2,%xmm11					# x6

 	addpd	.Lcosarray+0x30(%rip),%xmm4			# c4 + x2(c5+x2c6)
 	addpd	.Lsinarray+0x30(%rip),%xmm5			# s4 + x2(s5+x2s6)
 	addpd	.Lcosarray(%rip),%xmm8			# c1 + x2(c2+x2C3)
 	addpd	.Lsinarray(%rip),%xmm9			# s1 + x2(s2+x2S3)

 	subpd   p_temp2(%rsp),%xmm10			# (1 + (-t)) - r
 	mulpd	%xmm0,%xmm2					# x3 recalculate

 	mulpd	%xmm11,%xmm4					# x6(c4 + x2(c5+x2c6))
 	mulpd	%xmm11,%xmm5					# x6(s4 + x2(s5+x2s6))

 	movapd	%xmm0,%xmm1
 	movapd	%xmm6,%xmm7
 	mulpd	%xmm6,%xmm1					# x*xx
 	mulpd	p_temp2(%rsp),%xmm7			# xx * 0.5x2

 	addpd	%xmm8,%xmm4					# zc
 	addpd	%xmm9,%xmm5					# zs

 	subpd   %xmm1,%xmm10					# ((1 + (-t)) - r) -x*xx

 	mulpd	%xmm3,%xmm4					# x4 * zc
 	mulpd	%xmm2,%xmm5					# x3 * zs

 	addpd	%xmm10,%xmm4					# x4*zc + (((1 + (-t)) - r) - x*xx)
 	subpd	%xmm7,%xmm5					# x3*zs - 0.5 * x2 *xx

 	addpd	%xmm6,%xmm5					# sin + xx
 	subpd	p_temp1(%rsp),%xmm4			# cos - (-t)
 	addpd	%xmm0,%xmm5					# sin + x

 	jmp 	.L__vrd2_sincos_cleanup

 # Reached when at least one region is odd.  If both are odd (r8 equals
 # 0x0000000100000001) fall through; otherwise go on to the mixed cases.
 .align 16
 .Lvrd2_not_sin_piby4:
 	cmp 	.L__reald_one_one(%rip),%r8
 	jnz	.Lvrd2_not_cos_piby4

 .Lvrd2_cos_piby4:
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Both regions odd.  Same computation as .Lvrd2_sin_piby4 with the roles of
 # xmm4/xmm5 (and xmm8/xmm9) exchanged, so the results come out swapped:
 # In:  xmm0 = r (x), xmm2 = x2, xmm6 = rr (xx)
 # Out: xmm5 = cos polynomial (goes to the sin output in the cleanup code),
 #      xmm4 = sin polynomial (goes to the cos output).

 	movdqa	.Lcosarray+0x50(%rip),%xmm5			# c6
 	movdqa	.Lsinarray+0x50(%rip),%xmm4			# s6
 	movapd	.Lcosarray+0x20(%rip),%xmm9			# c3
 	movapd	.Lsinarray+0x20(%rip),%xmm8			# s3

 	movapd	%xmm2,%xmm10					# x2
 	movapd	%xmm2,%xmm11					# x2

 	mulpd	%xmm2,%xmm5					# c6*x2
 	mulpd	%xmm2,%xmm4					# s6*x2
 	mulpd	%xmm2,%xmm9					# c3*x2
 	mulpd	%xmm2,%xmm8					# s3*x2

 	mulpd	.L__real_3fe0000000000000(%rip),%xmm10	# r = 0.5 *x2
 	movapd	 %xmm2,p_temp(%rsp)			# store x2

 	addpd	.Lcosarray+0x40(%rip),%xmm5			# c5+x2c6
 	addpd	.Lsinarray+0x40(%rip),%xmm4			# s5+x2s6
 	movapd	 %xmm10,p_temp2(%rsp)			# store r
 	addpd	.Lcosarray+0x10(%rip),%xmm9			# c2+x2C3
 	addpd	.Lsinarray+0x10(%rip),%xmm8			# s2+x2S3

 	subpd	.L__real_3ff0000000000000(%rip),%xmm10	# -t=r-1.0
 	mulpd	%xmm2,%xmm11					# x4

 	mulpd	%xmm2,%xmm5					# x2(c5+x2c6)
 	mulpd	%xmm2,%xmm4					# x2(s5+x2s6)
 	movapd	 %xmm10,p_temp1(%rsp) 			# store -t
 	movapd	%xmm11,%xmm3					# Keep x4
 	mulpd	%xmm2,%xmm9					# x2(c2+x2C3)
 	mulpd	%xmm2,%xmm8					# x2(s2+x2S3)

 	addpd   .L__real_3ff0000000000000(%rip),%xmm10	# 1 + (-t)
 	mulpd	%xmm2,%xmm11					# x6

 	addpd	.Lcosarray+0x30(%rip),%xmm5			# c4 + x2(c5+x2c6)
 	addpd	.Lsinarray+0x30(%rip),%xmm4			# s4 + x2(s5+x2s6)
 	addpd	.Lcosarray(%rip),%xmm9			# c1 + x2(c2+x2C3)
 	addpd	.Lsinarray(%rip),%xmm8			# s1 + x2(s2+x2S3)

 	subpd   p_temp2(%rsp),%xmm10			# (1 + (-t)) - r
 	mulpd	%xmm0,%xmm2					# x3 recalculate

 	mulpd	%xmm11,%xmm5					# x6(c4 + x2(c5+x2c6))
 	mulpd	%xmm11,%xmm4					# x6(s4 + x2(s5+x2s6))

 	movapd	%xmm0,%xmm1
 	movapd	%xmm6,%xmm7
 	mulpd	%xmm6,%xmm1					# x*xx
 	mulpd	p_temp2(%rsp),%xmm7			# xx * 0.5x2

 	addpd	%xmm9,%xmm5					# zc
 	addpd	%xmm8,%xmm4					# zs

 	subpd   %xmm1,%xmm10					# ((1 + (-t)) - r) -x*xx

 	mulpd	%xmm3,%xmm5					# x4 * zc
 	mulpd	%xmm2,%xmm4					# x3 * zs

 	addpd	%xmm10,%xmm5					# x4*zc + (((1 + (-t)) - r) - x*xx)
 	subpd	%xmm7,%xmm4					# x3*zs - 0.5 * x2 *xx

 	addpd	%xmm6,%xmm4					# sin + xx
 	subpd	p_temp1(%rsp),%xmm5			# cos - (-t)
 	addpd	%xmm0,%xmm4					# sin + x

 	jmp 	.L__vrd2_sincos_cleanup

 # Mixed parity.  r8 == 1 means the lower region is odd and the upper region
 # is even (fall through); otherwise the lower region is even and the upper
 # region is odd.
 .align 16
 .Lvrd2_not_cos_piby4:
 	cmp 	$1,%r8
 	jnz	.Lvrd2_cossin_piby4

 # Lower region odd, upper region even.  Both polynomials are evaluated for
 # both lanes exactly as in .Lvrd2_sin_piby4 (xmm4 = cos, xmm5 = sin); the
 # low lanes of xmm4 and xmm5 are then exchanged so that the lower element
 # delivers cos(r) to the sin output and sin(r) to the cos output.
 # In:  xmm0 = r (x), xmm2 = x2, xmm6 = rr (xx)
 .Lvrd2_sincos_piby4:
 	movdqa	.Lcosarray+0x50(%rip),%xmm4			# c6
 	movdqa	.Lsinarray+0x50(%rip),%xmm5			# s6
 	movapd	.Lcosarray+0x20(%rip),%xmm8			# c3
 	movapd	.Lsinarray+0x20(%rip),%xmm9			# s3

 	movapd	%xmm2,%xmm10					# x2
 	movapd	%xmm2,%xmm11					# x2

 	mulpd	%xmm2,%xmm4					# c6*x2
 	mulpd	%xmm2,%xmm5					# s6*x2
 	mulpd	%xmm2,%xmm8					# c3*x2
 	mulpd	%xmm2,%xmm9					# s3*x2

 	mulpd	.L__real_3fe0000000000000(%rip),%xmm10	# r = 0.5 *x2
 	movapd	 %xmm2,p_temp(%rsp)			# store x2

 	addpd	.Lcosarray+0x40(%rip),%xmm4			# c5+x2c6
 	addpd	.Lsinarray+0x40(%rip),%xmm5			# s5+x2s6
 	movapd	 %xmm10,p_temp2(%rsp)			# store r
 	addpd	.Lcosarray+0x10(%rip),%xmm8			# c2+x2C3
 	addpd	.Lsinarray+0x10(%rip),%xmm9			# s2+x2S3

 	subpd	.L__real_3ff0000000000000(%rip),%xmm10	# -t=r-1.0
 	mulpd	%xmm2,%xmm11					# x4

 	mulpd	%xmm2,%xmm4					# x2(c5+x2c6)
 	mulpd	%xmm2,%xmm5					# x2(s5+x2s6)
 	movapd	 %xmm10,p_temp1(%rsp) 			# store -t
 	movapd	%xmm11,%xmm3					# Keep x4
 	mulpd	%xmm2,%xmm8					# x2(c2+x2C3)
 	mulpd	%xmm2,%xmm9					# x2(s2+x2S3)

 	addpd   .L__real_3ff0000000000000(%rip),%xmm10	# 1 + (-t)
 	mulpd	%xmm2,%xmm11					# x6

 	addpd	.Lcosarray+0x30(%rip),%xmm4			# c4 + x2(c5+x2c6)
 	addpd	.Lsinarray+0x30(%rip),%xmm5			# s4 + x2(s5+x2s6)
 	addpd	.Lcosarray(%rip),%xmm8			# c1 + x2(c2+x2C3)
 	addpd	.Lsinarray(%rip),%xmm9			# s1 + x2(s2+x2S3)

 	subpd   p_temp2(%rsp),%xmm10			# (1 + (-t)) - r
 	mulpd	%xmm0,%xmm2					# x3 recalculate

 	mulpd	%xmm11,%xmm4					# x6(c4 + x2(c5+x2c6))
 	mulpd	%xmm11,%xmm5					# x6(s4 + x2(s5+x2s6))

 	movapd	%xmm0,%xmm1
 	movapd	%xmm6,%xmm7
 	mulpd	%xmm6,%xmm1					# x*xx
 	mulpd	p_temp2(%rsp),%xmm7			# xx * 0.5x2

 	addpd	%xmm8,%xmm4					# zc
 	addpd	%xmm9,%xmm5					# zs

 	subpd   %xmm1,%xmm10					# ((1 + (-t)) - r) -x*xx

 	mulpd	%xmm3,%xmm4					# x4 * zc
 	mulpd	%xmm2,%xmm5					# x3 * zs

 	addpd	%xmm10,%xmm4					# x4*zc + (((1 + (-t)) - r) - x*xx)
 	subpd	%xmm7,%xmm5					# x3*zs - 0.5 * x2 *xx

 	addpd	%xmm6,%xmm5					# sin + xx
 	subpd	p_temp1(%rsp),%xmm4			# cos - (-t)
 	addpd	%xmm0,%xmm5					# sin + x

 # Exchange the low lanes of xmm4 and xmm5 through xmm1; the high lanes
 # stay as computed.
 	movsd	%xmm4,%xmm1
 	movsd	%xmm5,%xmm4
 	movsd	%xmm1,%xmm5

 	jmp 	.L__vrd2_sincos_cleanup

 # Lower region even, upper region odd.  Both polynomials are evaluated for
 # both lanes exactly as in .Lvrd2_cos_piby4 (xmm5 = cos, xmm4 = sin); the
 # low lanes of xmm4 and xmm5 are then exchanged so that only the upper
 # element delivers cos(r) to the sin output and sin(r) to the cos output.
 # In:  xmm0 = r (x), xmm2 = x2, xmm6 = rr (xx)
 .align 16
 .Lvrd2_cossin_piby4:
 	movdqa	.Lcosarray+0x50(%rip),%xmm5			# c6
 	movdqa	.Lsinarray+0x50(%rip),%xmm4			# s6
 	movapd	.Lcosarray+0x20(%rip),%xmm9			# c3
 	movapd	.Lsinarray+0x20(%rip),%xmm8			# s3

 	movapd	%xmm2,%xmm10					# x2
 	movapd	%xmm2,%xmm11					# x2

 	mulpd	%xmm2,%xmm5					# c6*x2
 	mulpd	%xmm2,%xmm4					# s6*x2
 	mulpd	%xmm2,%xmm9					# c3*x2
 	mulpd	%xmm2,%xmm8					# s3*x2

 	mulpd	.L__real_3fe0000000000000(%rip),%xmm10	# r = 0.5 *x2
 	movapd	 %xmm2,p_temp(%rsp)			# store x2

 	addpd	.Lcosarray+0x40(%rip),%xmm5			# c5+x2c6
 	addpd	.Lsinarray+0x40(%rip),%xmm4			# s5+x2s6
 	movapd	 %xmm10,p_temp2(%rsp)			# store r
 	addpd	.Lcosarray+0x10(%rip),%xmm9			# c2+x2C3
 	addpd	.Lsinarray+0x10(%rip),%xmm8			# s2+x2S3

 	subpd	.L__real_3ff0000000000000(%rip),%xmm10	# -t=r-1.0
 	mulpd	%xmm2,%xmm11					# x4

 	mulpd	%xmm2,%xmm5					# x2(c5+x2c6)
 	mulpd	%xmm2,%xmm4					# x2(s5+x2s6)
 	movapd	 %xmm10,p_temp1(%rsp) 			# store -t
 	movapd	%xmm11,%xmm3					# Keep x4
 	mulpd	%xmm2,%xmm9					# x2(c2+x2C3)
 	mulpd	%xmm2,%xmm8					# x2(s2+x2S3)

 	addpd   .L__real_3ff0000000000000(%rip),%xmm10	# 1 + (-t)
 	mulpd	%xmm2,%xmm11					# x6

 	addpd	.Lcosarray+0x30(%rip),%xmm5			# c4 + x2(c5+x2c6)
 	addpd	.Lsinarray+0x30(%rip),%xmm4			# s4 + x2(s5+x2s6)
 	addpd	.Lcosarray(%rip),%xmm9			# c1 + x2(c2+x2C3)
 	addpd	.Lsinarray(%rip),%xmm8			# s1 + x2(s2+x2S3)

 	subpd   p_temp2(%rsp),%xmm10			# (1 + (-t)) - r
 	mulpd	%xmm0,%xmm2					# x3 recalculate

 	mulpd	%xmm11,%xmm5					# x6(c4 + x2(c5+x2c6))
 	mulpd	%xmm11,%xmm4					# x6(s4 + x2(s5+x2s6))

 	movapd	%xmm0,%xmm1
 	movapd	%xmm6,%xmm7
 	mulpd	%xmm6,%xmm1					# x*xx
 	mulpd	p_temp2(%rsp),%xmm7			# xx * 0.5x2

 	addpd	%xmm9,%xmm5					# zc
 	addpd	%xmm8,%xmm4					# zs

 	subpd   %xmm1,%xmm10					# ((1 + (-t)) - r) -x*xx

 	mulpd	%xmm3,%xmm5					# x4 * zc
 	mulpd	%xmm2,%xmm4					# x3 * zs

 	addpd	%xmm10,%xmm5					# x4*zc + (((1 + (-t)) - r) - x*xx)
 	subpd	%xmm7,%xmm4					# x3*zs - 0.5 * x2 *xx

 	addpd	%xmm6,%xmm4					# sin + xx
 	subpd	p_temp1(%rsp),%xmm5			# cos - (-t)
 	addpd	%xmm0,%xmm4					# sin + x

 # Exchange the low lanes of xmm4 and xmm5 through xmm1; the high lanes
 # stay as computed.
 	movsd	%xmm5,%xmm1
 	movsd	%xmm4,%xmm5
 	movsd	%xmm1,%xmm4

 	jmp 	.L__vrd2_sincos_cleanup

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .Llower_or_both_arg_gt_5e5:
 # The lower element is >= 5e5 (or NaN/Inf); r10 still holds the bit pattern
 # of 5e5, rcx the bits of |upper x|.
 	cmp	%r10,%rcx				#is upper arg >= 5e5
 	jae	.Lboth_arg_gt_5e5

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .Llower_arg_gt_5e5:
 # Upper Arg is < 5e5, Lower arg is >= 5e5
 # The upper element is reduced inline here with scalar instructions; the
 # lower element is handled afterwards at .Lcheck_lower_arg.

 	movlpd	 %xmm0,r(%rsp)		#Save lower fp arg for remainder_piby2 call

 	movhlps	%xmm0,%xmm0			#Needed since we want to work on upper arg
 	movhlps	%xmm2,%xmm2
 	movhlps	%xmm6,%xmm6

 # Work on Upper arg
 # Low lanes of xmm0, xmm2 and xmm6 now hold |upper x|, xmm4 = 0.5
 # Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs

 #If upper Arg is <=piby4
        cmp	%rdx,%rcx					# is upper arg > piby4
        ja	0f

        mov 	$0,%ecx						# region = 0
        mov	 %ecx,region+4(%rsp)			# store upper region
        movlpd	 %xmm0,r+8(%rsp)			# store upper r (unsigned - sign is adjusted later based on sign)
        xorpd	%xmm4,%xmm4					# rr = 0
        movlpd	 %xmm4,rr+8(%rsp)			# store upper rr
        jmp	.Lcheck_lower_arg

 #If upper Arg is > piby4
 .align 16
 0:
 	mulsd	.L__real_3fe45f306dc9c883(%rip),%xmm2		# x*twobypi
 	addsd	%xmm4,%xmm2					# npi2=(x*twobypi+0.5)
 	movsd	.L__real_3ff921fb54400000(%rip),%xmm3		# piby2_1
 	cvttsd2si	%xmm2,%ecx				# npi2 trunc to ints
 	movsd	.L__real_3dd0b4611a600000(%rip),%xmm1		# piby2_2
 	cvtsi2sd	%ecx,%xmm2				# npi2 trunc to doubles

 	#/* Subtract the multiple from x to get an extra-precision remainder */
 	#rhead  = x - npi2 * piby2_1;
 	mulsd	%xmm2,%xmm3					# npi2 * piby2_1
 	subsd	%xmm3,%xmm6					# rhead =(x-npi2*piby2_1)
 	movsd	.L__real_3ba3198a2e037073(%rip),%xmm8		# piby2_2tail

 	#t  = rhead;
        movsd	%xmm6,%xmm5					# t = rhead

 	#rtail  = npi2 * piby2_2;
        mulsd	%xmm2,%xmm1					# rtail=(npi2*piby2_2)

 	#rhead  = t - rtail
        subsd	%xmm1,%xmm6					# rhead=(t-rtail)

 	#rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulsd	%xmm2,%xmm8     					# npi2 * piby2_2tail
        subsd	%xmm6,%xmm5					# t-rhead
        subsd	%xmm5,%xmm1					# (rtail-(t-rhead))
        addsd	%xmm8,%xmm1					# rtail=npi2*piby2_2tail+(rtail-(t-rhead));

 	#r =  rhead - rtail
 	#rr = (rhead-r) -rtail
        mov	 %ecx,region+4(%rsp)			# store upper region
        movsd	%xmm6,%xmm0
        subsd	%xmm1,%xmm0					# r=(rhead-rtail)

        subsd	%xmm0,%xmm6					# rr=rhead-r
        subsd	%xmm1,%xmm6					# xmm6 = rr=((rhead-r) -rtail)

        movlpd	 %xmm0,r+8(%rsp)			# store upper r
        movlpd	 %xmm6,rr+8(%rsp)			# store upper rr

 # Lower element is >= 5e5 (or NaN/Inf); rax still holds the bits of |lower x|.
 # Unless it is NaN/Inf, __amd_remainder_piby2 is called with xmm0 = the
 # argument and rdi/rsi/rdx = addresses of the lower r, rr and region slots
 # (presumably it writes its results there; the slots are read back at
 # .L__vrd2_cos_reconstruct).
 # Note that volatiles will be trashed by the call; only r11 (the sign word)
 # is needed afterwards, so it is saved in p_temp across the call.
 .align 16
 .Lcheck_lower_arg:
 	mov		$0x07ff0000000000000,%r9			# is lower arg nan/inf
 	mov		%r9,%r10
 	and		%rax,%r10				# isolate the exponent field
 	cmp		%r9,%r10				# all exponent bits set -> NaN or Inf
 	jz		.L__vrd2_cos_lower_naninf

 	lea	 region(%rsp),%rdx			# lower arg is **NOT** nan/inf
 	lea	 rr(%rsp),%rsi
 	lea	 r(%rsp),%rdi
 	movlpd	 r(%rsp),%xmm0				#Restore lower fp arg for remainder_piby2 call
 	mov	 %r11,p_temp(%rsp)			#Save Sign
         call    __amd_remainder_piby2@PLT
 	mov	p_temp(%rsp),%r11			#Restore Sign

 	jmp 	.L__vrd2_cos_reconstruct

 .L__vrd2_cos_lower_naninf:
 	mov	p_original(%rsp),%rax			# lower arg is nan/inf: reload its original bits

 	mov	$0x00008000000000000,%r9
 	or	%r9,%rax
 	mov	 %rax,r(%rsp)				# r = x | 0x0008000000000000
 	xor	%r10,%r10
 	mov	 %r10,rr(%rsp)				# rr = 0
 	mov	 %r10d,region(%rsp)			# region =0
 	and 	.L__real_naninf_lower_sign_mask(%rip),%r11	# clear the lower half of the sign word

 	jmp 	.L__vrd2_cos_reconstruct

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .align 16
 .Lupper_arg_gt_5e5:
 # Upper Arg is >= 5e5, Lower arg is < 5e5
 # The lower element is reduced inline here with scalar instructions; the
 # upper element is handled afterwards at .Lcheck_upper_arg.
 	movhpd	 %xmm0,r+8(%rsp)		#Save upper fp arg for remainder_piby2 call
 #	movlhps	%xmm0,%xmm0				;Not needed since we want to work on lower arg, but done just to be safe and avoide exceptions due to nan/inf and to mirror the lower_arg_gt_5e5 case
 #	movlhps	%xmm2,%xmm2
 #	movlhps	%xmm6,%xmm6

 # Work on Lower arg
 # Low lanes of xmm0, xmm2 and xmm6 hold |lower x|, xmm4 = 0.5
 # Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg

 #If lower Arg is <=piby4
        cmp	%rdx,%rax					# is lower arg > piby4
        ja	0f

        mov 	$0,%eax						# region = 0
        mov	 %eax,region(%rsp)			# store lower region
        movlpd	 %xmm0,r(%rsp)				# store lower r
        xorpd	%xmm4,%xmm4					# rr = 0
        movlpd	 %xmm4,rr(%rsp)				# store lower rr
        jmp 	.Lcheck_upper_arg

 .align 16
 0:
 #If lower Arg is > piby4
 	mulsd	.L__real_3fe45f306dc9c883(%rip),%xmm2		# x*twobypi
 	addsd	%xmm4,%xmm2					# npi2=(x*twobypi+0.5)
 	movsd	.L__real_3ff921fb54400000(%rip),%xmm3		# piby2_1
 	cvttsd2si	%xmm2,%eax				# npi2 trunc to ints
 	movsd	.L__real_3dd0b4611a600000(%rip),%xmm1		# piby2_2
 	cvtsi2sd	%eax,%xmm2				# npi2 trunc to doubles

 #/* Subtract the multiple from x to get an extra-precision remainder */
 #rhead  = x - npi2 * piby2_1;
 	mulsd	%xmm2,%xmm3					# npi2 * piby2_1;
 	subsd	%xmm3,%xmm6					# rhead =(x-npi2*piby2_1)
 	movsd	.L__real_3ba3198a2e037073(%rip),%xmm8		# piby2_2tail

 #t  = rhead;
        movsd	%xmm6,%xmm5					# t = rhead

 #rtail  = npi2 * piby2_2;
        mulsd	%xmm2,%xmm1					# rtail=(npi2*piby2_2)

 #rhead  = t - rtail
        subsd	%xmm1,%xmm6					# rhead=(t-rtail)

 #rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulsd	%xmm2,%xmm8     					# npi2 * piby2_2tail
        subsd	%xmm6,%xmm5					# t-rhead
        subsd	%xmm5,%xmm1					# (rtail-(t-rhead))
        addsd	%xmm8,%xmm1					# rtail=npi2*piby2_2tail+(rtail-(t-rhead));

 #r =  rhead - rtail
 #rr = (rhead-r) -rtail
        mov	 %eax,region(%rsp)			# store lower region
        movsd	%xmm6,%xmm0
        subsd	%xmm1,%xmm0					# r=(rhead-rtail)
        subsd	%xmm0,%xmm6					# rr=rhead-r
        subsd	%xmm1,%xmm6					# rr=((rhead-r) -rtail)
        movlpd	 %xmm0,r(%rsp)				# store lower r
        movlpd	 %xmm6,rr(%rsp)				# store lower rr

 # Upper element is >= 5e5 (or NaN/Inf); rcx still holds the bits of |upper x|.
 # Unless it is NaN/Inf, __amd_remainder_piby2 is called with xmm0 = the
 # argument and rdi/rsi/rdx = addresses of the upper r, rr and region slots
 # (presumably it writes its results there; the slots are read back at
 # .L__vrd2_cos_reconstruct).
 # Note that volatiles will be trashed by the call; only r11 (the sign word)
 # is needed afterwards, so it is saved in p_temp across the call.
 .align 16
 .Lcheck_upper_arg:
 	mov		$0x07ff0000000000000,%r9			# is upper arg nan/inf
 	mov		%r9,%r10
 	and		%rcx,%r10				# isolate the exponent field
 	cmp		%r9,%r10				# all exponent bits set -> NaN or Inf
 	jz		.L__vrd2_cos_upper_naninf

 	lea	 region+4(%rsp),%rdx			# upper arg is **NOT** nan/inf
 	lea	 rr+8(%rsp),%rsi
 	lea	 r+8(%rsp),%rdi
 	movlpd	 r+8(%rsp),%xmm0	#Restore upper fp arg for remainder_piby2 call
 	mov	 %r11,p_temp(%rsp)	#Save Sign
         call    __amd_remainder_piby2@PLT
 	mov	p_temp(%rsp),%r11	#Restore Sign

 	jmp 	.L__vrd2_cos_reconstruct

 .L__vrd2_cos_upper_naninf:
 	mov	p_original+8(%rsp),%rcx		# upper arg is nan/inf: reload its original bits
 	mov	$0x00008000000000000,%r9
 	or	%r9,%rcx
 	mov	 %rcx,r+8(%rsp)					# r = x | 0x0008000000000000
 	xor	%r10,%r10
 	mov	 %r10,rr+8(%rsp)					# rr = 0
 	mov	 %r10d,region+4(%rsp)				# region =0
 	and 	.L__real_naninf_upper_sign_mask(%rip),%r11	# clear the upper half of the sign word
 	jmp 	.L__vrd2_cos_reconstruct


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Both elements are >= 5e5 (or NaN/Inf).  Each element is handled in turn:
 # a NaN/Inf element gets r = x | 0x0008000000000000, rr = 0, region = 0 and
 # its half of the sign word cleared; any other element is passed to
 # __amd_remainder_piby2 together with the addresses of its r, rr and region
 # slots.  On entry xmm0 = |x|, rax / rcx = bits of |lower x| / |upper x|,
 # r11 = packed sign word.
 .align 16
 .Lboth_arg_gt_5e5:
 #Upper Arg is >= 5e5, Lower arg is >= 5e5

 	movhpd	%xmm0,  p_temp2(%rsp) 				#Save upper fp arg for remainder_piby2 call

 	mov		$0x07ff0000000000000,%r9			#is lower arg nan/inf
 	mov		%r9,%r10
 	and		%rax,%r10
 	cmp		%r9,%r10
 	jz		.L__vrd2_cos_lower_naninf_of_both_gt_5e5

 	lea	 region(%rsp),%rdx			#lower arg is **NOT** nan/inf
 	lea	 rr(%rsp),%rsi
 	lea	 r(%rsp),%rdi
 	mov	  %rcx,p_temp(%rsp)			#Save upper arg
 	mov	  %r11,p_temp1(%rsp)	#Save Sign
         call    __amd_remainder_piby2@PLT	# xmm0 low lane still holds |lower x|
 	mov	 p_temp1(%rsp),%r11	#Restore Sign
 	mov	 p_temp(%rsp),%rcx			#Restore upper arg
 	jmp 	0f

 .L__vrd2_cos_lower_naninf_of_both_gt_5e5:				#lower arg is nan/inf
 	mov	p_original(%rsp),%rax
 	mov	$0x00008000000000000,%r9
 	or	%r9,%rax
 	mov	 %rax,r(%rsp)				#r = x | 0x0008000000000000
 	xor	%r10,%r10
 	mov	 %r10,rr(%rsp)				#rr = 0
 	mov	 %r10d,region(%rsp)			#region = 0
 	and 	.L__real_naninf_lower_sign_mask(%rip),%r11	# clear the lower half of the sign word

 .align 16
 0:
 	mov		$0x07ff0000000000000,%r9			#is upper arg nan/inf
 	mov		%r9,%r10
 	and		%rcx,%r10
 	cmp		%r9,%r10
 	jz		.L__vrd2_cos_upper_naninf_of_both_gt_5e5

 	lea	 region+4(%rsp),%rdx			#upper arg is **NOT** nan/inf
 	lea	 rr+8(%rsp),%rsi
 	lea	 r+8(%rsp),%rdi
 	movlpd	 p_temp2(%rsp), %xmm0			#Restore upper fp arg for remainder_piby2 call
 	mov	 %r11,p_temp(%rsp)	#Save Sign
         call    __amd_remainder_piby2@PLT
 	mov	 p_temp(%rsp),%r11	#Restore Sign

 	jmp 	0f

 .L__vrd2_cos_upper_naninf_of_both_gt_5e5:
 	mov	p_original+8(%rsp),%rcx		#upper arg is nan/inf
 	mov	$0x00008000000000000,%r9
 	or	%r9,%rcx
 	mov	%rcx,r+8(%rsp)					#r = x | 0x0008000000000000
 	xor	%r10,%r10
 	mov	%r10,rr+8(%rsp)					#rr = 0
 	mov	%r10d,region+4(%rsp)				#region = 0
 	and 	.L__real_naninf_upper_sign_mask(%rip),%r11	# clear the upper half of the sign word

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Common tail of the three large-argument paths.  Reloads r, rr and the two
 # region numbers from the frame and rebuilds the state expected by
 # .L__vrd2_sincos_approximate:
 #   xmm0 = r, xmm2 = r*r, xmm6 = rr,
 #   r8 = low bit of each region (bit 0 lower, bit 32 upper),
 #   p_sign / p_sign1 = sign-bit words for the sin / cos results.
 # On entry r11 = packed sign word.
 .align 16
 0:
 .L__vrd2_cos_reconstruct:
 #Construct p_sign=Sign for Sin term, p_sign1=Sign for Cos term, xmm0 = r, xmm2 = r2, xmm6 = rr, r8=region
 	movapd	r(%rsp),%xmm0				#x
 	movapd	%xmm0,%xmm2					#move for x2
 	mulpd	%xmm2,%xmm2					#x2
 	movapd	rr(%rsp),%xmm6				#xx

 	mov	region(%rsp),%r8			# one 64-bit load picks up both 32-bit regions
 	mov 	.L__reald_one_zero(%rip),%r9		# mask selecting bit 32 (the upper element)
 	mov 	%r8,%r10				# copy used for the sin sign
 	mov	%r8,%rax				# copy used for the cos sign
 	and	.L__reald_one_one(%rip),%r8		#odd/even region for cos/sin

 # Sign of the sin result = (sign of x) XOR (bit 1 of region), computed as
 # (~A & B) | (A & ~B) with A = r11 (sign word) and B = region >> 1.
 	shr	$1,%r10						#~AB+A~B, A is sign and B is upper bit of region
 	mov	%r10,%rcx
 	not 	%r11						#ADDED TO CHANGE THE LOGIC
 	and	%r11,%r10					# ~A & B
 	not	%rcx
 	not	%r11						# restore A
 	and	%r11,%rcx					# A & ~B
 	or	%rcx,%r10					# A XOR B
 	and	.L__reald_one_one(%rip),%r10				#(~AB+A~B)&1

 	mov	%r10,%r11
 	and	%r9,%r11				#mask out the lower sign bit leaving the upper sign bit
 	shl	$63,%r10				#shift lower sign bit left by 63 bits
 	shl	$31,%r11				#shift upper sign bit left by 31 bits
 	mov 	 %r10,p_sign(%rsp)		#write out lower sign bit
 	mov 	 %r11,p_sign+8(%rsp)		#write out upper sign bit

 # Sign of the cos result = bit 1 of (region + 1), for each element.
 	add	.L__reald_one_one(%rip),%rax
 	and	.L__reald_two_two(%rip),%rax
 	shr	$1,%rax

 	mov	%rax,%rdx
 	and	%r9,%rdx				#mask out the lower sign bit leaving the upper sign bit
 	shl	$63,%rax				#shift lower sign bit left by 63 bits
 	shl	$31,%rdx				#shift upper sign bit left by 31 bits
 	mov 	 %rax,p_sign1(%rsp)		#write out lower sign bit
 	mov 	 %rdx,p_sign1+8(%rsp)		#write out upper sign bit


 	jmp .L__vrd2_sincos_approximate


 #ENDMAIN

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Common exit.  On entry xmm5 holds the value destined for the sin output
 # and xmm4 the value destined for the cos output, both still unsigned.
 # The sign-bit words built earlier are XORed in, the results are written
 # with aligned 128-bit stores through the addresses saved at entry, and
 # the frame is released.
 .align 16
 .L__vrd2_sincos_cleanup:

 	xorpd	p_sign(%rsp),%xmm5		# SIN sign
 	xorpd	p_sign1(%rsp),%xmm4		# COS sign

 	mov	p_sin(%rsp),%rdi		# address saved from the incoming rdi
 	mov	p_cos(%rsp),%rsi		# address saved from the incoming rsi

 	movapd	 %xmm5,(%rdi)			# save the sin
 	movapd	 %xmm4,(%rsi)			# save the cos

 .Lfinal_check:
 	add	$0x1C8,%rsp			# release the frame reserved at entry
 	ret

 # Record the extent of the function for the symbol table; it is already
 # declared with .type above, so the ELF-only .size directive is consistent.
 .size __vrd2_sincos, .-__vrd2_sincos