 # src/gas/vrsasincosf.S - open64_libacml_mv


 #
 #  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
 #
 #  This file is part of libacml_mv.
 #
 #  libacml_mv is free software; you can redistribute it and/or
 #  modify it under the terms of the GNU Lesser General Public
 #  License as published by the Free Software Foundation; either
 #  version 2.1 of the License, or (at your option) any later version.
 #
 #  libacml_mv is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 #  Lesser General Public License for more details.
 #
 #  You should have received a copy of the GNU Lesser General Public
 #  License along with libacml_mv.  If not, see
 #  <http://www.gnu.org/licenses/>.
 #
 #


 #
 # vrsasincosf.s
 #
 # A vector implementation of the sincos libm function.
 #
 # Prototype:
 #
 #    __vrsa_sincosf(int n, float* x, float* ys, float* yc);
 #
 # Computes Sine and Cosine of x for an array of input values.
 # Places the Sine results into the supplied ys array and the Cosine results into the supplied yc array.
 # Does not perform error checking.
 # Denormal inputs may produce unexpected results.
 # This routine computes 4 single precision Sine Cosine values at a time.
 # Each group of four input values is loaded from the x array as packed singles.
 # The four Sine results are returned as packed singles in the supplied ys array.
 # The four Cosine results are returned as packed singles in the supplied yc array.
 # NOTE(review): the remarks below describe a register-passing interface
 # (inputs in xmm0, two packed results returned) and appear to be inherited
 # from the 4-wide non-array variant; this file is the array implementation
 # that they mention.
 # Note that the register-passing interface represents a non-standard ABI usage,
 # as no ABI (and indeed C) currently allows returning 2 values for a function.
 # It is expected that some compilers may be able to take advantage of that
 # interface when implementing vectorized loops.  Using the array implementation
 # of the routine requires putting the inputs into memory, and retrieving
 # the results from memory.  The register-passing routine eliminates the need
 # for this overhead if the data does not already reside in memory.

 # Author: Harsha Jagasia
 # Email:  harsha.jagasia@amd.com

 # Emit an empty .note.GNU-stack section on ELF targets; this is the GNU
 # toolchain convention for declaring that the object needs no executable stack.
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif

 # Constant pool.  Every entry is 16 bytes (two quadwords) and the pool starts
 # on a 64-byte boundary, so each label is 16-byte aligned as required by the
 # movapd/movdqa loads that reference some of them.
 # Entries named __real_<hex> hold the IEEE double with that bit pattern in
 # both lanes; entries named __reald_* / __realq_* are integer masks.
 .data
 .align 64
 .L__real_7fffffffffffffff: .quad 0x07fffffffffffffff	# Sign bit zero: AND mask giving |x| per double
 			.quad 0x07fffffffffffffff
 .L__real_3ff0000000000000: .quad 0x03ff0000000000000	# 1.0
 			.quad 0x03ff0000000000000
 .L__real_v2p__27:		.quad 0x03e40000000000000	# 2p-27 (2 to the power -27)
 			.quad 0x03e40000000000000
 .L__real_3fe0000000000000: .quad 0x03fe0000000000000	# 0.5
 			.quad 0x03fe0000000000000
 .L__real_3fc5555555555555: .quad 0x03fc5555555555555	# 0.166666666666
 			.quad 0x03fc5555555555555
 .L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883	# twobypi
 			.quad 0x03fe45f306dc9c883
 .L__real_3ff921fb54400000: .quad 0x03ff921fb54400000	# piby2_1
 			.quad 0x03ff921fb54400000
 .L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331	# piby2_1tail
 			.quad 0x03dd0b4611a626331
 .L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000	# piby2_2
 			.quad 0x03dd0b4611a600000
 .L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073	# piby2_2tail
 			.quad 0x03ba3198a2e037073
 .L__real_fffffffff8000000: .quad 0x0fffffffff8000000	# mask for stripping head and tail
 			.quad 0x0fffffffff8000000
 .L__real_8000000000000000:	.quad 0x08000000000000000	# -0  or signbit
 			.quad 0x08000000000000000
 .L__reald_one_one:		.quad 0x00000000100000001	# low qword: bit 0 of both 32-bit lanes
 			.quad 0
 .L__reald_two_two:		.quad 0x00000000200000002	# low qword: bit 1 of both 32-bit lanes
 			.quad 0
 .L__reald_one_zero:	.quad 0x00000000100000000	# sin_cos_filter: bit 0 of the upper 32-bit lane only
 			.quad 0
 .L__reald_zero_one:	.quad 0x00000000000000001	# bit 0 of the lower 32-bit lane only
 			.quad 0
 .L__reald_two_zero:	.quad 0x00000000200000000	# bit 1 of the upper 32-bit lane only
 			.quad 0
 .L__realq_one_one:		.quad 0x00000000000000001	# integer 1 in each 64-bit lane
 			.quad 0x00000000000000001	#
 .L__realq_two_two:		.quad 0x00000000000000002	# integer 2 in each 64-bit lane
 			.quad 0x00000000000000002	#
 .L__real_1_x_mask:		.quad 0x0ffffffffffffffff	# low lane: all ones
 			.quad 0x03ff0000000000000	# high lane: bit pattern of 1.0
 .L__real_zero:		.quad 0x00000000000000000	# all zero bits (+0.0)
 			.quad 0x00000000000000000	#
 .L__real_one:		.quad 0x00000000000000001	# integer 1 per lane (NOT the double 1.0)
 			.quad 0x00000000000000001	#

 # Polynomial coefficient tables, stored as IEEE double bit patterns.
 # Each row is one 16-byte packed-double operand written as: low lane, high lane.
 # Row k of a table lives at <label> + 0x10*k.

 # Cosine coefficients, same value in both lanes.
 .Lcosarray:
 	.quad	0x03FA5555555502F31, 0x03FA5555555502F31	# c1 =  0.0416667
 	.quad	0x0BF56C16BF55699D7, 0x0BF56C16BF55699D7	# c2 = -0.00138889
 	.quad	0x03EFA015C50A93B49, 0x03EFA015C50A93B49	# c3 =  2.48016e-005
 	.quad	0x0BE92524743CC46B8, 0x0BE92524743CC46B8	# c4 = -2.75573e-007

 # Sine coefficients, same value in both lanes.
 .Lsinarray:
 	.quad	0x0BFC555555545E87D, 0x0BFC555555545E87D	# s1 = -0.166667
 	.quad	0x03F811110DF01232D, 0x03F811110DF01232D	# s2 =  0.00833333
 	.quad	0x0BF2A013A88A37196, 0x0BF2A013A88A37196	# s3 = -0.000198413
 	.quad	0x03EC6DBE4AD1572D5, 0x03EC6DBE4AD1572D5	# s4 =  2.75573e-006

 # Mixed rows: sine coefficient in the low lane, cosine coefficient in the high lane.
 .Lsincosarray:
 	.quad	0x0BFC555555545E87D, 0x03FA5555555502F31	# s1, c1
 	.quad	0x03F811110DF01232D, 0x0BF56C16BF55699D7	# s2, c2
 	.quad	0x0BF2A013A88A37196, 0x03EFA015C50A93B49	# s3, c3
 	.quad	0x03EC6DBE4AD1572D5, 0x0BE92524743CC46B8	# s4, c4

 # Mixed rows: cosine coefficient in the low lane, sine coefficient in the high lane.
 .Lcossinarray:
 	.quad	0x03FA5555555502F31, 0x0BFC555555545E87D	# c1, s1
 	.quad	0x0BF56C16BF55699D7, 0x03F811110DF01232D	# c2, s2
 	.quad	0x03EFA015C50A93B49, 0x0BF2A013A88A37196	# c3, s3
 	.quad	0x0BE92524743CC46B8, 0x03EC6DBE4AD1572D5	# c4, s4

 # Dispatch table used by the indirect jump at the end of the fast path.
 # The 4-bit index is assembled from bit 0 of each of the four region values:
 #   bit 0 = input 0, bit 1 = input 1, bit 2 = input 2, bit 3 = input 3.
 # In the target names the four sin/cos words read from bit 3 down to bit 0,
 # with cos standing for a set bit (odd region) and sin for a clear bit.
 # The targets themselves are defined outside the reviewed part of the file.
 .align 8
 	.Levensin_oddcos_tbl:

 		.quad	.Lsinsin_sinsin_piby4		# 0		*	; Done
 		.quad	.Lsinsin_sincos_piby4		# 1		+	; Done
 		.quad	.Lsinsin_cossin_piby4		# 2			; Done
 		.quad	.Lsinsin_coscos_piby4		# 3		+	; Done

 		.quad	.Lsincos_sinsin_piby4		# 4			; Done
 		.quad	.Lsincos_sincos_piby4		# 5		*	; Done
 		.quad	.Lsincos_cossin_piby4		# 6			; Done
 		.quad	.Lsincos_coscos_piby4		# 7			; Done

 		.quad	.Lcossin_sinsin_piby4		# 8			; Done
 		.quad	.Lcossin_sincos_piby4		# 9			; TBD
 		.quad	.Lcossin_cossin_piby4		# 10		*	; Done
 		.quad	.Lcossin_coscos_piby4		# 11			; Done

 		.quad	.Lcoscos_sinsin_piby4		# 12			; Done
 		.quad	.Lcoscos_sincos_piby4		# 13		+	; Done
 		.quad	.Lcoscos_cossin_piby4		# 14			; Done
 		.quad	.Lcoscos_coscos_piby4		# 15		*	; Done

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Weak aliases: both alternative spellings resolve to the by-reference entry
 # point __vrsa_sincosf__ defined below.
 # NOTE(review): presumably these cover different Fortran name-mangling
 # schemes (one or two trailing underscores) - confirm against the build.
         .weak vrsa_sincosf_
         .set vrsa_sincosf_,__vrsa_sincosf__
         .weak vrsa_sincosf__
         .set vrsa_sincosf__,__vrsa_sincosf__

 # Code starts here, 16-byte aligned.
     .text
     .align 16
     .p2align 4,,15

 #FORTRAN subroutine implementation of array sincos
 #VRSA_SINCOSF(N,X,Y,Z)
 #C equivalent
 #void vrsa_sincosf__(int * n, float *x, float *y, float *z)
 #{
 #       vrsa_sincosf(*n,x,y,z);
 #}
 #
 # The count arrives by reference in rdi; the three array pointers are left
 # untouched in their registers.  There is no call or jump after the load:
 # execution falls through the alignment padding (and the .equ directives,
 # which emit no code) straight into vrsa_sincosf below.
 # NOTE(review): only 32 bits of *n are loaded even when INTEGER64 is defined.

 .globl __vrsa_sincosf__
     .type   __vrsa_sincosf__,@function
 __vrsa_sincosf__:
     mov         (%rdi),%edi			# edi = *n (upper half of rdi is zeroed)

     .align 16
     .p2align 4,,15

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Stack frame layout: byte offsets from rsp inside the 0x298-byte frame that
 # vrsa_sincosf allocates.  Slots are spaced 16 bytes apart so that any of them
 # can take an aligned 128-bit store.
 # (The previous wording of this heading began with the word define, which the
 # C preprocessor parses as a real directive in a .S file.)
 .equ	p_temp,0		# scratch slot
 .equ	p_temp1,0x10		# scratch slot

 # NOTE(review): the save_xmm* slots are not referenced in the reviewed part
 # of the file; the names suggest xmm register save areas.
 .equ	save_xmm6,0x20
 .equ	save_xmm7,0x30
 .equ	save_xmm8,0x40
 .equ	save_xmm9,0x50
 .equ	save_xmm0,0x60
 .equ	save_xmm11,0x70
 .equ	save_xmm12,0x80
 .equ	save_xmm13,0x90
 .equ	save_xmm14,0x0A0
 .equ	save_xmm15,0x0B0

 .equ	r,0x0C0			# reduced argument r for inputs 0 and 1 (two doubles)
 .equ	rr,0x0D0		# tail rr for inputs 0 and 1
 .equ	region,0x0E0		# region numbers for inputs 0 and 1 (two 32-bit ints)

 .equ	r1,0x0F0		# reduced argument r for inputs 2 and 3 (two doubles)
 .equ	rr1,0x0100		# tail rr for inputs 2 and 3
 .equ	region1,0x0110		# region numbers for inputs 2 and 3 (two 32-bit ints)

 .equ	p_temp2,0x0120		# scratch slot
 .equ	p_temp3,0x0130		# scratch slot

 .equ	p_temp4,0x0140		# scratch slot
 .equ	p_temp5,0x0150		# scratch slot

 .equ	p_original,0x0160		# NOTE(review): not referenced in the reviewed part
 .equ	p_mask,0x0170		# NOTE(review): not referenced in the reviewed part
 .equ	p_sign_sin,0x0180		# sign masks for the sine of inputs 0 and 1

 .equ	p_original1,0x0190		# NOTE(review): not referenced in the reviewed part
 .equ	p_mask1,0x01A0		# NOTE(review): not referenced in the reviewed part
 .equ	p_sign1_sin,0x01B0		# sign masks for the sine of inputs 2 and 3


 .equ	save_r12,0x01C0		# caller value of r12
 .equ	save_r13,0x01D0		# caller value of r13

 .equ	p_sin,0x01E0		# sin
 .equ	p_cos,0x01F0		# cos

 .equ	save_rdi,0x0200		# NOTE(review): not referenced in the reviewed part
 .equ	save_rsi,0x0210		# NOTE(review): not referenced in the reviewed part

 .equ	p_sign_cos,0x0220		# Sign of lower cos term
 .equ	p_sign1_cos,0x0230		# Sign of upper cos term

 .equ	save_xa,0x0240		#qword ; current position in the x input array
 .equ	save_ysa,0x0250		#qword ; ys (sine) output array pointer
 .equ	save_yca,0x0260		#qword ; yc (cosine) output array pointer

 .equ	save_nv,0x0270		#qword ; number of values, later the leftover count
 .equ	p_iter,0x0280		#qword	storage for number of loop iterations

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # vrsa_sincosf(int n, float *x, float *ys, float *yc)
 # Entry and prologue: allocate the frame, spill r12/r13, stash the three
 # array pointers, and split n into a count of 4-wide iterations plus a
 # leftover count.
 .globl vrsa_sincosf
     .type   vrsa_sincosf,@function
 vrsa_sincosf:

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # parameters are passed in by Linux as:
 # rdi - int n
 # rsi - float *x
 # rdx - float *ys
 # rcx - float *yc

 	sub		$0x0298,%rsp		# allocate the local frame
 	mov		%r12,save_r12(%rsp)	# save r12
 	mov		%r13,save_r13(%rsp)	# save r13


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #START PROCESS INPUT
 # save the arguments
 	mov		%rsi,save_xa(%rsp)	# save x_array pointer
 	mov		%rdx,save_ysa(%rsp)	# save ysin_array pointer
 	mov		%rcx,save_yca(%rsp)	# save ycos_array pointer
 # With INTEGER64 the count is taken as a full 64-bit value; otherwise the
 # 32-bit count is zero-extended into rax and rdi.
 #ifdef INTEGER64
         mov             %rdi,%rax
 #else
         mov             %edi,%eax
         mov             %rax,%rdi
 #endif
 	mov		%rdi,save_nv(%rsp)	# save number of values
 # see if too few values to call the main loop
 	shr		$2,%rax				# get number of iterations
 	jz		.L__vrsa_cleanup			# jump if only single calls
 # prepare the iteration counts
 	mov		%rax,p_iter(%rsp)	# save number of iterations
 	shl		$2,%rax			# rax = 4 * iterations
 	sub		%rax,%rdi		# compute number of extra single calls
 	mov		%rdi,save_nv(%rsp)	# save number of left over values

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #START LOOP
 # Top of the 4-wide main loop.  Each pass loads four floats from x, widens
 # them to doubles, and prepares the absolute values, their raw bit patterns
 # and per-input sign masks for the range checks that follow.
 .align 16
 .L__vrsa_top:
 # build the input _m128d
 #	movapd	.L__real_7fffffffffffffff,%xmm2	#
 #	mov	.L__real_7fffffffffffffff,%rdx	#

 	mov		save_xa(%rsp),%rsi	# get x_array pointer
 	movlps	(%rsi),%xmm0			# low half  = x[0], x[1]
 	movhps	8(%rsi),%xmm0			# high half = x[2], x[3]

 	prefetch	32(%rsi)
 	add		$16,%rsi		# advance past the four floats just read
 	mov		%rsi,save_xa(%rsp)	# save x_array pointer


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #STARTMAIN

 	movhlps		%xmm0,%xmm8			# xmm8 low half = inputs 2 and 3
 	cvtps2pd	%xmm0,%xmm10			# convert input to double.
 	cvtps2pd	%xmm8,%xmm1			# convert input to double.

 movdqa	%xmm10,%xmm6				# keep signed copies for the sign test
 movdqa	%xmm1,%xmm7
 movapd	.L__real_7fffffffffffffff(%rip),%xmm2

 andpd 	%xmm2,%xmm10				#Unsign
 andpd 	%xmm2,%xmm1				#Unsign

 # NOTE(review): rdi and rsi do not hold result addresses here (rsi is the
 # advanced x pointer); these two stores look like leftovers from the
 # register-interface variant.  Confirm whether p_sin/p_cos are read later.
 mov	%rdi, p_sin(%rsp)			# store rdi in the p_sin slot
 mov	%rsi,  p_cos(%rsp)			# store rsi in the p_cos slot

 movd	%xmm10,%rax				#rax is lower arg
 movhpd	%xmm10, p_temp+8(%rsp)			#
 mov    	p_temp+8(%rsp),%rcx			#rcx = upper arg

 movd	%xmm1,%r8				#r8 is lower arg
 movhpd	%xmm1, p_temp1+8(%rsp)			#
 mov    	p_temp1+8(%rsp),%r9			#r9 = upper arg

 movdqa	%xmm10,%xmm12
 movdqa	%xmm1,%xmm13

 # Signed dword compare of |x| against x: the high dword of a lane compares
 # greater exactly when x had its sign bit set.  The shifts and ORs below pack
 # the two per-lane results into the low 64 bits (one 32-bit mask per input).
 pcmpgtd		%xmm6,%xmm12
 pcmpgtd		%xmm7,%xmm13
 movdqa		%xmm12,%xmm6
 movdqa		%xmm13,%xmm7
 psrldq		$4,%xmm12
 psrldq		$4,%xmm13
 psrldq		$8,%xmm6
 psrldq		$8,%xmm7

 # NOTE(review): no consumer of the piby4 value in rdx is visible in the
 # reviewed part of the file.
 mov 	$0x3FE921FB54442D18,%rdx			#piby4	+
 mov	$0x411E848000000000,%r10			#5e5	+
 movapd	.L__real_3fe0000000000000(%rip),%xmm4		#0.5 for later use +

 por	%xmm6,%xmm12
 por	%xmm7,%xmm13

 movd	%xmm12,%r12				#Move Sign to gpr **
 movd	%xmm13,%r13				#Move Sign to gpr **

 movapd	%xmm10,%xmm2				#x0
 movapd	%xmm1,%xmm3				#x1
 movapd	%xmm10,%xmm6				#x0
 movapd	%xmm1,%xmm7				#x1

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Range check and fast-path argument reduction.
 # On entry rax, rcx, r8, r9 hold the bit patterns of |x| for inputs 0..3 and
 # r10 holds the bit pattern of 5e5.  Because the values are non-negative, an
 # unsigned integer compare of the bit patterns orders them like the doubles.
 # Any input at or above 5e5 leaves for one of the slow-path blocks; otherwise
 # all four inputs are reduced here with the three-part pi/2 constant.
 # xmm2 = x, xmm4 =0.5/t, xmm6 =x
 # xmm3 = x, xmm5 =0.5/t, xmm7 =x
 .align 16
 .Leither_or_both_arg_gt_than_piby4:
 	cmp	%r10,%rax
 	jae	.Lfirst_or_next3_arg_gt_5e5

 	cmp	%r10,%rcx
 	jae	.Lsecond_or_next2_arg_gt_5e5

 	cmp	%r10,%r8
 	jae	.Lthird_or_fourth_arg_gt_5e5

 	cmp	%r10,%r9
 	jae	.Lfourth_arg_gt_5e5


 #      /* Find out what multiple of piby2 */
 #        npi2  = (int)(x * twobypi + 0.5);
 	movapd	.L__real_3fe45f306dc9c883(%rip),%xmm10
 	mulpd	%xmm10,%xmm2						# * twobypi
 	mulpd	%xmm10,%xmm3						# * twobypi

 	addpd	%xmm4,%xmm2						# +0.5, npi2
 	addpd	%xmm4,%xmm3						# +0.5, npi2

 	movapd	.L__real_3ff921fb54400000(%rip),%xmm10		# piby2_1
 	movapd	.L__real_3ff921fb54400000(%rip),%xmm1		# piby2_1

 	cvttpd2dq	%xmm2,%xmm4					# convert packed double to packed integers
 	cvttpd2dq	%xmm3,%xmm5					# convert packed double to packed integers

 	movapd	.L__real_3dd0b4611a600000(%rip),%xmm8		# piby2_2
 	movapd	.L__real_3dd0b4611a600000(%rip),%xmm9		# piby2_2

 	cvtdq2pd	%xmm4,%xmm2					# and back to double.
 	cvtdq2pd	%xmm5,%xmm3					# and back to double.

 #      /* Subtract the multiple from x to get an extra-precision remainder */

 	movd	%xmm4,%r8						# Region (two packed 32-bit ints)
 	movd	%xmm5,%r9						# Region (two packed 32-bit ints)

 #DELETE
 #	mov 	.LQWORD,%rdx PTR __reald_one_zero			;compare value for cossin path
 #DELETE

 	mov	%r8,%r10					# second copy of the regions for the sign logic
 	mov	%r9,%r11

 #      rhead  = x - npi2 * piby2_1;
        mulpd	%xmm2,%xmm10						# npi2 * piby2_1;
        mulpd	%xmm3,%xmm1						# npi2 * piby2_1;

 #      rtail  = npi2 * piby2_2;
        mulpd	%xmm2,%xmm8						# rtail
        mulpd	%xmm3,%xmm9						# rtail

 #      rhead  = x - npi2 * piby2_1;
        subpd	%xmm10,%xmm6						# rhead  = x - npi2 * piby2_1;
        subpd	%xmm1,%xmm7						# rhead  = x - npi2 * piby2_1;

 #      t  = rhead;
        movapd	%xmm6,%xmm10						# t
        movapd	%xmm7,%xmm1						# t

 #      rhead  = t - rtail;
        subpd	%xmm8,%xmm10						# rhead
        subpd	%xmm9,%xmm1						# rhead

 #      rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulpd	.L__real_3ba3198a2e037073(%rip),%xmm2		# npi2 * piby2_2tail
        mulpd	.L__real_3ba3198a2e037073(%rip),%xmm3		# npi2 * piby2_2tail

        subpd	%xmm10,%xmm6						# t-rhead
        subpd	%xmm1,%xmm7						# t-rhead

        subpd	%xmm6,%xmm8						# - ((t - rhead) - rtail)
        subpd	%xmm7,%xmm9						# - ((t - rhead) - rtail)

        addpd	%xmm2,%xmm8						# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
        addpd	%xmm3,%xmm9						# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Sign and region bookkeeping for the fast path.
 # Each general register below carries two 32-bit lanes (one per input).
 #   sine sign   = bit 0 of ((region >> 1) XOR sign-of-x)
 #   cosine sign = bit 0 of (region XOR (region >> 1))
 # Each sign is moved to bit 63 and stored as a packed-double sign mask.
 # Finally bit 0 of the four regions is gathered into the 4-bit jump-table
 # index in rax.
 # xmm4  = npi2 (int), xmm10 =rhead, xmm8 =rtail, r8 = region, r10 = region, r12 = Sign
 # xmm5  = npi2 (int), xmm1 =rhead, xmm9 =rtail,  r9 = region, r11 = region, r13 = Sign

 	and	.L__reald_one_one(%rip),%r8		#odd/even region for cos/sin
 	and	.L__reald_one_one(%rip),%r9		#odd/even region for cos/sin

 # NEW

 	#ADDED
 	mov	%r10,%rdi				# npi2 in int
 	mov	%r11,%rsi				# npi2 in int
 	#ADDED

 	shr	$1,%r10					# 0 and 1 => 0
 	shr	$1,%r11					# 2 and 3 => 1

 	mov	%r10,%rax
 	mov	%r11,%rcx

 	#ADDED
 	xor	%r10,%rdi				# xor last 2 bits of region for cos
 	xor	%r11,%rsi				# xor last 2 bits of region for cos
 	#ADDED

 	not 	%r12					#~(sign)
 	not 	%r13					#~(sign)
 	and	%r12,%r10				#region & ~(sign)
 	and	%r13,%r11				#region & ~(sign)

 	not	%rax					#~(region)
 	not	%rcx					#~(region)
 	not	%r12					#~~(sign)
 	not	%r13					#~~(sign)
 	and	%r12,%rax				#~region & ~~(sign)
 	and	%r13,%rcx				#~region & ~~(sign)

 	#ADDED
 	and	.L__reald_one_one(%rip),%rdi		# sign for cos
 	and	.L__reald_one_one(%rip),%rsi		# sign for cos
 	#ADDED

 	or	%rax,%r10				# combine both terms: region XOR sign
 	or	%rcx,%r11
 	and	.L__reald_one_one(%rip),%r10		# sign for sin
 	and	.L__reald_one_one(%rip),%r11		# sign for sin


 	mov	%r10,%r12
 	mov	%r11,%r13

 	#ADDED
 	mov	%rdi,%rax
 	mov	%rsi,%rcx
 	#ADDED

 	and	.L__reald_one_zero(%rip),%r12		#mask out the lower sign bit leaving the upper sign bit
 	and	.L__reald_one_zero(%rip),%r13		#mask out the lower sign bit leaving the upper sign bit

 	#ADDED
 	and	.L__reald_one_zero(%rip),%rax		#mask out the lower sign bit leaving the upper sign bit
 	and	.L__reald_one_zero(%rip),%rcx		#mask out the lower sign bit leaving the upper sign bit
 	#ADDED

 	shl	$63,%r10				#shift lower sign bit left by 63 bits
 	shl	$63,%r11				#shift lower sign bit left by 63 bits
 	shl	$31,%r12				#shift upper sign bit left by 31 bits
 	shl	$31,%r13				#shift upper sign bit left by 31 bits

 	#ADDED
 	shl	$63,%rdi				#shift lower sign bit left by 63 bits
 	shl	$63,%rsi				#shift lower sign bit left by 63 bits
 	shl	$31,%rax				#shift upper sign bit left by 31 bits
 	shl	$31,%rcx				#shift upper sign bit left by 31 bits
 	#ADDED

 	mov 	 %r10,p_sign_sin(%rsp)		#write out lower sign bit
 	mov 	 %r12,p_sign_sin+8(%rsp)		#write out upper sign bit
 	mov 	 %r11,p_sign1_sin(%rsp)		#write out lower sign bit
 	mov 	 %r13,p_sign1_sin+8(%rsp)	#write out upper sign bit

 	mov 	 %rdi,p_sign_cos(%rsp)		#write out lower sign bit
 	mov 	 %rax,p_sign_cos+8(%rsp)		#write out upper sign bit
 	mov 	 %rsi,p_sign1_cos(%rsp)		#write out lower sign bit
 	mov 	 %rcx,p_sign1_cos+8(%rsp)	#write out upper sign bit

 # NEW

 # GET_BITS_DP64(rhead-rtail, uy);			   		; originally only rhead
 # xmm4  = Sign, xmm10 =rhead, xmm8 =rtail
 # xmm5  = Sign, xmm1 =rhead, xmm9 =rtail
 	movapd	%xmm10,%xmm6						# rhead
 	movapd	%xmm1,%xmm7						# rhead

 	subpd	%xmm8,%xmm10						# r = rhead - rtail
 	subpd	%xmm9,%xmm1						# r = rhead - rtail

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # xmm4  = Sign, xmm10 = r, xmm6 =rhead, xmm8 =rtail
 # xmm5  = Sign, xmm1 = r, xmm7 =rhead, xmm9 =rtail

 #	subpd	%xmm10,%xmm6				;rr=rhead-r
 #	subpd	%xmm1,%xmm7				;rr=rhead-r

 	mov	%r8,%rax
 	mov	%r9,%rcx

 	movapd	%xmm10,%xmm2				# move r for r2
 	movapd	%xmm1,%xmm3				# move r for r2

 	mulpd	%xmm10,%xmm2				# r2
 	mulpd	%xmm1,%xmm3				# r2

 #	subpd	xmm6, xmm8				;rr=(rhead-r) -rtail
 #	subpd	xmm7, xmm9				;rr=(rhead-r) -rtail

 	and	.L__reald_zero_one(%rip),%rax		# region for jump table
 	and	.L__reald_zero_one(%rip),%rcx
 	shr	$31,%r8					# odd bit of the upper lane moves to bit 1
 	shr	$31,%r9
 	or	%r8,%rax				# rax = 2-bit odd/even code for inputs 0,1
 	or	%r9,%rcx				# rcx = 2-bit odd/even code for inputs 2,3
 	shl	$2,%rcx
 	or	%rcx,%rax				# rax = 4-bit jump table index

 # HARSHA ADDED
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Evaluate both polynomials for all four reduced arguments, then dispatch
 # through the jump table on the odd/even region code in rax.
 #   cos: 1 - 0.5*x2 + x4*(c1 + c2*x2 + x4*(c3 + c4*x2))   -> xmm4, xmm5
 #   sin: x + x3*(s1 + s2*x2 + x4*(s3 + s4*x2))            -> xmm6, xmm7
 # where x is the reduced argument r and x2 is its square.
 # p_sign_sin   = Sign, p_sign_cos = Sign,  xmm10 = r, xmm2 = r2
 # p_sign1_sin  = Sign, p_sign1_cos = Sign, xmm1 = r,  xmm3 = r2
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 	movapd	%xmm2,%xmm14					# for x3
 	movapd	%xmm3,%xmm15					# for x3

 	movapd	%xmm2,%xmm0					# for r
 	movapd	%xmm3,%xmm11					# for r

 	movdqa	.Lcosarray+0x30(%rip),%xmm4			# c4
 	movdqa	.Lcosarray+0x30(%rip),%xmm5			# c4

 	movapd	.Lcosarray+0x10(%rip),%xmm8			# c2
 	movapd	.Lcosarray+0x10(%rip),%xmm9			# c2

 	movdqa	.Lsinarray+0x30(%rip),%xmm6			# s4
 	movdqa	.Lsinarray+0x30(%rip),%xmm7			# s4

 	movapd	.Lsinarray+0x10(%rip),%xmm12			# s2
 	movapd	.Lsinarray+0x10(%rip),%xmm13			# s2

 	mulpd	.L__real_3fe0000000000000(%rip),%xmm0		# r = 0.5 *x2
 	mulpd	.L__real_3fe0000000000000(%rip),%xmm11		# r = 0.5 *x2

 	mulpd	%xmm10,%xmm14					# x3
 	mulpd	%xmm1,%xmm15					# x3

 	mulpd	%xmm2,%xmm4					# c4*x2
 	mulpd	%xmm3,%xmm5					# c4*x2

 	mulpd	%xmm2,%xmm8					# c2*x2
 	mulpd	%xmm3,%xmm9					# c2*x2

 	mulpd	%xmm2,%xmm6					# s4*x2
 	mulpd	%xmm3,%xmm7					# s4*x2

 	mulpd	%xmm2,%xmm12					# s2*x2
 	mulpd	%xmm3,%xmm13					# s2*x2

 	subpd	.L__real_3ff0000000000000(%rip),%xmm0		# -t=r-1.0	;trash r
 	subpd	.L__real_3ff0000000000000(%rip),%xmm11	# -t=r-1.0	;trash r

 	mulpd	%xmm2,%xmm2					# x4
 	mulpd	%xmm3,%xmm3					# x4

 	addpd	.Lcosarray+0x20(%rip),%xmm4			# c3+x2c4
 	addpd	.Lcosarray+0x20(%rip),%xmm5			# c3+x2c4

 	addpd	.Lcosarray(%rip),%xmm8			# c1+x2c2
 	addpd	.Lcosarray(%rip),%xmm9			# c1+x2c2

 	addpd	.Lsinarray+0x20(%rip),%xmm6			# s3+x2s4
 	addpd	.Lsinarray+0x20(%rip),%xmm7			# s3+x2s4

 	addpd	.Lsinarray(%rip),%xmm12			# s1+x2s2
 	addpd	.Lsinarray(%rip),%xmm13			# s1+x2s2

 	mulpd	%xmm2,%xmm4					# x4(c3+x2c4)
 	mulpd	%xmm3,%xmm5					# x4(c3+x2c4)

 	mulpd	%xmm2,%xmm6					# x4(s3+x2s4)
 	mulpd	%xmm3,%xmm7					# x4(s3+x2s4)

 	addpd	%xmm8,%xmm4					# zc
 	addpd	%xmm9,%xmm5					# zc

 	addpd	%xmm12,%xmm6					# zs
 	addpd	%xmm13,%xmm7					# zs

 	mulpd	%xmm2,%xmm4					# x4 * zc
 	mulpd	%xmm3,%xmm5					# x4 * zc

 	mulpd	%xmm14,%xmm6					# x3 * zs
 	mulpd	%xmm15,%xmm7					# x3 * zs

 	subpd   %xmm0,%xmm4					# - (-t)
 	subpd   %xmm11,%xmm5					# - (-t)

 	addpd	%xmm10,%xmm6					# +x
 	addpd	%xmm1,%xmm7					# +x

 # HARSHA ADDED

 	lea	.Levensin_oddcos_tbl(%rip),%rcx
 	jmp	*(%rcx,%rax,8)					#Jmp table for cos/sin calculation based on even/odd region


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Slow path: input 0 is >= 5e5 (or NaN/Inf).
 # When input 1 is also >= 5e5 control moves to .Lboth_arg_gt_5e5.  Otherwise
 # input 1 is reduced inline with scalar instructions and input 0 is handed
 # to __remainder_piby2d2f, unless it is NaN/Inf, in which case it is quieted
 # and stored directly with region 0.  Results go to the r/rr/region slots.
 # r8, r9, xmm1, xmm3 and xmm7 carry inputs 2 and 3 and are spilled around
 # the call because the callee may overwrite them.
 .align 16
 .Lfirst_or_next3_arg_gt_5e5:
 # %rcx,,%rax r8, r9

 	cmp	%r10,%rcx				#is upper arg >= 5e5
 	jae	.Lboth_arg_gt_5e5

 .Llower_arg_gt_5e5:
 # Upper Arg is < 5e5, Lower arg is >= 5e5
 # %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
 # Be sure not to use %xmm3,%xmm1 and xmm7
 # Use %xmm8,,%xmm5 xmm0, xmm12
 #	    %xmm11,,%xmm9 xmm13


 	movlpd	 %xmm10,r(%rsp)		#Save lower fp arg for remainder_piby2 call
 	movhlps	%xmm10,%xmm10			#Needed since we want to work on upper arg
 	movhlps	%xmm2,%xmm2
 	movhlps	%xmm6,%xmm6

 # Work on Upper arg
 # Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs
 	mulsd	.L__real_3fe45f306dc9c883(%rip),%xmm2		# x*twobypi
 	addsd	%xmm4,%xmm2					# xmm2 = npi2=(x*twobypi+0.5)
 	movsd	.L__real_3ff921fb54400000(%rip),%xmm8		# xmm8 = piby2_1
 	cvttsd2si	%xmm2,%ecx				# ecx = npi2 trunc to ints
 	movsd	.L__real_3dd0b4611a600000(%rip),%xmm0		# xmm0 = piby2_2
 	cvtsi2sd	%ecx,%xmm2				# xmm2 = npi2 trunc to doubles

 #/* Subtract the multiple from x to get an extra-precision remainder */
 #rhead  = x - npi2 * piby2_1;
 	mulsd	%xmm2,%xmm8					# npi2 * piby2_1
 	subsd	%xmm8,%xmm6					# xmm6 = rhead =(x-npi2*piby2_1)
 	movsd	.L__real_3ba3198a2e037073(%rip),%xmm12		# xmm12 =piby2_2tail

 #t  = rhead;
        movsd	%xmm6,%xmm5					# xmm5 = t = rhead

 #rtail  = npi2 * piby2_2;
        mulsd	%xmm2,%xmm0					# xmm0 =rtail=(npi2*piby2_2)

 #rhead  = t - rtail
        subsd	%xmm0,%xmm6					# xmm6 =rhead=(t-rtail)

 #rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulsd	%xmm2,%xmm12     					# npi2 * piby2_2tail
        subsd	%xmm6,%xmm5					# t-rhead
        subsd	%xmm5,%xmm0					# (rtail-(t-rhead))
        addsd	%xmm12,%xmm0					# rtail=npi2*piby2_2tail+(rtail-(t-rhead));

 #r =  rhead - rtail
 #rr = (rhead-r) -rtail
        mov	 %ecx,region+4(%rsp)			# store upper region
        movsd	%xmm6,%xmm10
        subsd	%xmm0,%xmm10					# xmm10 = r=(rhead-rtail)
        subsd	%xmm10,%xmm6					# rr=rhead-r
        subsd	%xmm0,%xmm6					# xmm6 = rr=((rhead-r) -rtail)
        movlpd	 %xmm10,r+8(%rsp)			# store upper r
        movlpd	 %xmm6,rr+8(%rsp)			# store upper rr

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #Note that volatiles will be trashed by the call
 #We will construct r, rr, region and sign

 # Work on Lower arg
 	mov		$0x07ff0000000000000,%r11			# is lower arg nan/inf
 	mov		%r11,%r10
 	and		%rax,%r10				# isolate the exponent field
 	cmp		%r11,%r10				# all ones means NaN or Inf
 	jz		.L__vrs4_sincosf_lower_naninf

 	mov	  %r8,p_temp(%rsp)			# spill state of inputs 2,3 around the call
 	mov	  %r9,p_temp2(%rsp)
 	movapd	  %xmm1,p_temp1(%rsp)
 	movapd	  %xmm3,p_temp3(%rsp)
 	movapd	  %xmm7,p_temp5(%rsp)

 	lea	 region(%rsp),%rdx			# lower arg is **NOT** nan/inf
 	lea	 r(%rsp),%rsi

 # changed input from xmm10 to xmm0
 	mov	 r(%rsp),%rdi				#Restore lower fp arg for remainder_piby2 call

 	call	 __remainder_piby2d2f@PLT

 	mov	 p_temp(%rsp),%r8
 	mov	 p_temp2(%rsp),%r9
 	movapd	 p_temp1(%rsp),%xmm1
 	movapd	 p_temp3(%rsp),%xmm3
 	movapd	 p_temp5(%rsp),%xmm7
 	jmp 	0f

 .L__vrs4_sincosf_lower_naninf:
 	mov	$0x00008000000000000,%r11
 	or	%r11,%rax				# set the top mantissa bit (quiet NaN)
 	mov	 %rax,r(%rsp)				# r = x | 0x0008000000000000
 	mov	 %r10d,region(%rsp)			# region =0 (low 32 bits of the exponent mask are zero)

 .align 16
 0:

 	jmp 	.Lcheck_next2_args


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Slow path: inputs 0 and 1 are both >= 5e5 (or NaN/Inf).
 # Each of the two is either handed to __remainder_piby2d2f, which receives
 # the bit pattern of |x| in rdi, a pointer for r in rsi and a pointer for the
 # region in rdx, or, when it is NaN/Inf, quieted and stored with region 0.
 # On entry rax / rcx hold the bit patterns of |x| for input 0 / input 1.
 # r8, r9, xmm1, xmm3 and xmm7 carry inputs 2 and 3 and are spilled around
 # every call; rcx is spilled around the first call as well.
 #
 # The upper argument for the second call is taken from rcx.  It used to be
 # read back from xmm6, but xmm6 is a caller-saved register in the System V
 # AMD64 ABI, so the first call is free to overwrite it.
 .align 16
 .Lboth_arg_gt_5e5:
 #Upper Arg is >= 5e5, Lower arg is >= 5e5
 # %rcx,,%rax r8, r9
 # %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5

 	movhlps %xmm10,%xmm6		# xmm6 low = upper |x| (kept as before; no longer relied on after a call)

 	mov		$0x07ff0000000000000,%r11			#is lower arg nan/inf
 	mov		%r11,%r10
 	and		%rax,%r10				# isolate the exponent field
 	cmp		%r11,%r10				# all ones means NaN or Inf
 	jz		.L__vrs4_sincosf_lower_naninf_of_both_gt_5e5

 	mov	  %rcx,p_temp(%rsp)			#Save upper arg
 	mov	  %r8,p_temp2(%rsp)
 	mov	  %r9,p_temp4(%rsp)
 	movapd	  %xmm1,p_temp1(%rsp)
 	movapd	  %xmm3,p_temp3(%rsp)
 	movapd	  %xmm7,p_temp5(%rsp)

 	lea	 region(%rsp),%rdx			#lower arg is **NOT** nan/inf
 	lea	 r(%rsp),%rsi

 # added ins- changed input from xmm10 to xmm0
 	movd	%xmm10,%rdi				# rdi = bit pattern of lower |x|

 	call	 __remainder_piby2d2f@PLT

 	mov	 p_temp2(%rsp),%r8
 	mov	 p_temp4(%rsp),%r9
 	movapd	 p_temp1(%rsp),%xmm1
 	movapd	 p_temp3(%rsp),%xmm3
 	movapd	 p_temp5(%rsp),%xmm7

 	mov	 p_temp(%rsp),%rcx			#Restore upper arg
 	jmp 	0f

 .L__vrs4_sincosf_lower_naninf_of_both_gt_5e5:				#lower arg is nan/inf
 	mov	$0x00008000000000000,%r11
 	or	%r11,%rax				# set the top mantissa bit (quiet NaN)
 	mov	 %rax,r(%rsp)				#r = x | 0x0008000000000000
 	mov	 %r10d,region(%rsp)			#region = 0

 .align 16
 0:
 	mov		$0x07ff0000000000000,%r11			#is upper arg nan/inf
 	mov		%r11,%r10
 	and		%rcx,%r10
 	cmp		%r11,%r10
 	jz		.L__vrs4_sincosf_upper_naninf_of_both_gt_5e5


 	mov	  %r8,p_temp2(%rsp)
 	mov	  %r9,p_temp4(%rsp)
 	movapd	  %xmm1,p_temp1(%rsp)
 	movapd	  %xmm3,p_temp3(%rsp)
 	movapd	  %xmm7,p_temp5(%rsp)

 	lea	 region+4(%rsp),%rdx			#upper arg is **NOT** nan/inf
 	lea	 r+8(%rsp),%rsi

 # rcx still holds the bit pattern of upper |x|: it was restored from p_temp
 # after the first call, or never modified on the NaN/Inf path.
 	mov	 %rcx,%rdi				# upper fp arg for remainder_piby2 call

 	call	 __remainder_piby2d2f@PLT

 	mov	 p_temp2(%rsp),%r8
 	mov	 p_temp4(%rsp),%r9
 	movapd	 p_temp1(%rsp),%xmm1
 	movapd	 p_temp3(%rsp),%xmm3
 	movapd	 p_temp5(%rsp),%xmm7

 	jmp 	0f

 .L__vrs4_sincosf_upper_naninf_of_both_gt_5e5:
 	mov	$0x00008000000000000,%r11
 	or	%r11,%rcx				# set the top mantissa bit (quiet NaN)
 	mov	 %rcx,r+8(%rsp)				#r = x | 0x0008000000000000
 	mov	 %r10d,region+4(%rsp)			#region = 0

 .align 16
 0:
 	jmp 	.Lcheck_next2_args

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Slow path: input 0 is < 5e5 but input 1 is >= 5e5 (or NaN/Inf).
 # Input 0 is reduced inline with scalar instructions; input 1 is handed to
 # __remainder_piby2d2f, or quieted and stored with region 0 when it is
 # NaN/Inf.  r8, r9, xmm1, xmm3 and xmm7 (inputs 2 and 3) are spilled around
 # the call.
 .align 16
 .Lsecond_or_next2_arg_gt_5e5:

 # Upper Arg is >= 5e5, Lower arg is < 5e5
 # %rcx,,%rax r8, r9
 # %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
 # Do not use %xmm3,,%xmm1 xmm7
 # Restore xmm4 and %xmm3,,%xmm1 xmm7
 # Can use %xmm0,,%xmm8 xmm12
 #   %xmm9,,%xmm5 xmm11, xmm13

 	movhpd	 %xmm10,r+8(%rsp)	#Save upper fp arg for remainder_piby2 call

 # Work on Lower arg
 # Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg

 	mulsd	.L__real_3fe45f306dc9c883(%rip),%xmm2		# x*twobypi
 	addsd	%xmm4,%xmm2					# xmm2 = npi2=(x*twobypi+0.5)
 	movsd	.L__real_3ff921fb54400000(%rip),%xmm8		# xmm8 = piby2_1
 	cvttsd2si	%xmm2,%eax				# eax = npi2 trunc to ints
 	movsd	.L__real_3dd0b4611a600000(%rip),%xmm0		# xmm0 = piby2_2
 	cvtsi2sd	%eax,%xmm2				# xmm2 = npi2 trunc to doubles

 #/* Subtract the multiple from x to get an extra-precision remainder */
 #rhead  = x - npi2 * piby2_1;
 	mulsd	%xmm2,%xmm8					# npi2 * piby2_1
 	subsd	%xmm8,%xmm6					# xmm6 = rhead =(x-npi2*piby2_1)
 	movsd	.L__real_3ba3198a2e037073(%rip),%xmm12		# xmm12 =piby2_2tail

 #t  = rhead;
        movsd	%xmm6,%xmm5					# xmm5 = t = rhead

 #rtail  = npi2 * piby2_2;
        mulsd	%xmm2,%xmm0					# xmm0 =rtail=(npi2*piby2_2)

 #rhead  = t - rtail
        subsd	%xmm0,%xmm6					# xmm6 =rhead=(t-rtail)

 #rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulsd	%xmm2,%xmm12     					# npi2 * piby2_2tail
        subsd	%xmm6,%xmm5					# t-rhead
        subsd	%xmm5,%xmm0					# (rtail-(t-rhead))
        addsd	%xmm12,%xmm0					# rtail=npi2*piby2_2tail+(rtail-(t-rhead));

 #r =  rhead - rtail
 #rr = (rhead-r) -rtail
        mov	 %eax,region(%rsp)			# store lower region

 #       movsd	%xmm6,%xmm10
 #       subsd	xmm10,xmm0					; xmm10 = r=(rhead-rtail)
 #       subsd	%xmm10,%xmm6					; rr=rhead-r
 #       subsd	xmm6, xmm0					; xmm6 = rr=((rhead-r) -rtail)

         subsd	%xmm0,%xmm6					# xmm6 = r=(rhead-rtail)

 #       movlpd	QWORD PTR r[rsp], xmm10				; store upper r
 #       movlpd	QWORD PTR rr[rsp], xmm6				; store upper rr

         movlpd	 %xmm6,r(%rsp)				# store lower r


 #Work on Upper arg
 #Note that volatiles will be trashed by the call
 #We do not care since this is the last check
 #We will construct r, rr, region and sign
 	mov		$0x07ff0000000000000,%r11			# is upper arg nan/inf
 	mov		%r11,%r10
 	and		%rcx,%r10				# isolate the exponent field
 	cmp		%r11,%r10				# all ones means NaN or Inf
 	jz		.L__vrs4_sincosf_upper_naninf

 	mov	 %r8,p_temp(%rsp)			# spill state of inputs 2,3 around the call
 	mov	 %r9,p_temp2(%rsp)
 	movapd	 %xmm1,p_temp1(%rsp)
 	movapd	 %xmm3,p_temp3(%rsp)
 	movapd	 %xmm7,p_temp5(%rsp)

 	lea	 region+4(%rsp),%rdx			# upper arg is **NOT** nan/inf
 	lea	 r+8(%rsp),%rsi

 # changed input from xmm10 to xmm0
 	mov	 r+8(%rsp),%rdi				#Restore upper fp arg for remainder_piby2 call

 	call	 __remainder_piby2d2f@PLT

 	mov	p_temp(%rsp),%r8
 	mov	p_temp2(%rsp),%r9
 	movapd	p_temp1(%rsp),%xmm1
 	movapd	p_temp3(%rsp),%xmm3
 	movapd	p_temp5(%rsp),%xmm7
 	jmp 	0f

 .L__vrs4_sincosf_upper_naninf:
 	mov	$0x00008000000000000,%r11
 	or	%r11,%rcx				# set the top mantissa bit (quiet NaN)
 	mov	 %rcx,r+8(%rsp)				# r = x | 0x0008000000000000
 	mov	 %r10d,region+4(%rsp)			# region =0

 .align 16
 0:

 	jmp	.Lcheck_next2_args


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Inputs 0 and 1 have been reduced on a slow path; now examine inputs 2, 3.
 # r8 / r9 hold the bit patterns of |x| for inputs 2 / 3, xmm3 and xmm7 hold
 # |x| for both.  Either input being >= 5e5 branches to another slow-path
 # block; otherwise both are reduced here with packed arithmetic, the results
 # are stored to r1/region1, and control continues at the reconstruct step.
 .align 16
 .Lcheck_next2_args:

 	mov	$0x411E848000000000,%r10			#5e5	+

 	cmp	%r10,%r8
 	jae	.Lfirst_second_done_third_or_fourth_arg_gt_5e5

 	cmp	%r10,%r9
 	jae	.Lfirst_second_done_fourth_arg_gt_5e5


 # Work on next two args, both < 5e5
 # %xmm3,,%xmm1 xmm5 = x, xmm4 = 0.5

 	movapd	.L__real_3fe0000000000000(%rip),%xmm4			#Restore 0.5

 	mulpd	.L__real_3fe45f306dc9c883(%rip),%xmm3						# * twobypi
 	addpd	%xmm4,%xmm3						# +0.5, npi2
 	movapd	.L__real_3ff921fb54400000(%rip),%xmm1		# piby2_1
 	cvttpd2dq	%xmm3,%xmm5					# convert packed double to packed integers
 	movapd	.L__real_3dd0b4611a600000(%rip),%xmm9		# piby2_2
 	cvtdq2pd	%xmm5,%xmm3					# and back to double.

 ###
 #      /* Subtract the multiple from x to get an extra-precision remainder */
 	movlpd	 %xmm5,region1(%rsp)						# Region (two packed 32-bit ints)
 ###

 #      rhead  = x - npi2 * piby2_1;
        mulpd	%xmm3,%xmm1						# npi2 * piby2_1;

 #      rtail  = npi2 * piby2_2;
        mulpd	%xmm3,%xmm9						# rtail

 #      rhead  = x - npi2 * piby2_1;
        subpd	%xmm1,%xmm7						# rhead  = x - npi2 * piby2_1;

 #      t  = rhead;
        movapd	%xmm7,%xmm1						# t

 #      rhead  = t - rtail;
        subpd	%xmm9,%xmm1						# rhead

 #      rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulpd	.L__real_3ba3198a2e037073(%rip),%xmm3		# npi2 * piby2_2tail

        subpd	%xmm1,%xmm7						# t-rhead
        subpd	%xmm7,%xmm9						# - ((t - rhead) - rtail)
        addpd	%xmm3,%xmm9						# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);

 #       movapd	%xmm1,%xmm7						; rhead
        subpd	%xmm9,%xmm1						# r = rhead - rtail
        movapd	 %xmm1,r1(%rsp)					# store r for inputs 2 and 3

 #       subpd	%xmm1,%xmm7						; rr=rhead-r
 #       subpd	xmm7, xmm9						; rr=(rhead-r) -rtail
 #       movapd	OWORD PTR rr1[rsp], xmm7

 	jmp	.L__vrs4_sincosf_reconstruct


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .align 16
 .Lthird_or_fourth_arg_gt_5e5:
 # First two args are < 5e5; the third arg is >= 5e5, the fourth may be either.
 # This part reduces the first pair with the packed path and stores r / region,
 # then falls through to .Lfirst_second_done_third_or_fourth_arg_gt_5e5.
 # In:  xmm2, xmm6 = packed x of the first pair, xmm4 = 0.5 (xmm10 is overwritten below)
 #      r8, r9 and xmm1, xmm3, xmm7 belong to the second pair and must not be touched here
 # Per the original note, xmm0, xmm5, xmm8, xmm9, xmm11, xmm12, xmm13 are free to use.
 # xmm4 is overwritten here (the next stage reloads 0.5 itself).

 # Work on first two args, both < 5e5


 	mulpd	.L__real_3fe45f306dc9c883(%rip),%xmm2		# xmm2 = x * twobypi
 	addpd	%xmm4,%xmm2						# xmm2 = x * twobypi + 0.5
 	movapd	.L__real_3ff921fb54400000(%rip),%xmm10		# xmm10 = piby2_1
 	cvttpd2dq	%xmm2,%xmm4					# xmm4 = npi2 as two packed int32 (truncated)
 	movapd	.L__real_3dd0b4611a600000(%rip),%xmm8		# xmm8 = piby2_2
 	cvtdq2pd	%xmm4,%xmm2					# xmm2 = npi2 converted back to double

 ###
 #      /* Subtract the multiple from x to get an extra-precision remainder */
 	movlpd	 %xmm4,region(%rsp)				# store both int32 regions with one 8-byte store
 ###

 #      rhead  = x - npi2 * piby2_1;
        mulpd	%xmm2,%xmm10						# xmm10 = npi2 * piby2_1
 #      rtail  = npi2 * piby2_2;
        mulpd	%xmm2,%xmm8						# xmm8 = rtail

 #      rhead  = x - npi2 * piby2_1;
        subpd	%xmm10,%xmm6						# xmm6 = rhead = x - npi2 * piby2_1

 #      t  = rhead;
        movapd	%xmm6,%xmm10						# xmm10 = t

 #      rhead  = t - rtail;
        subpd	%xmm8,%xmm10						# xmm10 = rhead = t - rtail

 #      rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulpd	.L__real_3ba3198a2e037073(%rip),%xmm2		# xmm2 = npi2 * piby2_2tail

        subpd	%xmm10,%xmm6						# xmm6 = t - rhead
        subpd	%xmm6,%xmm8						# xmm8 = rtail - (t - rhead)
        addpd	%xmm2,%xmm8						# xmm8 = rtail = npi2 * piby2_2tail - ((t - rhead) - rtail)

 #       movapd	%xmm10,%xmm6						; rhead
        subpd	%xmm8,%xmm10						# xmm10 = r = rhead - rtail
        movapd	 %xmm10,r(%rsp)					# store both reduced arguments of the first pair

 # (disabled: the rr correction term is not computed in this float version)
 #       subpd	%xmm10,%xmm6						; rr=rhead-r
 #       subpd	xmm6, xmm8						; rr=(rhead-r) -rtail
 #       movapd	OWORD PTR rr[rsp], xmm6


 # Work on next two args, third arg >= 5e5, fourth arg >= 5e5 or < 5e5

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .Lfirst_second_done_third_or_fourth_arg_gt_5e5:
 # Second pair, lower arg >= 5e5; the first pair is already stored in r / region.
 # In:  r8 / r9 = bit patterns of the lower / upper argument
 #      xmm1, xmm3, xmm7 = packed x of the pair
 # Out: r1 / region1 filled for both arguments, then jump to reconstruct.


 	mov	$0x411E848000000000,%r10			# bit pattern of the double 5e5
 	cmp	%r10,%r9
 	jae	.Lboth_arg_gt_5e5_higher			# upper arg is >= 5e5 as well


 # Upper Arg is <5e5, Lower arg is >= 5e5

 	movlpd	 %xmm1,r1(%rsp)		#Save lower fp arg for remainder_piby2 call
 	movhlps	%xmm1,%xmm1			# move the upper arg into the low lane of each register
 	movhlps	%xmm3,%xmm3
 	movhlps	%xmm7,%xmm7


 # Work on Upper arg
 # Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs
 	movapd	.L__real_3fe0000000000000(%rip),%xmm4		# xmm4 = 0.5

 	mulsd	.L__real_3fe45f306dc9c883(%rip),%xmm3		# xmm3 = x*twobypi
 	addsd	%xmm4,%xmm3					# xmm3 = npi2=(x*twobypi+0.5)
 	movsd	.L__real_3ff921fb54400000(%rip),%xmm2		# xmm2 = piby2_1
 	cvttsd2si	%xmm3,%r9d				# r9d = npi2 truncated to int (r9 no longer holds the arg bits)
 	movsd	.L__real_3dd0b4611a600000(%rip),%xmm10		# xmm10 = piby2_2
 	cvtsi2sd	%r9d,%xmm3				# xmm3 = npi2 as a double

 #/* Subtract the multiple from x to get an extra-precision remainder */
 #rhead  = x - npi2 * piby2_1;
 	mulsd	%xmm3,%xmm2					# xmm2 = npi2 * piby2_1
 	subsd	%xmm2,%xmm7					# xmm7 = rhead =(x-npi2*piby2_1)
 	movsd	.L__real_3ba3198a2e037073(%rip),%xmm6		# xmm6 =piby2_2tail

 #t  = rhead;
        movsd	%xmm7,%xmm5					# xmm5 = t = rhead

 #rtail  = npi2 * piby2_2;
        mulsd	%xmm3,%xmm10					# xmm10 =rtail=(npi2*piby2_2)

 #rhead  = t - rtail
        subsd	%xmm10,%xmm7					# xmm7 =rhead=(t-rtail)

 #rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulsd	%xmm3,%xmm6     					# xmm6 = npi2 * piby2_2tail
        subsd	%xmm7,%xmm5					# xmm5 = t-rhead
        subsd	%xmm5,%xmm10					# xmm10 = (rtail-(t-rhead))
        addsd	%xmm6,%xmm10					# xmm10 = rtail=npi2*piby2_2tail+(rtail-(t-rhead));

 #r =  rhead - rtail
 # (rr is not computed in this float version)
        mov	 %r9d,region1+4(%rsp)			# store upper region


        subsd	%xmm10,%xmm7					# xmm7 = r=(rhead-rtail)

        movlpd	 %xmm7,r1+8(%rsp)			# store upper r


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #Note that volatiles will be trashed by the call
 #We do not care since this is the last check
 #(reconstruct reloads r and region from the stack)

 # Work on Lower arg
 	mov		$0x07ff0000000000000,%r11			# r11 = exponent mask 0x7ff0000000000000
 	mov		%r11,%r10
 	and		%r8,%r10					# r10 = exponent field of the lower arg
 	cmp		%r11,%r10
 	jz		.L__vrs4_sincosf_lower_naninf_higher		# all exponent bits set: nan/inf

 	lea	 region1(%rsp),%rdx			# rdx = &region1 (lower); lower arg is **NOT** nan/inf
 	lea	 r1(%rsp),%rsi				# rsi = &r1 (lower)

 # the argument is passed as an integer bit pattern in rdi
 	mov	 r1(%rsp),%rdi				#Restore lower fp arg for remainder_piby2 call

 	call	 __remainder_piby2d2f@PLT

 	jmp 	0f

 .L__vrs4_sincosf_lower_naninf_higher:
 	mov	$0x00008000000000000,%r11		# r11 = 0x0008000000000000 (bit 51)
 	or	%r11,%r8
 	mov	 %r8,r1(%rsp)				# r = x | 0x0008000000000000
 	mov	 %r10d,region1(%rsp)			# region = 0 (r10 == 0x7ff0000000000000 here, so r10d is 0)

 .align 16
 0:
 	jmp 	.L__vrs4_sincosf_reconstruct


 .align 16
 .Lboth_arg_gt_5e5_higher:
 # Second pair, both arguments >= 5e5.
 # In:  r8 / r9 = bit patterns of the lower / upper argument, xmm1 = packed x of the pair
 # Out: r1 / region1 filled for both arguments, then jump to reconstruct.
 #
 # Each argument that is not nan/inf goes through __remainder_piby2d2f, called with
 #   rdi = argument bit pattern, rsi = address of its r slot, rdx = address of its region slot.
 # A nan/inf argument gets r = x | 0x0008000000000000 and region = 0 instead.
 #
 # The upper argument has to survive the first call.  All xmm registers are
 # caller-saved, so it is parked in its own (still unused) r1+8 slot on the stack
 # rather than in a register; the call for the lower argument only writes r1+0.


 	movhpd	 %xmm1,r1+8(%rsp)		#Save upper fp arg for remainder_piby2 call

 	mov		$0x07ff0000000000000,%r11			#is lower arg nan/inf
 	mov		%r11,%r10
 	and		%r8,%r10
 	cmp		%r11,%r10
 	jz		.L__vrs4_sincosf_lower_naninf_of_both_gt_5e5_higher

 	mov	  %r9,p_temp1(%rsp)			#Save upper arg bits (r9 is caller-saved)
 	lea	  region1(%rsp),%rdx			#lower arg is **NOT** nan/inf
 	lea	  r1(%rsp),%rsi

 	movd	 %xmm1,%rdi				#rdi = bit pattern of the lower arg

 	call	 __remainder_piby2d2f@PLT

 	mov	 p_temp1(%rsp),%r9			#Restore upper arg bits


 	jmp 	0f

 .L__vrs4_sincosf_lower_naninf_of_both_gt_5e5_higher:				#lower arg is nan/inf
 	mov	$0x00008000000000000,%r11
 	or	%r11,%r8
 	mov	 %r8,r1(%rsp)				#r = x | 0x0008000000000000
 	mov	 %r10d,region1(%rsp)			#region = 0 (r10 == 0x7ff0000000000000 here, so r10d is 0)

 .align 16
 0:
 	mov		$0x07ff0000000000000,%r11			#is upper arg nan/inf
 	mov		%r11,%r10
 	and		%r9,%r10
 	cmp		%r11,%r10
 	jz		.L__vrs4_sincosf_upper_naninf_of_both_gt_5e5_higher

 	lea	 region1+4(%rsp),%rdx			#upper arg is **NOT** nan/inf
 	lea	 r1+8(%rsp),%rsi

 	mov	 r1+8(%rsp),%rdi			#Restore upper fp arg (parked above) for remainder_piby2 call


 	call	 __remainder_piby2d2f@PLT

 	jmp 	0f

 .L__vrs4_sincosf_upper_naninf_of_both_gt_5e5_higher:
 	mov	$0x00008000000000000,%r11
 	or	%r11,%r9
 	mov	 %r9,r1+8(%rsp)				#r = x | 0x0008000000000000
 	mov	 %r10d,region1+4(%rsp)			#region = 0 (r10 == 0x7ff0000000000000 here, so r10d is 0)

 .align 16
 0:

 	jmp 	.L__vrs4_sincosf_reconstruct

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .align 16
 .Lfourth_arg_gt_5e5:
 # First two args are < 5e5, third arg < 5e5, fourth arg >= 5e5.
 # This part reduces the first pair with the packed path and stores r / region,
 # then falls through to .Lfirst_second_done_fourth_arg_gt_5e5.
 # In:  xmm2, xmm6 = packed x of the first pair, xmm4 = 0.5 (xmm10 is overwritten below)
 #      r8, r9 hold the bit patterns of the second pair and are not touched here

 # Work on first two args, both < 5e5

 	mulpd	.L__real_3fe45f306dc9c883(%rip),%xmm2		# xmm2 = x * twobypi
 	addpd	%xmm4,%xmm2						# xmm2 = x * twobypi + 0.5
 	movapd	.L__real_3ff921fb54400000(%rip),%xmm10		# xmm10 = piby2_1
 	cvttpd2dq	%xmm2,%xmm4					# xmm4 = npi2 as two packed int32 (truncated)
 	movapd	.L__real_3dd0b4611a600000(%rip),%xmm8		# xmm8 = piby2_2
 	cvtdq2pd	%xmm4,%xmm2					# xmm2 = npi2 converted back to double

 ###
 #      /* Subtract the multiple from x to get an extra-precision remainder */
 	movlpd	 %xmm4,region(%rsp)				# store both int32 regions with one 8-byte store
 ###

 #      rhead  = x - npi2 * piby2_1;
        mulpd	%xmm2,%xmm10						# xmm10 = npi2 * piby2_1
 #      rtail  = npi2 * piby2_2;
        mulpd	%xmm2,%xmm8						# xmm8 = rtail

 #      rhead  = x - npi2 * piby2_1;
        subpd	%xmm10,%xmm6						# xmm6 = rhead = x - npi2 * piby2_1

 #      t  = rhead;
        movapd	%xmm6,%xmm10						# xmm10 = t

 #      rhead  = t - rtail;
        subpd	%xmm8,%xmm10						# xmm10 = rhead = t - rtail

 #      rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulpd	.L__real_3ba3198a2e037073(%rip),%xmm2		# xmm2 = npi2 * piby2_2tail

        subpd	%xmm10,%xmm6						# xmm6 = t - rhead
        subpd	%xmm6,%xmm8						# xmm8 = rtail - (t - rhead)
        addpd	%xmm2,%xmm8						# xmm8 = rtail = npi2 * piby2_2tail - ((t - rhead) - rtail)

 #       movapd	%xmm10,%xmm6						; rhead
        subpd	%xmm8,%xmm10						# xmm10 = r = rhead - rtail
        movapd	 %xmm10,r(%rsp)					# store both reduced arguments of the first pair

 # (disabled: the rr correction term is not computed in this float version)
 #       subpd	%xmm10,%xmm6						; rr=rhead-r
 #       subpd	xmm6, xmm8						; rr=(rhead-r) -rtail
 #       movapd	OWORD PTR rr[rsp], xmm6


 # Work on next two args, third arg < 5e5, fourth arg >= 5e5
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .Lfirst_second_done_fourth_arg_gt_5e5:

 # Second pair: Upper Arg is >= 5e5, Lower arg is < 5e5; the first pair is already in r / region.
 # In:  r9 = bit pattern of the upper argument
 #      xmm1, xmm3, xmm7 = packed x of the pair
 # Out: r1 / region1 filled for both arguments, then jump to reconstruct.

 	movhpd	 %xmm1,r1+8(%rsp)	#Save upper fp arg for remainder_piby2 call


 # Work on Lower arg
 # Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg
 	movapd	.L__real_3fe0000000000000(%rip),%xmm4		# xmm4 = 0.5
 	mulsd	.L__real_3fe45f306dc9c883(%rip),%xmm3		# xmm3 = x*twobypi
 	addsd	%xmm4,%xmm3					# xmm3 = npi2=(x*twobypi+0.5)
 	movsd	.L__real_3ff921fb54400000(%rip),%xmm2		# xmm2 = piby2_1
 	cvttsd2si	%xmm3,%r8d				# r8d = npi2 truncated to int
 	movsd	.L__real_3dd0b4611a600000(%rip),%xmm10		# xmm10 = piby2_2
 	cvtsi2sd	%r8d,%xmm3				# xmm3 = npi2 as a double

 #/* Subtract the multiple from x to get an extra-precision remainder */
 #rhead  = x - npi2 * piby2_1;
 	mulsd	%xmm3,%xmm2					# xmm2 = npi2 * piby2_1
 	subsd	%xmm2,%xmm7					# xmm7 = rhead =(x-npi2*piby2_1)
 	movsd	.L__real_3ba3198a2e037073(%rip),%xmm6		# xmm6 =piby2_2tail

 #t  = rhead;
        movsd	%xmm7,%xmm5					# xmm5 = t = rhead

 #rtail  = npi2 * piby2_2;
        mulsd	%xmm3,%xmm10					# xmm10 =rtail=(npi2*piby2_2)

 #rhead  = t - rtail
        subsd	%xmm10,%xmm7					# xmm7 =rhead=(t-rtail)

 #rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
        mulsd	%xmm3,%xmm6     					# xmm6 = npi2 * piby2_2tail
        subsd	%xmm7,%xmm5					# xmm5 = t-rhead
        subsd	%xmm5,%xmm10					# xmm10 = (rtail-(t-rhead))
        addsd	%xmm6,%xmm10					# xmm10 = rtail=npi2*piby2_2tail+(rtail-(t-rhead));

 #r =  rhead - rtail
 # (rr is not computed in this float version)
        mov	 %r8d,region1(%rsp)			# store lower region

 # (disabled rr computation)
 #       movsd	%xmm7,%xmm1
 #       subsd	xmm1, xmm10					; xmm10 = r=(rhead-rtail)
 #       subsd	%xmm1,%xmm7					; rr=rhead-r
 #       subsd	xmm7, xmm10					; xmm6 = rr=((rhead-r) -rtail)

         subsd	%xmm10,%xmm7					# xmm7 = r=(rhead-rtail)

 #       movlpd	QWORD PTR r1[rsp], xmm1				; store upper r
 #       movlpd	QWORD PTR rr1[rsp], xmm7			; store upper rr

         movlpd	 %xmm7,r1(%rsp)				# store lower r

 #Work on Upper arg
 #Note that volatiles will be trashed by the call
 #We do not care since this is the last check
 #(reconstruct reloads r and region from the stack)
 	mov		$0x07ff0000000000000,%r11			# r11 = exponent mask 0x7ff0000000000000
 	mov		%r11,%r10
 	and		%r9,%r10					# r10 = exponent field of the upper arg
 	cmp		%r11,%r10
 	jz		.L__vrs4_sincosf_upper_naninf_higher		# all exponent bits set: nan/inf

 	lea	 region1+4(%rsp),%rdx			# rdx = &region1 (upper); upper arg is **NOT** nan/inf
 	lea	 r1+8(%rsp),%rsi			# rsi = &r1 (upper)

 # the argument is passed as an integer bit pattern in rdi
 	mov	 r1+8(%rsp),%rdi				#Restore upper fp arg for remainder_piby2 call

 	call	 __remainder_piby2d2f@PLT

 	jmp 	0f

 .L__vrs4_sincosf_upper_naninf_higher:
 	mov	$0x00008000000000000,%r11		# r11 = 0x0008000000000000 (bit 51)
 	or	%r11,%r9
 	mov	 %r9,r1+8(%rsp)				# r = x | 0x0008000000000000
 	mov	 %r10d,region1+4(%rsp)			# region = 0 (r10 == 0x7ff0000000000000 here, so r10d is 0)

 .align 16
 0:
 	jmp	.L__vrs4_sincosf_reconstruct


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .align 16
 .L__vrs4_sincosf_reconstruct:
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Common tail of every reduction path.
 # In (memory): r, r1 = reduced arguments (2 doubles each); region, region1 = 2 int32 each
 # In (regs):   r12, r13 = "sign" words for the first / second pair (the A term below);
 #              NOTE(review): they are set above this view -- confirm their layout
 # Steps:
 #   1. build the sin and cos sign masks and store them in p_sign*_sin / p_sign*_cos
 #   2. build a 4-bit index in rax from bit 0 of the four regions
 #   3. evaluate both the cos and the sin polynomial for all four arguments
 #   4. jump through .Levensin_oddcos_tbl[rax]
 # At the indirect jump: xmm4/xmm5 = cos polynomial of pair 1 / pair 2,
 #                       xmm6/xmm7 = sin polynomial of pair 1 / pair 2.
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 	movapd	r(%rsp),%xmm10				# xmm10 = r, first pair
 	movapd	r1(%rsp),%xmm1				# xmm1  = r, second pair

 	mov	region(%rsp),%r8			# r8 = both int32 regions of the first pair
 	mov	region1(%rsp),%r9			# r9 = both int32 regions of the second pair

 	mov 	%r8,%r10				# keep full copies for the sign computation
 	mov 	%r9,%r11

 	and	.L__reald_one_one(%rip),%r8		#odd/even region for cos/sin
 	and	.L__reald_one_one(%rip),%r9		#odd/even region for cos/sin


 # Sign computation.  With A = sign word (r12/r13) and B = region >> 1:
 #   sin sign = (~A & B) | (A & ~B)          -> r10 / r11
 #   cos sign = region ^ (region >> 1)       -> rdi / rsi
 # each masked with .L__reald_one_one (presumably bit 0 of each 32-bit half -- confirm).

 	mov	%r10,%rdi				# rdi = region  (cos sign, first pair)
 	mov	%r11,%rsi				# rsi = region1 (cos sign, second pair)

 	shr	$1,%r10						# r10 = B = region  >> 1
 	shr	$1,%r11						# r11 = B = region1 >> 1

 	mov	%r10,%rax				# rax = B (copy used for the A & ~B term)
 	mov	%r11,%rcx

 	xor	%r10,%rdi				# rdi = region  ^ (region  >> 1)
 	xor	%r11,%rsi				# rsi = region1 ^ (region1 >> 1)

 	not 	%r12						# r12 = ~A
 	not 	%r13						# r13 = ~A
 	and	%r12,%r10				# r10 = ~A & B
 	and	%r13,%r11				# r11 = ~A & B

 	not	%rax					# rax = ~B
 	not	%rcx					# rcx = ~B
 	not	%r12					# r12 = A again
 	not	%r13					# r13 = A again
 	and	%r12,%rax				# rax = A & ~B
 	and	%r13,%rcx				# rcx = A & ~B

 	and	.L__reald_one_one(%rip),%rdi				# keep the masked cos sign bits
 	and	.L__reald_one_one(%rip),%rsi				# keep the masked cos sign bits

 	or	%rax,%r10				# r10 = ~AB + A~B
 	or	%rcx,%r11				# r11 = ~AB + A~B
 	and	.L__reald_one_one(%rip),%r10				#(~AB+A~B)&1
 	and	.L__reald_one_one(%rip),%r11				#(~AB+A~B)&1


 	mov	%r10,%r12				# r12/r13 are reused as scratch from here on
 	mov	%r11,%r13

 	mov	%rdi,%rax
 	mov	%rsi,%rcx

 	and	.L__reald_one_zero(%rip),%r12		#mask out the lower sign bit leaving the upper sign bit
 	and	.L__reald_one_zero(%rip),%r13		#mask out the lower sign bit leaving the upper sign bit

 	and	.L__reald_one_zero(%rip),%rax		#mask out the lower sign bit leaving the upper sign bit
 	and	.L__reald_one_zero(%rip),%rcx		#mask out the lower sign bit leaving the upper sign bit

 	shl	$63,%r10				#shift lower sign bit left by 63 bits
 	shl	$63,%r11				#shift lower sign bit left by 63 bits
 	shl	$31,%r12				#shift upper sign bit left by 31 bits
 	shl	$31,%r13				#shift upper sign bit left by 31 bits

 	shl	$63,%rdi				#shift lower sign bit left by 63 bits
 	shl	$63,%rsi				#shift lower sign bit left by 63 bits
 	shl	$31,%rax				#shift upper sign bit left by 31 bits
 	shl	$31,%rcx				#shift upper sign bit left by 31 bits

 	mov 	 %r10,p_sign_sin(%rsp)		#write out lower sign bit
 	mov 	 %r12,p_sign_sin+8(%rsp)		#write out upper sign bit
 	mov 	 %r11,p_sign1_sin(%rsp)		#write out lower sign bit
 	mov 	 %r13,p_sign1_sin+8(%rsp)	#write out upper sign bit

 	mov 	 %rdi,p_sign_cos(%rsp)		#write out lower sign bit
 	mov 	 %rax,p_sign_cos+8(%rsp)		#write out upper sign bit
 	mov 	 %rsi,p_sign1_cos(%rsp)		#write out lower sign bit
 	mov 	 %rcx,p_sign1_cos+8(%rsp)	#write out upper sign bit


 # Jump-table index: pack the four masked odd/even bits held in r8 / r9 into rax.
 # (interleaved with the first r*r multiplies)

 	mov	%r8,%rax
 	mov	%r9,%rcx

 	movapd	%xmm10,%xmm2
 	movapd	%xmm1,%xmm3

 	mulpd	%xmm10,%xmm2				# xmm2 = r2 = r*r, first pair
 	mulpd	%xmm1,%xmm3				# xmm3 = r2 = r*r, second pair

 	and	.L__reald_zero_one(%rip),%rax		# keep the lower argument's bit
 	and	.L__reald_zero_one(%rip),%rcx
 	shr	$31,%r8					# move the upper argument's bit (bit 32) down to bit 1
 	shr	$31,%r9
 	or	%r8,%rax				# rax = 2-bit value for the first pair
 	or	%r9,%rcx				# rcx = 2-bit value for the second pair
 	shl	$2,%rcx
 	or	%rcx,%rax				# rax = index, second pair in bits 2-3, first pair in bits 0-1


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Polynomial evaluation (x below is the reduced argument r, x2 = r*r):
 #   cos:  1 - 0.5*x2 + x4*zc,  zc = (c1 + x2*c2) + x4*(c3 + x2*c4), coefficients from .Lcosarray
 #   sin:  x + x3*zs,           zs = (c1 + x2*c2) + x4*(c3 + x2*c4), coefficients from .Lsinarray
 # p_sign_cos  = Sign, p_sign_sin  = Sign, xmm10 = r, xmm2 = r2
 # p_sign1_cos = Sign, p_sign1_sin = Sign, xmm1 = r,  xmm3 = r2
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 	movapd	%xmm2,%xmm14					# for x3
 	movapd	%xmm3,%xmm15					# for x3

 	movapd	%xmm2,%xmm0					# for the 0.5*x2 term
 	movapd	%xmm3,%xmm11					# for the 0.5*x2 term

 	movdqa	.Lcosarray+0x30(%rip),%xmm4			# cos c4
 	movdqa	.Lcosarray+0x30(%rip),%xmm5			# cos c4

 	movapd	.Lcosarray+0x10(%rip),%xmm8			# cos c2
 	movapd	.Lcosarray+0x10(%rip),%xmm9			# cos c2

 	movdqa	.Lsinarray+0x30(%rip),%xmm6			# sin c4
 	movdqa	.Lsinarray+0x30(%rip),%xmm7			# sin c4

 	movapd	.Lsinarray+0x10(%rip),%xmm12			# sin c2
 	movapd	.Lsinarray+0x10(%rip),%xmm13			# sin c2

 	mulpd	.L__real_3fe0000000000000(%rip),%xmm0		# r = 0.5 *x2
 	mulpd	.L__real_3fe0000000000000(%rip),%xmm11		# r = 0.5 *x2

 	mulpd	%xmm10,%xmm14					# x3
 	mulpd	%xmm1,%xmm15					# x3

 	mulpd	%xmm2,%xmm4					# cos c4*x2
 	mulpd	%xmm3,%xmm5					# cos c4*x2

 	mulpd	%xmm2,%xmm8					# cos c2*x2
 	mulpd	%xmm3,%xmm9					# cos c2*x2

 	mulpd	%xmm2,%xmm6					# sin c4*x2
 	mulpd	%xmm3,%xmm7					# sin c4*x2

 	mulpd	%xmm2,%xmm12					# sin c2*x2
 	mulpd	%xmm3,%xmm13					# sin c2*x2

 	subpd	.L__real_3ff0000000000000(%rip),%xmm0		# -t=r-1.0	;trash r
 	subpd	.L__real_3ff0000000000000(%rip),%xmm11	# -t=r-1.0	;trash r

 	mulpd	%xmm2,%xmm2					# x4
 	mulpd	%xmm3,%xmm3					# x4

 	addpd	.Lcosarray+0x20(%rip),%xmm4			# cos c3+x2c4
 	addpd	.Lcosarray+0x20(%rip),%xmm5			# cos c3+x2c4

 	addpd	.Lcosarray(%rip),%xmm8			# cos c1+x2c2
 	addpd	.Lcosarray(%rip),%xmm9			# cos c1+x2c2

 	addpd	.Lsinarray+0x20(%rip),%xmm6			# sin c3+x2c4
 	addpd	.Lsinarray+0x20(%rip),%xmm7			# sin c3+x2c4

 	addpd	.Lsinarray(%rip),%xmm12			# sin c1+x2c2
 	addpd	.Lsinarray(%rip),%xmm13			# sin c1+x2c2

 	mulpd	%xmm2,%xmm4					# x4(c3+x2c4)
 	mulpd	%xmm3,%xmm5					# x4(c3+x2c4)

 	mulpd	%xmm2,%xmm6					# x4(c3+x2c4)
 	mulpd	%xmm3,%xmm7					# x4(c3+x2c4)

 	addpd	%xmm8,%xmm4					# zc
 	addpd	%xmm9,%xmm5					# zc

 	addpd	%xmm12,%xmm6					# zs
 	addpd	%xmm13,%xmm7					# zs

 	mulpd	%xmm2,%xmm4					# x4 * zc
 	mulpd	%xmm3,%xmm5					# x4 * zc

 	mulpd	%xmm14,%xmm6					# x3 * zs
 	mulpd	%xmm15,%xmm7					# x3 * zs

 	subpd   %xmm0,%xmm4					# - (-t)  -> cos polynomial, first pair
 	subpd   %xmm11,%xmm5					# - (-t)  -> cos polynomial, second pair

 	addpd	%xmm10,%xmm6					# +x      -> sin polynomial, first pair
 	addpd	%xmm1,%xmm7					# +x      -> sin polynomial, second pair


 	lea	.Levensin_oddcos_tbl(%rip),%rcx		# rcx = base of the table of 8-byte code addresses
 	jmp	*(%rcx,%rax,8)					#Jmp table for cos/sin calculation based on even/odd region


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .align 16
 .L__vrsa_sincosf_cleanup:
 # Common exit of every jump-table stub.
 # In:  xmm4 / xmm5 = cosine terms (pair 1 / pair 2), xmm6 / xmm7 = sine terms
 # Out: xmm0 / xmm11 = cosine results as packed singles (pair 1 / pair 2)
 #      xmm12 / xmm13 = sine results as packed singles (pair 1 / pair 2)
 # The stored sign masks are XORed in, then each pair is narrowed double -> single.

 	movapd	p_sign_sin(%rsp),%xmm14		# fetch the four sign masks first
 	movapd	p_sign_cos(%rsp),%xmm10
 	movapd	p_sign1_sin(%rsp),%xmm15
 	movapd	p_sign1_cos(%rsp),%xmm1

 	xorpd	%xmm6,%xmm14			# sin, pair 1: term ^ sign
 	xorpd	%xmm4,%xmm10			# cos, pair 1: term ^ sign
 	xorpd	%xmm7,%xmm15			# sin, pair 2: term ^ sign
 	xorpd	%xmm5,%xmm1			# cos, pair 2: term ^ sign

 	cvtpd2ps %xmm14,%xmm12			# sin, pair 1 -> 2 singles
 	cvtpd2ps %xmm10,%xmm0			# cos, pair 1 -> 2 singles
 	cvtpd2ps %xmm15,%xmm13			# sin, pair 2 -> 2 singles
 	cvtpd2ps %xmm1,%xmm11			# cos, pair 2 -> 2 singles


 .L__vrsa_bottom1:
 # Loop tail: store four sine and four cosine singles, advance both output
 # pointers by 16 bytes, then either loop, handle the leftovers, or return.
 # In:  xmm0 / xmm11 = cosine results, xmm12 / xmm13 = sine results

 	mov	save_ysa(%rsp),%r8		# r8 = sine output pointer
 	mov	save_yca(%rsp),%r9		# r9 = cosine output pointer

 	movlps	 %xmm0,  (%r9)			# cos, elements 0-1
 	movlps	 %xmm12, (%r8)			# sin, elements 0-1
 	movlps	 %xmm11, 8(%r9)			# cos, elements 2-3
 	movlps	 %xmm13, 8(%r8)			# sin, elements 2-3

 	add		$16,%r8			# step both pointers past the four floats
 	add		$16,%r9

 	prefetch	16(%r8)			# same address as 32 bytes past the old pointer
 	prefetch	16(%r9)

 	mov		%r8,save_ysa(%rsp)	# remember the advanced sine pointer
 	mov		%r9,save_yca(%rsp)	# remember the advanced cosine pointer

 	mov	p_iter(%rsp),%rax		# rax = iterations left
 	sub	$1,%rax
 	mov	%rax,p_iter(%rsp)		# mov leaves the flags from sub untouched
 	jnz	.L__vrsa_top			# more full groups of four to process

 # no full groups left: are there leftover values?
 	mov	save_nv(%rsp),%rax		# rax = number of leftover values
 	test	%rax,%rax
 	jnz	.L__vrsa_cleanup

 .L__final_check:
 # Function exit: restore the callee-saved registers used here and drop the frame.

 	mov	save_r12(%rsp),%r12
 	mov	save_r13(%rsp),%r13

 	add	$0x0298,%rsp
 	ret

 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 # Leftover handling: fewer than four input values remain.
 # In (memory): save_nv  = number of values left (the code below handles 1 to 3)
 #              save_xa  = pointer to the next input element
 #              save_ysa = pointer to the next sine output element
 #              save_yca = pointer to the next cosine output element
 # The leftovers are copied into a zero-padded 4-float buffer (p_temp), the
 # routine is called recursively with n = 4 on that buffer, and only the first
 # save_nv results are copied from p_temp2 (sine) / p_temp3 (cosine) to the
 # destination arrays.
 # Clobbers r12; .L__final_check reloads it from save_r12.

 .align	16
 .L__vrsa_cleanup:
         mov             save_nv(%rsp),%rax      # get number of values
         test            %rax,%rax               # are there any values
         jz              .L__final_check         # exit if not

 	mov		save_xa(%rsp),%rsi		# rsi = next input element
 						# (the output pointers are loaded after the call, where they are used)


 # build the zero-padded input: p_temp[1..3] = 0.0f, then copy the real values in
 	xorps		 %xmm0,%xmm0
 	movss		 %xmm0,p_temp+4(%rsp)		# zero element 1
 	movlps		 %xmm0,p_temp+8(%rsp)		# zero elements 2 and 3


 	mov		 (%rsi),%ecx			# we know there's at least one
 	mov	 	 %ecx,p_temp(%rsp)
 	cmp		 $2,%rax
 	jl		 .L__vrsacg

 	mov		 4(%rsi),%ecx			# do the second value
 	mov	 	 %ecx,p_temp+4(%rsp)
 	cmp		 $3,%rax
 	jl		 .L__vrsacg

 	mov		 8(%rsi),%ecx			# do the third value
 	mov	 	 %ecx,p_temp+8(%rsp)

 .L__vrsacg:
 	mov		$4,%rdi				# parameter for N
 	lea		p_temp(%rsp),%rsi		# &x parameter
 	lea		p_temp2(%rsp),%rdx	 	# &ys parameter
 	lea		p_temp3(%rsp),%rcx		# &yc parameter
 	call		vrsa_sincosf@PLT		# call recursively to compute four values

 # now copy the results to the destination array
 	mov		save_ysa(%rsp),%rdi		# rdi = sine destination
 	mov		save_yca(%rsp),%r12		# r12 = cosine destination
 	mov		save_nv(%rsp),%rax			# get number of values

 	mov	 	p_temp2(%rsp),%ecx
 	mov		%ecx,(%rdi)			# we know there's at least one
 	mov	 	p_temp3(%rsp),%edx
 	mov		%edx,(%r12)			# we know there's at least one
 	cmp		$2,%rax
 	jl		.L__vrsacgf

 	mov	 	p_temp2+4(%rsp),%ecx
 	mov		%ecx,4(%rdi)			# do the second value
 	mov	 	p_temp3+4(%rsp),%edx
 	mov		%edx,4(%r12)			# do the second value
 	cmp		$3,%rax
 	jl		.L__vrsacgf

 	mov	 	p_temp2+8(%rsp),%ecx
 	mov		%ecx,8(%rdi)			# do the third value
 	mov	 	p_temp3+8(%rsp),%edx
 	mov		%edx,8(%r12)			# do the third value

 .L__vrsacgf:
 	jmp		.L__final_check


 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;JUMP TABLE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 .align 16
 .Lcoscos_coscos_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in both lanes of both pairs.
 # Scratch: xmm8, xmm9

 	movapd	%xmm4,%xmm8			# pair 1: xmm4 <-> xmm6 through xmm8
 	movapd	%xmm6,%xmm4
 	movapd	%xmm8,%xmm6

 	movapd	%xmm5,%xmm9			# pair 2: xmm5 <-> xmm7 through xmm9
 	movapd	%xmm7,%xmm5
 	movapd	%xmm9,%xmm7

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lcossin_cossin_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the high lane of both pairs; low lanes are untouched.
 # Scratch: low lanes of xmm8, xmm9, xmm12, xmm13

 	movhlps	%xmm6,%xmm12			# pair 1: grab both high lanes ...
 	movhlps	%xmm4,%xmm8
 	movlhps	%xmm12,%xmm4			# ... and write them back crosswise
 	movlhps	%xmm8,%xmm6

 	movhlps	%xmm7,%xmm13			# pair 2: grab both high lanes ...
 	movhlps	%xmm5,%xmm9
 	movlhps	%xmm13,%xmm5			# ... and write them back crosswise
 	movlhps	%xmm9,%xmm7

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lsincos_cossin_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the high lane of pair 1 and in the low lane of pair 2.
 # Scratch: low lanes of xmm8, xmm9, xmm12, xmm13

 	movhlps	%xmm6,%xmm12			# pair 1, high lane: xmm4 <-> xmm6
 	movhlps	%xmm4,%xmm8
 	movlhps	%xmm12,%xmm4
 	movlhps	%xmm8,%xmm6

 	movsd	%xmm7,%xmm13			# pair 2, low lane: xmm5 <-> xmm7
 	movsd	%xmm5,%xmm9
 	movsd	%xmm13,%xmm5
 	movsd	%xmm9,%xmm7

 	jmp	.L__vrsa_sincosf_cleanup

 .align 16
 .Lsincos_sincos_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the low lane of both pairs; high lanes are untouched.
 # Scratch: low lanes of xmm8, xmm9, xmm12, xmm13

 	movsd	%xmm6,%xmm12			# pair 1, low lane: xmm4 <-> xmm6
 	movsd	%xmm4,%xmm8
 	movsd	%xmm12,%xmm4
 	movsd	%xmm8,%xmm6

 	movsd	%xmm7,%xmm13			# pair 2, low lane: xmm5 <-> xmm7
 	movsd	%xmm5,%xmm9
 	movsd	%xmm13,%xmm5
 	movsd	%xmm9,%xmm7

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lcossin_sincos_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the low lane of pair 1 and in the high lane of pair 2.
 # Scratch: low lanes of xmm8, xmm9, xmm12, xmm13

 	movsd	%xmm6,%xmm12			# pair 1, low lane: xmm4 <-> xmm6
 	movsd	%xmm4,%xmm8
 	movsd	%xmm12,%xmm4
 	movsd	%xmm8,%xmm6

 	movhlps	%xmm7,%xmm13			# pair 2, high lane: xmm5 <-> xmm7
 	movhlps	%xmm5,%xmm9
 	movlhps	%xmm13,%xmm5
 	movlhps	%xmm9,%xmm7

 	jmp	.L__vrsa_sincosf_cleanup

 .align 16
 .Lcoscos_sinsin_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in both lanes of pair 2 (xmm5 <-> xmm7); pair 1 is untouched.
 # Original note: "Lower even, Upper odd, Swap upper" (upper = second pair).
 # Scratch: xmm9

 	movapd	%xmm5,%xmm9			# xmm9 = cos, pair 2
 	movapd	%xmm7,%xmm5			# xmm5 = sin, pair 2
 	movapd	%xmm9,%xmm7			# xmm7 = cos, pair 2

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lsinsin_coscos_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in both lanes of pair 1 (xmm4 <-> xmm6); pair 2 is untouched.
 # Original note: "Lower odd, Upper even, Swap lower" (lower = first pair).
 # Scratch: xmm8

 	movapd	%xmm4,%xmm8			# xmm8 = cos, pair 1
 	movapd	%xmm6,%xmm4			# xmm4 = sin, pair 1
 	movapd	%xmm8,%xmm6			# xmm6 = cos, pair 1

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lcoscos_cossin_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the high lane of pair 1 and in both lanes of pair 2.
 # Scratch: xmm9, low lanes of xmm8 and xmm12

 	movhlps	%xmm6,%xmm12			# pair 1, high lane: xmm4 <-> xmm6
 	movhlps	%xmm4,%xmm8
 	movlhps	%xmm12,%xmm4
 	movlhps	%xmm8,%xmm6

 	movapd	%xmm5,%xmm9			# pair 2, whole register: xmm5 <-> xmm7
 	movapd	%xmm7,%xmm5
 	movapd	%xmm9,%xmm7

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lcoscos_sincos_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the low lane of pair 1 and in both lanes of pair 2.
 # Scratch: xmm9, low lanes of xmm8 and xmm12

 	movsd	%xmm6,%xmm12			# pair 1, low lane: xmm4 <-> xmm6
 	movsd	%xmm4,%xmm8
 	movsd	%xmm12,%xmm4
 	movsd	%xmm8,%xmm6

 	movapd	%xmm5,%xmm9			# pair 2, whole register: xmm5 <-> xmm7
 	movapd	%xmm7,%xmm5
 	movapd	%xmm9,%xmm7

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lcossin_coscos_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in both lanes of pair 1 and in the high lane of pair 2.
 # Scratch: xmm8, low lanes of xmm9 and xmm13

 	movhlps	%xmm7,%xmm13			# pair 2, high lane: xmm5 <-> xmm7
 	movhlps	%xmm5,%xmm9
 	movlhps	%xmm13,%xmm5
 	movlhps	%xmm9,%xmm7

 	movapd	%xmm4,%xmm8			# pair 1, whole register: xmm4 <-> xmm6
 	movapd	%xmm6,%xmm4
 	movapd	%xmm8,%xmm6

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lcossin_sinsin_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the high lane of pair 2 only.
 # Scratch: low lanes of xmm9 and xmm13

 	movhlps	%xmm7,%xmm13			# xmm13.lo = sin, pair 2, high lane
 	movhlps	%xmm5,%xmm9			# xmm9.lo  = cos, pair 2, high lane
 	movlhps	%xmm13,%xmm5			# high lane of xmm5 <- sin
 	movlhps	%xmm9,%xmm7			# high lane of xmm7 <- cos

 	jmp 	.L__vrsa_sincosf_cleanup


 .align 16
 .Lsincos_coscos_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in both lanes of pair 1 and in the low lane of pair 2.
 # Scratch: xmm8, low lanes of xmm9 and xmm13

 	movsd	%xmm7,%xmm13			# pair 2, low lane: xmm5 <-> xmm7
 	movsd	%xmm5,%xmm9
 	movsd	%xmm13,%xmm5
 	movsd	%xmm9,%xmm7

 	movapd	%xmm4,%xmm8			# pair 1, whole register: xmm4 <-> xmm6
 	movapd	%xmm6,%xmm4
 	movapd	%xmm8,%xmm6
 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lsincos_sinsin_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the low lane of pair 2 only (register-to-register
 # movsd copies the low 64 bits and leaves the destination's high lane alone).
 # Scratch: low lane of xmm9
 	movsd	%xmm5,%xmm9			# xmm9.lo = cos, pair 2, low lane
 	movsd	%xmm7,%xmm5			# low lane of xmm5 <- sin
 	movsd	%xmm9,%xmm7			# low lane of xmm7 <- cos

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lsinsin_cossin_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the high lane of pair 1 only.
 # Scratch: low lanes of xmm8 and xmm12

 	movhlps	%xmm6,%xmm12			# xmm12.lo = sin, pair 1, high lane
 	movhlps	%xmm4,%xmm8			# xmm8.lo  = cos, pair 1, high lane
 	movlhps	%xmm12,%xmm4			# high lane of xmm4 <- sin
 	movlhps	%xmm8,%xmm6			# high lane of xmm6 <- cos

 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lsinsin_sincos_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # Exchanges cos and sin in the low lane of pair 1 only (register-to-register
 # movsd copies the low 64 bits and leaves the destination's high lane alone).
 # Scratch: low lane of xmm8
 	movsd	%xmm4,%xmm8			# xmm8.lo = cos, pair 1, low lane
 	movsd	%xmm6,%xmm4			# low lane of xmm4 <- sin
 	movsd	%xmm8,%xmm6			# low lane of xmm6 <- cos
 	jmp 	.L__vrsa_sincosf_cleanup

 .align 16
 .Lsinsin_sinsin_piby4:
 # In:  cos polynomial in xmm4 (pair 1) and xmm5 (pair 2)
 #      sin polynomial in xmm6 (pair 1) and xmm7 (pair 2)
 # No lanes are exchanged here: the registers go to the common exit unchanged.
 # NOTE(review): the previous comment read "Lower and Upper odd, So Swap", but this
 # stub swaps nothing; check the intended case against .Levensin_oddcos_tbl.

 	jmp 	.L__vrsa_sincosf_cleanup