src/gas/vrd4exp.S - open64_libacml_mv - Git at Google


 #
 #  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
 #
 #  This file is part of libacml_mv.
 #
 #  libacml_mv is free software; you can redistribute it and/or
 #  modify it under the terms of the GNU Lesser General Public
 #  License as published by the Free Software Foundation; either
 #  version 2.1 of the License, or (at your option) any later version.
 #
 #  libacml_mv is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 #  Lesser General Public License for more details.
 #
 #  You should have received a copy of the GNU Lesser General Public
 #  License along with libacml_mv.  If not, see
 #  <http://www.gnu.org/licenses/>.
 #
 #


 #
 # vrd4exp.s
 #
 # A vector implementation of the exp libm function.
 #
 # Prototype:
 #
 #    __m128d,__m128d __vrd4_exp(__m128d x1, __m128d x2);
 #
 #   Computes e raised to the x power for an array of input values.
 #   Places the results into the supplied y array.
 # Does not perform error checking.   Denormal results are truncated to 0.
 #
 # This routine computes 4 double precision exponent values at a time.
 # The four values are passed as packed doubles in xmm0 and xmm1.
 # The four results are returned as packed doubles in xmm0 and xmm1.
 # Note that this represents a non-standard ABI usage, as no ABI
 # ( and indeed C) currently allows returning 2 values for a function.
 # It is expected that some compilers may be able to take advantage of this
 # interface when implementing vectorized loops.  Using the array implementation
 # of the routine requires putting the inputs into memory, and retrieving
 # the results from memory.  This routine eliminates the need for this
 # overhead if the data does not already reside in memory.
 # This routine is derived directly from the array version.
 #
 #

 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif

 # define local variable storage offsets
 .equ	p_temp,0		# temporary for get/put bits operation
 .equ	p_temp1,0x10		# temporary for exponent multiply

 .equ	save_rbx,0x020		#qword
 .equ	save_rdi,0x028		#qword

 .equ	save_rsi,0x030		#qword


 .equ	p2_temp,0x40		# second temporary for get/put bits operation
 .equ	p2_temp1,0x60		# second temporary for exponent multiply


 .equ	stack_size,0x088


 # parameters are passed in by Linux as:
 # xmm0 - __m128d x1
 # xmm1 - __m128d x2

     .text
     .align 16
     .p2align 4,,15
 .globl __vrd4_exp
     .type   __vrd4_exp,@function
 __vrd4_exp:
 	sub		$stack_size,%rsp
 	mov		%rbx,save_rbx(%rsp)	# save rdi
 		movapd	%xmm1,%xmm6

 # process 4 values at a time.

 	movapd	.L__real_thirtytwo_by_log2(%rip),%xmm3	#

 #      Step 1. Reduce the argument.
 #        /* Find m, z1 and z2 such that exp(x) = 2**m * (z1 + z2) */
 #    r = x * thirtytwo_by_logbaseof2;
 		movapd	%xmm3,%xmm7
 	movapd	 %xmm0,p_temp(%rsp)
 	maxpd	.L__real_C0F0000000000000(%rip),%xmm0	# protect against very large negative, non-infinite numbers
 	mulpd	%xmm0,%xmm3

 		movapd	 %xmm6,p2_temp(%rsp)
 		maxpd	.L__real_C0F0000000000000(%rip),%xmm6
 		mulpd	%xmm6,%xmm7

 # save x for later.
         minpd   .L__real_40F0000000000000(%rip),%xmm3    # protect against very large, non-infinite numbers


 #    /* Set n = nearest integer to r */
 	cvtpd2dq	%xmm3,%xmm4
 	lea		.L__two_to_jby32_lead_table(%rip),%rdi
 	lea		.L__two_to_jby32_trail_table(%rip),%rsi
 	cvtdq2pd	%xmm4,%xmm1
         	minpd   .L__real_40F0000000000000(%rip),%xmm7    # protect against very large, non-infinite numbers

  #    r1 = x - n * logbaseof2_by_32_lead;
 	movapd	.L__real_log2_by_32_lead(%rip),%xmm2	#
 	mulpd	%xmm1,%xmm2					#
 	movq	 %xmm4,p_temp1(%rsp)
 	subpd	%xmm2,%xmm0					# r1 in xmm0,

 		cvtpd2dq	%xmm7,%xmm2
 		cvtdq2pd	%xmm2,%xmm8

 #    r2 =   - n * logbaseof2_by_32_trail;
 	mulpd	.L__real_log2_by_32_tail(%rip),%xmm1	# r2 in xmm1
 #    j = n & 0x0000001f;
 	mov		$0x01f,%r9
 	mov		%r9,%r8
 	mov		p_temp1(%rsp),%ecx
 	and		%ecx,%r9d
 		movq	 %xmm2,p2_temp1(%rsp)
 		movapd	.L__real_log2_by_32_lead(%rip),%xmm9
 		mulpd	%xmm8,%xmm9
 		subpd	%xmm9,%xmm6				# r1b in xmm6
 		mulpd	.L__real_log2_by_32_tail(%rip),%xmm8	# r2b in xmm8

 	mov             p_temp1+4(%rsp),%edx
 	and		%edx,%r8d
 #    f1 = two_to_jby32_lead_table[j];
 #    f2 = two_to_jby32_trail_table[j];

 #    *m = (n - j) / 32;
 	sub		%r9d,%ecx
 	sar		$5,%ecx					#m
 	sub		%r8d,%edx
 	sar		$5,%edx


 	movapd	%xmm0,%xmm2
 	addpd	%xmm1,%xmm2   # r = r1 + r2

 		mov		$0x01f,%r11
 		mov		%r11,%r10
 		mov		p2_temp1(%rsp),%ebx
 		and		%ebx,%r11d
 #      Step 2. Compute the polynomial.
 #    q = r1 + (r2 +
 #              r*r*( 5.00000000000000008883e-01 +
 #                      r*( 1.66666666665260878863e-01 +
 #                      r*( 4.16666666662260795726e-02 +
 #                      r*( 8.33336798434219616221e-03 +
 #                      r*( 1.38889490863777199667e-03 ))))));
 #    q = r + r^2/2 + r^3/6 + r^4/24 + r^5/120 + r^6/720
 	movapd	%xmm2,%xmm1
 	movapd	.L__real_3f56c1728d739765(%rip),%xmm3	# 	1/720
 	movapd	.L__real_3FC5555555548F7C(%rip),%xmm0	# 	1/6
 # deal with infinite results
 	mov		$1024,%rax
 	movsx	%ecx,%rcx
 	cmp		%rax,%rcx

 	mulpd	%xmm2,%xmm3						# *x
 	mulpd	%xmm2,%xmm0						# *x
 	mulpd	%xmm2,%xmm1						# x*x
 	movapd	%xmm1,%xmm4

 	cmovg	%rax,%rcx				## if infinite, then set rcx to multiply
 							# by infinity
 	movsx	%edx,%rdx
 	cmp		%rax,%rdx

 		movapd	%xmm6,%xmm9
 		addpd	%xmm8,%xmm9  #  rb = r1b + r2b
 	addpd	.L__real_3F811115B7AA905E(%rip),%xmm3	# 	+ 1/120
 	addpd	.L__real_3fe0000000000000(%rip),%xmm0	# 	+ .5
 	mulpd	%xmm1,%xmm4						# x^4
 	mulpd	%xmm2,%xmm3						# *x

 	cmovg	%rax,%rdx				## if infinite, then set rcx to multiply
 							# by infinity
 # deal with denormal results
 	xor		%rax,%rax
 	add		$1023,%rcx				# add bias

 	mulpd	%xmm1,%xmm0					# *x^2
 	addpd	.L__real_3FA5555555545D4E(%rip),%xmm3	# 	+ 1/24
 	addpd	%xmm2,%xmm0					# 	+ x
 	mulpd	%xmm4,%xmm3					# *x^4

 # check for infinity or nan
 	movapd	p_temp(%rsp),%xmm2

 	cmovs	%rax,%rcx				## if denormal, then multiply by 0
 	shl		$52,%rcx				# build 2^n

 		sub		%r11d,%ebx
 		movapd	%xmm9,%xmm1
 	addpd	%xmm3,%xmm0					# q = final sum
 		movapd	.L__real_3f56c1728d739765(%rip),%xmm7	# 	1/720
 		movapd	.L__real_3FC5555555548F7C(%rip),%xmm3	# 	1/6

 #    *z2 = f2 + ((f1 + f2) * q);
         movlpd  (%rsi,%r9,8),%xmm5              # f2
         movlpd  (%rsi,%r8,8),%xmm4              # f2
         addsd   (%rdi,%r8,8),%xmm4              # f1 + f2
         addsd   (%rdi,%r9,8),%xmm5              # f1 + f2
                 mov p2_temp1+4(%rsp),%r8d
 		and		%r8d,%r10d
 		sar		$5,%ebx					#m
 		mulpd	%xmm9,%xmm7					# *x
 		mulpd	%xmm9,%xmm3					# *x
 		mulpd	%xmm9,%xmm1					# x*x
 		sub		%r10d,%r8d
 		sar		$5,%r8d
 # check for infinity or nan
 	andpd	.L__real_infinity(%rip),%xmm2
 	cmppd   $0,.L__real_infinity(%rip),%xmm2
 	add		$1023,%rdx				# add bias
 	shufpd	$0,%xmm4,%xmm5
 		movapd	%xmm1,%xmm4

 	cmovs	%rax,%rdx				## if denormal, then multiply by 0
 	shl		$52,%rdx					# build 2^n

 	mulpd	%xmm5,%xmm0
 	mov		 %rcx,p_temp1(%rsp) # get 2^n to memory
 	mov		 %rdx,p_temp1+8(%rsp) # get 2^n to memory
 	addpd	%xmm5,%xmm0					#z = z1 + z2
 		mov		$1024,%rax
 		movsx	%ebx,%rbx
 		cmp		%rax,%rbx
 # end of splitexp
 #        /* Scale (z1 + z2) by 2.0**m */
 #          r = scaleDouble_1(z, n);


 		cmovg	%rax,%rbx			## if infinite, then set rcx to multiply
 							# by infinity
 		movsx	%r8d,%rdx
 		cmp		%rax,%rdx

 	movmskpd	%xmm2,%r8d

 		addpd	.L__real_3F811115B7AA905E(%rip),%xmm7	# 	+ 1/120
 		addpd	.L__real_3fe0000000000000(%rip),%xmm3	# 	+ .5
 		mulpd	%xmm1,%xmm4				# x^4
 		mulpd	%xmm9,%xmm7				# *x
 		cmovg	%rax,%rdx			## if infinite, then set rcx to multiply


 		xor		%rax,%rax
 		add		$1023,%rbx		# add bias

 		mulpd	%xmm1,%xmm3			# *x^2
 		addpd	.L__real_3FA5555555545D4E(%rip),%xmm7	# 	+ 1/24
 		addpd	%xmm9,%xmm3			# 	+ x
 		mulpd	%xmm4,%xmm7			# *x^4

 		cmovs	%rax,%rbx			## if denormal, then multiply by 0
 		shl		$52,%rbx		# build 2^n

 #      Step 3. Reconstitute.

 	mulpd	p_temp1(%rsp),%xmm0	# result *= 2^n
 		addpd	%xmm7,%xmm3			# q = final sum

                 movlpd  (%rsi,%r11,8),%xmm5             # f2
                 movlpd  (%rsi,%r10,8),%xmm4             # f2
                 addsd   (%rdi,%r10,8),%xmm4             # f1 + f2
                 addsd   (%rdi,%r11,8),%xmm5             # f1 + f2

 		add		$1023,%rdx		# add bias
 		cmovs	%rax,%rdx			## if denormal, then multiply by 0
 		shufpd	$0,%xmm4,%xmm5
 		shl		$52,%rdx		# build 2^n

 		mulpd	%xmm5,%xmm3
 		mov		 %rbx,p2_temp1(%rsp) # get 2^n to memory
 		mov		 %rdx,p2_temp1+8(%rsp) # get 2^n to memory
 		addpd	%xmm5,%xmm3			#z = z1 + z2

 		movapd	p2_temp(%rsp),%xmm2
 		andpd	.L__real_infinity(%rip),%xmm2
 		cmppd	$0,.L__real_infinity(%rip),%xmm2
 		movmskpd	%xmm2,%ebx
 	test		$3,%r8d
 		mulpd	p2_temp1(%rsp),%xmm3	# result *= 2^n
 # we'd like to avoid a branch, and can use cmp's and and's to
 # eliminate them.  But it adds cycles for normal cases which
 # are supposed to be exceptions.  Using this branch with the
 # check above results in faster code for the normal cases.
 	jnz			.L__exp_naninf

 .L__vda_bottom1:
 # store the result _m128d
 		test		$3,%ebx
 		jnz		.L__exp_naninf2

 .L__vda_bottom2:

 	movapd	%xmm3,%xmm1


 #
 #
 .L__final_check:
 	mov		save_rbx(%rsp),%rbx		# restore rbx
 	add		$stack_size,%rsp
 	ret

 # at least one of the numbers needs special treatment
 .L__exp_naninf:
 	lea		p_temp(%rsp),%rcx
 	call  .L__naninf
 	jmp		.L__vda_bottom1
 .L__exp_naninf2:
 	lea		p2_temp(%rsp),%rcx
 	mov			%ebx,%r8d
 	movapd	%xmm0,%xmm4
 	movapd	%xmm3,%xmm0
 	call  .L__naninf
 	movapd	%xmm0,%xmm3
 	movapd	%xmm4,%xmm0
 	jmp		.L__vda_bottom2

 # This subroutine checks a double pair for nans and infinities and
 # produces the proper result from the exceptional inputs
 # Register assumptions:
 # Inputs:
 # r8d - mask of errors
 # xmm0 - computed result vector
 # rcx - pointing to memory image of inputs
 # Outputs:
 # xmm0 - new result vector
 # %rax,rdx,,%xmm2 all modified.
 .L__naninf:
 # check the first number
 	test	$1,%r8d
 	jz		.L__check2

 	mov		(%rcx),%rdx
 	mov		$0x0000FFFFFFFFFFFFF,%rax
 	test	%rax,%rdx
 	jnz		.L__enan1			# jump if mantissa not zero, so it's a NaN
 # inf
 	mov		%rdx,%rax
 	rcl		$1,%rax
 	jnc		.L__r1				# exp(+inf) = inf
 	xor		%rdx,%rdx				# exp(-inf) = 0
 	jmp		.L__r1

 #NaN
 .L__enan1:
 	mov		$0x00008000000000000,%rax	# convert to quiet
 	or		%rax,%rdx
 .L__r1:
 	movd	%rdx,%xmm2
 	shufpd	$2,%xmm0,%xmm2
 	movsd	%xmm2,%xmm0
 # check the second number
 .L__check2:
 	test	$2,%r8d
 	jz		.L__r3
 	mov		8(%rcx),%rdx
 	mov		$0x0000FFFFFFFFFFFFF,%rax
 	test	%rax,%rdx
 	jnz		.L__enan2			# jump if mantissa not zero, so it's a NaN
 # inf
 	mov		%rdx,%rax
 	rcl		$1,%rax
 	jnc		.L__r2				# exp(+inf) = inf
 	xor		%rdx,%rdx				# exp(-inf) = 0
 	jmp		.L__r2

 #NaN
 .L__enan2:
 	mov		$0x00008000000000000,%rax	# convert to quiet
 	or		%rax,%rdx
 .L__r2:
 	movd	%rdx,%xmm2
 	shufpd	$0,%xmm2,%xmm0
 .L__r3:
 	ret

 	.data
 	.align	64

 .L__real_3ff0000000000000: 	.quad 0x03ff0000000000000	# 1.0
 				.quad 0x03ff0000000000000
 .L__real_4040000000000000:	.quad 0x04040000000000000	# 32
 				.quad 0x04040000000000000
 .L__real_40F0000000000000:      .quad 0x040F0000000000000        # 65536, to protect against really large numbers
                                 .quad 0x040F0000000000000
 .L__real_C0F0000000000000:	.quad 0x0C0F0000000000000	# -65536, to protect against really large negative numbers
 				.quad 0x0C0F0000000000000
 .L__real_3FA0000000000000:	.quad 0x03FA0000000000000	# 1/32
 				.quad 0x03FA0000000000000
 .L__real_3fe0000000000000:	.quad 0x03fe0000000000000	# 1/2
 				.quad 0x03fe0000000000000
 .L__real_infinity:		.quad 0x07ff0000000000000	#
 				.quad 0x07ff0000000000000
 .L__real_ninfinity:		.quad 0x0fff0000000000000	#
 				.quad 0x0fff0000000000000
 .L__real_thirtytwo_by_log2: 	.quad 0x040471547652b82fe	# thirtytwo_by_log2
 				.quad 0x040471547652b82fe
 .L__real_log2_by_32_lead:  	.quad 0x03f962e42fe000000	# log2_by_32_lead
 				.quad 0x03f962e42fe000000
 .L__real_log2_by_32_tail:  	.quad 0x0Bdcf473de6af278e	# -log2_by_32_tail
 				.quad 0x0Bdcf473de6af278e
 .L__real_3f56c1728d739765:	.quad 0x03f56c1728d739765	# 1.38889490863777199667e-03
 				.quad 0x03f56c1728d739765
 .L__real_3F811115B7AA905E:	.quad 0x03F811115B7AA905E	# 8.33336798434219616221e-03
 				.quad 0x03F811115B7AA905E
 .L__real_3FA5555555545D4E:	.quad 0x03FA5555555545D4E	# 4.16666666662260795726e-02
 				.quad 0x03FA5555555545D4E
 .L__real_3FC5555555548F7C:	.quad 0x03FC5555555548F7C	# 1.66666666665260878863e-01
 				.quad 0x03FC5555555548F7C


 .L__two_to_jby32_lead_table:
 	.quad	0x03ff0000000000000 # 1
 	.quad	0x03ff059b0d0000000		# 1.0219
 	.quad	0x03ff0b55860000000		# 1.04427
 	.quad	0x03ff11301d0000000		# 1.06714
 	.quad	0x03ff172b830000000		# 1.09051
 	.quad	0x03ff1d48730000000		# 1.11439
 	.quad	0x03ff2387a60000000		# 1.13879
 	.quad	0x03ff29e9df0000000		# 1.16372
 	.quad	0x03ff306fe00000000		# 1.18921
 	.quad	0x03ff371a730000000		# 1.21525
 	.quad	0x03ff3dea640000000		# 1.24186
 	.quad	0x03ff44e0860000000		# 1.26905
 	.quad	0x03ff4bfdad0000000		# 1.29684
 	.quad	0x03ff5342b50000000		# 1.32524
 	.quad	0x03ff5ab07d0000000		# 1.35426
 	.quad	0x03ff6247eb0000000		# 1.38391
 	.quad	0x03ff6a09e60000000		# 1.41421
 	.quad	0x03ff71f75e0000000		# 1.44518
 	.quad	0x03ff7a11470000000		# 1.47683
 	.quad	0x03ff8258990000000		# 1.50916
 	.quad	0x03ff8ace540000000		# 1.54221
 	.quad	0x03ff93737b0000000		# 1.57598
 	.quad	0x03ff9c49180000000		# 1.61049
 	.quad	0x03ffa5503b0000000		# 1.64576
 	.quad	0x03ffae89f90000000		# 1.68179
 	.quad	0x03ffb7f76f0000000		# 1.71862
 	.quad	0x03ffc199bd0000000		# 1.75625
 	.quad	0x03ffcb720d0000000		# 1.79471
 	.quad	0x03ffd5818d0000000		# 1.83401
 	.quad	0x03ffdfc9730000000		# 1.87417
 	.quad	0x03ffea4afa0000000		# 1.91521
 	.quad	0x03fff507650000000		# 1.95714
 	.quad 0					# for alignment
 .L__two_to_jby32_trail_table:
 	.quad	0x00000000000000000 # 0
 	.quad	0x03e48ac2ba1d73e2a		# 1.1489e-008
 	.quad	0x03e69f3121ec53172		# 4.83347e-008
 	.quad	0x03df25b50a4ebbf1b		# 2.67125e-010
 	.quad	0x03e68faa2f5b9bef9		# 4.65271e-008
 	.quad	0x03e368b9aa7805b80		# 5.24924e-009
 	.quad	0x03e6ceac470cd83f6		# 5.38622e-008
 	.quad	0x03e547f7b84b09745		# 1.90902e-008
 	.quad	0x03e64636e2a5bd1ab		# 3.79764e-008
 	.quad	0x03e5ceaa72a9c5154		# 2.69307e-008
 	.quad	0x03e682468446b6824		# 4.49684e-008
 	.quad	0x03e18624b40c4dbd0		# 1.41933e-009
 	.quad	0x03e54d8a89c750e5e		# 1.94147e-008
 	.quad	0x03e5a753e077c2a0f		# 2.46409e-008
 	.quad	0x03e6a90a852b19260		# 4.94813e-008
 	.quad	0x03e0d2ac258f87d03		# 8.48872e-010
 	.quad	0x03e59fcef32422cbf		# 2.42032e-008
 	.quad	0x03e61d8bee7ba46e2		# 3.3242e-008
 	.quad	0x03e4f580c36bea881		# 1.45957e-008
 	.quad	0x03e62999c25159f11		# 3.46453e-008
 	.quad	0x03e415506dadd3e2a		# 8.0709e-009
 	.quad	0x03e29b8bc9e8a0388		# 2.99439e-009
 	.quad	0x03e451f8480e3e236		# 9.83622e-009
 	.quad	0x03e41f12ae45a1224		# 8.35492e-009
 	.quad	0x03e62b5a75abd0e6a		# 3.48493e-008
 	.quad	0x03e47daf237553d84		# 1.11085e-008
 	.quad	0x03e6b0aa538444196		# 5.03689e-008
 	.quad	0x03e69df20d22a0798		# 4.81896e-008
 	.quad	0x03e69f7490e4bb40b		# 4.83654e-008
 	.quad	0x03e4bdcdaf5cb4656		# 1.29746e-008
 	.quad	0x03e452486cc2c7b9d		# 9.84533e-009
 	.quad	0x03e66dc8a80ce9f09		# 4.25828e-008
 	.quad 0					# for alignment

	#
	# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
	#
	# This file is part of libacml_mv.
	#
	# libacml_mv is free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.
	#
	# libacml_mv is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public
	# License along with libacml_mv. If not, see
	# <http://www.gnu.org/licenses/>.
	#
	#





	#
	# vrd4exp.s
	#
	# A vector implementation of the exp libm function.
	#
	# Prototype:
	#
	# __m128d,__m128d __vrd4_exp(__m128d x1, __m128d x2);
	#
	# Computes e raised to the x power for an array of input values.
	# Places the results into the supplied y array.
	# Does not perform error checking. Denormal results are truncated to 0.
	#
	# This routine computes 4 double precision exponent values at a time.
	# The four values are passed as packed doubles in xmm0 and xmm1.
	# The four results are returned as packed doubles in xmm0 and xmm1.
	# Note that this represents a non-standard ABI usage, as no ABI
	# ( and indeed C) currently allows returning 2 values for a function.
	# It is expected that some compilers may be able to take advantage of this
	# interface when implementing vectorized loops. Using the array implementation
	# of the routine requires putting the inputs into memory, and retrieving
	# the results from memory. This routine eliminates the need for this
	# overhead if the data does not already reside in memory.
	# This routine is derived directly from the array version.
	#
	#

	#ifdef __ELF__
	.section .note.GNU-stack,"",@progbits
	#endif

	# define local variable storage offsets
	.equ p_temp,0 # temporary for get/put bits operation
	.equ p_temp1,0x10 # temporary for exponent multiply

	.equ save_rbx,0x020 #qword
	.equ save_rdi,0x028 #qword

	.equ save_rsi,0x030 #qword



	.equ p2_temp,0x40 # second temporary for get/put bits operation
	.equ p2_temp1,0x60 # second temporary for exponent multiply


	.equ stack_size,0x088


	# parameters are passed in by Linux as:
	# xmm0 - __m128d x1
	# xmm1 - __m128d x2

	.text
	.align 16
	.p2align 4,,15
	.globl __vrd4_exp
	.type __vrd4_exp,@function
	__vrd4_exp:
	sub $stack_size,%rsp
	mov %rbx,save_rbx(%rsp) # save rdi
	movapd %xmm1,%xmm6

	# process 4 values at a time.

	movapd .L__real_thirtytwo_by_log2(%rip),%xmm3 #

	# Step 1. Reduce the argument.
	# /* Find m, z1 and z2 such that exp(x) = 2*m (z1 + z2) */
	# r = x * thirtytwo_by_logbaseof2;
	movapd %xmm3,%xmm7
	movapd %xmm0,p_temp(%rsp)
	maxpd .L__real_C0F0000000000000(%rip),%xmm0 # protect against very large negative, non-infinite numbers
	mulpd %xmm0,%xmm3

	movapd %xmm6,p2_temp(%rsp)
	maxpd .L__real_C0F0000000000000(%rip),%xmm6
	mulpd %xmm6,%xmm7

	# save x for later.
	minpd .L__real_40F0000000000000(%rip),%xmm3 # protect against very large, non-infinite numbers


	# /* Set n = nearest integer to r */
	cvtpd2dq %xmm3,%xmm4
	lea .L__two_to_jby32_lead_table(%rip),%rdi
	lea .L__two_to_jby32_trail_table(%rip),%rsi
	cvtdq2pd %xmm4,%xmm1
	minpd .L__real_40F0000000000000(%rip),%xmm7 # protect against very large, non-infinite numbers

	# r1 = x - n * logbaseof2_by_32_lead;
	movapd .L__real_log2_by_32_lead(%rip),%xmm2 #
	mulpd %xmm1,%xmm2 #
	movq %xmm4,p_temp1(%rsp)
	subpd %xmm2,%xmm0 # r1 in xmm0,

	cvtpd2dq %xmm7,%xmm2
	cvtdq2pd %xmm2,%xmm8

	# r2 = - n * logbaseof2_by_32_trail;
	mulpd .L__real_log2_by_32_tail(%rip),%xmm1 # r2 in xmm1
	# j = n & 0x0000001f;
	mov $0x01f,%r9
	mov %r9,%r8
	mov p_temp1(%rsp),%ecx
	and %ecx,%r9d
	movq %xmm2,p2_temp1(%rsp)
	movapd .L__real_log2_by_32_lead(%rip),%xmm9
	mulpd %xmm8,%xmm9
	subpd %xmm9,%xmm6 # r1b in xmm6
	mulpd .L__real_log2_by_32_tail(%rip),%xmm8 # r2b in xmm8

	mov p_temp1+4(%rsp),%edx
	and %edx,%r8d
	# f1 = two_to_jby32_lead_table[j];
	# f2 = two_to_jby32_trail_table[j];

	# *m = (n - j) / 32;
	sub %r9d,%ecx
	sar $5,%ecx #m
	sub %r8d,%edx
	sar $5,%edx


	movapd %xmm0,%xmm2
	addpd %xmm1,%xmm2 # r = r1 + r2

	mov $0x01f,%r11
	mov %r11,%r10
	mov p2_temp1(%rsp),%ebx
	and %ebx,%r11d
	# Step 2. Compute the polynomial.
	# q = r1 + (r2 +
	# rr( 5.00000000000000008883e-01 +
	# r*( 1.66666666665260878863e-01 +
	# r*( 4.16666666662260795726e-02 +
	# r*( 8.33336798434219616221e-03 +
	# r*( 1.38889490863777199667e-03 ))))));
	# q = r + r^2/2 + r^3/6 + r^4/24 + r^5/120 + r^6/720
	movapd %xmm2,%xmm1
	movapd .L__real_3f56c1728d739765(%rip),%xmm3 # 1/720
	movapd .L__real_3FC5555555548F7C(%rip),%xmm0 # 1/6
	# deal with infinite results
	mov $1024,%rax
	movsx %ecx,%rcx
	cmp %rax,%rcx

	mulpd %xmm2,%xmm3 # *x
	mulpd %xmm2,%xmm0 # *x
	mulpd %xmm2,%xmm1 # x*x
	movapd %xmm1,%xmm4

	cmovg %rax,%rcx ## if infinite, then set rcx to multiply
	# by infinity
	movsx %edx,%rdx
	cmp %rax,%rdx

	movapd %xmm6,%xmm9
	addpd %xmm8,%xmm9 # rb = r1b + r2b
	addpd .L__real_3F811115B7AA905E(%rip),%xmm3 # + 1/120
	addpd .L__real_3fe0000000000000(%rip),%xmm0 # + .5
	mulpd %xmm1,%xmm4 # x^4
	mulpd %xmm2,%xmm3 # *x

	cmovg %rax,%rdx ## if infinite, then set rcx to multiply
	# by infinity
	# deal with denormal results
	xor %rax,%rax
	add $1023,%rcx # add bias

	mulpd %xmm1,%xmm0 # *x^2
	addpd .L__real_3FA5555555545D4E(%rip),%xmm3 # + 1/24
	addpd %xmm2,%xmm0 # + x
	mulpd %xmm4,%xmm3 # *x^4

	# check for infinity or nan
	movapd p_temp(%rsp),%xmm2

	cmovs %rax,%rcx ## if denormal, then multiply by 0
	shl $52,%rcx # build 2^n

	sub %r11d,%ebx
	movapd %xmm9,%xmm1
	addpd %xmm3,%xmm0 # q = final sum
	movapd .L__real_3f56c1728d739765(%rip),%xmm7 # 1/720
	movapd .L__real_3FC5555555548F7C(%rip),%xmm3 # 1/6

	# z2 = f2 + ((f1 + f2) q);
	movlpd (%rsi,%r9,8),%xmm5 # f2
	movlpd (%rsi,%r8,8),%xmm4 # f2
	addsd (%rdi,%r8,8),%xmm4 # f1 + f2
	addsd (%rdi,%r9,8),%xmm5 # f1 + f2
	mov p2_temp1+4(%rsp),%r8d
	and %r8d,%r10d
	sar $5,%ebx #m
	mulpd %xmm9,%xmm7 # *x
	mulpd %xmm9,%xmm3 # *x
	mulpd %xmm9,%xmm1 # x*x
	sub %r10d,%r8d
	sar $5,%r8d
	# check for infinity or nan
	andpd .L__real_infinity(%rip),%xmm2
	cmppd $0,.L__real_infinity(%rip),%xmm2
	add $1023,%rdx # add bias
	shufpd $0,%xmm4,%xmm5
	movapd %xmm1,%xmm4

	cmovs %rax,%rdx ## if denormal, then multiply by 0
	shl $52,%rdx # build 2^n

	mulpd %xmm5,%xmm0
	mov %rcx,p_temp1(%rsp) # get 2^n to memory
	mov %rdx,p_temp1+8(%rsp) # get 2^n to memory
	addpd %xmm5,%xmm0 #z = z1 + z2
	mov $1024,%rax
	movsx %ebx,%rbx
	cmp %rax,%rbx
	# end of splitexp
	# /* Scale (z1 + z2) by 2.0*m /
	# r = scaleDouble_1(z, n);


	cmovg %rax,%rbx ## if infinite, then set rcx to multiply
	# by infinity
	movsx %r8d,%rdx
	cmp %rax,%rdx

	movmskpd %xmm2,%r8d

	addpd .L__real_3F811115B7AA905E(%rip),%xmm7 # + 1/120
	addpd .L__real_3fe0000000000000(%rip),%xmm3 # + .5
	mulpd %xmm1,%xmm4 # x^4
	mulpd %xmm9,%xmm7 # *x
	cmovg %rax,%rdx ## if infinite, then set rcx to multiply


	xor %rax,%rax
	add $1023,%rbx # add bias

	mulpd %xmm1,%xmm3 # *x^2
	addpd .L__real_3FA5555555545D4E(%rip),%xmm7 # + 1/24
	addpd %xmm9,%xmm3 # + x
	mulpd %xmm4,%xmm7 # *x^4

	cmovs %rax,%rbx ## if denormal, then multiply by 0
	shl $52,%rbx # build 2^n

	# Step 3. Reconstitute.

	mulpd p_temp1(%rsp),%xmm0 # result *= 2^n
	addpd %xmm7,%xmm3 # q = final sum

	movlpd (%rsi,%r11,8),%xmm5 # f2
	movlpd (%rsi,%r10,8),%xmm4 # f2
	addsd (%rdi,%r10,8),%xmm4 # f1 + f2
	addsd (%rdi,%r11,8),%xmm5 # f1 + f2

	add $1023,%rdx # add bias
	cmovs %rax,%rdx ## if denormal, then multiply by 0
	shufpd $0,%xmm4,%xmm5
	shl $52,%rdx # build 2^n

	mulpd %xmm5,%xmm3
	mov %rbx,p2_temp1(%rsp) # get 2^n to memory
	mov %rdx,p2_temp1+8(%rsp) # get 2^n to memory
	addpd %xmm5,%xmm3 #z = z1 + z2

	movapd p2_temp(%rsp),%xmm2
	andpd .L__real_infinity(%rip),%xmm2
	cmppd $0,.L__real_infinity(%rip),%xmm2
	movmskpd %xmm2,%ebx
	test $3,%r8d
	mulpd p2_temp1(%rsp),%xmm3 # result *= 2^n
	# we'd like to avoid a branch, and can use cmp's and and's to
	# eliminate them. But it adds cycles for normal cases which
	# are supposed to be exceptions. Using this branch with the
	# check above results in faster code for the normal cases.
	jnz .L__exp_naninf

	.L__vda_bottom1:
	# store the result _m128d
	test $3,%ebx
	jnz .L__exp_naninf2

	.L__vda_bottom2:

	movapd %xmm3,%xmm1


	#
	#
	.L__final_check:
	mov save_rbx(%rsp),%rbx # restore rbx
	add $stack_size,%rsp
	ret

	# at least one of the numbers needs special treatment
	.L__exp_naninf:
	lea p_temp(%rsp),%rcx
	call .L__naninf
	jmp .L__vda_bottom1
	.L__exp_naninf2:
	lea p2_temp(%rsp),%rcx
	mov %ebx,%r8d
	movapd %xmm0,%xmm4
	movapd %xmm3,%xmm0
	call .L__naninf
	movapd %xmm0,%xmm3
	movapd %xmm4,%xmm0
	jmp .L__vda_bottom2

	# This subroutine checks a double pair for nans and infinities and
	# produces the proper result from the exceptional inputs
	# Register assumptions:
	# Inputs:
	# r8d - mask of errors
	# xmm0 - computed result vector
	# rcx - pointing to memory image of inputs
	# Outputs:
	# xmm0 - new result vector
	# %rax,rdx,,%xmm2 all modified.
	.L__naninf:
	# check the first number
	test $1,%r8d
	jz .L__check2

	mov (%rcx),%rdx
	mov $0x0000FFFFFFFFFFFFF,%rax
	test %rax,%rdx
	jnz .L__enan1 # jump if mantissa not zero, so it's a NaN
	# inf
	mov %rdx,%rax
	rcl $1,%rax
	jnc .L__r1 # exp(+inf) = inf
	xor %rdx,%rdx # exp(-inf) = 0
	jmp .L__r1

	#NaN
	.L__enan1:
	mov $0x00008000000000000,%rax # convert to quiet
	or %rax,%rdx
	.L__r1:
	movd %rdx,%xmm2
	shufpd $2,%xmm0,%xmm2
	movsd %xmm2,%xmm0
	# check the second number
	.L__check2:
	test $2,%r8d
	jz .L__r3
	mov 8(%rcx),%rdx
	mov $0x0000FFFFFFFFFFFFF,%rax
	test %rax,%rdx
	jnz .L__enan2 # jump if mantissa not zero, so it's a NaN
	# inf
	mov %rdx,%rax
	rcl $1,%rax
	jnc .L__r2 # exp(+inf) = inf
	xor %rdx,%rdx # exp(-inf) = 0
	jmp .L__r2

	#NaN
	.L__enan2:
	mov $0x00008000000000000,%rax # convert to quiet
	or %rax,%rdx
	.L__r2:
	movd %rdx,%xmm2
	shufpd $0,%xmm2,%xmm0
	.L__r3:
	ret

	.data
	.align 64

	.L__real_3ff0000000000000: .quad 0x03ff0000000000000 # 1.0
	.quad 0x03ff0000000000000
	.L__real_4040000000000000: .quad 0x04040000000000000 # 32
	.quad 0x04040000000000000
	.L__real_40F0000000000000: .quad 0x040F0000000000000 # 65536, to protect against really large numbers
	.quad 0x040F0000000000000
	.L__real_C0F0000000000000: .quad 0x0C0F0000000000000 # -65536, to protect against really large negative numbers
	.quad 0x0C0F0000000000000
	.L__real_3FA0000000000000: .quad 0x03FA0000000000000 # 1/32
	.quad 0x03FA0000000000000
	.L__real_3fe0000000000000: .quad 0x03fe0000000000000 # 1/2
	.quad 0x03fe0000000000000
	.L__real_infinity: .quad 0x07ff0000000000000 #
	.quad 0x07ff0000000000000
	.L__real_ninfinity: .quad 0x0fff0000000000000 #
	.quad 0x0fff0000000000000
	.L__real_thirtytwo_by_log2: .quad 0x040471547652b82fe # thirtytwo_by_log2
	.quad 0x040471547652b82fe
	.L__real_log2_by_32_lead: .quad 0x03f962e42fe000000 # log2_by_32_lead
	.quad 0x03f962e42fe000000
	.L__real_log2_by_32_tail: .quad 0x0Bdcf473de6af278e # -log2_by_32_tail
	.quad 0x0Bdcf473de6af278e
	.L__real_3f56c1728d739765: .quad 0x03f56c1728d739765 # 1.38889490863777199667e-03
	.quad 0x03f56c1728d739765
	.L__real_3F811115B7AA905E: .quad 0x03F811115B7AA905E # 8.33336798434219616221e-03
	.quad 0x03F811115B7AA905E
	.L__real_3FA5555555545D4E: .quad 0x03FA5555555545D4E # 4.16666666662260795726e-02
	.quad 0x03FA5555555545D4E
	.L__real_3FC5555555548F7C: .quad 0x03FC5555555548F7C # 1.66666666665260878863e-01
	.quad 0x03FC5555555548F7C


	.L__two_to_jby32_lead_table:
	.quad 0x03ff0000000000000 # 1
	.quad 0x03ff059b0d0000000 # 1.0219
	.quad 0x03ff0b55860000000 # 1.04427
	.quad 0x03ff11301d0000000 # 1.06714
	.quad 0x03ff172b830000000 # 1.09051
	.quad 0x03ff1d48730000000 # 1.11439
	.quad 0x03ff2387a60000000 # 1.13879
	.quad 0x03ff29e9df0000000 # 1.16372
	.quad 0x03ff306fe00000000 # 1.18921
	.quad 0x03ff371a730000000 # 1.21525
	.quad 0x03ff3dea640000000 # 1.24186
	.quad 0x03ff44e0860000000 # 1.26905
	.quad 0x03ff4bfdad0000000 # 1.29684
	.quad 0x03ff5342b50000000 # 1.32524
	.quad 0x03ff5ab07d0000000 # 1.35426
	.quad 0x03ff6247eb0000000 # 1.38391
	.quad 0x03ff6a09e60000000 # 1.41421
	.quad 0x03ff71f75e0000000 # 1.44518
	.quad 0x03ff7a11470000000 # 1.47683
	.quad 0x03ff8258990000000 # 1.50916
	.quad 0x03ff8ace540000000 # 1.54221
	.quad 0x03ff93737b0000000 # 1.57598
	.quad 0x03ff9c49180000000 # 1.61049
	.quad 0x03ffa5503b0000000 # 1.64576
	.quad 0x03ffae89f90000000 # 1.68179
	.quad 0x03ffb7f76f0000000 # 1.71862
	.quad 0x03ffc199bd0000000 # 1.75625
	.quad 0x03ffcb720d0000000 # 1.79471
	.quad 0x03ffd5818d0000000 # 1.83401
	.quad 0x03ffdfc9730000000 # 1.87417
	.quad 0x03ffea4afa0000000 # 1.91521
	.quad 0x03fff507650000000 # 1.95714
	.quad 0 # for alignment
	.L__two_to_jby32_trail_table:
	.quad 0x00000000000000000 # 0
	.quad 0x03e48ac2ba1d73e2a # 1.1489e-008
	.quad 0x03e69f3121ec53172 # 4.83347e-008
	.quad 0x03df25b50a4ebbf1b # 2.67125e-010
	.quad 0x03e68faa2f5b9bef9 # 4.65271e-008
	.quad 0x03e368b9aa7805b80 # 5.24924e-009
	.quad 0x03e6ceac470cd83f6 # 5.38622e-008
	.quad 0x03e547f7b84b09745 # 1.90902e-008
	.quad 0x03e64636e2a5bd1ab # 3.79764e-008
	.quad 0x03e5ceaa72a9c5154 # 2.69307e-008
	.quad 0x03e682468446b6824 # 4.49684e-008
	.quad 0x03e18624b40c4dbd0 # 1.41933e-009
	.quad 0x03e54d8a89c750e5e # 1.94147e-008
	.quad 0x03e5a753e077c2a0f # 2.46409e-008
	.quad 0x03e6a90a852b19260 # 4.94813e-008
	.quad 0x03e0d2ac258f87d03 # 8.48872e-010
	.quad 0x03e59fcef32422cbf # 2.42032e-008
	.quad 0x03e61d8bee7ba46e2 # 3.3242e-008
	.quad 0x03e4f580c36bea881 # 1.45957e-008
	.quad 0x03e62999c25159f11 # 3.46453e-008
	.quad 0x03e415506dadd3e2a # 8.0709e-009
	.quad 0x03e29b8bc9e8a0388 # 2.99439e-009
	.quad 0x03e451f8480e3e236 # 9.83622e-009
	.quad 0x03e41f12ae45a1224 # 8.35492e-009
	.quad 0x03e62b5a75abd0e6a # 3.48493e-008
	.quad 0x03e47daf237553d84 # 1.11085e-008
	.quad 0x03e6b0aa538444196 # 5.03689e-008
	.quad 0x03e69df20d22a0798 # 4.81896e-008
	.quad 0x03e69f7490e4bb40b # 4.83654e-008
	.quad 0x03e4bdcdaf5cb4656 # 1.29746e-008
	.quad 0x03e452486cc2c7b9d # 9.84533e-009
	.quad 0x03e66dc8a80ce9f09 # 4.25828e-008
	.quad 0 # for alignment