| |
| # |
| # (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| # |
| # This file is part of libacml_mv. |
| # |
| # libacml_mv is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU Lesser General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # libacml_mv is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # Lesser General Public License for more details. |
| # |
| # You should have received a copy of the GNU Lesser General Public |
| # License along with libacml_mv. If not, see |
| # <http://www.gnu.org/licenses/>. |
| # |
| # |
| |
| |
| |
| |
| |
| # |
| # vrs8log2f.s |
| # |
| #  A vector implementation of the log2f libm function. |
| #  This routine is implemented in single precision.  It is slightly |
| #  less accurate than the double precision version, but it |
| #  vectorizes better. |
| # |
| # Prototype: |
| # |
| # __m128,__m128 __vrs8_log2f(__m128 x1, __m128 x2); |
| # |
| #   Computes the base-2 logarithm of x for eight packed single values. |
| # Places the results into xmm0 and xmm1. |
| # Returns proper C99 values, but may not raise status flags properly. |
| # Less than 1 ulp of error. |
| # |
| #   This array version is basically an unrolling of the by4 scalar single |
| # routine. The second set of operations is performed by the indented |
| # instructions interleaved into the first set. |
| # The scheduling is done by trial and error. The resulting code represents |
| # the best time of many variations. It would seem more interleaving could |
| # be done, as there is a long stretch of the second computation that is not |
| # interleaved. But moving any of this code forward makes the routine |
| # slower. |
| # |
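| # Editor's sketch (hedged pseudo-C) of the per-lane algorithm; the names |
| # xexp, f, index, f1, u, z are informal and only mirror the register |
| # comments in this file: |
| # |
| #   xexp  = ((bits(x) >> 23) & 0xff) - 127;      /* unbiased exponent */ |
| #   f     = 0.5f + mantissa_bits * 0x1p-24f;     /* 0.5 <= f < 1.0 */ |
| #   index = nearest_int(f * 128.0f);             /* 64 <= index <= 128 */ |
| #   f1    = index / 128.0f; |
| #   u     = 2*(f - f1)/(f + f1); |
| #   z     = ln_table[index] + poly(u);           /* z ~= ln(2*f) */ |
| #   result = xexp + z * log2(e);                 /* log2(e) as lead+tail */ |
| # |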
| #ifdef __ELF__ |
| .section .note.GNU-stack,"",@progbits |
| #endif |
| |
| .text |
| .align 16 |
| .p2align 4,,15 |
| |
| # define local variable storage offsets |
| .equ p_x,0 # save x |
| .equ p_idx,0x010 # xmmword index |
| .equ p_z1,0x020 # xmmword z1 (f1 table) values |
| .equ p_q,0x030 # xmmword q (f2 table) values |
| .equ p_corr,0x040 # xmmword near-one correction |
| .equ p_omask,0x050 # xmmword near-one mask |
| .equ save_xmm6,0x060 # |
| .equ save_rbx,0x070 # |
| .equ save_xmm7,0x080 # |
| .equ save_xmm8,0x090 # |
| .equ save_xmm9,0x0a0 # |
| .equ save_xmm10,0x0b0 # |
| .equ save_xmm11,0x0c0 # |
| .equ save_xmm12,0x0d0 # |
| .equ save_xmm13,0x0e0 # |
| .equ p_x2,0x0100 # save x2 |
| .equ p_idx2,0x0110 # xmmword index |
| .equ p_z12,0x0120 # xmmword z1 (f1 table) values |
| .equ p_q2,0x0130 # xmmword q (f2 table) values |
| |
| .equ stack_size,0x0168 |
| |
| |
| |
| .globl __vrs8_log2f |
| .type __vrs8_log2f,@function |
| __vrs8_log2f: |
| sub $stack_size,%rsp |
| mov %rbx,save_rbx(%rsp) # save rbx |
| |
| # check e as a special case |
| movdqa %xmm0,p_x(%rsp) # save x |
| movdqa %xmm1,p_x2(%rsp) # save x2 |
| # movdqa %xmm0,%xmm2 |
| # cmpps $0,.L__real_ef(%rip),%xmm2 |
| # movmskps %xmm2,%r9d |
| |
| movdqa %xmm1,%xmm12 |
| movdqa %xmm1,%xmm9 |
| movaps %xmm1,%xmm7 |
| |
| # |
| # compute the index into the log tables |
| # |
| movdqa %xmm0,%xmm3 |
| movaps %xmm0,%xmm1 |
| psrld $23,%xmm3 |
| |
| # |
| # compute the index into the log tables |
| # |
| psrld $23,%xmm9 |
| subps .L__real_one(%rip),%xmm7 |
| psubd .L__mask_127(%rip),%xmm9 |
| subps .L__real_one(%rip),%xmm1 |
| psubd .L__mask_127(%rip),%xmm3 |
| cvtdq2ps %xmm9,%xmm13 # xexp |
| |
| movdqa %xmm12,%xmm9 |
| pand .L__real_mant(%rip),%xmm9 |
| xor %r8,%r8 |
| movdqa %xmm9,%xmm8 |
| movaps .L__real_half(%rip),%xmm11 # .5 |
| cvtdq2ps %xmm3,%xmm6 # xexp |
| |
| movdqa %xmm0,%xmm3 |
| pand .L__real_mant(%rip),%xmm3 |
| xor %r8,%r8 |
| movdqa %xmm3,%xmm2 |
| movaps .L__real_half(%rip),%xmm5 # .5 |
| |
| #/* Now x = 2**xexp * f, 1/2 <= f < 1. */ |
| psrld $16,%xmm3 |
| lea .L__np_ln_lead_table(%rip),%rdx |
| movdqa %xmm3,%xmm4 |
| psrld $16,%xmm9 |
| movdqa %xmm9,%xmm10 |
| psrld $1,%xmm9 |
| psrld $1,%xmm3 |
| paddd .L__mask_040(%rip),%xmm3 |
| pand .L__mask_001(%rip),%xmm4 |
| paddd %xmm4,%xmm3 |
| cvtdq2ps %xmm3,%xmm1 |
| #/* Now x = 2**xexp * f, 1/2 <= f < 1. */ |
| paddd .L__mask_040(%rip),%xmm9 |
| pand .L__mask_001(%rip),%xmm10 |
| paddd %xmm10,%xmm9 |
| cvtdq2ps %xmm9,%xmm7 |
| packssdw %xmm3,%xmm3 |
| movq %xmm3,p_idx(%rsp) |
| packssdw %xmm9,%xmm9 |
| movq %xmm9,p_idx2(%rsp) |
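| # Editor's note (hedged): per lane, the index construction above is |
| #   top7 = mantissa_bits >> 16;               /* 7 high mantissa bits */ |
| #   index = 0x40 + (top7 >> 1) + (top7 & 1); /* round to nearest 1/128 */ |
| # and packssdw narrows the four dword indexes to 16-bit words so that all |
| # four fit in one quadword (p_idx/p_idx2) for the scalar table walk below. |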
| |
| |
| # reduce and get u |
| movdqa %xmm0,%xmm3 |
| orps .L__real_half(%rip),%xmm2 |
| |
| |
| mulps .L__real_3c000000(%rip),%xmm1 # f1 = index/128 |
| # reduce and get u |
| |
| |
| subps %xmm1,%xmm2 # f2 = f - f1 |
| mulps %xmm2,%xmm5 |
| addps %xmm5,%xmm1 |
| |
| divps %xmm1,%xmm2 # u |
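| # Editor's note (hedged): with f2 = f - f1, the steps above compute |
| #   u = f2/(f1 + 0.5f*f2) = 2*(f - f1)/(f + f1) |
| # so that ln(f/f1) = 2*atanh(u/2), which the cb1..cb3 polynomial below |
| # expands. |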
| |
| movdqa %xmm12,%xmm9 |
| orps .L__real_half(%rip),%xmm8 |
| |
| |
| mulps .L__real_3c000000(%rip),%xmm7 # f1 = index/128 |
| subps %xmm7,%xmm8 # f2 = f - f1 |
| mulps %xmm8,%xmm11 |
| addps %xmm11,%xmm7 |
| |
| |
| mov p_idx(%rsp),%rcx # get the indexes |
| mov %cx,%r8w |
| ror $16,%rcx |
| mov -256(%rdx,%r8,4),%eax # get the f1 value |
| |
| mov %cx,%r8w |
| ror $16,%rcx |
| mov -256(%rdx,%r8,4),%ebx # get the f1 value |
| shl $32,%rbx |
| or %rbx,%rax |
| mov %rax,p_z1(%rsp) # save the f1 values |
| |
| mov %cx,%r8w |
| ror $16,%rcx |
| mov -256(%rdx,%r8,4),%eax # get the f1 value |
| |
| mov %cx,%r8w |
| ror $16,%rcx |
| or -256(%rdx,%r8,4),%ebx # get the f1 value |
| shl $32,%rbx |
| or %rbx,%rax |
| mov %rax,p_z1+8(%rsp) # save the f1 value |
| divps %xmm7,%xmm8 # u |
| lea .L__np_ln_lead_table(%rip),%rdx |
| mov p_idx2(%rsp),%rcx # get the indexes |
| mov %cx,%r8w |
| ror $16,%rcx |
| mov -256(%rdx,%r8,4),%eax # get the f1 value |
| |
| mov %cx,%r8w |
| ror $16,%rcx |
| mov -256(%rdx,%r8,4),%ebx # get the f1 value |
| shl $32,%rbx |
| or %rbx,%rax |
| mov %rax,p_z12(%rsp) # save the f1 values |
| |
| mov %cx,%r8w |
| ror $16,%rcx |
| mov -256(%rdx,%r8,4),%eax # get the f1 value |
| |
| mov %cx,%r8w |
| ror $16,%rcx |
| or -256(%rdx,%r8,4),%ebx # get the f1 value |
| shl $32,%rbx |
| or %rbx,%rax |
| mov %rax,p_z12+8(%rsp) # save the f1 value |
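| # Editor's note (hedged): the scalar stretch above performs the table |
| # lookups.  Each quadword loaded from p_idx/p_idx2 holds four 16-bit |
| # indexes; "mov %cx,%r8w" peels one off the bottom and "ror $16,%rcx" |
| # rotates the next one down.  The -256 displacement rebases the 64..128 |
| # index range onto table entries 0..64, and two 32-bit values are glued |
| # into one 64-bit store, roughly: |
| #   *(uint64_t *)p_z1 = lead[i0] | ((uint64_t)lead[i1] << 32); |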
| # solve for ln(1+u) |
| movaps %xmm2,%xmm1 # u |
| mulps %xmm2,%xmm2 # u^2 |
| movaps %xmm2,%xmm5 |
| movaps .L__real_cb3(%rip),%xmm3 |
| mulps %xmm2,%xmm3 #Cu2 |
| mulps %xmm1,%xmm5 # u^3 |
| addps .L__real_cb2(%rip),%xmm3 #B+Cu2 |
| movaps %xmm2,%xmm4 |
| mulps %xmm5,%xmm4 # u^5 |
| movaps .L__real_log2e_lead(%rip),%xmm2 |
| |
| mulps .L__real_cb1(%rip),%xmm5 #Au3 |
| addps %xmm5,%xmm1 # u+Au3 |
| mulps %xmm3,%xmm4 # u5(B+Cu2) |
| movaps .L__real_log2e_tail(%rip),%xmm3 |
| lea .L__np_ln_tail_table(%rip),%rdx |
| addps %xmm4,%xmm1 # poly |
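| # Editor's note (hedged): the polynomial just formed is |
| #   ln(f/f1) ~= u + cb1*u^3 + u^5*(cb2 + cb3*u^2) |
| # i.e. the odd series 2*atanh(u/2) = u + u^3/12 + u^5/80 + u^7/448 + ... |
| # with cb1 ~ 1/12, cb2 ~ 1/80, cb3 ~ 1/448 (minimax-adjusted). |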
| |
| # recombine |
| mov p_idx(%rsp),%rcx # get the indexes |
| mov %cx,%r8w |
| shr $16,%rcx |
| mov -256(%rdx,%r8,4),%eax # get the f2 value |
| |
| mov %cx,%r8w |
| shr $16,%rcx |
| or -256(%rdx,%r8,4),%ebx # get the f2 value |
| shl $32,%rbx |
| or %rbx,%rax |
| mov %rax,p_q(%rsp) # save the f2 value |
| |
| mov %cx,%r8w |
| shr $16,%rcx |
| mov -256(%rdx,%r8,4),%eax # get the f2 value |
| |
| mov %cx,%r8w |
| mov -256(%rdx,%r8,4),%ebx # get the f2 value |
| shl $32,%rbx |
| or %rbx,%rax |
| mov %rax,p_q+8(%rsp) # save the f2 value |
| |
| addps p_q(%rsp),%xmm1 #z2 +=q |
| movaps %xmm1,%xmm4 #z2 copy |
| movaps p_z1(%rsp),%xmm0 # z1 values |
| movaps %xmm0,%xmm5 #z1 copy |
| |
| mulps %xmm2,%xmm5 #z1*log2e_lead |
| mulps %xmm2,%xmm1 #z2*log2e_lead |
| mulps %xmm3,%xmm4 #z2*log2e_tail |
| mulps %xmm3,%xmm0 #z1*log2e_tail |
| addps %xmm6,%xmm5 #r1 = z1*log2e_lead + xexp |
| addps %xmm4,%xmm0 #z1*log2e_tail + z2*log2e_tail |
| addps %xmm1,%xmm0 #r2 |
| #return r1+r2 |
| addps %xmm5,%xmm0 # r1+ r2 |
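| # Editor's note (hedged): the recombination above computes |
| #   log2(x) = xexp + (z1 + z2)*log2(e) |
| # with log2(e) split into log2e_lead = 1.4375 (a short, exactly |
| # representable mantissa) and log2e_tail, so the dominant product |
| # z1*log2e_lead stays accurate: |
| #   r1 = z1*lead + xexp;  r2 = (z1*tail + z2*tail) + z2*lead; |
| #   result = r1 + r2; |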
| |
| |
| |
| # check for e |
| # test $0x0f,%r9d |
| # jnz .L__vlogf_e |
| .L__f1: |
| |
| # check for negative numbers or zero |
| xorps %xmm1,%xmm1 |
| cmpps $1,p_x(%rsp),%xmm1 # 0 < x ? (predicate 1 = LT); false for NaNs, so they are caught too |
| movmskps %xmm1,%r9d |
| cmp $0x0f,%r9d |
| jnz .L__z_or_neg |
| |
| .L__f2: |
| ## if +inf |
| movaps p_x(%rsp),%xmm3 |
| cmpps $0,.L__real_inf(%rip),%xmm3 |
| movmskps %xmm3,%r9d |
| test $0x0f,%r9d |
| jnz .L__log_inf |
| .L__f3: |
| |
| movaps p_x(%rsp),%xmm3 |
| subps .L__real_one(%rip),%xmm3 |
| andps .L__real_notsign(%rip),%xmm3 |
| cmpps $2,.L__real_threshold(%rip),%xmm3 |
| movmskps %xmm3,%r9d |
| test $0x0f,%r9d |
| jnz .L__near_one |
| .L__f4: |
| |
| # finish the second set of calculations |
| |
| # solve for ln(1+u) |
| movaps %xmm8,%xmm7 # u |
| mulps %xmm8,%xmm8 # u^2 |
| movaps %xmm8,%xmm11 |
| |
| movaps .L__real_cb3(%rip),%xmm9 |
| mulps %xmm8,%xmm9 #Cu2 |
| mulps %xmm7,%xmm11 # u^3 |
| addps .L__real_cb2(%rip),%xmm9 #B+Cu2 |
| movaps %xmm8,%xmm10 |
| mulps %xmm11,%xmm10 # u^5 |
| movaps .L__real_log2e_lead(%rip),%xmm8 |
| |
| mulps .L__real_cb1(%rip),%xmm11 #Au3 |
| addps %xmm11,%xmm7 # u+Au3 |
| mulps %xmm9,%xmm10 # u5(B+Cu2) |
| movaps .L__real_log2e_tail(%rip),%xmm9 |
| addps %xmm10,%xmm7 # poly |
| |
| |
| # recombine |
| lea .L__np_ln_tail_table(%rip),%rdx |
| mov p_idx2(%rsp),%rcx # get the indexes |
| mov %cx,%r8w |
| shr $16,%rcx |
| mov -256(%rdx,%r8,4),%eax # get the f2 value |
| |
| mov %cx,%r8w |
| shr $16,%rcx |
| or -256(%rdx,%r8,4),%ebx # get the f2 value |
| shl $32,%rbx |
| or %rbx,%rax |
| mov %rax,p_q2(%rsp) # save the f2 value |
| |
| mov %cx,%r8w |
| shr $16,%rcx |
| mov -256(%rdx,%r8,4),%eax # get the f2 value |
| |
| mov %cx,%r8w |
| mov -256(%rdx,%r8,4),%ebx # get the f2 value |
| shl $32,%rbx |
| or %rbx,%rax |
| mov %rax,p_q2+8(%rsp) # save the f2 value |
| addps p_q2(%rsp),%xmm7 #z2 +=q |
| movaps %xmm7,%xmm10 #z2 copy |
| movaps p_z12(%rsp),%xmm1 # z1 values |
| movaps %xmm1,%xmm11 #z1 copy |
| |
| mulps %xmm8,%xmm11 #z1*log2e_lead |
| mulps %xmm8,%xmm7 #z2*log2e_lead |
| mulps %xmm9,%xmm10 #z2*log2e_tail |
| mulps %xmm9,%xmm1 #z1*log2e_tail |
| addps %xmm13,%xmm11 #r1 = z1*log2e_lead + xexp |
| addps %xmm10,%xmm1 #z1*log2e_tail + z2*log2e_tail |
| addps %xmm7,%xmm1 #r2 |
| #return r1+r2 |
| addps %xmm11,%xmm1 # r1+ r2 |
| |
| # check e as a special case |
| # movaps p_x2(%rsp),%xmm10 |
| # cmpps $0,.L__real_ef(%rip),%xmm10 |
| # movmskps %xmm10,%r9d |
| # check for e |
| # test $0x0f,%r9d |
| # jnz .L__vlogf_e2 |
| .L__f12: |
| |
| # check for negative numbers or zero |
| xorps %xmm7,%xmm7 |
| cmpps $1,p_x2(%rsp),%xmm7 # 0 < x ? (predicate 1 = LT); false for NaNs, so they are caught too |
| movmskps %xmm7,%r9d |
| cmp $0x0f,%r9d |
| jnz .L__z_or_neg2 |
| |
| .L__f22: |
| ## if +inf |
| movaps p_x2(%rsp),%xmm9 |
| cmpps $0,.L__real_inf(%rip),%xmm9 |
| movmskps %xmm9,%r9d |
| test $0x0f,%r9d |
| jnz .L__log_inf2 |
| .L__f32: |
| |
| movaps p_x2(%rsp),%xmm9 |
| subps .L__real_one(%rip),%xmm9 |
| andps .L__real_notsign(%rip),%xmm9 |
| cmpps $2,.L__real_threshold(%rip),%xmm9 |
| movmskps %xmm9,%r9d |
| test $0x0f,%r9d |
| jnz .L__near_one2 |
| .L__f42: |
| |
| |
| .L__finish: |
| mov save_rbx(%rsp),%rbx # restore rbx |
| add $stack_size,%rsp |
| ret |
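| |
| # The special-case tails below all patch results with the same branchless |
| # select idiom (editor's note, hedged C sketch): |
| #   result = (~mask & computed) | (mask & special); /* andnps/andps/orps */ |
| # where mask comes from a cmpps/pcmpeqd test of the saved inputs. |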
| |
| .L__vlogf_e: |
| movdqa p_x(%rsp),%xmm2 |
| cmpps $0,.L__real_ef(%rip),%xmm2 |
| movdqa %xmm2,%xmm3 |
| andnps %xmm0,%xmm3 # keep the non-e values |
| andps .L__real_one(%rip),%xmm2 # setup the 1 values |
| orps %xmm3,%xmm2 # merge |
| movdqa %xmm2,%xmm0 # and replace |
| jmp .L__f1 |
| |
| .L__vlogf_e2: |
| movdqa p_x2(%rsp),%xmm2 |
| cmpps $0,.L__real_ef(%rip),%xmm2 |
| movdqa %xmm2,%xmm3 |
| andnps %xmm1,%xmm3 # keep the non-e values |
| andps .L__real_one(%rip),%xmm2 # setup the 1 values |
| orps %xmm3,%xmm2 # merge |
| movdqa %xmm2,%xmm1 # and replace |
| jmp .L__f12 |
| |
| .align 16 |
| .L__near_one: |
| # saves 10 cycles |
| # r = x - 1.0; |
| movdqa %xmm3,p_omask(%rsp) # save ones mask |
| movaps p_x(%rsp),%xmm3 |
| movaps .L__real_two(%rip),%xmm2 |
| subps .L__real_one(%rip),%xmm3 # r |
| # u = r / (2.0 + r); |
| addps %xmm3,%xmm2 |
| movaps %xmm3,%xmm1 |
| divps %xmm2,%xmm1 # u |
| movaps .L__real_ca4(%rip),%xmm4 #D |
| movaps .L__real_ca3(%rip),%xmm5 #C |
| # correction = r * u; |
| movaps %xmm3,%xmm6 |
| mulps %xmm1,%xmm6 # correction |
| movdqa %xmm6,p_corr(%rsp) # save correction |
| # u = u + u; |
| addps %xmm1,%xmm1 #u |
| movaps %xmm1,%xmm2 |
| mulps %xmm2,%xmm2 #v =u^2 |
| # r2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction); |
| mulps %xmm1,%xmm5 # Cu |
| movaps %xmm1,%xmm6 |
| mulps %xmm2,%xmm6 # u^3 |
| mulps .L__real_ca2(%rip),%xmm2 #Bu^2 |
| mulps %xmm6,%xmm4 #Du^3 |
| |
| addps .L__real_ca1(%rip),%xmm2 # +A |
| movaps %xmm6,%xmm1 |
| mulps %xmm1,%xmm1 # u^6 |
| addps %xmm4,%xmm5 #Cu+Du3 |
| |
| mulps %xmm6,%xmm2 #u3(A+Bu2) |
| mulps %xmm5,%xmm1 #u6(Cu+Du3) |
| addps %xmm1,%xmm2 |
| subps p_corr(%rsp),%xmm2 # -correction |
| |
| # loge to log2 |
| movaps %xmm3,%xmm5 #r1=r |
| pand .L__mask_lower(%rip),%xmm5 |
| subps %xmm5,%xmm3 |
| addps %xmm3,%xmm2 #r2 = r2 + (r-r1) |
| |
| movaps %xmm5,%xmm3 |
| movaps %xmm2,%xmm1 |
| |
| mulps .L__real_log2e_tail(%rip),%xmm2 |
| mulps .L__real_log2e_tail(%rip),%xmm3 |
| mulps .L__real_log2e_lead(%rip),%xmm1 |
| mulps .L__real_log2e_lead(%rip),%xmm5 |
| addps %xmm2,%xmm3 |
| addps %xmm1,%xmm3 |
| addps %xmm5,%xmm3 |
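| # Editor's note (hedged): .L__mask_lower splits r into r1 (the upper 16 |
| # bits of the value, so r1*log2e_lead is close to exact) and r - r1, which |
| # is folded into the series term; the sum just formed is |
| #   log2(x) ~= r1*lead + r2*lead + r1*tail + r2*tail = (r1 + r2)*log2(e) |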
| |
| # return r + r2; |
| # addps %xmm2,%xmm3 |
| |
| movdqa p_omask(%rsp),%xmm6 |
| movdqa %xmm6,%xmm2 |
| andnps %xmm0,%xmm6 # keep the non-nearone values |
| andps %xmm3,%xmm2 # setup the nearone values |
| orps %xmm6,%xmm2 # merge |
| movdqa %xmm2,%xmm0 # and replace |
| |
| jmp .L__f4 |
| |
| |
| .align 16 |
| .L__near_one2: |
| # saves 10 cycles |
| # r = x - 1.0; |
| movdqa %xmm9,p_omask(%rsp) # save ones mask |
| movaps p_x2(%rsp),%xmm3 |
| movaps .L__real_two(%rip),%xmm2 |
| subps .L__real_one(%rip),%xmm3 # r |
| # u = r / (2.0 + r); |
| addps %xmm3,%xmm2 |
| movaps %xmm3,%xmm7 |
| divps %xmm2,%xmm7 # u |
| movaps .L__real_ca4(%rip),%xmm4 #D |
| movaps .L__real_ca3(%rip),%xmm5 #C |
| # correction = r * u; |
| movaps %xmm3,%xmm6 |
| mulps %xmm7,%xmm6 # correction |
| movdqa %xmm6,p_corr(%rsp) # save correction |
| # u = u + u; |
| addps %xmm7,%xmm7 #u |
| movaps %xmm7,%xmm2 |
| mulps %xmm2,%xmm2 #v =u^2 |
| # r2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction); |
| mulps %xmm7,%xmm5 # Cu |
| movaps %xmm7,%xmm6 |
| mulps %xmm2,%xmm6 # u^3 |
| mulps .L__real_ca2(%rip),%xmm2 #Bu^2 |
| mulps %xmm6,%xmm4 #Du^3 |
| |
| addps .L__real_ca1(%rip),%xmm2 # +A |
| movaps %xmm6,%xmm7 |
| mulps %xmm7,%xmm7 # u^6 |
| addps %xmm4,%xmm5 #Cu+Du3 |
| |
| mulps %xmm6,%xmm2 #u3(A+Bu2) |
| mulps %xmm5,%xmm7 #u6(Cu+Du3) |
| addps %xmm7,%xmm2 |
| subps p_corr(%rsp),%xmm2 # -correction |
| |
| # loge to log2 |
| movaps %xmm3,%xmm5 #r1=r |
| pand .L__mask_lower(%rip),%xmm5 |
| subps %xmm5,%xmm3 |
| addps %xmm3,%xmm2 #r2 = r2 + (r-r1) |
| |
| movaps %xmm5,%xmm3 |
| movaps %xmm2,%xmm7 |
| |
| mulps .L__real_log2e_tail(%rip),%xmm2 |
| mulps .L__real_log2e_tail(%rip),%xmm3 |
| mulps .L__real_log2e_lead(%rip),%xmm7 |
| mulps .L__real_log2e_lead(%rip),%xmm5 |
| addps %xmm2,%xmm3 |
| addps %xmm7,%xmm3 |
| addps %xmm5,%xmm3 |
| |
| # return r + r2; |
| # addps %xmm2,%xmm3 |
| |
| movdqa p_omask(%rsp),%xmm6 |
| movdqa %xmm6,%xmm2 |
| andnps %xmm1,%xmm6 # keep the non-nearone values |
| andps %xmm3,%xmm2 # setup the nearone values |
| orps %xmm6,%xmm2 # merge |
| movdqa %xmm2,%xmm1 # and replace |
| |
| jmp .L__f42 |
| |
| # we have a zero, a negative number, or both. |
| # the mask is already in %xmm1.  NaNs are also picked up here, along with -inf. |
| .L__z_or_neg: |
| # deal with negatives first |
| movdqa %xmm1,%xmm3 |
| andps %xmm0,%xmm3 # keep the non-error values |
| andnps .L__real_nan(%rip),%xmm1 # setup the nan values |
| orps %xmm3,%xmm1 # merge |
| movdqa %xmm1,%xmm0 # and replace |
| # check for +/- 0 |
| xorps %xmm1,%xmm1 |
| cmpps $0,p_x(%rsp),%xmm1 # 0 ?. |
| movmskps %xmm1,%r9d |
| test $0x0f,%r9d |
| jz .L__zn2 |
| |
| movdqa %xmm1,%xmm3 |
| andnps %xmm0,%xmm3 # keep the non-error values |
| andps .L__real_ninf(%rip),%xmm1 # ; C99 specs -inf for +-0 |
| orps %xmm3,%xmm1 # merge |
| movdqa %xmm1,%xmm0 # and replace |
| |
| .L__zn2: |
| # check for NaNs |
| movaps p_x(%rsp),%xmm3 |
| andps .L__real_inf(%rip),%xmm3 |
| cmpps $0,.L__real_inf(%rip),%xmm3 # mask for max exponent |
| |
| movdqa p_x(%rsp),%xmm4 |
| pand .L__real_mant(%rip),%xmm4 # mask for non-zero mantissa |
| pcmpeqd .L__real_zero(%rip),%xmm4 |
| pandn %xmm3,%xmm4 # mask for NaNs |
| movdqa %xmm4,%xmm2 |
| movdqa p_x(%rsp),%xmm1 # isolate the NaNs |
| pand %xmm4,%xmm1 |
| |
| pand .L__real_qnanbit(%rip),%xmm4 # now we have a mask that will set QNaN bit |
| por %xmm1,%xmm4 # turn SNaNs to QNaNs |
| |
| movdqa %xmm2,%xmm1 |
| andnps %xmm0,%xmm2 # keep the non-error values |
| orps %xmm4,%xmm2 # merge |
| movdqa %xmm2,%xmm0 # and replace |
| xorps %xmm4,%xmm4 |
| |
| jmp .L__f2 |
| |
| # handle only +inf log(+inf) = inf |
| .L__log_inf: |
| movdqa %xmm3,%xmm1 |
| andnps %xmm0,%xmm3 # keep the non-error values |
| andps p_x(%rsp),%xmm1 # setup the +inf values |
| orps %xmm3,%xmm1 # merge |
| movdqa %xmm1,%xmm0 # and replace |
| jmp .L__f3 |
| |
| |
| .L__z_or_neg2: |
| # deal with negatives first |
| movdqa %xmm7,%xmm3 |
| andps %xmm1,%xmm3 # keep the non-error values |
| andnps .L__real_nan(%rip),%xmm7 # setup the nan values |
| orps %xmm3,%xmm7 # merge |
| movdqa %xmm7,%xmm1 # and replace |
| # check for +/- 0 |
| xorps %xmm7,%xmm7 |
| cmpps $0,p_x2(%rsp),%xmm7 # 0 ?. |
| movmskps %xmm7,%r9d |
| test $0x0f,%r9d |
| jz .L__zn22 |
| |
| movdqa %xmm7,%xmm3 |
| andnps %xmm1,%xmm3 # keep the non-error values |
| andps .L__real_ninf(%rip),%xmm7 # ; C99 specs -inf for +-0 |
| orps %xmm3,%xmm7 # merge |
| movdqa %xmm7,%xmm1 # and replace |
| |
| .L__zn22: |
| # check for NaNs |
| movaps p_x2(%rsp),%xmm3 |
| andps .L__real_inf(%rip),%xmm3 |
| cmpps $0,.L__real_inf(%rip),%xmm3 # mask for max exponent |
| |
| movdqa p_x2(%rsp),%xmm4 |
| pand .L__real_mant(%rip),%xmm4 # mask for non-zero mantissa |
| pcmpeqd .L__real_zero(%rip),%xmm4 |
| pandn %xmm3,%xmm4 # mask for NaNs |
| movdqa %xmm4,%xmm2 |
| movdqa p_x2(%rsp),%xmm7 # isolate the NaNs |
| pand %xmm4,%xmm7 |
| |
| pand .L__real_qnanbit(%rip),%xmm4 # now we have a mask that will set QNaN bit |
| por %xmm7,%xmm4 # turn SNaNs to QNaNs |
| |
| movdqa %xmm2,%xmm7 |
| andnps %xmm1,%xmm2 # keep the non-error values |
| orps %xmm4,%xmm2 # merge |
| movdqa %xmm2,%xmm1 # and replace |
| xorps %xmm4,%xmm4 |
| |
| jmp .L__f22 |
| |
| # handle only +inf log(+inf) = inf |
| .L__log_inf2: |
| movdqa %xmm9,%xmm7 |
| andnps %xmm1,%xmm9 # keep the non-error values |
| andps p_x2(%rsp),%xmm7 # setup the +inf values |
| orps %xmm9,%xmm7 # merge |
| movdqa %xmm7,%xmm1 # and replace |
| jmp .L__f32 |
| |
| |
| .data |
| .align 64 |
| |
| .L__real_zero: .quad 0x00000000000000000 # 0.0 |
| .quad 0x00000000000000000 |
| .L__real_one: .quad 0x03f8000003f800000 # 1.0 |
| .quad 0x03f8000003f800000 |
| .L__real_two: .quad 0x04000000040000000 # 2.0 |
| .quad 0x04000000040000000 |
| .L__real_ninf: .quad 0x0ff800000ff800000 # -inf |
| .quad 0x0ff800000ff800000 |
| .L__real_inf: .quad 0x07f8000007f800000 # +inf |
| .quad 0x07f8000007f800000 |
| .L__real_nan: .quad 0x07fc000007fc00000 # NaN |
| .quad 0x07fc000007fc00000 |
| .L__real_ef: .quad 0x0402DF854402DF854 # float e |
| .quad 0x0402DF854402DF854 |
| |
| .L__real_sign: .quad 0x08000000080000000 # sign bit |
| .quad 0x08000000080000000 |
| .L__real_notsign: .quad 0x07ffFFFFF7ffFFFFF # ^sign bit |
| .quad 0x07ffFFFFF7ffFFFFF |
| .L__real_qnanbit: .quad 0x00040000000400000 # quiet nan bit |
| .quad 0x00040000000400000 |
| .L__real_mant: .quad 0x0007FFFFF007FFFFF # mantissa bits |
| .quad 0x0007FFFFF007FFFFF |
| .L__real_3c000000: .quad 0x03c0000003c000000 # /* 0.0078125 = 1/128 */ |
| .quad 0x03c0000003c000000 |
| .L__mask_127: .quad 0x00000007f0000007f # |
| .quad 0x00000007f0000007f |
| .L__mask_040: .quad 0x00000004000000040 # |
| .quad 0x00000004000000040 |
| .L__mask_001: .quad 0x00000000100000001 # |
| .quad 0x00000000100000001 |
| |
| |
| .L__real_threshold: .quad 0x03CF5C28F3CF5C28F # .03 |
| .quad 0x03CF5C28F3CF5C28F |
| |
| .L__real_ca1: .quad 0x03DAAAAAB3DAAAAAB # 8.33333333333317923934e-02 |
| .quad 0x03DAAAAAB3DAAAAAB |
| .L__real_ca2: .quad 0x03C4CCCCD3C4CCCCD # 1.25000000037717509602e-02 |
| .quad 0x03C4CCCCD3C4CCCCD |
| .L__real_ca3: .quad 0x03B1249183B124918 # 2.23213998791944806202e-03 |
| .quad 0x03B1249183B124918 |
| .L__real_ca4: .quad 0x039E401A639E401A6 # 4.34887777707614552256e-04 |
| .quad 0x039E401A639E401A6 |
| .L__real_cb1: .quad 0x03DAAAAAB3DAAAAAB # 8.33333333333333593622e-02 |
| .quad 0x03DAAAAAB3DAAAAAB |
| .L__real_cb2: .quad 0x03C4CCCCD3C4CCCCD # 1.24999999978138668903e-02 |
| .quad 0x03C4CCCCD3C4CCCCD |
| .L__real_cb3: .quad 0x03B124A123B124A12 # 2.23219810758559851206e-03 |
| .quad 0x03B124A123B124A12 |
| .L__real_log2_lead: .quad 0x03F3170003F317000 # 0.693115234375 |
| .quad 0x03F3170003F317000 |
| .L__real_log2_tail: .quad 0x03805FDF43805FDF4 # 0.000031946183 |
| .quad 0x03805FDF43805FDF4 |
| .L__real_half: .quad 0x03f0000003f000000 # 1/2 |
| .quad 0x03f0000003f000000 |
| .L__real_log2e_lead: .quad 0x03FB800003FB80000 #1.4375000000 |
| .quad 0x03FB800003FB80000 |
| .L__real_log2e_tail: .quad 0x03BAA3B293BAA3B29 # 0.0051950408889633 |
| .quad 0x03BAA3B293BAA3B29 |
| |
| .L__mask_lower: .quad 0x0ffff0000ffff0000 # |
| .quad 0x0ffff0000ffff0000 |
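| # Editor's note (hedged): .L__np_ln_lead_table/.L__np_ln_tail_table hold |
| # ln(1 + j/64) for j = 0..64, split into a lead value with its low mantissa |
| # bits zeroed (e.g. 0x3F317000 = 0.693115234375 at j = 64) plus a small |
| # tail, so lead+tail carries each table value beyond single precision. |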
| |
| .L__np_ln__table: |
| .quad 0x0000000000000000 # 0.00000000000000000000e+00 |
| .quad 0x3F8FC0A8B0FC03E4 # 1.55041813850402832031e-02 |
| .quad 0x3F9F829B0E783300 # 3.07716131210327148438e-02 |
| .quad 0x3FA77458F632DCFC # 4.58095073699951171875e-02 |
| .quad 0x3FAF0A30C01162A6 # 6.06245994567871093750e-02 |
| .quad 0x3FB341D7961BD1D1 # 7.52233862876892089844e-02 |
| .quad 0x3FB6F0D28AE56B4C # 8.96121263504028320312e-02 |
| .quad 0x3FBA926D3A4AD563 # 1.03796780109405517578e-01 |
| .quad 0x3FBE27076E2AF2E6 # 1.17783010005950927734e-01 |
| .quad 0x3FC0D77E7CD08E59 # 1.31576299667358398438e-01 |
| .quad 0x3FC29552F81FF523 # 1.45181953907012939453e-01 |
| .quad 0x3FC44D2B6CCB7D1E # 1.58604979515075683594e-01 |
| .quad 0x3FC5FF3070A793D4 # 1.71850204467773437500e-01 |
| .quad 0x3FC7AB890210D909 # 1.84922337532043457031e-01 |
| .quad 0x3FC9525A9CF456B4 # 1.97825729846954345703e-01 |
| .quad 0x3FCAF3C94E80BFF3 # 2.10564732551574707031e-01 |
| .quad 0x3FCC8FF7C79A9A22 # 2.23143517971038818359e-01 |
| .quad 0x3FCE27076E2AF2E6 # 2.35566020011901855469e-01 |
| .quad 0x3FCFB9186D5E3E2B # 2.47836112976074218750e-01 |
| .quad 0x3FD0A324E27390E3 # 2.59957492351531982422e-01 |
| .quad 0x3FD1675CABABA60E # 2.71933674812316894531e-01 |
| .quad 0x3FD22941FBCF7966 # 2.83768117427825927734e-01 |
| .quad 0x3FD2E8E2BAE11D31 # 2.95464158058166503906e-01 |
| .quad 0x3FD3A64C556945EA # 3.07025015354156494141e-01 |
| .quad 0x3FD4618BC21C5EC2 # 3.18453729152679443359e-01 |
| .quad 0x3FD51AAD872DF82D # 3.29753279685974121094e-01 |
| .quad 0x3FD5D1BDBF5809CA # 3.40926527976989746094e-01 |
| .quad 0x3FD686C81E9B14AF # 3.51976394653320312500e-01 |
| .quad 0x3FD739D7F6BBD007 # 3.62905442714691162109e-01 |
| .quad 0x3FD7EAF83B82AFC3 # 3.73716354370117187500e-01 |
| .quad 0x3FD89A3386C1425B # 3.84411692619323730469e-01 |
| .quad 0x3FD947941C2116FB # 3.94993782043457031250e-01 |
| .quad 0x3FD9F323ECBF984C # 4.05465066432952880859e-01 |
| .quad 0x3FDA9CEC9A9A084A # 4.15827870368957519531e-01 |
| .quad 0x3FDB44F77BCC8F63 # 4.26084339618682861328e-01 |
| .quad 0x3FDBEB4D9DA71B7C # 4.36236739158630371094e-01 |
| .quad 0x3FDC8FF7C79A9A22 # 4.46287095546722412109e-01 |
| .quad 0x3FDD32FE7E00EBD5 # 4.56237375736236572266e-01 |
| .quad 0x3FDDD46A04C1C4A1 # 4.66089725494384765625e-01 |
| .quad 0x3FDE744261D68788 # 4.75845873355865478516e-01 |
| .quad 0x3FDF128F5FAF06ED # 4.85507786273956298828e-01 |
| .quad 0x3FDFAF588F78F31F # 4.95077252388000488281e-01 |
| .quad 0x3FE02552A5A5D0FF # 5.04556000232696533203e-01 |
| .quad 0x3FE0723E5C1CDF40 # 5.13945698738098144531e-01 |
| .quad 0x3FE0BE72E4252A83 # 5.23248136043548583984e-01 |
| .quad 0x3FE109F39E2D4C97 # 5.32464742660522460938e-01 |
| .quad 0x3FE154C3D2F4D5EA # 5.41597247123718261719e-01 |
| .quad 0x3FE19EE6B467C96F # 5.50647079944610595703e-01 |
| .quad 0x3FE1E85F5E7040D0 # 5.59615731239318847656e-01 |
| .quad 0x3FE23130D7BEBF43 # 5.68504691123962402344e-01 |
| .quad 0x3FE2795E1289B11B # 5.77315330505371093750e-01 |
| .quad 0x3FE2C0E9ED448E8C # 5.86049020290374755859e-01 |
| .quad 0x3FE307D7334F10BE # 5.94707071781158447266e-01 |
| .quad 0x3FE34E289D9CE1D3 # 6.03290796279907226562e-01 |
| .quad 0x3FE393E0D3562A1A # 6.11801505088806152344e-01 |
| .quad 0x3FE3D9026A7156FB # 6.20240390300750732422e-01 |
| .quad 0x3FE41D8FE84672AE # 6.28608644008636474609e-01 |
| .quad 0x3FE4618BC21C5EC2 # 6.36907458305358886719e-01 |
| .quad 0x3FE4A4F85DB03EBB # 6.45137906074523925781e-01 |
| .quad 0x3FE4E7D811B75BB1 # 6.53301239013671875000e-01 |
| .quad 0x3FE52A2D265BC5AB # 6.61398470401763916016e-01 |
| .quad 0x3FE56BF9D5B3F399 # 6.69430613517761230469e-01 |
| .quad 0x3FE5AD404C359F2D # 6.77398800849914550781e-01 |
| .quad 0x3FE5EE02A9241675 # 6.85303986072540283203e-01 |
| .quad 0x3FE62E42FEFA39EF # 6.93147122859954833984e-01 |
| .quad 0 # for alignment |
| |
| .L__np_ln_lead_table: |
| .long 0x00000000 # 0.000000000000 0 |
| .long 0x3C7E0000 # 0.015502929688 1 |
| .long 0x3CFC1000 # 0.030769348145 2 |
| .long 0x3D3BA000 # 0.045806884766 3 |
| .long 0x3D785000 # 0.060623168945 4 |
| .long 0x3D9A0000 # 0.075195312500 5 |
| .long 0x3DB78000 # 0.089599609375 6 |
| .long 0x3DD49000 # 0.103790283203 7 |
| .long 0x3DF13000 # 0.117767333984 8 |
| .long 0x3E06B000 # 0.131530761719 9 |
| .long 0x3E14A000 # 0.145141601563 10 |
| .long 0x3E226000 # 0.158569335938 11 |
| .long 0x3E2FF000 # 0.171813964844 12 |
| .long 0x3E3D5000 # 0.184875488281 13 |
| .long 0x3E4A9000 # 0.197814941406 14 |
| .long 0x3E579000 # 0.210510253906 15 |
| .long 0x3E647000 # 0.223083496094 16 |
| .long 0x3E713000 # 0.235534667969 17 |
| .long 0x3E7DC000 # 0.247802734375 18 |
| .long 0x3E851000 # 0.259887695313 19 |
| .long 0x3E8B3000 # 0.271850585938 20 |
| .long 0x3E914000 # 0.283691406250 21 |
| .long 0x3E974000 # 0.295410156250 22 |
| .long 0x3E9D3000 # 0.307006835938 23 |
| .long 0x3EA30000 # 0.318359375000 24 |
| .long 0x3EA8D000 # 0.329711914063 25 |
| .long 0x3EAE8000 # 0.340820312500 26 |
| .long 0x3EB43000 # 0.351928710938 27 |
| .long 0x3EB9C000 # 0.362792968750 28 |
| .long 0x3EBF5000 # 0.373657226563 29 |
| .long 0x3EC4D000 # 0.384399414063 30 |
| .long 0x3ECA3000 # 0.394897460938 31 |
| .long 0x3ECF9000 # 0.405395507813 32 |
| .long 0x3ED4E000 # 0.415771484375 33 |
| .long 0x3EDA2000 # 0.426025390625 34 |
| .long 0x3EDF5000 # 0.436157226563 35 |
| .long 0x3EE47000 # 0.446166992188 36 |
| .long 0x3EE99000 # 0.456176757813 37 |
| .long 0x3EEEA000 # 0.466064453125 38 |
| .long 0x3EF3A000 # 0.475830078125 39 |
| .long 0x3EF89000 # 0.485473632813 40 |
| .long 0x3EFD7000 # 0.494995117188 41 |
| .long 0x3F012000 # 0.504394531250 42 |
| .long 0x3F039000 # 0.513916015625 43 |
| .long 0x3F05F000 # 0.523193359375 44 |
| .long 0x3F084000 # 0.532226562500 45 |
| .long 0x3F0AA000 # 0.541503906250 46 |
| .long 0x3F0CF000 # 0.550537109375 47 |
| .long 0x3F0F4000 # 0.559570312500 48 |
| .long 0x3F118000 # 0.568359375000 49 |
| .long 0x3F13C000 # 0.577148437500 50 |
| .long 0x3F160000 # 0.585937500000 51 |
| .long 0x3F183000 # 0.594482421875 52 |
| .long 0x3F1A7000 # 0.603271484375 53 |
| .long 0x3F1C9000 # 0.611572265625 54 |
| .long 0x3F1EC000 # 0.620117187500 55 |
| .long 0x3F20E000 # 0.628417968750 56 |
| .long 0x3F230000 # 0.636718750000 57 |
| .long 0x3F252000 # 0.645019531250 58 |
| .long 0x3F273000 # 0.653076171875 59 |
| .long 0x3F295000 # 0.661376953125 60 |
| .long 0x3F2B5000 # 0.669189453125 61 |
| .long 0x3F2D6000 # 0.677246093750 62 |
| .long 0x3F2F7000 # 0.685302734375 63 |
| .long 0x3F317000 # 0.693115234375 64 |
| .long 0 # for alignment |
| |
| .L__np_ln_tail_table: |
| .long 0x00000000 # 0.000000000000 0 |
| .long 0x35A8B0FC # 0.000001256848 1 |
| .long 0x361B0E78 # 0.000002310522 2 |
| .long 0x3631EC66 # 0.000002651266 3 |
| .long 0x35C30046 # 0.000001452871 4 |
| .long 0x37EBCB0E # 0.000028108738 5 |
| .long 0x37528AE5 # 0.000012549314 6 |
| .long 0x36DA7496 # 0.000006510479 7 |
| .long 0x3783B715 # 0.000015701671 8 |
| .long 0x383F3E68 # 0.000045596069 9 |
| .long 0x38297C10 # 0.000040408282 10 |
| .long 0x3815B666 # 0.000035694240 11 |
| .long 0x38183854 # 0.000036292084 12 |
| .long 0x38448108 # 0.000046850211 13 |
| .long 0x373539E9 # 0.000010801924 14 |
| .long 0x3864A740 # 0.000054515200 15 |
| .long 0x387BE3CD # 0.000060055219 16 |
| .long 0x3803B715 # 0.000031403342 17 |
| .long 0x380C36AF # 0.000033429529 18 |
| .long 0x3892713A # 0.000069829126 19 |
| .long 0x38AE55D6 # 0.000083129547 20 |
| .long 0x38A0FDE8 # 0.000076766883 21 |
| .long 0x3862BAE1 # 0.000054056643 22 |
| .long 0x3798AAD3 # 0.000018199358 23 |
| .long 0x38C5E10E # 0.000094356117 24 |
| .long 0x382D872E # 0.000041372310 25 |
| .long 0x38DEDFAC # 0.000106274470 26 |
| .long 0x38481E9B # 0.000047712219 27 |
| .long 0x38EBFB5E # 0.000112524940 28 |
| .long 0x38783B83 # 0.000059183232 29 |
| .long 0x374E1B05 # 0.000012284848 30 |
| .long 0x38CA0E11 # 0.000096347307 31 |
| .long 0x3891F660 # 0.000069600297 32 |
| .long 0x386C9A9A # 0.000056410769 33 |
| .long 0x38777BCD # 0.000059004688 34 |
| .long 0x38A6CED4 # 0.000079540216 35 |
| .long 0x38FBE3CD # 0.000120110439 36 |
| .long 0x387E7E01 # 0.000060675669 37 |
| .long 0x37D40984 # 0.000025276800 38 |
| .long 0x3784C3AD # 0.000015826745 39 |
| .long 0x380F5FAF # 0.000034182969 40 |
| .long 0x38AC47BC # 0.000082149607 41 |
| .long 0x392952D3 # 0.000161479504 42 |
| .long 0x37F97073 # 0.000029735476 43 |
| .long 0x3865C84A # 0.000054784388 44 |
| .long 0x3979CF17 # 0.000238236375 45 |
| .long 0x38C3D2F5 # 0.000093376184 46 |
| .long 0x38E6B468 # 0.000110008579 47 |
| .long 0x383EBCE1 # 0.000045475437 48 |
| .long 0x39186BDF # 0.000145360347 49 |
| .long 0x392F0945 # 0.000166927537 50 |
| .long 0x38E9ED45 # 0.000111545007 51 |
| .long 0x396B99A8 # 0.000224685878 52 |
| .long 0x37A27674 # 0.000019367064 53 |
| .long 0x397069AB # 0.000229275480 54 |
| .long 0x39013539 # 0.000123222257 55 |
| .long 0x3947F423 # 0.000190690669 56 |
| .long 0x3945E10E # 0.000188712234 57 |
| .long 0x38F85DB0 # 0.000118430122 58 |
| .long 0x396C08DC # 0.000225100142 59 |
| .long 0x37B4996F # 0.000021529120 60 |
| .long 0x397CEADA # 0.000241200818 61 |
| .long 0x3920261B # 0.000152729845 62 |
| .long 0x35AA4906 # 0.000001268724 63 |
| .long 0x3805FDF4 # 0.000031946183 64 |
| .long 0 # for alignment |
| |
| |