#
# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
#
# This file is part of libacml_mv.
#
# libacml_mv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# libacml_mv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with libacml_mv. If not, see
# <http://www.gnu.org/licenses/>.
#
#
#
# vrs4powf.s
#
# A vector implementation of the powf libm function.
#
# Prototype:
#
# __m128 __vrs4_powf(__m128 x,__m128 y);
#
# Computes x raised to the y power. Returns proper C99 values.
# Uses new tuned fastlog/fastexp.
#
#
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif
# define local variable storage offsets
.equ p_temp,0x00 # xmmword
.equ p_negateres,0x10 # qword
.equ p_xexp,0x20 # qword
.equ p_ux,0x030 # storage for X
.equ p_uy,0x040 # storage for Y
.equ p_ax,0x050 # absolute x
.equ p_sx,0x060 # sign of x's
.equ p_ay,0x070 # absolute y
.equ p_yexp,0x080 # unbiased exponent of y
.equ p_inty,0x090 # integer y indicators
.equ save_rbx,0x0A0 #
.equ stack_size,0x0B8 # allocate 40h more than
# we need to avoid bank conflicts
.text
.align 16
.p2align 4,,15
.globl __vrs4_powf
.type __vrs4_powf,@function
__vrs4_powf:
sub $stack_size,%rsp
mov %rbx,save_rbx(%rsp) # save rbx
movaps %xmm0,p_ux(%rsp) # save x
movaps %xmm1,p_uy(%rsp) # save y
movaps %xmm0,%xmm2
andps .L__mask_nsign(%rip),%xmm0 # get abs x
andps .L__mask_sign(%rip),%xmm2 # mask for the sign bits
movaps %xmm0,p_ax(%rsp) # save them
movaps %xmm2,p_sx(%rsp) # save them
# convert all four x's to double
cvtps2pd p_ax(%rsp),%xmm0
cvtps2pd p_ax+8(%rsp),%xmm1
#
# classify y
# vector 32 bit integer method 25 cycles to here
# /* See whether y is an integer.
# inty = 0 means not an integer.
# inty = 1 means odd integer.
# inty = 2 means even integer.
# */
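#
# An illustrative C sketch of this classification (not part of the
# build; ignores NaN/infinity and overflow edge cases, which are
# handled separately below):
#
#   int inty(float y) {
#       if (y != truncf(y)) return 0;    /* not an integer */
#       if ((long long)y & 1) return 1;  /* odd integer    */
#       return 2;                        /* even integer   */
#   }
#
# The vector code below derives the same answer from the IEEE bit
# pattern of y, four lanes at a time, without leaving the SSE domain.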
movdqa p_uy(%rsp),%xmm4
pxor %xmm3,%xmm3
pand .L__mask_nsign(%rip),%xmm4 # get abs y in integer format
movdqa %xmm4,p_ay(%rsp) # save it
# see if the number is less than 1.0
psrld $23,%xmm4 #>> EXPSHIFTBITS_SP32
psubd .L__mask_127(%rip),%xmm4 # yexp, unbiased exponent
movdqa %xmm4,p_yexp(%rsp) # save it
paddd .L__mask_1(%rip),%xmm4 # yexp+1
pcmpgtd %xmm3,%xmm4 # FFs if yexp+1 > 0, i.e. biased exponent >= 127 (|y| >= 1.0), else 0s
# xmm4 is FFs if abs(y) >= 1.0, else 0s
# see if the mantissa has fractional bits
#build mask for mantissa
movdqa .L__mask_23(%rip),%xmm2
psubd p_yexp(%rsp),%xmm2 # 23-yexp = count of fractional mantissa bits
pmaxsw %xmm3,%xmm2 # no shift counts less than 0
movdqa %xmm2,p_temp(%rsp) # save the shift counts
# create mask for all four values
# SSE2 has no per-element variable shifts, so each of the four shift counts has to be done separately with scalar shifts
mov p_temp(%rsp),%rcx
mov $1,%rbx
shl %cl,%ebx #1 << (23 - yexp)
shr $32,%rcx
mov $1,%eax
shl %cl,%eax #1 << (23 - yexp)
shl $32,%rax
add %rax,%rbx
mov %rbx,p_temp(%rsp)
mov p_temp+8(%rsp),%rcx
mov $1,%rbx
shl %cl,%ebx #1 << (23 - yexp)
shr $32,%rcx
mov $1,%eax
shl %cl,%eax #1 << (23 - yexp)
shl $32,%rax
add %rbx,%rax
mov %rax,p_temp+8(%rsp)
movdqa p_temp(%rsp),%xmm5
psubd .L__mask_1(%rip),%xmm5 #= mask = (1 << (24 - yexp)) - 1
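#
# Scalar equivalent of the mask construction above (illustrative):
#
#   for (i = 0; i < 4; i++) {
#       int shift = 23 - yexp[i];     /* clamped at 0 by the pmaxsw */
#       mask[i] = (1u << shift) - 1;  /* covers the fractional mantissa bits */
#   }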
# now use the mask to see if there are any fractional bits
movdqa p_uy(%rsp),%xmm2 # get uy
pand %xmm5,%xmm2 # uy & mask
pcmpeqd %xmm3,%xmm2 # FFs if (uy & mask) == 0 (no fractional bits), else 0s
pand %xmm4,%xmm2 # either 0s or FFs
# xmm2 now combines the |y| >= 1.0 test with the fractional-bits test:
# it is 0 if y is known to be non-integer, FFs if y is an integer.
# now see if it's even or odd.
## if yexp >= 24, |y| >= 2^24 and every representable value is an even integer
movdqa .L__mask_24(%rip),%xmm4
psubd p_yexp(%rsp),%xmm4 # 24-yexp
paddd .L__mask_1(%rip),%xmm5 # mask+1 = least significant integer bit
pcmpgtd %xmm3,%xmm4 ## if 0, then must be even, else ff's
pand %xmm4,%xmm5 # set the integer bit mask to zero if yexp>24
paddd .L__mask_2(%rip),%xmm4
por .L__mask_2(%rip),%xmm4
pand %xmm2,%xmm4 # result can be 0, 2, or 3
# now for integer numbers, see if odd or even
pand .L__mask_mant(%rip),%xmm5 # mask out exponent bits
movdqa .L__float_one(%rip),%xmm2
pand p_uy(%rsp),%xmm5 # & uy -> even or odd
pcmpeqd p_ay(%rsp),%xmm2 # is ay equal to 1, ff's if so, then it's odd
pand .L__mask_nsign(%rip),%xmm2 # strip the sign bit so the gt comparison works.
por %xmm2,%xmm5
pcmpgtd %xmm3,%xmm5 ## if odd then ff's, else 0's for even
paddd .L__mask_2(%rip),%xmm5 # gives us 2 for even, 1 for odd
pand %xmm5,%xmm4
movdqa %xmm4,p_inty(%rsp) # save inty
#
# do more x special case checking
#
movdqa %xmm4,%xmm5
pcmpeqd %xmm3,%xmm5 # is not an integer? ff's if so
pand .L__mask_NaN(%rip),%xmm5 # these values will be NaNs, if x<0
movdqa %xmm4,%xmm2
pcmpeqd .L__mask_1(%rip),%xmm2 # is it odd? ff's if so
pand .L__mask_sign(%rip),%xmm2 # these values will get their sign bit set
por %xmm2,%xmm5
pcmpeqd p_sx(%rsp),%xmm3 # FFs where the sign bit of x is clear (x >= 0)
pandn %xmm5,%xmm3 # keep the flag values only where x < 0
movdqa %xmm3,p_negateres(%rsp) # save negateres
# /* p_negateres now means the following.
# ** 7FC00000 means x<0, y not an integer, return NaN.
# ** 80000000 means x<0, y is odd integer, so set the sign bit.
# ** 0 means even integer, and/or x>=0.
# */
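#
# In C terms, the flags are applied to the result after the exp call,
# roughly (illustrative only; the orps works as a negation because
# the exp result is always positive):
#
#   if (x < 0 && inty == 0) res = nanf("");  /* 0x7FC00000 */
#   if (x < 0 && inty == 1) res = -res;      /* 0x80000000 */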
# **** Here starts the main calculations ****
# The algorithm used is x**y = exp(y*log(x))
# Extra precision is required in intermediate steps to meet the 1ulp requirement
#
# log(x) calculation
call __vrd4_log@PLT # get the double precision log value
# for all four x's
# y* logx
# convert all four y's to double
lea p_uy(%rsp),%rdx # get pointer to y
cvtps2pd (%rdx),%xmm2
cvtps2pd 8(%rdx),%xmm3
# /* just multiply by y */
mulpd %xmm2,%xmm0
mulpd %xmm3,%xmm1
# /* The following code computes r = exp(w) */
call __vrd4_exp@PLT # get the double exp value
# for all four y*log(x)'s
#
# convert all four results back to single precision
cvtpd2ps %xmm0,%xmm0
cvtpd2ps %xmm1,%xmm1
movlhps %xmm1,%xmm0
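# In effect, the main path just computed, per lane (illustrative C;
# the double precision intermediates give the extra accuracy needed):
#
#   double lx = log((double)fabsf(x[i]));    /* __vrd4_log */
#   res[i] = (float)exp((double)y[i] * lx);  /* __vrd4_exp */
#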
# perform special case and error checking on input values
# special case checking is done first in the scalar version since
# it allows for early fast returns. For vectors we treat special
# cases as rare, so early returns are not worthwhile; instead we
# compute x**y for all lanes first and then patch up the special
# cases, doing some of the checks in the reverse order of the
# scalar version.
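# The fixups below run in this order: |y| large or infinite, x
# infinite, x zero, y NaN, x NaN, |y| == 0, and finally x == +1.0.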
lea p_uy(%rsp),%rdx # get pointer to y
# apply the negate result flags
orps p_negateres(%rsp),%xmm0 # get negateres
## if y is infinite or so large that the result would overflow or underflow
movdqa p_ay(%rsp),%xmm4
cmpps $5,.L__mask_ly(%rip),%xmm4 # y not less than large value, ffs if so.
movmskps %xmm4,%edx
test $0x0f,%edx
jnz .Ly_large
.Lrnsx3:
## if x is infinite
movdqa p_ax(%rsp),%xmm4
cmpps $0,.L__mask_inf(%rip),%xmm4 # equal to infinity, ffs if so.
movmskps %xmm4,%edx
test $0x0f,%edx
jnz .Lx_infinite
.Lrnsx1:
## if x is zero
xorps %xmm4,%xmm4
cmpps $0,p_ax(%rsp),%xmm4 # equal to zero, ffs if so.
movmskps %xmm4,%edx
test $0x0f,%edx
jnz .Lx_zero
.Lrnsx2:
## if y is NAN
lea p_uy(%rsp),%rdx # get pointer to y
movdqa (%rdx),%xmm4 # get y
cmpps $4,%xmm4,%xmm4 # a compare not equal of y to itself should
# be false, unless y is a NaN. ff's if NaN.
movmskps %xmm4,%ecx
test $0x0f,%ecx
jnz .Ly_NaN
.Lrnsx4:
## if x is NAN
lea p_ux(%rsp),%rdx # get pointer to x
movdqa (%rdx),%xmm4 # get x
cmpps $4,%xmm4,%xmm4 # a compare not equal of x to itself should
# be false, unless x is a NaN. ff's if NaN.
movmskps %xmm4,%ecx
test $0x0f,%ecx
jnz .Lx_NaN
.Lrnsx5:
## if |y| == 0 then return 1
movdqa .L__float_one(%rip),%xmm3 # one
xorps %xmm2,%xmm2
cmpps $4,p_ay(%rsp),%xmm2 # not equal to 0.0? FFs if not equal.
andps %xmm2,%xmm0 # keep the others
andnps %xmm3,%xmm2 # mask for ones
orps %xmm2,%xmm0
## if x == +1.0, return +1.0 for any y
lea p_ux(%rsp),%rdx # get pointer to x
movdqa %xmm3,%xmm2
cmpps $4,(%rdx),%xmm2 # not equal to +1.0? FFs if not equal.
andps %xmm2,%xmm0 # keep the others
andnps %xmm3,%xmm2 # mask for ones
orps %xmm2,%xmm0
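# Both fixups above use the usual SSE2 select idiom, since there is
# no blend instruction at this ISA level (illustrative):
#
#   result = (mask & result) | (~mask & ones)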
.L__powf_cleanup2:
mov save_rbx(%rsp),%rbx # restore rbx
add $stack_size,%rsp
ret
.align 16
# y is a NaN.
.Ly_NaN:
lea p_uy(%rsp),%rdx # get pointer to y
movdqa (%rdx),%xmm4 # get y
movdqa %xmm4,%xmm3
movdqa %xmm4,%xmm5
movdqa .L__mask_sigbit(%rip),%xmm2 # get the signalling bits
cmpps $0,%xmm4,%xmm4 # a compare equal of y to itself should
# be true, unless y is a NaN. 0's if NaN.
cmpps $4,%xmm3,%xmm3 # compare not equal, ff's if NaN.
andps %xmm4,%xmm0 # keep the other results
andps %xmm3,%xmm2 # get just the right signalling bits
andps %xmm5,%xmm3 # mask for the NaNs
orps %xmm2,%xmm3 # convert to QNaNs
orps %xmm3,%xmm0 # combine
jmp .Lrnsx4
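#
# The quieting above works by OR-ing the quiet-NaN bit (0x00400000)
# into the lanes that failed the self-compare, so a signalling NaN
# input is returned as the corresponding quiet NaN; roughly:
#
#   if (isnan(y[i])) res[i] = bits(y[i]) | 0x00400000;
#
# The same pattern handles a NaN x below.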
# x is a NaN.
.Lx_NaN:
lea p_ux(%rsp),%rcx # get pointer to x
movdqa (%rcx),%xmm4 # get x
movdqa %xmm4,%xmm3
movdqa %xmm4,%xmm5
movdqa .L__mask_sigbit(%rip),%xmm2 # get the signalling bits
cmpps $0,%xmm4,%xmm4 # a compare equal of x to itself should
# be true, unless x is a NaN. 0's if NaN.
cmpps $4,%xmm3,%xmm3 # compare not equal, ff's if NaN.
andps %xmm4,%xmm0 # keep the other results
andps %xmm3,%xmm2 # get just the right signalling bits
andps %xmm5,%xmm3 # mask for the NaNs
orps %xmm2,%xmm3 # convert to QNaNs
orps %xmm3,%xmm0 # combine
jmp .Lrnsx5
# y is infinite or so large that the result would
# overflow or underflow.
.Ly_large:
movdqa %xmm0,p_temp(%rsp)
test $1,%edx
jz .Lylrga
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov (%rcx),%eax
mov (%rbx),%ebx
mov p_inty(%rsp),%ecx
sub $8,%rsp
call .Lnp_special6 # call the handler for one value
add $8,%rsp
mov %eax,p_temp(%rsp)
.Lylrga:
test $2,%edx
jz .Lylrgb
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov 4(%rcx),%eax
mov 4(%rbx),%ebx
mov p_inty+4(%rsp),%ecx
sub $8,%rsp
call .Lnp_special6 # call the handler for one value
add $8,%rsp
mov %eax,p_temp+4(%rsp)
.Lylrgb:
test $4,%edx
jz .Lylrgc
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov 8(%rcx),%eax
mov 8(%rbx),%ebx
mov p_inty+8(%rsp),%ecx
sub $8,%rsp
call .Lnp_special6 # call the handler for one value
add $8,%rsp
mov %eax,p_temp+8(%rsp)
.Lylrgc:
test $8,%edx
jz .Lylrgd
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov 12(%rcx),%eax
mov 12(%rbx),%ebx
mov p_inty+12(%rsp),%ecx
sub $8,%rsp
call .Lnp_special6 # call the handler for one value
add $8,%rsp
mov %eax,p_temp+12(%rsp)
.Lylrgd:
movdqa p_temp(%rsp),%xmm0
jmp .Lrnsx3
# a subroutine to handle an individual x,y pair when y is large or infinite
# assumes x in %eax, y in %ebx
# returns result in %eax
.Lnp_special6:
# handle |x|==1 cases first
mov $0x07FFFFFFF,%r8d
and %eax,%r8d
cmp $0x03f800000,%r8d # jump if |x| !=1
jnz .Lnps6
mov $0x03f800000,%eax # return 1 for all |x|==1
jmp .Lnpx64
# cases where |x| !=1
.Lnps6:
mov $0x07f800000,%ecx
xor %eax,%eax # assume 0 return
test $0x080000000,%ebx
jnz .Lnps62 # jump if y negative
# y = +inf
cmp $0x03f800000,%r8d
cmovg %ecx,%eax # return inf if |x| > 1
jmp .Lnpx64
.Lnps62:
# y = -inf
cmp $0x03f800000,%r8d
cmovl %ecx,%eax # return inf if |x| < 1
jmp .Lnpx64
.Lnpx64:
ret
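#
# Summary of the C99 cases this handler implements (illustrative):
#
#   pow(x, +inf)   = +inf if |x| > 1, +0 if |x| < 1
#   pow(x, -inf)   = +0 if |x| > 1, +inf if |x| < 1
#   pow(x, +/-inf) = 1 if |x| == 1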
# handle cases where x is +/- infinity. edx is the mask
.align 16
.Lx_infinite:
movdqa %xmm0,p_temp(%rsp)
test $1,%edx
jz .Lxinfa
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov (%rcx),%eax
mov (%rbx),%ebx
mov p_inty(%rsp),%ecx
sub $8,%rsp
call .Lnp_special_x1 # call the handler for one value
add $8,%rsp
mov %eax,p_temp(%rsp)
.Lxinfa:
test $2,%edx
jz .Lxinfb
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov 4(%rcx),%eax
mov 4(%rbx),%ebx
mov p_inty+4(%rsp),%ecx
sub $8,%rsp
call .Lnp_special_x1 # call the handler for one value
add $8,%rsp
mov %eax,p_temp+4(%rsp)
.Lxinfb:
test $4,%edx
jz .Lxinfc
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov 8(%rcx),%eax
mov 8(%rbx),%ebx
mov p_inty+8(%rsp),%ecx
sub $8,%rsp
call .Lnp_special_x1 # call the handler for one value
add $8,%rsp
mov %eax,p_temp+8(%rsp)
.Lxinfc:
test $8,%edx
jz .Lxinfd
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov 12(%rcx),%eax
mov 12(%rbx),%ebx
mov p_inty+12(%rsp),%ecx
sub $8,%rsp
call .Lnp_special_x1 # call the handler for one value
add $8,%rsp
mov %eax,p_temp+12(%rsp)
.Lxinfd:
movdqa p_temp(%rsp),%xmm0
jmp .Lrnsx1
# a subroutine to handle an individual x,y pair when x is +/- infinity
# assumes x in %eax, y in %ebx, inty in %ecx
# returns result in %eax
.Lnp_special_x1: # x is infinite
test $0x080000000,%eax # is x positive
jnz .Lnsx11 # jump if not
test $0x080000000,%ebx # is y positive
jz .Lnsx13 # just return if so
xor %eax,%eax # else return 0
jmp .Lnsx13
.Lnsx11:
cmp $1,%ecx ## if inty ==1
jnz .Lnsx12 # jump if not
test $0x080000000,%ebx # is y positive
jz .Lnsx13 # just return if so
mov $0x080000000,%eax # else return -0
jmp .Lnsx13
.Lnsx12: # inty != 1: y is an even integer or not an integer
and $0x07FFFFFFF,%eax # |x| = +inf, the result when y > 0
test $0x080000000,%ebx # is y positive?
jz .Lnsx13 # return +inf if so
xor %eax,%eax # return +0 if y < 0
.Lnsx13:
ret
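#
# Summary of the C99 cases this handler implements (illustrative):
#
#   pow(+inf, y) = +inf for y > 0, +0 for y < 0
#   pow(-inf, y) = -inf for odd integer y > 0, -0 for odd integer y < 0
#   pow(-inf, y) = +inf for other y > 0, +0 for other y < 0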
# handle cases where x is +/- zero. edx is the mask of x,y pairs with |x|=0
.align 16
.Lx_zero:
movdqa %xmm0,p_temp(%rsp)
test $1,%edx
jz .Lxzera
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov (%rcx),%eax
mov (%rbx),%ebx
mov p_inty(%rsp),%ecx
sub $8,%rsp
call .Lnp_special_x2 # call the handler for one value
add $8,%rsp
mov %eax,p_temp(%rsp)
.Lxzera:
test $2,%edx
jz .Lxzerb
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov 4(%rcx),%eax
mov 4(%rbx),%ebx
mov p_inty+4(%rsp),%ecx
sub $8,%rsp
call .Lnp_special_x2 # call the handler for one value
add $8,%rsp
mov %eax,p_temp+4(%rsp)
.Lxzerb:
test $4,%edx
jz .Lxzerc
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov 8(%rcx),%eax
mov 8(%rbx),%ebx
mov p_inty+8(%rsp),%ecx
sub $8,%rsp
call .Lnp_special_x2 # call the handler for one value
add $8,%rsp
mov %eax,p_temp+8(%rsp)
.Lxzerc:
test $8,%edx
jz .Lxzerd
lea p_ux(%rsp),%rcx # get pointer to x
lea p_uy(%rsp),%rbx # get pointer to y
mov 12(%rcx),%eax
mov 12(%rbx),%ebx
mov p_inty+12(%rsp),%ecx
sub $8,%rsp
call .Lnp_special_x2 # call the handler for one value
add $8,%rsp
mov %eax,p_temp+12(%rsp)
.Lxzerd:
movdqa p_temp(%rsp),%xmm0
jmp .Lrnsx2
# a subroutine to handle an individual x,y pair when x is +/- 0
# assumes x in %eax, y in %ebx, inty in %ecx
# returns result in %eax
.align 16
.Lnp_special_x2:
cmp $1,%ecx ## if inty ==1
jz .Lnsx21 # jump if so
# handle cases of x = +/-0, y not an odd integer
xor %eax,%eax
mov $0x07f800000,%ecx
test $0x080000000,%ebx # is ypos
cmovnz %ecx,%eax
jmp .Lnsx23
# y is an integer
.Lnsx21:
xor %r8d,%r8d
mov $0x07f800000,%ecx
test $0x080000000,%ebx # is ypos
cmovnz %ecx,%r8d # set to infinity if not
and $0x080000000,%eax # pickup the sign of x
or %r8d,%eax # and include it in the result
.Lnsx23:
ret
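#
# Summary of the C99 cases this handler implements (illustrative):
#
#   pow(+/-0, y) = +0 for y > 0, y not an odd integer
#   pow(+/-0, y) = +inf for y < 0, y not an odd integer
#   pow(+/-0, y) = +/-0 (sign of x) for odd integer y > 0
#   pow(+/-0, y) = +/-inf (sign of x) for odd integer y < 0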
.data
.align 64
.L__mask_sign: .quad 0x08000000080000000 # a sign bit mask
.quad 0x08000000080000000
.L__mask_nsign: .quad 0x07FFFFFFF7FFFFFFF # a not sign bit mask
.quad 0x07FFFFFFF7FFFFFFF
# used by inty
.L__mask_127: .quad 0x00000007F0000007F # EXPBIAS_SP32
.quad 0x00000007F0000007F
.L__mask_mant: .quad 0x0007FFFFF007FFFFF # mantissa bit mask
.quad 0x0007FFFFF007FFFFF
.L__mask_1: .quad 0x00000000100000001 # 1
.quad 0x00000000100000001
.L__mask_2: .quad 0x00000000200000002 # 2
.quad 0x00000000200000002
.L__mask_24: .quad 0x00000001800000018 # 24
.quad 0x00000001800000018
.L__mask_23: .quad 0x00000001700000017 # 23
.quad 0x00000001700000017
# used by special case checking
.L__float_one: .quad 0x03f8000003f800000 # one
.quad 0x03f8000003f800000
.L__mask_inf: .quad 0x07f8000007F800000 # infinity
.quad 0x07f8000007F800000
.L__mask_NaN: .quad 0x07fC000007FC00000 # NaN
.quad 0x07fC000007FC00000
.L__mask_sigbit: .quad 0x00040000000400000 # QNaN bit
.quad 0x00040000000400000
.L__mask_ly: .quad 0x04f0000004f000000 # large y
.quad 0x04f0000004f000000