src/gas/vrs4powxf.S - open64_libacml_mv - Git at Google


 #
 #  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
 #
 #  This file is part of libacml_mv.
 #
 #  libacml_mv is free software; you can redistribute it and/or
 #  modify it under the terms of the GNU Lesser General Public
 #  License as published by the Free Software Foundation; either
 #  version 2.1 of the License, or (at your option) any later version.
 #
 #  libacml_mv is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 #  Lesser General Public License for more details.
 #
 #  You should have received a copy of the GNU Lesser General Public
 #  License along with libacml_mv.  If not, see
 #  <http://www.gnu.org/licenses/>.
 #
 #


 #
 # vrs4powxf.asm
 #
 # A vector implementation of the powf libm function.
 # This routine raises the x vector to a constant y power.
 #
 # Prototype:
 #
 #     __m128 __vrs4_powxf(__m128 x,float y);
 #
 #   Computes x raised to the y power.  Returns proper C99 values.
 #
 #
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif


 # define local variable storage offsets
 .equ	p_temp,0x00		# xmmword
 .equ	p_negateres,0x10	# qword

 .equ	save_rbx,0x020		#qword
 .equ	save_rsi,0x028		#qword

 .equ	p_xptr,0x030		# ptr to x values
 .equ	p_y,0x038		# y value

 .equ	p_inty,0x040		# integer y indicators

 .equ	p_ux,0x050		# absolute x
 .equ	p_ax,0x060		# absolute x
 .equ	p_sx,0x070		# sign of x's

 .equ	stack_size,0x088	#


     .text
     .align 16
     .p2align 4,,15
 .globl __vrs4_powxf
     .type   __vrs4_powxf,@function
 __vrs4_powxf:

 	sub		$stack_size,%rsp
 	mov		%rbx,save_rbx(%rsp)	# save rbx

 	lea		p_ux(%rsp),%rcx
 	mov		  %rcx,p_xptr(%rsp)		# save pointer to x
 	movaps		%xmm0,(%rcx)
 	movss	  %xmm1,p_y(%rsp)		# save y

 	movdqa	%xmm1,%xmm4

 	movaps	%xmm0,%xmm2
 	andps	.L__mask_nsign(%rip),%xmm0		# get abs x
 	andps	.L__mask_sign(%rip),%xmm2			# mask for the sign bits
 	movaps	  %xmm0,p_ax(%rsp)		# save them
 	movaps	  %xmm2,p_sx(%rsp)		# save them
 # convert all four x's to double
 	cvtps2pd   p_ax(%rsp),%xmm0
 	cvtps2pd   p_ax+8(%rsp),%xmm1
 #
 # classify y
 # vector 32 bit integer method	 25 cycles to here
 #  /* See whether y is an integer.
 #     inty = 0 means not an integer.
 #  */
 # get yexp
 	mov		p_y(%rsp),%r8d						# r8 is uy
 	mov		$0x07fffffff,%r9d
 	and		%r8d,%r9d						# r9 is ay

 ## if |y| == 0	then return 1
 	cmp		$0,%r9d							# is y a zero?
 	jz		.Ly_zero

 	mov		$0x07f800000,%eax				# EXPBITS_SP32
 	and		%r9d,%eax						# y exp

 	xor		%edi,%edi
 	shr		$23,%eax			#>> EXPSHIFTBITS_SP32
 	sub		$126,%eax		# - EXPBIAS_SP32 + 1   - eax is now the unbiased exponent
 	mov		$1,%ebx
 	cmp		%ebx,%eax			## if (yexp < 1)
 	cmovl	%edi,%ebx
 	jl		.Lsave_inty

 	mov		$24,%ecx
 	cmp		%ecx,%eax			## if (yexp >24)
 	jle		.Linfy1
 	mov		$2,%ebx
 	jmp		.Lsave_inty
 .Linfy1:						# else 1<=yexp<=24
 	sub		%eax,%ecx			# build mask for mantissa
 	shl		%cl,%ebx
 	dec		%ebx				# rbx = mask = (1 << (24 - yexp)) - 1

 	mov		%r8d,%eax
 	and		%ebx,%eax			## if ((uy & mask) != 0)
 	cmovnz	%edi,%ebx			#   inty = 0;
 	jnz		.Lsave_inty

 	not		%ebx				# else if (((uy & ~mask) >> (24 - yexp)) & 0x00000001)
 	mov		%r8d,%eax
 	and		%ebx,%eax
 	shr		%cl,%eax
 	inc		%edi
 	and		%edi,%eax
 	mov		%edi,%ebx			#  inty = 1
 	jnz		.Lsave_inty
 	inc		%ebx				# else	inty = 2


 .Lsave_inty:
 	mov		  %r8d,p_y+4(%rsp)						# r8d is ay
 	mov		  %ebx,p_inty(%rsp)		# save inty
 #
 # do more x special case checking
 #
 	pxor	%xmm3,%xmm3
 	xor		%eax,%eax
 	mov		$0x07FC00000,%ecx
 	cmp		$0,%ebx							# is y not an integer?
 	cmovz	%ecx,%eax							# then set to return a NaN.  else 0.
 	mov		$0x080000000,%ecx
 	cmp		$1,%ebx							# is y an odd integer?
 	cmovz	%ecx,%eax							# maybe set sign bit if so
 	movd	%eax,%xmm5
 	pshufd	$0,%xmm5,%xmm5

 	pcmpeqd	p_sx(%rsp),%xmm3		## if the signs are set
 	pandn	%xmm5,%xmm3						# then negateres gets the values as shown below
 	movdqa	  %xmm3,p_negateres(%rsp)	# save negateres

 #  /* p_negateres now means the following.
 #     7FC00000 means x<0, y not an integer, return NaN.
 #     80000000 means x<0, y is odd integer, so set the sign bit.
 ##     0 means even integer, and/or x>=0.
 #  */

 # **** Here starts the main calculations  ****
 # The algorithm used is x**y = exp(y*log(x))
 #  Extra precision is required in intermediate steps to meet the 1ulp requirement
 #
 # log(x) calculation
 	call		__vrd4_log@PLT					# get the double precision log value
 											# for all four x's
 # y* logx
 	cvtps2pd   p_y(%rsp),%xmm2		#convert the two packed single y's to double

 #  /* just multiply by y */
 	mulpd	%xmm2,%xmm0
 	mulpd	%xmm2,%xmm1

 #  /* The following code computes r = exp(w) */
 	call		__vrd4_exp@PLT					# get the double exp value
 											# for all four y*log(x)'s
 #
 # convert all four results to double
 	cvtpd2ps	%xmm0,%xmm0
 	cvtpd2ps	%xmm1,%xmm1
 	movlhps		%xmm1,%xmm0

 # perform special case and error checking on input values

 # special case checking is done first in the scalar version since
 # it allows for early fast returns.  But for vectors, we consider them
 # to be rare, so early returns are not necessary.  So we first compute
 # the x**y values, and then check for special cases.

 # we do some of the checking in reverse order of the scalar version.
 # apply the negate result flags
 	orps	p_negateres(%rsp),%xmm0	# get negateres

 ## if y is infinite or so large that the result would overflow or underflow
 	mov		p_y(%rsp),%edx			# get y
 	and 	$0x07fffffff,%edx					# develop ay
 	cmp		$0x04f000000,%edx
 	ja		.Ly_large
 .Lrnsx3:

 ## if x is infinite
 	movdqa	p_ax(%rsp),%xmm4
 	cmpps	$0,.L__mask_inf(%rip),%xmm4	# equal to infinity, ffs if so.
 	movmskps %xmm4,%edx
 	test	$0x0f,%edx
 	jnz		.Lx_infinite
 .Lrnsx1:
 ## if x is zero
 	xorps	%xmm4,%xmm4
 	cmpps	$0,p_ax(%rsp),%xmm4	# equal to zero, ffs if so.
 	movmskps %xmm4,%edx
 	test	$0x0f,%edx
 	jnz		.Lx_zero
 .Lrnsx2:
 ## if y is NAN
 	movss	p_y(%rsp),%xmm4			# get y
 	ucomiss	%xmm4,%xmm4						# comparing y to itself should
 											# be true, unless y is a NaN. parity flag if NaN.
 	jp		.Ly_NaN
 .Lrnsx4:
 ## if x is NAN
 	movdqa	p_ax(%rsp),%xmm4			# get x
 	cmpps	$4,%xmm4,%xmm4						# a compare not equal  of x to itself should
 											# be false, unless x is a NaN. ff's if NaN.
 	movmskps %xmm4,%ecx
 	test	$0x0f,%ecx
 	jnz		.Lx_NaN
 .Lrnsx5:

 ## if x == +1, return +1 for all x
 	movdqa	.L__float_one(%rip),%xmm3	# one
 	mov		p_xptr(%rsp),%rdx		# get pointer to x
 	movdqa	%xmm3,%xmm2
 	cmpps	$4,(%rdx),%xmm2		# not equal to +1.0?, ffs if not equal.
 	andps	%xmm2,%xmm0						# keep the others
 	andnps	%xmm3,%xmm2						# mask for ones
 	orps	%xmm2,%xmm0

 .L__powf_cleanup2:

 	mov		save_rbx(%rsp),%rbx		# restore rbx
 	add		$stack_size,%rsp
 	ret

 	.align 16
 .Ly_zero:
 ## if |y| == 0	then return 1
 	movdqa	.L__float_one(%rip),%xmm0	# one
 	jmp		.L__powf_cleanup2
 #      * y is a NaN.
 .Ly_NaN:
 	mov		p_y(%rsp),%r8d
 	or		$0x000400000,%r8d				# convert to QNaNs
 	movd	%r8d,%xmm0					# propagate to all results
 	shufps	$0,%xmm0,%xmm0
 	jmp	   	.Lrnsx4

 #       y is a NaN.
 .Lx_NaN:
 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	movdqa	(%rcx),%xmm4			# get x
 	movdqa	%xmm4,%xmm3
 	movdqa	%xmm4,%xmm5
 	movdqa	.L__mask_sigbit(%rip),%xmm2	# get the signalling bits
 	cmpps	$0,%xmm4,%xmm4						# a compare equal  of x to itself should
 											# be true, unless x is a NaN. 0's if NaN.
 	cmpps	$4,%xmm3,%xmm3						# compare not equal, ff's if NaN.
 	andps	%xmm4,%xmm0						# keep the other results
 	andps	%xmm3,%xmm2						# get just the right signalling bits
 	andps	%xmm5,%xmm3						# mask for the NaNs
 	orps	%xmm2,%xmm3	# convert to QNaNs
 	orps	%xmm3,%xmm0						# combine
 	jmp	   	.Lrnsx5

 #      * y is infinite or so large that the result would
 #         overflow or underflow.
 .Ly_large:
 	movdqa	  %xmm0,p_temp(%rsp)

 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		(%rcx),%eax
 	mov		p_y(%rsp),%ebx
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special6					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp(%rsp)

 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		4(%rcx),%eax
 	mov		p_y(%rsp),%ebx
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special6					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp+4(%rsp)

 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		8(%rcx),%eax
 	mov		p_y(%rsp),%ebx
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special6					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp+8(%rsp)

 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		12(%rcx),%eax
 	mov		p_y(%rsp),%ebx
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special6					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp+12(%rsp)

 	movdqa	p_temp(%rsp),%xmm0
 	jmp 	.Lrnsx3

 # a subroutine to treat an individual x,y pair when y is large or infinity
 # assumes x in .Ly(%rip),%eax in ebx.
 # returns result in eax
 .Lnp_special6:
 # handle |x|==1 cases first
 	mov		$0x07FFFFFFF,%r8d
 	and		%eax,%r8d
 	cmp		$0x03f800000,%r8d					  # jump if |x| !=1
 	jnz		.Lnps6
 	mov		$0x03f800000,%eax					  # return 1 for all |x|==1
 	jmp 	.Lnpx64

 # cases where  |x| !=1
 .Lnps6:
 	mov		$0x07f800000,%ecx
 	xor		%eax,%eax							  # assume 0 return
 	test	$0x080000000,%ebx
 	jnz		.Lnps62							  # jump if y negative
 # y = +inf
 	cmp		$0x03f800000,%r8d
 	cmovg	%ecx,%eax							  # return inf if |x| < 1
 	jmp 	.Lnpx64
 .Lnps62:
 # y = -inf
 	cmp		$0x03f800000,%r8d
 	cmovl	%ecx,%eax							  # return inf if |x| < 1
 	jmp 	.Lnpx64

 .Lnpx64:
 	ret

 # handle cases where x is +/- infinity.  edx is the mask
 	.align 16
 .Lx_infinite:
 	movdqa	  %xmm0,p_temp(%rsp)

 	test	$1,%edx
 	jz		.Lxinfa
 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		(%rcx),%eax
 	mov		p_y(%rsp),%ebx
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special_x1					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp(%rsp)
 .Lxinfa:
 	test	$2,%edx
 	jz		.Lxinfb
 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		p_y(%rsp),%ebx
 	mov		4(%rcx),%eax
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special_x1					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp+4(%rsp)
 .Lxinfb:
 	test	$4,%edx
 	jz		.Lxinfc
 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		p_y(%rsp),%ebx
 	mov		8(%rcx),%eax
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special_x1					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp+8(%rsp)
 .Lxinfc:
 	test	$8,%edx
 	jz		.Lxinfd
 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		p_y(%rsp),%ebx
 	mov		12(%rcx),%eax
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special_x1					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp+12(%rsp)
 .Lxinfd:
 	movdqa	p_temp(%rsp),%xmm0
 	jmp 	.Lrnsx1

 # a subroutine to treat an individual x,y pair when x is +/-infinity
 # assumes x in .Ly(%rip),%eax in ebx, inty in ecx.
 # returns result in eax
 .Lnp_special_x1:											# x is infinite
 	test	$0x080000000,%eax								# is x positive
 	jnz		.Lnsx11										# jump if not
 	test	$0x080000000,%ebx								# is y positive
 	jz		.Lnsx13											# just return if so
 	xor		%eax,%eax										# else return 0
 	jmp 	.Lnsx13

 .Lnsx11:
 	cmp		$1,%ecx										## if inty ==1
 	jnz		.Lnsx12										# jump if not
 	test	$0x080000000,%ebx								# is y positive
 	jz		.Lnsx13											# just return if so
 	mov		$0x080000000,%eax								# else return -0
 	jmp 	.Lnsx13
 .Lnsx12:													# inty <>1
 	and		$0x07FFFFFFF,%eax								# return -x (|x|)  if y<0
 	test	$0x080000000,%ebx								# is y positive
 	jz		.Lnsx13											#
 	xor		%eax,%eax										# return 0  if y >=0
 .Lnsx13:
 	ret


 # handle cases where x is +/- zero.  edx is the mask of x,y pairs with |x|=0
 	.align 16
 .Lx_zero:
 	movdqa	  %xmm0,p_temp(%rsp)

 	test	$1,%edx
 	jz		.Lxzera
 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		p_y(%rsp),%ebx
 	mov		(%rcx),%eax
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special_x2					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp(%rsp)
 .Lxzera:
 	test	$2,%edx
 	jz		.Lxzerb
 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		p_y(%rsp),%ebx
 	mov		4(%rcx),%eax
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special_x2					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp+4(%rsp)
 .Lxzerb:
 	test	$4,%edx
 	jz		.Lxzerc
 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		p_y(%rsp),%ebx
 	mov		8(%rcx),%eax
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special_x2					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp+8(%rsp)
 .Lxzerc:
 	test	$8,%edx
 	jz		.Lxzerd
 	mov		p_xptr(%rsp),%rcx		# get pointer to x
 	mov		p_y(%rsp),%ebx
 	mov		12(%rcx),%eax
 	mov		p_inty(%rsp),%ecx
 	sub		$8,%rsp
 	call	.Lnp_special_x2					# call the handler for one value
 	add		$8,%rsp
 	mov		  %eax,p_temp+12(%rsp)
 .Lxzerd:
 	movdqa	p_temp(%rsp),%xmm0
 	jmp 	.Lrnsx2

 # a subroutine to treat an individual x,y pair when x is +/-0
 # assumes x in .Ly(%rip),%eax in ebx, inty in ecx.
 # returns result in eax
 	.align 16
 .Lnp_special_x2:
 	cmp		$1,%ecx										## if inty ==1
 	jz		.Lnsx21										# jump if so
 # handle cases of x=+/-0, y not integer
 	xor		%eax,%eax
 	mov		$0x07f800000,%ecx
 	test	$0x080000000,%ebx								# is ypos
 	cmovnz	%ecx,%eax
 	jmp		.Lnsx23
 # y is an integer
 .Lnsx21:
 	xor		%r8d,%r8d
 	mov		$0x07f800000,%ecx
 	test	$0x080000000,%ebx								# is ypos
 	cmovnz	%ecx,%r8d										# set to infinity if not
 	and		$0x080000000,%eax								# pickup the sign of x
 	or		%r8d,%eax										# and include it in the result
 .Lnsx23:
 	ret


         .data
         .align 64

 .L__mask_sign:			.quad 0x08000000080000000	# a sign bit mask
 				.quad 0x08000000080000000

 .L__mask_nsign:			.quad 0x07FFFFFFF7FFFFFFF	# a not sign bit mask
 				.quad 0x07FFFFFFF7FFFFFFF

 # used by special case checking

 .L__float_one:			.quad 0x03f8000003f800000	# one
 				.quad 0x03f8000003f800000

 .L__mask_inf:			.quad 0x07f8000007F800000	# inifinity
 				.quad 0x07f8000007F800000

 .L__mask_sigbit:		.quad 0x00040000000400000	# QNaN bit
 				.quad 0x00040000000400000

	#
	# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
	#
	# This file is part of libacml_mv.
	#
	# libacml_mv is free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.
	#
	# libacml_mv is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public
	# License along with libacml_mv. If not, see
	# <http://www.gnu.org/licenses/>.
	#
	#





	#
	# vrs4powxf.asm
	#
	# A vector implementation of the powf libm function.
	# This routine raises the x vector to a constant y power.
	#
	# Prototype:
	#
	# __m128 __vrs4_powxf(__m128 x,float y);
	#
	# Computes x raised to the y power. Returns proper C99 values.
	#
	#
	#ifdef __ELF__
	.section .note.GNU-stack,"",@progbits
	#endif



	# define local variable storage offsets
	.equ p_temp,0x00 # xmmword
	.equ p_negateres,0x10 # qword

	.equ save_rbx,0x020 #qword
	.equ save_rsi,0x028 #qword

	.equ p_xptr,0x030 # ptr to x values
	.equ p_y,0x038 # y value

	.equ p_inty,0x040 # integer y indicators

	.equ p_ux,0x050 # absolute x
	.equ p_ax,0x060 # absolute x
	.equ p_sx,0x070 # sign of x's

	.equ stack_size,0x088 #





	.text
	.align 16
	.p2align 4,,15
	.globl __vrs4_powxf
	.type __vrs4_powxf,@function
	__vrs4_powxf:

	sub $stack_size,%rsp
	mov %rbx,save_rbx(%rsp) # save rbx

	lea p_ux(%rsp),%rcx
	mov %rcx,p_xptr(%rsp) # save pointer to x
	movaps %xmm0,(%rcx)
	movss %xmm1,p_y(%rsp) # save y

	movdqa %xmm1,%xmm4

	movaps %xmm0,%xmm2
	andps .L__mask_nsign(%rip),%xmm0 # get abs x
	andps .L__mask_sign(%rip),%xmm2 # mask for the sign bits
	movaps %xmm0,p_ax(%rsp) # save them
	movaps %xmm2,p_sx(%rsp) # save them
	# convert all four x's to double
	cvtps2pd p_ax(%rsp),%xmm0
	cvtps2pd p_ax+8(%rsp),%xmm1
	#
	# classify y
	# vector 32 bit integer method 25 cycles to here
	# /* See whether y is an integer.
	# inty = 0 means not an integer.
	# */
	# get yexp
	mov p_y(%rsp),%r8d # r8 is uy
	mov $0x07fffffff,%r9d
	and %r8d,%r9d # r9 is ay

	## if \|y\| == 0 then return 1
	cmp $0,%r9d # is y a zero?
	jz .Ly_zero

	mov $0x07f800000,%eax # EXPBITS_SP32
	and %r9d,%eax # y exp

	xor %edi,%edi
	shr $23,%eax #>> EXPSHIFTBITS_SP32
	sub $126,%eax # - EXPBIAS_SP32 + 1 - eax is now the unbiased exponent
	mov $1,%ebx
	cmp %ebx,%eax ## if (yexp < 1)
	cmovl %edi,%ebx
	jl .Lsave_inty

	mov $24,%ecx
	cmp %ecx,%eax ## if (yexp >24)
	jle .Linfy1
	mov $2,%ebx
	jmp .Lsave_inty
	.Linfy1: # else 1<=yexp<=24
	sub %eax,%ecx # build mask for mantissa
	shl %cl,%ebx
	dec %ebx # rbx = mask = (1 << (24 - yexp)) - 1

	mov %r8d,%eax
	and %ebx,%eax ## if ((uy & mask) != 0)
	cmovnz %edi,%ebx # inty = 0;
	jnz .Lsave_inty

	not %ebx # else if (((uy & ~mask) >> (24 - yexp)) & 0x00000001)
	mov %r8d,%eax
	and %ebx,%eax
	shr %cl,%eax
	inc %edi
	and %edi,%eax
	mov %edi,%ebx # inty = 1
	jnz .Lsave_inty
	inc %ebx # else inty = 2


	.Lsave_inty:
	mov %r8d,p_y+4(%rsp) # r8d is ay
	mov %ebx,p_inty(%rsp) # save inty
	#
	# do more x special case checking
	#
	pxor %xmm3,%xmm3
	xor %eax,%eax
	mov $0x07FC00000,%ecx
	cmp $0,%ebx # is y not an integer?
	cmovz %ecx,%eax # then set to return a NaN. else 0.
	mov $0x080000000,%ecx
	cmp $1,%ebx # is y an odd integer?
	cmovz %ecx,%eax # maybe set sign bit if so
	movd %eax,%xmm5
	pshufd $0,%xmm5,%xmm5

	pcmpeqd p_sx(%rsp),%xmm3 ## if the signs are set
	pandn %xmm5,%xmm3 # then negateres gets the values as shown below
	movdqa %xmm3,p_negateres(%rsp) # save negateres

	# /* p_negateres now means the following.
	# 7FC00000 means x<0, y not an integer, return NaN.
	# 80000000 means x<0, y is odd integer, so set the sign bit.
	## 0 means even integer, and/or x>=0.
	# */

	# ** Here starts the main calculations **
	# The algorithm used is x*y = exp(ylog(x))
	# Extra precision is required in intermediate steps to meet the 1ulp requirement
	#
	# log(x) calculation
	call __vrd4_log@PLT # get the double precision log value
	# for all four x's
	# y* logx
	cvtps2pd p_y(%rsp),%xmm2 #convert the two packed single y's to double

	# /* just multiply by y */
	mulpd %xmm2,%xmm0
	mulpd %xmm2,%xmm1

	# /* The following code computes r = exp(w) */
	call __vrd4_exp@PLT # get the double exp value
	# for all four y*log(x)'s
	#
	# convert all four results to double
	cvtpd2ps %xmm0,%xmm0
	cvtpd2ps %xmm1,%xmm1
	movlhps %xmm1,%xmm0

	# perform special case and error checking on input values

	# special case checking is done first in the scalar version since
	# it allows for early fast returns. But for vectors, we consider them
	# to be rare, so early returns are not necessary. So we first compute
	# the x**y values, and then check for special cases.

	# we do some of the checking in reverse order of the scalar version.
	# apply the negate result flags
	orps p_negateres(%rsp),%xmm0 # get negateres

	## if y is infinite or so large that the result would overflow or underflow
	mov p_y(%rsp),%edx # get y
	and $0x07fffffff,%edx # develop ay
	cmp $0x04f000000,%edx
	ja .Ly_large
	.Lrnsx3:

	## if x is infinite
	movdqa p_ax(%rsp),%xmm4
	cmpps $0,.L__mask_inf(%rip),%xmm4 # equal to infinity, ffs if so.
	movmskps %xmm4,%edx
	test $0x0f,%edx
	jnz .Lx_infinite
	.Lrnsx1:
	## if x is zero
	xorps %xmm4,%xmm4
	cmpps $0,p_ax(%rsp),%xmm4 # equal to zero, ffs if so.
	movmskps %xmm4,%edx
	test $0x0f,%edx
	jnz .Lx_zero
	.Lrnsx2:
	## if y is NAN
	movss p_y(%rsp),%xmm4 # get y
	ucomiss %xmm4,%xmm4 # comparing y to itself should
	# be true, unless y is a NaN. parity flag if NaN.
	jp .Ly_NaN
	.Lrnsx4:
	## if x is NAN
	movdqa p_ax(%rsp),%xmm4 # get x
	cmpps $4,%xmm4,%xmm4 # a compare not equal of x to itself should
	# be false, unless x is a NaN. ff's if NaN.
	movmskps %xmm4,%ecx
	test $0x0f,%ecx
	jnz .Lx_NaN
	.Lrnsx5:

	## if x == +1, return +1 for all x
	movdqa .L__float_one(%rip),%xmm3 # one
	mov p_xptr(%rsp),%rdx # get pointer to x
	movdqa %xmm3,%xmm2
	cmpps $4,(%rdx),%xmm2 # not equal to +1.0?, ffs if not equal.
	andps %xmm2,%xmm0 # keep the others
	andnps %xmm3,%xmm2 # mask for ones
	orps %xmm2,%xmm0

	.L__powf_cleanup2:

	mov save_rbx(%rsp),%rbx # restore rbx
	add $stack_size,%rsp
	ret

	.align 16
	.Ly_zero:
	## if \|y\| == 0 then return 1
	movdqa .L__float_one(%rip),%xmm0 # one
	jmp .L__powf_cleanup2
	# * y is a NaN.
	.Ly_NaN:
	mov p_y(%rsp),%r8d
	or $0x000400000,%r8d # convert to QNaNs
	movd %r8d,%xmm0 # propagate to all results
	shufps $0,%xmm0,%xmm0
	jmp .Lrnsx4

	# y is a NaN.
	.Lx_NaN:
	mov p_xptr(%rsp),%rcx # get pointer to x
	movdqa (%rcx),%xmm4 # get x
	movdqa %xmm4,%xmm3
	movdqa %xmm4,%xmm5
	movdqa .L__mask_sigbit(%rip),%xmm2 # get the signalling bits
	cmpps $0,%xmm4,%xmm4 # a compare equal of x to itself should
	# be true, unless x is a NaN. 0's if NaN.
	cmpps $4,%xmm3,%xmm3 # compare not equal, ff's if NaN.
	andps %xmm4,%xmm0 # keep the other results
	andps %xmm3,%xmm2 # get just the right signalling bits
	andps %xmm5,%xmm3 # mask for the NaNs
	orps %xmm2,%xmm3 # convert to QNaNs
	orps %xmm3,%xmm0 # combine
	jmp .Lrnsx5

	# * y is infinite or so large that the result would
	# overflow or underflow.
	.Ly_large:
	movdqa %xmm0,p_temp(%rsp)

	mov p_xptr(%rsp),%rcx # get pointer to x
	mov (%rcx),%eax
	mov p_y(%rsp),%ebx
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special6 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp(%rsp)

	mov p_xptr(%rsp),%rcx # get pointer to x
	mov 4(%rcx),%eax
	mov p_y(%rsp),%ebx
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special6 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp+4(%rsp)

	mov p_xptr(%rsp),%rcx # get pointer to x
	mov 8(%rcx),%eax
	mov p_y(%rsp),%ebx
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special6 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp+8(%rsp)

	mov p_xptr(%rsp),%rcx # get pointer to x
	mov 12(%rcx),%eax
	mov p_y(%rsp),%ebx
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special6 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp+12(%rsp)

	movdqa p_temp(%rsp),%xmm0
	jmp .Lrnsx3

	# a subroutine to treat an individual x,y pair when y is large or infinity
	# assumes x in .Ly(%rip),%eax in ebx.
	# returns result in eax
	.Lnp_special6:
	# handle \|x\|==1 cases first
	mov $0x07FFFFFFF,%r8d
	and %eax,%r8d
	cmp $0x03f800000,%r8d # jump if \|x\| !=1
	jnz .Lnps6
	mov $0x03f800000,%eax # return 1 for all \|x\|==1
	jmp .Lnpx64

	# cases where \|x\| !=1
	.Lnps6:
	mov $0x07f800000,%ecx
	xor %eax,%eax # assume 0 return
	test $0x080000000,%ebx
	jnz .Lnps62 # jump if y negative
	# y = +inf
	cmp $0x03f800000,%r8d
	cmovg %ecx,%eax # return inf if \|x\| < 1
	jmp .Lnpx64
	.Lnps62:
	# y = -inf
	cmp $0x03f800000,%r8d
	cmovl %ecx,%eax # return inf if \|x\| < 1
	jmp .Lnpx64

	.Lnpx64:
	ret

	# handle cases where x is +/- infinity. edx is the mask
	.align 16
	.Lx_infinite:
	movdqa %xmm0,p_temp(%rsp)

	test $1,%edx
	jz .Lxinfa
	mov p_xptr(%rsp),%rcx # get pointer to x
	mov (%rcx),%eax
	mov p_y(%rsp),%ebx
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special_x1 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp(%rsp)
	.Lxinfa:
	test $2,%edx
	jz .Lxinfb
	mov p_xptr(%rsp),%rcx # get pointer to x
	mov p_y(%rsp),%ebx
	mov 4(%rcx),%eax
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special_x1 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp+4(%rsp)
	.Lxinfb:
	test $4,%edx
	jz .Lxinfc
	mov p_xptr(%rsp),%rcx # get pointer to x
	mov p_y(%rsp),%ebx
	mov 8(%rcx),%eax
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special_x1 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp+8(%rsp)
	.Lxinfc:
	test $8,%edx
	jz .Lxinfd
	mov p_xptr(%rsp),%rcx # get pointer to x
	mov p_y(%rsp),%ebx
	mov 12(%rcx),%eax
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special_x1 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp+12(%rsp)
	.Lxinfd:
	movdqa p_temp(%rsp),%xmm0
	jmp .Lrnsx1

	# a subroutine to treat an individual x,y pair when x is +/-infinity
	# assumes x in .Ly(%rip),%eax in ebx, inty in ecx.
	# returns result in eax
	.Lnp_special_x1: # x is infinite
	test $0x080000000,%eax # is x positive
	jnz .Lnsx11 # jump if not
	test $0x080000000,%ebx # is y positive
	jz .Lnsx13 # just return if so
	xor %eax,%eax # else return 0
	jmp .Lnsx13

	.Lnsx11:
	cmp $1,%ecx ## if inty ==1
	jnz .Lnsx12 # jump if not
	test $0x080000000,%ebx # is y positive
	jz .Lnsx13 # just return if so
	mov $0x080000000,%eax # else return -0
	jmp .Lnsx13
	.Lnsx12: # inty <>1
	and $0x07FFFFFFF,%eax # return -x (\|x\|) if y<0
	test $0x080000000,%ebx # is y positive
	jz .Lnsx13 #
	xor %eax,%eax # return 0 if y >=0
	.Lnsx13:
	ret


	# handle cases where x is +/- zero. edx is the mask of x,y pairs with \|x\|=0
	.align 16
	.Lx_zero:
	movdqa %xmm0,p_temp(%rsp)

	test $1,%edx
	jz .Lxzera
	mov p_xptr(%rsp),%rcx # get pointer to x
	mov p_y(%rsp),%ebx
	mov (%rcx),%eax
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special_x2 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp(%rsp)
	.Lxzera:
	test $2,%edx
	jz .Lxzerb
	mov p_xptr(%rsp),%rcx # get pointer to x
	mov p_y(%rsp),%ebx
	mov 4(%rcx),%eax
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special_x2 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp+4(%rsp)
	.Lxzerb:
	test $4,%edx
	jz .Lxzerc
	mov p_xptr(%rsp),%rcx # get pointer to x
	mov p_y(%rsp),%ebx
	mov 8(%rcx),%eax
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special_x2 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp+8(%rsp)
	.Lxzerc:
	test $8,%edx
	jz .Lxzerd
	mov p_xptr(%rsp),%rcx # get pointer to x
	mov p_y(%rsp),%ebx
	mov 12(%rcx),%eax
	mov p_inty(%rsp),%ecx
	sub $8,%rsp
	call .Lnp_special_x2 # call the handler for one value
	add $8,%rsp
	mov %eax,p_temp+12(%rsp)
	.Lxzerd:
	movdqa p_temp(%rsp),%xmm0
	jmp .Lrnsx2

	# a subroutine to treat an individual x,y pair when x is +/-0
	# assumes x in .Ly(%rip),%eax in ebx, inty in ecx.
	# returns result in eax
	.align 16
	.Lnp_special_x2:
	cmp $1,%ecx ## if inty ==1
	jz .Lnsx21 # jump if so
	# handle cases of x=+/-0, y not integer
	xor %eax,%eax
	mov $0x07f800000,%ecx
	test $0x080000000,%ebx # is ypos
	cmovnz %ecx,%eax
	jmp .Lnsx23
	# y is an integer
	.Lnsx21:
	xor %r8d,%r8d
	mov $0x07f800000,%ecx
	test $0x080000000,%ebx # is ypos
	cmovnz %ecx,%r8d # set to infinity if not
	and $0x080000000,%eax # pickup the sign of x
	or %r8d,%eax # and include it in the result
	.Lnsx23:
	ret


	.data
	.align 64

	.L__mask_sign: .quad 0x08000000080000000 # a sign bit mask
	.quad 0x08000000080000000

	.L__mask_nsign: .quad 0x07FFFFFFF7FFFFFFF # a not sign bit mask
	.quad 0x07FFFFFFF7FFFFFFF

	# used by special case checking

	.L__float_one: .quad 0x03f8000003f800000 # one
	.quad 0x03f8000003f800000

	.L__mask_inf: .quad 0x07f8000007F800000 # inifinity
	.quad 0x07f8000007F800000

	.L__mask_sigbit: .quad 0x00040000000400000 # QNaN bit
	.quad 0x00040000000400000