| |
| # |
| # (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| # |
| # This file is part of libacml_mv. |
| # |
| # libacml_mv is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU Lesser General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # libacml_mv is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # Lesser General Public License for more details. |
| # |
| # You should have received a copy of the GNU Lesser General Public |
| # License along with libacml_mv. If not, see |
| # <http://www.gnu.org/licenses/>. |
| # |
| # |
| |
| |
| |
| |
| |
| # |
| # vrsapowf.asm |
| # |
| # An array implementation of the powf libm function. |
| # |
| # Prototype: |
| # |
| # void vrsa_powf(int n, float *x, float *y, float *z); |
| # |
| # Computes x raised to the y power. |
| # |
| # Places the results into the supplied z array. |
| # Does not perform error handling, but does return C99 values for error |
| # inputs. Denormal results are truncated to 0. |
| |
| # |
| # |
| |
| #ifdef __ELF__ |
| .section .note.GNU-stack,"",@progbits |
| #endif |
| |
| # define local variable storage offsets |
| .equ p_temp,0x00 # xmmword |
| .equ p_negateres,0x10 # qword |
| |
| |
| .equ save_rbx,0x030 #qword |
| |
| |
| .equ p_ax,0x050 # absolute x |
| .equ p_sx,0x060 # sign of x's |
| |
| .equ p_ay,0x070 # absolute y |
| .equ p_yexp,0x080 # unbiased exponent of y |
| |
| .equ p_inty,0x090 # integer y indicators |
| |
| .equ p_xptr,0x0a0 # ptr to x values |
| .equ p_yptr,0x0a8 # ptr to y values |
| .equ p_zptr,0x0b0 # ptr to z values |
| |
| .equ p_nv,0x0b8 #qword |
| .equ p_iter,0x0c0 # qword storage for number of loop iterations |
| |
| .equ p2_temp,0x0d0 #qword |
| .equ p2_temp1,0x0f0 #qword |
| |
| .equ stack_size,0x0118 # allocate 40h more than |
| # we need to avoid bank conflicts |
| |
| |
| |
| |
| .weak vrsa_powf_ |
| .set vrsa_powf_,__vrsa_powf__ |
| .weak vrsa_powf__ |
| .set vrsa_powf__,__vrsa_powf__ |
| |
| .text |
| .align 16 |
| .p2align 4,,15 |
| |
| #/* a FORTRAN subroutine implementation of array powf |
| #** VRSA_POWF(N,X,Y,Z) |
| #** C equivalent |
| #*/ |
| #void vrsa_powf_(int * n, float *x, float *y, float *z) |
| #{ |
| # vrsa_powf(*n,x,y,z); |
| #} |
| |
| .globl __vrsa_powf__ |
| .type __vrsa_powf__,@function |
| __vrsa_powf__: |
| mov (%rdi),%edi |
| |
| |
| # parameters are passed in by Linux as: |
| # edi - int n |
| # rsi - float *x |
| # rdx - float *y |
| # rcx - float *z |
| |
| .globl vrsa_powf |
| .type vrsa_powf,@function |
| vrsa_powf: |
| |
| sub $stack_size,%rsp |
| mov %rbx,save_rbx(%rsp) # save rbx |
| # save the arguments |
| mov %rsi,p_xptr(%rsp) # save pointer to x |
| mov %rdx,p_yptr(%rsp) # save pointer to y |
| mov %rcx,p_zptr(%rsp) # save pointer to z |
| #ifdef INTEGER64 |
| mov %rdi,%rax |
| #else |
| mov %edi,%eax |
| #endif |
| |
| mov %rax,%rcx |
| mov %rcx,p_nv(%rsp) # save number of values |
| # see if too few values to call the main loop |
| shr $2,%rax # get number of iterations |
| jz .L__vsa_cleanup # jump if only single calls |
| # prepare the iteration counts |
| mov %rax,p_iter(%rsp) # save number of iterations |
| shl $2,%rax |
| sub %rax,%rcx # compute number of extra single calls |
| mov %rcx,p_nv(%rsp) # save number of left over values |
| |
| # process the array 4 values at a time. |
| |
| .L__vsa_top: |
| # build the input _m128 |
| # first get x |
| mov p_xptr(%rsp),%rsi # get x_array pointer |
| movups (%rsi),%xmm0 |
| prefetch 64(%rsi) |
| |
| movaps %xmm0,%xmm2 |
| andps .L__mask_nsign(%rip),%xmm0 # get abs x |
| andps .L__mask_sign(%rip),%xmm2 # mask for the sign bits |
| movaps %xmm0,p_ax(%rsp) # save them |
| movaps %xmm2,p_sx(%rsp) # save them |
| # convert all four x's to double |
| cvtps2pd p_ax(%rsp),%xmm0 |
| cvtps2pd p_ax+8(%rsp),%xmm1 |
| # |
| # classify y |
| # vector 32 bit integer method 25 cycles to here |
| # /* See whether y is an integer. |
| # inty = 0 means not an integer. |
| # inty = 1 means odd integer. |
| # inty = 2 means even integer. |
| # */ |
| mov p_yptr(%rsp),%rdi # get y_array pointer |
| movups (%rdi),%xmm4 |
| prefetch 64(%rdi) |
| pxor %xmm3,%xmm3 |
| pand .L__mask_nsign(%rip),%xmm4 # get abs y in integer format |
| movdqa %xmm4,p_ay(%rsp) # save it |
| |
| # see if the number is less than 1.0 |
| psrld $23,%xmm4 #>> EXPSHIFTBITS_SP32 |
| |
| psubd .L__mask_127(%rip),%xmm4 # yexp, unbiased exponent |
| movdqa %xmm4,p_yexp(%rsp) # save it |
| paddd .L__mask_1(%rip),%xmm4 # yexp+1 |
| pcmpgtd %xmm3,%xmm4 # 0 if exp less than 126 (2^0) (y < 1.0), else FFs |
| # xmm4 is ffs if abs(y) >=1.0, else 0 |
| |
| # see if the mantissa has fractional bits |
| #build mask for mantissa |
| movdqa .L__mask_23(%rip),%xmm2 |
| psubd p_yexp(%rsp),%xmm2 # 24-yexp |
| pmaxsw %xmm3,%xmm2 # no shift counts less than 0 |
| movdqa %xmm2,p_temp(%rsp) # save the shift counts |
| # create mask for all four values |
| # SSE can't individual shifts so have to do 0xeac one seperately |
| mov p_temp(%rsp),%rcx |
| mov $1,%rbx |
| shl %cl,%ebx #1 << (24 - yexp) |
| shr $32,%rcx |
| mov $1,%eax |
| shl %cl,%eax #1 << (24 - yexp) |
| shl $32,%rax |
| add %rax,%rbx |
| mov %rbx,p_temp(%rsp) |
| mov p_temp+8(%rsp),%rcx |
| mov $1,%rbx |
| shl %cl,%ebx #1 << (24 - yexp) |
| shr $32,%rcx |
| mov $1,%eax |
| shl %cl,%eax #1 << (24 - yexp) |
| shl $32,%rax |
| add %rbx,%rax |
| mov %rax,p_temp+8(%rsp) |
| movdqa p_temp(%rsp),%xmm5 |
| psubd .L__mask_1(%rip),%xmm5 #= mask = (1 << (24 - yexp)) - 1 |
| |
| # now use the mask to see if there are any fractional bits |
| movdqu (%rdi),%xmm2 # get uy |
| pand %xmm5,%xmm2 # uy & mask |
| pcmpeqd %xmm3,%xmm2 # 0 if not zero (y has fractional mantissa bits), else FFs |
| pand %xmm4,%xmm2 # either 0s or ff |
| # xmm2 now accounts for y< 1.0 or y>=1.0 and y has fractional mantissa bits, |
| # it has the value 0 if we know it's non-integer or ff if integer. |
| |
| # now see if it's even or odd. |
| |
| ## if yexp > 24, then it has to be even |
| movdqa .L__mask_24(%rip),%xmm4 |
| psubd p_yexp(%rsp),%xmm4 # 24-yexp |
| paddd .L__mask_1(%rip),%xmm5 # mask+1 = least significant integer bit |
| pcmpgtd %xmm3,%xmm4 # if 0, then must be even, else ff's |
| |
| pand %xmm4,%xmm5 # set the integer bit mask to zero if yexp>24 |
| paddd .L__mask_2(%rip),%xmm4 |
| por .L__mask_2(%rip),%xmm4 |
| pand %xmm2,%xmm4 # result can be 0, 2, or 3 |
| |
| # now for integer numbers, see if odd or even |
| pand .L__mask_mant(%rip),%xmm5 # mask out exponent bits |
| movdqu (%rdi),%xmm2 |
| pand %xmm2,%xmm5 # & uy -> even or odd |
| movdqa .L__float_one(%rip),%xmm2 |
| pcmpeqd p_ay(%rsp),%xmm2 # is ay equal to 1, ff's if so, then it's odd |
| pand .L__mask_nsign(%rip),%xmm2 # strip the sign bit so the gt comparison works. |
| por %xmm2,%xmm5 |
| pcmpgtd %xmm3,%xmm5 # if odd then ff's, else 0's for even |
| paddd .L__mask_2(%rip),%xmm5 # gives us 2 for even, 1 for odd |
| pand %xmm5,%xmm4 |
| |
| movdqa %xmm4,p_inty(%rsp) # save inty |
| # |
| # do more x special case checking |
| # |
| movdqa %xmm4,%xmm5 |
| pcmpeqd %xmm3,%xmm5 # is not an integer? ff's if so |
| pand .L__mask_NaN(%rip),%xmm5 # these values will be NaNs, if x<0 |
| movdqa %xmm4,%xmm2 |
| pcmpeqd .L__mask_1(%rip),%xmm2 # is it odd? ff's if so |
| pand .L__mask_sign(%rip),%xmm2 # these values will get their sign bit set |
| por %xmm2,%xmm5 |
| |
| pcmpeqd p_sx(%rsp),%xmm3 # if the signs are set |
| pandn %xmm5,%xmm3 # then negateres gets the values as shown below |
| movdqa %xmm3,p_negateres(%rsp) # save negateres |
| |
| # /* p_negateres now means the following. |
| # 7FC00000 means x<0, y not an integer, return NaN. |
| # 80000000 means x<0, y is odd integer, so set the sign bit. |
| ## 0 means even integer, and/or x>=0. |
| # */ |
| |
| |
| # **** Here starts the main calculations **** |
| # The algorithm used is x**y = exp(y*log(x)) |
| # Extra precision is required in intermediate steps to meet the 1ulp requirement |
| # |
| # log(x) calculation |
| call __vrd4_log@PLT # get the double precision log value |
| # for all four x's |
| # y* logx |
| # convert all four y's to double |
| # mov p_yptr(%rsp),%rdi ; get y_array pointer |
| cvtps2pd (%rdi),%xmm2 |
| cvtps2pd 8(%rdi),%xmm3 |
| |
| # /* just multiply by y */ |
| mulpd %xmm2,%xmm0 |
| mulpd %xmm3,%xmm1 |
| |
| # /* The following code computes r = exp(w) */ |
| call __vrd4_exp@PLT # get the double exp value |
| # for all four y*log(x)'s |
| mov p_xptr(%rsp),%rsi # get x_array pointer |
| mov p_yptr(%rsp),%rdi # get y_array pointer |
| # |
| # convert all four results to double |
| cvtpd2ps %xmm0,%xmm0 |
| cvtpd2ps %xmm1,%xmm1 |
| movlhps %xmm1,%xmm0 |
| |
| # perform special case and error checking on input values |
| |
| # special case checking is done first in the scalar version since |
| # it allows for early fast returns. But for vectors, we consider them |
| # to be rare, so early returns are not necessary. So we first compute |
| # the x**y values, and then check for special cases. |
| |
| # we do some of the checking in reverse order of the scalar version. |
| # apply the negate result flags |
| orps p_negateres(%rsp),%xmm0 # get negateres |
| |
| ## if y is infinite or so large that the result would overflow or underflow |
| movdqa p_ay(%rsp),%xmm4 |
| cmpps $5,.L__mask_ly(%rip),%xmm4 # y not less than large value, ffs if so. |
| movmskps %xmm4,%edx |
| test $0x0f,%edx |
| jnz .Ly_large |
| .Lrnsx3: |
| |
| ## if x is infinite |
| movdqa p_ax(%rsp),%xmm4 |
| cmpps $0,.L__mask_inf(%rip),%xmm4 # equal to infinity, ffs if so. |
| movmskps %xmm4,%edx |
| test $0x0f,%edx |
| jnz .Lx_infinite |
| .Lrnsx1: |
| ## if x is zero |
| xorps %xmm4,%xmm4 |
| cmpps $0,p_ax(%rsp),%xmm4 # equal to zero, ffs if so. |
| movmskps %xmm4,%edx |
| test $0x0f,%edx |
| jnz .Lx_zero |
| .Lrnsx2: |
| ## if y is NAN |
| movdqu (%rdi),%xmm4 # get y |
| cmpps $4,%xmm4,%xmm4 # a compare not equal of y to itself should |
| # be false, unless y is a NaN. ff's if NaN. |
| movmskps %xmm4,%ecx |
| test $0x0f,%ecx |
| jnz .Ly_NaN |
| .Lrnsx4: |
| ## if x is NAN |
| movdqu (%rsi),%xmm4 # get x |
| cmpps $4,%xmm4,%xmm4 # a compare not equal of x to itself should |
| # be false, unless x is a NaN. ff's if NaN. |
| movmskps %xmm4,%ecx |
| test $0x0f,%ecx |
| jnz .Lx_NaN |
| .Lrnsx5: |
| |
| ## if |y| == 0 then return 1 |
| movdqa .L__float_one(%rip),%xmm3 # one |
| xorps %xmm2,%xmm2 |
| cmpps $4,p_ay(%rsp),%xmm2 # not equal to 0.0?, ffs if not equal. |
| andps %xmm2,%xmm0 # keep the others |
| andnps %xmm3,%xmm2 # mask for ones |
| orps %xmm2,%xmm0 |
| ## if x == +1, return +1 for all x |
| movdqa %xmm3,%xmm2 |
| movdqu (%rsi),%xmm5 |
| cmpps $4,%xmm5,%xmm2 # not equal to +1.0?, ffs if not equal. |
| andps %xmm2,%xmm0 # keep the others |
| andnps %xmm3,%xmm2 # mask for ones |
| orps %xmm2,%xmm0 |
| |
| .L__powf_cleanup2: |
| |
| # update the x and y pointers |
| add $16,%rdi |
| add $16,%rsi |
| mov %rsi,p_xptr(%rsp) # save x_array pointer |
| mov %rdi,p_yptr(%rsp) # save y_array pointer |
| # store the result _m128d |
| mov p_zptr(%rsp),%rdi # get z_array pointer |
| movups %xmm0,(%rdi) |
| # prefetchw QWORD PTR [rdi+64] |
| prefetch 64(%rdi) |
| add $16,%rdi |
| mov %rdi,p_zptr(%rsp) # save z_array pointer |
| |
| |
| mov p_iter(%rsp),%rax # get number of iterations |
| sub $1,%rax |
| mov %rax,p_iter(%rsp) # save number of iterations |
| jnz .L__vsa_top |
| |
| |
| # see if we need to do any extras |
| mov p_nv(%rsp),%rax # get number of values |
| test %rax,%rax |
| jnz .L__vsa_cleanup |
| |
| .L__final_check: |
| mov save_rbx(%rsp),%rbx # restore rbx |
| add $stack_size,%rsp |
| ret |
| |
| .align 16 |
| # we jump here when we have an odd number of log calls to make at the |
| # end |
| .L__vsa_cleanup: |
| mov p_nv(%rsp),%rax # get number of values |
| test %rax,%rax # are there any values |
| jz .L__final_check # exit if not |
| |
| mov p_xptr(%rsp),%rsi |
| mov p_yptr(%rsp),%rdi |
| |
| # fill in a m128 with zeroes and the extra values and then make a recursive call. |
| xorps %xmm0,%xmm0 |
| movaps %xmm0,p2_temp(%rsp) |
| movaps %xmm0,p2_temp+16(%rsp) |
| |
| mov (%rsi),%ecx # we know there's at least one |
| mov %ecx,p2_temp(%rsp) |
| mov (%rdi),%edx # we know there's at least one |
| mov %edx,p2_temp+16(%rsp) |
| cmp $2,%rax |
| jl .L__vsacg |
| |
| mov 4(%rsi),%ecx # do the second value |
| mov %ecx,p2_temp+4(%rsp) |
| mov 4(%rdi),%edx # we know there's at least one |
| mov %edx,p2_temp+20(%rsp) |
| cmp $3,%rax |
| jl .L__vsacg |
| |
| mov 8(%rsi),%ecx # do the third value |
| mov %ecx,p2_temp+8(%rsp) |
| mov 8(%rdi),%edx # we know there's at least one |
| mov %edx,p2_temp+24(%rsp) |
| |
| .L__vsacg: |
| mov $4,%rdi # parameter for N |
| lea p2_temp(%rsp),%rsi # &x parameter |
| lea p2_temp+16(%rsp),%rdx # &y parameter |
| lea p2_temp1(%rsp),%rcx # &z parameter |
| call vrsa_powf@PLT # call recursively to compute four values |
| |
| # now copy the results to the destination array |
| mov p_zptr(%rsp),%rdi |
| mov p_nv(%rsp),%rax # get number of values |
| mov p2_temp1(%rsp),%ecx |
| mov %ecx,(%rdi) # we know there's at least one |
| cmp $2,%rax |
| jl .L__vsacgf |
| |
| mov p2_temp1+4(%rsp),%ecx |
| mov %ecx,4(%rdi) # do the second value |
| cmp $3,%rax |
| jl .L__vsacgf |
| |
| mov p2_temp1+8(%rsp),%ecx |
| mov %ecx,8(%rdi) # do the third value |
| |
| .L__vsacgf: |
| jmp .L__final_check |
| |
| .align 16 |
| # y is a NaN. |
| .Ly_NaN: |
| mov p_yptr(%rsp),%rdx # get pointer to y |
| movdqu (%rdx),%xmm4 # get y |
| movdqa %xmm4,%xmm3 |
| movdqa %xmm4,%xmm5 |
| movdqa .L__mask_sigbit(%rip),%xmm2 # get the signalling bits |
| cmpps $0,%xmm4,%xmm4 # a compare equal of y to itself should |
| # be true, unless y is a NaN. 0's if NaN. |
| cmpps $4,%xmm3,%xmm3 # compare not equal, ff's if NaN. |
| andps %xmm4,%xmm0 # keep the other results |
| andps %xmm3,%xmm2 # get just the right signalling bits |
| andps %xmm5,%xmm3 # mask for the NaNs |
| orps %xmm2,%xmm3 # convert to QNaNs |
| orps %xmm3,%xmm0 # combine |
| jmp .Lrnsx4 |
| |
| # y is a NaN. |
| .Lx_NaN: |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| movdqu (%rcx),%xmm4 # get x |
| movdqa %xmm4,%xmm3 |
| movdqa %xmm4,%xmm5 |
| movdqa .L__mask_sigbit(%rip),%xmm2 # get the signalling bits |
| cmpps $0,%xmm4,%xmm4 # a compare equal of x to itself should |
| # be true, unless x is a NaN. 0's if NaN. |
| cmpps $4,%xmm3,%xmm3 # compare not equal, ff's if NaN. |
| andps %xmm4,%xmm0 # keep the other results |
| andps %xmm3,%xmm2 # get just the right signalling bits |
| andps %xmm5,%xmm3 # mask for the NaNs |
| orps %xmm2,%xmm3 # convert to QNaNs |
| orps %xmm3,%xmm0 # combine |
| jmp .Lrnsx5 |
| |
| # y is infinite or so large that the result would |
| # overflow or underflow. |
| .Ly_large: |
| movdqa %xmm0,p_temp(%rsp) |
| |
| test $1,%edx |
| jz .Lylrga |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov (%rcx),%eax |
| mov (%rbx),%ebx |
| mov p_inty(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special6 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp(%rsp) |
| .Lylrga: |
| test $2,%edx |
| jz .Lylrgb |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov 4(%rcx),%eax |
| mov 4(%rbx),%ebx |
| mov p_inty+4(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special6 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp+4(%rsp) |
| .Lylrgb: |
| test $4,%edx |
| jz .Lylrgc |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov 8(%rcx),%eax |
| mov 8(%rbx),%ebx |
| mov p_inty+8(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special6 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp+8(%rsp) |
| .Lylrgc: |
| test $8,%edx |
| jz .Lylrgd |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov 12(%rcx),%eax |
| mov 12(%rbx),%ebx |
| mov p_inty+12(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special6 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp+12(%rsp) |
| .Lylrgd: |
| movdqa p_temp(%rsp),%xmm0 |
| jmp .Lrnsx3 |
| |
| # a subroutine to treat an individual x,y pair when y is large or infinity |
| # assumes x in .Ly(%rip),%eax in ebx. |
| # returns result in eax |
| .Lnp_special6: |
| # handle |x|==1 cases first |
| mov $0x07FFFFFFF,%r8d |
| and %eax,%r8d |
| cmp $0x03f800000,%r8d # jump if |x| !=1 |
| jnz .Lnps6 |
| mov $0x03f800000,%eax # return 1 for all |x|==1 |
| jmp .Lnpx64 |
| |
| # cases where |x| !=1 |
| .Lnps6: |
| mov $0x07f800000,%ecx |
| xor %eax,%eax # assume 0 return |
| test $0x080000000,%ebx |
| jnz .Lnps62 # jump if y negative |
| # y = +inf |
| cmp $0x03f800000,%r8d |
| cmovg %ecx,%eax # return inf if |x| < 1 |
| jmp .Lnpx64 |
| .Lnps62: |
| # y = -inf |
| cmp $0x03f800000,%r8d |
| cmovl %ecx,%eax # return inf if |x| < 1 |
| jmp .Lnpx64 |
| |
| .Lnpx64: |
| ret |
| |
| # handle cases where x is +/- infinity. edx is the mask |
| .align 16 |
| .Lx_infinite: |
| movdqa %xmm0,p_temp(%rsp) |
| |
| test $1,%edx |
| jz .Lxinfa |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov (%rcx),%eax |
| mov (%rbx),%ebx |
| mov p_inty(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special_x1 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp(%rsp) |
| .Lxinfa: |
| test $2,%edx |
| jz .Lxinfb |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov 4(%rcx),%eax |
| mov 4(%rbx),%ebx |
| mov p_inty+4(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special_x1 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp+4(%rsp) |
| .Lxinfb: |
| test $4,%edx |
| jz .Lxinfc |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov 8(%rcx),%eax |
| mov 8(%rbx),%ebx |
| mov p_inty+8(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special_x1 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp+8(%rsp) |
| .Lxinfc: |
| test $8,%edx |
| jz .Lxinfd |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov 12(%rcx),%eax |
| mov 12(%rbx),%ebx |
| mov p_inty+12(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special_x1 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp+12(%rsp) |
| .Lxinfd: |
| movdqa p_temp(%rsp),%xmm0 |
| jmp .Lrnsx1 |
| |
| # a subroutine to treat an individual x,y pair when x is +/-infinity |
| # assumes x in .Ly(%rip),%eax in ebx, inty in ecx. |
| # returns result in eax |
| .Lnp_special_x1: # x is infinite |
| test $0x080000000,%eax # is x positive |
| jnz .Lnsx11 # jump if not |
| test $0x080000000,%ebx # is y positive |
| jz .Lnsx13 # just return if so |
| xor %eax,%eax # else return 0 |
| jmp .Lnsx13 |
| |
| .Lnsx11: |
| cmp $1,%ecx # if inty ==1 |
| jnz .Lnsx12 # jump if not |
| test $0x080000000,%ebx # is y positive |
| jz .Lnsx13 # just return if so |
| mov $0x080000000,%eax # else return -0 |
| jmp .Lnsx13 |
| .Lnsx12: # inty <>1 |
| and $0x07FFFFFFF,%eax # return -x (|x|) if y<0 |
| test $0x080000000,%ebx # is y positive |
| jz .Lnsx13 # |
| xor %eax,%eax # return 0 if y >=0 |
| .Lnsx13: |
| ret |
| |
| |
| # handle cases where x is +/- zero. edx is the mask of x,y pairs with |x|=0 |
| .align 16 |
| .Lx_zero: |
| movdqa %xmm0,p_temp(%rsp) |
| |
| test $1,%edx |
| jz .Lxzera |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov (%rcx),%eax |
| mov (%rbx),%ebx |
| mov p_inty(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special_x2 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp(%rsp) |
| .Lxzera: |
| test $2,%edx |
| jz .Lxzerb |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov 4(%rcx),%eax |
| mov 4(%rbx),%ebx |
| mov p_inty+4(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special_x2 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp+4(%rsp) |
| .Lxzerb: |
| test $4,%edx |
| jz .Lxzerc |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov 8(%rcx),%eax |
| mov 8(%rbx),%ebx |
| mov p_inty+8(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special_x2 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp+8(%rsp) |
| .Lxzerc: |
| test $8,%edx |
| jz .Lxzerd |
| mov p_xptr(%rsp),%rcx # get pointer to x |
| mov p_yptr(%rsp),%rbx # get pointer to y |
| mov 12(%rcx),%eax |
| mov 12(%rbx),%ebx |
| mov p_inty+12(%rsp),%ecx |
| sub $8,%rsp |
| call .Lnp_special_x2 # call the handler for one value |
| add $8,%rsp |
| mov %eax,p_temp+12(%rsp) |
| .Lxzerd: |
| movdqa p_temp(%rsp),%xmm0 |
| jmp .Lrnsx2 |
| |
| # a subroutine to treat an individual x,y pair when x is +/-0 |
| # assumes x in .Ly(%rip),%eax in ebx, inty in ecx. |
| # returns result in eax |
| .align 16 |
| .Lnp_special_x2: |
| cmp $1,%ecx # if inty ==1 |
| jz .Lnsx21 # jump if so |
| # handle cases of x=+/-0, y not integer |
| xor %eax,%eax |
| mov $0x07f800000,%ecx |
| test $0x080000000,%ebx # is ypos |
| cmovnz %ecx,%eax |
| jmp .Lnsx23 |
| # y is an integer |
| .Lnsx21: |
| xor %r8d,%r8d |
| mov $0x07f800000,%ecx |
| test $0x080000000,%ebx # is ypos |
| cmovnz %ecx,%r8d # set to infinity if not |
| and $0x080000000,%eax # pickup the sign of x |
| or %r8d,%eax # and include it in the result |
| .Lnsx23: |
| ret |
| |
| |
| |
| .data |
| .align 64 |
| |
| .L__mask_sign: .quad 0x08000000080000000 # a sign bit mask |
| .quad 0x08000000080000000 |
| |
| .L__mask_nsign: .quad 0x07FFFFFFF7FFFFFFF # a not sign bit mask |
| .quad 0x07FFFFFFF7FFFFFFF |
| |
| # used by inty |
| .L__mask_127: .quad 0x00000007F0000007F # EXPBIAS_SP32 |
| .quad 0x00000007F0000007F |
| |
| .L__mask_mant: .quad 0x0007FFFFF007FFFFF # mantissa bit mask |
| .quad 0x0007FFFFF007FFFFF |
| |
| .L__mask_1: .quad 0x00000000100000001 # 1 |
| .quad 0x00000000100000001 |
| |
| .L__mask_2: .quad 0x00000000200000002 # 2 |
| .quad 0x00000000200000002 |
| |
| .L__mask_24: .quad 0x00000001800000018 # 24 |
| .quad 0x00000001800000018 |
| |
| .L__mask_23: .quad 0x00000001700000017 # 23 |
| .quad 0x00000001700000017 |
| |
| # used by special case checking |
| |
| .L__float_one: .quad 0x03f8000003f800000 # one |
| .quad 0x03f8000003f800000 |
| |
| .L__mask_inf: .quad 0x07f8000007F800000 # inifinity |
| .quad 0x07f8000007F800000 |
| |
| .L__mask_NaN: .quad 0x07fC000007FC00000 # NaN |
| .quad 0x07fC000007FC00000 |
| |
| .L__mask_sigbit: .quad 0x00040000000400000 # QNaN bit |
| .quad 0x00040000000400000 |
| |
| .L__mask_ly: .quad 0x04f0000004f000000 # large y |
| .quad 0x04f0000004f000000 |
| |
| |