| |
| # |
| # (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| # |
| # This file is part of libacml_mv. |
| # |
| # libacml_mv is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU Lesser General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # libacml_mv is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # Lesser General Public License for more details. |
| # |
| # You should have received a copy of the GNU Lesser General Public |
| # License along with libacml_mv. If not, see |
| # <http://www.gnu.org/licenses/>. |
| # |
| # |
| |
| |
| |
| |
| |
| # |
| # vrsapowxf.asm |
| # |
| # An array implementation of the powf libm function. |
| # This routine raises the x array to a constant y power. |
| # |
| # Prototype: |
| # |
| # void vrsa_powxf(int n, float *x, float y, float *z); |
| # |
| # Places the results into the supplied z array. |
| # Does not perform error handling, but does return C99 values for error |
| # inputs. Denormal results are truncated to 0. |
| # |
| # |
| |
| #ifdef __ELF__ |
| .section .note.GNU-stack,"",@progbits |
| #endif |
| |
| |
| # Stack frame layout: byte offsets from rsp once the prologue has run |
| .equ p_temp,0x00 # 16-byte scratch vector used by the special-case handlers |
| .equ p_negateres,0x10 # 16-byte vector of per-lane NaN / sign-bit fix-ups |
| |
| .equ p_xexp,0x20 # qword (unused in this routine) |
| |
| .equ save_rbx,0x30 # qword: caller's rbx |
| |
| .equ p_y,0x48 # y as float bits; a second copy is kept at p_y+4 |
| |
| .equ p_ax,0x50 # 16-byte vector: absolute values of the current four x |
| .equ p_sx,0x60 # 16-byte vector: sign bits of the current four x |
| |
| .equ p_ay,0x70 # absolute y (unused in this routine) |
| .equ p_yexp,0x80 # unbiased exponent of y (unused in this routine) |
| |
| .equ p_inty,0x90 # dword: 0 = y not an integer, 1 = odd integer, 2 = even integer |
| |
| .equ p_xptr,0xa0 # qword: pointer to the next x values to read |
| .equ p_zptr,0xb0 # qword: pointer to the next z values to write |
| |
| .equ p_nv,0xb8 # qword: element count, later the left-over (tail) count |
| .equ p_iter,0xc0 # qword: remaining four-wide loop iterations |
| |
| .equ p2_temp,0xd0 # 32 bytes: zero-padded x vector at +0, y at +16 (tail call inputs) |
| .equ p2_temp1,0xf0 # 16 bytes: results of the tail call |
| |
| .equ stack_size,0x118 # frame size; allocates 40h more than |
| # needed, to avoid bank conflicts |
| |
| |
| |
| |
| # Fortran-callable aliases; both name-mangling variants resolve to the |
| # same entry point. |
| .weak vrsa_powxf_ |
| .set vrsa_powxf_,__vrsa_powxf__ |
| .weak vrsa_powxf__ |
| .set vrsa_powxf__,__vrsa_powxf__ |
| |
| .text |
| .align 16 |
| .p2align 4,,15 |
| .globl __vrsa_powxf__ |
| .type __vrsa_powxf__,@function |
| __vrsa_powxf__: |
| |
| # FORTRAN subroutine form of the array powf: VRSA_POWXF(N,X,Y,Z) |
| # |
| # C equivalent: |
| # void vrsa_powxf_(int *n, float *x, float *y, float *z) |
| # { vrsa_powxf(*n, x, *y, z); } |
| # |
| # Linux FORTRAN passes every argument by reference: |
| # rdi - int *n |
| # rsi - float *x |
| # rdx - float *y |
| # rcx - float *z |
| # |
| # The three instructions below turn these into the C register layout |
| # (edi = n, xmm0 = y, rdx = z; rsi already holds x) and then fall |
| # straight through into vrsa_powxf. |
| # NOTE(review): the count is loaded as 32 bits even when INTEGER64 is |
| # defined, so only the low half of an 8-byte N is used - confirm intended. |
| movl (%rdi),%edi |
| movss (%rdx),%xmm0 |
| movq %rcx,%rdx |
| |
| |
| |
| |
| # parameters are passed in by Linux C as: |
| # edi - int n (all of rdi is used when INTEGER64 is defined) |
| # rsi - float *x |
| # xmm0 - float y |
| # rdx - float *z |
| # |
| # Computes z[i] = powf(x[i], y) for i in [0, n) as exp(y*log(x)) in double |
| # precision, four elements per pass, and then patches the special cases |
| # (large/infinite y, infinite x, zero x, NaN y, NaN x, x == +1). |
| # A left-over tail of 1 to 3 elements is handled at .L__vsa_cleanup. |
| # A count that is zero or negative returns without touching z. |
| # rbx is saved in the frame on entry and restored at .L__final_check. |
| |
| .globl vrsa_powxf |
| .type vrsa_powxf,@function |
| vrsa_powxf: |
| |
| sub $stack_size,%rsp |
| mov %rbx,save_rbx(%rsp) # save rbx |
| |
| movss %xmm0,p_y(%rsp) # save y |
| mov %rsi,p_xptr(%rsp) # save pointer to x |
| mov %rdx,p_zptr(%rsp) # save pointer to z |
| #ifdef INTEGER64 |
| mov %rdi,%rax |
| #else |
| movslq %edi,%rax # sign-extend, so a negative count stays negative |
| #endif |
| test %rax,%rax # just return if the count is zero or negative |
| jle .L__final_check |
| |
| mov %rax,%rcx |
| mov %rcx,p_nv(%rsp) # save number of values |
| |
| # |
| # classify y once, since it is the same for every element: |
| # inty = 0 means y is not an integer, |
| # inty = 1 means y is an odd integer, |
| # inty = 2 means y is an even integer. |
| # |
| mov p_y(%rsp),%r8d # r8 is uy |
| mov $0x07fffffff,%r9d |
| and %r8d,%r9d # r9 is ay |
| |
| ## if |y| == 0 then return 1 |
| cmp $0,%r9d # is y a zero? |
| jz .Ly_zero |
| |
| mov $0x07f800000,%eax # EXPBITS_SP32 |
| and %r9d,%eax # y exp |
| |
| xor %edi,%edi |
| shr $23,%eax #>> EXPSHIFTBITS_SP32 |
| sub $126,%eax # - EXPBIAS_SP32 + 1 - eax is now the unbiased exponent |
| mov $1,%ebx |
| cmp %ebx,%eax # if (yexp < 1) then |y| < 1: not an integer |
| cmovl %edi,%ebx |
| jl .Lsave_inty |
| |
| mov $24,%ecx |
| cmp %ecx,%eax # if (yexp > 24) every representable y is an even integer |
| jle .Lcly1 |
| mov $2,%ebx |
| jmp .Lsave_inty |
| .Lcly1: # else 1<=yexp<=24 |
| sub %eax,%ecx # build mask for mantissa |
| shl %cl,%ebx |
| dec %ebx # rbx = mask = (1 << (24 - yexp)) - 1 |
| |
| mov %r8d,%eax |
| and %ebx,%eax # if ((uy & mask) != 0) there are fraction bits |
| cmovnz %edi,%ebx # inty = 0; |
| jnz .Lsave_inty |
| |
| not %ebx # else if (((uy & ~mask) >> (24 - yexp)) & 0x00000001) |
| mov %r8d,%eax |
| and %ebx,%eax |
| shr %cl,%eax |
| inc %edi |
| and %edi,%eax # flags from this and decide odd/even; mov below keeps them |
| mov %edi,%ebx # inty = 1 |
| jnz .Lsave_inty |
| inc %ebx # else inty = 2 |
| |
| |
| .Lsave_inty: |
| mov %r8d,p_y+4(%rsp) # save an extra copy of y (read as a pair by cvtps2pd below) |
| mov %ebx,p_inty(%rsp) # save inty |
| |
| mov p_nv(%rsp),%rax # get number of values |
| mov %rax,%rcx |
| # see if too few values to call the main loop |
| shr $2,%rax # get number of iterations |
| jz .L__vsa_cleanup # jump if only single calls |
| # prepare the iteration counts |
| mov %rax,p_iter(%rsp) # save number of iterations |
| shl $2,%rax |
| sub %rax,%rcx # compute number of extra single calls |
| mov %rcx,p_nv(%rsp) # save number of left over values |
| |
| # process the array 4 values at a time. |
| |
| .L__vsa_top: |
| # build the input _m128 |
| # first get x |
| mov p_xptr(%rsp),%rsi # get x_array pointer |
| movups (%rsi),%xmm0 |
| prefetch 64(%rsi) |
| |
| |
| movaps %xmm0,%xmm2 |
| andps .L__mask_nsign(%rip),%xmm0 # get abs x |
| andps .L__mask_sign(%rip),%xmm2 # mask for the sign bits |
| movaps %xmm0,p_ax(%rsp) # save them |
| movaps %xmm2,p_sx(%rsp) # save them |
| # convert all four x values to double |
| cvtps2pd p_ax(%rsp),%xmm0 |
| cvtps2pd p_ax+8(%rsp),%xmm1 |
| # |
| # do x special case checking: build the per-lane fix-up that is |
| # OR-ed into the result for lanes whose x has its sign bit set |
| # |
| pxor %xmm3,%xmm3 |
| xor %eax,%eax |
| mov $0x07FC00000,%ecx |
| # inty must come from the stack here: the .Ly_large, .Lx_infinite and |
| # .Lx_zero handlers load y into ebx, so after any of them has run ebx |
| # no longer holds inty on the next pass through the loop. |
| mov p_inty(%rsp),%ebx |
| cmp $0,%ebx # is y not an integer? |
| cmovz %ecx,%eax # then set to return a NaN. else 0. |
| mov $0x080000000,%ecx |
| cmp $1,%ebx # is y an odd integer? |
| cmovz %ecx,%eax # maybe set sign bit if so |
| movd %eax,%xmm5 |
| pshufd $0,%xmm5,%xmm5 |
| |
| pcmpeqd p_sx(%rsp),%xmm3 # ff's where the sign bit is clear |
| pandn %xmm5,%xmm3 # keep the fix-up only where the sign bit is set |
| movdqa %xmm3,p_negateres(%rsp) # save negateres |
| |
| # /* p_negateres now means the following. |
| # 7FC00000 means x<0, y not an integer, return NaN. |
| # 80000000 means x<0, y is odd integer, so set the sign bit. |
| ## 0 means even integer, and/or x>=0. |
| # */ |
| |
| # **** Here starts the main calculations **** |
| # The algorithm used is x**y = exp(y*log(x)) |
| # Extra precision is required in intermediate steps to meet the 1ulp requirement |
| # |
| # log(x) calculation |
| call __vrd4_log@PLT # get the double precision log value |
| # for all four x values |
| # y* logx |
| cvtps2pd p_y(%rsp),%xmm2 # convert the two stored copies of y to double |
| |
| # /* just multiply by y */ |
| mulpd %xmm2,%xmm0 |
| mulpd %xmm2,%xmm1 |
| |
| # /* The following code computes r = exp(w) */ |
| call __vrd4_exp@PLT # get the double exp value |
| # for all four y*log(x) values |
| mov p_xptr(%rsp),%rsi # get x_array pointer |
| |
| # |
| # convert all four results back to single and pack them into xmm0 |
| cvtpd2ps %xmm0,%xmm0 |
| cvtpd2ps %xmm1,%xmm1 |
| movlhps %xmm1,%xmm0 |
| |
| # perform special case and error checking on input values |
| |
| # special case checking is done first in the scalar version since |
| # it allows for early fast returns. But for vectors, we consider them |
| # to be rare, so early returns are not necessary. So we first compute |
| # the x**y values, and then check for special cases. |
| |
| # we do some of the checking in reverse order of the scalar version. |
| # apply the negate result flags |
| orps p_negateres(%rsp),%xmm0 # get negateres |
| |
| ## if y is infinite or so large that the result would overflow or underflow |
| mov p_y(%rsp),%edx # get y |
| and $0x07fffffff,%edx # develop ay |
| cmp $0x04f000000,%edx |
| ja .Ly_large |
| .Lrnsx3: |
| |
| ## if x is infinite |
| movdqa p_ax(%rsp),%xmm4 |
| cmpps $0,.L__mask_inf(%rip),%xmm4 # equal to infinity, ffs if so. |
| movmskps %xmm4,%edx |
| test $0x0f,%edx |
| jnz .Lx_infinite |
| .Lrnsx1: |
| ## if x is zero |
| xorps %xmm4,%xmm4 |
| cmpps $0,p_ax(%rsp),%xmm4 # equal to zero, ffs if so. |
| movmskps %xmm4,%edx |
| test $0x0f,%edx |
| jnz .Lx_zero |
| .Lrnsx2: |
| ## if y is NAN |
| movss p_y(%rsp),%xmm4 # get y |
| ucomiss %xmm4,%xmm4 # comparing y to itself should |
| # be true, unless y is a NaN. parity flag if NaN. |
| jp .Ly_NaN |
| .Lrnsx4: |
| ## if x is NAN |
| movdqa p_ax(%rsp),%xmm4 # get x |
| cmpps $4,%xmm4,%xmm4 # a compare not equal of x to itself should |
| # be false, unless x is a NaN. ff's if NaN. |
| movmskps %xmm4,%ecx |
| test $0x0f,%ecx |
| jnz .Lx_NaN |
| .Lrnsx5: |
| |
| ## if x == +1, return +1 for all x |
| movdqa .L__float_one(%rip),%xmm3 # one |
| mov p_xptr(%rsp),%rdx # get pointer to x |
| movdqa %xmm3,%xmm2 |
| movdqu (%rdx), %xmm5 |
| cmpps $4,%xmm5,%xmm2 # not equal to +1.0?, ffs if not equal. |
| andps %xmm2,%xmm0 # keep the others |
| andnps %xmm3,%xmm2 # mask for ones |
| orps %xmm2,%xmm0 |
| |
| .L__vsa_bottom: |
| |
| # advance the x pointer (rsi was reloaded after the exp call) |
| add $16,%rsi |
| mov %rsi,p_xptr(%rsp) # save x_array pointer |
| # store the result _m128 |
| mov p_zptr(%rsp),%rdi # get z_array pointer |
| movups %xmm0,(%rdi) |
| prefetch 64(%rdi) |
| add $16,%rdi |
| mov %rdi,p_zptr(%rsp) # save z_array pointer |
| |
| |
| mov p_iter(%rsp),%rax # get number of iterations |
| sub $1,%rax |
| mov %rax,p_iter(%rsp) # save number of iterations (mov keeps the flags from sub) |
| jnz .L__vsa_top |
| |
| |
| # see if we need to do any extras |
| mov p_nv(%rsp),%rax # get number of values |
| test %rax,%rax |
| jnz .L__vsa_cleanup |
| |
| .L__final_check: |
| |
| mov save_rbx(%rsp),%rbx # restore rbx |
| add $stack_size,%rsp |
| ret |
| |
| .align 16 |
| # Tail handling. Reached when 1 to 3 values remain (either the whole |
| # array was shorter than four, or the count was not a multiple of four). |
| # The remaining x values are copied into a zero-padded four-element |
| # buffer, vrsa_powxf is called on that buffer, and only the wanted |
| # results are copied out to z. |
| .L__vsa_cleanup: |
| movq p_nv(%rsp),%rax # rax = number of left-over values |
| |
| movq p_xptr(%rsp),%rsi |
| movl p_y(%rsp),%r8d # r8 is uy |
| |
| # start from an all-zero x buffer and an all-zero y slot |
| xorps %xmm0,%xmm0 |
| movaps %xmm0,p2_temp(%rsp) |
| movaps %xmm0,p2_temp+16(%rsp) |
| |
| movl (%rsi),%ecx # first value: there is always at least one |
| movl %ecx,p2_temp(%rsp) |
| movl %r8d,p2_temp+16(%rsp) |
| cmpq $2,%rax |
| jl .L__vsacg |
| |
| movl 4(%rsi),%ecx # second value |
| movl %ecx,p2_temp+4(%rsp) |
| movl %r8d,p2_temp+20(%rsp) |
| cmpq $3,%rax |
| jl .L__vsacg |
| |
| movl 8(%rsi),%ecx # third value |
| movl %ecx,p2_temp+8(%rsp) |
| movl %r8d,p2_temp+24(%rsp) |
| |
| .L__vsacg: |
| movq $4,%rdi # parameter for N |
| leaq p2_temp(%rsp),%rsi # &x parameter |
| movaps p2_temp+16(%rsp),%xmm0 # y parameter (low element) |
| leaq p2_temp1(%rsp),%rdx # &z parameter |
| call vrsa_powxf@PLT # call recursively to compute four values |
| |
| # copy the wanted results to the destination array |
| movq p_zptr(%rsp),%rdi |
| movq p_nv(%rsp),%rax # get number of values |
| movl p2_temp1(%rsp),%ecx |
| movl %ecx,(%rdi) # first result: there is always at least one |
| cmpq $2,%rax |
| jl .L__final_check |
| |
| movl p2_temp1+4(%rsp),%ecx |
| movl %ecx,4(%rdi) # second result |
| cmpq $3,%rax |
| jl .L__final_check |
| |
| movl p2_temp1+8(%rsp),%ecx |
| movl %ecx,8(%rdi) # third result |
| |
| jmp .L__final_check |
| |
| |
| .align 16 |
| .Ly_zero: |
| ## |y| == 0: every result is 1.0, whatever x is |
| movq p_nv(%rsp),%rax # rax = number of values still to write |
| movq p_zptr(%rsp),%r9 # r9 = &z[0] |
| .L__yzt: |
| movl $0x03f800000,(%r9) # store a 1.0 |
| leaq 4(%r9),%r9 # next z element |
| subq $1,%rax # sets ZF when the last value has been written |
| jne .L__yzt |
| jmp .L__final_check |
| # y is a NaN: every lane becomes the quieted y. The x-is-NaN and x == +1 |
| # checks that follow at .Lrnsx4 can still replace individual lanes. |
| .Ly_NaN: |
| movl p_y(%rsp),%r8d |
| orl $0x000400000,%r8d # set the quiet bit |
| movd %r8d,%xmm0 |
| pshufd $0,%xmm0,%xmm0 # broadcast to all four results |
| jmp .Lrnsx4 |
| |
| # At least one x is a NaN: those lanes return x itself with the quiet bit |
| # set; all other lanes keep the result already in xmm0. |
| .Lx_NaN: |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movdqu (%rcx),%xmm4 # get x |
| movdqa %xmm4,%xmm3 |
| movdqa %xmm4,%xmm5 |
| movdqa .L__mask_sigbit(%rip),%xmm2 # the quiet bit, in every lane |
| cmpeqps %xmm4,%xmm4 # x == x: ff's except where x is a NaN |
| cmpneqps %xmm3,%xmm3 # x != x: ff's only where x is a NaN |
| andps %xmm4,%xmm0 # keep the other results |
| andps %xmm3,%xmm2 # quiet bit for the NaN lanes only |
| andps %xmm5,%xmm3 # the NaN inputs themselves |
| orps %xmm2,%xmm3 # convert to QNaNs |
| orps %xmm3,%xmm0 # combine |
| jmp .Lrnsx5 |
| |
| # y is infinite or so large that the result would overflow or underflow. |
| # All four lanes are recomputed one at a time by .Lnp_special6. |
| # The results are staged in p_temp; eax, ebx, ecx and r8d are overwritten |
| # (ebx receives the bits of y). |
| .Ly_large: |
| movdqa %xmm0,p_temp(%rsp) |
| |
| # lane 0 |
| movl p_y(%rsp),%ebx |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl (%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special6 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp(%rsp) |
| |
| # lane 1 |
| movl p_y(%rsp),%ebx |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl 4(%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special6 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp+4(%rsp) |
| |
| # lane 2 |
| movl p_y(%rsp),%ebx |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl 8(%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special6 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp+8(%rsp) |
| |
| # lane 3 |
| movl p_y(%rsp),%ebx |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl 12(%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special6 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp+12(%rsp) |
| |
| movdqa p_temp(%rsp),%xmm0 |
| jmp .Lrnsx3 |
| |
| # a subroutine to treat an individual x,y pair when y is large or infinity |
| # expects the bits of x in eax and the bits of y in ebx. |
| # returns the result bits in eax; overwrites r8d, and ecx when |x| != 1. |
| .Lnp_special6: |
| mov %eax,%r8d |
| and $0x07FFFFFFF,%r8d # r8d = |x| |
| cmp $0x03f800000,%r8d |
| jne .Lnps6 |
| mov $0x03f800000,%eax # return 1 for all |x|==1 |
| ret |
| |
| # cases where |x| !=1 |
| .Lnps6: |
| mov $0x07f800000,%ecx # +infinity |
| xor %eax,%eax # assume 0 return |
| test $0x080000000,%ebx |
| jz .Lnps61 # jump if the sign bit of y is clear |
| # y is negative |
| cmp $0x03f800000,%r8d |
| cmovl %ecx,%eax # return inf if |x| < 1 |
| ret |
| .Lnps61: |
| # y is positive |
| cmp $0x03f800000,%r8d |
| cmovg %ecx,%eax # return inf if |x| > 1 |
| ret |
| |
| # handle cases where x is +/- infinity. edx is the lane mask (bit i set |
| # when lane i holds an infinite x). Only the flagged lanes are recomputed, |
| # one at a time, by .Lnp_special_x1; results are staged in p_temp. |
| # eax, ebx and ecx are overwritten (ebx receives the bits of y). |
| .align 16 |
| .Lx_infinite: |
| movdqa %xmm0,p_temp(%rsp) |
| |
| testl $1,%edx |
| je .Lxinfa |
| movl p_y(%rsp),%ebx |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl (%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special_x1 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp(%rsp) |
| .Lxinfa: |
| testl $2,%edx |
| je .Lxinfb |
| movl p_y(%rsp),%ebx |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl 4(%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special_x1 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp+4(%rsp) |
| .Lxinfb: |
| testl $4,%edx |
| je .Lxinfc |
| movl p_y(%rsp),%ebx |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl 8(%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special_x1 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp+8(%rsp) |
| .Lxinfc: |
| testl $8,%edx |
| je .Lxinfd |
| movl p_y(%rsp),%ebx |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl 12(%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special_x1 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp+12(%rsp) |
| .Lxinfd: |
| movdqa p_temp(%rsp),%xmm0 |
| jmp .Lrnsx1 |
| |
| # a subroutine to treat an individual x,y pair when x is +/-infinity |
| # expects the bits of x in eax, the bits of y in ebx, inty in ecx. |
| # returns the result bits in eax |
| .Lnp_special_x1: # x is infinite |
| test %eax,%eax # sign bit of x |
| js .Lnsx11 # jump if x is -infinity |
| # x is +infinity: keep x when y is positive, else +0 |
| test %ebx,%ebx # sign bit of y |
| jns .Lnsx13 |
| xor %eax,%eax |
| ret |
| |
| .Lnsx11: # x is -infinity |
| cmp $1,%ecx # is inty == 1 (odd integer y)? |
| jne .Lnsx12 |
| # odd integer y: keep -infinity when y is positive, else -0 |
| test %ebx,%ebx |
| jns .Lnsx13 |
| mov $0x080000000,%eax |
| ret |
| .Lnsx12: # inty <> 1 |
| and $0x07FFFFFFF,%eax # +infinity, returned when y is positive |
| test %ebx,%ebx |
| jns .Lnsx13 |
| xor %eax,%eax # y is negative: return +0 |
| .Lnsx13: |
| ret |
| |
| |
| # handle cases where x is +/- zero. edx is the mask of x,y pairs with |x|=0 |
| # (bit i set for lane i). Only the flagged lanes are recomputed, one at a |
| # time, by .Lnp_special_x2; results are staged in p_temp. |
| # eax, ebx and ecx are overwritten (ebx receives the bits of y). |
| .align 16 |
| .Lx_zero: |
| movdqa %xmm0,p_temp(%rsp) |
| |
| testl $1,%edx |
| je .Lxzera |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl p_y(%rsp),%ebx |
| movl (%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special_x2 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp(%rsp) |
| .Lxzera: |
| testl $2,%edx |
| je .Lxzerb |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl p_y(%rsp),%ebx |
| movl 4(%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special_x2 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp+4(%rsp) |
| .Lxzerb: |
| testl $4,%edx |
| je .Lxzerc |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl p_y(%rsp),%ebx |
| movl 8(%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special_x2 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp+8(%rsp) |
| .Lxzerc: |
| testl $8,%edx |
| je .Lxzerd |
| movq p_xptr(%rsp),%rcx # get pointer to x |
| movl p_y(%rsp),%ebx |
| movl 12(%rcx),%eax |
| movl p_inty(%rsp),%ecx |
| subq $8,%rsp |
| call .Lnp_special_x2 # call the handler for one value |
| addq $8,%rsp |
| movl %eax,p_temp+12(%rsp) |
| .Lxzerd: |
| movdqa p_temp(%rsp),%xmm0 |
| jmp .Lrnsx2 |
| |
| # a subroutine to treat an individual x,y pair when x is +/-0 |
| # expects the bits of x in eax, the bits of y in ebx, inty in ecx. |
| # returns the result bits in eax |
| .align 16 |
| .Lnp_special_x2: |
| cmp $1,%ecx # is inty == 1 (odd integer y)? |
| jne .Lnsx22 # jump if not |
| # y is an odd integer: the result carries the sign of x |
| xor %r8d,%r8d # magnitude is 0 when y is positive |
| mov $0x07f800000,%ecx |
| test $0x080000000,%ebx # is y negative? |
| cmovnz %ecx,%r8d # then the magnitude is infinity |
| and $0x080000000,%eax # pickup the sign of x |
| or %r8d,%eax # and include it in the result |
| ret |
| # y is not an odd integer: the sign of x is dropped |
| .Lnsx22: |
| xor %eax,%eax # +0 when y is positive |
| mov $0x07f800000,%ecx |
| test $0x080000000,%ebx # is y negative? |
| cmovnz %ecx,%eax # then return +infinity |
| ret |
| |
| # Constant tables: four identical 32-bit lanes each. |
| # They are only ever read (through RIP-relative operands), so they are |
| # placed in the read-only data section rather than in writable .data. |
| # Every entry is 16 bytes, so with the 64-byte start alignment each label |
| # stays 16-byte aligned, as the movdqa and SSE memory operands require. |
| .section .rodata |
| .align 64 |
| |
| .L__mask_sign: .quad 0x08000000080000000 # a sign bit mask |
| .quad 0x08000000080000000 |
| |
| .L__mask_nsign: .quad 0x07FFFFFFF7FFFFFFF # a not sign bit mask |
| .quad 0x07FFFFFFF7FFFFFFF |
| |
| # used by inty |
| .L__mask_127: .quad 0x00000007F0000007F # EXPBIAS_SP32 |
| .quad 0x00000007F0000007F |
| |
| .L__mask_mant: .quad 0x0007FFFFF007FFFFF # mantissa bit mask |
| .quad 0x0007FFFFF007FFFFF |
| |
| .L__mask_1: .quad 0x00000000100000001 # 1 |
| .quad 0x00000000100000001 |
| |
| .L__mask_2: .quad 0x00000000200000002 # 2 |
| .quad 0x00000000200000002 |
| |
| .L__mask_24: .quad 0x00000001800000018 # 24 |
| .quad 0x00000001800000018 |
| |
| .L__mask_23: .quad 0x00000001700000017 # 23 |
| .quad 0x00000001700000017 |
| |
| # used by special case checking |
| |
| .L__float_one: .quad 0x03f8000003f800000 # one |
| .quad 0x03f8000003f800000 |
| |
| .L__mask_inf: .quad 0x07f8000007F800000 # infinity |
| .quad 0x07f8000007F800000 |
| |
| .L__mask_ninf: .quad 0x0ff800000fF800000 # -infinity |
| .quad 0x0ff800000fF800000 |
| |
| .L__mask_NaN: .quad 0x07fC000007FC00000 # NaN |
| .quad 0x07fC000007FC00000 |
| |
| .L__mask_sigbit: .quad 0x00040000000400000 # QNaN bit |
| .quad 0x00040000000400000 |
| |
| .L__mask_impbit: .quad 0x00080000000800000 # implicit bit |
| .quad 0x00080000000800000 |
| |
| .L__mask_ly: .quad 0x04f0000004f000000 # large y |
| .quad 0x04f0000004f000000 |
| |
| |
| |