| |
| # |
| # (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| # |
| # This file is part of libacml_mv. |
| # |
| # libacml_mv is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU Lesser General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # libacml_mv is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # Lesser General Public License for more details. |
| # |
| # You should have received a copy of the GNU Lesser General Public |
| # License along with libacml_mv. If not, see |
| # <http://www.gnu.org/licenses/>. |
| # |
| # |
| |
| |
| |
| |
| |
| # |
| # vrdalog.s |
| # |
| # An array implementation of the log libm function. |
| # |
| # Prototype: |
| # |
| # void vrda_log(int n, double *x, double *y); |
| # |
| # Computes the natural log of x. |
| # Returns proper C99 values, but may not raise status flags properly. |
| # Less than 1 ulp of error. This version can compute logs in 44 |
| # cycles with n <= 24 |
| # |
| # |
| |
| #ifdef __ELF__ |
| .section .note.GNU-stack,"",@progbits |
| #endif |
| |
| # define local variable storage offsets |
| .equ p_x,0 # temporary for error checking operation |
| .equ p_idx,0x010 # index storage |
.equ	p_xexp,0x020		# exponent storage
| |
| .equ p_x2,0x030 # temporary for error checking operation |
| .equ p_idx2,0x040 # index storage |
.equ	p_xexp2,0x050		# exponent storage
| |
| .equ save_xa,0x060 #qword |
| .equ save_ya,0x068 #qword |
| .equ save_nv,0x070 #qword |
| .equ p_iter,0x078 # qword storage for number of loop iterations |
| |
| .equ save_rbx,0x080 #qword |
| |
| |
| .equ p2_temp,0x090 # second temporary for get/put bits operation |
| .equ p2_temp1,0x0b0 # second temporary for exponent multiply |
| |
| .equ p_n1,0x0c0 # temporary for near one check |
| .equ p_n12,0x0d0 # temporary for near one check |
| |
| |
| .equ stack_size,0x0e8 |
| |
| .weak vrda_log_ |
| .set vrda_log_,__vrda_log__ |
| .weak vrda_log__ |
| .set vrda_log__,__vrda_log__ |
| |
| # parameters are passed in by Linux as: |
| # rdi - int n |
| # rsi - double *x |
| # rdx - double *y |
| |
| .text |
| .align 16 |
| .p2align 4,,15 |
| |
#/* a FORTRAN subroutine implementation of array log
| #** VRDA_LOG(N,X,Y) |
| # C equivalent*/ |
| #void vrda_log__(int * n, double *x, double *y) |
| #{ |
| # vrda_log(*n,x,y); |
| #} |
| .globl __vrda_log__ |
| .type __vrda_log__,@function |
| __vrda_log__: |
| mov (%rdi),%edi |
| |
| .align 16 |
| .p2align 4,,15 |
| .globl vrda_log |
| .type vrda_log,@function |
| vrda_log: |
| sub $stack_size,%rsp |
| mov %rbx,save_rbx(%rsp) # save rbx |
| |
| # save the arguments |
| mov %rsi,save_xa(%rsp) # save x_array pointer |
| mov %rdx,save_ya(%rsp) # save y_array pointer |
| #ifdef INTEGER64 |
| mov %rdi,%rax |
| #else |
| mov %edi,%eax |
| mov %rax,%rdi |
| #endif |
| |
| mov %rdi,save_nv(%rsp) # save number of values |
| # see if too few values to call the main loop |
| shr $2,%rax # get number of iterations |
| jz .L__vda_cleanup # jump if only single calls |
| # prepare the iteration counts |
| mov %rax,p_iter(%rsp) # save number of iterations |
| shl $2,%rax |
| sub %rax,%rdi # compute number of extra single calls |
| mov %rdi,save_nv(%rsp) # save number of left over values |
| |
| # In this second version, process the array 2 values at a time. |
| |
| .L__vda_top: |
| # build the input _m128d |
| mov save_xa(%rsp),%rsi # get x_array pointer |
| movlpd (%rsi),%xmm0 |
| movhpd 8(%rsi),%xmm0 |
| prefetch 64(%rsi) |
| add $32,%rsi |
| mov %rsi,save_xa(%rsp) # save x_array pointer |
| |
| movlpd -16(%rsi),%xmm7 |
| movhpd -8(%rsi),%xmm7 |
| |
| # compute the logs |
| |
| ## if NaN or inf |
| movdqa %xmm0,p_x(%rsp) # save the input values |
| |
| # /* Store the exponent of x in xexp and put |
| # f into the range [0.5,1) */ |
| |
| pxor %xmm1,%xmm1 |
| movdqa %xmm0,%xmm3 |
| psrlq $52,%xmm3 |
| psubq .L__mask_1023(%rip),%xmm3 |
| packssdw %xmm1,%xmm3 |
| cvtdq2pd %xmm3,%xmm6 # xexp |
| movdqa %xmm7,p_x2(%rsp) # save the input values |
| movdqa %xmm0,%xmm2 |
| subpd .L__real_one(%rip),%xmm2 |
| |
| movapd %xmm6,p_xexp(%rsp) |
| andpd .L__real_notsign(%rip),%xmm2 |
| xor %rax,%rax |
| |
| movdqa %xmm0,%xmm3 |
| pand .L__real_mant(%rip),%xmm3 |
| |
| cmppd $1,.L__real_threshold(%rip),%xmm2 |
| movmskpd %xmm2,%ecx |
| movdqa %xmm3,%xmm4 |
| mov %ecx,p_n1(%rsp) |
| |
| #/* Now x = 2**xexp * f, 1/2 <= f < 1. */ |
| psrlq $45,%xmm3 |
| movdqa %xmm3,%xmm2 |
| psrlq $1,%xmm3 |
| paddq .L__mask_040(%rip),%xmm3 |
| pand .L__mask_001(%rip),%xmm2 |
| paddq %xmm2,%xmm3 |
| |
| packssdw %xmm1,%xmm3 |
| cvtdq2pd %xmm3,%xmm1 |
| pxor %xmm7,%xmm7 |
| movdqa p_x2(%rsp),%xmm2 |
| movapd p_x2(%rsp),%xmm5 |
| psrlq $52,%xmm2 |
| psubq .L__mask_1023(%rip),%xmm2 |
| packssdw %xmm7,%xmm2 |
| subpd .L__real_one(%rip),%xmm5 |
| andpd .L__real_notsign(%rip),%xmm5 |
| cvtdq2pd %xmm2,%xmm6 # xexp |
| xor %rcx,%rcx |
| cmppd $1,.L__real_threshold(%rip),%xmm5 |
| movq %xmm3,p_idx(%rsp) |
| |
| # reduce and get u |
| por .L__real_half(%rip),%xmm4 |
| movdqa %xmm4,%xmm2 |
| movapd %xmm6,p_xexp2(%rsp) |
| |
| # do near one check |
| movmskpd %xmm5,%edx |
| mov %edx,p_n12(%rsp) |
| |
| mulpd .L__real_3f80000000000000(%rip),%xmm1 # f1 = index/128 |
| |
| |
| lea .L__np_ln_lead_table(%rip),%rdx |
| mov p_idx(%rsp),%eax |
| movdqa p_x2(%rsp),%xmm6 |
| |
| movapd .L__real_half(%rip),%xmm5 # .5 |
| subpd %xmm1,%xmm2 # f2 = f - f1 |
| pand .L__real_mant(%rip),%xmm6 |
| mulpd %xmm2,%xmm5 |
| addpd %xmm5,%xmm1 |
| |
| movdqa %xmm6,%xmm8 |
| psrlq $45,%xmm6 |
| movdqa %xmm6,%xmm4 |
| |
| psrlq $1,%xmm6 |
| paddq .L__mask_040(%rip),%xmm6 |
| pand .L__mask_001(%rip),%xmm4 |
| paddq %xmm4,%xmm6 |
| # do error checking here for scheduling. Saves a bunch of cycles as |
| # compared to doing this at the start of the routine. |
| ## if NaN or inf |
| movapd %xmm0,%xmm3 |
| andpd .L__real_inf(%rip),%xmm3 |
| cmppd $0,.L__real_inf(%rip),%xmm3 |
| movmskpd %xmm3,%r8d |
| packssdw %xmm7,%xmm6 |
| por .L__real_half(%rip),%xmm8 |
| movq %xmm6,p_idx2(%rsp) |
| cvtdq2pd %xmm6,%xmm9 |
| |
| cmppd $2,.L__real_zero(%rip),%xmm0 |
| mulpd .L__real_3f80000000000000(%rip),%xmm9 # f1 = index/128 |
| movmskpd %xmm0,%r9d |
| # delaying this divide helps, but moving the other one does not. |
| # it was after the paddq |
| divpd %xmm1,%xmm2 # u |
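# in C terms, the reduced argument just computed is (a sketch):
#   u = f2 / (f1 + 0.5*f2);              /* == 2*(f - f1)/(f + f1) */
# so that f/f1 == (1 + u/2)/(1 - u/2) and ln(f/f1) can be evaluated
# from an odd series in u.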
| |
| # compute the index into the log tables |
| # |
| |
| movlpd -512(%rdx,%rax,8),%xmm0 # z1 |
| mov p_idx+4(%rsp),%ecx |
| movhpd -512(%rdx,%rcx,8),%xmm0 # z1 |
| # solve for ln(1+u) |
| movapd %xmm2,%xmm1 # u |
| mulpd %xmm2,%xmm2 # u^2 |
| movapd %xmm2,%xmm5 |
| movapd .L__real_cb3(%rip),%xmm3 |
| mulpd %xmm2,%xmm3 #Cu2 |
| mulpd %xmm1,%xmm5 # u^3 |
| addpd .L__real_cb2(%rip),%xmm3 #B+Cu2 |
| |
| mulpd %xmm5,%xmm2 # u^5 |
| movapd .L__real_log2_lead(%rip),%xmm4 |
| |
| mulpd .L__real_cb1(%rip),%xmm5 #Au3 |
| addpd %xmm5,%xmm1 # u+Au3 |
| mulpd %xmm3,%xmm2 # u5(B+Cu2) |
| |
| movapd p_xexp(%rsp),%xmm5 # xexp |
| addpd %xmm2,%xmm1 # poly |
| # recombine |
| mulpd %xmm5,%xmm4 # xexp * log2_lead |
| addpd %xmm4,%xmm0 #r1 |
| lea .L__np_ln_tail_table(%rip),%rdx |
	movlpd	 -512(%rdx,%rax,8),%xmm4		# z2 from the tail table
	movhpd	 -512(%rdx,%rcx,8),%xmm4		# z2 from the tail table
| lea .L__np_ln_lead_table(%rip),%rdx |
| mov p_idx2(%rsp),%eax |
| mov p_idx2+4(%rsp),%ecx |
| addpd %xmm4,%xmm1 |
| |
| mulpd .L__real_log2_tail(%rip),%xmm5 |
| |
| movapd .L__real_half(%rip),%xmm4 # .5 |
| subpd %xmm9,%xmm8 # f2 = f - f1 |
| mulpd %xmm8,%xmm4 |
| addpd %xmm4,%xmm9 |
| |
| addpd %xmm5,%xmm1 #r2 |
| divpd %xmm9,%xmm8 # u |
| movapd p_x2(%rsp),%xmm3 |
| andpd .L__real_inf(%rip),%xmm3 |
| cmppd $0,.L__real_inf(%rip),%xmm3 |
| movmskpd %xmm3,%r10d |
| movapd p_x2(%rsp),%xmm6 |
| cmppd $2,.L__real_zero(%rip),%xmm6 |
| movmskpd %xmm6,%r11d |
| |
| # check for nans/infs |
| test $3,%r8d |
| addpd %xmm1,%xmm0 |
| jnz .L__log_naninf |
| .L__vlog1: |
| # check for negative numbers or zero |
| test $3,%r9d |
| jnz .L__z_or_n |
| |
| .L__vlog2: |
| # store the result _m128d |
| mov save_ya(%rsp),%rdi # get y_array pointer |
| movlpd %xmm0,(%rdi) |
| movhpd %xmm0,8(%rdi) |
| |
| # It seems like a good idea to try and interleave |
| # even more of the following code sooner into the |
| # program. But there were conflicts with the table |
| # index registers, making the problem difficult. |
| # After a lot of work in a branch of this file, |
| # I was not able to match the speed of this version. |
| # CodeAnalyst shows that there is lots of unused add |
| # pipe time around the divides, but the processor |
| # doesn't seem to be able to schedule in those slots. |
| |
	movlpd	 -512(%rdx,%rax,8),%xmm7		# z1 for the second pair
	movhpd	 -512(%rdx,%rcx,8),%xmm7		# z1 for the second pair
| |
| # check for near one |
| mov p_n1(%rsp),%r9d |
| test $3,%r9d |
| jnz .L__near_one1 |
| .L__vlog2n: |
| |
| # solve for ln(1+u) |
| movapd %xmm8,%xmm9 # u |
| mulpd %xmm8,%xmm8 # u^2 |
| movapd %xmm8,%xmm5 |
| movapd .L__real_cb3(%rip),%xmm3 |
| mulpd %xmm8,%xmm3 #Cu2 |
| mulpd %xmm9,%xmm5 # u^3 |
| addpd .L__real_cb2(%rip),%xmm3 #B+Cu2 |
| |
| mulpd %xmm5,%xmm8 # u^5 |
| movapd .L__real_log2_lead(%rip),%xmm4 |
| |
| mulpd .L__real_cb1(%rip),%xmm5 #Au3 |
| addpd %xmm5,%xmm9 # u+Au3 |
| mulpd %xmm3,%xmm8 # u5(B+Cu2) |
| |
| movapd p_xexp2(%rsp),%xmm5 # xexp |
| addpd %xmm8,%xmm9 # poly |
| # recombine |
| mulpd %xmm5,%xmm4 |
| addpd %xmm4,%xmm7 #r1 |
| lea .L__np_ln_tail_table(%rip),%rdx |
	movlpd	 -512(%rdx,%rax,8),%xmm2		# z2 for the second pair
	movhpd	 -512(%rdx,%rcx,8),%xmm2		# z2 for the second pair
| addpd %xmm2,%xmm9 |
| |
| mulpd .L__real_log2_tail(%rip),%xmm5 |
| |
| addpd %xmm5,%xmm9 #r2 |
| |
| # check for nans/infs |
| test $3,%r10d |
| addpd %xmm9,%xmm7 |
| jnz .L__log_naninf2 |
| .L__vlog3: |
| # check for negative numbers or zero |
| test $3,%r11d |
| jnz .L__z_or_n2 |
| |
| .L__vlog4: |
| mov p_n12(%rsp),%r9d |
| test $3,%r9d |
| jnz .L__near_one2 |
| |
| .L__vlog4n: |
| |
| |
| #__vda_bottom2: |
| |
| prefetch 64(%rdi) |
| add $32,%rdi |
| mov %rdi,save_ya(%rsp) # save y_array pointer |
| |
| # store the result _m128d |
| movlpd %xmm7,-16(%rdi) |
| movhpd %xmm7,-8(%rdi) |
| |
| mov p_iter(%rsp),%rax # get number of iterations |
| sub $1,%rax |
| mov %rax,p_iter(%rsp) # save number of iterations |
| jnz .L__vda_top |
| |
| |
| # see if we need to do any extras |
| mov save_nv(%rsp),%rax # get number of values |
| test %rax,%rax |
| jnz .L__vda_cleanup |
| |
| |
| .L__finish: |
| mov save_rbx(%rsp),%rbx # restore rbx |
| add $stack_size,%rsp |
| ret |
| |
| .align 16 |
| .Lboth_nearone: |
| # saves 10 cycles |
| # r = x - 1.0; |
| movapd .L__real_two(%rip),%xmm2 |
| subpd .L__real_one(%rip),%xmm0 # r |
| # u = r / (2.0 + r); |
| addpd %xmm0,%xmm2 |
| movapd %xmm0,%xmm1 |
| divpd %xmm2,%xmm1 # u |
| movapd .L__real_ca4(%rip),%xmm4 #D |
| movapd .L__real_ca3(%rip),%xmm5 #C |
| # correction = r * u; |
| movapd %xmm0,%xmm6 |
| mulpd %xmm1,%xmm6 # correction |
| # u = u + u; |
| addpd %xmm1,%xmm1 #u |
| movapd %xmm1,%xmm2 |
| mulpd %xmm2,%xmm2 #v =u^2 |
| # r2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction); |
| mulpd %xmm1,%xmm5 # Cu |
| movapd %xmm1,%xmm3 |
| mulpd %xmm2,%xmm3 # u^3 |
| mulpd .L__real_ca2(%rip),%xmm2 #Bu^2 |
| mulpd %xmm3,%xmm4 #Du^3 |
| |
| addpd .L__real_ca1(%rip),%xmm2 # +A |
| movapd %xmm3,%xmm1 |
| mulpd %xmm1,%xmm1 # u^6 |
| addpd %xmm4,%xmm5 #Cu+Du3 |
| |
| mulpd %xmm3,%xmm2 #u3(A+Bu2) |
| mulpd %xmm5,%xmm1 #u6(Cu+Du3) |
| addpd %xmm1,%xmm2 |
| subpd %xmm6,%xmm2 # -correction |
| |
| # return r + r2; |
| addpd %xmm2,%xmm0 |
| ret |
| |
| .align 16 |
| .L__near_one1: |
| cmp $3,%r9d |
| jnz .L__n1nb1 |
| |
| movapd p_x(%rsp),%xmm0 |
| call .Lboth_nearone |
| movlpd %xmm0,(%rdi) |
| movhpd %xmm0,8(%rdi) |
| jmp .L__vlog2n |
| |
| .align 16 |
| .L__n1nb1: |
| test $1,%r9d |
| jz .L__lnn12 |
| |
| movlpd p_x(%rsp),%xmm0 |
| call .L__ln1 |
| movlpd %xmm0,(%rdi) |
| |
| .L__lnn12: |
| test $2,%r9d # second number? |
| jz .L__lnn1e |
| movlpd p_x+8(%rsp),%xmm0 |
| call .L__ln1 |
| movlpd %xmm0,8(%rdi) |
| |
| .L__lnn1e: |
| jmp .L__vlog2n |
| |
| |
| .align 16 |
| .L__near_one2: |
| cmp $3,%r9d |
| jnz .L__n1nb2 |
| |
| movapd p_x2(%rsp),%xmm0 |
| call .Lboth_nearone |
| movapd %xmm0,%xmm7 |
| jmp .L__vlog4n |
| |
| .align 16 |
| .L__n1nb2: |
| test $1,%r9d |
| jz .L__lnn22 |
| |
| movlpd p_x2(%rsp),%xmm0 |
| call .L__ln1 |
| movsd %xmm0,%xmm7 |
| |
| .L__lnn22: |
| test $2,%r9d # second number? |
| jz .L__lnn2e |
| movlpd p_x2+8(%rsp),%xmm0 |
| call .L__ln1 |
| movlhps %xmm0,%xmm7 |
| |
| .L__lnn2e: |
| jmp .L__vlog4n |
| |
| .align 16 |
| |
| .L__ln1: |
| # saves 10 cycles |
| # r = x - 1.0; |
| movlpd .L__real_two(%rip),%xmm2 |
| subsd .L__real_one(%rip),%xmm0 # r |
| # u = r / (2.0 + r); |
| addsd %xmm0,%xmm2 |
| movsd %xmm0,%xmm1 |
| divsd %xmm2,%xmm1 # u |
| movlpd .L__real_ca4(%rip),%xmm4 #D |
| movlpd .L__real_ca3(%rip),%xmm5 #C |
| # correction = r * u; |
| movsd %xmm0,%xmm6 |
| mulsd %xmm1,%xmm6 # correction |
| # u = u + u; |
| addsd %xmm1,%xmm1 #u |
| movsd %xmm1,%xmm2 |
| mulsd %xmm2,%xmm2 #v =u^2 |
| # r2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction); |
| mulsd %xmm1,%xmm5 # Cu |
| movsd %xmm1,%xmm3 |
| mulsd %xmm2,%xmm3 # u^3 |
| mulsd .L__real_ca2(%rip),%xmm2 #Bu^2 |
| mulsd %xmm3,%xmm4 #Du^3 |
| |
| addsd .L__real_ca1(%rip),%xmm2 # +A |
| movsd %xmm3,%xmm1 |
| mulsd %xmm1,%xmm1 # u^6 |
| addsd %xmm4,%xmm5 #Cu+Du3 |
| |
| mulsd %xmm3,%xmm2 #u3(A+Bu2) |
| mulsd %xmm5,%xmm1 #u6(Cu+Du3) |
| addsd %xmm1,%xmm2 |
| subsd %xmm6,%xmm2 # -correction |
| |
| # return r + r2; |
| addsd %xmm2,%xmm0 |
| ret |
| |
| .align 16 |
| |
| # at least one of the numbers was a nan or infinity |
| .L__log_naninf: |
| test $1,%r8d # first number? |
| jz .L__lninf2 |
| |
| mov %rax,p2_temp(%rsp) |
| mov %rdx,p2_temp+8(%rsp) |
	movapd	%xmm0,%xmm1		# save the current results
| mov p_x(%rsp),%rdx |
| movlpd p_x(%rsp),%xmm0 |
| call .L__lni |
| shufpd $2,%xmm1,%xmm0 |
| mov p2_temp(%rsp),%rax |
| mov p2_temp+8(%rsp),%rdx |
| |
| .L__lninf2: |
| test $2,%r8d # second number? |
| jz .L__lninfe |
| mov %rax,p2_temp(%rsp) |
| mov %rdx,p2_temp+8(%rsp) |
	movapd	%xmm0,%xmm1		# save the current results
| mov p_x+8(%rsp),%rdx |
| movlpd p_x+8(%rsp),%xmm0 |
| call .L__lni |
| shufpd $0,%xmm0,%xmm1 |
| movapd %xmm1,%xmm0 |
| mov p2_temp(%rsp),%rax |
| mov p2_temp+8(%rsp),%rdx |
| |
| .L__lninfe: |
| jmp .L__vlog1 # continue processing if not |
| |
| # at least one of the numbers was a nan or infinity |
| .L__log_naninf2: |
| test $1,%r10d # first number? |
| jz .L__lninf22 |
| |
| mov %rax,p2_temp(%rsp) |
| mov %rdx,p2_temp+8(%rsp) |
	movapd	%xmm7,%xmm1		# save the current results
| mov p_x2(%rsp),%rdx |
| movlpd p_x2(%rsp),%xmm0 |
| call .L__lni |
| shufpd $2,%xmm7,%xmm0 |
| mov p2_temp(%rsp),%rax |
| mov p2_temp+8(%rsp),%rdx |
| movapd %xmm0,%xmm7 |
| |
| .L__lninf22: |
| test $2,%r10d # second number? |
| jz .L__lninfe2 |
| mov %rax,p2_temp(%rsp) |
| mov %rdx,p2_temp+8(%rsp) |
| mov p_x2+8(%rsp),%rdx |
| movlpd p_x2+8(%rsp),%xmm0 |
| call .L__lni |
| shufpd $0,%xmm0,%xmm7 |
| mov p2_temp(%rsp),%rax |
| mov p2_temp+8(%rsp),%rdx |
| |
| .L__lninfe2: |
| jmp .L__vlog3 # continue processing if not |
| |
| # a subroutine to treat one number for nan/infinity |
| # the number is expected in rdx and returned in the low |
| # half of xmm0 |
| .L__lni: |
| mov $0x0000FFFFFFFFFFFFF,%rax |
| test %rax,%rdx |
| jnz .L__lnan # jump if mantissa not zero, so it's a NaN |
| # inf |
| rcl $1,%rdx |
| jnc .L__lne2 # log(+inf) = inf |
| # negative x |
| movlpd .L__real_nan(%rip),%xmm0 |
| ret |
| |
| #NaN |
| .L__lnan: |
| mov $0x00008000000000000,%rax # convert to quiet |
| or %rax,%rdx |
| .L__lne: |
| movd %rdx,%xmm0 |
| .L__lne2: |
| ret |
| |
| .align 16 |
| |
| # at least one of the numbers was a zero, a negative number, or both. |
| .L__z_or_n: |
| test $1,%r9d # first number? |
| jz .L__zn2 |
| |
| mov %rax,p2_temp(%rsp) |
| mov %rdx,p2_temp+8(%rsp) |
	movapd	%xmm0,%xmm1		# save the current results
| mov p_x(%rsp),%rax |
| call .L__zni |
| shufpd $2,%xmm1,%xmm0 |
| mov p2_temp(%rsp),%rax |
| mov p2_temp+8(%rsp),%rdx |
| |
| .L__zn2: |
| test $2,%r9d # second number? |
| jz .L__zne |
| mov %rax,p2_temp(%rsp) |
| mov %rdx,p2_temp+8(%rsp) |
	movapd	%xmm0,%xmm1		# save the current results
| mov p_x+8(%rsp),%rax |
| call .L__zni |
| shufpd $0,%xmm0,%xmm1 |
| movapd %xmm1,%xmm0 |
| mov p2_temp(%rsp),%rax |
| mov p2_temp+8(%rsp),%rdx |
| |
| .L__zne: |
| jmp .L__vlog2 |
| |
| .L__z_or_n2: |
| test $1,%r11d # first number? |
| jz .L__zn22 |
| |
| mov %rax,p2_temp(%rsp) |
| mov %rdx,p2_temp+8(%rsp) |
| mov p_x2(%rsp),%rax |
| call .L__zni |
| shufpd $2,%xmm7,%xmm0 |
| movapd %xmm0,%xmm7 |
| mov p2_temp(%rsp),%rax |
| mov p2_temp+8(%rsp),%rdx |
| |
| .L__zn22: |
| test $2,%r11d # second number? |
| jz .L__zne2 |
| mov %rax,p2_temp(%rsp) |
| mov %rdx,p2_temp+8(%rsp) |
| mov p_x2+8(%rsp),%rax |
| call .L__zni |
| shufpd $0,%xmm0,%xmm7 |
| mov p2_temp(%rsp),%rax |
| mov p2_temp+8(%rsp),%rdx |
| |
| .L__zne2: |
| jmp .L__vlog4 |
| # a subroutine to treat one number for zero or negative values |
| # the number is expected in rax and returned in the low |
| # half of xmm0 |
| .L__zni: |
	shl	$1,%rax			# shift out the sign bit
	jnz	.L__zn_x		# any bits remain: x was a nonzero negative
| movlpd .L__real_ninf(%rip),%xmm0 # C99 specs -inf for +-0 |
| ret |
| .L__zn_x: |
| movlpd .L__real_nan(%rip),%xmm0 |
| ret |
| |
| |
| # we jump here when we have an odd number of log calls to make at the |
| # end |
| # we assume that rdx is pointing at the next x array element, |
| # r8 at the next y array element. The number of values left is in |
| # save_nv |
| .L__vda_cleanup: |
| mov save_nv(%rsp),%rax # get number of values |
| test %rax,%rax # are there any values |
| jz .L__finish # exit if not |
| |
| mov save_xa(%rsp),%rsi |
| mov save_ya(%rsp),%rdi |
| |
| # fill in a m128d with zeroes and the extra values and then make a recursive call. |
| xorpd %xmm0,%xmm0 |
| movlpd %xmm0,p_x+8(%rsp) |
| movapd %xmm0,p_x+16(%rsp) |
| |
| mov (%rsi),%rcx # we know there's at least one |
| mov %rcx,p_x(%rsp) |
| cmp $2,%rax |
| jl .L__vdacg |
| |
| mov 8(%rsi),%rcx # do the second value |
| mov %rcx,p_x+8(%rsp) |
| cmp $3,%rax |
| jl .L__vdacg |
| |
| mov 16(%rsi),%rcx # do the third value |
| mov %rcx,p_x+16(%rsp) |
| |
| .L__vdacg: |
| mov $4,%rdi # parameter for N |
| lea p_x(%rsp),%rsi # &x parameter |
| lea p2_temp(%rsp),%rdx # &y parameter |
| call vrda_log@PLT # call recursively to compute four values |
| |
| # now copy the results to the destination array |
| mov save_ya(%rsp),%rdi |
| mov save_nv(%rsp),%rax # get number of values |
| mov p2_temp(%rsp),%rcx |
| mov %rcx,(%rdi) # we know there's at least one |
| cmp $2,%rax |
| jl .L__vdacgf |
| |
| mov p2_temp+8(%rsp),%rcx |
| mov %rcx,8(%rdi) # do the second value |
| cmp $3,%rax |
| jl .L__vdacgf |
| |
| mov p2_temp+16(%rsp),%rcx |
| mov %rcx,16(%rdi) # do the third value |
| |
| .L__vdacgf: |
| jmp .L__finish |
| |
| .data |
| .align 64 |
| |
| .L__real_one: .quad 0x03ff0000000000000 # 1.0 |
| .quad 0x03ff0000000000000 |
| .L__real_two: .quad 0x04000000000000000 # 2.0 |
| .quad 0x04000000000000000 |
| .L__real_ninf: .quad 0x0fff0000000000000 # -inf |
| .quad 0x0fff0000000000000 |
| .L__real_inf: .quad 0x07ff0000000000000 # +inf |
| .quad 0x07ff0000000000000 |
| .L__real_nan: .quad 0x07ff8000000000000 # NaN |
| .quad 0x07ff8000000000000 |
| |
| .L__real_zero: .quad 0x00000000000000000 # 0.0 |
| .quad 0x00000000000000000 |
| |
| .L__real_sign: .quad 0x08000000000000000 # sign bit |
| .quad 0x08000000000000000 |
| .L__real_notsign: .quad 0x07ffFFFFFFFFFFFFF # ^sign bit |
| .quad 0x07ffFFFFFFFFFFFFF |
| .L__real_threshold: .quad 0x03F9EB85000000000 # .03 |
| .quad 0x03F9EB85000000000 |
| .L__real_qnanbit: .quad 0x00008000000000000 # quiet nan bit |
| .quad 0x00008000000000000 |
| .L__real_mant: .quad 0x0000FFFFFFFFFFFFF # mantissa bits |
| .quad 0x0000FFFFFFFFFFFFF |
| .L__real_3f80000000000000: .quad 0x03f80000000000000 # /* 0.0078125 = 1/128 */ |
| .quad 0x03f80000000000000 |
| .L__mask_1023: .quad 0x000000000000003ff # |
| .quad 0x000000000000003ff |
| .L__mask_040: .quad 0x00000000000000040 # |
| .quad 0x00000000000000040 |
| .L__mask_001: .quad 0x00000000000000001 # |
| .quad 0x00000000000000001 |
| |
| .L__real_ca1: .quad 0x03fb55555555554e6 # 8.33333333333317923934e-02 |
| .quad 0x03fb55555555554e6 |
| .L__real_ca2: .quad 0x03f89999999bac6d4 # 1.25000000037717509602e-02 |
| .quad 0x03f89999999bac6d4 |
| .L__real_ca3: .quad 0x03f62492307f1519f # 2.23213998791944806202e-03 |
| .quad 0x03f62492307f1519f |
| .L__real_ca4: .quad 0x03f3c8034c85dfff0 # 4.34887777707614552256e-04 |
| .quad 0x03f3c8034c85dfff0 |
| |
| .L__real_cb1: .quad 0x03fb5555555555557 # 8.33333333333333593622e-02 |
| .quad 0x03fb5555555555557 |
| .L__real_cb2: .quad 0x03f89999999865ede # 1.24999999978138668903e-02 |
| .quad 0x03f89999999865ede |
| .L__real_cb3: .quad 0x03f6249423bd94741 # 2.23219810758559851206e-03 |
| .quad 0x03f6249423bd94741 |
| .L__real_log2_lead: .quad 0x03fe62e42e0000000 # log2_lead 6.93147122859954833984e-01 |
| .quad 0x03fe62e42e0000000 |
| .L__real_log2_tail: .quad 0x03e6efa39ef35793c # log2_tail 5.76999904754328540596e-08 |
| .quad 0x03e6efa39ef35793c |
| |
| .L__real_half: .quad 0x03fe0000000000000 # 1/2 |
| .quad 0x03fe0000000000000 |
| |
| |
| .L__np_ln_lead_table: |
| .quad 0x0000000000000000 # 0.00000000000000000000e+00 |
| .quad 0x3f8fc0a800000000 # 1.55041813850402832031e-02 |
| .quad 0x3f9f829800000000 # 3.07716131210327148438e-02 |
| .quad 0x3fa7745800000000 # 4.58095073699951171875e-02 |
| .quad 0x3faf0a3000000000 # 6.06245994567871093750e-02 |
| .quad 0x3fb341d700000000 # 7.52233862876892089844e-02 |
| .quad 0x3fb6f0d200000000 # 8.96121263504028320312e-02 |
| .quad 0x3fba926d00000000 # 1.03796780109405517578e-01 |
| .quad 0x3fbe270700000000 # 1.17783010005950927734e-01 |
| .quad 0x3fc0d77e00000000 # 1.31576299667358398438e-01 |
| .quad 0x3fc2955280000000 # 1.45181953907012939453e-01 |
| .quad 0x3fc44d2b00000000 # 1.58604979515075683594e-01 |
| .quad 0x3fc5ff3000000000 # 1.71850204467773437500e-01 |
| .quad 0x3fc7ab8900000000 # 1.84922337532043457031e-01 |
| .quad 0x3fc9525a80000000 # 1.97825729846954345703e-01 |
| .quad 0x3fcaf3c900000000 # 2.10564732551574707031e-01 |
| .quad 0x3fcc8ff780000000 # 2.23143517971038818359e-01 |
| .quad 0x3fce270700000000 # 2.35566020011901855469e-01 |
| .quad 0x3fcfb91800000000 # 2.47836112976074218750e-01 |
| .quad 0x3fd0a324c0000000 # 2.59957492351531982422e-01 |
| .quad 0x3fd1675c80000000 # 2.71933674812316894531e-01 |
| .quad 0x3fd22941c0000000 # 2.83768117427825927734e-01 |
| .quad 0x3fd2e8e280000000 # 2.95464158058166503906e-01 |
| .quad 0x3fd3a64c40000000 # 3.07025015354156494141e-01 |
| .quad 0x3fd4618bc0000000 # 3.18453729152679443359e-01 |
| .quad 0x3fd51aad80000000 # 3.29753279685974121094e-01 |
| .quad 0x3fd5d1bd80000000 # 3.40926527976989746094e-01 |
| .quad 0x3fd686c800000000 # 3.51976394653320312500e-01 |
| .quad 0x3fd739d7c0000000 # 3.62905442714691162109e-01 |
| .quad 0x3fd7eaf800000000 # 3.73716354370117187500e-01 |
| .quad 0x3fd89a3380000000 # 3.84411692619323730469e-01 |
| .quad 0x3fd9479400000000 # 3.94993782043457031250e-01 |
| .quad 0x3fd9f323c0000000 # 4.05465066432952880859e-01 |
| .quad 0x3fda9cec80000000 # 4.15827870368957519531e-01 |
| .quad 0x3fdb44f740000000 # 4.26084339618682861328e-01 |
| .quad 0x3fdbeb4d80000000 # 4.36236739158630371094e-01 |
| .quad 0x3fdc8ff7c0000000 # 4.46287095546722412109e-01 |
| .quad 0x3fdd32fe40000000 # 4.56237375736236572266e-01 |
| .quad 0x3fddd46a00000000 # 4.66089725494384765625e-01 |
| .quad 0x3fde744240000000 # 4.75845873355865478516e-01 |
| .quad 0x3fdf128f40000000 # 4.85507786273956298828e-01 |
| .quad 0x3fdfaf5880000000 # 4.95077252388000488281e-01 |
| .quad 0x3fe02552a0000000 # 5.04556000232696533203e-01 |
| .quad 0x3fe0723e40000000 # 5.13945698738098144531e-01 |
| .quad 0x3fe0be72e0000000 # 5.23248136043548583984e-01 |
| .quad 0x3fe109f380000000 # 5.32464742660522460938e-01 |
| .quad 0x3fe154c3c0000000 # 5.41597247123718261719e-01 |
| .quad 0x3fe19ee6a0000000 # 5.50647079944610595703e-01 |
| .quad 0x3fe1e85f40000000 # 5.59615731239318847656e-01 |
| .quad 0x3fe23130c0000000 # 5.68504691123962402344e-01 |
| .quad 0x3fe2795e00000000 # 5.77315330505371093750e-01 |
| .quad 0x3fe2c0e9e0000000 # 5.86049020290374755859e-01 |
| .quad 0x3fe307d720000000 # 5.94707071781158447266e-01 |
| .quad 0x3fe34e2880000000 # 6.03290796279907226562e-01 |
| .quad 0x3fe393e0c0000000 # 6.11801505088806152344e-01 |
| .quad 0x3fe3d90260000000 # 6.20240390300750732422e-01 |
| .quad 0x3fe41d8fe0000000 # 6.28608644008636474609e-01 |
| .quad 0x3fe4618bc0000000 # 6.36907458305358886719e-01 |
| .quad 0x3fe4a4f840000000 # 6.45137906074523925781e-01 |
| .quad 0x3fe4e7d800000000 # 6.53301239013671875000e-01 |
| .quad 0x3fe52a2d20000000 # 6.61398470401763916016e-01 |
| .quad 0x3fe56bf9c0000000 # 6.69430613517761230469e-01 |
| .quad 0x3fe5ad4040000000 # 6.77398800849914550781e-01 |
| .quad 0x3fe5ee02a0000000 # 6.85303986072540283203e-01 |
| .quad 0x3fe62e42e0000000 # 6.93147122859954833984e-01 |
| .quad 0 # for alignment |
| |
| .L__np_ln_tail_table: |
| .quad 0x00000000000000000 # 0 ; 0.00000000000000000000e+00 |
| .quad 0x03e361f807c79f3db # 5.15092497094772879206e-09 |
| .quad 0x03e6873c1980267c8 # 4.55457209735272790188e-08 |
| .quad 0x03e5ec65b9f88c69e # 2.86612990859791781788e-08 |
| .quad 0x03e58022c54cc2f99 # 2.23596477332056055352e-08 |
| .quad 0x03e62c37a3a125330 # 3.49498983167142274770e-08 |
| .quad 0x03e615cad69737c93 # 3.23392843005887000414e-08 |
| .quad 0x03e4d256ab1b285e9 # 1.35722380472479366661e-08 |
| .quad 0x03e5b8abcb97a7aa2 # 2.56504325268044191098e-08 |
| .quad 0x03e6f34239659a5dc # 5.81213608741512136843e-08 |
| .quad 0x03e6e07fd48d30177 # 5.59374849578288093334e-08 |
| .quad 0x03e6b32df4799f4f6 # 5.06615629004996189970e-08 |
| .quad 0x03e6c29e4f4f21cf8 # 5.24588857848400955725e-08 |
| .quad 0x03e1086c848df1b59 # 9.61968535632653505972e-10 |
| .quad 0x03e4cf456b4764130 # 1.34829655346594463137e-08 |
| .quad 0x03e63a02ffcb63398 # 3.65557749306383026498e-08 |
| .quad 0x03e61e6a6886b0976 # 3.33431709374069198903e-08 |
| .quad 0x03e6b8abcb97a7aa2 # 5.13008650536088382197e-08 |
| .quad 0x03e6b578f8aa35552 # 5.09285070380306053751e-08 |
| .quad 0x03e6139c871afb9fc # 3.20853940845502057341e-08 |
| .quad 0x03e65d5d30701ce64 # 4.06713248643004200446e-08 |
| .quad 0x03e6de7bcb2d12142 # 5.57028186706125221168e-08 |
| .quad 0x03e6d708e984e1664 # 5.48356693724804282546e-08 |
| .quad 0x03e556945e9c72f36 # 1.99407553679345001938e-08 |
| .quad 0x03e20e2f613e85bda # 1.96585517245087232086e-09 |
| .quad 0x03e3cb7e0b42724f6 # 6.68649386072067321503e-09 |
| .quad 0x03e6fac04e52846c7 # 5.89936034642113390002e-08 |
| .quad 0x03e5e9b14aec442be # 2.85038578721554472484e-08 |
| .quad 0x03e6b5de8034e7126 # 5.09746772910284482606e-08 |
| .quad 0x03e6dc157e1b259d3 # 5.54234668933210171467e-08 |
| .quad 0x03e3b05096ad69c62 # 6.29100830926604004874e-09 |
| .quad 0x03e5c2116faba4cdd # 2.61974119468563937716e-08 |
| .quad 0x03e665fcc25f95b47 # 4.16752115011186398935e-08 |
| .quad 0x03e5a9a08498d4850 # 2.47747534460820790327e-08 |
| .quad 0x03e6de647b1465f77 # 5.56922172017964209793e-08 |
| .quad 0x03e5da71b7bf7861d # 2.76162876992552906035e-08 |
| .quad 0x03e3e6a6886b09760 # 7.08169709942321478061e-09 |
| .quad 0x03e6f0075eab0ef64 # 5.77453510221151779025e-08 |
| .quad 0x03e33071282fb989b # 4.43021445893361960146e-09 |
| .quad 0x03e60eb43c3f1bed2 # 3.15140984357495864573e-08 |
| .quad 0x03e5faf06ecb35c84 # 2.95077445089736670973e-08 |
| .quad 0x03e4ef1e63db35f68 # 1.44098510263167149349e-08 |
| .quad 0x03e469743fb1a71a5 # 1.05196987538551827693e-08 |
| .quad 0x03e6c1cdf404e5796 # 5.23641361722697546261e-08 |
| .quad 0x03e4094aa0ada625e # 7.72099925253243069458e-09 |
| .quad 0x03e6e2d4c96fde3ec # 5.62089493829364197156e-08 |
| .quad 0x03e62f4d5e9a98f34 # 3.53090261098577946927e-08 |
| .quad 0x03e6467c96ecc5cbe # 3.80080516835568242269e-08 |
| .quad 0x03e6e7040d03dec5a # 5.66961038386146408282e-08 |
| .quad 0x03e67bebf4282de36 # 4.42287063097349852717e-08 |
| .quad 0x03e6289b11aeb783f # 3.45294525105681104660e-08 |
| .quad 0x03e5a891d1772f538 # 2.47132034530447431509e-08 |
| .quad 0x03e634f10be1fb591 # 3.59655343422487209774e-08 |
| .quad 0x03e6d9ce1d316eb93 # 5.51581770357780862071e-08 |
| .quad 0x03e63562a19a9c442 # 3.60171867511861372793e-08 |
| .quad 0x03e54e2adf548084c # 1.94511067964296180547e-08 |
| .quad 0x03e508ce55cc8c97a # 1.54137376631349347838e-08 |
| .quad 0x03e30e2f613e85bda # 3.93171034490174464173e-09 |
| .quad 0x03e6db03ebb0227bf # 5.52990607758839766440e-08 |
| .quad 0x03e61b75bb09cb098 # 3.29990737637586136511e-08 |
| .quad 0x03e496f16abb9df22 # 1.18436010922446096216e-08 |
| .quad 0x03e65b3f399411c62 # 4.04248680368301346709e-08 |
| .quad 0x03e586b3e59f65355 # 2.27418915900284316293e-08 |
| .quad 0x03e52482ceae1ac12 # 1.70263791333409206020e-08 |
| .quad 0x03e6efa39ef35793c # 5.76999904754328540596e-08 |
| .quad 0 # for alignment |
| |