| |
| # |
| # (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| # |
| # This file is part of libacml_mv. |
| # |
| # libacml_mv is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU Lesser General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # libacml_mv is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # Lesser General Public License for more details. |
| # |
| # You should have received a copy of the GNU Lesser General Public |
| # License along with libacml_mv. If not, see |
| # <http://www.gnu.org/licenses/>. |
| # |
| # |
| |
| |
| |
| |
| |
| # |
| # vrdasincos.s |
| # |
| # An array implementation of the sincos libm function. |
| # |
| # Prototype: |
| # |
| # void vrda_sincos(int n, double *x, double *ys, double *yc); |
| # |
| #Computes Sine of x for an array of input values. |
| #Places the results into the supplied ys array. |
| #Computes Cosine of x for an array of input values. |
| #Places the results into the supplied yc array. |
| #Does not perform error checking. |
| #Denormal inputs may produce unexpected results |
| #Author: Harsha Jagasia |
| #Email: harsha.jagasia@amd.com |
| |
| #ifdef __ELF__ |
| .section .note.GNU-stack,"",@progbits |
| #endif |
| |
| .data # constant pool for the routine; NOTE(review): no store to it appears in the code reviewed, so .rodata may fit - confirm
| .align 16 # packed loads such as movapd/movdqa/pand read 16 bytes from these labels
| .L__real_7fffffffffffffff: .quad 0x07fffffffffffffff # mask with the sign bit clear; ANDed with x to form |x|
| .quad 0x07fffffffffffffff
| .L__real_3ff0000000000000: .quad 0x03ff0000000000000 # 1.0
| .quad 0x03ff0000000000000
| .L__real_v2p__27: .quad 0x03e40000000000000 # 2^-27
| .quad 0x03e40000000000000
| .L__real_3fe0000000000000: .quad 0x03fe0000000000000 # 0.5
| .quad 0x03fe0000000000000
| .L__real_3fc5555555555555: .quad 0x03fc5555555555555 # 0.166666666666 (about 1/6)
| .quad 0x03fc5555555555555
| .L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883 # twobypi (2/pi)
| .quad 0x03fe45f306dc9c883
| .L__real_3ff921fb54400000: .quad 0x03ff921fb54400000 # piby2_1: leading bits of pi/2 (low mantissa bits are zero)
| .quad 0x03ff921fb54400000
| .L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331 # piby2_1tail (same leading bits as piby2_2, with a full mantissa)
| .quad 0x03dd0b4611a626331
| .L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000 # piby2_2: multiplied by npi2 to form rtail in the reduction
| .quad 0x03dd0b4611a600000
| .L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073 # piby2_2tail: multiplied by npi2 for the final rtail correction
| .quad 0x03ba3198a2e037073
| .L__real_fffffffff8000000: .quad 0x0fffffffff8000000 # mask clearing the low 27 bits, for stripping head and tail
| .quad 0x0fffffffff8000000
| .L__real_8000000000000000: .quad 0x08000000000000000 # -0.0, i.e. only the sign bit set
| .quad 0x08000000000000000
| .L__reald_one_one: .quad 0x00000000100000001 # low qword holds dwords {1,1}; high qword is 0
| .quad 0
| .L__reald_two_two: .quad 0x00000000200000002 # low qword holds dwords {2,2}; high qword is 0
| .quad 0
| .L__reald_one_zero: .quad 0x00000000100000000 # sin_cos_filter: dword0 = 0, dword1 = 1
| .quad 0
| .L__reald_zero_one: .quad 0x00000000000000001 # dword0 = 1, dword1 = 0
| .quad 0
| .L__reald_two_zero: .quad 0x00000000200000000 # dword0 = 0, dword1 = 2
| .quad 0
| .L__realq_one_one: .quad 0x00000000000000001 # integer 1 in the low qword
| .quad 0x00000000000000001 # integer 1 in the high qword
| .L__realq_two_two: .quad 0x00000000000000002 # integer 2 in the low qword
| .quad 0x00000000000000002 # integer 2 in the high qword
| .L__real_1_x_mask: .quad 0x0ffffffffffffffff # low qword: all ones
| .quad 0x03ff0000000000000 # high qword: bit pattern of 1.0
| .L__real_zero: .quad 0x00000000000000000 # all-zero bits (0.0) in the low qword
| .quad 0x00000000000000000 # all-zero bits (0.0) in the high qword
| .L__real_one: .quad 0x00000000000000001 # integer 1 (not the double 1.0) in the low qword
| .quad 0x00000000000000001 # integer 1 (not the double 1.0) in the high qword
| .L__real_jt_mask: .quad 0x0000000000000000F # low qword: low four bits set
| .quad 0x00000000000000000 # high qword: 0
| .L__real_naninf_upper_sign_mask: .quad 0x000000000ffffffff # keeps the low dword; ANDed into the sign register when the upper arg of a pair is nan/inf
| .quad 0x000000000ffffffff # same mask for the high qword
| .L__real_naninf_lower_sign_mask: .quad 0x0ffffffff00000000 # keeps the high dword; ANDed into the sign register when the lower arg of a pair is nan/inf
| .quad 0x0ffffffff00000000 # same mask for the high qword
| |
| .Lcosarray: # cos polynomial coefficients c1..c6; entry k sits at offset 0x10*(k-1), duplicated in both lanes
| .quad 0x03fa5555555555555 # 0.0416667 c1
| .quad 0x03fa5555555555555
| .quad 0x0bf56c16c16c16967 # -0.00138889 c2
| .quad 0x0bf56c16c16c16967
| .quad 0x03efa01a019f4ec90 # 2.48016e-005 c3
| .quad 0x03efa01a019f4ec90
| .quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4
| .quad 0x0be927e4fa17f65f6
| .quad 0x03e21eeb69037ab78 # 2.08761e-009 c5
| .quad 0x03e21eeb69037ab78
| .quad 0x0bda907db46cc5e42 # -1.13826e-011 c6
| .quad 0x0bda907db46cc5e42
| .Lsinarray: # sin polynomial coefficients s1..s6; entry k sits at offset 0x10*(k-1), duplicated in both lanes
| .quad 0x0bfc5555555555555 # -0.166667 s1
| .quad 0x0bfc5555555555555
| .quad 0x03f81111111110bb3 # 0.00833333 s2
| .quad 0x03f81111111110bb3
| .quad 0x0bf2a01a019e83e5c # -0.000198413 s3
| .quad 0x0bf2a01a019e83e5c
| .quad 0x03ec71de3796cde01 # 2.75573e-006 s4
| .quad 0x03ec71de3796cde01
| .quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
| .quad 0x0be5ae600b42fdfa7
| .quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6
| .quad 0x03de5e0b2f9a43bb8
| .Lsincosarray: # interleaved pairs: low qword = s_k, high qword = c_k (same values as the two tables above)
| .quad 0x0bfc5555555555555 # -0.166667 s1
| .quad 0x03fa5555555555555 # 0.0416667 c1
| .quad 0x03f81111111110bb3 # 0.00833333 s2
| .quad 0x0bf56c16c16c16967 # -0.00138889 c2
| .quad 0x0bf2a01a019e83e5c # -0.000198413 s3
| .quad 0x03efa01a019f4ec90 # 2.48016e-005 c3
| .quad 0x03ec71de3796cde01 # 2.75573e-006 s4
| .quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4
| .quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
| .quad 0x03e21eeb69037ab78 # 2.08761e-009 c5
| .quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6
| .quad 0x0bda907db46cc5e42 # -1.13826e-011 c6
| 
| 
| .Lcossinarray: # interleaved pairs: low qword = c_k, high qword = s_k (lanes swapped relative to .Lsincosarray)
| .quad 0x03fa5555555555555 # 0.0416667 c1
| .quad 0x0bfc5555555555555 # -0.166667 s1
| .quad 0x0bf56c16c16c16967 # -0.00138889 c2
| .quad 0x03f81111111110bb3 # 0.00833333 s2
| .quad 0x03efa01a019f4ec90 # 2.48016e-005 c3
| .quad 0x0bf2a01a019e83e5c # -0.000198413 s3
| .quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4
| .quad 0x03ec71de3796cde01 # 2.75573e-006 s4
| .quad 0x03e21eeb69037ab78 # 2.08761e-009 c5
| .quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
| .quad 0x0bda907db46cc5e42 # -1.13826e-011 c6
| .quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .weak vrda_sincos_ # weak alias (one trailing underscore) for the wrapper below
| .set vrda_sincos_,__vrda_sincos__
| .weak vrda_sincos__ # weak alias (two trailing underscores) for the wrapper below
| .set vrda_sincos__,__vrda_sincos__
| 
| .text
| .align 16
| .p2align 4,,15
| # NOTE(review): the wrapper always loads N as 32 bits, while vrda_sincos has an INTEGER64 build that uses all of %rdi - confirm this is intended
| # FORTRAN-callable subroutine form of the array sincos routine:
| # VRDA_SINCOS(N,X,YS,YC)
| # N arrives by reference; the C equivalent is
| # void vrda_sincos__( int * n, double *x, double *ys, double *yc)
| # {
| # vrda_sincos(*n,x,ys,yc);
| # }
| .globl __vrda_sincos__
| .type __vrda_sincos__,@function
| __vrda_sincos__:
| mov (%rdi),%edi # edi = *n; rsi, rdx and rcx (x, ys, yc) are passed through unchanged
| .align 16 # no ret or jmp here: after the alignment padding, execution continues at the next code label, vrda_sincos
| .p2align 4,,15
| |
| # Local variable storage offsets, all relative to %rsp once the frame has been allocated
| .equ save_xmm6, 0x00 # slot named for xmm6; not referenced in the code reviewed - TODO confirm it is still needed
| .equ save_xmm7, 0x10 # slot named for xmm7; not referenced in the code reviewed
| .equ save_xmm8, 0x20 # slot named for xmm8; not referenced in the code reviewed
| .equ save_xmm9, 0x30 # slot named for xmm9; not referenced in the code reviewed
| .equ save_xmm10, 0x40 # slot named for xmm10; not referenced in the code reviewed
| .equ save_xmm11, 0x50 # slot named for xmm11; not referenced in the code reviewed
| .equ save_xmm12, 0x60 # slot named for xmm12; not referenced in the code reviewed
| .equ save_xmm13, 0x70 # slot named for xmm13; not referenced in the code reviewed
| .equ save_xmm14, 0x80 # slot named for xmm14; not referenced in the code reviewed
| .equ save_xmm15, 0x90 # slot named for xmm15; not referenced in the code reviewed
| 
| .equ save_rdi, 0x0A0 # not referenced in the code reviewed
| .equ save_rsi, 0x0B0 # not referenced in the code reviewed
| .equ save_rbx, 0x0C0 # rbx is stored here on entry
| 
| .equ r, 0x0D0 # r for args 0,1 (lower arg at r, upper arg at r+8); its address is handed to __amd_remainder_piby2
| .equ rr, 0x0E0 # rr for args 0,1 (lower at rr, upper at rr+8); its address is handed to __amd_remainder_piby2
| .equ rsq, 0x0F0 # not referenced in the code reviewed
| .equ region, 0x0100 # npi2 region numbers for args 0,1: dword at region (lower arg) and region+4 (upper arg)
| 
| .equ r1, 0x0110 # presumably r for args 2,3; not referenced in the code reviewed - TODO confirm
| .equ rr1, 0x0120 # presumably rr for args 2,3; not referenced in the code reviewed - TODO confirm
| .equ rsq1, 0x0130 # not referenced in the code reviewed
| .equ region1, 0x0140 # region numbers for args 2,3 (two dwords written with one movq)
| 
| .equ p_temp, 0x0150 # scratch slot; also holds live registers across calls to __amd_remainder_piby2
| .equ p_temp1, 0x0160 # scratch slot; also holds live registers across calls to __amd_remainder_piby2
| 
| .equ p_temp2, 0x0170 # scratch slot; also holds live registers across calls to __amd_remainder_piby2
| .equ p_temp3, 0x0180 # scratch slot; also holds live registers across calls to __amd_remainder_piby2
| 
| .equ p_temp4, 0x0190 # scratch slot; also holds live registers across calls to __amd_remainder_piby2
| .equ p_temp5, 0x01A0 # scratch slot; also holds live registers across calls to __amd_remainder_piby2
| 
| .equ p_temp6, 0x01B0 # scratch slot; not referenced in the code reviewed
| .equ p_temp7, 0x01C0 # scratch slot; not referenced in the code reviewed
| 
| .equ p_original, 0x01D0 # original (signed) x for args 0,1
| .equ p_mask, 0x01E0 # not referenced in the code reviewed
| .equ p_signs, 0x01F0 # sign bits computed for args 0,1 from the input sign and region (presumably applied to the sin results)
| .equ p_signc, 0x0200 # sign bits computed for args 0,1 from the region only (presumably applied to the cos results)
| .equ p_region, 0x0210 # per-lane mask from the low bit of the region for args 0,1; selects between the two polynomial results
| 
| .equ p_original1, 0x0220 # original (signed) x for args 2,3
| .equ p_mask1, 0x0230 # not referenced in the code reviewed
| .equ p_signs1, 0x0240 # same as p_signs, for args 2,3
| .equ p_signc1, 0x0250 # same as p_signc, for args 2,3
| .equ p_region1, 0x0260 # same as p_region, for args 2,3
| 
| .equ save_r12, 0x0270 # r12 is stored here on entry
| .equ save_r13, 0x0280 # r13 is stored here on entry
| 
| .equ save_r14, 0x0290 # not referenced in the code reviewed
| .equ save_r15, 0x02A0 # not referenced in the code reviewed
| 
| .equ save_xa, 0x02B0 # qword ; x array pointer, advanced as inputs are consumed
| .equ save_ysa, 0x02C0 # qword ; ys (sin results) array pointer
| .equ save_yca, 0x02D0 # qword ; yc (cos results) array pointer
| 
| .equ save_nv, 0x02E0 # qword ; number of values, later the count left over after the 4-wide loop
| .equ p_iter, 0x02F0 # qword storage for number of loop iterations
| |
| |
| .globl vrda_sincos |
| .type vrda_sincos,@function |
| vrda_sincos: |
| |
| sub $0x0308,%rsp |
| |
| mov %r12,save_r12(%rsp) # save r12 |
| mov %r13,save_r13(%rsp) # save r13 |
| mov %rbx,save_rbx(%rsp) # save rbx |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| #START PROCESS INPUT |
| # save the arguments |
| mov %rsi,save_xa(%rsp) # save x_array pointer |
| mov %rdx,save_ysa(%rsp) # save ysin_array pointer |
| mov %rcx,save_yca(%rsp) # save ycos_array pointer |
| #ifdef INTEGER64 |
| mov %rdi,%rax |
| #else |
| mov %edi,%eax |
| mov %rax,%rdi |
| #endif |
| |
| mov %rdi,save_nv(%rsp) # save number of values |
| # see if too few values to call the main loop |
| shr $2,%rax # get number of iterations |
| jz .L__vrda_cleanup # jump if only single calls |
| # prepare the iteration counts |
| mov %rax,p_iter(%rsp) # save number of iterations |
| shl $2,%rax |
| sub %rax,%rdi # compute number of extra single calls |
| mov %rdi,save_nv(%rsp) # save number of left over values |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| #START LOOP |
| .align 16 |
| .L__vrda_top: |
| # build the input _m128d |
| movapd .L__real_7fffffffffffffff(%rip),%xmm2 |
| mov .L__real_7fffffffffffffff(%rip),%rdx |
| |
| mov save_xa(%rsp),%rsi # get x_array pointer |
| movlpd (%rsi),%xmm0 |
| movhpd 8(%rsi),%xmm0 |
| mov (%rsi),%rax |
| mov 8(%rsi),%rcx |
| movdqa %xmm0,%xmm6 |
| movdqa %xmm0,p_original(%rsp) |
| |
| prefetch 64(%rsi) |
| add $32,%rsi |
| mov %rsi,save_xa(%rsp) # save x_array pointer |
| |
| movlpd -16(%rsi), %xmm1 |
| movhpd -8(%rsi), %xmm1 |
| mov -16(%rsi), %r8 |
| mov -8(%rsi), %r9 |
| movdqa %xmm1,%xmm7 |
| movdqa %xmm1,p_original1(%rsp) |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| #STARTMAIN |
| |
| andpd %xmm2,%xmm0 #Unsign |
| andpd %xmm2,%xmm1 #Unsign |
| |
| and %rdx,%rax |
| and %rdx,%rcx |
| and %rdx,%r8 |
| and %rdx,%r9 |
| |
| movdqa %xmm0,%xmm12 |
| movdqa %xmm1,%xmm13 |
| |
| pcmpgtd %xmm6,%xmm12 |
| pcmpgtd %xmm7,%xmm13 |
| movdqa %xmm12,%xmm6 |
| movdqa %xmm13,%xmm7 |
| psrldq $4,%xmm12 |
| psrldq $4,%xmm13 |
| psrldq $8,%xmm6 |
| psrldq $8,%xmm7 |
| |
| mov $0x3FE921FB54442D18,%rdx #piby4 + |
| mov $0x411E848000000000,%r10 #5e5 + |
| movapd .L__real_3fe0000000000000(%rip),%xmm4 #0.5 for later use + |
| |
| por %xmm6,%xmm12 |
| por %xmm7,%xmm13 |
| movd %xmm12,%r12 #Move Sign to gpr ** |
| movd %xmm13,%r13 #Move Sign to gpr ** |
| |
| movapd %xmm0,%xmm2 #x0 |
| movapd %xmm1,%xmm3 #x1 |
| movapd %xmm0,%xmm6 #x0 |
| movapd %xmm1,%xmm7 #x1 |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # xmm2 = x, xmm4 =0.5, xmm6 =x
| # xmm3 = x, xmm7 =x (xmm4 = 0.5 is shared by both pairs; xmm5 does not hold 0.5)
| .align 16 |
| .Leither_or_both_arg_gt_than_piby4: |
| cmp %r10,%rax |
| jae .Lfirst_or_next3_arg_gt_5e5 |
| |
| cmp %r10,%rcx |
| jae .Lsecond_or_next2_arg_gt_5e5 |
| |
| cmp %r10,%r8 |
| jae .Lthird_or_fourth_arg_gt_5e5 |
| |
| cmp %r10,%r9 |
| jae .Lfourth_arg_gt_5e5 |
| |
| |
| # /* Find out what multiple of piby2 */ |
| # npi2 = (int)(x * twobypi + 0.5); |
| movapd .L__real_3fe45f306dc9c883(%rip),%xmm0 |
| mulpd %xmm0,%xmm2 # * twobypi |
| mulpd %xmm0,%xmm3 # * twobypi |
| |
| addpd %xmm4,%xmm2 # +0.5, npi2 |
| addpd %xmm4,%xmm3 # +0.5, npi2 |
| |
| movapd .L__real_3ff921fb54400000(%rip),%xmm0 # piby2_1 |
| movapd .L__real_3ff921fb54400000(%rip),%xmm1 # piby2_1 |
| |
| cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers |
| |
| xorpd %xmm12,%xmm12 |
| |
| cvttpd2dq %xmm3,%xmm5 # convert packed double to packed integers |
| |
| movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2 |
| movapd .L__real_3dd0b4611a600000(%rip),%xmm9 # piby2_2 |
| |
| cvtdq2pd %xmm4,%xmm2 # and back to double. |
| cvtdq2pd %xmm5,%xmm3 # and back to double. |
| |
| |
| # /* Subtract the multiple from x to get an extra-precision remainder */ |
| |
| movd %xmm4,%r8 # Region |
| movd %xmm5,%r9 # Region |
| |
| mov .L__reald_one_zero(%rip),%rdx # compare value for cossin path |
| mov %r8,%r10 # For Sign of Sin |
| mov %r9,%r11 |
| |
| # rhead = x - npi2 * piby2_1; |
| mulpd %xmm2,%xmm0 # npi2 * piby2_1; |
| mulpd %xmm3,%xmm1 # npi2 * piby2_1; |
| |
| # rtail = npi2 * piby2_2; |
| mulpd %xmm2,%xmm8 # rtail |
| mulpd %xmm3,%xmm9 # rtail |
| |
| # rhead = x - npi2 * piby2_1; |
| subpd %xmm0,%xmm6 # rhead = x - npi2 * piby2_1; |
| subpd %xmm1,%xmm7 # rhead = x - npi2 * piby2_1; |
| |
| # t = rhead; |
| movapd %xmm6,%xmm0 # t |
| movapd %xmm7,%xmm1 # t |
| |
| # rhead = t - rtail; |
| subpd %xmm8,%xmm0 # rhead |
| subpd %xmm9,%xmm1 # rhead |
| |
| # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail |
| mulpd .L__real_3ba3198a2e037073(%rip),%xmm3 # npi2 * piby2_2tail |
| |
| subpd %xmm0,%xmm6 # t-rhead |
| subpd %xmm1,%xmm7 # t-rhead |
| |
| subpd %xmm6,%xmm8 # - ((t - rhead) - rtail) |
| subpd %xmm7,%xmm9 # - ((t - rhead) - rtail) |
| |
| addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| addpd %xmm3,%xmm9 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # xmm4 = npi2 (int), xmm0 =rhead, xmm8 =rtail |
| # xmm5 = npi2 (int), xmm1 =rhead, xmm9 =rtail |
| |
| pand .L__reald_one_one(%rip),%xmm4 #odd/even region for cos/sin |
| pand .L__reald_one_one(%rip),%xmm5 #odd/even region for cos/sin |
| |
| pcmpeqd %xmm12,%xmm4 |
| pcmpeqd %xmm12,%xmm5 |
| |
| punpckldq %xmm4,%xmm4 |
| punpckldq %xmm5,%xmm5 |
| |
| movapd %xmm4,p_region(%rsp) |
| movapd %xmm5,p_region1(%rsp) |
| |
| shr $1,%r10 #~AB+A~B, A is sign and B is upper bit of region |
| shr $1,%r11 #~AB+A~B, A is sign and B is upper bit of region |
| |
| mov %r10,%rax |
| mov %r11,%rcx |
| |
| not %r12 #ADDED TO CHANGE THE LOGIC |
| not %r13 #ADDED TO CHANGE THE LOGIC |
| and %r12,%r10 |
| and %r13,%r11 |
| |
| not %rax |
| not %rcx |
| not %r12 |
| not %r13 |
| and %r12,%rax |
| and %r13,%rcx |
| |
| or %rax,%r10 |
| or %rcx,%r11 |
| and .L__reald_one_one(%rip),%r10 #(~AB+A~B)&1 |
| and .L__reald_one_one(%rip),%r11 #(~AB+A~B)&1 |
| |
| mov %r10,%r12 |
| mov %r11,%r13 |
| |
| and %rdx,%r12 #mask out the lower sign bit leaving the upper sign bit |
| and %rdx,%r13 #mask out the lower sign bit leaving the upper sign bit |
| |
| shl $63,%r10 #shift lower sign bit left by 63 bits |
| shl $63,%r11 #shift lower sign bit left by 63 bits |
| shl $31,%r12 #shift upper sign bit left by 31 bits |
| shl $31,%r13 #shift upper sign bit left by 31 bits |
| |
| mov %r10,p_signs(%rsp) #write out lower sign bit |
| mov %r12,p_signs+8(%rsp) #write out upper sign bit |
| mov %r11,p_signs1(%rsp) #write out lower sign bit |
| mov %r13,p_signs1+8(%rsp) #write out upper sign bit |
| |
| # GET_BITS_DP64(rhead-rtail, uy); ; originally only rhead |
| # xmm4 = Sign, xmm0 =rhead, xmm8 =rtail |
| # xmm5 = Sign, xmm1 =rhead, xmm9 =rtail |
| movapd %xmm0,%xmm6 # rhead |
| movapd %xmm1,%xmm7 # rhead |
| |
| subpd %xmm8,%xmm0 # r = rhead - rtail |
| subpd %xmm9,%xmm1 # r = rhead - rtail |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # xmm4 = Sign, xmm0 = r, xmm6 =rhead, xmm8 =rtail |
| # xmm5 = Sign, xmm1 = r, xmm7 =rhead, xmm9 =rtail |
| |
| subpd %xmm0,%xmm6 #rr=rhead-r |
| subpd %xmm1,%xmm7 #rr=rhead-r |
| |
| movapd %xmm0,%xmm2 #move r for r2 |
| movapd %xmm1,%xmm3 #move r for r2 |
| |
| mulpd %xmm0,%xmm2 #r2 |
| mulpd %xmm1,%xmm3 #r2 |
| |
| subpd %xmm8,%xmm6 #rr=(rhead-r) -rtail |
| subpd %xmm9,%xmm7 #rr=(rhead-r) -rtail |
| |
| |
| add .L__reald_one_one(%rip),%r8 |
| add .L__reald_one_one(%rip),%r9 |
| |
| and .L__reald_two_two(%rip),%r8 |
| and .L__reald_two_two(%rip),%r9 |
| |
| shr $1,%r8 |
| shr $1,%r9 |
| |
| mov %r8,%r12 |
| mov %r9,%r13 |
| |
| and .L__reald_one_zero(%rip),%r12 #mask out the lower sign bit leaving the upper sign bit |
| and .L__reald_one_zero(%rip),%r13 #mask out the lower sign bit leaving the upper sign bit |
| |
| shl $63,%r8 #shift lower sign bit left by 63 bits |
| shl $63,%r9 #shift lower sign bit left by 63 bits |
| |
| shl $31,%r12 #shift upper sign bit left by 31 bits |
| shl $31,%r13 #shift upper sign bit left by 31 bits |
| |
| mov %r8,p_signc(%rsp) #write out lower sign bit |
| mov %r12,p_signc+8(%rsp) #write out upper sign bit |
| mov %r9,p_signc1(%rsp) #write out lower sign bit |
| mov %r13,p_signc1+8(%rsp) #write out upper sign bit |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # p_sign0 = Sign, xmm0 = r, xmm2 = r2, xmm6 = rr
| # p_sign1 = Sign, xmm1 = r, xmm3 = r2, xmm7 = rr
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Lsinsin_sinsin_piby4: |
| |
| movapd %xmm0,p_temp(%rsp) # copy of x |
| movapd %xmm1,p_temp1(%rsp) # copy of x |
| |
| movapd %xmm2,%xmm10 # x2 |
| movapd %xmm3,%xmm11 # x2 |
| |
| movdqa .Lsinarray+0x50(%rip),%xmm4 # s6 |
| movdqa .Lsinarray+0x50(%rip),%xmm5 # s6 |
| movapd .Lsinarray+0x20(%rip),%xmm8 # s3 |
| movapd .Lsinarray+0x20(%rip),%xmm9 # s3 |
| |
| movdqa .Lcosarray+0x50(%rip),%xmm12 # c6 |
| movdqa .Lcosarray+0x50(%rip),%xmm13 # c6 |
| movapd .Lcosarray+0x20(%rip),%xmm14 # c3 |
| movapd .Lcosarray+0x20(%rip),%xmm15 # c3 |
| |
| movapd %xmm2,p_temp2(%rsp) # copy of x2 |
| movapd %xmm3,p_temp3(%rsp) # copy of x2 |
| |
| mulpd %xmm2,%xmm4 # s6*x2 |
| mulpd %xmm3,%xmm5 # s6*x2 |
| mulpd %xmm2,%xmm8 # s3*x2 |
| mulpd %xmm3,%xmm9 # s3*x2 |
| |
| mulpd %xmm2,%xmm12 # s6*x2 |
| mulpd %xmm3,%xmm13 # s6*x2 |
| mulpd %xmm2,%xmm14 # s3*x2 |
| mulpd %xmm3,%xmm15 # s3*x2 |
| |
| mulpd %xmm2,%xmm10 # x4 |
| mulpd %xmm3,%xmm11 # x4 |
| |
| addpd .Lsinarray+0x40(%rip),%xmm4 # s5+x2s6 |
| addpd .Lsinarray+0x40(%rip),%xmm5 # s5+x2s6 |
| addpd .Lsinarray+0x10(%rip),%xmm8 # s2+x2C3 |
| addpd .Lsinarray+0x10(%rip),%xmm9 # s2+x2C3 |
| |
| addpd .Lcosarray+0x40(%rip),%xmm12 # c5+x2c6 |
| addpd .Lcosarray+0x40(%rip),%xmm13 # c5+x2c6 |
| addpd .Lcosarray+0x10(%rip),%xmm14 # c2+x2C3 |
| addpd .Lcosarray+0x10(%rip),%xmm15 # c2+x2C3 |
| |
| mulpd %xmm2,%xmm10 # x6 |
| mulpd %xmm3,%xmm11 # x6 |
| |
| mulpd %xmm2,%xmm4 # x2(s5+x2s6) |
| mulpd %xmm3,%xmm5 # x2(s5+x2s6) |
| mulpd %xmm2,%xmm8 # x2(s2+x2C3) |
| mulpd %xmm3,%xmm9 # x2(s2+x2C3) |
| |
| mulpd %xmm2,%xmm12 # x2(s5+x2s6) |
| mulpd %xmm3,%xmm13 # x2(s5+x2s6) |
| mulpd %xmm2,%xmm14 # x2(s2+x2C3) |
| mulpd %xmm3,%xmm15 # x2(s2+x2C3) |
| |
| mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5 *x2 |
| mulpd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5 *x2 |
| |
| addpd .Lsinarray+0x30(%rip),%xmm4 # s4 + x2(s5+x2s6) |
| addpd .Lsinarray+0x30(%rip),%xmm5 # s4 + x2(s5+x2s6) |
| addpd .Lsinarray(%rip),%xmm8 # s1 + x2(s2+x2C3) |
| addpd .Lsinarray(%rip),%xmm9 # s1 + x2(s2+x2C3) |
| |
| movapd %xmm2,p_temp4(%rsp) # copy of r |
| movapd %xmm3,p_temp5(%rsp) # copy of r |
| |
| movapd %xmm2,%xmm0 # r |
| movapd %xmm3,%xmm1 # r |
| |
| addpd .Lcosarray+0x30(%rip),%xmm12 # c4 + x2(c5+x2c6) |
| addpd .Lcosarray+0x30(%rip),%xmm13 # c4 + x2(c5+x2c6) |
| addpd .Lcosarray(%rip),%xmm14 # c1 + x2(c2+x2C3) |
| addpd .Lcosarray(%rip),%xmm15 # c1 + x2(c2+x2C3) |
| |
| mulpd %xmm6,%xmm2 # 0.5 * x2 *xx |
| mulpd %xmm7,%xmm3 # 0.5 * x2 *xx |
| |
| subpd .L__real_3ff0000000000000(%rip),%xmm0 # -t=r-1.0 |
| subpd .L__real_3ff0000000000000(%rip),%xmm1 # -t=r-1.0 |
| |
| mulpd %xmm10,%xmm4 # x6(s4 + x2(s5+x2s6)) |
| mulpd %xmm11,%xmm5 # x6(s4 + x2(s5+x2s6)) |
| |
| mulpd %xmm10,%xmm12 # x6(c4 + x2(c5+x2c6)) |
| mulpd %xmm11,%xmm13 # x6(c4 + x2(c5+x2c6)) |
| |
| addpd .L__real_3ff0000000000000(%rip),%xmm0 # 1+(-t) |
| addpd .L__real_3ff0000000000000(%rip),%xmm1 # 1+(-t) |
| |
| addpd %xmm8,%xmm4 # zs |
| addpd %xmm9,%xmm5 # zs |
| |
| addpd %xmm14,%xmm12 # zc |
| addpd %xmm15,%xmm13 # zc |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # p_sign0 = Sign, xmm0 = r, xmm2 = 0.5 * x2 *xx, xmm4 = zs, xmm12 = zc, xmm6 =rr |
| # p_sign1 = Sign, xmm1 = r, xmm3 = 0.5 * x2 *xx, xmm5 = zs, xmm13 = zc, xmm7 =rr |
| |
| # Free
| # xmm8, xmm10, xmm14
| # xmm9, xmm11, xmm15
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| movapd p_temp2(%rsp),%xmm10 # x2 for x3 |
| movapd p_temp3(%rsp),%xmm11 # x2 for x3 |
| |
| movapd %xmm10,%xmm8 # x2 for x4 |
| movapd %xmm11,%xmm9 # x2 for x4 |
| |
| movapd p_temp(%rsp),%xmm14 # x for x*xx |
| movapd p_temp1(%rsp),%xmm15 # x for x*xx |
| |
| subpd p_temp4(%rsp),%xmm0 # (1 + (-t)) - r |
| subpd p_temp5(%rsp),%xmm1 # (1 + (-t)) - r |
| |
| mulpd %xmm14,%xmm10 # x3 |
| mulpd %xmm15,%xmm11 # x3 |
| |
| mulpd %xmm8,%xmm8 # x4 |
| mulpd %xmm9,%xmm9 # x4 |
| |
| mulpd %xmm6,%xmm14 # x*xx |
| mulpd %xmm7,%xmm15 # x*xx |
| |
| mulpd %xmm10,%xmm4 # x3 * zs |
| mulpd %xmm11,%xmm5 # x3 * zs |
| |
| mulpd %xmm8,%xmm12 # x4 * zc |
| mulpd %xmm9,%xmm13 # x4 * zc |
| |
| subpd %xmm2,%xmm4 # x3*zs-0.5 * x2 *xx |
| subpd %xmm3,%xmm5 # x3*zs-0.5 * x2 *xx |
| |
| subpd %xmm14,%xmm0 # ((1 + (-t)) - r) -x*xx |
| subpd %xmm15,%xmm1 # ((1 + (-t)) - r) -x*xx |
| |
| |
| movapd p_temp4(%rsp),%xmm10 # r for t |
| movapd p_temp5(%rsp),%xmm11 # r for t |
| |
| addpd %xmm6,%xmm4 # sin+xx |
| addpd %xmm7,%xmm5 # sin+xx |
| |
| addpd %xmm0,%xmm12 # x4*zc + (((1 + (-t)) - r) - x*xx) |
| addpd %xmm1,%xmm13 # x4*zc + (((1 + (-t)) - r) - x*xx) |
| |
| subpd .L__real_3ff0000000000000(%rip),%xmm10 # -t=r-1.0 |
| subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0 |
| |
| movapd p_region(%rsp),%xmm2 |
| movapd p_region1(%rsp),%xmm3 |
| |
| movapd %xmm2,%xmm8 |
| movapd %xmm3,%xmm9 |
| |
| addpd p_temp(%rsp),%xmm4 # sin+xx+x |
| addpd p_temp1(%rsp),%xmm5 # sin+xx+x |
| |
| subpd %xmm10,%xmm12 # cos + (-t) |
| subpd %xmm11,%xmm13 # cos + (-t) |
| |
| # xmm4 = sin, xmm5 = sin |
| # xmm12 = cos, xmm13 = cos |
| |
| andnpd %xmm4,%xmm8 |
| andnpd %xmm5,%xmm9 |
| |
| andpd %xmm2,%xmm4 |
| andpd %xmm3,%xmm5 |
| |
| andnpd %xmm12,%xmm2 |
| andnpd %xmm13,%xmm3 |
| |
| andpd p_region(%rsp),%xmm12 |
| andpd p_region1(%rsp),%xmm13 |
| |
| orpd %xmm2,%xmm4 |
| orpd %xmm3,%xmm5 |
| |
| orpd %xmm8,%xmm12 |
| orpd %xmm9,%xmm13 |
| |
| jmp .L__vrd4_sin_cleanup |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Lfirst_or_next3_arg_gt_5e5: |
| # %rcx,,%rax r8, r9 |
| |
| cmp %r10,%rcx #is upper arg >= 5e5 |
| jae .Lboth_arg_gt_5e5 |
| |
| .Llower_arg_gt_5e5: |
| # Upper Arg is < 5e5, Lower arg is >= 5e5 |
| # %xmm2,,%xmm0 xmm6 = x, xmm4 = 0.5 |
| # Be sure not to use %xmm3,%xmm1 and xmm7 |
| # Use %xmm8,,%xmm5 xmm10, xmm12 |
| # %xmm11,,%xmm9 xmm13 |
| |
| |
| movlpd %xmm0,r(%rsp) #Save lower fp arg for remainder_piby2 call |
| movhlps %xmm0,%xmm0 #Needed since we want to work on upper arg |
| movhlps %xmm2,%xmm2 |
| movhlps %xmm6,%xmm6 |
| |
| # Work on Upper arg |
| # Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs |
| mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi |
| addsd %xmm4,%xmm2 # xmm2 = npi2=(x*twobypi+0.5) |
| movsd .L__real_3ff921fb54400000(%rip),%xmm8 # xmm8 = piby2_1 |
| cvttsd2si %xmm2,%ecx # ecx = npi2 trunc to ints |
| movsd .L__real_3dd0b4611a600000(%rip),%xmm10 # xmm10 = piby2_2 |
| cvtsi2sd %ecx,%xmm2 # xmm2 = npi2 trunc to doubles |
| |
| #/* Subtract the multiple from x to get an extra-precision remainder */ |
| #rhead = x - npi2 * piby2_1; |
| mulsd %xmm2,%xmm8 # npi2 * piby2_1 |
| subsd %xmm8,%xmm6 # xmm6 = rhead =(x-npi2*piby2_1) |
| movsd .L__real_3ba3198a2e037073(%rip),%xmm12 # xmm12 =piby2_2tail |
| |
| #t = rhead; |
| movsd %xmm6,%xmm5 # xmm5 = t = rhead |
| |
| #rtail = npi2 * piby2_2; |
| mulsd %xmm2,%xmm10 # xmm1 =rtail=(npi2*piby2_2) |
| |
| #rhead = t - rtail |
| subsd %xmm10,%xmm6 # xmm6 =rhead=(t-rtail) |
| |
| #rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulsd %xmm2,%xmm12 # npi2 * piby2_2tail |
| subsd %xmm6,%xmm5 # t-rhead |
| subsd %xmm5,%xmm10 # (rtail-(t-rhead)) |
| addsd %xmm12,%xmm10 # rtail=npi2*piby2_2tail+(rtail-(t-rhead)); |
| |
| #r = rhead - rtail |
| #rr = (rhead-r) -rtail |
| mov %ecx,region+4(%rsp) # store upper region |
| movsd %xmm6,%xmm0 |
| subsd %xmm10,%xmm0 # xmm0 = r=(rhead-rtail) |
| subsd %xmm0,%xmm6 # rr=rhead-r |
| subsd %xmm10,%xmm6 # xmm6 = rr=((rhead-r) -rtail) |
| movlpd %xmm0,r+8(%rsp) # store upper r |
| movlpd %xmm6,rr+8(%rsp) # store upper rr |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| #Note that volatiles will be trashed by the call |
| #We will construct r, rr, region and sign |
| |
| # Work on Lower arg |
| mov $0x07ff0000000000000,%r11 # is lower arg nan/inf |
| mov %r11,%r10 |
| and %rax,%r10 |
| cmp %r11,%r10 |
| jz .L__vrd4_sin_lower_naninf |
| |
| mov %r8,p_temp(%rsp) |
| mov %r9,p_temp2(%rsp) |
| movapd %xmm1,p_temp1(%rsp) |
| movapd %xmm3,p_temp3(%rsp) |
| movapd %xmm7,p_temp5(%rsp) |
| |
| lea region(%rsp),%rdx # lower arg is **NOT** nan/inf |
| lea rr(%rsp),%rsi |
| lea r(%rsp),%rdi |
| movlpd r(%rsp),%xmm0 #Restore lower fp arg for remainder_piby2 call |
| call __amd_remainder_piby2@PLT |
| |
| mov p_temp(%rsp),%r8 |
| mov p_temp2(%rsp),%r9 |
| movapd p_temp1(%rsp),%xmm1 |
| movapd p_temp3(%rsp),%xmm3 |
| movapd p_temp5(%rsp),%xmm7 |
| jmp 0f |
| |
| .L__vrd4_sin_lower_naninf: |
| mov p_original(%rsp),%rax # upper arg is nan/inf |
| mov $0x00008000000000000,%r11 |
| or %r11,%rax |
| mov %rax,r(%rsp) # r = x | 0x0008000000000000 |
| xor %r10,%r10 |
| mov %r10,rr(%rsp) # rr = 0 |
| mov %r10d,region(%rsp) # region =0 |
| and .L__real_naninf_lower_sign_mask(%rip),%r12 # Sign |
| .align 16 |
| 0: |
| jmp .Lcheck_next2_args |
| |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Lboth_arg_gt_5e5: |
| #Upper Arg is >= 5e5, Lower arg is >= 5e5 |
| # %rcx,,%rax r8, r9 |
| # %xmm2,,%xmm0 xmm6 = x, xmm4 = 0.5 |
| |
| movhpd %xmm0,r+8(%rsp) #Save upper fp arg for remainder_piby2 call |
| |
| mov $0x07ff0000000000000,%r11 #is lower arg nan/inf |
| mov %r11,%r10 |
| and %rax,%r10 |
| cmp %r11,%r10 |
| jz .L__vrd4_sin_lower_naninf_of_both_gt_5e5 |
| |
| mov %rcx,p_temp(%rsp) #Save upper arg |
| mov %r8,p_temp2(%rsp) |
| mov %r9,p_temp4(%rsp) |
| movapd %xmm1,p_temp1(%rsp) |
| movapd %xmm3,p_temp3(%rsp) |
| movapd %xmm7,p_temp5(%rsp) |
| |
| lea region(%rsp),%rdx #lower arg is **NOT** nan/inf |
| lea rr(%rsp),%rsi |
| lea r(%rsp),%rdi |
| call __amd_remainder_piby2@PLT |
| |
| mov p_temp(%rsp),%rcx #Restore upper arg |
| mov p_temp2(%rsp),%r8 |
| mov p_temp4(%rsp),%r9 |
| movapd p_temp1(%rsp),%xmm1 |
| movapd p_temp3(%rsp),%xmm3 |
| movapd p_temp5(%rsp),%xmm7 |
| |
| jmp 0f |
| |
| .L__vrd4_sin_lower_naninf_of_both_gt_5e5: #lower arg is nan/inf |
| mov p_original(%rsp),%rax |
| mov $0x00008000000000000,%r11 |
| or %r11,%rax |
| mov %rax,r(%rsp) #r = x | 0x0008000000000000 |
| xor %r10,%r10 |
| mov %r10,rr(%rsp) #rr = 0 |
| mov %r10d,region(%rsp) #region = 0 |
| and .L__real_naninf_lower_sign_mask(%rip),%r12 # Sign |
| |
| .align 16 |
| 0: |
| mov $0x07ff0000000000000,%r11 #is upper arg nan/inf |
| mov %r11,%r10 |
| and %rcx,%r10 |
| cmp %r11,%r10 |
| jz .L__vrd4_sin_upper_naninf_of_both_gt_5e5 |
| |
| |
| mov %r8,p_temp(%rsp) |
| mov %r9,p_temp2(%rsp) |
| movapd %xmm1,p_temp1(%rsp) |
| movapd %xmm3,p_temp3(%rsp) |
| movapd %xmm7,p_temp5(%rsp) |
| |
| lea region+4(%rsp),%rdx #upper arg is **NOT** nan/inf |
| lea rr+8(%rsp),%rsi |
| lea r+8(%rsp),%rdi |
| movlpd r+8(%rsp),%xmm0 #Restore upper fp arg for remainder_piby2 call |
| call __amd_remainder_piby2@PLT |
| |
| mov p_temp(%rsp),%r8 |
| mov p_temp2(%rsp),%r9 |
| movapd p_temp1(%rsp),%xmm1 |
| movapd p_temp3(%rsp),%xmm3 |
| movapd p_temp5(%rsp),%xmm7 |
| |
| jmp 0f |
| |
| .L__vrd4_sin_upper_naninf_of_both_gt_5e5: |
| mov p_original+8(%rsp),%rcx #upper arg is nan/inf |
| mov $0x00008000000000000,%r11 |
| or %r11,%rcx |
| mov %rcx,r+8(%rsp) #r = x | 0x0008000000000000 |
| xor %r10,%r10 |
| mov %r10,rr+8(%rsp) #rr = 0 |
| mov %r10d,region+4(%rsp) #region = 0 |
| and .L__real_naninf_upper_sign_mask(%rip),%r12 # Sign |
| .align 16 |
| 0: |
| jmp .Lcheck_next2_args |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Lsecond_or_next2_arg_gt_5e5: |
| # Upper Arg is >= 5e5, Lower arg is < 5e5 |
| # %rcx,,%rax r8, r9 |
| # %xmm2,,%xmm0 xmm6 = x, xmm4 = 0.5 |
| # Do not use %xmm3,,%xmm1 xmm7 |
| # Restore xmm4 and %xmm3,,%xmm1 xmm7 |
| # Can use %xmm10,,%xmm8 xmm12 |
| # %xmm9,,%xmm5 xmm11, xmm13 |
| |
| movhpd %xmm0,r+8(%rsp) #Save upper fp arg for remainder_piby2 call |
| # movlhps %xmm0,%xmm0 #Not needed since we want to work on lower arg, but done just to be safe and avoide exceptions due to nan/inf and to mirror the lower_arg_gt_5e5 case |
| # movlhps %xmm2,%xmm2 |
| # movlhps %xmm6,%xmm6 |
| |
| # Work on Lower arg |
| # Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg |
| |
| mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi |
| addsd %xmm4,%xmm2 # xmm2 = npi2=(x*twobypi+0.5) |
| movsd .L__real_3ff921fb54400000(%rip),%xmm8 # xmm3 = piby2_1 |
| cvttsd2si %xmm2,%eax # ecx = npi2 trunc to ints |
| movsd .L__real_3dd0b4611a600000(%rip),%xmm10 # xmm1 = piby2_2 |
| cvtsi2sd %eax,%xmm2 # xmm2 = npi2 trunc to doubles |
| |
| #/* Subtract the multiple from x to get an extra-precision remainder */ |
| #rhead = x - npi2 * piby2_1; |
| mulsd %xmm2,%xmm8 # npi2 * piby2_1 |
| subsd %xmm8,%xmm6 # xmm6 = rhead =(x-npi2*piby2_1) |
| movsd .L__real_3ba3198a2e037073(%rip),%xmm12 # xmm7 =piby2_2tail |
| |
| #t = rhead; |
| movsd %xmm6,%xmm5 # xmm5 = t = rhead |
| |
| #rtail = npi2 * piby2_2; |
| mulsd %xmm2,%xmm10 # xmm1 =rtail=(npi2*piby2_2) |
| |
| #rhead = t - rtail |
| subsd %xmm10,%xmm6 # xmm6 =rhead=(t-rtail) |
| |
| #rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulsd %xmm2,%xmm12 # npi2 * piby2_2tail |
| subsd %xmm6,%xmm5 # t-rhead |
| subsd %xmm5,%xmm10 # (rtail-(t-rhead)) |
| addsd %xmm12,%xmm10 # rtail=npi2*piby2_2tail+(rtail-(t-rhead)); |
| |
| #r = rhead - rtail |
| #rr = (rhead-r) -rtail |
| mov %eax,region(%rsp) # store upper region |
| movsd %xmm6,%xmm0 |
| subsd %xmm10,%xmm0 # xmm0 = r=(rhead-rtail) |
| subsd %xmm0,%xmm6 # rr=rhead-r |
| subsd %xmm10,%xmm6 # xmm6 = rr=((rhead-r) -rtail) |
| movlpd %xmm0,r(%rsp) # store upper r |
| movlpd %xmm6,rr(%rsp) # store upper rr |
| |
| #Work on Upper arg |
| #Note that volatiles will be trashed by the call |
| #We do not care since this is the last check |
| #We will construct r, rr, region and sign |
| mov $0x07ff0000000000000,%r11 # is upper arg nan/inf |
| mov %r11,%r10 |
| and %rcx,%r10 |
| cmp %r11,%r10 |
| jz .L__vrd4_sin_upper_naninf |
| |
| |
| mov %r8,p_temp(%rsp) |
| mov %r9,p_temp2(%rsp) |
| movapd %xmm1,p_temp1(%rsp) |
| movapd %xmm3,p_temp3(%rsp) |
| movapd %xmm7,p_temp5(%rsp) |
| |
| lea region+4(%rsp),%rdx # upper arg is **NOT** nan/inf |
| lea rr+8(%rsp),%rsi |
| lea r+8(%rsp),%rdi |
| movlpd r+8(%rsp),%xmm0 #Restore upper fp arg for remainder_piby2 call |
| call __amd_remainder_piby2@PLT |
| |
| mov p_temp(%rsp),%r8 |
| mov p_temp2(%rsp),%r9 |
| movapd p_temp1(%rsp),%xmm1 |
| movapd p_temp3(%rsp),%xmm3 |
| movapd p_temp5(%rsp),%xmm7 |
| jmp 0f |
| |
| .L__vrd4_sin_upper_naninf: |
| mov p_original+8(%rsp),%rcx # upper arg of the first pair is nan/inf: reload its bits from p_original+8 |
| # mov r+8(%rsp),%rcx ; disabled alternative source for the upper arg |
| mov $0x00008000000000000,%r11 |
| or %r11,%rcx |
| mov %rcx,r+8(%rsp) # upper r = x | 0x0008000000000000 (bit 51 forced on, so with an all-ones exponent the stored value is a NaN) |
| xor %r10,%r10 |
| mov %r10,rr+8(%rsp) # upper rr = 0 |
| mov %r10d,region+4(%rsp) # upper region = 0 |
| and .L__real_naninf_upper_sign_mask(%rip),%r12 # mask the first-pair sign word in r12 (mask constant is defined outside this view) |
| |
| .align 16 |
| 0: |
| jmp .Lcheck_next2_args |
| |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Lcheck_next2_args: |
| |
| mov $0x411E848000000000,%r10 # r10 = bit pattern of the double 500000.0 (5e5) |
| |
| cmp %r10,%r8 |
| jae .Lfirst_second_done_third_or_fourth_arg_gt_5e5 |
| |
| cmp %r10,%r9 |
| jae .Lfirst_second_done_fourth_arg_gt_5e5 |
| |
| # Neither r8 nor r9 reached the 5e5 pattern: reduce args 3 and 4 together with packed instructions |
| # xmm3 and xmm7 are both consumed as x below (presumably identical copies made earlier -- confirm); xmm4 is reloaded with 0.5 |
| movapd .L__real_3fe0000000000000(%rip),%xmm4 # xmm4 = 0.5 in both lanes |
| |
| mulpd .L__real_3fe45f306dc9c883(%rip),%xmm3 # xmm3 = x * twobypi |
| addpd %xmm4,%xmm3 # xmm3 = npi2 = x * twobypi + 0.5 |
| movapd .L__real_3ff921fb54400000(%rip),%xmm1 # xmm1 = piby2_1 |
| cvttpd2dq %xmm3,%xmm5 # xmm5 = npi2 truncated to packed 32-bit integers |
| movapd .L__real_3dd0b4611a600000(%rip),%xmm9 # xmm9 = piby2_2 |
| cvtdq2pd %xmm5,%xmm3 # xmm3 = truncated npi2 converted back to doubles |
| |
| # Subtract the multiple from x to get an extra-precision remainder |
| movq %xmm5,region1(%rsp) # store both 32-bit region values (low 64 bits of xmm5) |
| |
| # rhead = x - npi2 * piby2_1; |
| mulpd %xmm3,%xmm1 # xmm1 = npi2 * piby2_1 |
| |
| # rtail = npi2 * piby2_2; |
| mulpd %xmm3,%xmm9 # xmm9 = rtail |
| |
| # rhead = x - npi2 * piby2_1; |
| subpd %xmm1,%xmm7 # xmm7 = rhead = x - npi2 * piby2_1 |
| |
| # t = rhead; (xmm7 keeps t, xmm1 becomes the new rhead) |
| movapd %xmm7,%xmm1 # xmm1 = copy of t |
| |
| # rhead = t - rtail; |
| subpd %xmm9,%xmm1 # xmm1 = rhead |
| |
| # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulpd .L__real_3ba3198a2e037073(%rip),%xmm3 # xmm3 = npi2 * piby2_2tail |
| |
| subpd %xmm1,%xmm7 # xmm7 = t - rhead |
| subpd %xmm7,%xmm9 # xmm9 = rtail - (t - rhead) |
| addpd %xmm3,%xmm9 # xmm9 = new rtail = npi2 * piby2_2tail + (rtail - (t - rhead)) |
| |
| movapd %xmm1,%xmm7 # xmm7 = rhead |
| subpd %xmm9,%xmm1 # xmm1 = r = rhead - rtail |
| movapd %xmm1,r1(%rsp) |
| |
| subpd %xmm1,%xmm7 # xmm7 = rhead - r |
| subpd %xmm9,%xmm7 # xmm7 = rr = (rhead - r) - rtail |
| movapd %xmm7,rr1(%rsp) |
| |
| jmp .L__vrd4_sin_reconstruct |
| |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Lthird_or_fourth_arg_gt_5e5: |
| #Entry: first two args are < 5e5, third arg >= 5e5, fourth arg may be on either side of 5e5 |
| # Integer regs listed by the original author: rax, rcx, r8, r9 (none of them is touched in this section) |
| # xmm2 and xmm6 are consumed as x for args 1 and 2; xmm4 is expected to hold 0.5 on entry; xmm0 is overwritten |
| # Do not use xmm1, xmm3, xmm7: the section that follows reads them as x for args 3 and 4 |
| # Scratch registers named by the original author: xmm9, xmm11, xmm13, |
| # xmm5, xmm8, xmm10, xmm12 (only xmm8 is actually used in this section) |
| # xmm4 is clobbered below (it receives the integer regions); later code reloads 0.5 where it needs it |
| |
| # Work on first two args, both < 5e5 |
| |
| #DEBUG |
| # movapd %xmm2, %xmm4 |
| # movapd %xmm1, %xmm5 |
| # movapd %xmm2, %xmm12 |
| # movapd %xmm1, %xmm13 |
| # jmp .L__vrd4_sin_cleanup |
| #DEBUG |
| |
| mulpd .L__real_3fe45f306dc9c883(%rip),%xmm2 # xmm2 = x * twobypi |
| addpd %xmm4,%xmm2 # xmm2 = npi2 = x * twobypi + 0.5 |
| movapd .L__real_3ff921fb54400000(%rip),%xmm0 # xmm0 = piby2_1 |
| cvttpd2dq %xmm2,%xmm4 # xmm4 = npi2 truncated to packed 32-bit integers |
| movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # xmm8 = piby2_2 |
| cvtdq2pd %xmm4,%xmm2 # xmm2 = truncated npi2 converted back to doubles |
| |
| # Subtract the multiple from x to get an extra-precision remainder |
| movq %xmm4,region(%rsp) # store both 32-bit region values (low 64 bits of xmm4) |
| |
| #DEBUG |
| # movapd region(%rsp), %xmm4 |
| # movapd %xmm1, %xmm5 |
| # movapd region(%rsp), %xmm12 |
| # movapd %xmm1, %xmm13 |
| # jmp .L__vrd4_sin_cleanup |
| #DEBUG |
| |
| # rhead = x - npi2 * piby2_1; |
| mulpd %xmm2,%xmm0 # xmm0 = npi2 * piby2_1 |
| # rtail = npi2 * piby2_2; |
| mulpd %xmm2,%xmm8 # xmm8 = rtail |
| |
| # rhead = x - npi2 * piby2_1; |
| subpd %xmm0,%xmm6 # xmm6 = rhead = x - npi2 * piby2_1 |
| |
| # t = rhead; (xmm6 keeps t, xmm0 becomes the new rhead) |
| movapd %xmm6,%xmm0 # xmm0 = copy of t |
| |
| # rhead = t - rtail; |
| subpd %xmm8,%xmm0 # xmm0 = rhead |
| |
| # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # xmm2 = npi2 * piby2_2tail |
| |
| subpd %xmm0,%xmm6 # xmm6 = t - rhead |
| subpd %xmm6,%xmm8 # xmm8 = rtail - (t - rhead) |
| addpd %xmm2,%xmm8 # xmm8 = new rtail = npi2 * piby2_2tail + (rtail - (t - rhead)) |
| |
| movapd %xmm0,%xmm6 # xmm6 = rhead |
| subpd %xmm8,%xmm0 # xmm0 = r = rhead - rtail |
| movapd %xmm0,r(%rsp) |
| |
| subpd %xmm0,%xmm6 # xmm6 = rhead - r |
| subpd %xmm8,%xmm6 # xmm6 = rr = (rhead - r) - rtail |
| movapd %xmm6,rr(%rsp) |
| |
| |
| # Work on next two args, third arg >= 5e5, fourth arg >= 5e5 or < 5e5 |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .Lfirst_second_done_third_or_fourth_arg_gt_5e5: |
| # Integer regs listed by the original author: rax, rcx, r8, r9; only r8 and r9 are read in this section |
| # The first pair is finished at this point; xmm1, xmm3 and xmm7 are consumed below as x for args 3 and 4 |
| |
| #DEBUG |
| # movapd region(%rsp), %xmm4 |
| # movapd %xmm1, %xmm5 |
| # movapd region(%rsp), %xmm12 |
| # movapd %xmm1, %xmm13 |
| # jmp .L__vrd4_sin_cleanup |
| #DEBUG |
| |
| mov $0x411E848000000000,%r10 # r10 = bit pattern of the double 500000.0 (5e5) |
| |
| cmp %r10,%r9 |
| jae .Lboth_arg_gt_5e5_higher |
| |
| |
| # Fall-through case: upper arg is < 5e5, lower arg is >= 5e5 |
| # r8 / r9 are used as the integer images of the lower / upper arg |
| # xmm1, xmm3 and xmm7 are each used as x; xmm4 is reloaded with 0.5 below |
| |
| movlpd %xmm1,r1(%rsp) #Save lower fp arg for the remainder_piby2 call |
| movhlps %xmm1,%xmm1 #Copy the upper lane into the lower lane so scalar ops act on the upper arg |
| movhlps %xmm3,%xmm3 |
| movhlps %xmm7,%xmm7 |
| movapd .L__real_3fe0000000000000(%rip),%xmm4 #xmm4 = 0.5 for later use |
| |
| # Work on Upper arg |
| # Lower arg might contain nan/inf; to avoid exceptions use only scalar instructions on the upper arg, which now sits in the low lane of the fp regs |
| mulsd .L__real_3fe45f306dc9c883(%rip),%xmm3 # xmm3 = x*twobypi |
| addsd %xmm4,%xmm3 # xmm3 = npi2=(x*twobypi+0.5) |
| movsd .L__real_3ff921fb54400000(%rip),%xmm2 # xmm2 = piby2_1 |
| cvttsd2si %xmm3,%r9d # r9d = npi2 truncated to an integer (overwrites the upper arg image in r9) |
| movsd .L__real_3dd0b4611a600000(%rip),%xmm0 # xmm0 = piby2_2 |
| cvtsi2sd %r9d,%xmm3 # xmm3 = truncated npi2 as a double |
| |
| #Subtract the multiple from x to get an extra-precision remainder |
| #rhead = x - npi2 * piby2_1; |
| mulsd %xmm3,%xmm2 # xmm2 = npi2 * piby2_1 |
| subsd %xmm2,%xmm7 # xmm7 = rhead =(x-npi2*piby2_1) |
| movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 = piby2_2tail |
| |
| #t = rhead; |
| movsd %xmm7,%xmm5 # xmm5 = t = rhead |
| |
| #rtail = npi2 * piby2_2; |
| mulsd %xmm3,%xmm0 # xmm0 = rtail=(npi2*piby2_2) |
| |
| #rhead = t - rtail |
| subsd %xmm0,%xmm7 # xmm7 = rhead=(t-rtail) |
| |
| #rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulsd %xmm3,%xmm6 # xmm6 = npi2 * piby2_2tail |
| subsd %xmm7,%xmm5 # xmm5 = t-rhead |
| subsd %xmm5,%xmm0 # xmm0 = (rtail-(t-rhead)) |
| addsd %xmm6,%xmm0 # xmm0 = rtail=npi2*piby2_2tail+(rtail-(t-rhead)); |
| |
| #r = rhead - rtail |
| #rr = (rhead-r) -rtail |
| mov %r9d,region1+4(%rsp) # store upper region |
| movsd %xmm7,%xmm1 |
| subsd %xmm0,%xmm1 # xmm1 = r=(rhead-rtail) |
| subsd %xmm1,%xmm7 # xmm7 = rhead-r |
| subsd %xmm0,%xmm7 # xmm7 = rr=((rhead-r) -rtail) |
| movlpd %xmm1,r1+8(%rsp) # store upper r |
| movlpd %xmm7,rr1+8(%rsp) # store upper rr |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| #Note that volatile registers will be trashed by the call below |
| #That is acceptable because nothing live in them is needed afterwards: this is the last argument to reduce |
| #r, rr, region and sign are all rebuilt from the stack slots in the reconstruct section |
| |
| # Work on Lower arg |
| mov $0x07ff0000000000000,%r11 # r11 = exponent mask; an all-ones exponent means the lower arg is nan/inf |
| mov %r11,%r10 |
| and %r8,%r10 |
| cmp %r11,%r10 |
| jz .L__vrd4_sin_lower_naninf_higher |
| |
| lea region1(%rsp),%rdx # lower arg is **NOT** nan/inf: rdx, rsi, rdi = addresses of the lower region, rr and r slots |
| lea rr1(%rsp),%rsi |
| lea r1(%rsp),%rdi |
| movlpd r1(%rsp),%xmm0 #Restore lower fp arg for the remainder_piby2 call |
| call __amd_remainder_piby2@PLT |
| jmp 0f |
| |
| .L__vrd4_sin_lower_naninf_higher: |
| mov p_original1(%rsp),%r8 # lower arg is nan/inf: reload its bits from p_original1 |
| mov $0x00008000000000000,%r11 |
| or %r11,%r8 |
| mov %r8,r1(%rsp) # lower r = x | 0x0008000000000000 (bit 51 forced on, so the stored value is a NaN) |
| xor %r10,%r10 |
| mov %r10,rr1(%rsp) # lower rr = 0 |
| mov %r10d,region1(%rsp) # lower region = 0 |
| and .L__real_naninf_lower_sign_mask(%rip),%r13 # mask the second-pair sign word in r13 (mask constant is defined outside this view) |
| |
| .align 16 |
| 0: |
| |
| |
| #DEBUG |
| # movapd r(%rsp), %xmm4 |
| # movapd r1(%rsp), %xmm5 |
| # movapd r(%rsp), %xmm12 |
| # movapd r1(%rsp), %xmm13 |
| # jmp .L__vrd4_sin_cleanup |
| #DEBUG |
| |
| |
| jmp .L__vrd4_sin_reconstruct |
| |
| .align 16 |
| .Lboth_arg_gt_5e5_higher: |
| # Both args of the second pair are >= 5e5: each one goes through __amd_remainder_piby2 unless it is nan/inf |
| # r8 / r9 are used as the integer images of the lower / upper arg |
| # xmm1 supplies both fp args (low lane = lower arg, high lane = upper arg); xmm3, xmm7 and xmm4 are not read here |
| |
| movhpd %xmm1,r1+8(%rsp) #Save upper fp arg (high lane of xmm1) for its remainder_piby2 call |
| |
| mov $0x07ff0000000000000,%r11 #r11 = exponent mask; an all-ones exponent means the lower arg is nan/inf |
| mov %r11,%r10 |
| and %r8,%r10 |
| cmp %r11,%r10 |
| jz .L__vrd4_sin_lower_naninf_of_both_gt_5e5_higher |
| |
| mov %r9,p_temp1(%rsp) #Save the upper arg image across the call (r9 is not preserved by callees) |
| lea region1(%rsp),%rdx #lower arg is **NOT** nan/inf: rdx, rsi, rdi = addresses of the lower region, rr and r slots |
| lea rr1(%rsp),%rsi |
| lea r1(%rsp),%rdi |
| movsd %xmm1,%xmm0 |
| call __amd_remainder_piby2@PLT |
| mov p_temp1(%rsp),%r9 #Restore the upper arg image |
| |
| jmp 0f |
| |
| .L__vrd4_sin_lower_naninf_of_both_gt_5e5_higher: #lower arg is nan/inf: reload its bits from p_original1 |
| mov p_original1(%rsp),%r8 |
| mov $0x00008000000000000,%r11 |
| or %r11,%r8 |
| mov %r8,r1(%rsp) #lower r = x | 0x0008000000000000 (bit 51 forced on, so the stored value is a NaN) |
| xor %r10,%r10 |
| mov %r10,rr1(%rsp) #lower rr = 0 |
| mov %r10d,region1(%rsp) #lower region = 0 |
| and .L__real_naninf_lower_sign_mask(%rip),%r13 # mask the second-pair sign word in r13 (mask constant is defined outside this view) |
| |
| .align 16 |
| 0: |
| mov $0x07ff0000000000000,%r11 #same exponent test, now on the upper arg image in r9 |
| mov %r11,%r10 |
| and %r9,%r10 |
| cmp %r11,%r10 |
| jz .L__vrd4_sin_upper_naninf_of_both_gt_5e5_higher |
| |
| lea region1+4(%rsp),%rdx #upper arg is **NOT** nan/inf: rdx, rsi, rdi = addresses of the upper region, rr and r slots |
| lea rr1+8(%rsp),%rsi |
| lea r1+8(%rsp),%rdi |
| movlpd r1+8(%rsp),%xmm0 #Reload the upper fp arg saved at the top of this section |
| call __amd_remainder_piby2@PLT |
| jmp 0f |
| |
| .L__vrd4_sin_upper_naninf_of_both_gt_5e5_higher: |
| mov p_original1+8(%rsp),%r9 #upper arg is nan/inf: reload its bits from p_original1+8 |
| # movd %xmm6,%r9 ;disabled alternative source for the upper arg |
| mov $0x00008000000000000,%r11 |
| or %r11,%r9 |
| mov %r9,r1+8(%rsp) #upper r = x | 0x0008000000000000 (bit 51 forced on, so the stored value is a NaN) |
| xor %r10,%r10 |
| mov %r10,rr1+8(%rsp) #upper rr = 0 |
| mov %r10d,region1+4(%rsp) #upper region = 0 |
| and .L__real_naninf_upper_sign_mask(%rip),%r13 # mask the second-pair sign word in r13 (mask constant is defined outside this view) |
| |
| .align 16 |
| 0: |
| |
| jmp .L__vrd4_sin_reconstruct |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Lfourth_arg_gt_5e5: |
| #Entry: first two args are < 5e5, third arg < 5e5, fourth arg >= 5e5 |
| #Integer regs listed by the original author: rax, rcx, r8, r9 (none of them is touched in this section) |
| #xmm2 and xmm6 are consumed as x for args 1 and 2; xmm4 is expected to hold 0.5 on entry; xmm0 and xmm8 are overwritten |
| |
| # Work on first two args, both < 5e5; control then falls through into the section that handles args 3 and 4 |
| |
| mulpd .L__real_3fe45f306dc9c883(%rip),%xmm2 # xmm2 = x * twobypi |
| addpd %xmm4,%xmm2 # xmm2 = npi2 = x * twobypi + 0.5 |
| movapd .L__real_3ff921fb54400000(%rip),%xmm0 # xmm0 = piby2_1 |
| cvttpd2dq %xmm2,%xmm4 # xmm4 = npi2 truncated to packed 32-bit integers |
| movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # xmm8 = piby2_2 |
| cvtdq2pd %xmm4,%xmm2 # xmm2 = truncated npi2 converted back to doubles |
| |
| # Subtract the multiple from x to get an extra-precision remainder |
| movq %xmm4,region(%rsp) # store both 32-bit region values (low 64 bits of xmm4) |
| |
| # rhead = x - npi2 * piby2_1; |
| mulpd %xmm2,%xmm0 # xmm0 = npi2 * piby2_1 |
| # rtail = npi2 * piby2_2; |
| mulpd %xmm2,%xmm8 # xmm8 = rtail |
| |
| # rhead = x - npi2 * piby2_1; |
| subpd %xmm0,%xmm6 # xmm6 = rhead = x - npi2 * piby2_1 |
| |
| # t = rhead; (xmm6 keeps t, xmm0 becomes the new rhead) |
| movapd %xmm6,%xmm0 # xmm0 = copy of t |
| |
| # rhead = t - rtail; |
| subpd %xmm8,%xmm0 # xmm0 = rhead |
| |
| # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # xmm2 = npi2 * piby2_2tail |
| |
| subpd %xmm0,%xmm6 # xmm6 = t - rhead |
| subpd %xmm6,%xmm8 # xmm8 = rtail - (t - rhead) |
| addpd %xmm2,%xmm8 # xmm8 = new rtail = npi2 * piby2_2tail + (rtail - (t - rhead)) |
| |
| movapd %xmm0,%xmm6 # xmm6 = rhead |
| subpd %xmm8,%xmm0 # xmm0 = r = rhead - rtail |
| movapd %xmm0,r(%rsp) |
| |
| subpd %xmm0,%xmm6 # xmm6 = rhead - r |
| subpd %xmm8,%xmm6 # xmm6 = rr = (rhead - r) - rtail |
| movapd %xmm6,rr(%rsp) |
| |
| |
| # Work on next two args, third arg < 5e5, fourth arg >= 5e5 |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .Lfirst_second_done_fourth_arg_gt_5e5: |
| |
| # Second pair: upper arg is >= 5e5, lower arg is < 5e5 |
| # r9 is tested below as the integer image of the upper arg; r8 is overwritten with the lower region |
| # xmm3 and xmm7 are consumed as x (low lane only); xmm1 still carries the upper arg in its high lane |
| |
| movhpd %xmm1,r1+8(%rsp) #Save upper fp arg (high lane of xmm1) for the remainder_piby2 call |
| # movlhps %xmm1,%xmm1 #Not needed since we want to work on lower arg, but done just to be safe and avoid exceptions due to nan/inf and to mirror the lower_arg_gt_5e5 case |
| # movlhps %xmm3,%xmm3 |
| # movlhps %xmm7,%xmm7 |
| movapd .L__real_3fe0000000000000(%rip),%xmm4 #xmm4 = 0.5 for later use |
| |
| # Work on Lower arg |
| # Upper arg might contain nan/inf; to avoid exceptions use only scalar instructions on the lower arg |
| |
| mulsd .L__real_3fe45f306dc9c883(%rip),%xmm3 # xmm3 = x*twobypi |
| addsd %xmm4,%xmm3 # xmm3 = npi2=(x*twobypi+0.5) |
| movsd .L__real_3ff921fb54400000(%rip),%xmm2 # xmm2 = piby2_1 |
| cvttsd2si %xmm3,%r8d # r8d = npi2 truncated to an integer |
| movsd .L__real_3dd0b4611a600000(%rip),%xmm0 # xmm0 = piby2_2 |
| cvtsi2sd %r8d,%xmm3 # xmm3 = truncated npi2 as a double |
| |
| #Subtract the multiple from x to get an extra-precision remainder |
| #rhead = x - npi2 * piby2_1; |
| mulsd %xmm3,%xmm2 # xmm2 = npi2 * piby2_1 |
| subsd %xmm2,%xmm7 # xmm7 = rhead =(x-npi2*piby2_1) |
| movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 = piby2_2tail |
| |
| #t = rhead; |
| movsd %xmm7,%xmm5 # xmm5 = t = rhead |
| |
| #rtail = npi2 * piby2_2; |
| mulsd %xmm3,%xmm0 # xmm0 = rtail=(npi2*piby2_2) |
| |
| #rhead = t - rtail |
| subsd %xmm0,%xmm7 # xmm7 = rhead=(t-rtail) |
| |
| #rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulsd %xmm3,%xmm6 # xmm6 = npi2 * piby2_2tail |
| subsd %xmm7,%xmm5 # xmm5 = t-rhead |
| subsd %xmm5,%xmm0 # xmm0 = (rtail-(t-rhead)) |
| addsd %xmm6,%xmm0 # xmm0 = rtail=npi2*piby2_2tail+(rtail-(t-rhead)); |
| |
| #r = rhead - rtail |
| #rr = (rhead-r) -rtail |
| mov %r8d,region1(%rsp) # store lower region |
| movsd %xmm7,%xmm1 |
| subsd %xmm0,%xmm1 # xmm1 = r=(rhead-rtail) |
| subsd %xmm1,%xmm7 # xmm7 = rhead-r |
| subsd %xmm0,%xmm7 # xmm7 = rr=((rhead-r) -rtail) |
| |
| movlpd %xmm1,r1(%rsp) # store lower r |
| movlpd %xmm7,rr1(%rsp) # store lower rr |
| |
| #Work on Upper arg |
| #Note that volatile registers will be trashed by the call below |
| #That is acceptable because nothing live in them is needed afterwards: this is the last argument to reduce |
| #r, rr, region and sign are all rebuilt from the stack slots in the reconstruct section |
| mov $0x07ff0000000000000,%r11 # r11 = exponent mask; an all-ones exponent means the upper arg is nan/inf |
| mov %r11,%r10 |
| and %r9,%r10 |
| cmp %r11,%r10 |
| jz .L__vrd4_sin_upper_naninf_higher |
| |
| lea region1+4(%rsp),%rdx # upper arg is **NOT** nan/inf: rdx, rsi, rdi = addresses of the upper region, rr and r slots |
| lea rr1+8(%rsp),%rsi |
| lea r1+8(%rsp),%rdi |
| movlpd r1+8(%rsp),%xmm0 #Reload the upper fp arg saved at the top of this section |
| call __amd_remainder_piby2@PLT |
| jmp 0f |
| |
| .L__vrd4_sin_upper_naninf_higher: |
| mov p_original1+8(%rsp),%r9 # upper arg is nan/inf: reload its bits from p_original1+8 |
| # mov r1+8(%rsp),%r9 ; disabled alternative source for the upper arg |
| mov $0x00008000000000000,%r11 |
| or %r11,%r9 |
| mov %r9,r1+8(%rsp) # upper r = x | 0x0008000000000000 (bit 51 forced on, so the stored value is a NaN) |
| xor %r10,%r10 |
| mov %r10,rr1+8(%rsp) # upper rr = 0 |
| mov %r10d,region1+4(%rsp) # upper region = 0 |
| and .L__real_naninf_upper_sign_mask(%rip),%r13 # mask the second-pair sign word in r13 (mask constant is defined outside this view) |
| |
| .align 16 |
| 0: |
| jmp .L__vrd4_sin_reconstruct |
| |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .L__vrd4_sin_reconstruct: |
| #Common continuation of the reduction paths: r, rr and region are read back from their stack slots |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # First pair on exit: p_signs / p_signc = sign masks, p_region = region parity mask, xmm0 = r, xmm2 = r*r, xmm6 = rr |
| # Second pair on exit: p_signs1 / p_signc1 = sign masks, p_region1 = region parity mask, xmm1 = r, xmm3 = r*r, xmm7 = rr |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| #DEBUG |
| # movapd region(%rsp), %xmm4 |
| # movapd region1(%rsp), %xmm5 |
| # movapd region(%rsp), %xmm12 |
| # movapd region1(%rsp), %xmm13 |
| # jmp .L__vrd4_sin_cleanup |
| #DEBUG |
| |
| |
| movapd r(%rsp),%xmm0 |
| movapd r1(%rsp),%xmm1 |
| |
| movapd rr(%rsp),%xmm6 |
| movapd rr1(%rsp),%xmm7 |
| |
| mov region(%rsp),%r8 |
| mov region1(%rsp),%r9 |
| |
| movlpd region(%rsp),%xmm4 |
| movlpd region1(%rsp),%xmm5 |
| |
| pand .L__reald_one_one(%rip),%xmm4 #keep the bits selected by .L__reald_one_one (presumably bit 0 of each 32-bit region -- confirm constant) |
| pand .L__reald_one_one(%rip),%xmm5 #same for the second pair |
| |
| xorpd %xmm12,%xmm12 |
| pcmpeqd %xmm12,%xmm4 |
| pcmpeqd %xmm12,%xmm5 |
| |
| punpckldq %xmm4,%xmm4 |
| punpckldq %xmm5,%xmm5 |
| |
| movapd %xmm4,p_region(%rsp) |
| movapd %xmm5,p_region1(%rsp) |
| |
| mov .L__reald_one_zero(%rip),%rdx #rdx = mask used below to isolate the upper-lane sign bit |
| mov %r8,%r10 |
| mov %r9,%r11 |
| |
| shr $1,%r10 #B: bit 1 of each packed region moves down to bit 0 of its 32-bit lane; A is the sign word in r12 |
| shr $1,%r11 #same for the second pair, with A in r13 |
| |
| mov %r10,%rax |
| mov %r11,%rcx |
| |
| not %r12 #r12 = ~A (undone by the second not below) |
| not %r13 #r13 = ~A (undone by the second not below) |
| and %r12,%r10 |
| and %r13,%r11 |
| |
| not %rax |
| not %rcx |
| not %r12 |
| not %r13 |
| and %r12,%rax |
| and %r13,%rcx |
| |
| or %rax,%r10 |
| or %rcx,%r11 |
| and .L__reald_one_one(%rip),%r10 #(~AB+A~B) under the one_one mask, i.e. A xor B per lane |
| and .L__reald_one_one(%rip),%r11 #(~AB+A~B) under the one_one mask, i.e. A xor B per lane |
| |
| mov %r10,%r12 |
| mov %r11,%r13 |
| |
| and %rdx,%r12 #mask out the lower sign bit leaving the upper sign bit |
| and %rdx,%r13 #mask out the lower sign bit leaving the upper sign bit |
| |
| shl $63,%r10 #shift lower sign bit left by 63 bits (into the double sign position) |
| shl $63,%r11 #shift lower sign bit left by 63 bits (into the double sign position) |
| shl $31,%r12 #shift upper sign bit left by 31 bits (bit 32 lands in bit 63) |
| shl $31,%r13 #shift upper sign bit left by 31 bits (bit 32 lands in bit 63) |
| |
| mov %r10,p_signs(%rsp) #write out lower sign mask, first pair |
| mov %r12,p_signs+8(%rsp) #write out upper sign mask, first pair |
| mov %r11,p_signs1(%rsp) #write out lower sign mask, second pair |
| mov %r13,p_signs1+8(%rsp) #write out upper sign mask, second pair |
| |
| movapd %xmm0,%xmm2 # xmm2 = r |
| movapd %xmm1,%xmm3 # xmm3 = r |
| |
| mulpd %xmm0,%xmm2 # xmm2 = r2 = r*r |
| mulpd %xmm1,%xmm3 # xmm3 = r2 = r*r |
| |
| add .L__reald_one_one(%rip),%r8 |
| add .L__reald_one_one(%rip),%r9 |
| |
| and .L__reald_two_two(%rip),%r8 |
| and .L__reald_two_two(%rip),%r9 |
| |
| shr $1,%r8 |
| shr $1,%r9 |
| |
| mov %r8,%rax |
| mov %r9,%rcx |
| |
| and .L__reald_one_zero(%rip),%rax #mask out the lower sign bit leaving the upper sign bit |
| and .L__reald_one_zero(%rip),%rcx #mask out the lower sign bit leaving the upper sign bit |
| |
| shl $63,%r8 #shift lower sign bit left by 63 bits; NOTE(review): the 64-bit add above acts on two packed 32-bit regions and relies on no carry out of the low one |
| shl $63,%r9 #shift lower sign bit left by 63 bits |
| |
| shl $31,%rax #shift upper sign bit left by 31 bits |
| shl $31,%rcx #shift upper sign bit left by 31 bits |
| |
| mov %r8,p_signc(%rsp) #write out lower sign mask, first pair |
| mov %rax,p_signc+8(%rsp) #write out upper sign mask, first pair |
| mov %r9,p_signc1(%rsp) #write out lower sign mask, second pair |
| mov %rcx,p_signc1+8(%rsp) #write out upper sign mask, second pair |
| |
| jmp .Lsinsin_sinsin_piby4 |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .L__vrd4_sin_cleanup: |
| |
| xorpd p_signs(%rsp),%xmm4 # apply the p_signs mask to xmm4 (stored to the ysin array below, elements 0-1) |
| xorpd p_signs1(%rsp),%xmm5 # apply the p_signs1 mask to xmm5 (ysin elements 2-3) |
| |
| xorpd p_signc(%rsp),%xmm12 # apply the p_signc mask to xmm12 (stored to the ycos array below, elements 0-1) |
| xorpd p_signc1(%rsp),%xmm13 # apply the p_signc1 mask to xmm13 (ycos elements 2-3) |
| |
| .L__vrda_bottom1: |
| # store the first result pair of each output array |
| mov save_ysa(%rsp),%rdi # get ysin_array pointer |
| mov save_yca(%rsp),%rbx # get ycos_array pointer |
| |
| movlpd %xmm4,(%rdi) |
| movhpd %xmm4,8(%rdi) |
| |
| movlpd %xmm12,(%rbx) |
| movhpd %xmm12,8(%rbx) |
| |
| .L__vrda_bottom2: |
| |
| prefetch 64(%rdi) |
| prefetch 64(%rbx) |
| |
| add $32,%rdi |
| add $32,%rbx |
| |
| mov %rdi,save_ysa(%rsp) # save ysin_array pointer, already advanced by 32 bytes (four doubles) |
| mov %rbx,save_yca(%rsp) # save ycos_array pointer, already advanced by 32 bytes (four doubles) |
| |
| # store the second result pair; the negative offsets address elements 2 and 3 of the group just finished |
| movlpd %xmm5, -16(%rdi) |
| movhpd %xmm5, -8(%rdi) |
| |
| movlpd %xmm13, -16(%rbx) |
| movhpd %xmm13, -8(%rbx) |
| |
| mov p_iter(%rsp),%rax # get number of iterations |
| sub $1,%rax |
| mov %rax,p_iter(%rsp) # save number of iterations (mov leaves the flags from sub intact for the jnz) |
| jnz .L__vrda_top |
| |
| # see if any leftover values remain after the 4-wide loop |
| mov save_nv(%rsp),%rax # get number of values |
| test %rax,%rax |
| jnz .L__vrda_cleanup |
| |
| .L__final_check: |
| |
| mov save_r12(%rsp),%r12 # restore r12 from its save slot before returning |
| mov save_r13(%rsp),%r13 # restore r13 from its save slot before returning |
| mov save_rbx(%rsp),%rbx # restore rbx from its save slot before returning |
| |
| add $0x0308,%rsp |
| ret |
| |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # Tail handling: reached when leftover input values remain after the 4-wide loop (up to three are copied) |
| # The next x element is read through save_xa; results are written through save_ysa and save_yca. |
| # The number of values left is in save_nv |
| |
| .align 16 |
| .L__vrda_cleanup: |
| mov save_nv(%rsp),%rax # get number of values |
| test %rax,%rax # are there any values |
| jz .L__final_check # exit if not |
| |
| mov save_xa(%rsp),%rsi |
| |
| # build a four-double input buffer at p_temp: leftover values first, zeroes after, then make a recursive call. |
| xorpd %xmm0,%xmm0 |
| movlpd %xmm0,p_temp+8(%rsp) |
| movapd %xmm0,p_temp+16(%rsp) |
| |
| mov (%rsi),%rcx # at least one value is known to remain |
| mov %rcx,p_temp(%rsp) |
| cmp $2,%rax |
| jl .L__vrdacg |
| |
| mov 8(%rsi),%rcx # copy the second value |
| mov %rcx,p_temp+8(%rsp) |
| cmp $3,%rax |
| jl .L__vrdacg |
| |
| mov 16(%rsi),%rcx # copy the third value |
| mov %rcx,p_temp+16(%rsp) |
| |
| .L__vrdacg: |
| mov $4,%rdi # parameter for N: always a full group of four |
| lea p_temp(%rsp),%rsi # &x parameter |
| lea p_temp2(%rsp),%rdx # &ys parameter |
| lea p_temp4(%rsp),%rcx # &yc parameter |
| |
| call vrda_sincos@PLT # call recursively to compute four values |
| |
| # now copy only the requested results to the destination arrays |
| mov save_ysa(%rsp),%rdi |
| mov save_yca(%rsp),%rbx |
| mov save_nv(%rsp),%rax # get number of values |
| |
| mov p_temp2(%rsp),%rcx |
| mov %rcx,(%rdi) # first sine result: at least one value is known to remain |
| mov p_temp4(%rsp),%rdx |
| mov %rdx,(%rbx) # first cosine result |
| cmp $2,%rax |
| jl .L__vrdacgf |
| |
| mov p_temp2+8(%rsp),%rcx |
| mov %rcx,8(%rdi) # second sine result |
| mov p_temp4+8(%rsp),%rdx |
| mov %rdx,8(%rbx) # second cosine result |
| cmp $3,%rax |
| jl .L__vrdacgf |
| |
| mov p_temp2+16(%rsp),%rcx |
| mov %rcx,16(%rdi) # third sine result |
| mov p_temp4+16(%rsp),%rdx |
| mov %rdx,16(%rbx) # third cosine result |
| |
| .L__vrdacgf: |
| jmp .L__final_check |