| |
| # |
| # (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| # |
| # This file is part of libacml_mv. |
| # |
| # libacml_mv is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU Lesser General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # libacml_mv is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # Lesser General Public License for more details. |
| # |
| # You should have received a copy of the GNU Lesser General Public |
| # License along with libacml_mv. If not, see |
| # <http://www.gnu.org/licenses/>. |
| # |
| # |
| |
| |
| |
| |
| |
| # |
| # A vector implementation of the libm sin function. |
| # |
| # Prototype: |
| # |
| # __m128d __vrd2_sin(__m128d x); |
| # |
| # Computes Sine of x |
| # It will provide proper C99 return values, |
| # but may not raise floating point status bits properly. |
| # Based on the NAG C implementation. |
| # Author: Harsha Jagasia |
| # Email: harsha.jagasia@amd.com |
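#
# A minimal usage sketch (hypothetical caller code, assuming SSE2 and
# the x86-64 SysV ABI; not part of this file):
#
#   #include <emmintrin.h>
#   extern __m128d __vrd2_sin(__m128d x);
#
#   double a[2] = {0.5, 1.5}, s[2];
#   __m128d v = _mm_loadu_pd(a);       /* pack two doubles */
#   _mm_storeu_pd(s, __vrd2_sin(v));   /* s[i] = sin(a[i]) */
#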
| |
| |
| #ifdef __ELF__ |
| .section .note.GNU-stack,"",@progbits |
| #endif |
| |
| |
| .data |
| .align 16 |
| .L__real_7fffffffffffffff: .quad 0x07fffffffffffffff #Sign bit zero |
| .quad 0x07fffffffffffffff |
| .L__real_3ff0000000000000: .quad 0x03ff0000000000000 # 1.0 |
| .quad 0x03ff0000000000000 |
.L__real_v2p__27: .quad 0x03e40000000000000 # 2^(-27)
| .quad 0x03e40000000000000 |
| .L__real_3fe0000000000000: .quad 0x03fe0000000000000 # 0.5 |
| .quad 0x03fe0000000000000 |
| .L__real_3fc5555555555555: .quad 0x03fc5555555555555 # 0.166666666666 |
| .quad 0x03fc5555555555555 |
| .L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883 # twobypi |
| .quad 0x03fe45f306dc9c883 |
| .L__real_3ff921fb54400000: .quad 0x03ff921fb54400000 # piby2_1 |
| .quad 0x03ff921fb54400000 |
| .L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331 # piby2_1tail |
| .quad 0x03dd0b4611a626331 |
| .L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000 # piby2_2 |
| .quad 0x03dd0b4611a600000 |
| .L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073 # piby2_2tail |
| .quad 0x03ba3198a2e037073 |
| .L__real_fffffffff8000000: .quad 0x0fffffffff8000000 # mask for stripping head and tail |
| .quad 0x0fffffffff8000000 |
| .L__real_8000000000000000: .quad 0x08000000000000000 # -0 or signbit |
| .quad 0x08000000000000000 |
| .L__reald_one_one: .quad 0x00000000100000001 # |
| .quad 0 |
| .L__reald_two_two: .quad 0x00000000200000002 # |
| .quad 0 |
| .L__reald_one_zero: .quad 0x00000000100000000 # sin_cos_filter |
| .quad 0 |
| .L__reald_zero_one: .quad 0x00000000000000001 # |
| .quad 0 |
| .L__reald_two_zero: .quad 0x00000000200000000 # |
| .quad 0 |
| .L__realq_one_one: .quad 0x00000000000000001 # |
| .quad 0x00000000000000001 # |
| .L__realq_two_two: .quad 0x00000000000000002 # |
| .quad 0x00000000000000002 # |
| .L__real_1_x_mask: .quad 0x0ffffffffffffffff # |
| .quad 0x03ff0000000000000 # |
| .L__real_zero: .quad 0x00000000000000000 # |
| .quad 0x00000000000000000 # |
| .L__real_one: .quad 0x00000000000000001 # |
| .quad 0x00000000000000001 # |
| .L__real_ffffffffffffffff: .quad 0x0ffffffffffffffff #Sign bit one |
| .quad 0x0ffffffffffffffff |
| |
| .Lcosarray: |
| .quad 0x03fa5555555555555 # 0.0416667 c1 |
| .quad 0x03fa5555555555555 |
| .quad 0x0bf56c16c16c16967 # -0.00138889 c2 |
| .quad 0x0bf56c16c16c16967 |
| .quad 0x03efa01a019f4ec90 # 2.48016e-005 c3 |
| .quad 0x03efa01a019f4ec90 |
| .quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4 |
| .quad 0x0be927e4fa17f65f6 |
| .quad 0x03e21eeb69037ab78 # 2.08761e-009 c5 |
| .quad 0x03e21eeb69037ab78 |
| .quad 0x0bda907db46cc5e42 # -1.13826e-011 c6 |
| .quad 0x0bda907db46cc5e42 |
| .Lsinarray: |
| .quad 0x0bfc5555555555555 # -0.166667 s1 |
| .quad 0x0bfc5555555555555 |
| .quad 0x03f81111111110bb3 # 0.00833333 s2 |
| .quad 0x03f81111111110bb3 |
| .quad 0x0bf2a01a019e83e5c # -0.000198413 s3 |
| .quad 0x0bf2a01a019e83e5c |
| .quad 0x03ec71de3796cde01 # 2.75573e-006 s4 |
| .quad 0x03ec71de3796cde01 |
| .quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5 |
| .quad 0x0be5ae600b42fdfa7 |
| .quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6 |
| .quad 0x03de5e0b2f9a43bb8 |
| .Lsincosarray: |
| .quad 0x0bfc5555555555555 # -0.166667 s1 |
| .quad 0x03fa5555555555555 # 0.0416667 c1 |
| .quad 0x03f81111111110bb3 # 0.00833333 s2 |
.quad 0x0bf56c16c16c16967 # -0.00138889 c2
.quad 0x0bf2a01a019e83e5c # -0.000198413 s3
.quad 0x03efa01a019f4ec90 # 2.48016e-005 c3
.quad 0x03ec71de3796cde01 # 2.75573e-006 s4
.quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4
.quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
.quad 0x03e21eeb69037ab78 # 2.08761e-009 c5
.quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6
.quad 0x0bda907db46cc5e42 # -1.13826e-011 c6
.Lcossinarray:
.quad 0x03fa5555555555555 # 0.0416667 c1
.quad 0x0bfc5555555555555 # -0.166667 s1
.quad 0x0bf56c16c16c16967 # -0.00138889 c2
.quad 0x03f81111111110bb3 # 0.00833333 s2
.quad 0x03efa01a019f4ec90 # 2.48016e-005 c3
.quad 0x0bf2a01a019e83e5c # -0.000198413 s3
.quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4
.quad 0x03ec71de3796cde01 # 2.75573e-006 s4
.quad 0x03e21eeb69037ab78 # 2.08761e-009 c5
.quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
.quad 0x0bda907db46cc5e42 # -1.13826e-011 c6
.quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6
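
# The tables above hold the minimax polynomial coefficients used on the
# reduced argument |r| <= pi/4. In C-style pseudocode, the two kernels
# evaluated below are (a sketch of the math, not of any one code path):
#
#   /* x2 = r*r                                                */
#   /* zs = s1 + x2*(s2 + x2*(s3 + x2*(s4 + x2*(s5 + x2*s6)))) */
#   /* sin(r) ~ r + r^3*zs                                     */
#   /* zc = c1 + x2*(c2 + x2*(c3 + x2*(c4 + x2*(c5 + x2*c6)))) */
#   /* cos(r) ~ 1 - 0.5*x2 + x2*x2*zc                          */
#
# .Lsincosarray and .Lcossinarray interleave the two coefficient sets so
# one packed evaluation can produce sin in one lane and cos in the other.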
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .text |
| .align 16 |
| .p2align 4,,15 |
| |
| .equ p_temp, 0x00 # temporary for get/put bits operation |
| .equ p_temp1,0x10 # temporary for get/put bits operation |
| .equ p_temp2,0x20 # temporary for get/put bits operation |
| .equ p_xmm6, 0x30 # temporary for get/put bits operation |
| .equ p_xmm7, 0x40 # temporary for get/put bits operation |
| .equ p_xmm8, 0x50 # temporary for get/put bits operation |
| .equ p_xmm9, 0x60 # temporary for get/put bits operation |
| .equ p_xmm10,0x70 # temporary for get/put bits operation |
| .equ p_xmm11,0x80 # temporary for get/put bits operation |
| .equ p_xmm12,0x90 # temporary for get/put bits operation |
| .equ p_xmm13,0x0A0 # temporary for get/put bits operation |
| .equ p_xmm14,0x0B0 # temporary for get/put bits operation |
| .equ p_xmm15,0x0C0 # temporary for get/put bits operation |
.equ r, 0x0D0 # pointer to r for remainder_piby2
.equ rr, 0x0E0 # pointer to rr for remainder_piby2
.equ region, 0x0F0 # pointer to region for remainder_piby2
.equ p_original,0x100 # original x
.equ p_mask, 0x110 # mask
.equ p_sign, 0x120 # sign mask
| |
| .globl __vrd2_sin |
| .type __vrd2_sin,@function |
| __vrd2_sin: |
| |
| sub $0x138,%rsp |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| #STARTMAIN |
movdqa %xmm0,%xmm6 #save original x for the sign bits **
andpd .L__real_7fffffffffffffff(%rip), %xmm0 #clear sign: xmm0 = |x| -
| |
| movd %xmm0,%rax #rax is lower arg + |
| movhpd %xmm0, p_temp+8(%rsp) # + |
| mov p_temp+8(%rsp),%rcx #rcx = upper arg + |
| movdqa %xmm0,%xmm1 |
| |
#This sign extraction also works for nan/inf inputs
| pcmpgtd %xmm6,%xmm1 |
| movdqa %xmm1,%xmm6 |
| psrldq $4, %xmm1 |
| psrldq $8, %xmm6 |
| |
| mov $0x3FE921FB54442D18,%rdx #piby4 + |
| mov $0x411E848000000000,%r10 #5e5 + |
| |
| movapd .L__real_3fe0000000000000(%rip), %xmm5 #0.5 for later use + |
| |
| por %xmm1,%xmm6 |
| movd %xmm6,%r11 #Move Sign to gpr ** |
| |
| movapd %xmm0,%xmm2 #x + |
| movapd %xmm0,%xmm4 #x + |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Leither_or_both_arg_gt_than_piby4: |
| |
| cmp %r10,%rax #is lower arg >= 5e5 |
| jae .Llower_or_both_arg_gt_5e5 |
| cmp %r10,%rcx #is upper arg >= 5e5 |
| jae .Lupper_arg_gt_5e5 |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .Lboth_arg_lt_than_5e5: |
# xmm0, xmm2, xmm4 = x, xmm5 = 0.5
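
# The block below does Cody-Waite style argument reduction, with pi/2
# split into three parts (piby2_1/piby2_2/piby2_2tail). A C-style
# sketch of what follows (mirroring the inline comments):
#
#   npi2  = (int)(x*twobypi + 0.5);   /* nearest multiple of pi/2      */
#   rhead = x - npi2*piby2_1;
#   rtail = npi2*piby2_2;             /* then refined with piby2_2tail */
#   r     = rhead - rtail;            /* reduced argument              */
#   rr    = (rhead - r) - rtail;      /* tail of r (extra precision)   */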
| |
| mulpd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi |
| movapd .L__real_3ff921fb54400000(%rip),%xmm3 # xmm3=piby2_1 |
| addpd %xmm5,%xmm2 # xmm2 = npi2 = x*twobypi+0.5 |
| movapd .L__real_3dd0b4611a600000(%rip),%xmm1 # xmm1=piby2_2 |
| movapd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6=piby2_2tail |
| cvttpd2dq %xmm2,%xmm0 # xmm0=convert npi2 to ints |
| cvtdq2pd %xmm0,%xmm2 # xmm2=and back to double. |
| |
| |
| #/* Subtract the multiple from x to get an extra-precision remainder */ |
| #rhead = x - npi2 * piby2_1; |
| mulpd %xmm2,%xmm3 # npi2 * piby2_1 |
| subpd %xmm3,%xmm4 # xmm4 = rhead=x-npi2*piby2_1 |
| |
| #t = rhead; |
| movapd %xmm4,%xmm5 # xmm5=t=rhead |
| |
| #rtail = npi2 * piby2_2; |
| mulpd %xmm2,%xmm1 # xmm1= npi2*piby2_2 |
| |
| #rhead = t - rtail; |
| subpd %xmm1,%xmm4 # xmm4= rhead = t-rtail |
| |
| #rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulpd %xmm2,%xmm6 # npi2 * piby2_2tail |
| subpd %xmm4,%xmm5 # t-rhead |
| subpd %xmm5,%xmm1 # rtail-(t - rhead) |
addpd %xmm6,%xmm1 # rtail=npi2*piby2_2tail+(rtail-(t-rhead))
| |
| #r = rhead - rtail |
| #rr=(rhead-r) -rtail |
| #Sign |
| #Region |
| movdqa %xmm0,%xmm5 # Region + |
| movd %xmm0,%r10 # Sign |
| movdqa %xmm4,%xmm0 # rhead (handle xmm0 retype) + |
| |
| subpd %xmm1,%xmm0 # rhead - rtail + |
| pand .L__reald_one_one(%rip),%xmm5 # Odd/Even region for Cos/Sin + |
| mov .L__reald_one_zero(%rip),%r9 # Compare value for cossin + |
| subpd %xmm0,%xmm4 # rr=rhead-r + |
| movd %xmm5,%r8 # Region + |
| movapd %xmm0,%xmm2 # Move for x2 + |
| mulpd %xmm0,%xmm2 # x2 + |
| subpd %xmm1,%xmm4 # rr=(rhead-r) -rtail + |
| |
| shr $1,%r10 #~AB+A~B, A is sign and B is upper bit of region |
| mov %r10,%rcx |
| not %r11 #ADDED TO CHANGE THE LOGIC |
| and %r11,%r10 |
| not %rcx |
| not %r11 |
| and %r11,%rcx |
| or %rcx,%r10 |
| and .L__reald_one_one(%rip),%r10 #(~AB+A~B)&1 |
| |
| mov %r10,%r11 |
| and %r9,%r11 #mask out the lower sign bit leaving the upper sign bit |
| shl $63,%r10 #shift lower sign bit left by 63 bits |
| shl $31,%r11 #shift upper sign bit left by 31 bits |
| mov %r10,p_sign(%rsp) #write out lower sign bit |
| mov %r11,p_sign+8(%rsp) #write out upper sign bit |
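
# In C terms, the sign logic above computes, per lane (a sketch; A is
# the sign bit of the input, B is bit 1 of the region number):
#
#   /* sin is odd, and regions 2,3 negate the kernel result */
#   swap = (~A & B) | (A & ~B);    /* i.e. A ^ B            */
#   sign = (swap & 1) << 63;       /* per-lane sign mask    */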
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
#xmm0 = x, xmm2 = x2, xmm4 = xx, r8 = region, r9 = compare value for sincos path, p_sign = sign mask
| |
| .align 16 |
| .L__vrd2_sin_approximate: |
| cmp $0,%r8 |
| jnz .Lvrd2_not_sin_piby4 |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .Lvrd2_sin_piby4: |
| movapd .Lsinarray+0x50(%rip),%xmm3 # s6 |
| movapd .Lsinarray+0x20(%rip),%xmm5 # s3 |
| movapd %xmm2,%xmm1 # move for x4 |
| |
| mulpd %xmm2,%xmm3 # x2s6 |
| mulpd %xmm2,%xmm5 # x2s3 |
| mulpd %xmm2,%xmm1 # x4 |
| |
| addpd .Lsinarray+0x40(%rip),%xmm3 # s5+x2s6 |
| movapd %xmm2,%xmm6 # move for x3 |
| addpd .Lsinarray+0x10(%rip),%xmm5 # s2+x2s3 |
| |
| mulpd %xmm2,%xmm3 # x2(s5+x2s6) |
| mulpd %xmm2,%xmm5 # x2(s2+x2s3) |
| mulpd %xmm2,%xmm1 # x6 |
| |
| addpd .Lsinarray+0x30(%rip),%xmm3 # s4 + x2(s5+x2s6) |
| addpd .Lsinarray(%rip),%xmm5 # s1+x2(s2+x2s3) |
| mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5 *x2 |
| |
| mulpd %xmm1,%xmm3 # x6(s4 + x2(s5+x2s6)) |
| mulpd %xmm0,%xmm6 # x3 |
| addpd %xmm5,%xmm3 # zs |
| mulpd %xmm4,%xmm2 # 0.5 * x2 *xx |
| |
| mulpd %xmm3,%xmm6 # x3*zs |
| subpd %xmm2,%xmm6 # x3*zs - 0.5 * x2 *xx |
| addpd %xmm4,%xmm6 # +xx |
| addpd %xmm6,%xmm0 # +x |
| xorpd p_sign(%rsp),%xmm0 # xor sign |
| jmp .L__vrd2_sin_cleanup |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
#xmm0 = x, xmm2 = x2, xmm4 = xx, r8 = region, r9 = compare value for sincos path, p_sign = sign mask
| .align 16 |
| .Lvrd2_not_sin_piby4: |
| cmp $1,%r8 |
| jnz .Lvrd2_not_sin_cos_piby4 |
| |
| .Lvrd2_sin_cos_piby4: |
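# Lower lane is in an odd region (cos kernel), upper lane in an even
# region (sin kernel); both are evaluated from the interleaved
# .Lcossinarray table and merged. Sketch in C (h = 0.5*x2, t = 1-h):
#
#   lower: cos(x) ~ t + (x4*zc + (((1-t) - h) - x*xx));
#   upper: sin(x) ~ x + (x3*zs - h*xx + xx);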
| |
movapd %xmm4,p_temp(%rsp) # store rr in memory
movapd %xmm0,p_temp1(%rsp) # store r in memory
| |
movapd .Lcossinarray+0x50(%rip),%xmm3 # c6 (lo), s6 (hi)
mulpd %xmm2,%xmm3 # x2s6
movdqa .Lcossinarray+0x20(%rip),%xmm5 # c3 (lo), s3 (hi)
| movapd %xmm2,%xmm1 # move x2 for x4 |
| mulpd %xmm2,%xmm1 # x4 |
| mulpd %xmm2,%xmm5 # x2s3 |
| |
| addpd .Lcossinarray+0x40(%rip),%xmm3 # s5+x2s6 |
| movapd %xmm2,%xmm4 # move for x6 |
| mulpd %xmm2,%xmm3 # x2(s5+x2s6) |
| mulpd %xmm1,%xmm4 # x6 |
| addpd .Lcossinarray+0x10(%rip),%xmm5 # s2+x2s3 |
| mulpd %xmm2,%xmm5 # x2(s2+x2s3) |
| addpd .Lcossinarray+0x30(%rip),%xmm3 # s4 + x2(s5+x2s6) |
| |
| movhlps %xmm0,%xmm0 # high of x for x3 |
| mulpd %xmm4,%xmm3 # x6(s4 + x2(s5+x2s6)) |
| addpd .Lcossinarray(%rip),%xmm5 # s1+x2(s2+x2s3) |
| |
| movhlps %xmm2,%xmm4 # high of x2 for x3 |
| addpd %xmm5,%xmm3 # z |
| |
| mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 |
| mulsd %xmm0,%xmm4 # x3 # |
| movhlps %xmm3,%xmm5 # xmm5 = sin |
| # xmm3 = cos |
| |
| mulsd %xmm4,%xmm5 # sin*x3 # |
| movsd .L__real_3ff0000000000000(%rip),%xmm4 # 1.0 # |
| mulsd %xmm1,%xmm3 # cos*x4 # |
| |
| subsd %xmm2,%xmm4 # t=1.0-r # |
| |
| movhlps %xmm2,%xmm6 # move 0.5 * x2 for 0.5 * x2 * xx # |
| mulsd p_temp+8(%rsp),%xmm6 # 0.5 * x2 * xx # |
| subsd %xmm6,%xmm5 # sin - 0.5 * x2 *xx # |
| addsd p_temp+8(%rsp),%xmm5 # sin+xx # |
| |
| movlpd p_temp1(%rsp),%xmm6 # x |
| mulsd p_temp(%rsp),%xmm6 # x *xx # |
| |
| movsd .L__real_3ff0000000000000(%rip),%xmm1 # 1 # |
| subsd %xmm4,%xmm1 # 1 -t # |
| addsd %xmm5,%xmm0 # sin+x # |
| subsd %xmm2,%xmm1 # (1-t) - r # |
| subsd %xmm6,%xmm1 # ((1 + (-t)) - r) - x*xx # |
| addsd %xmm1,%xmm3 # cos+((1 + (-t)) - r) - x*xx # |
| addsd %xmm4,%xmm3 # cos+t # |
| |
| movapd p_sign(%rsp),%xmm2 # load sign |
| movlhps %xmm0,%xmm3 |
| movapd %xmm3,%xmm0 |
| xorpd %xmm2,%xmm0 |
| jmp .L__vrd2_sin_cleanup |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
#xmm0 = x, xmm2 = x2, xmm4 = xx, r8 = region, r9 = compare value for sincos path, p_sign = sign mask
| .align 16 |
| .Lvrd2_not_sin_cos_piby4: |
| cmp %r9,%r8 |
| jnz .Lvrd2_cos_piby4 |
| |
| .Lvrd2_cos_sin_piby4: |
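# Mirror of the previous case: lower lane sin kernel, upper lane cos
# kernel, evaluated from the interleaved .Lsincosarray table (see the
# C sketch above with the lanes swapped).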
| |
| movapd %xmm4,p_temp(%rsp) # Store rr |
movapd .Lsincosarray+0x50(%rip),%xmm3 # s6 (lo), c6 (hi)
mulpd %xmm2,%xmm3 # x2s6
movdqa .Lsincosarray+0x20(%rip),%xmm5 # s3 (lo), c3 (hi) (handle xmm5 retype)
| movapd %xmm2,%xmm1 # move x2 for x4 |
| mulpd %xmm2,%xmm1 # x4 |
| mulpd %xmm2,%xmm5 # x2s3 |
| addpd .Lsincosarray+0x40(%rip),%xmm3 # s5+x2s6 |
| movapd %xmm2,%xmm4 # move x2 for x6 |
| mulpd %xmm2,%xmm3 # x2(s5+x2s6) |
| mulpd %xmm1,%xmm4 # x6 |
| addpd .Lsincosarray+0x10(%rip),%xmm5 # s2+x2s3 |
| mulpd %xmm2,%xmm5 # x2(s2+x2s3) |
| addpd .Lsincosarray+0x30(%rip),%xmm3 # s4+x2(s5+x2s6) |
| |
| movhlps %xmm1,%xmm1 # move high x4 for cos |
| mulpd %xmm4,%xmm3 # x6(s4+x2(s5+x2s6)) |
| addpd .Lsincosarray(%rip),%xmm5 # s1+x2(s2+x2s3) |
| movapd %xmm2,%xmm4 # move low x2 for x3 |
| mulsd %xmm0,%xmm4 # get low x3 for sin term |
| mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 |
| |
| addpd %xmm3,%xmm5 # z |
| movhlps %xmm2,%xmm6 # move high r for cos |
| movhlps %xmm5,%xmm3 # xmm5 = sin |
| # xmm3 = cos |
| mulsd p_temp(%rsp),%xmm2 # 0.5 * x2 * xx |
| |
| mulsd %xmm4,%xmm5 # sin *x3 |
| movsd .L__real_3ff0000000000000(%rip),%xmm4 # 1.0 |
| mulsd %xmm1,%xmm3 # cos *x4 |
| subsd %xmm6,%xmm4 # t=1.0-r |
| |
| movhlps %xmm0,%xmm1 |
| subsd %xmm2,%xmm5 # sin - 0.5 * x2 *xx |
| |
| mulsd p_temp+8(%rsp),%xmm1 # x * xx |
| movsd .L__real_3ff0000000000000(%rip),%xmm2 # 1 |
| subsd %xmm4,%xmm2 # 1 - t |
| addsd p_temp(%rsp),%xmm5 # sin+xx |
| |
| subsd %xmm6,%xmm2 # (1-t) - r |
| subsd %xmm1,%xmm2 # ((1 + (-t)) - r) - x*xx |
| addsd %xmm5,%xmm0 # sin + x |
| addsd %xmm2,%xmm3 # cos+((1-t)-r - x*xx) |
| addsd %xmm4,%xmm3 # cos+t |
| |
| movapd p_sign(%rsp),%xmm5 # load sign |
| movlhps %xmm3,%xmm0 |
| xorpd %xmm5,%xmm0 |
| jmp .L__vrd2_sin_cleanup |
| |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| |
| .Lvrd2_cos_piby4: |
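# Both lanes fall in an odd region, so both use the cos kernel. In C,
# roughly (h = 0.5*x2, t = 1-h):
#
#   zc     = c1 + x2*(c2 + x2*(c3 + x2*(c4 + x2*(c5 + x2*c6))));
#   cos(x) ~ t + (x4*zc + (((1-t) - h) - x*xx));
#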
| mulpd %xmm0,%xmm4 # x*xx |
| movdqa .L__real_3fe0000000000000(%rip),%xmm5 # 0.5 (handle xmm5 retype) |
| movapd .Lcosarray+0x50(%rip),%xmm1 # c6 |
| movapd .Lcosarray+0x20(%rip),%xmm0 # c3 |
| mulpd %xmm2,%xmm5 # r = 0.5 *x2 |
| movapd %xmm2,%xmm3 # copy of x2 for x4 |
| movapd %xmm4,p_temp(%rsp) # store x*xx |
| mulpd %xmm2,%xmm1 # c6*x2 |
| mulpd %xmm2,%xmm0 # c3*x2 |
| subpd .L__real_3ff0000000000000(%rip),%xmm5 # -t=r-1.0 |
| mulpd %xmm2,%xmm3 # x4 |
| addpd .Lcosarray+0x40(%rip),%xmm1 # c5+x2c6 |
addpd .Lcosarray+0x10(%rip),%xmm0 # c2+x2c3
| addpd .L__real_3ff0000000000000(%rip),%xmm5 # 1 + (-t) |
| mulpd %xmm2,%xmm3 # x6 |
| mulpd %xmm2,%xmm1 # x2(c5+x2c6) |
mulpd %xmm2,%xmm0 # x2(c2+x2c3)
| movapd %xmm2,%xmm4 # copy of x2 |
| mulpd .L__real_3fe0000000000000(%rip),%xmm4 # r = 0.5 *x2 |
| addpd .Lcosarray+0x30(%rip),%xmm1 # c4 + x2(c5+x2c6) |
addpd .Lcosarray(%rip),%xmm0 # c1+x2(c2+x2c3)
| mulpd %xmm2,%xmm2 # x4 |
| subpd %xmm4,%xmm5 # (1 + (-t)) - r |
| mulpd %xmm3,%xmm1 # x6(c4 + x2(c5+x2c6)) |
| addpd %xmm1,%xmm0 # zc |
| subpd .L__real_3ff0000000000000(%rip),%xmm4 # -t=r-1.0 |
| subpd p_temp(%rsp),%xmm5 # ((1 + (-t)) - r) - x*xx |
| mulpd %xmm2,%xmm0 # x4 * zc |
| addpd %xmm5,%xmm0 # x4 * zc + ((1 + (-t)) - r -x*xx) |
| subpd %xmm4,%xmm0 # result - (-t) |
| xorpd p_sign(%rsp),%xmm0 # xor with sign |
| jmp .L__vrd2_sin_cleanup |
| |
| |
| |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Llower_or_both_arg_gt_5e5: |
| cmp %r10,%rcx #is upper arg >= 5e5 |
| jae .Lboth_arg_gt_5e5 |
| |
| .Llower_arg_gt_5e5: |
| # Upper Arg is < 5e5, Lower arg is >= 5e5 |
| |
| movlpd %xmm0,r(%rsp) #Save lower fp arg for remainder_piby2 call |
| |
| movhlps %xmm0,%xmm0 #Needed since we want to work on upper arg |
| movhlps %xmm2,%xmm2 |
| movhlps %xmm4,%xmm4 |
| |
| # Work on Upper arg |
# xmm0, xmm2, xmm4 = x, xmm5 = 0.5
# Lower arg might contain nan/inf; to avoid exceptions, use only scalar
# instructions on the upper arg, which has been moved to the lower half of the fp regs
| |
| #If upper Arg is <=piby4 |
| cmp %rdx,%rcx # is upper arg > piby4 |
| ja 0f |
| |
| mov $0,%ecx # region = 0 |
| mov %ecx,region+4(%rsp) # store upper region |
movlpd %xmm0,r+8(%rsp) # store upper r (unsigned; sign is applied later)
| xorpd %xmm4,%xmm4 # rr = 0 |
| movlpd %xmm4,rr+8(%rsp) # store upper rr |
| jmp .Lcheck_lower_arg |
| |
| #If upper Arg is > piby4 |
| .align 16 |
| 0: |
| mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi |
| addsd %xmm5,%xmm2 # xmm2 = npi2=(x*twobypi+0.5) |
| movsd .L__real_3ff921fb54400000(%rip),%xmm3 # xmm3 = piby2_1 |
cvttsd2si %xmm2,%ecx # ecx = npi2 truncated to int
movsd .L__real_3dd0b4611a600000(%rip),%xmm1 # xmm1 = piby2_2
cvtsi2sd %ecx,%xmm2 # xmm2 = npi2 back to double
| |
| #/* Subtract the multiple from x to get an extra-precision remainder */ |
| #rhead = x - npi2 * piby2_1; |
| mulsd %xmm2,%xmm3 # npi2 * piby2_1 |
| subsd %xmm3,%xmm4 # xmm4 = rhead =(x-npi2*piby2_1) |
| movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 =piby2_2tail |
| |
| #t = rhead; |
| movsd %xmm4,%xmm5 # xmm5 = t = rhead |
| |
| #rtail = npi2 * piby2_2; |
| mulsd %xmm2,%xmm1 # xmm1 =rtail=(npi2*piby2_2) |
| |
| #rhead = t - rtail |
| subsd %xmm1,%xmm4 # xmm4 =rhead=(t-rtail) |
| |
| #rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulsd %xmm2,%xmm6 # npi2 * piby2_2tail |
| subsd %xmm4,%xmm5 # t-rhead |
| subsd %xmm5,%xmm1 # (rtail-(t-rhead)) |
| addsd %xmm6,%xmm1 # rtail=npi2*piby2_2tail+(rtail-(t-rhead)); |
| |
| #r = rhead - rtail |
| #rr = (rhead-r) -rtail |
| mov %ecx,region+4(%rsp) # store upper region |
| movsd %xmm4,%xmm0 |
| subsd %xmm1,%xmm0 # xmm0 = r=(rhead-rtail) |
| subsd %xmm0,%xmm4 # rr=rhead-r |
| subsd %xmm1,%xmm4 # xmm4 = rr=((rhead-r) -rtail) |
| movlpd %xmm0,r+8(%rsp) # store upper r |
| movlpd %xmm4,rr+8(%rsp) # store upper rr |
| |
| #If lower Arg is > 5e5 |
| #Note that volatiles will be trashed by the call |
| #We do not care since this is the last check |
| #We will construct r, rr, region and sign |
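#As used here, __amd_remainder_piby2 takes the argument in xmm0 and
#three pointers in rdi/rsi/rdx; the apparent C prototype is:
#
#  void __amd_remainder_piby2(double x, double *r, double *rr,
#                             int *region);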
| .align 16 |
| .Lcheck_lower_arg: |
| mov $0x07ff0000000000000,%r9 # is lower arg nan/inf |
| mov %r9,%r10 |
| and %rax,%r10 |
| cmp %r9,%r10 |
| jz .L__vrd2_cos_lower_naninf |
| |
| mov %r11,p_temp(%rsp) #Save Sign |
| |
| |
| lea region(%rsp),%rdx # lower arg is **NOT** nan/inf |
| lea rr(%rsp),%rsi |
| lea r(%rsp),%rdi |
| movlpd r(%rsp),%xmm0 #Restore lower fp arg for remainder_piby2 call |
| |
| call __amd_remainder_piby2@PLT |
| |
| mov p_temp(%rsp),%r11 #Restore Sign |
| |
| jmp .L__vrd2_sin_reconstruct |
| |
| .L__vrd2_cos_lower_naninf: |
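# ORing in 0x0008000000000000 sets the quiet bit: an SNaN input
# becomes a QNaN and +/-inf becomes the default QNaN, which then
# propagates through the kernel as the C99 result for sin(nan/inf)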
| mov r(%rsp),%rax |
| mov $0x00008000000000000,%r9 |
| or %r9,%rax |
| mov %rax,r(%rsp) # r = x | 0x0008000000000000 |
| xor %r10,%r10 |
| mov %r10,rr(%rsp) # rr = 0 |
| mov %r10d,region(%rsp) # region =0 |
| |
| jmp .L__vrd2_sin_reconstruct |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Lupper_arg_gt_5e5: |
| # Upper Arg is >= 5e5, Lower arg is < 5e5 |
| movhpd %xmm0,r+8(%rsp) #Save upper fp arg for remainder_piby2 call |
movlhps %xmm0,%xmm0 #Not needed since we work on the lower arg, but done to be safe, to avoid exceptions due to nan/inf, and to mirror the lower_arg_gt_5e5 case
| movlhps %xmm2,%xmm2 |
| movlhps %xmm4,%xmm4 |
| |
| # Work on Lower arg |
# xmm0, xmm2, xmm4 = x, xmm5 = 0.5
| # Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg |
| |
| #If lower Arg is <=piby4 |
cmp %rdx,%rax # is lower arg > piby4
ja 0f

mov $0,%eax # region = 0
mov %eax,region(%rsp) # store lower region
movlpd %xmm0,r(%rsp) # store lower r
xorpd %xmm4,%xmm4 # rr = 0
movlpd %xmm4,rr(%rsp) # store lower rr
| jmp .Lcheck_upper_arg |
| |
| .align 16 |
| 0: |
#If lower Arg is > piby4
| mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi |
| addsd %xmm5,%xmm2 # xmm2 = npi2=(x*twobypi+0.5) |
| movsd .L__real_3ff921fb54400000(%rip),%xmm3 # xmm3 = piby2_1 |
cvttsd2si %xmm2,%eax # eax = npi2 truncated to int
movsd .L__real_3dd0b4611a600000(%rip),%xmm1 # xmm1 = piby2_2
cvtsi2sd %eax,%xmm2 # xmm2 = npi2 back to double
| |
| #/* Subtract the multiple from x to get an extra-precision remainder */ |
| #rhead = x - npi2 * piby2_1; |
| mulsd %xmm2,%xmm3 # npi2 * piby2_1 |
| subsd %xmm3,%xmm4 # xmm4 = rhead =(x-npi2*piby2_1) |
| movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 =piby2_2tail |
| |
| #t = rhead; |
| movsd %xmm4,%xmm5 # xmm5 = t = rhead |
| |
| #rtail = npi2 * piby2_2; |
| mulsd %xmm2,%xmm1 # xmm1 =rtail=(npi2*piby2_2) |
| |
| #rhead = t - rtail |
| subsd %xmm1,%xmm4 # xmm4 =rhead=(t-rtail) |
| |
| #rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| mulsd %xmm2,%xmm6 # npi2 * piby2_2tail |
| subsd %xmm4,%xmm5 # t-rhead |
| subsd %xmm5,%xmm1 # (rtail-(t-rhead)) |
| addsd %xmm6,%xmm1 # rtail=npi2*piby2_2tail+(rtail-(t-rhead)); |
| |
| #r = rhead - rtail |
| #rr = (rhead-r) -rtail |
| mov %eax,region(%rsp) # store lower region |
| movsd %xmm4,%xmm0 |
| subsd %xmm1,%xmm0 # xmm0 = r=(rhead-rtail) |
| subsd %xmm0,%xmm4 # rr=rhead-r |
| subsd %xmm1,%xmm4 # xmm4 = rr=((rhead-r) -rtail) |
| movlpd %xmm0,r(%rsp) # store lower r |
| movlpd %xmm4,rr(%rsp) # store lower rr |
| |
| #Note that volatiles will be trashed by the call |
| #We do not care since this is the last check |
| #We will construct r, rr, region and sign |
| .align 16 |
| .Lcheck_upper_arg: |
| mov $0x07ff0000000000000,%r9 # is upper arg nan/inf |
| mov %r9,%r10 |
| and %rcx,%r10 |
| cmp %r9,%r10 |
| jz .L__vrd2_cos_upper_naninf |
| |
| mov %r11,p_temp(%rsp) #Save Sign |
| |
| |
| lea region+4(%rsp),%rdx # upper arg is **NOT** nan/inf |
| lea rr+8(%rsp),%rsi |
| lea r+8(%rsp),%rdi |
| movlpd r+8(%rsp),%xmm0 #Restore upper fp arg for remainder_piby2 call |
| call __amd_remainder_piby2@PLT |
| |
| mov p_temp(%rsp),%r11 #Restore Sign |
| |
| jmp .L__vrd2_sin_reconstruct |
| |
| .L__vrd2_cos_upper_naninf: |
| mov r+8(%rsp),%rcx # upper arg is nan/inf |
| mov $0x00008000000000000,%r9 |
| or %r9,%rcx |
| mov %rcx,r+8(%rsp) # r = x | 0x0008000000000000 |
| xor %r10,%r10 |
| mov %r10,rr+8(%rsp) # rr = 0 |
| mov %r10d,region+4(%rsp) # region =0 |
| jmp .L__vrd2_sin_reconstruct |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .Lboth_arg_gt_5e5: |
| #Upper Arg is >= 5e5, Lower arg is >= 5e5 |
| |
| movhpd %xmm0,p_temp2(%rsp) #Save upper fp arg for remainder_piby2 call |
| |
| mov $0x07ff0000000000000,%r9 #is lower arg nan/inf |
| mov %r9,%r10 |
| and %rax,%r10 |
| cmp %r9,%r10 |
| jz .L__vrd2_cos_lower_naninf_of_both_gt_5e5 |
| |
| mov %rcx,p_temp(%rsp) #Save upper arg |
| mov %r11,p_temp1(%rsp) #Save Sign |
| |
| lea region(%rsp),%rdx #lower arg is **NOT** nan/inf |
| lea rr(%rsp),%rsi |
| lea r(%rsp),%rdi |
| call __amd_remainder_piby2@PLT |
| |
| mov p_temp1(%rsp),%r11 #Restore Sign |
| mov p_temp(%rsp),%rcx #Restore upper arg |
| jmp 0f |
| |
| .L__vrd2_cos_lower_naninf_of_both_gt_5e5: #lower arg is nan/inf |
| movd %xmm0,%rax |
| mov $0x00008000000000000,%r9 |
| or %r9,%rax |
| mov %rax,r(%rsp) #r = x | 0x0008000000000000 |
| xor %r10,%r10 |
| mov %r10,rr(%rsp) #rr = 0 |
| mov %r10d,region(%rsp) #region = 0 |
| |
| .align 16 |
| 0: |
| mov $0x07ff0000000000000,%r9 #is upper arg nan/inf |
| mov %r9,%r10 |
| and %rcx,%r10 |
| cmp %r9,%r10 |
| jz .L__vrd2_cos_upper_naninf_of_both_gt_5e5 |
| |
| |
| mov %r11,p_temp(%rsp) #Save Sign |
| |
| lea region+4(%rsp),%rdx #upper arg is **NOT** nan/inf |
| lea rr+8(%rsp),%rsi |
| lea r+8(%rsp),%rdi |
| movlpd p_temp2(%rsp),%xmm0 #Restore upper fp arg for remainder_piby2 call |
| call __amd_remainder_piby2@PLT |
| |
| mov p_temp(%rsp),%r11 #Restore Sign |
| |
| jmp 0f |
| |
| .L__vrd2_cos_upper_naninf_of_both_gt_5e5: |
| mov p_temp2(%rsp),%rcx #upper arg is nan/inf |
| mov $0x00008000000000000,%r9 |
| or %r9,%rcx |
| mov %rcx,r+8(%rsp) #r = x | 0x0008000000000000 |
| xor %r10,%r10 |
| mov %r10,rr+8(%rsp) #rr = 0 |
| mov %r10d,region+4(%rsp) #region = 0 |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| 0: |
| .L__vrd2_sin_reconstruct: |
#Construct xmm0 = x, xmm2 = x2, xmm4 = xx, r8 = region, p_sign = sign mask (r11 = input signs)
| movapd r(%rsp),%xmm0 #x |
| movapd %xmm0,%xmm2 #move for x2 |
| mulpd %xmm2,%xmm2 #x2 |
| movapd rr(%rsp),%xmm4 #xx |
| |
| mov region(%rsp),%r8 |
| mov .L__reald_one_zero(%rip),%r9 #compare value for cossin path |
| mov %r8,%r10 |
| and .L__reald_one_one(%rip),%r8 #odd/even region for cos/sin |
| |
| shr $1,%r10 #~AB+A~B, A is sign and B is upper bit of region |
| mov %r10,%rcx |
| not %r11 #ADDED TO CHANGE THE LOGIC |
| and %r11,%r10 |
| not %rcx |
| not %r11 |
| and %r11,%rcx |
| or %rcx,%r10 |
| and .L__reald_one_one(%rip),%r10 #(~AB+A~B)&1 |
| |
| mov %r10,%r11 |
| and %r9,%r11 #mask out the lower sign bit leaving the upper sign bit |
| shl $63,%r10 #shift lower sign bit left by 63 bits |
| shl $31,%r11 #shift upper sign bit left by 31 bits |
| mov %r10,p_sign(%rsp) #write out lower sign bit |
| mov %r11,p_sign+8(%rsp) #write out upper sign bit |
| |
| jmp .L__vrd2_sin_approximate |
| #ENDMAIN |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .align 16 |
| .L__vrd2_sin_cleanup: |
| add $0x138,%rsp |
| ret |
| |