# blob: a5fb8d451ca9bf0219899f09f74e53b6c6c7cc64 [file] [log] [blame]
#
# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
#
# This file is part of libacml_mv.
#
# libacml_mv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# libacml_mv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with libacml_mv. If not, see
# <http://www.gnu.org/licenses/>.
#
#
#
# vrdasin.s
#
# An array implementation of the sin libm function.
#
# Prototype:
#
# void vrda_sin(int n, double *x, double *y);
#
#Computes Sine of x for an array of input values.
#Places the results into the supplied y array.
#Does not perform error checking.
#Denormal inputs may produce unexpected results
#Author: Harsha Jagasia
#Email: harsha.jagasia@amd.com
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif
.data
.align 16
# Double-precision constants; each is duplicated so it fills one 128-bit SSE lane pair.
.L__real_7fffffffffffffff: .quad 0x07fffffffffffffff #Sign bit zero (abs-value mask)
.quad 0x07fffffffffffffff
.L__real_3ff0000000000000: .quad 0x03ff0000000000000 # 1.0
.quad 0x03ff0000000000000
.L__real_v2p__27: .quad 0x03e40000000000000 # 2p-27
.quad 0x03e40000000000000
.L__real_3fe0000000000000: .quad 0x03fe0000000000000 # 0.5
.quad 0x03fe0000000000000
.L__real_3fc5555555555555: .quad 0x03fc5555555555555 # 0.166666666666
.quad 0x03fc5555555555555
.L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883 # twobypi
.quad 0x03fe45f306dc9c883
.L__real_3ff921fb54400000: .quad 0x03ff921fb54400000 # piby2_1
.quad 0x03ff921fb54400000
.L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331 # piby2_1tail
.quad 0x03dd0b4611a626331
.L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000 # piby2_2
.quad 0x03dd0b4611a600000
.L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073 # piby2_2tail
.quad 0x03ba3198a2e037073
.L__real_fffffffff8000000: .quad 0x0fffffffff8000000 # mask for stripping head and tail
.quad 0x0fffffffff8000000
.L__real_8000000000000000: .quad 0x08000000000000000 # -0 or signbit
.quad 0x08000000000000000
# Packed 32-bit integer constants (low 128-bit lane only; upper quad is 0).
.L__reald_one_one: .quad 0x00000000100000001 # 1 in each 32-bit half
.quad 0
.L__reald_two_two: .quad 0x00000000200000002 # 2 in each 32-bit half
.quad 0
.L__reald_one_zero: .quad 0x00000000100000000 # sin_cos_filter: 1 in upper 32 bits
.quad 0
.L__reald_zero_one: .quad 0x00000000000000001 # 1 in lower 32 bits
.quad 0
.L__reald_two_zero: .quad 0x00000000200000000 # 2 in upper 32 bits
.quad 0
# Packed 64-bit integer constants.
.L__realq_one_one: .quad 0x00000000000000001 # 1 in each 64-bit half
.quad 0x00000000000000001 #
.L__realq_two_two: .quad 0x00000000000000002 # 2 in each 64-bit half
.quad 0x00000000000000002 #
.L__real_1_x_mask: .quad 0x0ffffffffffffffff # all-ones mask (pass x through)
.quad 0x03ff0000000000000 # 1.0
.L__real_zero: .quad 0x00000000000000000 # 0
.quad 0x00000000000000000 #
.L__real_one: .quad 0x00000000000000001 # integer 1
.quad 0x00000000000000001 #
# Minimax polynomial coefficients for cos(x) about 0 (c1..c6), duplicated per lane.
.Lcosarray:
.quad 0x03fa5555555555555 # 0.0416667 c1
.quad 0x03fa5555555555555
.quad 0x0bf56c16c16c16967 # -0.00138889 c2
.quad 0x0bf56c16c16c16967
.quad 0x03efa01a019f4ec90 # 2.48016e-005 c3
.quad 0x03efa01a019f4ec90
.quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4
.quad 0x0be927e4fa17f65f6
.quad 0x03e21eeb69037ab78 # 2.08761e-009 c5
.quad 0x03e21eeb69037ab78
.quad 0x0bda907db46cc5e42 # -1.13826e-011 c6
.quad 0x0bda907db46cc5e42
# Minimax polynomial coefficients for sin(x) about 0 (s1..s6), duplicated per lane.
.Lsinarray:
.quad 0x0bfc5555555555555 # -0.166667 s1
.quad 0x0bfc5555555555555
.quad 0x03f81111111110bb3 # 0.00833333 s2
.quad 0x03f81111111110bb3
.quad 0x0bf2a01a019e83e5c # -0.000198413 s3
.quad 0x0bf2a01a019e83e5c
.quad 0x03ec71de3796cde01 # 2.75573e-006 s4
.quad 0x03ec71de3796cde01
.quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
.quad 0x0be5ae600b42fdfa7
.quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6
.quad 0x03de5e0b2f9a43bb8
# Interleaved coefficients: sin in the low lane, cos in the high lane.
.Lsincosarray:
.quad 0x0bfc5555555555555 # -0.166667 s1
.quad 0x03fa5555555555555 # 0.0416667 c1
.quad 0x03f81111111110bb3 # 0.00833333 s2
.quad 0x0bf56c16c16c16967 # -0.00138889 c2
.quad 0x0bf2a01a019e83e5c # -0.000198413 s3
.quad 0x03efa01a019f4ec90 # 2.48016e-005 c3
.quad 0x03ec71de3796cde01 # 2.75573e-006 s4
.quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4
.quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
.quad 0x03e21eeb69037ab78 # 2.08761e-009 c5
.quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6
.quad 0x0bda907db46cc5e42 # -1.13826e-011 c6
# Interleaved coefficients: cos in the low lane, sin in the high lane.
.Lcossinarray:
.quad 0x03fa5555555555555 # 0.0416667 c1
.quad 0x0bfc5555555555555 # -0.166667 s1
.quad 0x0bf56c16c16c16967 # -0.00138889 c2
.quad 0x03f81111111110bb3 # 0.00833333 s2
.quad 0x03efa01a019f4ec90 # 2.48016e-005 c3
.quad 0x0bf2a01a019e83e5c # -0.000198413 s3
.quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4
.quad 0x03ec71de3796cde01 # 2.75573e-006 s4
.quad 0x03e21eeb69037ab78 # 2.08761e-009 c5
.quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
.quad 0x0bda907db46cc5e42 # -1.13826e-011 c6
.quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6
# Dispatch table: 16 entries selected by a 4-bit code built from the four
# per-element region parities; each target computes the sin/cos polynomial
# combination appropriate for its pair of quadrants.
.Levensin_oddcos_tbl:
.quad .Lsinsin_sinsin_piby4 # 0
.quad .Lsinsin_sincos_piby4 # 1
.quad .Lsinsin_cossin_piby4 # 2
.quad .Lsinsin_coscos_piby4 # 3
.quad .Lsincos_sinsin_piby4 # 4
.quad .Lsincos_sincos_piby4 # 5
.quad .Lsincos_cossin_piby4 # 6
.quad .Lsincos_coscos_piby4 # 7
.quad .Lcossin_sinsin_piby4 # 8
.quad .Lcossin_sincos_piby4 # 9
.quad .Lcossin_cossin_piby4 # 10
.quad .Lcossin_coscos_piby4 # 11
.quad .Lcoscos_sinsin_piby4 # 12
.quad .Lcoscos_sincos_piby4 # 13
.quad .Lcoscos_cossin_piby4 # 14
.quad .Lcoscos_coscos_piby4 # 15
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Fortran-callable entry points: vrda_sin_ / vrda_sin__ are weak aliases for
# the by-reference wrapper below.
.weak vrda_sin_
.set vrda_sin_,__vrda_sin__
.weak vrda_sin__
.set vrda_sin__,__vrda_sin__
.text
.align 16
.p2align 4,,15
#x/* a FORTRAN subroutine implementation of array sin
#** VRDA_SIN(N,X,Y)
# C equivalent*/
#void vrda_sin__(int * n, double *x, double *y)
#{
# vrda_sin(*n,x,y);
#}
.globl __vrda_sin__
.type __vrda_sin__,@function
__vrda_sin__:
mov (%rdi),%edi # dereference the Fortran by-reference n (int *n -> n)
# NOTE: no ret here -- execution deliberately falls through into vrda_sin below.
.align 16
.p2align 4,,15
# define local variable storage offsets (relative to %rsp after the 0x228-byte
# frame allocation in vrda_sin)
.equ p_temp, 0x00 # temporary for get/put bits operation
.equ p_temp1, 0x10 # temporary for get/put bits operation
.equ save_xmm6, 0x20 # spill slot for xmm6
.equ save_xmm7, 0x30 # spill slot for xmm7
.equ save_xmm8, 0x40 # spill slot for xmm8
.equ save_xmm9, 0x50 # spill slot for xmm9
.equ save_xmm10, 0x60 # spill slot for xmm10
.equ save_xmm11, 0x70 # spill slot for xmm11
.equ save_xmm12, 0x80 # spill slot for xmm12
.equ save_xmm13, 0x90 # spill slot for xmm13
.equ save_xmm14, 0x0A0 # spill slot for xmm14
.equ save_xmm15, 0x0B0 # spill slot for xmm15
.equ r, 0x0C0 # r result buffer for __amd_remainder_piby2 (first pair)
.equ rr, 0x0D0 # rr (tail) result buffer for __amd_remainder_piby2 (first pair)
.equ region, 0x0E0 # region result buffer for __amd_remainder_piby2 (first pair)
.equ r1, 0x0F0 # r result buffer for __amd_remainder_piby2 (second pair)
.equ rr1, 0x0100 # rr (tail) result buffer for __amd_remainder_piby2 (second pair)
.equ region1, 0x0110 # region result buffer for __amd_remainder_piby2 (second pair)
.equ p_temp2, 0x0120 # temporary for get/put bits operation
.equ p_temp3, 0x0130 # temporary for get/put bits operation
.equ p_temp4, 0x0140 # temporary for get/put bits operation
.equ p_temp5, 0x0150 # temporary for get/put bits operation
.equ p_original, 0x0160 # original x (first pair)
.equ p_mask, 0x0170 # mask bits (first pair)
.equ p_sign, 0x0180 # computed sign bits (first pair)
.equ p_original1, 0x0190 # original x (second pair)
.equ p_mask1, 0x01A0 # mask bits (second pair)
.equ p_sign1, 0x01B0 # computed sign bits (second pair)
.equ save_r12, 0x01C0 # callee-saved r12 spill
.equ save_r13, 0x01D0 # callee-saved r13 spill
.equ save_xa, 0x01E0 #qword: current x_array pointer
.equ save_ya, 0x01F0 #qword: current y_array pointer
.equ save_nv, 0x0200 #qword: number of leftover (non-vector) values
.equ p_iter, 0x0210 # qword storage for number of loop iterations
.globl vrda_sin
.type vrda_sin,@function
vrda_sin:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# void vrda_sin(int n, double *x, double *y)
# parameters are passed in per the System V AMD64 ABI as:
# rdi - int n        (number of elements)
# rsi - double *x    (input array)
# rdx - double *y    (output array)
sub $0x228,%rsp # allocate frame (keeps rsp 16-byte aligned for the calls below)
mov %r12,save_r12(%rsp) # save r12
mov %r13,save_r13(%rsp) # save r13
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#START PROCESS INPUT
# save the arguments
mov %rsi, save_xa(%rsp) # save x_array pointer
mov %rdx, save_ya(%rsp) # save y_array pointer
#ifdef INTEGER64
mov %rdi,%rax
#else
mov %edi,%eax # zero-extend 32-bit n into rax
mov %rax,%rdi
#endif
mov %rdi,save_nv(%rsp) # save number of values
# see if too few values to call the main loop
shr $2,%rax # get number of iterations (4 doubles processed per pass)
jz .L__vrda_cleanup # jump if only single calls
# prepare the iteration counts
mov %rax,p_iter(%rsp) # save number of iterations
shl $2,%rax
sub %rax,%rdi # compute number of extra single calls (n mod 4)
mov %rdi,save_nv(%rsp) # save number of left over values
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#START LOOP
.align 16
.L__vrda_top:
# build the input _m128d
movapd .L__real_7fffffffffffffff(%rip),%xmm2
mov save_xa(%rsp),%rsi # get x_array pointer
movlpd (%rsi),%xmm0
movhpd 8(%rsi),%xmm0
mov (%rsi),%rax
mov 8(%rsi),%rcx
movdqa %xmm0,%xmm6
prefetch 64(%rsi)
add $32,%rsi
mov %rsi,save_xa(%rsp) # save x_array pointer
movlpd -16(%rsi), %xmm1
movhpd -8(%rsi), %xmm1
mov -16(%rsi), %r8
mov -8(%rsi), %r9
movdqa %xmm1,%xmm7
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#STARTMAIN
andpd %xmm2,%xmm0 #Unsign
andpd %xmm2,%xmm1 #Unsign
and .L__real_7fffffffffffffff(%rip), %rax
and .L__real_7fffffffffffffff(%rip), %rcx
and .L__real_7fffffffffffffff(%rip), %r8
and .L__real_7fffffffffffffff(%rip), %r9
movdqa %xmm0,%xmm12
movdqa %xmm1,%xmm13
pcmpgtd %xmm6,%xmm12
pcmpgtd %xmm7,%xmm13
movdqa %xmm12,%xmm6
movdqa %xmm13,%xmm7
psrldq $4,%xmm12
psrldq $4,%xmm13
psrldq $8,%xmm6
psrldq $8,%xmm7
mov $0x3FE921FB54442D18,%rdx #piby4 +
mov $0x411E848000000000,%r10 #5e5 +
movapd .L__real_3fe0000000000000(%rip),%xmm4 #0.5 for later use +
por %xmm6,%xmm12
por %xmm7,%xmm13
movd %xmm12,%r12 #Move Sign to gpr **
movd %xmm13,%r13 #Move Sign to gpr **
movapd %xmm0,%xmm2 #x0
movapd %xmm1,%xmm3 #x1
movapd %xmm0,%xmm6 #x0
movapd %xmm1,%xmm7 #x1
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# xmm2 = x, xmm4 =0.5/t, xmm6 =x
# xmm3 = x, xmm5 =0.5/t, xmm7 =x
.align 16
.Leither_or_both_arg_gt_than_piby4:
cmp %r10,%rax
jae .Lfirst_or_next3_arg_gt_5e5
cmp %r10,%rcx
jae .Lsecond_or_next2_arg_gt_5e5
cmp %r10,%r8
jae .Lthird_or_fourth_arg_gt_5e5
cmp %r10,%r9
jae .Lfourth_arg_gt_5e5
# /* Find out what multiple of piby2 */
# npi2 = (int)(x * twobypi + 0.5);
movapd .L__real_3fe45f306dc9c883(%rip),%xmm0
mulpd %xmm0,%xmm2 # * twobypi
mulpd %xmm0,%xmm3 # * twobypi
addpd %xmm4,%xmm2 # +0.5, npi2
addpd %xmm4,%xmm3 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm0 # piby2_1
movapd .L__real_3ff921fb54400000(%rip),%xmm1 # piby2_1
cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers
cvttpd2dq %xmm3,%xmm5 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2
movapd .L__real_3dd0b4611a600000(%rip),%xmm9 # piby2_2
cvtdq2pd %xmm4,%xmm2 # and back to double.
cvtdq2pd %xmm5,%xmm3 # and back to double.
# /* Subtract the multiple from x to get an extra-precision remainder */
movd %xmm4,%r8 # Region
movd %xmm5,%r9 # Region
mov .L__reald_one_zero(%rip),%rdx #compare value for cossin path
mov %r8,%r10
mov %r9,%r11
# rhead = x - npi2 * piby2_1;
mulpd %xmm2,%xmm0 # npi2 * piby2_1;
mulpd %xmm3,%xmm1 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm2,%xmm8 # rtail
mulpd %xmm3,%xmm9 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm0,%xmm6 # rhead = x - npi2 * piby2_1;
subpd %xmm1,%xmm7 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm6,%xmm0 # t
movapd %xmm7,%xmm1 # t
# rhead = t - rtail;
subpd %xmm8,%xmm0 # rhead
subpd %xmm9,%xmm1 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail
mulpd .L__real_3ba3198a2e037073(%rip),%xmm3 # npi2 * piby2_2tail
subpd %xmm0,%xmm6 # t-rhead
subpd %xmm1,%xmm7 # t-rhead
subpd %xmm6,%xmm8 # - ((t - rhead) - rtail)
subpd %xmm7,%xmm9 # - ((t - rhead) - rtail)
addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
addpd %xmm3,%xmm9 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# xmm4 = npi2 (int), xmm0 =rhead, xmm8 =rtail
# xmm5 = npi2 (int), xmm1 =rhead, xmm9 =rtail
and .L__reald_one_one(%rip),%r8 #odd/even region for cos/sin
and .L__reald_one_one(%rip),%r9 #odd/even region for cos/sin
shr $1,%r10 #~AB+A~B, A is sign and B is upper bit of region
shr $1,%r11 #~AB+A~B, A is sign and B is upper bit of region
mov %r10,%rax
mov %r11,%rcx
not %r12 #ADDED TO CHANGE THE LOGIC
not %r13 #ADDED TO CHANGE THE LOGIC
and %r12,%r10
and %r13,%r11
not %rax
not %rcx
not %r12
not %r13
and %r12,%rax
and %r13,%rcx
or %rax,%r10
or %rcx,%r11
and .L__reald_one_one(%rip),%r10 #(~AB+A~B)&1
and .L__reald_one_one(%rip),%r11 #(~AB+A~B)&1
mov %r10,%r12
mov %r11,%r13
and %rdx,%r12 #mask out the lower sign bit leaving the upper sign bit
and %rdx,%r13 #mask out the lower sign bit leaving the upper sign bit
shl $63,%r10 #shift lower sign bit left by 63 bits
shl $63,%r11 #shift lower sign bit left by 63 bits
shl $31,%r12 #shift upper sign bit left by 31 bits
shl $31,%r13 #shift upper sign bit left by 31 bits
mov %r10,p_sign(%rsp) #write out lower sign bit
mov %r12,p_sign+8(%rsp) #write out upper sign bit
mov %r11,p_sign1(%rsp) #write out lower sign bit
mov %r13,p_sign1+8(%rsp) #write out upper sign bit
# GET_BITS_DP64(rhead-rtail, uy); ; originally only rhead
# xmm4 = Sign, xmm0 =rhead, xmm8 =rtail
# xmm5 = Sign, xmm1 =rhead, xmm9 =rtail
movapd %xmm0,%xmm6 # rhead
movapd %xmm1,%xmm7 # rhead
subpd %xmm8,%xmm0 # r = rhead - rtail
subpd %xmm9,%xmm1 # r = rhead - rtail
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# xmm4 = Sign, xmm0 = r, xmm6 =rhead, xmm8 =rtail
# xmm5 = Sign, xmm1 = r, xmm7 =rhead, xmm9 =rtail
subpd %xmm0,%xmm6 #rr=rhead-r
subpd %xmm1,%xmm7 #rr=rhead-r
mov %r8,%rax
mov %r9,%rcx
movapd %xmm0,%xmm2 # move r for r2
movapd %xmm1,%xmm3 # move r for r2
mulpd %xmm0,%xmm2 # r2
mulpd %xmm1,%xmm3 # r2
subpd %xmm8,%xmm6 #rr=(rhead-r) -rtail
subpd %xmm9,%xmm7 #rr=(rhead-r) -rtail
and .L__reald_zero_one(%rip),%rax
and .L__reald_zero_one(%rip),%rcx
shr $31,%r8
shr $31,%r9
or %r8,%rax
or %r9,%rcx
shl $2,%rcx
or %rcx,%rax
#DEBUG
# jmp .Lfinal_check
#DEBUG
leaq .Levensin_oddcos_tbl(%rip),%rsi
jmp *(%rsi,%rax,8) #Jmp table for cos/sin calculation based on even/odd region
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lfirst_or_next3_arg_gt_5e5:
# %rcx,,%rax r8, r9
cmp %r10,%rcx #is upper arg >= 5e5
jae .Lboth_arg_gt_5e5
.Llower_arg_gt_5e5:
# Upper Arg is < 5e5, Lower arg is >= 5e5
# %xmm2,,%xmm0 xmm6 = x, xmm4 = 0.5
# Be sure not to use %xmm3,%xmm1 and xmm7
# Use %xmm8,,%xmm5 xmm10, xmm12
# %xmm11,,%xmm9 xmm13
movlpd %xmm0,r(%rsp) #Save lower fp arg for remainder_piby2 call
movhlps %xmm0,%xmm0 #Needed since we want to work on upper arg
movhlps %xmm2,%xmm2
movhlps %xmm6,%xmm6
# Work on Upper arg
# Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi
addsd %xmm4,%xmm2 # xmm2 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm8 # xmm8 = piby2_1
cvttsd2si %xmm2,%ecx # ecx = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm10 # xmm10 = piby2_2
cvtsi2sd %ecx,%xmm2 # xmm2 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm2,%xmm8 # npi2 * piby2_1
subsd %xmm8,%xmm6 # xmm6 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm12 # xmm12 =piby2_2tail
#t = rhead;
movsd %xmm6,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm2,%xmm10 # xmm1 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm10,%xmm6 # xmm6 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm2,%xmm12 # npi2 * piby2_2tail
subsd %xmm6,%xmm5 # t-rhead
subsd %xmm5,%xmm10 # (rtail-(t-rhead))
addsd %xmm12,%xmm10 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %ecx,region+4(%rsp) # store upper region
movsd %xmm6,%xmm0
subsd %xmm10,%xmm0 # xmm0 = r=(rhead-rtail)
subsd %xmm0,%xmm6 # rr=rhead-r
subsd %xmm10,%xmm6 # xmm6 = rr=((rhead-r) -rtail)
movlpd %xmm0,r+8(%rsp) # store upper r
movlpd %xmm6,rr+8(%rsp) # store upper rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#Note that volatiles will be trashed by the call
#We will construct r, rr, region and sign
# Work on Lower arg
mov $0x07ff0000000000000,%r11 # is lower arg nan/inf
mov %r11,%r10
and %rax,%r10
cmp %r11,%r10
jz .L__vrd4_sin_lower_naninf
mov %r8,p_temp(%rsp)
mov %r9,p_temp2(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region(%rsp),%rdx # lower arg is **NOT** nan/inf
lea rr(%rsp),%rsi
lea r(%rsp),%rdi
movlpd r(%rsp),%xmm0 #Restore lower fp arg for remainder_piby2 call
call __amd_remainder_piby2@PLT
mov p_temp(%rsp),%r8
mov p_temp2(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
jmp 0f
.L__vrd4_sin_lower_naninf:
mov $0x00008000000000000,%r11
or %r11,%rax
mov %rax,r(%rsp) # r = x | 0x0008000000000000
xor %r10,%r10
mov %r10,rr(%rsp) # rr = 0
mov %r10d,region(%rsp) # region =0
.align 16
0:
jmp .Lcheck_next2_args
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lboth_arg_gt_5e5:
#Upper Arg is >= 5e5, Lower arg is >= 5e5
# %rcx,,%rax r8, r9
# %xmm2,,%xmm0 xmm6 = x, xmm4 = 0.5
movhpd %xmm0,r+8(%rsp) #Save upper fp arg for remainder_piby2 call
mov $0x07ff0000000000000,%r11 #is lower arg nan/inf
mov %r11,%r10
and %rax,%r10
cmp %r11,%r10
jz .L__vrd4_sin_lower_naninf_of_both_gt_5e5
mov %rcx,p_temp(%rsp) #Save upper arg
mov %r8,p_temp2(%rsp)
mov %r9,p_temp4(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region(%rsp),%rdx #lower arg is **NOT** nan/inf
lea rr(%rsp),%rsi
lea r(%rsp),%rdi
call __amd_remainder_piby2@PLT
mov p_temp2(%rsp),%r8
mov p_temp4(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
mov p_temp(%rsp),%rcx #Restore upper arg
jmp 0f
.L__vrd4_sin_lower_naninf_of_both_gt_5e5: #lower arg is nan/inf
# mov p_original(r%sp),%rax
mov $0x00008000000000000,%r11
or %r11,%rax
mov %rax,r(%rsp) #r = x | 0x0008000000000000
xor %r10,%r10
mov %r10,rr(%rsp) #rr = 0
mov %r10d,region(%rsp) #region = 0
.align 16
0:
mov $0x07ff0000000000000,%r11 #is upper arg nan/inf
mov %r11,%r10
and %rcx,%r10
cmp %r11,%r10
jz .L__vrd4_sin_upper_naninf_of_both_gt_5e5
mov %r8,p_temp2(%rsp)
mov %r9,p_temp4(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region+4(%rsp),%rdx #upper arg is **NOT** nan/inf
lea rr+8(%rsp),%rsi
lea r+8(%rsp),%rdi
movlpd r+8(%rsp),%xmm0 #Restore upper fp arg for remainder_piby2 call
call __amd_remainder_piby2@PLT
mov p_temp2(%rsp),%r8
mov p_temp4(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
jmp 0f
.L__vrd4_sin_upper_naninf_of_both_gt_5e5:
# mov p_original+8(%rsp),%rcx ;upper arg is nan/inf
mov $0x00008000000000000,%r11
or %r11,%rcx
mov %rcx,r+8(%rsp) #r = x | 0x0008000000000000
xor %r10,%r10
mov %r10,rr+8(%rsp) #rr = 0
mov %r10d,region+4(%rsp) #region = 0
.align 16
0:
jmp .Lcheck_next2_args
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lsecond_or_next2_arg_gt_5e5:
# Upper Arg is >= 5e5, Lower arg is < 5e5
# %rcx,,%rax r8, r9
# %xmm2,,%xmm0 xmm6 = x, xmm4 = 0.5
# Do not use %xmm3,,%xmm1 xmm7
# Restore xmm4 and %xmm3,,%xmm1 xmm7
# Can use %xmm10,,%xmm8 xmm12
# %xmm9,,%xmm5 xmm11, xmm13
movhpd %xmm0,r+8(%rsp) #Save upper fp arg for remainder_piby2 call
# movlhps %xmm0,%xmm0 ;Not needed since we want to work on lower arg, but done just to be safe and avoide exceptions due to nan/inf and to mirror the lower_arg_gt_5e5 case
# movlhps %xmm2,%xmm2
# movlhps %xmm6,%xmm6
# Work on Lower arg
# Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi
addsd %xmm4,%xmm2 # xmm2 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm8 # xmm8 = piby2_1
cvttsd2si %xmm2,%eax # eax = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm10 # xmm10 = piby2_2
cvtsi2sd %eax,%xmm2 # xmm2 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm2,%xmm8 # npi2 * piby2_1
subsd %xmm8,%xmm6 # xmm6 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm12 # xmm12 =piby2_2tail
#t = rhead;
movsd %xmm6,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm2,%xmm10 # xmm10 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm10,%xmm6 # xmm6 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm2,%xmm12 # npi2 * piby2_2tail
subsd %xmm6,%xmm5 # t-rhead
subsd %xmm5,%xmm10 # (rtail-(t-rhead))
addsd %xmm12,%xmm10 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %eax,region(%rsp) # store lower region
movsd %xmm6,%xmm0
subsd %xmm10,%xmm0 # xmm0 = r=(rhead-rtail)
subsd %xmm0,%xmm6 # rr=rhead-r
subsd %xmm10,%xmm6 # xmm6 = rr=((rhead-r) -rtail)
movlpd %xmm0,r(%rsp) # store lower r
movlpd %xmm6,rr(%rsp) # store lower rr
#Work on Upper arg
#Note that volatiles will be trashed by the call
#We do not care since this is the last check
#We will construct r, rr, region and sign
mov $0x07ff0000000000000,%r11 # is upper arg nan/inf
mov %r11,%r10
and %rcx,%r10
cmp %r11,%r10
jz .L__vrd4_sin_upper_naninf
mov %r8,p_temp(%rsp)
mov %r9,p_temp2(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region+4(%rsp),%rdx # upper arg is **NOT** nan/inf
lea rr+8(%rsp),%rsi
lea r+8(%rsp),%rdi
movlpd r+8(%rsp),%xmm0 #Restore upper fp arg for remainder_piby2 call
call __amd_remainder_piby2@PLT
mov p_temp(%rsp),%r8
mov p_temp2(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
jmp 0f
.L__vrd4_sin_upper_naninf:
# mov p_original+8(%rsp),%rcx ; upper arg is nan/inf
# mov r+8(%rsp),%rcx ; upper arg is nan/inf
mov $0x00008000000000000,%r11
or %r11,%rcx
mov %rcx,r+8(%rsp) # r = x | 0x0008000000000000 (quiet the NaN / propagate)
xor %r10,%r10
mov %r10,rr+8(%rsp) # rr = 0
mov %r10d,region+4(%rsp) # region =0
.align 16
0:
jmp .Lcheck_next2_args
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcheck_next2_args:
mov $0x411E848000000000,%r10 #5e5 +
cmp %r10,%r8
jae .Lfirst_second_done_third_or_fourth_arg_gt_5e5
cmp %r10,%r9
jae .Lfirst_second_done_fourth_arg_gt_5e5
# Work on next two args, both < 5e5
# %xmm3,,%xmm1 xmm5 = x, xmm4 = 0.5
movapd .L__real_3fe0000000000000(%rip),%xmm4 #Restore 0.5
mulpd .L__real_3fe45f306dc9c883(%rip),%xmm3 # * twobypi
addpd %xmm4,%xmm3 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm1 # piby2_1
cvttpd2dq %xmm3,%xmm5 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm9 # piby2_2
cvtdq2pd %xmm5,%xmm3 # and back to double.
# /* Subtract the multiple from x to get an extra-precision remainder */
movq %xmm5,region1(%rsp) # Region
# rhead = x - npi2 * piby2_1;
mulpd %xmm3,%xmm1 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm3,%xmm9 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm1,%xmm7 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm7,%xmm1 # t
# rhead = t - rtail;
subpd %xmm9,%xmm1 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm3 # npi2 * piby2_2tail
subpd %xmm1,%xmm7 # t-rhead
subpd %xmm7,%xmm9 # - ((t - rhead) - rtail)
addpd %xmm3,%xmm9 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
movapd %xmm1,%xmm7 # rhead
subpd %xmm9,%xmm1 # r = rhead - rtail
movapd %xmm1,r1(%rsp)
subpd %xmm1,%xmm7 # rr=rhead-r
subpd %xmm9,%xmm7 # rr=(rhead-r) -rtail
movapd %xmm7,rr1(%rsp)
jmp .L__vrd4_sin_reconstruct
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lthird_or_fourth_arg_gt_5e5:
#first two args are < 5e5, third arg >= 5e5, fourth arg >= 5e5 or < 5e5
# %rcx,,%rax r8, r9
# %xmm2,,%xmm0 xmm6 = x, xmm4 = 0.5
# Do not use %xmm3,,%xmm1 xmm7
# Can use %xmm11,,%xmm9 xmm13
# %xmm8,,%xmm5 xmm10, xmm12
# Restore xmm4
# Work on first two args, both < 5e5
mulpd .L__real_3fe45f306dc9c883(%rip),%xmm2 # * twobypi
addpd %xmm4,%xmm2 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm0 # piby2_1
cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2
cvtdq2pd %xmm4,%xmm2 # and back to double.
# /* Subtract the multiple from x to get an extra-precision remainder */
movq %xmm4,region(%rsp) # Region
# rhead = x - npi2 * piby2_1;
mulpd %xmm2,%xmm0 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm2,%xmm8 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm0,%xmm6 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm6,%xmm0 # t
# rhead = t - rtail;
subpd %xmm8,%xmm0 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail
subpd %xmm0,%xmm6 # t-rhead
subpd %xmm6,%xmm8 # - ((t - rhead) - rtail)
addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
movapd %xmm0,%xmm6 # rhead
subpd %xmm8,%xmm0 # r = rhead - rtail
movapd %xmm0,r(%rsp)
subpd %xmm0,%xmm6 # rr=rhead-r
subpd %xmm8,%xmm6 # rr=(rhead-r) -rtail
movapd %xmm6,rr(%rsp)
# Work on next two args, third arg >= 5e5, fourth arg >= 5e5 or < 5e5
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.Lfirst_second_done_third_or_fourth_arg_gt_5e5:
# %rcx,,%rax r8, r9
# %xmm2,,%xmm0 xmm6 = x, xmm4 = 0.5
mov $0x411E848000000000,%r10 #5e5 +
cmp %r10,%r9
jae .Lboth_arg_gt_5e5_higher
# Upper Arg is <5e5, Lower arg is >= 5e5
# %r9,%r8
# %xmm3,,%xmm1 xmm7 = x, xmm4 = 0.5
movlpd %xmm1,r1(%rsp) #Save lower fp arg for remainder_piby2 call
movhlps %xmm1,%xmm1 #Needed since we want to work on upper arg
movhlps %xmm3,%xmm3
movhlps %xmm7,%xmm7
# Work on Upper arg
# Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs
movapd .L__real_3fe0000000000000(%rip),%xmm4 # Restore 0.5
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm3 # x*twobypi
addsd %xmm4,%xmm3 # xmm3 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm2 # xmm2 = piby2_1
cvttsd2si %xmm3,%r9d # r9d = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm0 # xmm0 = piby2_2
cvtsi2sd %r9d,%xmm3 # xmm3 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm3,%xmm2 # npi2 * piby2_1
subsd %xmm2,%xmm7 # xmm7 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 =piby2_2tail
#t = rhead;
movsd %xmm7,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm3,%xmm0 # xmm0 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm0,%xmm7 # xmm7 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm3,%xmm6 # npi2 * piby2_2tail
subsd %xmm7,%xmm5 # t-rhead
subsd %xmm5,%xmm0 # (rtail-(t-rhead))
addsd %xmm6,%xmm0 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %r9d,region1+4(%rsp) # store upper region
movsd %xmm7,%xmm1
subsd %xmm0,%xmm1 # xmm1 = r=(rhead-rtail)
subsd %xmm1,%xmm7 # rr=rhead-r
subsd %xmm0,%xmm7 # xmm7 = rr=((rhead-r) -rtail)
movlpd %xmm1,r1+8(%rsp) # store upper r
movlpd %xmm7,rr1+8(%rsp) # store upper rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#Note that volatiles will be trashed by the call
#We do not care since this is the last check
#We will construct r, rr, region and sign
# Work on Lower arg
mov $0x07ff0000000000000,%r11 # is lower arg nan/inf
mov %r11,%r10
and %r8,%r10
cmp %r11,%r10
jz .L__vrd4_sin_lower_naninf_higher
lea region1(%rsp),%rdx # lower arg is **NOT** nan/inf
lea rr1(%rsp),%rsi
lea r1(%rsp),%rdi
movlpd r1(%rsp),%xmm0 #Restore lower fp arg for remainder_piby2 call
call __amd_remainder_piby2@PLT
jmp 0f
.L__vrd4_sin_lower_naninf_higher:
# mov p_original1(%rsp),%r8 ; upper arg is nan/inf
mov $0x00008000000000000,%r11
or %r11,%r8
mov %r8,r1(%rsp) # r = x | 0x0008000000000000
xor %r10,%r10
mov %r10,rr1(%rsp) # rr = 0
mov %r10d,region1(%rsp) # region =0
.align 16
0:
jmp .L__vrd4_sin_reconstruct
.align 16
.Lboth_arg_gt_5e5_higher:
# Upper Arg is >= 5e5, Lower arg is >= 5e5
# %r9,%r8
# %xmm3,,%xmm1 xmm7 = x, xmm4 = 0.5
movhpd %xmm1,r1+8(%rsp) #Save upper fp arg for remainder_piby2 call
mov $0x07ff0000000000000,%r11 #is lower arg nan/inf
mov %r11,%r10
and %r8,%r10
cmp %r11,%r10
jz .L__vrd4_sin_lower_naninf_of_both_gt_5e5_higher
mov %r9,p_temp1(%rsp) #Save upper arg
lea region1(%rsp),%rdx #lower arg is **NOT** nan/inf
lea rr1(%rsp),%rsi
lea r1(%rsp),%rdi
movsd %xmm1,%xmm0
call __amd_remainder_piby2@PLT
mov p_temp1(%rsp),%r9 #Restore upper arg
jmp 0f
.L__vrd4_sin_lower_naninf_of_both_gt_5e5_higher: #lower arg is nan/inf
# mov p_original1(%rsp),%r8
mov $0x00008000000000000,%r11
or %r11,%r8
mov %r8,r1(%rsp) #r = x | 0x0008000000000000
xor %r10,%r10
mov %r10,rr1(%rsp) #rr = 0
mov %r10d,region1(%rsp) #region = 0
.align 16
0:
mov $0x07ff0000000000000,%r11 #is upper arg nan/inf
mov %r11,%r10
and %r9,%r10
cmp %r11,%r10
jz .L__vrd4_sin_upper_naninf_of_both_gt_5e5_higher
lea region1+4(%rsp),%rdx #upper arg is **NOT** nan/inf
lea rr1+8(%rsp),%rsi
lea r1+8(%rsp),%rdi
movlpd r1+8(%rsp),%xmm0 #Restore upper fp arg for remainder_piby2 call
call __amd_remainder_piby2@PLT
jmp 0f
.L__vrd4_sin_upper_naninf_of_both_gt_5e5_higher:
# mov p_original1+8(%rsp),%r9 ;upper arg is nan/inf
# movd %xmm6,%r9 ;upper arg is nan/inf
mov $0x00008000000000000,%r11
or %r11,%r9
mov %r9,r1+8(%rsp) #r = x | 0x0008000000000000
xor %r10,%r10
mov %r10,rr1+8(%rsp) #rr = 0
mov %r10d,region1+4(%rsp) #region = 0
.align 16
0:
jmp .L__vrd4_sin_reconstruct
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lfourth_arg_gt_5e5:
#first two args are < 5e5, third arg < 5e5, fourth arg >= 5e5
#%rcx,,%rax r8, r9
#%xmm2,,%xmm0 xmm6 = x, xmm4 = 0.5
# Work on first two args, both < 5e5
mulpd .L__real_3fe45f306dc9c883(%rip),%xmm2 # * twobypi
addpd %xmm4,%xmm2 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm0 # piby2_1
cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2
cvtdq2pd %xmm4,%xmm2 # and back to double.
# /* Subtract the multiple from x to get an extra-precision remainder */
movq %xmm4,region(%rsp) # Region
# rhead = x - npi2 * piby2_1;
mulpd %xmm2,%xmm0 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm2,%xmm8 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm0,%xmm6 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm6,%xmm0 # t
# rhead = t - rtail;
subpd %xmm8,%xmm0 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail
subpd %xmm0,%xmm6 # t-rhead
subpd %xmm6,%xmm8 # - ((t - rhead) - rtail)
addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
movapd %xmm0,%xmm6 # rhead
subpd %xmm8,%xmm0 # r = rhead - rtail
movapd %xmm0,r(%rsp)
subpd %xmm0,%xmm6 # rr=rhead-r
subpd %xmm8,%xmm6 # rr=(rhead-r) -rtail
movapd %xmm6,rr(%rsp)
# Work on next two args, third arg < 5e5, fourth arg >= 5e5
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.Lfirst_second_done_fourth_arg_gt_5e5:
# Upper Arg is >= 5e5, Lower arg is < 5e5
# %r9,%r8
# %xmm3,,%xmm1 xmm7 = x, xmm4 = 0.5
movhpd %xmm1,r1+8(%rsp) #Save upper fp arg for remainder_piby2 call
# movlhps %xmm1,%xmm1 ;Not needed since we want to work on lower arg, but done just to be safe and avoide exceptions due to nan/inf and to mirror the lower_arg_gt_5e5 case
# movlhps %xmm3,%xmm3
# movlhps %xmm7,%xmm7
# Work on Lower arg
# Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg
movapd .L__real_3fe0000000000000(%rip),%xmm4 # Restore 0.5
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm3 # x*twobypi
addsd %xmm4,%xmm3 # xmm3 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm2 # xmm2 = piby2_1
cvttsd2si %xmm3,%r8d # r8d = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm0 # xmm0 = piby2_2
cvtsi2sd %r8d,%xmm3 # xmm3 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm3,%xmm2 # npi2 * piby2_1
subsd %xmm2,%xmm7 # xmm7 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 =piby2_2tail
#t = rhead;
movsd %xmm7,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm3,%xmm0 # xmm0 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm0,%xmm7 # xmm7 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm3,%xmm6 # npi2 * piby2_2tail
subsd %xmm7,%xmm5 # t-rhead
subsd %xmm5,%xmm0 # (rtail-(t-rhead))
addsd %xmm6,%xmm0 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %r8d,region1(%rsp) # store lower region
movsd %xmm7,%xmm1
subsd %xmm0,%xmm1 # xmm1 = r=(rhead-rtail)
subsd %xmm1,%xmm7 # rr=rhead-r
subsd %xmm0,%xmm7 # xmm7 = rr=((rhead-r) -rtail)
movlpd %xmm1,r1(%rsp) # store lower r
movlpd %xmm7,rr1(%rsp) # store lower rr
#Work on Upper arg
#Note that volatiles will be trashed by the call
#We do not care since this is the last check
#We will construct r, rr, region and sign
mov $0x07ff0000000000000,%r11 # is upper arg nan/inf
mov %r11,%r10
and %r9,%r10
cmp %r11,%r10
jz .L__vrd4_sin_upper_naninf_higher
lea region1+4(%rsp),%rdx # upper arg is **NOT** nan/inf
lea rr1+8(%rsp),%rsi
lea r1+8(%rsp),%rdi
movlpd r1+8(%rsp),%xmm0 #Restore upper fp arg for remainder_piby2 call
call __amd_remainder_piby2@PLT
jmp 0f
.L__vrd4_sin_upper_naninf_higher:
# mov p_original1+8(%rsp),%r9 ; upper arg is nan/inf
# mov r1+8(%rsp),%r9 ; upper arg is nan/inf
mov $0x00008000000000000,%r11
or %r11,%r9
mov %r9,r1+8(%rsp) # r = x | 0x0008000000000000 (quiet the NaN / propagate)
xor %r10,%r10
mov %r10,rr1+8(%rsp) # rr = 0
mov %r10d,region1+4(%rsp) # region =0
.align 16
0:
jmp .L__vrd4_sin_reconstruct
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.L__vrd4_sin_reconstruct:
#Results
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm0 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#------------------------------------------------------------------
# Reload the four reduced triples (r = reduced arg, rr = tail,
# region = quadrant index), then derive for each element:
#   result sign = input-sign XOR (region>>1)&1  (sin flips sign
#                 every two quadrants)
#   (region & 1) selects the sin vs cos polynomial kernel, chosen
#   via the .Levensin_oddcos_tbl jump table at the bottom.
# NOTE(review): r12/r13 are assumed to carry the per-pair input sign
# bits, set earlier in the function (outside this chunk) — confirm.
#------------------------------------------------------------------
movapd r(%rsp),%xmm0
movapd r1(%rsp),%xmm1
movapd rr(%rsp),%xmm6
movapd rr1(%rsp),%xmm7
mov region(%rsp),%r8
mov region1(%rsp),%r9
mov .L__reald_one_zero(%rip),%rdx #compare value for cossin path
mov %r8,%r10
mov %r9,%r11
and .L__reald_one_one(%rip),%r8 #odd/even region for cos/sin
and .L__reald_one_one(%rip),%r9 #odd/even region for cos/sin
shr $1,%r10 #~AB+A~B, A is sign and B is upper bit of region
shr $1,%r11 #~AB+A~B, A is sign and B is upper bit of region
mov %r10,%rax
mov %r11,%rcx
not %r12 #ADDED TO CHANGE THE LOGIC
not %r13 #ADDED TO CHANGE THE LOGIC
and %r12,%r10
and %r13,%r11
not %rax
not %rcx
not %r12
not %r13
and %r12,%rax
and %r13,%rcx
or %rax,%r10
or %rcx,%r11
and .L__reald_one_one(%rip),%r10 #(~AB+A~B)&1 = A XOR B per element
and .L__reald_one_one(%rip),%r11 #(~AB+A~B)&1 = A XOR B per element
mov %r10,%r12
mov %r11,%r13
and %rdx,%r12 #mask out the lower sign bit leaving the upper sign bit
and %rdx,%r13 #mask out the lower sign bit leaving the upper sign bit
shl $63,%r10 #shift lower sign bit left by 63 bits (into double sign position)
shl $63,%r11 #shift lower sign bit left by 63 bits (into double sign position)
shl $31,%r12 #shift upper sign bit left by 31 bits (bit 32 lands at bit 63)
shl $31,%r13 #shift upper sign bit left by 31 bits (bit 32 lands at bit 63)
mov %r10,p_sign(%rsp) #write out lower sign bit
mov %r12,p_sign+8(%rsp) #write out upper sign bit
mov %r11,p_sign1(%rsp) #write out lower sign bit
mov %r13,p_sign1+8(%rsp) #write out upper sign bit
mov %r8,%rax
mov %r9,%rcx
movapd %xmm0,%xmm2
movapd %xmm1,%xmm3
mulpd %xmm0,%xmm2 # r2 = x^2 for the polynomial kernels
mulpd %xmm1,%xmm3 # r2 = x^2 for the polynomial kernels
and .L__reald_zero_one(%rip),%rax
and .L__reald_zero_one(%rip),%rcx
shr $31,%r8
shr $31,%r9
or %r8,%rax
or %r9,%rcx
shl $2,%rcx # pack pair1's two selector bits above pair0's
or %rcx,%rax # rax = 4-bit kernel selector (0..15)
leaq .Levensin_oddcos_tbl(%rip),%rsi
jmp *(%rsi,%rax,8) #Jmp table for cos/sin calculation based on even/odd region
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.L__vrd4_sin_cleanup:
#------------------------------------------------------------------
# Apply the precomputed sign masks to the two result pairs
# (xmm4/xmm5 from the selected kernel), store all four doubles to
# the output array, advance the pointer, and loop until p_iter
# reaches zero; then handle any leftover (< 4) values.
#------------------------------------------------------------------
movapd p_sign(%rsp), %xmm0
movapd p_sign1(%rsp), %xmm1
xorpd %xmm4, %xmm0 # (+) Sign
xorpd %xmm5, %xmm1 # (+) Sign
.L__vrda_bottom1:
# store the result _m128d
mov save_ya(%rsp),%rdi # get y_array pointer
movlpd %xmm0,(%rdi)
movhpd %xmm0,8(%rdi)
.L__vrda_bottom2:
prefetch 64(%rdi) # warm the next output cache line
add $32,%rdi # advance past the 4 doubles just written
mov %rdi,save_ya(%rsp) # save y_array pointer
# store the result _m128d
movlpd %xmm1, -16(%rdi)
movhpd %xmm1, -8(%rdi)
mov p_iter(%rsp),%rax # get number of iterations
sub $1,%rax
mov %rax,p_iter(%rsp) # save number of iterations
jnz .L__vrda_top
# see if we need to do any extras
mov save_nv(%rsp),%rax # get number of values
test %rax,%rax
jnz .L__vrda_cleanup
.L__final_check:
mov save_r12(%rsp),%r12 # restore r12 (callee-saved)
mov save_r13(%rsp),%r13 # restore r13 (callee-saved)
add $0x228,%rsp # release local frame allocated at entry
ret
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# we jump here when we have an odd number of cos calls to make at the end
# The number of values left is in save_nv
.align 16
.L__vrda_cleanup:
#------------------------------------------------------------------
# 1-3 leftover inputs: copy them into a zero-padded 4-slot buffer,
# recurse on that buffer (always computing four values), then copy
# only the valid results back to the caller's output array.
#------------------------------------------------------------------
mov save_nv(%rsp),%rax # get number of values
test %rax,%rax # are there any values
jz .L__final_check # exit if not
mov save_xa(%rsp),%rsi
mov save_ya(%rsp),%rdi
# fill in a m128d with zeroes and the extra values and then make a recursive call.
xorpd %xmm0,%xmm0
movlpd %xmm0,p_temp+8(%rsp) # zero pad slot 2
movapd %xmm0,p_temp+16(%rsp) # zero pad slots 3 and 4
mov (%rsi),%rcx # we know there's at least one
mov %rcx,p_temp(%rsp)
cmp $2,%rax
jl .L__vrdacg
mov 8(%rsi),%rcx # do the second value
mov %rcx,p_temp+8(%rsp)
cmp $3,%rax
jl .L__vrdacg
mov 16(%rsi),%rcx # do the third value
mov %rcx,p_temp+16(%rsp)
.L__vrdacg:
mov $4,%rdi # parameter for N
lea p_temp(%rsp),%rsi # &x parameter
lea p_temp2(%rsp),%rdx # &y parameter
call vrda_sin@PLT # call recursively to compute four values
# now copy the results to the destination array
mov save_ya(%rsp),%rdi
mov save_nv(%rsp),%rax # get number of values
mov p_temp2(%rsp),%rcx
mov %rcx, (%rdi) # we know there's at least one
cmp $2,%rax
jl .L__vrdacgf
mov p_temp2+8(%rsp),%rcx
mov %rcx, 8(%rdi) # do the second value
cmp $3,%rax
jl .L__vrdacgf
mov p_temp2+16(%rsp),%rcx
mov %rcx, 16(%rdi) # do the third value
.L__vrdacgf:
jmp .L__final_check
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;JUMP TABLE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm0 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcoscos_coscos_piby4:
#------------------------------------------------------------------
# All four elements need cos(x).  Compensated evaluation:
#   cos(x) = t + (x^4*zc + (((1 + (-t)) - r) - x*xx))  with
#   r = 0.5*x^2, t = 1 - r, zc = Horner(c1..c6 over x^2).
# Entry: xmm0/xmm1 = x, xmm2/xmm3 = x^2, xmm6/xmm7 = xx (tail).
#------------------------------------------------------------------
movapd %xmm2,%xmm10 # r
movapd %xmm3,%xmm11 # r
movdqa .Lcosarray+0x50(%rip),%xmm4 # c6
movdqa .Lcosarray+0x50(%rip),%xmm5 # c6
movapd .Lcosarray+0x20(%rip),%xmm8 # c3
movapd .Lcosarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm10,p_temp2(%rsp) # r
movapd %xmm11,p_temp3(%rsp) # r
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subpd .L__real_3ff0000000000000(%rip),%xmm10 # -t=r-1.0 ;trash r
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0 ;trash r
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lcosarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lcosarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lcosarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lcosarray+0x10(%rip),%xmm9 # c2+x2C3
addpd .L__real_3ff0000000000000(%rip),%xmm10 # 1 + (-t) ;trash t
addpd .L__real_3ff0000000000000(%rip),%xmm11 # 1 + (-t) ;trash t
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd %xmm3,%xmm13 # x6
addpd .Lcosarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lcosarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lcosarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lcosarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zc
addpd %xmm9,%xmm5 # zc
mulpd %xmm2,%xmm2 # x4 recalculate
mulpd %xmm3,%xmm3 # x4 recalculate
movapd p_temp2(%rsp),%xmm12 # r
movapd p_temp3(%rsp),%xmm13 # r
mulpd %xmm0,%xmm6 # x * xx
mulpd %xmm1,%xmm7 # x * xx
subpd %xmm12,%xmm10 # (1 + (-t)) - r
subpd %xmm13,%xmm11 # (1 + (-t)) - r
mulpd %xmm2,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # x4 * zc
subpd %xmm6,%xmm10 # ((1 + (-t)) - r) - x*xx
subpd %xmm7,%xmm11 # ((1 + (-t)) - r) - x*xx
addpd %xmm10,%xmm4 # x4*zc + (((1 + (-t)) - r) - x*xx)
addpd %xmm11,%xmm5 # x4*zc + (((1 + (-t)) - r) - x*xx)
subpd .L__real_3ff0000000000000(%rip),%xmm12 # t recalculate, -t = r-1
subpd .L__real_3ff0000000000000(%rip),%xmm13 # t recalculate, -t = r-1
subpd %xmm12,%xmm4 # + t
subpd %xmm13,%xmm5 # + t
jmp .L__vrd4_sin_cleanup
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcossin_cossin_piby4:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm0 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#------------------------------------------------------------------
# Both pairs need sin in the LOW element and cos in the HIGH
# element.  .Lsincosarray interleaves sin/cos coefficients so one
# packed Horner pass produces both; scalar ops then finish the two
# halves separately and movlhps recombines them.
#------------------------------------------------------------------
movapd %xmm6,p_temp(%rsp) # Store rr
movapd %xmm7,p_temp1(%rsp) # Store rr
movdqa .Lsincosarray+0x50(%rip),%xmm4 # s6
movdqa .Lsincosarray+0x50(%rip),%xmm5 # s6
movapd .Lsincosarray+0x20(%rip),%xmm8 # s3
movapd .Lsincosarray+0x20(%rip),%xmm9 # s3
movapd %xmm2,%xmm10 # move x2 for x4
movapd %xmm3,%xmm11 # move x2 for x4
mulpd %xmm2,%xmm4 # x2s6
mulpd %xmm3,%xmm5 # x2s6
mulpd %xmm2,%xmm8 # x2s3
mulpd %xmm3,%xmm9 # x2s3
mulpd %xmm2,%xmm10 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lsincosarray+0x40(%rip),%xmm4 # s5+x2s6
addpd .Lsincosarray+0x40(%rip),%xmm5 # s5+x2s6
addpd .Lsincosarray+0x10(%rip),%xmm8 # s2+x2s3
addpd .Lsincosarray+0x10(%rip),%xmm9 # s2+x2s3
movapd %xmm2,%xmm12 # move x2 for x6
movapd %xmm3,%xmm13 # move x2 for x6
mulpd %xmm2,%xmm4 # x2(s5+x2s6)
mulpd %xmm3,%xmm5 # x2(s5+x2s6)
mulpd %xmm2,%xmm8 # x2(s2+x2s3)
mulpd %xmm3,%xmm9 # x2(s2+x2s3)
mulpd %xmm10,%xmm12 # x6
mulpd %xmm11,%xmm13 # x6
addpd .Lsincosarray+0x30(%rip),%xmm4 # s4+x2(s5+x2s6)
addpd .Lsincosarray+0x30(%rip),%xmm5 # s4+x2(s5+x2s6)
addpd .Lsincosarray(%rip),%xmm8 # s1+x2(s2+x2s3)
addpd .Lsincosarray(%rip),%xmm9 # s1+x2(s2+x2s3)
movhlps %xmm10,%xmm10 # move high x4 for cos term
movhlps %xmm11,%xmm11 # move high x4 for cos term
mulpd %xmm12,%xmm4 # x6(s4+x2(s5+x2s6))
mulpd %xmm13,%xmm5 # x6(s4+x2(s5+x2s6))
movsd %xmm2,%xmm6 # move low x2 for x3 for sin term
movsd %xmm3,%xmm7 # move low x2 for x3 for sin term
mulsd %xmm0,%xmm6 # get low x3 for sin term
mulsd %xmm1,%xmm7 # get low x3 for sin term
mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 for sin and cos terms
mulpd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5*x2 for sin and cos terms
addpd %xmm8,%xmm4 # z
addpd %xmm9,%xmm5 # z
movhlps %xmm2,%xmm12 # move high r for cos
movhlps %xmm3,%xmm13 # move high r for cos
movhlps %xmm4,%xmm8 # xmm4 = sin , xmm8 = cos
movhlps %xmm5,%xmm9 # xmm5 = sin , xmm9 = cos
mulsd %xmm6,%xmm4 # sin *x3
mulsd %xmm7,%xmm5 # sin *x3
mulsd %xmm10,%xmm8 # cos *x4
mulsd %xmm11,%xmm9 # cos *x4
mulsd p_temp(%rsp),%xmm2 # 0.5 * x2 * xx for sin term
mulsd p_temp1(%rsp),%xmm3 # 0.5 * x2 * xx for sin term
movsd %xmm12,%xmm6 # Keep high r for cos term
movsd %xmm13,%xmm7 # Keep high r for cos term
subsd .L__real_3ff0000000000000(%rip),%xmm12 #-t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm13 #-t=r-1.0
subsd %xmm2,%xmm4 # sin - 0.5 * x2 *xx
subsd %xmm3,%xmm5 # sin - 0.5 * x2 *xx
movhlps %xmm0,%xmm10 # move high x for x*xx for cos term
movhlps %xmm1,%xmm11 # move high x for x*xx for cos term
mulsd p_temp+8(%rsp),%xmm10 # x * xx
mulsd p_temp1+8(%rsp),%xmm11 # x * xx
movsd %xmm12,%xmm2 # move -t for cos term
movsd %xmm13,%xmm3 # move -t for cos term
addsd .L__real_3ff0000000000000(%rip),%xmm12 #1+(-t)
addsd .L__real_3ff0000000000000(%rip),%xmm13 #1+(-t)
addsd p_temp(%rsp),%xmm4 # sin+xx
addsd p_temp1(%rsp),%xmm5 # sin+xx
subsd %xmm6,%xmm12 # (1-t) - r
subsd %xmm7,%xmm13 # (1-t) - r
subsd %xmm10,%xmm12 # ((1 + (-t)) - r) - x*xx
subsd %xmm11,%xmm13 # ((1 + (-t)) - r) - x*xx
addsd %xmm0,%xmm4 # sin + x
addsd %xmm1,%xmm5 # sin + x
addsd %xmm12,%xmm8 # cos+((1-t)-r - x*xx)
addsd %xmm13,%xmm9 # cos+((1-t)-r - x*xx)
subsd %xmm2,%xmm8 # cos+t
subsd %xmm3,%xmm9 # cos+t
movlhps %xmm8,%xmm4 # pack [cos | sin]
movlhps %xmm9,%xmm5 # pack [cos | sin]
jmp .L__vrd4_sin_cleanup
.align 16
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm0 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.Lsincos_cossin_piby4: # changed from sincos_sincos
# xmm1 is cossin and xmm0 is sincos
#------------------------------------------------------------------
# Mixed case: one pair is (sin low, cos high) via .Lsincosarray,
# the other is (cos low, sin high) via .Lcossinarray; the two
# halves of each pair are finished with scalar ops and recombined
# with movlhps.  Per-line tags (cossin)/(sincos) mark which pair.
#------------------------------------------------------------------
movapd %xmm6,p_temp(%rsp) # Store rr
movapd %xmm7,p_temp1(%rsp) # Store rr
movapd %xmm1,p_temp3(%rsp) # Store r for the sincos term
movapd .Lsincosarray+0x50(%rip),%xmm4 # s6
movapd .Lcossinarray+0x50(%rip),%xmm5 # s6
movdqa .Lsincosarray+0x20(%rip),%xmm8 # s3
movdqa .Lcossinarray+0x20(%rip),%xmm9 # s3
movapd %xmm2,%xmm10 # move x2 for x4
movapd %xmm3,%xmm11 # move x2 for x4
mulpd %xmm2,%xmm4 # x2s6
mulpd %xmm3,%xmm5 # x2s6
mulpd %xmm2,%xmm8 # x2s3
mulpd %xmm3,%xmm9 # x2s3
mulpd %xmm2,%xmm10 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lsincosarray+0x40(%rip),%xmm4 # s5+x2s6
addpd .Lcossinarray+0x40(%rip),%xmm5 # s5+x2s6
addpd .Lsincosarray+0x10(%rip),%xmm8 # s2+x2s3
addpd .Lcossinarray+0x10(%rip),%xmm9 # s2+x2s3
movapd %xmm2,%xmm12 # move x2 for x6
movapd %xmm3,%xmm13 # move x2 for x6
mulpd %xmm2,%xmm4 # x2(s5+x2s6)
mulpd %xmm3,%xmm5 # x2(s5+x2s6)
mulpd %xmm2,%xmm8 # x2(s2+x2s3)
mulpd %xmm3,%xmm9 # x2(s2+x2s3)
mulpd %xmm10,%xmm12 # x6
mulpd %xmm11,%xmm13 # x6
addpd .Lsincosarray+0x30(%rip),%xmm4 # s4+x2(s5+x2s6)
addpd .Lcossinarray+0x30(%rip),%xmm5 # s4+x2(s5+x2s6)
addpd .Lsincosarray(%rip),%xmm8 # s1+x2(s2+x2s3)
addpd .Lcossinarray(%rip),%xmm9 # s1+x2(s2+x2s3)
movhlps %xmm10,%xmm10 # move high x4 for cos term
mulpd %xmm12,%xmm4 # x6(s4+x2(s5+x2s6))
mulpd %xmm13,%xmm5 # x6(s4+x2(s5+x2s6))
movsd %xmm2,%xmm6 # move low x2 for x3 for sin term (cossin)
movhlps %xmm3,%xmm7 # move high x2 for x3 for sin term (sincos)
mulsd %xmm0,%xmm6 # get low x3 for sin term
mulsd p_temp3+8(%rsp),%xmm7 # get high x3 for sin term
mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 for sin and cos terms
mulpd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5*x2 for sin and cos terms
addpd %xmm8,%xmm4 # z
addpd %xmm9,%xmm5 # z
movhlps %xmm2,%xmm12 # move high r for cos (cossin)
movhlps %xmm3,%xmm13 # move high 0.5*x2 for sin term (sincos)
movhlps %xmm4,%xmm8 # xmm8 = cos , xmm4 = sin (cossin)
movhlps %xmm5,%xmm9 # xmm9 = sin , xmm5 = cos (sincos)
mulsd %xmm6,%xmm4 # sin *x3
mulsd %xmm11,%xmm5 # cos *x4
mulsd %xmm10,%xmm8 # cos *x4
mulsd %xmm7,%xmm9 # sin *x3
mulsd p_temp(%rsp),%xmm2 # low 0.5 * x2 * xx for sin term (cossin)
mulsd p_temp1+8(%rsp),%xmm13 # high 0.5 * x2 * xx for sin term (sincos)
movsd %xmm12,%xmm6 # Keep high r for cos term
movsd %xmm3,%xmm7 # Keep low r for cos term
subsd .L__real_3ff0000000000000(%rip),%xmm12 # -t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm3 # -t=r-1.0
subsd %xmm2,%xmm4 # sin - 0.5 * x2 *xx (cossin)
subsd %xmm13,%xmm9 # sin - 0.5 * x2 *xx (sincos)
movhlps %xmm0,%xmm10 # move high x for x*xx for cos term (cossin)
movhlps %xmm1,%xmm11 # move high x for x for sin term (sincos)
mulsd p_temp+8(%rsp),%xmm10 # x * xx
mulsd p_temp1(%rsp),%xmm1 # x * xx
movsd %xmm12,%xmm2 # move -t for cos term
movsd %xmm3,%xmm13 # move -t for cos term
addsd .L__real_3ff0000000000000(%rip),%xmm12 # 1+(-t)
addsd .L__real_3ff0000000000000(%rip),%xmm3 # 1+(-t)
addsd p_temp(%rsp),%xmm4 # sin+xx +
addsd p_temp1+8(%rsp),%xmm9 # sin+xx +
subsd %xmm6,%xmm12 # (1-t) - r
subsd %xmm7,%xmm3 # (1-t) - r
subsd %xmm10,%xmm12 # ((1 + (-t)) - r) - x*xx
subsd %xmm1,%xmm3 # ((1 + (-t)) - r) - x*xx
addsd %xmm0,%xmm4 # sin + x +
addsd %xmm11,%xmm9 # sin + x +
addsd %xmm12,%xmm8 # cos+((1-t)-r - x*xx)
addsd %xmm3,%xmm5 # cos+((1-t)-r - x*xx)
subsd %xmm2,%xmm8 # cos+t
subsd %xmm13,%xmm5 # cos+t
movlhps %xmm8,%xmm4 # cossin
movlhps %xmm9,%xmm5 # sincos
jmp .L__vrd4_sin_cleanup
.align 16
.Lsincos_sincos_piby4:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm0 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#------------------------------------------------------------------
# Both pairs need cos in the LOW element and sin in the HIGH
# element (.Lcossinarray coefficient layout).  The sin-side scalar
# ops therefore operate on the HIGH halves (extracted via movhlps
# and the +8 stack slots).
#------------------------------------------------------------------
movapd %xmm6,p_temp(%rsp) # Store rr
movapd %xmm7,p_temp1(%rsp) # Store rr
movapd %xmm0,p_temp2(%rsp) # Store r
movapd %xmm1,p_temp3(%rsp) # Store r
movapd .Lcossinarray+0x50(%rip),%xmm4 # s6
movapd .Lcossinarray+0x50(%rip),%xmm5 # s6
movdqa .Lcossinarray+0x20(%rip),%xmm8 # s3
movdqa .Lcossinarray+0x20(%rip),%xmm9 # s3
movapd %xmm2,%xmm10 # move x2 for x4
movapd %xmm3,%xmm11 # move x2 for x4
mulpd %xmm2,%xmm4 # x2s6
mulpd %xmm3,%xmm5 # x2s6
mulpd %xmm2,%xmm8 # x2s3
mulpd %xmm3,%xmm9 # x2s3
mulpd %xmm2,%xmm10 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lcossinarray+0x40(%rip),%xmm4 # s5+x2s6
addpd .Lcossinarray+0x40(%rip),%xmm5 # s5+x2s6
addpd .Lcossinarray+0x10(%rip),%xmm8 # s2+x2s3
addpd .Lcossinarray+0x10(%rip),%xmm9 # s2+x2s3
movapd %xmm2,%xmm12 # move x2 for x6
movapd %xmm3,%xmm13 # move x2 for x6
mulpd %xmm2,%xmm4 # x2(s5+x2s6)
mulpd %xmm3,%xmm5 # x2(s5+x2s6)
mulpd %xmm2,%xmm8 # x2(s2+x2s3)
mulpd %xmm3,%xmm9 # x2(s2+x2s3)
mulpd %xmm10,%xmm12 # x6
mulpd %xmm11,%xmm13 # x6
addpd .Lcossinarray+0x30(%rip),%xmm4 # s4+x2(s5+x2s6)
addpd .Lcossinarray+0x30(%rip),%xmm5 # s4+x2(s5+x2s6)
addpd .Lcossinarray(%rip),%xmm8 # s1+x2(s2+x2s3)
addpd .Lcossinarray(%rip),%xmm9 # s1+x2(s2+x2s3)
mulpd %xmm12,%xmm4 # x6(s4+x2(s5+x2s6))
mulpd %xmm13,%xmm5 # x6(s4+x2(s5+x2s6))
movhlps %xmm2,%xmm6 # move high x2 for x3 for sin term
movhlps %xmm3,%xmm7 # move high x2 for x3 for sin term
mulsd p_temp2+8(%rsp),%xmm6 # get high x3 for sin term
mulsd p_temp3+8(%rsp),%xmm7 # get high x3 for sin term
mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 for sin and cos terms
mulpd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5*x2 for sin and cos terms
addpd %xmm8,%xmm4 # z
addpd %xmm9,%xmm5 # z
movhlps %xmm2,%xmm12 # move high 0.5*x2 for sin term
movhlps %xmm3,%xmm13 # move high 0.5*x2 for sin term
# Reverse 12 and 2
movhlps %xmm4,%xmm8 # xmm8 = sin , xmm4 = cos
movhlps %xmm5,%xmm9 # xmm9 = sin , xmm5 = cos
mulsd %xmm6,%xmm8 # sin *x3
mulsd %xmm7,%xmm9 # sin *x3
mulsd %xmm10,%xmm4 # cos *x4
mulsd %xmm11,%xmm5 # cos *x4
mulsd p_temp+8(%rsp),%xmm12 # 0.5 * x2 * xx for sin term
mulsd p_temp1+8(%rsp),%xmm13 # 0.5 * x2 * xx for sin term
movsd %xmm2,%xmm6 # Keep low r for cos term
movsd %xmm3,%xmm7 # Keep low r for cos term
subsd .L__real_3ff0000000000000(%rip),%xmm2 #-t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm3 #-t=r-1.0
subsd %xmm12,%xmm8 # sin - 0.5 * x2 *xx
subsd %xmm13,%xmm9 # sin - 0.5 * x2 *xx
movhlps %xmm0,%xmm10 # move high x for x for sin term
movhlps %xmm1,%xmm11 # move high x for x for sin term
# Reverse 10 and 0
mulsd p_temp(%rsp),%xmm0 # x * xx
mulsd p_temp1(%rsp),%xmm1 # x * xx
movsd %xmm2,%xmm12 # move -t for cos term
movsd %xmm3,%xmm13 # move -t for cos term
addsd .L__real_3ff0000000000000(%rip),%xmm2 # 1+(-t)
addsd .L__real_3ff0000000000000(%rip),%xmm3 # 1+(-t)
addsd p_temp+8(%rsp),%xmm8 # sin+xx
addsd p_temp1+8(%rsp),%xmm9 # sin+xx
subsd %xmm6,%xmm2 # (1-t) - r
subsd %xmm7,%xmm3 # (1-t) - r
subsd %xmm0,%xmm2 # ((1 + (-t)) - r) - x*xx
subsd %xmm1,%xmm3 # ((1 + (-t)) - r) - x*xx
addsd %xmm10,%xmm8 # sin + x
addsd %xmm11,%xmm9 # sin + x
addsd %xmm2,%xmm4 # cos+((1-t)-r - x*xx)
addsd %xmm3,%xmm5 # cos+((1-t)-r - x*xx)
subsd %xmm12,%xmm4 # cos+t
subsd %xmm13,%xmm5 # cos+t
movlhps %xmm8,%xmm4 # pack [sin | cos]
movlhps %xmm9,%xmm5 # pack [sin | cos]
jmp .L__vrd4_sin_cleanup
.align 16
.Lcossin_sincos_piby4: # changed from sincos_sincos
# xmm1 is cossin and xmm0 is sincos
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm0 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#------------------------------------------------------------------
# Mirror of .Lsincos_cossin_piby4 with the pairs swapped: pair 0
# uses .Lcossinarray (cos low, sin high), pair 1 uses .Lsincosarray
# (sin low, cos high).  Lines tagged '+' mark the swapped-pair ops.
#------------------------------------------------------------------
movapd %xmm6,p_temp(%rsp) # Store rr
movapd %xmm7,p_temp1(%rsp) # Store rr
movapd %xmm0,p_temp2(%rsp) # Store r
movapd .Lcossinarray+0x50(%rip),%xmm4 # s6
movapd .Lsincosarray+0x50(%rip),%xmm5 # s6
movdqa .Lcossinarray+0x20(%rip),%xmm8 # s3
movdqa .Lsincosarray+0x20(%rip),%xmm9 # s3
movapd %xmm2,%xmm10 # move x2 for x4
movapd %xmm3,%xmm11 # move x2 for x4
mulpd %xmm2,%xmm4 # x2s6
mulpd %xmm3,%xmm5 # x2s6
mulpd %xmm2,%xmm8 # x2s3
mulpd %xmm3,%xmm9 # x2s3
mulpd %xmm2,%xmm10 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lcossinarray+0x40(%rip),%xmm4 # s5+x2s6
addpd .Lsincosarray+0x40(%rip),%xmm5 # s5+x2s6
addpd .Lcossinarray+0x10(%rip),%xmm8 # s2+x2s3
addpd .Lsincosarray+0x10(%rip),%xmm9 # s2+x2s3
movapd %xmm2,%xmm12 # move x2 for x6
movapd %xmm3,%xmm13 # move x2 for x6
mulpd %xmm2,%xmm4 # x2(s5+x2s6)
mulpd %xmm3,%xmm5 # x2(s5+x2s6)
mulpd %xmm2,%xmm8 # x2(s2+x2s3)
mulpd %xmm3,%xmm9 # x2(s2+x2s3)
mulpd %xmm10,%xmm12 # x6
mulpd %xmm11,%xmm13 # x6
addpd .Lcossinarray+0x30(%rip),%xmm4 # s4+x2(s5+x2s6)
addpd .Lsincosarray+0x30(%rip),%xmm5 # s4+x2(s5+x2s6)
addpd .Lcossinarray(%rip),%xmm8 # s1+x2(s2+x2s3)
addpd .Lsincosarray(%rip),%xmm9 # s1+x2(s2+x2s3)
movhlps %xmm11,%xmm11 # move high x4 for cos term +
mulpd %xmm12,%xmm4 # x6(s4+x2(s5+x2s6))
mulpd %xmm13,%xmm5 # x6(s4+x2(s5+x2s6))
movhlps %xmm2,%xmm6 # move high x2 for x3 for sin term
movsd %xmm3,%xmm7 # move low x2 for x3 for sin term +
mulsd p_temp2+8(%rsp),%xmm6 # get high x3 for sin term
mulsd %xmm1,%xmm7 # get low x3 for sin term +
mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 for sin and cos terms
mulpd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5*x2 for sin and cos terms
addpd %xmm8,%xmm4 # z
addpd %xmm9,%xmm5 # z
movhlps %xmm2,%xmm12 # move high 0.5*x2 for sin term
movhlps %xmm3,%xmm13 # move high r for cos
movhlps %xmm4,%xmm8 # xmm8 = sin , xmm4 = cos
movhlps %xmm5,%xmm9 # xmm9 = cos , xmm5 = sin
mulsd %xmm6,%xmm8 # sin *x3
mulsd %xmm11,%xmm9 # cos *x4
mulsd %xmm10,%xmm4 # cos *x4
mulsd %xmm7,%xmm5 # sin *x3
mulsd p_temp+8(%rsp),%xmm12 # 0.5 * x2 * xx for sin term
mulsd p_temp1(%rsp),%xmm3 # 0.5 * x2 * xx for sin term
movsd %xmm2,%xmm6 # Keep low r for cos term
movsd %xmm13,%xmm7 # Keep high r for cos term
subsd .L__real_3ff0000000000000(%rip),%xmm2 #-t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm13 #-t=r-1.0
subsd %xmm12,%xmm8 # sin - 0.5 * x2 *xx
subsd %xmm3,%xmm5 # sin - 0.5 * x2 *xx
movhlps %xmm0,%xmm10 # move high x for x for sin term
movhlps %xmm1,%xmm11 # move high x for x*xx for cos term
mulsd p_temp(%rsp),%xmm0 # x * xx
mulsd p_temp1+8(%rsp),%xmm11 # x * xx
movsd %xmm2,%xmm12 # move -t for cos term
movsd %xmm13,%xmm3 # move -t for cos term
addsd .L__real_3ff0000000000000(%rip),%xmm2 # 1+(-t)
addsd .L__real_3ff0000000000000(%rip),%xmm13 # 1+(-t)
addsd p_temp+8(%rsp),%xmm8 # sin+xx
addsd p_temp1(%rsp),%xmm5 # sin+xx
subsd %xmm6,%xmm2 # (1-t) - r
subsd %xmm7,%xmm13 # (1-t) - r
subsd %xmm0,%xmm2 # ((1 + (-t)) - r) - x*xx
subsd %xmm11,%xmm13 # ((1 + (-t)) - r) - x*xx
addsd %xmm10,%xmm8 # sin + x
addsd %xmm1,%xmm5 # sin + x
addsd %xmm2,%xmm4 # cos+((1-t)-r - x*xx)
addsd %xmm13,%xmm9 # cos+((1-t)-r - x*xx)
subsd %xmm12,%xmm4 # cos+t
subsd %xmm3,%xmm9 # cos+t
movlhps %xmm8,%xmm4 # pack result pair 0
movlhps %xmm9,%xmm5 # pack result pair 1
jmp .L__vrd4_sin_cleanup
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcoscos_sinsin_piby4:
#------------------------------------------------------------------
# Pair 0 needs sin(x) on both elements (.Lsinarray, result xmm4),
# pair 1 needs cos(x) on both elements (.Lcosarray, result xmm5).
# sin(x) = x + x^3*zs + (xx - 0.5*x^2*xx); cos(x) as in the
# coscos kernel above.
#------------------------------------------------------------------
movapd %xmm2,%xmm10 # x2
movapd %xmm3,%xmm11 # x2
movdqa .Lsinarray+0x50(%rip),%xmm4 # c6
movdqa .Lcosarray+0x50(%rip),%xmm5 # c6
movapd .Lsinarray+0x20(%rip),%xmm8 # c3
movapd .Lcosarray+0x20(%rip),%xmm9 # c3
movapd %xmm2,p_temp2(%rsp) # store x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm11,p_temp3(%rsp) # store r
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
mulpd %xmm2,%xmm10 # x4
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0
movapd %xmm2,%xmm12 # copy of x2 for 0.5*x2
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lsinarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lcosarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lsinarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lcosarray+0x10(%rip),%xmm9 # c2+x2C3
addpd .L__real_3ff0000000000000(%rip),%xmm11 # 1 + (-t)
mulpd %xmm2,%xmm10 # x6
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd .L__real_3fe0000000000000(%rip),%xmm12 # 0.5 *x2
mulpd %xmm3,%xmm13 # x6
addpd .Lsinarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lcosarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lsinarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lcosarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm10,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zc
mulpd %xmm0,%xmm2 # x3 recalculate
mulpd %xmm3,%xmm3 # x4 recalculate
movapd p_temp3(%rsp),%xmm13 # r
mulpd %xmm6,%xmm12 # 0.5 * x2 *xx
mulpd %xmm1,%xmm7 # x * xx
subpd %xmm13,%xmm11 # (1 + (-t)) - r
mulpd %xmm2,%xmm4 # x3 * zs
mulpd %xmm3,%xmm5 # x4 * zc
subpd %xmm12,%xmm4 # -0.5 * x2 *xx
subpd %xmm7,%xmm11 # ((1 + (-t)) - r) - x*xx
addpd %xmm6,%xmm4 # x3 * zs +xx
addpd %xmm11,%xmm5 # x4*zc + (((1 + (-t)) - r) - x*xx)
subpd .L__real_3ff0000000000000(%rip),%xmm13 # t recalculate, -t = r-1
addpd %xmm0,%xmm4 # +x
subpd %xmm13,%xmm5 # + t
jmp .L__vrd4_sin_cleanup
.align 16
.Lsinsin_coscos_piby4:
#------------------------------------------------------------------
# Mirror of .Lcoscos_sinsin_piby4: pair 0 needs cos(x) on both
# elements (.Lcosarray, result xmm4), pair 1 needs sin(x) on both
# elements (.Lsinarray, result xmm5).
#------------------------------------------------------------------
movapd %xmm2,%xmm10 # x2
movapd %xmm3,%xmm11 # x2
movdqa .Lcosarray+0x50(%rip),%xmm4 # c6
movdqa .Lsinarray+0x50(%rip),%xmm5 # c6
movapd .Lcosarray+0x20(%rip),%xmm8 # c3
movapd .Lsinarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
movapd %xmm3,p_temp3(%rsp) # store x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm10,p_temp2(%rsp) # store r
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subpd .L__real_3ff0000000000000(%rip),%xmm10 # -t=r-1.0
mulpd %xmm3,%xmm11 # x4
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for 0.5*x2
addpd .Lcosarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lsinarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lcosarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lsinarray+0x10(%rip),%xmm9 # c2+x2C3
addpd .L__real_3ff0000000000000(%rip),%xmm10 # 1 + (-t)
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm11 # x6
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd .L__real_3fe0000000000000(%rip),%xmm13 # 0.5 *x2
addpd .Lcosarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lsinarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lcosarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lsinarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm11,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zc
addpd %xmm9,%xmm5 # zs
mulpd %xmm2,%xmm2 # x4 recalculate
mulpd %xmm1,%xmm3 # x3 recalculate
movapd p_temp2(%rsp),%xmm12 # r
mulpd %xmm0,%xmm6 # x * xx
mulpd %xmm7,%xmm13 # 0.5 * x2 *xx
subpd %xmm12,%xmm10 # (1 + (-t)) - r
mulpd %xmm2,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # x3 * zs
subpd %xmm6,%xmm10 # ((1 + (-t)) - r) - x*xx
subpd %xmm13,%xmm5 # -0.5 * x2 *xx
addpd %xmm10,%xmm4 # x4*zc + (((1 + (-t)) - r) - x*xx)
addpd %xmm7,%xmm5 # +xx
subpd .L__real_3ff0000000000000(%rip),%xmm12 # t recalculate, -t = r-1
addpd %xmm1,%xmm5 # +x
subpd %xmm12,%xmm4 # + t
jmp .L__vrd4_sin_cleanup
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcoscos_cossin_piby4: #Derive from cossin_coscos
#------------------------------------------------------------------
# Pair 0 needs (sin low, cos high) via .Lsincosarray (result xmm4),
# pair 1 needs cos(x) on both elements via .Lcosarray (result xmm5).
# The pair-0 halves are finished with scalar ops and recombined
# with movlhps at the end.
#------------------------------------------------------------------
movapd %xmm2,%xmm10 # r
movapd %xmm3,%xmm11 # r
movdqa .Lsincosarray+0x50(%rip),%xmm4 # c6
movdqa .Lcosarray+0x50(%rip),%xmm5 # c6
movapd .Lsincosarray+0x20(%rip),%xmm8 # c3
movapd .Lcosarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm10,p_temp2(%rsp) # r
movapd %xmm11,p_temp3(%rsp) # r
movapd %xmm6,p_temp(%rsp) # rr
movhlps %xmm10,%xmm10 # get upper r for t for cos
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subsd .L__real_3ff0000000000000(%rip),%xmm10 # -t=r-1.0 for cos
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lsincosarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lcosarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lsincosarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lcosarray+0x10(%rip),%xmm9 # c2+x2C3
addsd .L__real_3ff0000000000000(%rip),%xmm10 # 1 + (-t)
addpd .L__real_3ff0000000000000(%rip),%xmm11 # 1 + (-t)
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd %xmm3,%xmm13 # x6
addpd .Lsincosarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lcosarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lsincosarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lcosarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zczs
addpd %xmm9,%xmm5 # zc
movsd %xmm0,%xmm8 # lower x for sin
mulsd %xmm2,%xmm8 # lower x3 for sin
mulpd %xmm2,%xmm2 # x4
mulpd %xmm3,%xmm3 # upper x4 for cos
movsd %xmm8,%xmm2 # lower x3 for sin
movsd %xmm6,%xmm9 # lower xx
# note using odd reg
movlpd p_temp2+8(%rsp),%xmm12 # upper r for cos term
movapd p_temp3(%rsp),%xmm13 # r
mulpd %xmm0,%xmm6 # x * xx for upper cos term
mulpd %xmm1,%xmm7 # x * xx
movhlps %xmm6,%xmm6 # keep only the upper x*xx
mulsd p_temp2(%rsp),%xmm9 # xx * 0.5*x2 for sin term
subsd %xmm12,%xmm10 # (1 + (-t)) - r
subpd %xmm13,%xmm11 # (1 + (-t)) - r
mulpd %xmm2,%xmm4 # low: x3 * zs, high: x4 * zc
mulpd %xmm3,%xmm5 # x4 * zc
# x3 * zs
movhlps %xmm4,%xmm8 # xmm8= cos, xmm4= sin
subsd %xmm9,%xmm4 # x3zs - 0.5*x2*xx
subsd %xmm6,%xmm10 # ((1 + (-t)) - r) - x*xx
subpd %xmm7,%xmm11 # ((1 + (-t)) - r) - x*xx
addsd %xmm10,%xmm8 # x4*zc + (((1 + (-t)) - r) - x*xx)
addpd %xmm11,%xmm5 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd p_temp(%rsp),%xmm4 # +xx
subsd .L__real_3ff0000000000000(%rip),%xmm12 # -t = r-1
subpd .L__real_3ff0000000000000(%rip),%xmm13 # -t = r-1
subsd %xmm12,%xmm8 # + t
addsd %xmm0,%xmm4 # +x
subpd %xmm13,%xmm5 # + t
movlhps %xmm8,%xmm4 # pack [cos | sin] for pair 0
jmp .L__vrd4_sin_cleanup
.align 16
.Lcoscos_sincos_piby4: #Derive from sincos_coscos
#----------------------------------------------------------------------
# xmm0 pair: lower cos / upper sin (.Lcossinarray coefficients, xmm4);
# xmm1 pair: cos in both lanes (.Lcosarray coefficients, xmm5).
# In: xmm0/xmm1 = reduced x, xmm2/xmm3 = x^2, xmm6/xmm7 = xx tails.
# sin(x) ~ x + (x^3*zs - 0.5*x^2*xx + xx)
# cos(x) ~ t + ((1 + (-t)) - r - x*xx + x^4*zc), r = 0.5*x^2, t = 1-r
#----------------------------------------------------------------------
movapd %xmm2,%xmm10 # r
movapd %xmm3,%xmm11 # r
movdqa .Lcossinarray+0x50(%rip),%xmm4 # c6
movdqa .Lcosarray+0x50(%rip),%xmm5 # c6
movapd .Lcossinarray+0x20(%rip),%xmm8 # c3
movapd .Lcosarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm10,p_temp2(%rsp) # r
movapd %xmm11,p_temp3(%rsp) # r
movapd %xmm6,p_temp(%rsp) # rr
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subsd .L__real_3ff0000000000000(%rip),%xmm10 # -t=r-1.0 for cos
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lcossinarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lcosarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lcossinarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lcosarray+0x10(%rip),%xmm9 # c2+x2C3
addsd .L__real_3ff0000000000000(%rip),%xmm10 # 1 + (-t) for cos
addpd .L__real_3ff0000000000000(%rip),%xmm11 # 1 + (-t)
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd %xmm3,%xmm13 # x6
addpd .Lcossinarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lcosarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lcossinarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lcosarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zszc (upper zs, lower zc)
addpd %xmm9,%xmm5 # zc
mulpd %xmm0,%xmm2 # upper x3 for sin
mulsd %xmm0,%xmm2 # lower x4 for cos
mulpd %xmm3,%xmm3 # x4
movhlps %xmm6,%xmm9 # upper xx for sin term
# note using odd reg
movlpd p_temp2(%rsp),%xmm12 # lower r for cos term
movapd p_temp3(%rsp),%xmm13 # r
mulpd %xmm0,%xmm6 # x * xx for lower cos term
mulpd %xmm1,%xmm7 # x * xx
mulsd p_temp2+8(%rsp),%xmm9 # xx * 0.5*x2 for upper sin term
subsd %xmm12,%xmm10 # (1 + (-t)) - r
subpd %xmm13,%xmm11 # (1 + (-t)) - r
mulpd %xmm2,%xmm4 # lower=x4 * zc
# upper=x3 * zs
mulpd %xmm3,%xmm5 # x4 * zc
movhlps %xmm4,%xmm8 # xmm8= sin, xmm4= cos
subsd %xmm9,%xmm8 # x3zs - 0.5*x2*xx
subsd %xmm6,%xmm10 # ((1 + (-t)) - r) - x*xx
subpd %xmm7,%xmm11 # ((1 + (-t)) - r) - x*xx
addsd %xmm10,%xmm4 # x4*zc + (((1 + (-t)) - r) - x*xx)
addpd %xmm11,%xmm5 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd p_temp+8(%rsp),%xmm8 # +xx
movhlps %xmm0,%xmm0 # upper x for sin
subsd .L__real_3ff0000000000000(%rip),%xmm12 # -t = r-1
subpd .L__real_3ff0000000000000(%rip),%xmm13 # -t = r-1
subsd %xmm12,%xmm4 # + t
subpd %xmm13,%xmm5 # + t
addsd %xmm0,%xmm8 # +x
movlhps %xmm8,%xmm4 # pack result: low=cos, high=sin
jmp .L__vrd4_sin_cleanup
.align 16
.Lcossin_coscos_piby4:
#----------------------------------------------------------------------
# xmm0 pair: cos in both lanes (.Lcosarray coefficients, xmm4);
# xmm1 pair: lower sin / upper cos (.Lsincosarray coefficients, xmm5).
# In: xmm0/xmm1 = reduced x, xmm2/xmm3 = x^2, xmm6/xmm7 = xx tails.
# sin(x) ~ x + (x^3*zs - 0.5*x^2*xx + xx)
# cos(x) ~ t + ((1 + (-t)) - r - x*xx + x^4*zc), r = 0.5*x^2, t = 1-r
#----------------------------------------------------------------------
movapd %xmm2,%xmm10 # r
movapd %xmm3,%xmm11 # r
movdqa .Lcosarray+0x50(%rip),%xmm4 # c6
movdqa .Lsincosarray+0x50(%rip),%xmm5 # c6
movapd .Lcosarray+0x20(%rip),%xmm8 # c3
movapd .Lsincosarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm10,p_temp2(%rsp) # r
movapd %xmm11,p_temp3(%rsp) # r
movapd %xmm7,p_temp1(%rsp) # rr
movhlps %xmm11,%xmm11 # get upper r for t for cos
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subpd .L__real_3ff0000000000000(%rip),%xmm10 # -t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0 for cos
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lcosarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lsincosarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lcosarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lsincosarray+0x10(%rip),%xmm9 # c2+x2C3
addpd .L__real_3ff0000000000000(%rip),%xmm10 # 1 + (-t) ;trash t
addsd .L__real_3ff0000000000000(%rip),%xmm11 # 1 + (-t) ;trash t
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd %xmm3,%xmm13 # x6
addpd .Lcosarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lsincosarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lcosarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lsincosarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zc
addpd %xmm9,%xmm5 # zczs (upper zc, lower zs)
movsd %xmm1,%xmm9 # lower x for sin
mulsd %xmm3,%xmm9 # lower x3 for sin
mulpd %xmm2,%xmm2 # x4
mulpd %xmm3,%xmm3 # upper x4 for cos
movsd %xmm9,%xmm3 # xmm3 = lower x3 (sin), upper x4 (cos)
movsd %xmm7,%xmm8 # lower xx
# note using even reg
movapd p_temp2(%rsp),%xmm12 # r
movlpd p_temp3+8(%rsp),%xmm13 # upper r for cos term
mulpd %xmm0,%xmm6 # x * xx
mulpd %xmm1,%xmm7 # x * xx for upper cos term
movhlps %xmm7,%xmm7 # bring upper x*xx down to low lane
mulsd p_temp3(%rsp),%xmm8 # xx * 0.5*x2 for sin term
subpd %xmm12,%xmm10 # (1 + (-t)) - r
subsd %xmm13,%xmm11 # (1 + (-t)) - r
mulpd %xmm2,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # lower=x3 * zs
# upper=x4 * zc
movhlps %xmm5,%xmm9 # xmm9= cos, xmm5= sin
subsd %xmm8,%xmm5 # x3zs - 0.5*x2*xx
subpd %xmm6,%xmm10 # ((1 + (-t)) - r) - x*xx
subsd %xmm7,%xmm11 # ((1 + (-t)) - r) - x*xx
addpd %xmm10,%xmm4 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd %xmm11,%xmm9 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd p_temp1(%rsp),%xmm5 # +xx
subpd .L__real_3ff0000000000000(%rip),%xmm12 # t recalculate, -t = r-1
subsd .L__real_3ff0000000000000(%rip),%xmm13 # t recalculate, -t = r-1
subpd %xmm12,%xmm4 # + t
subsd %xmm13,%xmm9 # + t
addsd %xmm1,%xmm5 # +x
movlhps %xmm9,%xmm5 # pack result: low=sin, high=cos
jmp .L__vrd4_sin_cleanup
.align 16
.Lcossin_sinsin_piby4: # Derived from sincos_sinsin
#----------------------------------------------------------------------
# xmm0 pair: sin in both lanes (.Lsinarray coefficients, xmm4);
# xmm1 pair: lower sin / upper cos (.Lsincosarray coefficients, xmm5).
# In: xmm0/xmm1 = reduced x, xmm2/xmm3 = x^2, xmm6/xmm7 = xx tails.
# sin(x) ~ x + (x^3*zs - 0.5*x^2*xx + xx)
# cos(x) ~ t + ((1 + (-t)) - r - x*xx + x^4*zc), r = 0.5*x^2, t = 1-r
#----------------------------------------------------------------------
movapd %xmm2,%xmm10 # x2
movapd %xmm3,%xmm11 # x2
movdqa .Lsinarray+0x50(%rip),%xmm4 # c6
movdqa .Lsincosarray+0x50(%rip),%xmm5 # c6
movapd .Lsinarray+0x20(%rip),%xmm8 # c3
movapd .Lsincosarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm11,p_temp3(%rsp) # r
movapd %xmm7,p_temp1(%rsp) # rr
movhlps %xmm11,%xmm11 # get upper r for t for cos
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subsd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0 for cos
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lsinarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lsincosarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lsinarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lsincosarray+0x10(%rip),%xmm9 # c2+x2C3
mulpd %xmm6,%xmm10 # 0.5*x2*xx
addsd .L__real_3ff0000000000000(%rip),%xmm11 # 1 + (-t) for cos
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd %xmm3,%xmm13 # x6
addpd .Lsinarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lsincosarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lsinarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lsincosarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zczs (upper zc, lower zs)
movsd %xmm3,%xmm12 # save low x2
mulsd %xmm1,%xmm12 # low x3 for sin
mulpd %xmm0, %xmm2 # x3
mulpd %xmm3, %xmm3 # high x4 for cos
movsd %xmm12,%xmm3 # xmm3 = low x3 (sin), high x4 (cos)
movhlps %xmm1,%xmm8 # upper x for cos term
# note using even reg
movlpd p_temp3+8(%rsp),%xmm13 # upper r for cos term
mulsd p_temp1+8(%rsp),%xmm8 # x * xx for upper cos term
mulsd p_temp3(%rsp),%xmm7 # xx * 0.5*x2 for lower sin term
subsd %xmm13,%xmm11 # (1 + (-t)) - r
mulpd %xmm2,%xmm4 # x3 * zs
mulpd %xmm3,%xmm5 # lower=x3 * zs
# upper=x4 * zc
movhlps %xmm5,%xmm9 # xmm9= cos, xmm5= sin
subsd %xmm7,%xmm5 # x3zs - 0.5*x2*xx
subsd %xmm8,%xmm11 # ((1 + (-t)) - r) - x*xx
subpd %xmm10,%xmm4 # x3*zs - 0.5*x2*xx
addsd %xmm11,%xmm9 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd p_temp1(%rsp),%xmm5 # +xx
addpd %xmm6,%xmm4 # +xx
subsd .L__real_3ff0000000000000(%rip),%xmm13 # -t = r-1
addsd %xmm1,%xmm5 # +x
addpd %xmm0,%xmm4 # +x
subsd %xmm13,%xmm9 # + t
movlhps %xmm9,%xmm5 # pack result: low=sin, high=cos
jmp .L__vrd4_sin_cleanup
.align 16
.Lsincos_coscos_piby4:
#----------------------------------------------------------------------
# xmm0 pair: cos in both lanes (.Lcosarray coefficients, xmm4);
# xmm1 pair: lower cos / upper sin (.Lcossinarray coefficients, xmm5).
# In: xmm0/xmm1 = reduced x, xmm2/xmm3 = x^2, xmm6/xmm7 = xx tails.
# sin(x) ~ x + (x^3*zs - 0.5*x^2*xx + xx)
# cos(x) ~ t + ((1 + (-t)) - r - x*xx + x^4*zc), r = 0.5*x^2, t = 1-r
#----------------------------------------------------------------------
movapd %xmm2,%xmm10 # r
movapd %xmm3,%xmm11 # r
movdqa .Lcosarray+0x50(%rip),%xmm4 # c6
movdqa .Lcossinarray+0x50(%rip),%xmm5 # c6
movapd .Lcosarray+0x20(%rip),%xmm8 # c3
movapd .Lcossinarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm10,p_temp2(%rsp) # r
movapd %xmm11,p_temp3(%rsp) # r
movapd %xmm7,p_temp1(%rsp) # rr
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subpd .L__real_3ff0000000000000(%rip),%xmm10 # -t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0 for cos
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lcosarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lcossinarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lcosarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lcossinarray+0x10(%rip),%xmm9 # c2+x2C3
addpd .L__real_3ff0000000000000(%rip),%xmm10 # 1 + (-t)
addsd .L__real_3ff0000000000000(%rip),%xmm11 # 1 + (-t) for cos
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd %xmm3,%xmm13 # x6
addpd .Lcosarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lcossinarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lcosarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lcossinarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zc
addpd %xmm9,%xmm5 # zszc (upper zs, lower zc)
mulpd %xmm2,%xmm2 # x4
mulpd %xmm1,%xmm3 # upper x3 for sin
mulsd %xmm1,%xmm3 # lower x4 for cos
movhlps %xmm7,%xmm8 # upper xx for sin term
# note using even reg
movapd p_temp2(%rsp),%xmm12 # r
movlpd p_temp3(%rsp),%xmm13 # lower r for cos term
mulpd %xmm0,%xmm6 # x * xx
mulpd %xmm1,%xmm7 # x * xx for lower cos term
mulsd p_temp3+8(%rsp),%xmm8 # xx * 0.5*x2 for upper sin term
subpd %xmm12,%xmm10 # (1 + (-t)) - r
subsd %xmm13,%xmm11 # (1 + (-t)) - r
mulpd %xmm2,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # lower=x4 * zc
# upper=x3 * zs
movhlps %xmm5,%xmm9 # xmm9= sin, xmm5= cos
subsd %xmm8,%xmm9 # x3zs - 0.5*x2*xx
subpd %xmm6,%xmm10 # ((1 + (-t)) - r) - x*xx
subsd %xmm7,%xmm11 # ((1 + (-t)) - r) - x*xx
addpd %xmm10,%xmm4 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd %xmm11,%xmm5 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd p_temp1+8(%rsp),%xmm9 # +xx
movhlps %xmm1,%xmm1 # upper x for sin
subpd .L__real_3ff0000000000000(%rip),%xmm12 # -t = r-1
subsd .L__real_3ff0000000000000(%rip),%xmm13 # -t = r-1
subpd %xmm12,%xmm4 # + t
subsd %xmm13,%xmm5 # + t
addsd %xmm1, %xmm9 # +x
movlhps %xmm9, %xmm5 # pack result: low=cos, high=sin
jmp .L__vrd4_sin_cleanup
.align 16
.Lsincos_sinsin_piby4: # Derived from sincos_coscos
#----------------------------------------------------------------------
# xmm0 pair: sin in both lanes (.Lsinarray coefficients, xmm4);
# xmm1 pair: lower cos / upper sin (.Lcossinarray coefficients, xmm5).
# In: xmm0/xmm1 = reduced x, xmm2/xmm3 = x^2, xmm6/xmm7 = xx tails.
# sin(x) ~ x + (x^3*zs - 0.5*x^2*xx + xx)
# cos(x) ~ t + ((1 + (-t)) - r - x*xx + x^4*zc), r = 0.5*x^2, t = 1-r
#----------------------------------------------------------------------
movapd %xmm2,%xmm10 # r
movapd %xmm3,%xmm11 # r
movdqa .Lsinarray+0x50(%rip),%xmm4 # c6
movdqa .Lcossinarray+0x50(%rip),%xmm5 # c6
movapd .Lsinarray+0x20(%rip),%xmm8 # c3
movapd .Lcossinarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm11,p_temp3(%rsp) # r
movapd %xmm7,p_temp1(%rsp) # rr
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subsd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0 for cos
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lsinarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lcossinarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lsinarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lcossinarray+0x10(%rip),%xmm9 # c2+x2C3
mulpd %xmm6,%xmm10 # 0.5x2*xx
addsd .L__real_3ff0000000000000(%rip),%xmm11 # 1 + (-t) for cos
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd %xmm3,%xmm13 # x6
addpd .Lsinarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lcossinarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lsinarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lcossinarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zszc (upper zs, lower zc)
mulpd %xmm0,%xmm2 # x3
mulpd %xmm1,%xmm3 # upper x3 for sin
mulsd %xmm1,%xmm3 # lower x4 for cos
movhlps %xmm7,%xmm8 # upper xx for sin term
# note using even reg
movlpd p_temp3(%rsp),%xmm13 # lower r for cos term
mulpd %xmm1,%xmm7 # x * xx for lower cos term
mulsd p_temp3+8(%rsp),%xmm8 # xx * 0.5*x2 for upper sin term
subsd %xmm13,%xmm11 # (1 + (-t)) - r
mulpd %xmm2,%xmm4 # x3 * zs
mulpd %xmm3,%xmm5 # lower=x4 * zc
# upper=x3 * zs
movhlps %xmm5,%xmm9 # xmm9= sin, xmm5= cos
subsd %xmm8,%xmm9 # x3zs - 0.5*x2*xx
subsd %xmm7,%xmm11 # ((1 + (-t)) - r) - x*xx
subpd %xmm10,%xmm4 # x3*zs - 0.5*x2*xx
addsd %xmm11,%xmm5 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd p_temp1+8(%rsp),%xmm9 # +xx
movhlps %xmm1,%xmm1 # upper x for sin
addpd %xmm6,%xmm4 # +xx
subsd .L__real_3ff0000000000000(%rip),%xmm13 # -t = r-1
addsd %xmm1,%xmm9 # +x
addpd %xmm0,%xmm4 # +x
subsd %xmm13,%xmm5 # + t
movlhps %xmm9,%xmm5 # pack result: low=cos, high=sin
jmp .L__vrd4_sin_cleanup
.align 16
.Lsinsin_cossin_piby4: # Derived from sincos_sinsin
#----------------------------------------------------------------------
# xmm0 pair: lower sin / upper cos (.Lsincosarray coefficients, xmm4);
# xmm1 pair: sin in both lanes (.Lsinarray coefficients, xmm5).
# In: xmm0/xmm1 = reduced x, xmm2/xmm3 = x^2, xmm6/xmm7 = xx tails.
# sin(x) ~ x + (x^3*zs - 0.5*x^2*xx + xx)
# cos(x) ~ t + ((1 + (-t)) - r - x*xx + x^4*zc), r = 0.5*x^2, t = 1-r
#----------------------------------------------------------------------
movapd %xmm2,%xmm10 # x2
movapd %xmm3,%xmm11 # x2
movdqa .Lsincosarray+0x50(%rip),%xmm4 # c6
movdqa .Lsinarray+0x50(%rip),%xmm5 # c6
movapd .Lsincosarray+0x20(%rip),%xmm8 # c3
movapd .Lsinarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm10,p_temp2(%rsp) # x2
movapd %xmm6,p_temp(%rsp) # xx
movhlps %xmm10,%xmm10 # get upper r for t for cos
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subsd .L__real_3ff0000000000000(%rip),%xmm10 # -t=r-1.0 for cos
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lsincosarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lsinarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lsincosarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lsinarray+0x10(%rip),%xmm9 # c2+x2C3
mulpd %xmm7,%xmm11 # 0.5*x2*xx
addsd .L__real_3ff0000000000000(%rip),%xmm10 # 1 + (-t) for cos
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd %xmm3,%xmm13 # x6
addpd .Lsincosarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lsinarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lsincosarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lsinarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zczs (upper zc, lower zs)
addpd %xmm9,%xmm5 # zs
movsd %xmm2,%xmm13 # save low x2
mulsd %xmm0,%xmm13 # low x3 for sin
mulpd %xmm1,%xmm3 # x3
mulpd %xmm2,%xmm2 # high x4 for cos
movsd %xmm13,%xmm2 # xmm2 = low x3 (sin), high x4 (cos)
movhlps %xmm0,%xmm9 # upper x for cos term ; note using even reg
movlpd p_temp2+8(%rsp),%xmm12 # upper r for cos term
mulsd p_temp+8(%rsp),%xmm9 # x * xx for upper cos term
mulsd p_temp2(%rsp),%xmm6 # xx * 0.5*x2 for lower sin term
subsd %xmm12,%xmm10 # (1 + (-t)) - r
mulpd %xmm3,%xmm5 # x3 * zs
mulpd %xmm2,%xmm4 # lower=x3 * zs
# upper=x4 * zc
movhlps %xmm4,%xmm8 # xmm8= cos, xmm4= sin
subsd %xmm6,%xmm4 # x3zs - 0.5*x2*xx
subsd %xmm9,%xmm10 # ((1 + (-t)) - r) - x*xx
subpd %xmm11,%xmm5 # x3*zs - 0.5*x2*xx
addsd %xmm10,%xmm8 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd p_temp(%rsp),%xmm4 # +xx
addpd %xmm7,%xmm5 # +xx
subsd .L__real_3ff0000000000000(%rip),%xmm12 # -t = r-1
addsd %xmm0,%xmm4 # +x
addpd %xmm1,%xmm5 # +x
subsd %xmm12,%xmm8 # + t
movlhps %xmm8,%xmm4 # pack result: low=sin, high=cos
jmp .L__vrd4_sin_cleanup
.align 16
.Lsinsin_sincos_piby4: # Derived from sincos_coscos
#----------------------------------------------------------------------
# xmm0 pair: lower cos / upper sin (.Lcossinarray coefficients, xmm4);
# xmm1 pair: sin in both lanes (.Lsinarray coefficients, xmm5).
# In: xmm0/xmm1 = reduced x, xmm2/xmm3 = x^2, xmm6/xmm7 = xx tails.
# sin(x) ~ x + (x^3*zs - 0.5*x^2*xx + xx)
# cos(x) ~ t + ((1 + (-t)) - r - x*xx + x^4*zc), r = 0.5*x^2, t = 1-r
#----------------------------------------------------------------------
movapd %xmm2,%xmm10 # x2
movapd %xmm3,%xmm11 # x2
movdqa .Lcossinarray+0x50(%rip),%xmm4 # c6
movdqa .Lsinarray+0x50(%rip),%xmm5 # c6
movapd .Lcossinarray+0x20(%rip),%xmm8 # c3
movapd .Lsinarray+0x20(%rip),%xmm9 # c3
mulpd .L__real_3fe0000000000000(%rip),%xmm10 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
movapd %xmm10,p_temp2(%rsp) # r
movapd %xmm6,p_temp(%rsp) # rr
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
subsd .L__real_3ff0000000000000(%rip),%xmm10 # -t=r-1.0 for cos
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
addpd .Lcossinarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lsinarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lcossinarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lsinarray+0x10(%rip),%xmm9 # c2+x2C3
mulpd %xmm7,%xmm11 # 0.5x2*xx
addsd .L__real_3ff0000000000000(%rip),%xmm10 # 1 + (-t) for cos
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd %xmm2,%xmm12 # x6
mulpd %xmm3,%xmm13 # x6
addpd .Lcossinarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lsinarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lcossinarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lsinarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm12,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm13,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zszc (upper zs, lower zc)
mulpd %xmm1,%xmm3 # x3
mulpd %xmm0,%xmm2 # upper x3 for sin
mulsd %xmm0,%xmm2 # lower x4 for cos
movhlps %xmm6,%xmm9 # upper xx for sin term
# note using even reg
movlpd p_temp2(%rsp),%xmm12 # lower r for cos term
mulpd %xmm0,%xmm6 # x * xx for lower cos term
mulsd p_temp2+8(%rsp),%xmm9 # xx * 0.5*x2 for upper sin term
subsd %xmm12,%xmm10 # (1 + (-t)) - r
mulpd %xmm3,%xmm5 # x3 * zs
mulpd %xmm2,%xmm4 # lower=x4 * zc
# upper=x3 * zs
movhlps %xmm4,%xmm8 # xmm8= sin, xmm4= cos
subsd %xmm9,%xmm8 # x3zs - 0.5*x2*xx
subsd %xmm6,%xmm10 # ((1 + (-t)) - r) - x*xx
subpd %xmm11,%xmm5 # x3*zs - 0.5*x2*xx
addsd %xmm10,%xmm4 # x4*zc + (((1 + (-t)) - r) - x*xx)
addsd p_temp+8(%rsp),%xmm8 # +xx
movhlps %xmm0,%xmm0 # upper x for sin
addpd %xmm7,%xmm5 # +xx
subsd .L__real_3ff0000000000000(%rip),%xmm12 # -t = r-1
addsd %xmm0,%xmm8 # +x
addpd %xmm1,%xmm5 # +x
subsd %xmm12,%xmm4 # + t
movlhps %xmm8,%xmm4 # pack result: low=cos, high=sin
jmp .L__vrd4_sin_cleanup
.align 16
.Lsinsin_sinsin_piby4:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm0 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#DEBUG
# xorpd %xmm0, %xmm0
# xorpd %xmm1, %xmm1
# jmp .Lfinal_check
#DEBUG
#----------------------------------------------------------------------
# All four lanes take the sin path (.Lsinarray coefficients for both
# xmm4 and xmm5); no cos correction terms are needed here.
# sin(x) ~ x + (x^3*zs - 0.5*x^2*xx + xx), where xx (xmm6/xmm7) is
# the tail of the argument reduction.
#----------------------------------------------------------------------
movapd %xmm2,%xmm10 # x2
movapd %xmm3,%xmm11 # x2
movdqa .Lsinarray+0x50(%rip),%xmm4 # c6
movdqa .Lsinarray+0x50(%rip),%xmm5 # c6
movapd .Lsinarray+0x20(%rip),%xmm8 # c3
movapd .Lsinarray+0x20(%rip),%xmm9 # c3
movapd %xmm2,p_temp2(%rsp) # copy of x2
movapd %xmm3,p_temp3(%rsp) # copy of x2
mulpd %xmm2,%xmm4 # c6*x2
mulpd %xmm3,%xmm5 # c6*x2
mulpd %xmm2,%xmm8 # c3*x2
mulpd %xmm3,%xmm9 # c3*x2
mulpd %xmm2,%xmm10 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lsinarray+0x40(%rip),%xmm4 # c5+x2c6
addpd .Lsinarray+0x40(%rip),%xmm5 # c5+x2c6
addpd .Lsinarray+0x10(%rip),%xmm8 # c2+x2C3
addpd .Lsinarray+0x10(%rip),%xmm9 # c2+x2C3
mulpd %xmm2,%xmm10 # x6
mulpd %xmm3,%xmm11 # x6
mulpd %xmm2,%xmm4 # x2(c5+x2c6)
mulpd %xmm3,%xmm5 # x2(c5+x2c6)
mulpd %xmm2,%xmm8 # x2(c2+x2C3)
mulpd %xmm3,%xmm9 # x2(c2+x2C3)
mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5 *x2
addpd .Lsinarray+0x30(%rip),%xmm4 # c4 + x2(c5+x2c6)
addpd .Lsinarray+0x30(%rip),%xmm5 # c4 + x2(c5+x2c6)
addpd .Lsinarray(%rip),%xmm8 # c1 + x2(c2+x2C3)
addpd .Lsinarray(%rip),%xmm9 # c1 + x2(c2+x2C3)
mulpd %xmm6,%xmm2 # 0.5 * x2 *xx
mulpd %xmm7,%xmm3 # 0.5 * x2 *xx
mulpd %xmm10,%xmm4 # x6(c4 + x2(c5+x2c6))
mulpd %xmm11,%xmm5 # x6(c4 + x2(c5+x2c6))
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zs
movapd p_temp2(%rsp),%xmm10 # x2
movapd p_temp3(%rsp),%xmm11 # x2
mulpd %xmm0,%xmm10 # x3
mulpd %xmm1,%xmm11 # x3
mulpd %xmm10,%xmm4 # x3 * zs
mulpd %xmm11,%xmm5 # x3 * zs
subpd %xmm2,%xmm4 # -0.5 * x2 *xx
subpd %xmm3,%xmm5 # -0.5 * x2 *xx
addpd %xmm6,%xmm4 # +xx
addpd %xmm7,%xmm5 # +xx
addpd %xmm0,%xmm4 # +x
addpd %xmm1,%xmm5 # +x
jmp .L__vrd4_sin_cleanup