# blob: ab590580aa5f3037f2ae407ff08f1714851d0f53 [file] [log] [blame]
#
# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
#
# This file is part of libacml_mv.
#
# libacml_mv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# libacml_mv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with libacml_mv. If not, see
# <http://www.gnu.org/licenses/>.
#
#
#
# vrs4_cosf.s
#
# A vector implementation of the cosf libm function.
#
# Prototype:
#
# __m128 __vrs4_cosf(__m128 x);
#
# Computes the Cosine of each of four packed single-precision inputs.
# Does not perform error checking.
# Denormal inputs may produce unexpected results.
# This routine computes 4 single precision Cosine values at a time.
# The four input values are passed as packed singles in xmm0 (the code
# below reads xmm0); the four results are returned as packed singles
# in an xmm register.
# Note that this represents a non-standard ABI usage, as no ABI
# ( and indeed C) currently allows returning 4 values for a function.
# It is expected that some compilers may be able to take advantage of this
# interface when implementing vectorized loops. Using the array implementation
# of the routine requires putting the inputs into memory, and retrieving
# the results from memory. This routine eliminates the need for this
# overhead if the data does not already reside in memory.
# Author: Harsha Jagasia
# Email: harsha.jagasia@amd.com
#ifdef __ELF__
# Emit an empty .note.GNU-stack section so the linker marks the stack
# non-executable on ELF targets.
.section .note.GNU-stack,"",@progbits
#endif
.data
.align 64
# Double-precision constants, each duplicated into both qwords so they can
# be used directly by packed (pd) instructions.
.L__real_7fffffffffffffff: .quad 0x07fffffffffffffff #Sign bit zero
.quad 0x07fffffffffffffff
.L__real_3ff0000000000000: .quad 0x03ff0000000000000 # 1.0
.quad 0x03ff0000000000000
.L__real_v2p__27: .quad 0x03e40000000000000 # 2p-27
.quad 0x03e40000000000000
.L__real_3fe0000000000000: .quad 0x03fe0000000000000 # 0.5
.quad 0x03fe0000000000000
.L__real_3fc5555555555555: .quad 0x03fc5555555555555 # 0.166666666666
.quad 0x03fc5555555555555
.L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883 # twobypi
.quad 0x03fe45f306dc9c883
.L__real_3ff921fb54400000: .quad 0x03ff921fb54400000 # piby2_1
.quad 0x03ff921fb54400000
.L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331 # piby2_1tail
.quad 0x03dd0b4611a626331
.L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000 # piby2_2
.quad 0x03dd0b4611a600000
.L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073 # piby2_2tail
.quad 0x03ba3198a2e037073
.L__real_fffffffff8000000: .quad 0x0fffffffff8000000 # mask for stripping head and tail
.quad 0x0fffffffff8000000
.L__real_8000000000000000: .quad 0x08000000000000000 # -0 or signbit
.quad 0x08000000000000000
# Integer bit patterns used for region/sign bookkeeping (low qword only).
.L__reald_one_one: .quad 0x00000000100000001 # 1 in each 32-bit half
.quad 0
.L__reald_two_two: .quad 0x00000000200000002 # 2 in each 32-bit half
.quad 0
.L__reald_one_zero: .quad 0x00000000100000000 # sin_cos_filter
.quad 0
.L__reald_zero_one: .quad 0x00000000000000001 # 1 in low 32 bits only
.quad 0
.L__reald_two_zero: .quad 0x00000000200000000 # 2 in high 32 bits only
.quad 0
.L__realq_one_one: .quad 0x00000000000000001 # integer 1 in each qword
.quad 0x00000000000000001 #
.L__realq_two_two: .quad 0x00000000000000002 # integer 2 in each qword
.quad 0x00000000000000002 #
.L__real_1_x_mask: .quad 0x0ffffffffffffffff # all-ones mask (low), 1.0 (high)
.quad 0x03ff0000000000000 #
.L__real_zero: .quad 0x00000000000000000 # 0 in each qword
.quad 0x00000000000000000 #
.L__real_one: .quad 0x00000000000000001 # integer 1 (bit pattern, not 1.0)
.quad 0x00000000000000001 #
# Minimax polynomial coefficients for cos(r) ~ 1 - 0.5*r^2 + r^4*poly(c)
.Lcosarray:
.quad 0x03FA5555555502F31 # 0.0416667 c1
.quad 0x03FA5555555502F31
.quad 0x0BF56C16BF55699D7 # -0.00138889 c2
.quad 0x0BF56C16BF55699D7
.quad 0x03EFA015C50A93B49 # 2.48016e-005 c3
.quad 0x03EFA015C50A93B49
.quad 0x0BE92524743CC46B8 # -2.75573e-007 c4
.quad 0x0BE92524743CC46B8
# Minimax polynomial coefficients for sin(r) ~ r + r^3*poly(s)
.Lsinarray:
.quad 0x0BFC555555545E87D # -0.166667 s1
.quad 0x0BFC555555545E87D
.quad 0x03F811110DF01232D # 0.00833333 s2
.quad 0x03F811110DF01232D
.quad 0x0BF2A013A88A37196 # -0.000198413 s3
.quad 0x0BF2A013A88A37196
.quad 0x03EC6DBE4AD1572D5 # 2.75573e-006 s4
.quad 0x03EC6DBE4AD1572D5
# Interleaved coefficients: sin in the low qword, cos in the high qword.
.Lsincosarray:
.quad 0x0BFC555555545E87D # -0.166667 s1
.quad 0x03FA5555555502F31 # 0.0416667 c1
.quad 0x03F811110DF01232D # 0.00833333 s2
.quad 0x0BF56C16BF55699D7
.quad 0x0BF2A013A88A37196 # -0.000198413 s3
.quad 0x03EFA015C50A93B49
.quad 0x03EC6DBE4AD1572D5 # 2.75573e-006 s4
.quad 0x0BE92524743CC46B8
# Interleaved coefficients: cos in the low qword, sin in the high qword.
.Lcossinarray:
.quad 0x03FA5555555502F31 # 0.0416667 c1
.quad 0x0BFC555555545E87D # -0.166667 s1
.quad 0x0BF56C16BF55699D7 # c2
.quad 0x03F811110DF01232D
.quad 0x03EFA015C50A93B49 # c3
.quad 0x0BF2A013A88A37196
.quad 0x0BE92524743CC46B8 # c4
.quad 0x03EC6DBE4AD1572D5
.align 64
# 16-entry dispatch table, indexed by the 4-bit cos/sin selector built from
# the region values of the two double-pairs (see the "jmp *(%rcx,%rax,8)"
# sites).  Each entry evaluates the appropriate cos/cos, cos/sin, sin/cos or
# sin/sin polynomial combination for arguments already reduced to |r|<pi/4.
.Levencos_oddsin_tbl:
.quad .Lcoscos_coscos_piby4 # 0 * ; Done
.quad .Lcoscos_cossin_piby4 # 1 + ; Done
.quad .Lcoscos_sincos_piby4 # 2 ; Done
.quad .Lcoscos_sinsin_piby4 # 3 + ; Done
.quad .Lcossin_coscos_piby4 # 4 ; Done
.quad .Lcossin_cossin_piby4 # 5 * ; Done
.quad .Lcossin_sincos_piby4 # 6 ; Done
.quad .Lcossin_sinsin_piby4 # 7 ; Done
.quad .Lsincos_coscos_piby4 # 8 ; Done
.quad .Lsincos_cossin_piby4 # 9 ; TBD
.quad .Lsincos_sincos_piby4 # 10 * ; Done
.quad .Lsincos_sinsin_piby4 # 11 ; Done
.quad .Lsinsin_coscos_piby4 # 12 ; Done
.quad .Lsinsin_cossin_piby4 # 13 + ; Done
.quad .Lsinsin_sincos_piby4 # 14 ; Done
.quad .Lsinsin_sinsin_piby4 # 15 * ; Done
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.text
.align 16
.p2align 4,,15
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# define local variable storage offsets
# Offsets into the 0x1E8-byte stack frame allocated by __vrs4_cosf.
.equ p_temp,0 # scratch for xmm<->GPR bit transfers
.equ p_temp1,0x10 # scratch for xmm<->GPR bit transfers
.equ save_xmm6,0x20 # spill slot for xmm6
.equ save_xmm7,0x30 # spill slot for xmm7
.equ save_xmm8,0x40 # spill slot for xmm8
.equ save_xmm9,0x50 # spill slot for xmm9
.equ save_xmm0,0x60 # spill slot (NOTE(review): sequence suggests xmm10)
.equ save_xmm11,0x70 # spill slot for xmm11
.equ save_xmm12,0x80 # spill slot for xmm12
.equ save_xmm13,0x90 # spill slot for xmm13
.equ save_xmm14,0x0A0 # spill slot for xmm14
.equ save_xmm15,0x0B0 # spill slot for xmm15
.equ r,0x0C0 # r for lower pair; address passed to __remainder_piby2d2f
.equ rr,0x0D0 # rr (tail of r) for lower pair
.equ region,0x0E0 # region words for lower pair
.equ r1,0x0F0 # r for upper pair; address passed to __remainder_piby2d2f
.equ rr1,0x0100 # rr (tail of r) for upper pair
.equ region1,0x0110 # region words for upper pair
.equ p_temp2,0x0120 # scratch for saving regs across calls
.equ p_temp3,0x0130 # scratch for saving regs across calls
.equ p_temp4,0x0140 # scratch for saving regs across calls
.equ p_temp5,0x0150 # scratch for saving regs across calls
.equ p_original,0x0160 # original x (lower pair; not used in this view)
.equ p_mask,0x0170 # mask (lower pair; not used in this view)
.equ p_sign,0x0180 # computed sign bits for lower pair
.equ p_original1,0x0190 # original x (upper pair; not used in this view)
.equ p_mask1,0x01A0 # mask (upper pair; not used in this view)
.equ p_sign1,0x01B0 # computed sign bits for upper pair
.equ save_r12,0x01C0 # spill slot for callee-saved r12
.equ save_r13,0x01D0 # spill slot for callee-saved r13
.globl __vrs4_cosf
.type __vrs4_cosf,@function
#---------------------------------------------------------------------------
# __m128 __vrs4_cosf(__m128 x)   (non-standard vector ABI, SysV AMD64 base)
# In:   xmm0 = four packed single-precision arguments
# Out:  four packed single-precision cosines (see cleanup code)
# Uses: r12/r13 are saved/restored in the frame; volatile GPRs and xmm regs
#       are clobbered freely; 0x1E8-byte stack frame.
# The inputs are widened to two pairs of doubles: xmm10 = lower pair,
# xmm1 = upper pair; all reduction below is done in double precision.
#---------------------------------------------------------------------------
__vrs4_cosf:
sub $0x01E8,%rsp
#DEBUG
# mov %r12,save_r12(%rsp) # save r12
# mov %r13,save_r13(%rsp) # save r13
# mov save_r12(%rsp),%r12 # restore r12
# mov save_r13(%rsp),%r13 # restore r13
# add $0x01E8,%rsp
# ret
#DEBUG
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov %r12,save_r12(%rsp) # save r12
mov %r13,save_r13(%rsp) # save r13
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#STARTMAIN
movhlps %xmm0,%xmm8 # xmm8 = upper two singles of the input
cvtps2pd %xmm0,%xmm10 # convert input to double.
cvtps2pd %xmm8,%xmm1 # convert input to double.
movdqa %xmm10,%xmm6 # keep signed copies of both pairs
movdqa %xmm1,%xmm7
movapd .L__real_7fffffffffffffff(%rip),%xmm2
andpd %xmm2,%xmm10 #Unsign
andpd %xmm2,%xmm1 #Unsign
# Move |x| bit patterns into GPRs for the magnitude comparisons below.
movd %xmm10,%rax #rax is lower arg
movhpd %xmm10, p_temp+8(%rsp) # spill high half so it can be read as a GPR
mov p_temp+8(%rsp),%rcx #rcx = upper arg
movd %xmm1,%r8 #r8 is lower arg
movhpd %xmm1, p_temp1+8(%rsp) # spill high half so it can be read as a GPR
mov p_temp1+8(%rsp),%r9 #r9 = upper arg
# Compare |x| vs x as packed 32-bit ints (negative-input detection
# scaffolding; xmm12/xmm13 hold the folded results).
movdqa %xmm10,%xmm12
movdqa %xmm1,%xmm13
pcmpgtd %xmm6,%xmm12
pcmpgtd %xmm7,%xmm13
movdqa %xmm12,%xmm6
movdqa %xmm13,%xmm7
psrldq $4,%xmm12
psrldq $4,%xmm13
psrldq $8,%xmm6
psrldq $8,%xmm7
mov $0x3FE921FB54442D18,%rdx #piby4 +
mov $0x411E848000000000,%r10 #5e5 +
movapd .L__real_3fe0000000000000(%rip),%xmm4 #0.5 for later use +
por %xmm6,%xmm12
por %xmm7,%xmm13
movapd %xmm10,%xmm2 #x0
movapd %xmm1,%xmm3 #x1
movapd %xmm10,%xmm6 #x0
movapd %xmm1,%xmm7 #x1
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# xmm2 = x, xmm4 =0.5/t, xmm6 =x
# xmm3 = x, xmm5 =0.5/t, xmm7 =x
.align 16
.Leither_or_both_arg_gt_than_piby4:
# Fast path taken when all four |x| < 5e5: Cody-Waite reduction in packed
# double precision.  Any arg >= 5e5 falls out to the slow paths below, which
# call __remainder_piby2d2f for the large arguments.
cmp %r10,%rax
jae .Lfirst_or_next3_arg_gt_5e5
cmp %r10,%rcx
jae .Lsecond_or_next2_arg_gt_5e5
cmp %r10,%r8
jae .Lthird_or_fourth_arg_gt_5e5
cmp %r10,%r9
jae .Lfourth_arg_gt_5e5
# /* Find out what multiple of piby2 */
# npi2 = (int)(x * twobypi + 0.5);
movapd .L__real_3fe45f306dc9c883(%rip),%xmm10
mulpd %xmm10,%xmm2 # * twobypi
mulpd %xmm10,%xmm3 # * twobypi
addpd %xmm4,%xmm2 # +0.5, npi2
addpd %xmm4,%xmm3 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm10 # piby2_1
movapd .L__real_3ff921fb54400000(%rip),%xmm1 # piby2_1
cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers
cvttpd2dq %xmm3,%xmm5 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2
movapd .L__real_3dd0b4611a600000(%rip),%xmm9 # piby2_2
cvtdq2pd %xmm4,%xmm2 # and back to double.
cvtdq2pd %xmm5,%xmm3 # and back to double.
# /* Subtract the multiple from x to get an extra-precision remainder */
movd %xmm4,%r8 # Region
movd %xmm5,%r9 # Region
mov .L__reald_one_zero(%rip),%rdx #compare value for cossin path
mov %r8,%r10
mov %r9,%r11
# rhead = x - npi2 * piby2_1;
mulpd %xmm2,%xmm10 # npi2 * piby2_1;
mulpd %xmm3,%xmm1 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm2,%xmm8 # rtail
mulpd %xmm3,%xmm9 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm10,%xmm6 # rhead = x - npi2 * piby2_1;
subpd %xmm1,%xmm7 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm6,%xmm10 # t
movapd %xmm7,%xmm1 # t
# rhead = t - rtail;
subpd %xmm8,%xmm10 # rhead
subpd %xmm9,%xmm1 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail
mulpd .L__real_3ba3198a2e037073(%rip),%xmm3 # npi2 * piby2_2tail
subpd %xmm10,%xmm6 # t-rhead
subpd %xmm1,%xmm7 # t-rhead
subpd %xmm6,%xmm8 # - ((t - rhead) - rtail)
subpd %xmm7,%xmm9 # - ((t - rhead) - rtail)
addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
addpd %xmm3,%xmm9 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# xmm4 = npi2 (int), xmm10 =rhead, xmm8 =rtail, r8 = region, r10 = region, r12 = Sign
# xmm5 = npi2 (int), xmm1 =rhead, xmm9 =rtail, r9 = region, r11 = region, r13 = Sign
# Derive the cos sign from the region: sign = (region>>1) ^ region, bit 0
# of each 32-bit region word, expanded to a 64-bit sign mask per element.
and .L__reald_one_one(%rip),%r8 #odd/even region for cos/sin
and .L__reald_one_one(%rip),%r9 #odd/even region for cos/sin
mov %r10,%rax
mov %r11,%rcx
shr $1,%r10 #~AB+A~B, A is sign and B is upper bit of region
shr $1,%r11 #~AB+A~B, A is sign and B is upper bit of region
xor %rax,%r10
xor %rcx,%r11
and .L__reald_one_one(%rip),%r10 #(~AB+A~B)&1
and .L__reald_one_one(%rip),%r11 #(~AB+A~B)&1
mov %r10,%r12
mov %r11,%r13
and %rdx,%r12 #mask out the lower sign bit leaving the upper sign bit
and %rdx,%r13 #mask out the lower sign bit leaving the upper sign bit
shl $63,%r10 #shift lower sign bit left by 63 bits
shl $63,%r11 #shift lower sign bit left by 63 bits
shl $31,%r12 #shift upper sign bit left by 31 bits
shl $31,%r13 #shift upper sign bit left by 31 bits
mov %r10,p_sign(%rsp) #write out lower sign bit
mov %r12,p_sign+8(%rsp) #write out upper sign bit
mov %r11,p_sign1(%rsp) #write out lower sign bit
mov %r13,p_sign1+8(%rsp) #write out upper sign bit
# GET_BITS_DP64(rhead-rtail, uy); ; originally only rhead
# xmm4 = Sign, xmm10 =rhead, xmm8 =rtail
# xmm5 = Sign, xmm1 =rhead, xmm9 =rtail
movapd %xmm10,%xmm6 # rhead
movapd %xmm1,%xmm7 # rhead
subpd %xmm8,%xmm10 # r = rhead - rtail
subpd %xmm9,%xmm1 # r = rhead - rtail
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# xmm4 = Sign, xmm10 = r, xmm6 =rhead, xmm8 =rtail
# xmm5 = Sign, xmm1 = r, xmm7 =rhead, xmm9 =rtail
# Build the 4-bit dispatch index from the region parities and jump into the
# cos/sin polynomial table.
mov %r8,%rax
mov %r9,%rcx
movapd %xmm10,%xmm2 # move r for r2
movapd %xmm1,%xmm3 # move r for r2
mulpd %xmm10,%xmm2 # r2
mulpd %xmm1,%xmm3 # r2
and .L__reald_zero_one(%rip),%rax
and .L__reald_zero_one(%rip),%rcx
shr $31,%r8
shr $31,%r9
or %r8,%rax
or %r9,%rcx
shl $2,%rcx
or %rcx,%rax
leaq .Levencos_oddsin_tbl(%rip),%rcx
jmp *(%rcx,%rax,8) #Jmp table for cos/sin calculation based on even/odd region
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lfirst_or_next3_arg_gt_5e5:
# Lower arg of the first pair is >= 5e5; decide whether its partner is too.
# %rcx,,%rax r8, r9
cmp %r10,%rcx #is upper arg >= 5e5
jae .Lboth_arg_gt_5e5
.Llower_arg_gt_5e5:
# Upper Arg is < 5e5, Lower arg is >= 5e5
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
# Be sure not to use %xmm3,%xmm1 and xmm7
# Use %xmm8,,%xmm5 xmm0, xmm12
# %xmm11,,%xmm9 xmm13
movlpd %xmm10,r(%rsp) #Save lower fp arg for remainder_piby2 call
movhlps %xmm10,%xmm10 #Needed since we want to work on upper arg
movhlps %xmm2,%xmm2
movhlps %xmm6,%xmm6
# Work on Upper arg
# Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi
addsd %xmm4,%xmm2 # xmm2 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm8 # xmm8 = piby2_1
cvttsd2si %xmm2,%ecx # ecx = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm0 # xmm0 = piby2_2
cvtsi2sd %ecx,%xmm2 # xmm2 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm2,%xmm8 # npi2 * piby2_1
subsd %xmm8,%xmm6 # xmm6 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm12 # xmm12 =piby2_2tail
#t = rhead;
movsd %xmm6,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm2,%xmm0 # xmm0 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm0,%xmm6 # xmm6 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm2,%xmm12 # npi2 * piby2_2tail
subsd %xmm6,%xmm5 # t-rhead
subsd %xmm5,%xmm0 # (rtail-(t-rhead))
addsd %xmm12,%xmm0 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %ecx,region+4(%rsp) # store upper region
movsd %xmm6,%xmm10
subsd %xmm0,%xmm10 # xmm10 = r=(rhead-rtail)
subsd %xmm10,%xmm6 # rr=rhead-r
subsd %xmm0,%xmm6 # xmm6 = rr=((rhead-r) -rtail)
movlpd %xmm10,r+8(%rsp) # store upper r
movlpd %xmm6,rr+8(%rsp) # store upper rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#Note that volatiles will be trashed by the call
#We will construct r, rr, region and sign
# Work on Lower arg
mov $0x07ff0000000000000,%r11 # is lower arg nan/inf
mov %r11,%r10
and %rax,%r10
cmp %r11,%r10
jz .L__vrs4_cosf_lower_naninf
# Save live registers across the call (all volatiles may be clobbered).
mov %r8,p_temp(%rsp)
mov %r9,p_temp2(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region(%rsp),%rdx # lower arg is **NOT** nan/inf
lea r(%rsp),%rsi
# changed input from xmm10 to xmm0
mov r(%rsp),%rdi #Restore lower fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
mov p_temp(%rsp),%r8
mov p_temp2(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
jmp 0f
.L__vrs4_cosf_lower_naninf:
# Propagate the nan/inf: r = x with the quiet bit set, region = 0.
mov $0x00008000000000000,%r11
or %r11,%rax
mov %rax,r(%rsp) # r = x | 0x0008000000000000
mov %r10d,region(%rsp) # region =0
.align 16
0:
jmp .Lcheck_next2_args
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lboth_arg_gt_5e5:
#Upper Arg is >= 5e5, Lower arg is >= 5e5
# %rcx,,%rax r8, r9
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
# Both halves of the first pair need the full remainder routine; each half
# is checked for nan/inf first so the call is skipped for those inputs.
movhlps %xmm10,%xmm6 #Save upper fp arg for remainder_piby2 call
mov $0x07ff0000000000000,%r11 #is lower arg nan/inf
mov %r11,%r10
and %rax,%r10
cmp %r11,%r10
jz .L__vrs4_cosf_lower_naninf_of_both_gt_5e5
# Save live registers across the call (all volatiles may be clobbered).
mov %rcx,p_temp(%rsp) #Save upper arg
mov %r8,p_temp2(%rsp)
mov %r9,p_temp4(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region(%rsp),%rdx #lower arg is **NOT** nan/inf
lea r(%rsp),%rsi
# added ins- changed input from xmm10 to xmm0
movd %xmm10,%rdi
call __remainder_piby2d2f@PLT
mov p_temp2(%rsp),%r8
mov p_temp4(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
mov p_temp(%rsp),%rcx #Restore upper arg
jmp 0f
.L__vrs4_cosf_lower_naninf_of_both_gt_5e5: #lower arg is nan/inf
# Propagate the nan/inf: r = x with the quiet bit set, region = 0.
mov $0x00008000000000000,%r11
or %r11,%rax
mov %rax,r(%rsp) #r = x | 0x0008000000000000
mov %r10d,region(%rsp) #region = 0
.align 16
0:
mov $0x07ff0000000000000,%r11 #is upper arg nan/inf
mov %r11,%r10
and %rcx,%r10
cmp %r11,%r10
jz .L__vrs4_cosf_upper_naninf_of_both_gt_5e5
mov %r8,p_temp2(%rsp)
mov %r9,p_temp4(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region+4(%rsp),%rdx #upper arg is **NOT** nan/inf
lea r+8(%rsp),%rsi
# changed input from xmm10 to xmm0
movd %xmm6,%rdi #Restore upper fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
mov p_temp2(%rsp),%r8
mov p_temp4(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
jmp 0f
.L__vrs4_cosf_upper_naninf_of_both_gt_5e5:
# Propagate the nan/inf: r = x with the quiet bit set, region = 0.
mov $0x00008000000000000,%r11
or %r11,%rcx
mov %rcx,r+8(%rsp) #r = x | 0x0008000000000000
mov %r10d,region+4(%rsp) #region = 0
.align 16
0:
jmp .Lcheck_next2_args
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lsecond_or_next2_arg_gt_5e5:
# Upper Arg is >= 5e5, Lower arg is < 5e5
# %rcx,,%rax r8, r9
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
# Do not use %xmm3,,%xmm1 xmm7
# Restore xmm4 and %xmm3,,%xmm1 xmm7
# Can use %xmm0,,%xmm8 xmm12
# %xmm9,,%xmm5 xmm11, xmm13
movhpd %xmm10,r+8(%rsp) #Save upper fp arg for remainder_piby2 call
# Work on Lower arg
# Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi
addsd %xmm4,%xmm2 # xmm2 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm8 # xmm8 = piby2_1
cvttsd2si %xmm2,%eax # eax = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm0 # xmm0 = piby2_2
cvtsi2sd %eax,%xmm2 # xmm2 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm2,%xmm8 # npi2 * piby2_1
subsd %xmm8,%xmm6 # xmm6 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm12 # xmm12 =piby2_2tail
#t = rhead;
movsd %xmm6,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm2,%xmm0 # xmm0 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm0,%xmm6 # xmm6 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm2,%xmm12 # npi2 * piby2_2tail
subsd %xmm6,%xmm5 # t-rhead
subsd %xmm5,%xmm0 # (rtail-(t-rhead))
addsd %xmm12,%xmm0 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %eax,region(%rsp) # store lower region
subsd %xmm0,%xmm6 # xmm6 = r=(rhead-rtail)
movlpd %xmm6,r(%rsp) # store lower r
#Work on Upper arg
#Note that volatiles will be trashed by the call
#We do not care since this is the last check
#We will construct r, rr, region and sign
mov $0x07ff0000000000000,%r11 # is upper arg nan/inf
mov %r11,%r10
and %rcx,%r10
cmp %r11,%r10
jz .L__vrs4_cosf_upper_naninf
# Save live registers across the call (all volatiles may be clobbered).
mov %r8,p_temp(%rsp)
mov %r9,p_temp2(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region+4(%rsp),%rdx # upper arg is **NOT** nan/inf
lea r+8(%rsp),%rsi
# changed input from xmm10 to xmm0
mov r+8(%rsp),%rdi #Restore upper fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
mov p_temp(%rsp),%r8
mov p_temp2(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
jmp 0f
.L__vrs4_cosf_upper_naninf:
# Propagate the nan/inf: r = x with the quiet bit set, region = 0.
mov $0x00008000000000000,%r11
or %r11,%rcx
mov %rcx,r+8(%rsp) # r = x | 0x0008000000000000
mov %r10d,region+4(%rsp) # region =0
.align 16
0:
jmp .Lcheck_next2_args
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcheck_next2_args:
# First pair is done (r/region filled in); now reduce the second pair
# (xmm1/xmm3/xmm7) the same way, branching out if either arg >= 5e5.
mov $0x411E848000000000,%r10 #5e5 +
cmp %r10,%r8
jae .Lfirst_second_done_third_or_fourth_arg_gt_5e5
cmp %r10,%r9
jae .Lfirst_second_done_fourth_arg_gt_5e5
# Work on next two args, both < 5e5
# %xmm3,,%xmm1 xmm5 = x, xmm4 = 0.5
movapd .L__real_3fe0000000000000(%rip),%xmm4 #Restore 0.5
mulpd .L__real_3fe45f306dc9c883(%rip),%xmm3 # * twobypi
addpd %xmm4,%xmm3 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm1 # piby2_1
cvttpd2dq %xmm3,%xmm5 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm9 # piby2_2
cvtdq2pd %xmm5,%xmm3 # and back to double.
###
# /* Subtract the multiple from x to get an extra-precision remainder */
movlpd %xmm5,region1(%rsp) # Region
###
# rhead = x - npi2 * piby2_1;
mulpd %xmm3,%xmm1 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm3,%xmm9 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm1,%xmm7 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm7,%xmm1 # t
# rhead = t - rtail;
subpd %xmm9,%xmm1 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm3 # npi2 * piby2_2tail
subpd %xmm1,%xmm7 # t-rhead
subpd %xmm7,%xmm9 # - ((t - rhead) - rtail)
addpd %xmm3,%xmm9 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subpd %xmm9,%xmm1 # r = rhead - rtail
movapd %xmm1,r1(%rsp)
jmp .L__vrs4_cosf_reconstruct
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lthird_or_fourth_arg_gt_5e5:
#first two args are < 5e5, third arg >= 5e5, fourth arg >= 5e5 or < 5e5
# %rcx,,%rax r8, r9
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
# Do not use %xmm3,,%xmm1 xmm7
# Can use %xmm11,,%xmm9 xmm13
# %xmm8,,%xmm5 xmm0, xmm12
# Restore xmm4
# Work on first two args, both < 5e5
mulpd .L__real_3fe45f306dc9c883(%rip),%xmm2 # * twobypi
addpd %xmm4,%xmm2 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm10 # piby2_1
cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2
cvtdq2pd %xmm4,%xmm2 # and back to double.
###
# /* Subtract the multiple from x to get an extra-precision remainder */
movlpd %xmm4,region(%rsp) # Region
###
# rhead = x - npi2 * piby2_1;
mulpd %xmm2,%xmm10 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm2,%xmm8 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm10,%xmm6 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm6,%xmm10 # t
# rhead = t - rtail;
subpd %xmm8,%xmm10 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail
subpd %xmm10,%xmm6 # t-rhead
subpd %xmm6,%xmm8 # - ((t - rhead) - rtail)
addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subpd %xmm8,%xmm10 # r = rhead - rtail
movapd %xmm10,r(%rsp)
# Work on next two args, third arg >= 5e5, fourth arg >= 5e5 or < 5e5
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.Lfirst_second_done_third_or_fourth_arg_gt_5e5:
# %rcx,,%rax r8, r9
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
mov $0x411E848000000000,%r10 #5e5 +
cmp %r10,%r9
jae .Lboth_arg_gt_5e5_higher
# Upper Arg is <5e5, Lower arg is >= 5e5
# %r9,%r8
# %xmm3,,%xmm1 xmm7 = x, xmm4 = 0.5
movlpd %xmm1,r1(%rsp) #Save lower fp arg for remainder_piby2 call
movhlps %xmm1,%xmm1 #Needed since we want to work on upper arg
movhlps %xmm3,%xmm3
movhlps %xmm7,%xmm7
# Work on Upper arg
# Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs
movapd .L__real_3fe0000000000000(%rip),%xmm4 # Restore 0.5
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm3 # x*twobypi
addsd %xmm4,%xmm3 # xmm3 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm2 # xmm2 = piby2_1
cvttsd2si %xmm3,%r9d # r9d = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm10 # xmm10 = piby2_2
cvtsi2sd %r9d,%xmm3 # xmm3 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm3,%xmm2 # npi2 * piby2_1
subsd %xmm2,%xmm7 # xmm7 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 =piby2_2tail
#t = rhead;
movsd %xmm7,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm3,%xmm10 # xmm10 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm10,%xmm7 # xmm7 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm3,%xmm6 # npi2 * piby2_2tail
subsd %xmm7,%xmm5 # t-rhead
subsd %xmm5,%xmm10 # (rtail-(t-rhead))
addsd %xmm6,%xmm10 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %r9d,region1+4(%rsp) # store upper region
subsd %xmm10,%xmm7 # xmm7 = r=(rhead-rtail)
movlpd %xmm7,r1+8(%rsp) # store upper r
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#Note that volatiles will be trashed by the call
#We do not care since this is the last check
#We will construct r, rr, region and sign
# Work on Lower arg
mov $0x07ff0000000000000,%r11 # is lower arg nan/inf
mov %r11,%r10
and %r8,%r10
cmp %r11,%r10
jz .L__vrs4_cosf_lower_naninf_higher
lea region1(%rsp),%rdx # lower arg is **NOT** nan/inf
lea r1(%rsp),%rsi
# changed input from xmm10 to xmm0
mov r1(%rsp),%rdi #Restore lower fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
jmp 0f
.L__vrs4_cosf_lower_naninf_higher:
# Propagate the nan/inf: r = x with the quiet bit set, region = 0.
mov $0x00008000000000000,%r11
or %r11,%r8
mov %r8,r1(%rsp) # r = x | 0x0008000000000000
mov %r10d,region1(%rsp) # region =0
.align 16
0:
jmp .L__vrs4_cosf_reconstruct
.align 16
.Lboth_arg_gt_5e5_higher:
# Upper Arg is >= 5e5, Lower arg is >= 5e5
# %r9,%r8
# %xmm3,,%xmm1 xmm7 = x, xmm4 = 0.5
# Both halves of the second pair need the full remainder routine; each half
# is checked for nan/inf first so the call is skipped for those inputs.
movhlps %xmm1,%xmm7 #Save upper fp arg for remainder_piby2 call
mov $0x07ff0000000000000,%r11 #is lower arg nan/inf
mov %r11,%r10
and %r8,%r10
cmp %r11,%r10
jz .L__vrs4_cosf_lower_naninf_of_both_gt_5e5_higher
mov %r9,p_temp1(%rsp) #Save upper arg
lea region1(%rsp),%rdx #lower arg is **NOT** nan/inf
lea r1(%rsp),%rsi
# changed input from xmm10 to xmm0
movd %xmm1,%rdi
call __remainder_piby2d2f@PLT
mov p_temp1(%rsp),%r9 #Restore upper arg
jmp 0f
.L__vrs4_cosf_lower_naninf_of_both_gt_5e5_higher: #lower arg is nan/inf
# Propagate the nan/inf: r = x with the quiet bit set, region = 0.
mov $0x00008000000000000,%r11
or %r11,%r8
mov %r8,r1(%rsp) #r = x | 0x0008000000000000
mov %r10d,region1(%rsp) #region = 0
.align 16
0:
mov $0x07ff0000000000000,%r11 #is upper arg nan/inf
mov %r11,%r10
and %r9,%r10
cmp %r11,%r10
jz .L__vrs4_cosf_upper_naninf_of_both_gt_5e5_higher
lea region1+4(%rsp),%rdx #upper arg is **NOT** nan/inf
lea r1+8(%rsp),%rsi
# changed input from xmm10 to xmm0
movd %xmm7,%rdi #Restore upper fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
jmp 0f
.L__vrs4_cosf_upper_naninf_of_both_gt_5e5_higher:
# Propagate the nan/inf: r = x with the quiet bit set, region = 0.
mov $0x00008000000000000,%r11
or %r11,%r9
mov %r9,r1+8(%rsp) #r = x | 0x0008000000000000
mov %r10d,region1+4(%rsp) #region = 0
.align 16
0:
jmp .L__vrs4_cosf_reconstruct
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lfourth_arg_gt_5e5:
#first two args are < 5e5, third arg < 5e5, fourth arg >= 5e5
#%rcx,,%rax r8, r9
#%xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
# Work on first two args, both < 5e5
mulpd .L__real_3fe45f306dc9c883(%rip),%xmm2 # * twobypi
addpd %xmm4,%xmm2 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm10 # piby2_1
cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2
cvtdq2pd %xmm4,%xmm2 # and back to double.
###
# /* Subtract the multiple from x to get an extra-precision remainder */
movlpd %xmm4,region(%rsp) # Region
###
# rhead = x - npi2 * piby2_1;
mulpd %xmm2,%xmm10 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm2,%xmm8 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm10,%xmm6 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm6,%xmm10 # t
# rhead = t - rtail;
subpd %xmm8,%xmm10 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail
subpd %xmm10,%xmm6 # t-rhead
subpd %xmm6,%xmm8 # - ((t - rhead) - rtail)
addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subpd %xmm8,%xmm10 # r = rhead - rtail
movapd %xmm10,r(%rsp)
# Work on next two args, third arg < 5e5, fourth arg >= 5e5
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.Lfirst_second_done_fourth_arg_gt_5e5:
# Upper Arg is >= 5e5, Lower arg is < 5e5
# %r9,%r8
# %xmm3,,%xmm1 xmm7 = x, xmm4 = 0.5
movhpd %xmm1,r1+8(%rsp) #Save upper fp arg for remainder_piby2 call
# Work on Lower arg
# Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg
movapd .L__real_3fe0000000000000(%rip),%xmm4 # Restore 0.5
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm3 # x*twobypi
addsd %xmm4,%xmm3 # xmm3 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm2 # xmm2 = piby2_1
cvttsd2si %xmm3,%r8d # r8d = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm10 # xmm10 = piby2_2
cvtsi2sd %r8d,%xmm3 # xmm3 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm3,%xmm2 # npi2 * piby2_1
subsd %xmm2,%xmm7 # xmm7 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 =piby2_2tail
#t = rhead;
movsd %xmm7,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm3,%xmm10 # xmm10 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm10,%xmm7 # xmm7 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm3,%xmm6 # npi2 * piby2_2tail
subsd %xmm7,%xmm5 # t-rhead
subsd %xmm5,%xmm10 # (rtail-(t-rhead))
addsd %xmm6,%xmm10 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %r8d,region1(%rsp) # store lower region
subsd %xmm10,%xmm7 # xmm7 = r=(rhead-rtail)
movlpd %xmm7,r1(%rsp) # store lower r
#Work on Upper arg
#Note that volatiles will be trashed by the call
#We do not care since this is the last check
#We will construct r, rr, region and sign
mov $0x07ff0000000000000,%r11 # is upper arg nan/inf
mov %r11,%r10
and %r9,%r10
cmp %r11,%r10
jz .L__vrs4_cosf_upper_naninf_higher
lea region1+4(%rsp),%rdx # upper arg is **NOT** nan/inf
lea r1+8(%rsp),%rsi
# changed input from xmm10 to xmm0
mov r1+8(%rsp),%rdi #Restore upper fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
jmp 0f
.L__vrs4_cosf_upper_naninf_higher:
# Propagate the nan/inf: r = x with the quiet bit set, region = 0.
mov $0x00008000000000000,%r11
or %r11,%r9
mov %r9,r1+8(%rsp) # r = x | 0x0008000000000000
mov %r10d,region1+4(%rsp) # region =0
.align 16
0:
jmp .L__vrs4_cosf_reconstruct
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.L__vrs4_cosf_reconstruct:
#Results
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm10 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Reload r/region produced by the slow paths, rebuild the sign bits exactly
# as the fast path does, then dispatch through the same cos/sin jump table.
movapd r(%rsp),%xmm10
movapd r1(%rsp),%xmm1
mov region(%rsp),%r8
mov region1(%rsp),%r9
mov .L__reald_one_zero(%rip),%rdx #compare value for cossin path
mov %r8,%r10
mov %r9,%r11
and .L__reald_one_one(%rip),%r8 #odd/even region for cos/sin
and .L__reald_one_one(%rip),%r9 #odd/even region for cos/sin
mov %r10,%rax
mov %r11,%rcx
shr $1,%r10 #~AB+A~B, A is sign and B is upper bit of region
shr $1,%r11 #~AB+A~B, A is sign and B is upper bit of region
xor %rax,%r10
xor %rcx,%r11
and .L__reald_one_one(%rip),%r10 #(~AB+A~B)&1
and .L__reald_one_one(%rip),%r11 #(~AB+A~B)&1
mov %r10,%r12
mov %r11,%r13
and %rdx,%r12 #mask out the lower sign bit leaving the upper sign bit
and %rdx,%r13 #mask out the lower sign bit leaving the upper sign bit
shl $63,%r10 #shift lower sign bit left by 63 bits
shl $63,%r11 #shift lower sign bit left by 63 bits
shl $31,%r12 #shift upper sign bit left by 31 bits
shl $31,%r13 #shift upper sign bit left by 31 bits
mov %r10,p_sign(%rsp) #write out lower sign bit
mov %r12,p_sign+8(%rsp) #write out upper sign bit
mov %r11,p_sign1(%rsp) #write out lower sign bit
mov %r13,p_sign1+8(%rsp) #write out upper sign bit
mov %r8,%rax
mov %r9,%rcx
movapd %xmm10,%xmm2
movapd %xmm1,%xmm3
mulpd %xmm10,%xmm2 # r2
mulpd %xmm1,%xmm3 # r2
and .L__reald_zero_one(%rip),%rax
and .L__reald_zero_one(%rip),%rcx
shr $31,%r8
shr $31,%r9
or %r8,%rax
or %r9,%rcx
shl $2,%rcx
or %rcx,%rax
leaq .Levencos_oddsin_tbl(%rip),%rcx
jmp *(%rcx,%rax,8) #Jmp table for cos/sin calculation based on even/odd region
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.L__vrs4_cosf_cleanup:
# Common exit: apply the precomputed sign masks to the two double
# result pairs (xmm4/xmm5), round each pair back to single
# precision, pack all four floats into xmm0, restore the
# callee-saved registers and release the stack frame.
movapd p_sign(%rsp),%xmm10
movapd p_sign1(%rsp),%xmm1
xorpd %xmm4,%xmm10 # (+) Sign
xorpd %xmm5,%xmm1 # (+) Sign
cvtpd2ps %xmm10,%xmm0 # lower two results as floats (low half)
cvtpd2ps %xmm1,%xmm11 # upper two results as floats
movlhps %xmm11,%xmm0 # pack all four results into xmm0
mov save_r12(%rsp),%r12 # restore r12
mov save_r13(%rsp),%r13 # restore r13
add $0x01E8,%rsp # release frame allocated at entry
ret
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;JUMP TABLE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm10 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcoscos_coscos_piby4:
# All four lanes need cos(r).  Horner-evaluate the cos polynomial
# on both squared pairs and reconstruct cos as
# 1 - 0.5*x2 + x4*zc  (computed as x4*zc - (0.5*x2 - 1)).
movapd %xmm2,%xmm0 # r
movapd %xmm3,%xmm11 # r
movdqa .Lcosarray+0x30(%rip),%xmm4 # c4
movdqa .Lcosarray+0x30(%rip),%xmm5 # c4
movapd .Lcosarray+0x10(%rip),%xmm8 # c2
movapd .Lcosarray+0x10(%rip),%xmm9 # c2
mulpd .L__real_3fe0000000000000(%rip),%xmm0 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
subpd .L__real_3ff0000000000000(%rip),%xmm0 # -t=r-1.0 ;trash r
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0 ;trash r
mulpd %xmm2,%xmm2 # x4
mulpd %xmm3,%xmm3 # x4
addpd .Lcosarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lcosarray+0x20(%rip),%xmm5 # c3+x2c4
addpd .Lcosarray(%rip),%xmm8 # c1+x2c2
addpd .Lcosarray(%rip),%xmm9 # c1+x2c2
mulpd %xmm2,%xmm4 # x4(c3+x2c4)
mulpd %xmm3,%xmm5 # x4(c3+x2c4)
addpd %xmm8,%xmm4 # zc
addpd %xmm9,%xmm5 # zc
mulpd %xmm2,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # x4 * zc
subpd %xmm0,%xmm4 # + t
subpd %xmm11,%xmm5 # + t
jmp .L__vrs4_cosf_cleanup
.align 16
.Lcossin_cossin_piby4:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm10 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Both pairs take the cossin split: low lane -> sin(r) = x + x3*zs,
# high lane -> cos(r) = t + x4*zc, using the interleaved
# .Lsincosarray coefficients on both pairs at once.
movdqa .Lsincosarray+0x30(%rip),%xmm4 # s4
movdqa .Lsincosarray+0x30(%rip),%xmm5 # s4
movapd .Lsincosarray+0x10(%rip),%xmm8 # s2
movapd .Lsincosarray+0x10(%rip),%xmm9 # s2
movapd %xmm2,%xmm0 # move x2 for x4
movapd %xmm3,%xmm11 # move x2 for x4
mulpd %xmm2,%xmm4 # x2s4
mulpd %xmm3,%xmm5 # x2s4
mulpd %xmm2,%xmm8 # x2s2
mulpd %xmm3,%xmm9 # x2s2
mulpd %xmm2,%xmm0 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lsincosarray+0x20(%rip),%xmm4 # s4+x2s3
addpd .Lsincosarray+0x20(%rip),%xmm5 # s4+x2s3
addpd .Lsincosarray(%rip),%xmm8 # s2+x2s1
addpd .Lsincosarray(%rip),%xmm9 # s2+x2s1
mulpd %xmm0,%xmm4 # x4(s3+x2s4)
mulpd %xmm11,%xmm5 # x4(s3+x2s4)
movhlps %xmm0,%xmm0 # move high x4 for cos term
movhlps %xmm11,%xmm11 # move high x4 for cos term
movsd %xmm2,%xmm6 # move low x2 for x3 for sin term
movsd %xmm3,%xmm7 # move low x2 for x3 for sin term
mulsd %xmm10,%xmm6 # get low x3 for sin term
mulsd %xmm1,%xmm7 # get low x3 for sin term
mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 for sin and cos terms
mulpd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5*x2 for sin and cos terms
addpd %xmm8,%xmm4 # z
addpd %xmm9,%xmm5 # z
movhlps %xmm2,%xmm12 # move high r for cos
movhlps %xmm3,%xmm13 # move high r for cos
movhlps %xmm4,%xmm8 # xmm4 = sin , xmm8 = cos
movhlps %xmm5,%xmm9 # xmm5 = sin , xmm9 = cos
mulsd %xmm6,%xmm4 # sin *x3
mulsd %xmm7,%xmm5 # sin *x3
mulsd %xmm0,%xmm8 # cos *x4
mulsd %xmm11,%xmm9 # cos *x4
subsd .L__real_3ff0000000000000(%rip),%xmm12 #-t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm13 #-t=r-1.0
addsd %xmm10,%xmm4 # sin + x
addsd %xmm1,%xmm5 # sin + x
subsd %xmm12,%xmm8 # cos+t
subsd %xmm13,%xmm9 # cos+t
movlhps %xmm8,%xmm4 # pack lower pair: [sin, cos]
movlhps %xmm9,%xmm5 # pack upper pair: [sin, cos]
jmp .L__vrs4_cosf_cleanup
.align 16
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm10 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.Lsincos_cossin_piby4:
# Lower pair: cossin (low=sin, high=cos, .Lsincosarray coeffs).
# Upper pair: sincos (low=cos, high=sin, .Lcossinarray coeffs).
movapd .Lsincosarray+0x30(%rip),%xmm4 # s4
movapd .Lcossinarray+0x30(%rip),%xmm5 # s4
movdqa .Lsincosarray+0x10(%rip),%xmm8 # s2
movdqa .Lcossinarray+0x10(%rip),%xmm9 # s2
movapd %xmm2,%xmm0 # move x2 for x4
movapd %xmm3,%xmm11 # move x2 for x4
movapd %xmm3,%xmm7 # sincos term upper x2 for x3
mulpd %xmm2,%xmm4 # x2s4
mulpd %xmm3,%xmm5 # x2s4
mulpd %xmm2,%xmm8 # x2s2
mulpd %xmm3,%xmm9 # x2s2
mulpd %xmm2,%xmm0 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lsincosarray+0x20(%rip),%xmm4 # s3+x2s4
addpd .Lcossinarray+0x20(%rip),%xmm5 # s3+x2s4
addpd .Lsincosarray(%rip),%xmm8 # s1+x2s2
addpd .Lcossinarray(%rip),%xmm9 # s1+x2s2
mulpd %xmm0,%xmm4 # x4(s3+x2s4)
mulpd %xmm11,%xmm5 # x4(s3+x2s4)
movhlps %xmm0,%xmm0 # move high x4 for cos term
movsd %xmm2,%xmm6 # move low x2 for x3 for sin term (cossin)
mulpd %xmm1,%xmm7
mulsd %xmm10,%xmm6 # get low x3 for sin term (cossin)
movhlps %xmm7,%xmm7 # get high x3 for sin term (sincos)
mulpd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 for cos term
mulpd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5*x2 for cos term
addpd %xmm8,%xmm4 # z
addpd %xmm9,%xmm5 # z
movhlps %xmm2,%xmm12 # move high r for cos (cossin)
movhlps %xmm4,%xmm8 # xmm8 = cos , xmm4 = sin (cossin)
movhlps %xmm5,%xmm9 # xmm9 = sin , xmm5 = cos (sincos)
mulsd %xmm6,%xmm4 # sin *x3
mulsd %xmm11,%xmm5 # cos *x4
mulsd %xmm0,%xmm8 # cos *x4
mulsd %xmm7,%xmm9 # sin *x3
subsd .L__real_3ff0000000000000(%rip),%xmm12 #-t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm3 # -t=r-1.0
movhlps %xmm1,%xmm11 # move high x for x for sin term (sincos)
addsd %xmm10,%xmm4 # sin + x
addsd %xmm11,%xmm9 # sin + x
subsd %xmm12,%xmm8 # cos+t
subsd %xmm3,%xmm5 # cos+t
movlhps %xmm8,%xmm4 # cossin
movlhps %xmm9,%xmm5 # sincos
jmp .L__vrs4_cosf_cleanup
.align 16
.Lsincos_sincos_piby4:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm10 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Both pairs take the sincos split (low=cos, high=sin) using the
# .Lcossinarray coefficients.
movapd .Lcossinarray+0x30(%rip),%xmm4 # s4
movapd .Lcossinarray+0x30(%rip),%xmm5 # s4
movdqa .Lcossinarray+0x10(%rip),%xmm8 # s2
movdqa .Lcossinarray+0x10(%rip),%xmm9 # s2
movapd %xmm2,%xmm0 # move x2 for x4
movapd %xmm3,%xmm11 # move x2 for x4
movapd %xmm2,%xmm6 # move x2 for x4
movapd %xmm3,%xmm7 # move x2 for x4
mulpd %xmm2,%xmm4 # x2s4
mulpd %xmm3,%xmm5 # x2s4
mulpd %xmm2,%xmm8 # x2s2
mulpd %xmm3,%xmm9 # x2s2
mulpd %xmm2,%xmm0 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lcossinarray+0x20(%rip),%xmm4 # s4+x2s3
addpd .Lcossinarray+0x20(%rip),%xmm5 # s4+x2s3
addpd .Lcossinarray(%rip),%xmm8 # s2+x2s1
addpd .Lcossinarray(%rip),%xmm9 # s2+x2s1
mulpd %xmm0,%xmm4 # x4(s4+x2s3)
mulpd %xmm11,%xmm5 # x4(s4+x2s3)
mulpd %xmm10,%xmm6 # get low x3 for sin term
mulpd %xmm1,%xmm7 # get low x3 for sin term
movhlps %xmm6,%xmm6 # move low x2 for x3 for sin term
movhlps %xmm7,%xmm7 # move low x2 for x3 for sin term
mulsd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 for cos terms
mulsd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5*x2 for cos terms
addpd %xmm8,%xmm4 # z
addpd %xmm9,%xmm5 # z
movhlps %xmm4,%xmm12 # xmm12 = sin , xmm4 = cos
movhlps %xmm5,%xmm13 # xmm13 = sin , xmm5 = cos
mulsd %xmm6,%xmm12 # sin *x3
mulsd %xmm7,%xmm13 # sin *x3
mulsd %xmm0,%xmm4 # cos *x4
mulsd %xmm11,%xmm5 # cos *x4
subsd .L__real_3ff0000000000000(%rip),%xmm2 #-t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm3 #-t=r-1.0
movhlps %xmm10,%xmm0 # move high x for x for sin term
movhlps %xmm1,%xmm11 # move high x for x for sin term
# Reverse 10 and 0
addsd %xmm0,%xmm12 # sin + x
addsd %xmm11,%xmm13 # sin + x
subsd %xmm2,%xmm4 # cos+t
subsd %xmm3,%xmm5 # cos+t
movlhps %xmm12,%xmm4 # pack lower pair: [cos, sin]
movlhps %xmm13,%xmm5 # pack upper pair: [cos, sin]
jmp .L__vrs4_cosf_cleanup
.align 16
.Lcossin_sincos_piby4:
# Lower pair: sincos (low=cos, high=sin, .Lcossinarray coeffs).
# Upper pair: cossin (low=sin, high=cos, .Lsincosarray coeffs).
movapd .Lcossinarray+0x30(%rip),%xmm4 # s4
movapd .Lsincosarray+0x30(%rip),%xmm5 # s4
movdqa .Lcossinarray+0x10(%rip),%xmm8 # s2
movdqa .Lsincosarray+0x10(%rip),%xmm9 # s2
movapd %xmm2,%xmm0 # move x2 for x4
movapd %xmm3,%xmm11 # move x2 for x4
movapd %xmm2,%xmm7 # upper x2 for x3 for sin term (sincos)
mulpd %xmm2,%xmm4 # x2s4
mulpd %xmm3,%xmm5 # x2s4
mulpd %xmm2,%xmm8 # x2s2
mulpd %xmm3,%xmm9 # x2s2
mulpd %xmm2,%xmm0 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lcossinarray+0x20(%rip),%xmm4 # s3+x2s4
addpd .Lsincosarray+0x20(%rip),%xmm5 # s3+x2s4
addpd .Lcossinarray(%rip),%xmm8 # s1+x2s2
addpd .Lsincosarray(%rip),%xmm9 # s1+x2s2
mulpd %xmm0,%xmm4 # x4(s3+x2s4)
mulpd %xmm11,%xmm5 # x4(s3+x2s4)
movhlps %xmm11,%xmm11 # move high x4 for cos term
movsd %xmm3,%xmm6 # move low x2 for x3 for sin term (cossin)
mulpd %xmm10,%xmm7
mulsd %xmm1,%xmm6 # get low x3 for sin term (cossin)
movhlps %xmm7,%xmm7 # get high x3 for sin term (sincos)
mulsd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5*x2 for cos term
mulpd .L__real_3fe0000000000000(%rip),%xmm3 # 0.5*x2 for cos term
addpd %xmm8,%xmm4 # z
addpd %xmm9,%xmm5 # z
movhlps %xmm3,%xmm12 # move high r for cos (cossin)
movhlps %xmm4,%xmm8 # xmm8 = sin , xmm4 = cos (sincos)
movhlps %xmm5,%xmm9 # xmm9 = cos , xmm5 = sin (cossin)
mulsd %xmm0,%xmm4 # cos *x4
mulsd %xmm6,%xmm5 # sin *x3
mulsd %xmm7,%xmm8 # sin *x3
mulsd %xmm11,%xmm9 # cos *x4
subsd .L__real_3ff0000000000000(%rip),%xmm2 # -t=r-1.0
subsd .L__real_3ff0000000000000(%rip),%xmm12 # -t=r-1.0
movhlps %xmm10,%xmm11 # move high x for x for sin term (sincos)
subsd %xmm2,%xmm4 # cos-(-t)
subsd %xmm12,%xmm9 # cos-(-t)
addsd %xmm11,%xmm8 # sin + x
addsd %xmm1,%xmm5 # sin + x
movlhps %xmm8,%xmm4 # cossin
movlhps %xmm9,%xmm5 # sincos
jmp .L__vrs4_cosf_cleanup
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcoscos_sinsin_piby4:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm10 = r, xmm2 = %xmm6,%r2 =rr: SIN
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr: COS
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Lower pair: sin(r) = x + x3*zs.  Upper pair: cos(r) = t + x4*zc.
# xmm1 (upper r) is reused as scratch below; cos does not need x.
movapd %xmm2,%xmm0 # x2 ; SIN
movapd %xmm3,%xmm11 # x2 ; COS
movapd %xmm3,%xmm1 # copy of x2 for x4 (upper r no longer needed)
movdqa .Lsinarray+0x30(%rip),%xmm4 # c4
movdqa .Lcosarray+0x30(%rip),%xmm5 # c4
movapd .Lsinarray+0x10(%rip),%xmm8 # c2
movapd .Lcosarray+0x10(%rip),%xmm9 # c2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
mulpd %xmm2,%xmm0 # x4
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0
mulpd %xmm3,%xmm1 # x4
addpd .Lsinarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lcosarray+0x20(%rip),%xmm5 # c3+x2c4
addpd .Lsinarray(%rip),%xmm8 # c1+x2c2
addpd .Lcosarray(%rip),%xmm9 # c1+x2c2
mulpd %xmm10,%xmm2 # x3
mulpd %xmm0,%xmm4 # x4(c3+x2c4)
mulpd %xmm1,%xmm5 # x4(c3+x2c4)
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zc
mulpd %xmm2,%xmm4 # x3 * zs
mulpd %xmm1,%xmm5 # x4 * zc
addpd %xmm10,%xmm4 # +x
subpd %xmm11,%xmm5 # +t
jmp .L__vrs4_cosf_cleanup
.align 16
.Lsinsin_coscos_piby4:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm10 = r, xmm2 = %xmm6,%r2 =rr: COS
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr: SIN
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Lower pair: cos(r) = t + x4*zc.  Upper pair: sin(r) = x + x3*zs.
# xmm10 (lower r) is reused as scratch below; cos does not need x.
movapd %xmm2,%xmm0 # x2 ; COS
movapd %xmm3,%xmm11 # x2 ; SIN
movapd %xmm2,%xmm10 # copy of x2 for x4 (lower r no longer needed)
movdqa .Lcosarray+0x30(%rip),%xmm4 # c4
movdqa .Lsinarray+0x30(%rip),%xmm5 # s4
movapd .Lcosarray+0x10(%rip),%xmm8 # c2
movapd .Lsinarray+0x10(%rip),%xmm9 # s2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # s4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # s2*x2
mulpd %xmm2,%xmm10 # x4
mulpd %xmm3,%xmm11 # x4
addpd .Lcosarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lsinarray+0x20(%rip),%xmm5 # s3+x2s4
addpd .Lcosarray(%rip),%xmm8 # c1+x2c2
addpd .Lsinarray(%rip),%xmm9 # s1+x2s2
mulpd %xmm1,%xmm3 # x3
mulpd .L__real_3fe0000000000000(%rip),%xmm0 # r = 0.5 *x2
mulpd %xmm10,%xmm4 # x4(c3+x2c4)
mulpd %xmm11,%xmm5 # x4(s3+x2s4)
subpd .L__real_3ff0000000000000(%rip),%xmm0 # -t=r-1.0
addpd %xmm8,%xmm4 # zc
addpd %xmm9,%xmm5 # zs
mulpd %xmm10,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # x3 * zs
subpd %xmm0,%xmm4 # +t
addpd %xmm1,%xmm5 # +x
jmp .L__vrs4_cosf_cleanup
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcoscos_cossin_piby4: #Derive from cossin_coscos
# Lower pair: cossin (low=sin, high=cos).  Upper pair: coscos.
# The mixed lower pair blends an x3 (sin) low lane with an x4 (cos)
# high lane before the final multiply by the polynomial value.
movhlps %xmm2,%xmm0 # x2 for 0.5x2 for upper cos
movsd %xmm2,%xmm6 # lower x2 for x3 for lower sin
movapd %xmm3,%xmm11 # x2 for 0.5x2
movapd %xmm2,%xmm12 # x2 for x4
movapd %xmm3,%xmm13 # x2 for x4
movsd .L__real_3ff0000000000000(%rip),%xmm7
movdqa .Lsincosarray+0x30(%rip),%xmm4 # c4
movdqa .Lcosarray+0x30(%rip),%xmm5 # c4
movapd .Lsincosarray+0x10(%rip),%xmm8 # c2
movapd .Lcosarray+0x10(%rip),%xmm9 # c2
mulsd .L__real_3fe0000000000000(%rip),%xmm0 # 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
subsd %xmm0,%xmm7 # t=1.0-r for cos
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
addpd .Lsincosarray+0x20(%rip),%xmm4 # c4+x2c3
addpd .Lcosarray+0x20(%rip),%xmm5 # c4+x2c3
addpd .Lsincosarray(%rip),%xmm8 # c2+x2c1
addpd .Lcosarray(%rip),%xmm9 # c2+x2c1
movapd %xmm12,%xmm2 # upper=x4
movsd %xmm6,%xmm2 # lower=x2
mulsd %xmm10,%xmm2 # lower=x2*x
mulpd %xmm12,%xmm4 # x4(c3+x2c4)
mulpd %xmm13,%xmm5 # x4(c3+x2c4)
addpd %xmm8,%xmm4 # zczs
addpd %xmm9,%xmm5 # zc
mulpd %xmm2,%xmm4 # upper= x4 * zc
# lower=x3 * zs
mulpd %xmm13,%xmm5 # x4 * zc
movlhps %xmm7,%xmm10 # xmm10 = [x, t]
addpd %xmm10,%xmm4 # +x for lower sin, +t for upper cos
subpd %xmm11,%xmm5 # -(-t)
jmp .L__vrs4_cosf_cleanup
.align 16
.Lcoscos_sincos_piby4: #Derive from cossin_coscos
# Lower pair: sincos (low=cos, high=sin).  Upper pair: coscos.
# The packed mulpd then scalar mulsd on xmm2 builds x3 in the high
# lane (sin) and x4 in the low lane (cos) of the same register.
movsd %xmm2,%xmm0 # x2 for 0.5x2 for lower cos
movapd %xmm3,%xmm11 # x2 for 0.5x2
movapd %xmm2,%xmm12 # x2 for x4
movapd %xmm3,%xmm13 # x2 for x4
movsd .L__real_3ff0000000000000(%rip),%xmm7
movdqa .Lcossinarray+0x30(%rip),%xmm4 # cs4
movdqa .Lcosarray+0x30(%rip),%xmm5 # c4
movapd .Lcossinarray+0x10(%rip),%xmm8 # cs2
movapd .Lcosarray+0x10(%rip),%xmm9 # c2
mulsd .L__real_3fe0000000000000(%rip),%xmm0 # 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
subsd %xmm0,%xmm7 # t=1.0-r for cos
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
addpd .Lcossinarray+0x20(%rip),%xmm4 # c4+x2c3
addpd .Lcosarray+0x20(%rip),%xmm5 # c4+x2c3
addpd .Lcossinarray(%rip),%xmm8 # c2+x2c1
addpd .Lcosarray(%rip),%xmm9 # c2+x2c1
mulpd %xmm10,%xmm2 # upper=x3 for sin
mulsd %xmm10,%xmm2 # lower=x4 for cos
mulpd %xmm12,%xmm4 # x4(c3+x2c4)
mulpd %xmm13,%xmm5 # x4(c3+x2c4)
addpd %xmm8,%xmm4 # zczs
addpd %xmm9,%xmm5 # zc
mulpd %xmm2,%xmm4 # lower= x4 * zc
# upper= x3 * zs
mulpd %xmm13,%xmm5 # x4 * zc
movsd %xmm7,%xmm10 # xmm10 = [t, x]
addpd %xmm10,%xmm4 # +x for upper sin, +t for lower cos
subpd %xmm11,%xmm5 # -(-t)
jmp .L__vrs4_cosf_cleanup
.align 16
.Lcossin_coscos_piby4:
# Lower pair: coscos.  Upper pair: cossin (low=sin, high=cos).
movhlps %xmm3,%xmm0 # x2 for 0.5x2 for upper cos
movapd %xmm2,%xmm11 # x2 for 0.5x2
movapd %xmm2,%xmm12 # x2 for x4
movapd %xmm3,%xmm13 # x2 for x4
movsd %xmm3,%xmm6 # lower x2 for x3 for sin
movsd .L__real_3ff0000000000000(%rip),%xmm7
movdqa .Lcosarray+0x30(%rip),%xmm4 # cs4
movdqa .Lsincosarray+0x30(%rip),%xmm5 # c4
movapd .Lcosarray+0x10(%rip),%xmm8 # cs2
movapd .Lsincosarray+0x10(%rip),%xmm9 # c2
mulsd .L__real_3fe0000000000000(%rip),%xmm0 # 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
subsd %xmm0,%xmm7 # t=1.0-r for cos
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
addpd .Lcosarray+0x20(%rip),%xmm4 # c4+x2c3
addpd .Lsincosarray+0x20(%rip),%xmm5 # c4+x2c3
addpd .Lcosarray(%rip),%xmm8 # c2+x2c1
addpd .Lsincosarray(%rip),%xmm9 # c2+x2c1
movapd %xmm13,%xmm3 # upper=x4
movsd %xmm6,%xmm3 # lower x2
mulsd %xmm1,%xmm3 # lower x2*x
mulpd %xmm12,%xmm4 # x4(c3+x2c4)
mulpd %xmm13,%xmm5 # x4(c3+x2c4)
addpd %xmm8,%xmm4 # zczs
addpd %xmm9,%xmm5 # zc
mulpd %xmm12,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # upper= x4 * zc
# lower=x3 * zs
movlhps %xmm7,%xmm1 # xmm1 = [x, t]
addpd %xmm1,%xmm5 # +x for lower sin, +t for upper cos
subpd %xmm11,%xmm4 # -(-t)
jmp .L__vrs4_cosf_cleanup
.align 16
.Lcossin_sinsin_piby4: # Derived from sincos_coscos
# Lower pair: sinsin.  Upper pair: cossin (low=sin, high=cos).
movhlps %xmm3,%xmm0 # x2
movapd %xmm3,%xmm7
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
movsd .L__real_3ff0000000000000(%rip),%xmm11
movdqa .Lsinarray+0x30(%rip),%xmm4 # c4
movdqa .Lsincosarray+0x30(%rip),%xmm5 # c4
movapd .Lsinarray+0x10(%rip),%xmm8 # c2
movapd .Lsincosarray+0x10(%rip),%xmm9 # c2
mulsd .L__real_3fe0000000000000(%rip),%xmm0 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
mulpd %xmm2,%xmm12 # x4
subsd %xmm0,%xmm11 # t=1.0-r for cos
mulpd %xmm3,%xmm13 # x4
addpd .Lsinarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lsincosarray+0x20(%rip),%xmm5 # c3+x2c4
addpd .Lsinarray(%rip),%xmm8 # c1+x2c2
addpd .Lsincosarray(%rip),%xmm9 # c1+x2c2
mulpd %xmm10,%xmm2 # x3
movapd %xmm13,%xmm3 # upper x4 for cos
movsd %xmm7,%xmm3 # lower x2 for sin
mulsd %xmm1,%xmm3 # lower x3=x2*x for sin
mulpd %xmm12,%xmm4 # x4(c3+x2c4)
mulpd %xmm13,%xmm5 # x4(c3+x2c4)
movlhps %xmm11,%xmm1 # t for upper cos and x for lower sin
addpd %xmm8,%xmm4 # zczs
addpd %xmm9,%xmm5 # zs
mulpd %xmm2,%xmm4 # x3 * zs
mulpd %xmm3,%xmm5 # upper=x4 * zc
# lower=x3 * zs
addpd %xmm10,%xmm4 # +x
addpd %xmm1,%xmm5 # +t upper, +x lower
jmp .L__vrs4_cosf_cleanup
.align 16
.Lsincos_coscos_piby4:
# Lower pair: coscos.  Upper pair: sincos (low=cos, high=sin).
movsd %xmm3,%xmm0 # x2 for 0.5x2 for lower cos
movapd %xmm2,%xmm11 # x2 for 0.5x2
movapd %xmm2,%xmm12 # x2 for x4
movapd %xmm3,%xmm13 # x2 for x4
movsd .L__real_3ff0000000000000(%rip),%xmm7
movdqa .Lcosarray+0x30(%rip),%xmm4 # cs4
movdqa .Lcossinarray+0x30(%rip),%xmm5 # c4
movapd .Lcosarray+0x10(%rip),%xmm8 # cs2
movapd .Lcossinarray+0x10(%rip),%xmm9 # c2
mulsd .L__real_3fe0000000000000(%rip),%xmm0 # 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
subsd %xmm0,%xmm7 # t=1.0-r for cos
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0
mulpd %xmm2,%xmm12 # x4
mulpd %xmm3,%xmm13 # x4
addpd .Lcosarray+0x20(%rip),%xmm4 # c4+x2c3
addpd .Lcossinarray+0x20(%rip),%xmm5 # c4+x2c3
addpd .Lcosarray(%rip),%xmm8 # c2+x2c1
addpd .Lcossinarray(%rip),%xmm9 # c2+x2c1
mulpd %xmm1,%xmm3 # upper=x3 for sin
mulsd %xmm1,%xmm3 # lower=x4 for cos
mulpd %xmm12,%xmm4 # x4(c3+x2c4)
mulpd %xmm13,%xmm5 # x4(c3+x2c4)
addpd %xmm8,%xmm4 # zczs
addpd %xmm9,%xmm5 # zc
mulpd %xmm12,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # lower= x4 * zc
# upper= x3 * zs
movsd %xmm7,%xmm1 # xmm1 = [t, x]
subpd %xmm11,%xmm4 # -(-t)
addpd %xmm1,%xmm5 # +x for upper sin, +t for lower cos
jmp .L__vrs4_cosf_cleanup
.align 16
.Lsincos_sinsin_piby4: # Derived from sincos_coscos
# Lower pair: sinsin.  Upper pair: sincos (low=cos, high=sin).
movsd %xmm3,%xmm0 # x2
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
movsd .L__real_3ff0000000000000(%rip),%xmm11
movdqa .Lsinarray+0x30(%rip),%xmm4 # c4
movdqa .Lcossinarray+0x30(%rip),%xmm5 # c4
movapd .Lsinarray+0x10(%rip),%xmm8 # c2
movapd .Lcossinarray+0x10(%rip),%xmm9 # c2
mulsd .L__real_3fe0000000000000(%rip),%xmm0 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
mulpd %xmm2,%xmm12 # x4
subsd %xmm0,%xmm11 # t=1.0-r for cos
mulpd %xmm3,%xmm13 # x4
addpd .Lsinarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lcossinarray+0x20(%rip),%xmm5 # c3+x2c4
addpd .Lsinarray(%rip),%xmm8 # c1+x2c2
addpd .Lcossinarray(%rip),%xmm9 # c1+x2c2
mulpd %xmm10,%xmm2 # x3
mulpd %xmm1,%xmm3 # upper x3 for sin
mulsd %xmm1,%xmm3 # lower x4 for cos
movhlps %xmm1,%xmm6 # upper x for sin
mulpd %xmm12,%xmm4 # x4(c3+x2c4)
mulpd %xmm13,%xmm5 # x4(c3+x2c4)
movlhps %xmm6,%xmm11 # upper =t ; lower =x
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zszc
mulpd %xmm2,%xmm4 # x3 * zs
mulpd %xmm3,%xmm5 # lower=x4 * zc
# upper=x3 * zs
addpd %xmm10,%xmm4 # +x
addpd %xmm11,%xmm5 # +t lower, +x upper
jmp .L__vrs4_cosf_cleanup
.align 16
.Lsinsin_cossin_piby4: # Derived from sincos_coscos
# Lower pair: cossin (low=sin, high=cos).  Upper pair: sinsin.
movhlps %xmm2,%xmm0 # x2
movapd %xmm2,%xmm7
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
movsd .L__real_3ff0000000000000(%rip),%xmm11
movdqa .Lsincosarray+0x30(%rip),%xmm4 # c4
movdqa .Lsinarray+0x30(%rip),%xmm5 # c4
movapd .Lsincosarray+0x10(%rip),%xmm8 # c2
movapd .Lsinarray+0x10(%rip),%xmm9 # c2
mulsd .L__real_3fe0000000000000(%rip),%xmm0 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
mulpd %xmm2,%xmm12 # x4
subsd %xmm0,%xmm11 # t=1.0-r for cos
mulpd %xmm3,%xmm13 # x4
addpd .Lsincosarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lsinarray+0x20(%rip),%xmm5 # c3+x2c4
addpd .Lsincosarray(%rip),%xmm8 # c1+x2c2
addpd .Lsinarray(%rip),%xmm9 # c1+x2c2
mulpd %xmm1,%xmm3 # x3
movapd %xmm12,%xmm2 # upper x4 for cos
movsd %xmm7,%xmm2 # lower x2 for sin
mulsd %xmm10,%xmm2 # lower x3=x2*x for sin
mulpd %xmm12,%xmm4 # x4(c3+x2c4)
mulpd %xmm13,%xmm5 # x4(c3+x2c4)
movlhps %xmm11,%xmm10 # t for upper cos and x for lower sin
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zszc
mulpd %xmm3,%xmm5 # x3 * zs
mulpd %xmm2,%xmm4 # upper=x4 * zc
# lower=x3 * zs
addpd %xmm1,%xmm5 # +x
addpd %xmm10,%xmm4 # +t upper, +x lower
jmp .L__vrs4_cosf_cleanup
.align 16
.Lsinsin_sincos_piby4: # Derived from sincos_coscos
# Lower pair: sincos (low=cos, high=sin).  Upper pair: sinsin.
movsd %xmm2,%xmm0 # x2
movapd %xmm2,%xmm12 # copy of x2 for x4
movapd %xmm3,%xmm13 # copy of x2 for x4
movsd .L__real_3ff0000000000000(%rip),%xmm11
movdqa .Lcossinarray+0x30(%rip),%xmm4 # c4
movdqa .Lsinarray+0x30(%rip),%xmm5 # c4
movapd .Lcossinarray+0x10(%rip),%xmm8 # c2
movapd .Lsinarray+0x10(%rip),%xmm9 # c2
mulsd .L__real_3fe0000000000000(%rip),%xmm0 # r = 0.5 *x2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
mulpd %xmm2,%xmm12 # x4
subsd %xmm0,%xmm11 # t=1.0-r for cos
mulpd %xmm3,%xmm13 # x4
addpd .Lcossinarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lsinarray+0x20(%rip),%xmm5 # c3+x2c4
addpd .Lcossinarray(%rip),%xmm8 # c1+x2c2
addpd .Lsinarray(%rip),%xmm9 # c1+x2c2
mulpd %xmm1,%xmm3 # x3
mulpd %xmm10,%xmm2 # upper x3 for sin
mulsd %xmm10,%xmm2 # lower x4 for cos
movhlps %xmm10,%xmm6 # upper x for sin
mulpd %xmm12,%xmm4 # x4(c3+x2c4)
mulpd %xmm13,%xmm5 # x4(c3+x2c4)
movlhps %xmm6,%xmm11 # upper =x ; lower =t
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zszc
mulpd %xmm3,%xmm5 # x3 * zs
mulpd %xmm2,%xmm4 # lower=x4 * zc
# upper=x3 * zs
addpd %xmm1,%xmm5 # +x
addpd %xmm11,%xmm4 # +t lower, +x upper
jmp .L__vrs4_cosf_cleanup
.align 16
.Lsinsin_sinsin_piby4:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign0 = Sign, xmm10 = r, xmm2 = %xmm6,%r2 =rr
# p_sign1 = Sign, xmm1 = r, xmm3 = %xmm7,%r2 =rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# All four lanes need sin(r), evaluated with Horner's scheme:
#x2 = x * x;
#(x + x * x2 * (c1 + x2 * (c2 + x2 * (c3 + x2 * c4))));
#x + x3 * ((c1 + x2 *c2) + x4 * (c3 + x2 * c4));
movapd %xmm2,%xmm0 # x2
movapd %xmm3,%xmm11 # x2
movdqa .Lsinarray+0x30(%rip),%xmm4 # c4
movdqa .Lsinarray+0x30(%rip),%xmm5 # c4
mulpd %xmm2,%xmm0 # x4
mulpd %xmm3,%xmm11 # x4
movapd .Lsinarray+0x10(%rip),%xmm8 # c2
movapd .Lsinarray+0x10(%rip),%xmm9 # c2
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
addpd .Lsinarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lsinarray+0x20(%rip),%xmm5 # c3+x2c4
mulpd %xmm10,%xmm2 # x3
mulpd %xmm1,%xmm3 # x3
addpd .Lsinarray(%rip),%xmm8 # c1+x2c2
addpd .Lsinarray(%rip),%xmm9 # c1+x2c2
mulpd %xmm0,%xmm4 # x4(c3+x2c4)
mulpd %xmm11,%xmm5 # x4(c3+x2c4)
addpd %xmm8,%xmm4 # zs
addpd %xmm9,%xmm5 # zs
mulpd %xmm2,%xmm4 # x3 * zs
mulpd %xmm3,%xmm5 # x3 * zs
addpd %xmm10,%xmm4 # +x
addpd %xmm1,%xmm5 # +x
jmp .L__vrs4_cosf_cleanup