# blob: 2bb70bfdfda37bebb11bf9ae50cce3147c6c5b9f [file] [log] [blame]
#
# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
#
# This file is part of libacml_mv.
#
# libacml_mv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# libacml_mv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with libacml_mv. If not, see
# <http://www.gnu.org/licenses/>.
#
#
#
# vrsasincosf.s
#
# A vector implementation of the sincos libm function.
#
# Prototype:
#
# __vrsa_sincosf(int n, float* x, float* ys, float* yc);
#
# Computes Sine and Cosine of x for an array of input values.
# Places the Sine results into the supplied ys array and the Cosine results into the supplied yc array.
# Does not perform error checking.
# Denormal inputs may produce unexpected results.
# This routine computes 4 single precision Sine Cosine values per loop iteration.
# Each group of four inputs is loaded from the x array into xmm0 as packed singles.
# The four Sine results are returned as packed singles in the supplied ys array.
# The four Cosine results are returned as packed singles in the supplied yc array.
# NOTE(review): the remark below reads as a description of the register-based
# four-element variant rather than of this array routine - confirm and trim.
# Note that this represents a non-standard ABI usage, as no ABI
# ( and indeed C) currently allows returning 2 values for a function.
# It is expected that some compilers may be able to take advantage of this
# interface when implementing vectorized loops. Using the array implementation
# of the routine requires putting the inputs into memory, and retrieving
# the results from memory. This routine eliminates the need for this
# overhead if the data does not already reside in memory.
# Author: Harsha Jagasia
# Email: harsha.jagasia@amd.com
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif
.data
.align 64
# Constant pool. Every label names a 16-byte pair (two quadwords) so that it
# can be used directly as the memory operand of a packed SSE2 instruction.
# Labels of the form __real_<hex> carry the IEEE-754 double bit pattern in
# the name; the __reald_* entries are pairs of 32-bit integers and the
# __realq_* entries are pairs of 64-bit integers.
.L__real_7fffffffffffffff: .quad 0x07fffffffffffffff # all bits except the sign bit; andpd with this yields |x|
.quad 0x07fffffffffffffff
.L__real_3ff0000000000000: .quad 0x03ff0000000000000 # 1.0
.quad 0x03ff0000000000000
.L__real_v2p__27: .quad 0x03e40000000000000 # 2^-27
.quad 0x03e40000000000000
.L__real_3fe0000000000000: .quad 0x03fe0000000000000 # 0.5
.quad 0x03fe0000000000000
.L__real_3fc5555555555555: .quad 0x03fc5555555555555 # 0.166666666666 (1/6)
.quad 0x03fc5555555555555
.L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883 # twobypi (2/pi)
.quad 0x03fe45f306dc9c883
.L__real_3ff921fb54400000: .quad 0x03ff921fb54400000 # piby2_1 (leading part of pi/2)
.quad 0x03ff921fb54400000
.L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331 # piby2_1tail
.quad 0x03dd0b4611a626331
.L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000 # piby2_2 (second part of pi/2)
.quad 0x03dd0b4611a600000
.L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073 # piby2_2tail
.quad 0x03ba3198a2e037073
.L__real_fffffffff8000000: .quad 0x0fffffffff8000000 # mask for stripping head and tail (clears the low 27 bits)
.quad 0x0fffffffff8000000
.L__real_8000000000000000: .quad 0x08000000000000000 # -0 or signbit
.quad 0x08000000000000000
.L__reald_one_one: .quad 0x00000000100000001 # 1 in each 32-bit half; ANDed with a GPR to keep bit 0 of two packed regions
.quad 0
.L__reald_two_two: .quad 0x00000000200000002 # 2 in each 32-bit half
.quad 0
.L__reald_one_zero: .quad 0x00000000100000000 # sin_cos_filter: 1 in the upper 32-bit half only
.quad 0
.L__reald_zero_one: .quad 0x00000000000000001 # 1 in the lower 32-bit half only
.quad 0
.L__reald_two_zero: .quad 0x00000000200000000 # 2 in the upper 32-bit half only
.quad 0
.L__realq_one_one: .quad 0x00000000000000001 # integer 1 in the low 64-bit lane
.quad 0x00000000000000001 # integer 1 in the high 64-bit lane
.L__realq_two_two: .quad 0x00000000000000002 # integer 2 in the low 64-bit lane
.quad 0x00000000000000002 # integer 2 in the high 64-bit lane
.L__real_1_x_mask: .quad 0x0ffffffffffffffff # low lane: all-ones mask
.quad 0x03ff0000000000000 # high lane: bit pattern of 1.0
.L__real_zero: .quad 0x00000000000000000 # +0.0 (all bits clear) in the low lane
.quad 0x00000000000000000 # +0.0 in the high lane
.L__real_one: .quad 0x00000000000000001 # NOTE: integer 1, not the double 1.0, despite the name
.quad 0x00000000000000001 # integer 1 in the high lane as well
# Polynomial coefficient tables, 16 bytes per coefficient.
# .Lcosarray and .Lsinarray hold the same value in both lanes, for evaluating
# two elements at once. The fast path combines them as
#   zc = c1 + x2*c2 + x4*(c3 + x2*c4)   and   zs = s1 + x2*s2 + x4*(s3 + x2*s4).
# .Lsincosarray holds a sin coefficient in the low lane and a cos coefficient
# in the high lane; .Lcossinarray holds them the other way round.
.Lcosarray:
.quad 0x03FA5555555502F31 # 0.0416667 c1
.quad 0x03FA5555555502F31
.quad 0x0BF56C16BF55699D7 # -0.00138889 c2
.quad 0x0BF56C16BF55699D7
.quad 0x03EFA015C50A93B49 # 2.48016e-005 c3
.quad 0x03EFA015C50A93B49
.quad 0x0BE92524743CC46B8 # -2.75573e-007 c4
.quad 0x0BE92524743CC46B8
.Lsinarray:
.quad 0x0BFC555555545E87D # -0.166667 s1
.quad 0x0BFC555555545E87D
.quad 0x03F811110DF01232D # 0.00833333 s2
.quad 0x03F811110DF01232D
.quad 0x0BF2A013A88A37196 # -0.000198413 s3
.quad 0x0BF2A013A88A37196
.quad 0x03EC6DBE4AD1572D5 # 2.75573e-006 s4
.quad 0x03EC6DBE4AD1572D5
.Lsincosarray:
.quad 0x0BFC555555545E87D # -0.166667 s1 (low lane)
.quad 0x03FA5555555502F31 # 0.0416667 c1 (high lane)
.quad 0x03F811110DF01232D # 0.00833333 s2
.quad 0x0BF56C16BF55699D7 # -0.00138889 c2
.quad 0x0BF2A013A88A37196 # -0.000198413 s3
.quad 0x03EFA015C50A93B49 # 2.48016e-005 c3
.quad 0x03EC6DBE4AD1572D5 # 2.75573e-006 s4
.quad 0x0BE92524743CC46B8 # -2.75573e-007 c4
.Lcossinarray:
.quad 0x03FA5555555502F31 # 0.0416667 c1 (low lane)
.quad 0x0BFC555555545E87D # -0.166667 s1 (high lane)
.quad 0x0BF56C16BF55699D7 # -0.00138889 c2
.quad 0x03F811110DF01232D # 0.00833333 s2
.quad 0x03EFA015C50A93B49 # 2.48016e-005 c3
.quad 0x0BF2A013A88A37196 # -0.000198413 s3
.quad 0x0BE92524743CC46B8 # -2.75573e-007 c4
.quad 0x03EC6DBE4AD1572D5 # 2.75573e-006 s4
.align 8
# Jump table with 16 eight-byte entries, reached through
#   jmp *(%rcx,%rax,8)
# The 4-bit index in rax is assembled from the quadrant numbers (npi2) of the
# four elements: bit k is set when the region of element k is odd.
# The handler names appear to list elements 3,2,1,0 from left to right, with
# "cos" marking an element whose region is odd (presumably the element whose
# sine result is taken from the cosine polynomial) - confirm against the handlers.
.Levensin_oddcos_tbl:
.quad .Lsinsin_sinsin_piby4 # 0 * ; Done
.quad .Lsinsin_sincos_piby4 # 1 + ; Done
.quad .Lsinsin_cossin_piby4 # 2 ; Done
.quad .Lsinsin_coscos_piby4 # 3 + ; Done
.quad .Lsincos_sinsin_piby4 # 4 ; Done
.quad .Lsincos_sincos_piby4 # 5 * ; Done
.quad .Lsincos_cossin_piby4 # 6 ; Done
.quad .Lsincos_coscos_piby4 # 7 ; Done
.quad .Lcossin_sinsin_piby4 # 8 ; Done
.quad .Lcossin_sincos_piby4 # 9 ; TBD
.quad .Lcossin_cossin_piby4 # 10 * ; Done
.quad .Lcossin_coscos_piby4 # 11 ; Done
.quad .Lcoscos_sinsin_piby4 # 12 ; Done
.quad .Lcoscos_sincos_piby4 # 13 + ; Done
.quad .Lcoscos_cossin_piby4 # 14 ; Done
.quad .Lcoscos_coscos_piby4 # 15 * ; Done
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Weak Fortran-style aliases, both bound to the by-reference entry point.
.weak vrsa_sincosf_, vrsa_sincosf__
vrsa_sincosf_ = __vrsa_sincosf__
vrsa_sincosf__ = __vrsa_sincosf__
.text
.p2align 4
# FORTRAN subroutine implementation of array sincos
# VRSA_SINCOSF(N,X,Y,Z)
# C equivalent:
# void vrsa_sincosf__(int * n, float *x, float *y, float *z)
# {
# vrsa_sincosf(*n,x,y,z);
# }
# The element count arrives by reference in rdi. It is replaced by its value
# and execution then falls through the alignment padding into vrsa_sincosf,
# with rsi, rdx and rcx left untouched.
.globl __vrsa_sincosf__
.type __vrsa_sincosf__,@function
__vrsa_sincosf__:
movl (%rdi), %edi # edi = *n (the 32-bit write clears the upper half of rdi)
.p2align 4
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Stack frame layout: byte offsets from rsp once the prologue has allocated the frame.
# Slots marked "no reference visible" are not used in the part of the file reviewed here.
.equ p_temp,0 # scratch: xmm-to-GPR bit moves, register spills around calls
.equ p_temp1,0x10 # scratch: xmm-to-GPR bit moves, 16-byte xmm spill around calls
.equ save_xmm6,0x20 # slot named for xmm6 (no reference visible)
.equ save_xmm7,0x30 # slot named for xmm7 (no reference visible)
.equ save_xmm8,0x40 # slot named for xmm8 (no reference visible)
.equ save_xmm9,0x50 # slot named for xmm9 (no reference visible)
.equ save_xmm0,0x60 # slot named for xmm0, placed between the xmm9 and xmm11 slots (no reference visible)
.equ save_xmm11,0x70 # slot named for xmm11 (no reference visible)
.equ save_xmm12,0x80 # slot named for xmm12 (no reference visible)
.equ save_xmm13,0x90 # slot named for xmm13 (no reference visible)
.equ save_xmm14,0x0A0 # slot named for xmm14 (no reference visible)
.equ save_xmm15,0x0B0 # slot named for xmm15 (no reference visible)
.equ r,0x0C0 # reduced arguments r for elements 0 (r) and 1 (r+8); also passed by address to __remainder_piby2d2f
.equ rr,0x0D0 # tail rr of the reduced arguments for elements 0 and 1 (only rr+8 is written in the visible code)
.equ region,0x0E0 # 32-bit region numbers for elements 0 (region) and 1 (region+4)
.equ r1,0x0F0 # reduced arguments r for elements 2 (r1) and 3 (r1+8)
.equ rr1,0x0100 # tail rr for elements 2 and 3 (no reference visible)
.equ region1,0x0110 # 32-bit region numbers for elements 2 (region1) and 3 (region1+4)
.equ p_temp2,0x0120 # scratch: GPR spill around calls
.equ p_temp3,0x0130 # scratch: 16-byte xmm spill around calls
.equ p_temp4,0x0140 # scratch: GPR spill around calls
.equ p_temp5,0x0150 # scratch: 16-byte xmm spill around calls
.equ p_original,0x0160 # original x (no reference visible)
.equ p_mask,0x0170 # mask slot (no reference visible)
.equ p_sign_sin,0x0180 # sign-bit masks applied to the sine of elements 0 (low qword) and 1 (high qword)
.equ p_original1,0x0190 # original x, second pair (no reference visible)
.equ p_mask1,0x01A0 # mask slot, second pair (no reference visible)
.equ p_sign1_sin,0x01B0 # sign-bit masks applied to the sine of elements 2 and 3
.equ save_r12,0x01C0 # caller value of r12, stored by the prologue
.equ save_r13,0x01D0 # caller value of r13, stored by the prologue
.equ p_sin,0x01E0 # sin (written from rdi at the top of each loop iteration)
.equ p_cos,0x01F0 # cos (written from rsi at the top of each loop iteration)
.equ save_rdi,0x0200 # slot named for rdi (no reference visible)
.equ save_rsi,0x0210 # slot named for rsi (no reference visible)
.equ p_sign_cos,0x0220 # Sign of lower cos term (elements 0 and 1)
.equ p_sign1_cos,0x0230 # Sign of upper cos term (elements 2 and 3)
.equ save_xa,0x0240 #qword ; current x array pointer
.equ save_ysa,0x0250 #qword ; ysin array pointer
.equ save_yca,0x0260 #qword ; ycos array pointer
.equ save_nv,0x0270 #qword ; number of values, later the number of left over values
.equ p_iter,0x0280 #qword storage for number of loop iterations
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.globl vrsa_sincosf
.type vrsa_sincosf,@function
vrsa_sincosf:
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# vrsa_sincosf(int n, float *x, float *ys, float *yc)
# Arguments as consumed by the code below:
# rdi - int n (number of elements)
# rsi - float *x (input array)
# rdx - float *ys (sine result array)
# rcx - float *yc (cosine result array)
# The frame size 0x298 is 8 modulo 16, so with the SysV entry alignment
# (rsp is 8 modulo 16 at entry) rsp is 16-byte aligned for the movapd spills
# and the calls made further down.
sub $0x0298,%rsp
mov %r12,save_r12(%rsp) # save r12
mov %r13,save_r13(%rsp) # save r13
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#START PROCESS INPUT
# save the arguments
mov %rsi,save_xa(%rsp) # save x_array pointer
mov %rdx,save_ysa(%rsp) # save ysin_array pointer
mov %rcx,save_yca(%rsp) # save ycos_array pointer
#ifdef INTEGER64
mov %rdi,%rax
#else
mov %edi,%eax # zero-extends the 32-bit count, so n is treated as unsigned
mov %rax,%rdi
#endif
mov %rdi,save_nv(%rsp) # save number of values
# see if too few values to call the main loop
shr $2,%rax # get number of iterations
jz .L__vrsa_cleanup # jump if only single calls
# prepare the iteration counts
mov %rax,p_iter(%rsp) # save number of iterations
shl $2,%rax # rax = number of elements handled by the main loop
sub %rax,%rdi # compute number of extra single calls
mov %rdi,save_nv(%rsp) # save number of left over values
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#START LOOP
.align 16
.L__vrsa_top:
# Top of the main loop. Loads four floats from the x array, widens them to
# doubles, and prepares for the range dispatch that follows:
#   xmm10 / xmm1          = |x| for elements 0,1 / 2,3
#   xmm2,xmm6 / xmm3,xmm7 = copies of |x| for the reduction
#   rax, rcx, r8, r9      = bit patterns of |x0|, |x1|, |x2|, |x3|
#   r12 / r13             = sign masks of elements 0,1 / 2,3, one per 32-bit half
#   rdx = piby4 bits, r10 = 5e5 bits, xmm4 = 0.5
# build the input _m128d
# movapd .L__real_7fffffffffffffff,%xmm2 #
# mov .L__real_7fffffffffffffff,%rdx #
mov save_xa(%rsp),%rsi # get x_array pointer
movlps (%rsi),%xmm0
movhps 8(%rsi),%xmm0
prefetch 32(%rsi)
add $16,%rsi
mov %rsi,save_xa(%rsp) # save x_array pointer
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#STARTMAIN
movhlps %xmm0,%xmm8 # xmm8 low half = floats 2 and 3
cvtps2pd %xmm0,%xmm10 # convert input to double.
cvtps2pd %xmm8,%xmm1 # convert input to double.
movdqa %xmm10,%xmm6 # keep the signed values for the sign test below
movdqa %xmm1,%xmm7
movapd .L__real_7fffffffffffffff(%rip),%xmm2
andpd %xmm2,%xmm10 #Unsign
andpd %xmm2,%xmm1 #Unsign
# NOTE(review): at this point rdi and rsi do not hold result addresses (rsi is
# the advanced x pointer), so these two stores look vestigial - confirm that
# p_sin and p_cos are not read later.
mov %rdi, p_sin(%rsp) # save address for sin return
mov %rsi, p_cos(%rsp) # save address for cos return
movd %xmm10,%rax #rax is lower arg
movhpd %xmm10, p_temp+8(%rsp) #
mov p_temp+8(%rsp),%rcx #rcx = upper arg
movd %xmm1,%r8 #r8 is lower arg
movhpd %xmm1, p_temp1+8(%rsp) #
mov p_temp1+8(%rsp),%r9 #r9 = upper arg
movdqa %xmm10,%xmm12
movdqa %xmm1,%xmm13
# Signed dword compare of |x| against x: only the high dword of a negative
# element differs, so that dword becomes all ones and the others zero.
pcmpgtd %xmm6,%xmm12
pcmpgtd %xmm7,%xmm13
movdqa %xmm12,%xmm6
movdqa %xmm13,%xmm7
psrldq $4,%xmm12 # sign mask of the lower element moves to dword 0
psrldq $4,%xmm13
psrldq $8,%xmm6 # sign mask of the upper element moves to dword 1
psrldq $8,%xmm7
mov $0x3FE921FB54442D18,%rdx #piby4 +
mov $0x411E848000000000,%r10 #5e5 +
movapd .L__real_3fe0000000000000(%rip),%xmm4 #0.5 for later use +
por %xmm6,%xmm12 # low qword now holds both sign masks
por %xmm7,%xmm13
movd %xmm12,%r12 #Move Sign to gpr **
movd %xmm13,%r13 #Move Sign to gpr **
movapd %xmm10,%xmm2 #x0
movapd %xmm1,%xmm3 #x1
movapd %xmm10,%xmm6 #x0
movapd %xmm1,%xmm7 #x1
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# xmm2 = x, xmm4 =0.5/t, xmm6 =x
# xmm3 = x, xmm5 =0.5/t, xmm7 =x
.align 16
.Leither_or_both_arg_gt_than_piby4:
# Range dispatch followed by the fast path for four arguments below 5e5.
# rax, rcx, r8, r9 hold the bit patterns of |x0|..|x3| and r10 that of 5e5.
# The unsigned compares order these non-negative doubles by magnitude; NaN and
# Inf bit patterns also compare above 5e5 and so take the slow paths.
# On the fast path the code
#   1. reduces each argument: npi2 = (int)(x*twobypi + 0.5), r = x - npi2*pi/2
#      using the split constants piby2_1, piby2_2 and piby2_2tail,
#   2. derives the result sign bits and stores them in p_sign*_sin / p_sign*_cos,
#   3. evaluates both the cos and the sin polynomial for every element,
#   4. jumps through .Levensin_oddcos_tbl with a 4-bit odd-region index.
# State at the jump:
#   xmm4 / xmm5 = cos polynomial of r for elements 0,1 / 2,3
#   xmm6 / xmm7 = sin polynomial of r for elements 0,1 / 2,3
cmp %r10,%rax
jae .Lfirst_or_next3_arg_gt_5e5
cmp %r10,%rcx
jae .Lsecond_or_next2_arg_gt_5e5
cmp %r10,%r8
jae .Lthird_or_fourth_arg_gt_5e5
cmp %r10,%r9
jae .Lfourth_arg_gt_5e5
# /* Find out what multiple of piby2 */
# npi2 = (int)(x * twobypi + 0.5);
movapd .L__real_3fe45f306dc9c883(%rip),%xmm10
mulpd %xmm10,%xmm2 # * twobypi
mulpd %xmm10,%xmm3 # * twobypi
addpd %xmm4,%xmm2 # +0.5, npi2
addpd %xmm4,%xmm3 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm10 # piby2_1
movapd .L__real_3ff921fb54400000(%rip),%xmm1 # piby2_1
cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers
cvttpd2dq %xmm3,%xmm5 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2
movapd .L__real_3dd0b4611a600000(%rip),%xmm9 # piby2_2
cvtdq2pd %xmm4,%xmm2 # and back to double.
cvtdq2pd %xmm5,%xmm3 # and back to double.
# /* Subtract the multiple from x to get an extra-precision remainder */
movd %xmm4,%r8 # Region (two 32-bit npi2 values, elements 0 and 1)
movd %xmm5,%r9 # Region (two 32-bit npi2 values, elements 2 and 3)
#DELETE
# mov .LQWORD,%rdx PTR __reald_one_zero ;compare value for cossin path
#DELETE
mov %r8,%r10
mov %r9,%r11
# rhead = x - npi2 * piby2_1;
mulpd %xmm2,%xmm10 # npi2 * piby2_1;
mulpd %xmm3,%xmm1 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm2,%xmm8 # rtail
mulpd %xmm3,%xmm9 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm10,%xmm6 # rhead = x - npi2 * piby2_1;
subpd %xmm1,%xmm7 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm6,%xmm10 # t
movapd %xmm7,%xmm1 # t
# rhead = t - rtail;
subpd %xmm8,%xmm10 # rhead
subpd %xmm9,%xmm1 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail
mulpd .L__real_3ba3198a2e037073(%rip),%xmm3 # npi2 * piby2_2tail
subpd %xmm10,%xmm6 # t-rhead
subpd %xmm1,%xmm7 # t-rhead
subpd %xmm6,%xmm8 # - ((t - rhead) - rtail)
subpd %xmm7,%xmm9 # - ((t - rhead) - rtail)
addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
addpd %xmm3,%xmm9 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# xmm4 = npi2 (int), xmm10 =rhead, xmm8 =rtail, r8 = region, r10 = region, r12 = Sign
# xmm5 = npi2 (int), xmm1 =rhead, xmm9 =rtail, r9 = region, r11 = region, r13 = Sign
# Sign derivation. Every GPR below carries two elements, one per 32-bit half:
#   sin sign = bit1(region) XOR sign(x)
#   cos sign = bit0(region) XOR bit1(region)
and .L__reald_one_one(%rip),%r8 #odd/even region for cos/sin
and .L__reald_one_one(%rip),%r9 #odd/even region for cos/sin
# NEW
#ADDED
mov %r10,%rdi # npi2 in int
mov %r11,%rsi # npi2 in int
#ADDED
shr $1,%r10 # 0 and 1 => 0
shr $1,%r11 # 2 and 3 => 1
mov %r10,%rax
mov %r11,%rcx
#ADDED
xor %r10,%rdi # xor last 2 bits of region for cos
xor %r11,%rsi # xor last 2 bits of region for cos
#ADDED
not %r12 #~(sign)
not %r13 #~(sign)
and %r12,%r10 #region & ~(sign)
and %r13,%r11 #region & ~(sign)
not %rax #~(region)
not %rcx #~(region)
not %r12 #~~(sign)
not %r13 #~~(sign)
and %r12,%rax #~region & ~~(sign)
and %r13,%rcx #~region & ~~(sign)
#ADDED
and .L__reald_one_one(%rip),%rdi # sign for cos
and .L__reald_one_one(%rip),%rsi # sign for cos
#ADDED
or %rax,%r10
or %rcx,%r11
and .L__reald_one_one(%rip),%r10 # sign for sin
and .L__reald_one_one(%rip),%r11 # sign for sin
mov %r10,%r12
mov %r11,%r13
#ADDED
mov %rdi,%rax
mov %rsi,%rcx
#ADDED
and .L__reald_one_zero(%rip),%r12 #mask out the lower sign bit leaving the upper sign bit
and .L__reald_one_zero(%rip),%r13 #mask out the lower sign bit leaving the upper sign bit
#ADDED
and .L__reald_one_zero(%rip),%rax #mask out the lower sign bit leaving the upper sign bit
and .L__reald_one_zero(%rip),%rcx #mask out the lower sign bit leaving the upper sign bit
#ADDED
shl $63,%r10 #shift lower sign bit left by 63 bits
shl $63,%r11 #shift lower sign bit left by 63 bits
shl $31,%r12 #shift upper sign bit left by 31 bits
shl $31,%r13 #shift upper sign bit left by 31 bits
#ADDED
shl $63,%rdi #shift lower sign bit left by 63 bits
shl $63,%rsi #shift lower sign bit left by 63 bits
shl $31,%rax #shift upper sign bit left by 31 bits
shl $31,%rcx #shift upper sign bit left by 31 bits
#ADDED
mov %r10,p_sign_sin(%rsp) #write out lower sign bit
mov %r12,p_sign_sin+8(%rsp) #write out upper sign bit
mov %r11,p_sign1_sin(%rsp) #write out lower sign bit
mov %r13,p_sign1_sin+8(%rsp) #write out upper sign bit
mov %rdi,p_sign_cos(%rsp) #write out lower sign bit
mov %rax,p_sign_cos+8(%rsp) #write out upper sign bit
mov %rsi,p_sign1_cos(%rsp) #write out lower sign bit
mov %rcx,p_sign1_cos+8(%rsp) #write out upper sign bit
# NEW
# GET_BITS_DP64(rhead-rtail, uy); ; originally only rhead
# xmm4 = Sign, xmm10 =rhead, xmm8 =rtail
# xmm5 = Sign, xmm1 =rhead, xmm9 =rtail
movapd %xmm10,%xmm6 # rhead
movapd %xmm1,%xmm7 # rhead
subpd %xmm8,%xmm10 # r = rhead - rtail
subpd %xmm9,%xmm1 # r = rhead - rtail
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# xmm4 = Sign, xmm10 = r, xmm6 =rhead, xmm8 =rtail
# xmm5 = Sign, xmm1 = r, xmm7 =rhead, xmm9 =rtail
# subpd %xmm10,%xmm6 ;rr=rhead-r
# subpd %xmm1,%xmm7 ;rr=rhead-r
mov %r8,%rax
mov %r9,%rcx
movapd %xmm10,%xmm2 # move r for r2
movapd %xmm1,%xmm3 # move r for r2
mulpd %xmm10,%xmm2 # r2
mulpd %xmm1,%xmm3 # r2
# subpd xmm6, xmm8 ;rr=(rhead-r) -rtail
# subpd xmm7, xmm9 ;rr=(rhead-r) -rtail
# Build the jump table index: bit k is set when the region of element k is odd.
and .L__reald_zero_one(%rip),%rax # region for jump table
and .L__reald_zero_one(%rip),%rcx
shr $31,%r8 # odd bit of the upper element moves from bit 32 to bit 1
shr $31,%r9
or %r8,%rax
or %r9,%rcx
shl $2,%rcx # elements 2 and 3 occupy index bits 2 and 3
or %rcx,%rax
# HARSHA ADDED
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign_sin = Sign, p_sign_cos = Sign, xmm10 = r, xmm2 = r2
# p_sign1_sin = Sign, p_sign1_cos = Sign, xmm1 = r, xmm3 = r2
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movapd %xmm2,%xmm14 # for x3
movapd %xmm3,%xmm15 # for x3
movapd %xmm2,%xmm0 # for r
movapd %xmm3,%xmm11 # for r
movdqa .Lcosarray+0x30(%rip),%xmm4 # c4
movdqa .Lcosarray+0x30(%rip),%xmm5 # c4
movapd .Lcosarray+0x10(%rip),%xmm8 # c2
movapd .Lcosarray+0x10(%rip),%xmm9 # c2
movdqa .Lsinarray+0x30(%rip),%xmm6 # s4
movdqa .Lsinarray+0x30(%rip),%xmm7 # s4
movapd .Lsinarray+0x10(%rip),%xmm12 # s2
movapd .Lsinarray+0x10(%rip),%xmm13 # s2
mulpd .L__real_3fe0000000000000(%rip),%xmm0 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm10,%xmm14 # x3
mulpd %xmm1,%xmm15 # x3
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
mulpd %xmm2,%xmm6 # s4*x2
mulpd %xmm3,%xmm7 # s4*x2
mulpd %xmm2,%xmm12 # s2*x2
mulpd %xmm3,%xmm13 # s2*x2
subpd .L__real_3ff0000000000000(%rip),%xmm0 # -t=r-1.0 ;trash r
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0 ;trash r
mulpd %xmm2,%xmm2 # x4
mulpd %xmm3,%xmm3 # x4
addpd .Lcosarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lcosarray+0x20(%rip),%xmm5 # c3+x2c4
addpd .Lcosarray(%rip),%xmm8 # c1+x2c2
addpd .Lcosarray(%rip),%xmm9 # c1+x2c2
addpd .Lsinarray+0x20(%rip),%xmm6 # s3+x2s4
addpd .Lsinarray+0x20(%rip),%xmm7 # s3+x2s4
addpd .Lsinarray(%rip),%xmm12 # s1+x2s2
addpd .Lsinarray(%rip),%xmm13 # s1+x2s2
mulpd %xmm2,%xmm4 # x4(c3+x2c4)
mulpd %xmm3,%xmm5 # x4(c3+x2c4)
mulpd %xmm2,%xmm6 # x4(s3+x2s4)
mulpd %xmm3,%xmm7 # x4(s3+x2s4)
addpd %xmm8,%xmm4 # zc
addpd %xmm9,%xmm5 # zc
addpd %xmm12,%xmm6 # zs
addpd %xmm13,%xmm7 # zs
mulpd %xmm2,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # x4 * zc
mulpd %xmm14,%xmm6 # x3 * zs
mulpd %xmm15,%xmm7 # x3 * zs
subpd %xmm0,%xmm4 # - (-t)
subpd %xmm11,%xmm5 # - (-t)
addpd %xmm10,%xmm6 # +x
addpd %xmm1,%xmm7 # +x
# HARSHA ADDED
lea .Levensin_oddcos_tbl(%rip),%rcx
jmp *(%rcx,%rax,8) #Jmp table for cos/sin calculation based on even/odd region
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lfirst_or_next3_arg_gt_5e5:
# Element 0 is at or above 5e5 (or NaN/Inf). When element 1 is as well, control
# goes to .Lboth_arg_gt_5e5; otherwise element 1 is reduced inline with scalar
# code and element 0 is handed to __remainder_piby2d2f (or patched when it is
# NaN/Inf). Results land in r, rr+8 and region for elements 0 and 1, after
# which control continues at .Lcheck_next2_args for elements 2 and 3.
# %rcx,,%rax r8, r9
cmp %r10,%rcx #is upper arg >= 5e5
jae .Lboth_arg_gt_5e5
.Llower_arg_gt_5e5:
# Upper Arg is < 5e5, Lower arg is >= 5e5
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
# Be sure not to use %xmm3,%xmm1 and xmm7
# Use %xmm8,,%xmm5 xmm0, xmm12
# %xmm11,,%xmm9 xmm13
movlpd %xmm10,r(%rsp) #Save lower fp arg for remainder_piby2 call
movhlps %xmm10,%xmm10 #Needed since we want to work on upper arg
movhlps %xmm2,%xmm2
movhlps %xmm6,%xmm6
# Work on Upper arg
# Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi
addsd %xmm4,%xmm2 # xmm2 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm8 # xmm8 = piby2_1
cvttsd2si %xmm2,%ecx # ecx = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm0 # xmm0 = piby2_2
cvtsi2sd %ecx,%xmm2 # xmm2 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm2,%xmm8 # npi2 * piby2_1
subsd %xmm8,%xmm6 # xmm6 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm12 # xmm12 =piby2_2tail
#t = rhead;
movsd %xmm6,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm2,%xmm0 # xmm0 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm0,%xmm6 # xmm6 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm2,%xmm12 # npi2 * piby2_2tail
subsd %xmm6,%xmm5 # t-rhead
subsd %xmm5,%xmm0 # (rtail-(t-rhead))
addsd %xmm12,%xmm0 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %ecx,region+4(%rsp) # store upper region
movsd %xmm6,%xmm10
subsd %xmm0,%xmm10 # xmm10 = r=(rhead-rtail)
subsd %xmm10,%xmm6 # rr=rhead-r
subsd %xmm0,%xmm6 # xmm6 = rr=((rhead-r) -rtail)
movlpd %xmm10,r+8(%rsp) # store upper r
movlpd %xmm6,rr+8(%rsp) # store upper rr
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#Note that volatiles will be trashed by the call
#We will construct r, rr, region and sign
# Work on Lower arg
mov $0x07ff0000000000000,%r11 # is lower arg nan/inf
mov %r11,%r10
and %rax,%r10 # r10 = exponent field of the lower arg
cmp %r11,%r10
jz .L__vrs4_sincosf_lower_naninf
# Spill everything that is still needed for elements 2 and 3 across the call.
mov %r8,p_temp(%rsp)
mov %r9,p_temp2(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region(%rsp),%rdx # lower arg is **NOT** nan/inf
lea r(%rsp),%rsi
# changed input from xmm10 to xmm0
mov r(%rsp),%rdi #Restore lower fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
mov p_temp(%rsp),%r8
mov p_temp2(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
jmp 0f
.L__vrs4_sincosf_lower_naninf:
mov $0x00008000000000000,%r11
or %r11,%rax
mov %rax,r(%rsp) # r = x | 0x0008000000000000
mov %r10d,region(%rsp) # region =0 (low 32 bits of the exponent mask are zero)
.align 16
0:
jmp .Lcheck_next2_args
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lboth_arg_gt_5e5:
# Elements 0 and 1 are both at or above 5e5 (or NaN/Inf). Each one is either
# handed to __remainder_piby2d2f(bits of x, &r, &region) or, when it is
# NaN/Inf, patched so that r = x | 0x0008000000000000 and region = 0.
# In:  rax, rcx = bit patterns of |x0|, |x1|; xmm10 = |x0|, |x1|
#      r8, r9, xmm1, xmm3, xmm7 = state for elements 2 and 3 (preserved)
# Out: r, r+8, region, region+4 filled in; control continues at
#      .Lcheck_next2_args.
# The upper argument is taken from rcx for the second call. rcx is spilled to
# p_temp and reloaded around the first call, whereas xmm6 is a caller-saved
# register under the SysV ABI and may be overwritten by the callee.
#Upper Arg is >= 5e5, Lower arg is >= 5e5
# %rcx,,%rax r8, r9
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
movhlps %xmm10,%xmm6 #copy of upper fp arg (no longer relied on across the call)
mov $0x07ff0000000000000,%r11 #is lower arg nan/inf
mov %r11,%r10
and %rax,%r10
cmp %r11,%r10
jz .L__vrs4_sincosf_lower_naninf_of_both_gt_5e5
mov %rcx,p_temp(%rsp) #Save upper arg
mov %r8,p_temp2(%rsp)
mov %r9,p_temp4(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region(%rsp),%rdx #lower arg is **NOT** nan/inf
lea r(%rsp),%rsi
# added ins- changed input from xmm10 to xmm0
movd %xmm10,%rdi
call __remainder_piby2d2f@PLT
mov p_temp2(%rsp),%r8
mov p_temp4(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
mov p_temp(%rsp),%rcx #Restore upper arg
jmp 0f
.L__vrs4_sincosf_lower_naninf_of_both_gt_5e5: #lower arg is nan/inf
mov $0x00008000000000000,%r11
or %r11,%rax
mov %rax,r(%rsp) #r = x | 0x0008000000000000
mov %r10d,region(%rsp) #region = 0
.align 16
0:
mov $0x07ff0000000000000,%r11 #is upper arg nan/inf
mov %r11,%r10
and %rcx,%r10
cmp %r11,%r10
jz .L__vrs4_sincosf_upper_naninf_of_both_gt_5e5
mov %r8,p_temp2(%rsp)
mov %r9,p_temp4(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region+4(%rsp),%rdx #upper arg is **NOT** nan/inf
lea r+8(%rsp),%rsi
# rcx still holds the bit pattern of the upper arg on both paths reaching here
mov %rcx,%rdi #upper fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
mov p_temp2(%rsp),%r8
mov p_temp4(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
jmp 0f
.L__vrs4_sincosf_upper_naninf_of_both_gt_5e5:
mov $0x00008000000000000,%r11
or %r11,%rcx
mov %rcx,r+8(%rsp) #r = x | 0x0008000000000000
mov %r10d,region+4(%rsp) #region = 0
.align 16
0:
jmp .Lcheck_next2_args
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lsecond_or_next2_arg_gt_5e5:
# Element 0 is below 5e5 and element 1 is at or above it (or NaN/Inf).
# Element 0 is reduced inline with scalar code; element 1 is handed to
# __remainder_piby2d2f or patched when it is NaN/Inf. Results land in r, r+8,
# region and region+4, after which control continues at .Lcheck_next2_args.
# Upper Arg is >= 5e5, Lower arg is < 5e5
# %rcx,,%rax r8, r9
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
# Do not use %xmm3,,%xmm1 xmm7
# Restore xmm4 and %xmm3,,%xmm1 xmm7
# Can use %xmm0,,%xmm8 xmm12
# %xmm9,,%xmm5 xmm11, xmm13
movhpd %xmm10,r+8(%rsp) #Save upper fp arg for remainder_piby2 call
# Work on Lower arg
# Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # x*twobypi
addsd %xmm4,%xmm2 # xmm2 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm8 # xmm8 = piby2_1
cvttsd2si %xmm2,%eax # eax = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm0 # xmm0 = piby2_2
cvtsi2sd %eax,%xmm2 # xmm2 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm2,%xmm8 # npi2 * piby2_1
subsd %xmm8,%xmm6 # xmm6 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm12 # xmm12 =piby2_2tail
#t = rhead;
movsd %xmm6,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm2,%xmm0 # xmm0 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm0,%xmm6 # xmm6 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm2,%xmm12 # npi2 * piby2_2tail
subsd %xmm6,%xmm5 # t-rhead
subsd %xmm5,%xmm0 # (rtail-(t-rhead))
addsd %xmm12,%xmm0 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %eax,region(%rsp) # store lower region
# movsd %xmm6,%xmm10
# subsd xmm10,xmm0 ; xmm10 = r=(rhead-rtail)
# subsd %xmm10,%xmm6 ; rr=rhead-r
# subsd xmm6, xmm0 ; xmm6 = rr=((rhead-r) -rtail)
subsd %xmm0,%xmm6 # xmm6 = r=(rhead-rtail)
# movlpd QWORD PTR r[rsp], xmm10 ; store upper r
# movlpd QWORD PTR rr[rsp], xmm6 ; store upper rr
movlpd %xmm6,r(%rsp) # store lower r
#Work on Upper arg
#Note that volatiles will be trashed by the call
#We do not care since this is the last check
#We will construct r, rr, region and sign
mov $0x07ff0000000000000,%r11 # is upper arg nan/inf
mov %r11,%r10
and %rcx,%r10 # r10 = exponent field of the upper arg
cmp %r11,%r10
jz .L__vrs4_sincosf_upper_naninf
# Spill everything that is still needed for elements 2 and 3 across the call.
mov %r8,p_temp(%rsp)
mov %r9,p_temp2(%rsp)
movapd %xmm1,p_temp1(%rsp)
movapd %xmm3,p_temp3(%rsp)
movapd %xmm7,p_temp5(%rsp)
lea region+4(%rsp),%rdx # upper arg is **NOT** nan/inf
lea r+8(%rsp),%rsi
# changed input from xmm10 to xmm0
mov r+8(%rsp),%rdi #Restore upper fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
mov p_temp(%rsp),%r8
mov p_temp2(%rsp),%r9
movapd p_temp1(%rsp),%xmm1
movapd p_temp3(%rsp),%xmm3
movapd p_temp5(%rsp),%xmm7
jmp 0f
.L__vrs4_sincosf_upper_naninf:
mov $0x00008000000000000,%r11
or %r11,%rcx
mov %rcx,r+8(%rsp) # r = x | 0x0008000000000000
mov %r10d,region+4(%rsp) # region =0 (low 32 bits of the exponent mask are zero)
.align 16
0:
jmp .Lcheck_next2_args
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcheck_next2_args:
# Elements 0 and 1 have been reduced by one of the slow paths. This block
# classifies elements 2 and 3 (r8, r9 = bit patterns of |x2|, |x3|). When both
# are below 5e5 they are reduced here with packed code, storing the regions to
# region1 and r to r1, and control continues at .L__vrs4_sincosf_reconstruct.
# In: xmm3 = xmm7 = |x| for elements 2 and 3.
mov $0x411E848000000000,%r10 #5e5 +
cmp %r10,%r8
jae .Lfirst_second_done_third_or_fourth_arg_gt_5e5
cmp %r10,%r9
jae .Lfirst_second_done_fourth_arg_gt_5e5
# Work on next two args, both < 5e5
# %xmm3,,%xmm1 xmm5 = x, xmm4 = 0.5
movapd .L__real_3fe0000000000000(%rip),%xmm4 #Restore 0.5
mulpd .L__real_3fe45f306dc9c883(%rip),%xmm3 # * twobypi
addpd %xmm4,%xmm3 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm1 # piby2_1
cvttpd2dq %xmm3,%xmm5 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm9 # piby2_2
cvtdq2pd %xmm5,%xmm3 # and back to double.
###
# /* Subtract the multiple from x to get an extra-precision remainder */
movlpd %xmm5,region1(%rsp) # Region (both 32-bit values in one store)
###
# rhead = x - npi2 * piby2_1;
mulpd %xmm3,%xmm1 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm3,%xmm9 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm1,%xmm7 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm7,%xmm1 # t
# rhead = t - rtail;
subpd %xmm9,%xmm1 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm3 # npi2 * piby2_2tail
subpd %xmm1,%xmm7 # t-rhead
subpd %xmm7,%xmm9 # - ((t - rhead) - rtail)
addpd %xmm3,%xmm9 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
# movapd %xmm1,%xmm7 ; rhead
subpd %xmm9,%xmm1 # r = rhead - rtail
movapd %xmm1,r1(%rsp)
# subpd %xmm1,%xmm7 ; rr=rhead-r
# subpd xmm7, xmm9 ; rr=(rhead-r) -rtail
# movapd OWORD PTR rr1[rsp], xmm7
jmp .L__vrs4_sincosf_reconstruct
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lthird_or_fourth_arg_gt_5e5:
# Elements 0 and 1 are below 5e5 and are reduced here with packed code,
# storing the regions to region and r to the r slot. Execution then falls
# through into .Lfirst_second_done_third_or_fourth_arg_gt_5e5, which handles
# elements 2 and 3.
#first two args are < 5e5, third arg >= 5e5, fourth arg >= 5e5 or < 5e5
# %rcx,,%rax r8, r9
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
# Do not use %xmm3,,%xmm1 xmm7
# Can use %xmm11,,%xmm9 xmm13
# %xmm8,,%xmm5 xmm0, xmm12
# Restore xmm4
# Work on first two args, both < 5e5
mulpd .L__real_3fe45f306dc9c883(%rip),%xmm2 # * twobypi
addpd %xmm4,%xmm2 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm10 # piby2_1
cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2
cvtdq2pd %xmm4,%xmm2 # and back to double.
###
# /* Subtract the multiple from x to get an extra-precision remainder */
movlpd %xmm4,region(%rsp) # Region (both 32-bit values in one store)
###
# rhead = x - npi2 * piby2_1;
mulpd %xmm2,%xmm10 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm2,%xmm8 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm10,%xmm6 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm6,%xmm10 # t
# rhead = t - rtail;
subpd %xmm8,%xmm10 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail
subpd %xmm10,%xmm6 # t-rhead
subpd %xmm6,%xmm8 # - ((t - rhead) - rtail)
addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
# movapd %xmm10,%xmm6 ; rhead
subpd %xmm8,%xmm10 # r = rhead - rtail
movapd %xmm10,r(%rsp)
# subpd %xmm10,%xmm6 ; rr=rhead-r
# subpd xmm6, xmm8 ; rr=(rhead-r) -rtail
# movapd OWORD PTR rr[rsp], xmm6
# Work on next two args, third arg >= 5e5, fourth arg >= 5e5 or < 5e5
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.Lfirst_second_done_third_or_fourth_arg_gt_5e5:
# Elements 0 and 1 are done; element 2 is at or above 5e5 (or NaN/Inf).
# When element 3 is as well, control goes to .Lboth_arg_gt_5e5_higher.
# Otherwise element 3 is reduced inline with scalar code and element 2 is
# handed to __remainder_piby2d2f or patched when it is NaN/Inf. Results land
# in r1, r1+8, region1 and region1+4, after which control continues at
# .L__vrs4_sincosf_reconstruct. Nothing is reloaded after the call here.
# In: r8, r9 = bit patterns of |x2|, |x3|; xmm1 = xmm3 = xmm7 = |x2|, |x3|.
# %rcx,,%rax r8, r9
# %xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
mov $0x411E848000000000,%r10 #5e5 +
cmp %r10,%r9
jae .Lboth_arg_gt_5e5_higher
# Upper Arg is <5e5, Lower arg is >= 5e5
# %r9,%r8
# %xmm3,,%xmm1 xmm7 = x, xmm4 = 0.5
movlpd %xmm1,r1(%rsp) #Save lower fp arg for remainder_piby2 call
movhlps %xmm1,%xmm1 #Needed since we want to work on upper arg
movhlps %xmm3,%xmm3
movhlps %xmm7,%xmm7
# Work on Upper arg
# Lower arg might contain nan/inf, to avoid exception use only scalar instructions on upper arg which has been moved to lower portions of fp regs
movapd .L__real_3fe0000000000000(%rip),%xmm4 # Restore 0.5
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm3 # x*twobypi
addsd %xmm4,%xmm3 # xmm3 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm2 # xmm2 = piby2_1
cvttsd2si %xmm3,%r9d # r9d = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm10 # xmm10 = piby2_2
cvtsi2sd %r9d,%xmm3 # xmm3 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm3,%xmm2 # npi2 * piby2_1
subsd %xmm2,%xmm7 # xmm7 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 =piby2_2tail
#t = rhead;
movsd %xmm7,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm3,%xmm10 # xmm10 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm10,%xmm7 # xmm7 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm3,%xmm6 # npi2 * piby2_2tail
subsd %xmm7,%xmm5 # t-rhead
subsd %xmm5,%xmm10 # (rtail-(t-rhead))
addsd %xmm6,%xmm10 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %r9d,region1+4(%rsp) # store upper region
subsd %xmm10,%xmm7 # xmm7 = r=(rhead-rtail)
movlpd %xmm7,r1+8(%rsp) # store upper r
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#Note that volatiles will be trashed by the call
#We do not care since this is the last check
#We will construct r, rr, region and sign
# Work on Lower arg
mov $0x07ff0000000000000,%r11 # is lower arg nan/inf
mov %r11,%r10
and %r8,%r10 # r10 = exponent field of the lower arg
cmp %r11,%r10
jz .L__vrs4_sincosf_lower_naninf_higher
lea region1(%rsp),%rdx # lower arg is **NOT** nan/inf
lea r1(%rsp),%rsi
# changed input from xmm10 to xmm0
mov r1(%rsp),%rdi #Restore lower fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
jmp 0f
.L__vrs4_sincosf_lower_naninf_higher:
mov $0x00008000000000000,%r11
or %r11,%r8
mov %r8,r1(%rsp) # r = x | 0x0008000000000000
mov %r10d,region1(%rsp) # region =0 (low 32 bits of the exponent mask are zero)
.align 16
0:
jmp .L__vrs4_sincosf_reconstruct
.align 16
.Lboth_arg_gt_5e5_higher:
# Upper Arg is >= 5e5, Lower arg is >= 5e5
# Both elements of the second pair go through __remainder_piby2d2f, unless an
# element is nan/inf, in which case r = x | 0x0008000000000000 and region = 0.
# %r9,%r8
# %xmm3,,%xmm1 xmm7 = x, xmm4 = 0.5
#
# The upper fp arg must survive the first call. Under the System V ABI every
# xmm register is caller-saved, so it is parked in the upper r slot on the
# stack (as the sibling paths do) instead of in %xmm7.
# NOTE(review): like the sibling paths, this relies on __remainder_piby2d2f
# writing only one 8-byte double through %rsi - confirm against its definition.
movhpd %xmm1,r1+8(%rsp) #Save upper fp arg for remainder_piby2 call
mov $0x07ff0000000000000,%r11 #is lower arg nan/inf
mov %r11,%r10
and %r8,%r10 #isolate the exponent field of the lower arg
cmp %r11,%r10 #all exponent bits set => nan/inf
jz .L__vrs4_sincosf_lower_naninf_of_both_gt_5e5_higher
mov %r9,p_temp1(%rsp) #Save upper arg bits (%r9 is volatile across the call)
lea region1(%rsp),%rdx #lower arg is **NOT** nan/inf
lea r1(%rsp),%rsi
# the argument is handed over as raw bits in %rdi
movd %xmm1,%rdi
call __remainder_piby2d2f@PLT
mov p_temp1(%rsp),%r9 #Restore upper arg
jmp 0f
.L__vrs4_sincosf_lower_naninf_of_both_gt_5e5_higher: #lower arg is nan/inf
mov $0x00008000000000000,%r11
or %r11,%r8
mov %r8,r1(%rsp) #r = x | 0x0008000000000000
mov %r10d,region1(%rsp) #region = 0 (low 32 bits of the exponent mask are zero)
.align 16
0:
mov $0x07ff0000000000000,%r11 #is upper arg nan/inf
mov %r11,%r10
and %r9,%r10
cmp %r11,%r10
jz .L__vrs4_sincosf_upper_naninf_of_both_gt_5e5_higher
lea region1+4(%rsp),%rdx #upper arg is **NOT** nan/inf
lea r1+8(%rsp),%rsi
# the argument is handed over as raw bits in %rdi
mov r1+8(%rsp),%rdi #Restore upper fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
jmp 0f
.L__vrs4_sincosf_upper_naninf_of_both_gt_5e5_higher:
mov $0x00008000000000000,%r11
or %r11,%r9
mov %r9,r1+8(%rsp) #r = x | 0x0008000000000000
mov %r10d,region1+4(%rsp) #region = 0
.align 16
0:
jmp .L__vrs4_sincosf_reconstruct
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lfourth_arg_gt_5e5:
#first two args are < 5e5, third arg < 5e5, fourth arg >= 5e5
#%rcx,,%rax r8, r9
#%xmm2,,%xmm10 xmm6 = x, xmm4 = 0.5
# Work on first two args, both < 5e5
# Packed reduction of the first pair: npi2 = trunc(x*twobypi + 0.5), then the
# three-part constant piby2_1 / piby2_2 / piby2_2tail is subtracted in stages.
# Execution falls through into the handling of the second pair afterwards.
mulpd .L__real_3fe45f306dc9c883(%rip),%xmm2 # * twobypi
addpd %xmm4,%xmm2 # +0.5, npi2
movapd .L__real_3ff921fb54400000(%rip),%xmm10 # piby2_1
cvttpd2dq %xmm2,%xmm4 # convert packed double to packed integers
movapd .L__real_3dd0b4611a600000(%rip),%xmm8 # piby2_2
cvtdq2pd %xmm4,%xmm2 # and back to double.
###
# /* Subtract the multiple from x to get an extra-precision remainder */
movlpd %xmm4,region(%rsp) # Region (two 32-bit integers, one per element)
###
# rhead = x - npi2 * piby2_1;
mulpd %xmm2,%xmm10 # npi2 * piby2_1;
# rtail = npi2 * piby2_2;
mulpd %xmm2,%xmm8 # rtail
# rhead = x - npi2 * piby2_1;
subpd %xmm10,%xmm6 # rhead = x - npi2 * piby2_1;
# t = rhead;
movapd %xmm6,%xmm10 # t
# rhead = t - rtail;
subpd %xmm8,%xmm10 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulpd .L__real_3ba3198a2e037073(%rip),%xmm2 # npi2 * piby2_2tail
subpd %xmm10,%xmm6 # t-rhead
subpd %xmm6,%xmm8 # - ((t - rhead) - rtail)
addpd %xmm2,%xmm8 # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
# movapd %xmm10,%xmm6 ; rhead
subpd %xmm8,%xmm10 # r = rhead - rtail
movapd %xmm10,r(%rsp) # store r for both elements of the first pair
# subpd %xmm10,%xmm6 ; rr=rhead-r
# subpd xmm6, xmm8 ; rr=(rhead-r) -rtail
# movapd OWORD PTR rr[rsp], xmm6
# Work on next two args, third arg < 5e5, fourth arg >= 5e5
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.Lfirst_second_done_fourth_arg_gt_5e5:
# Second pair: the lower element is reduced inline with scalar arithmetic,
# the upper element is handed to __remainder_piby2d2f (or patched if nan/inf).
# Upper Arg is >= 5e5, Lower arg is < 5e5
# %r9,%r8
# %xmm3,,%xmm1 xmm7 = x, xmm4 = 0.5
movhpd %xmm1,r1+8(%rsp) #Save upper fp arg for remainder_piby2 call
# Work on Lower arg
# Upper arg might contain nan/inf, to avoid exception use only scalar instructions on lower arg
movapd .L__real_3fe0000000000000(%rip),%xmm4 # Restore 0.5
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm3 # x*twobypi
addsd %xmm4,%xmm3 # xmm3 = npi2=(x*twobypi+0.5)
movsd .L__real_3ff921fb54400000(%rip),%xmm2 # xmm2 = piby2_1
cvttsd2si %xmm3,%r8d # r8d = npi2 trunc to ints
movsd .L__real_3dd0b4611a600000(%rip),%xmm10 # xmm10 = piby2_2
cvtsi2sd %r8d,%xmm3 # xmm3 = npi2 trunc to doubles
#/* Subtract the multiple from x to get an extra-precision remainder */
#rhead = x - npi2 * piby2_1;
mulsd %xmm3,%xmm2 # npi2 * piby2_1
subsd %xmm2,%xmm7 # xmm7 = rhead =(x-npi2*piby2_1)
movsd .L__real_3ba3198a2e037073(%rip),%xmm6 # xmm6 =piby2_2tail
#t = rhead;
movsd %xmm7,%xmm5 # xmm5 = t = rhead
#rtail = npi2 * piby2_2;
mulsd %xmm3,%xmm10 # xmm10 =rtail=(npi2*piby2_2)
#rhead = t - rtail
subsd %xmm10,%xmm7 # xmm7 =rhead=(t-rtail)
#rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
mulsd %xmm3,%xmm6 # npi2 * piby2_2tail
subsd %xmm7,%xmm5 # t-rhead
subsd %xmm5,%xmm10 # (rtail-(t-rhead))
addsd %xmm6,%xmm10 # rtail=npi2*piby2_2tail+(rtail-(t-rhead));
#r = rhead - rtail
#rr = (rhead-r) -rtail
mov %r8d,region1(%rsp) # store lower region
# movsd %xmm7,%xmm1
# subsd xmm1, xmm10 ; xmm10 = r=(rhead-rtail)
# subsd %xmm1,%xmm7 ; rr=rhead-r
# subsd xmm7, xmm10 ; xmm6 = rr=((rhead-r) -rtail)
subsd %xmm10,%xmm7 # xmm7 = r=(rhead-rtail)
# movlpd QWORD PTR r1[rsp], xmm1 ; store upper r
# movlpd QWORD PTR rr1[rsp], xmm7 ; store upper rr
movlpd %xmm7,r1(%rsp) # store lower r
#Work on Upper arg
#Note that volatiles will be trashed by the call
#We do not care since this is the last check
#We will construct r, rr, region and sign
mov $0x07ff0000000000000,%r11 # is upper arg nan/inf
mov %r11,%r10
and %r9,%r10 # isolate the exponent field of the upper arg
cmp %r11,%r10 # all exponent bits set => nan/inf
jz .L__vrs4_sincosf_upper_naninf_higher
lea region1+4(%rsp),%rdx # upper arg is **NOT** nan/inf
lea r1+8(%rsp),%rsi
# the argument is handed over as raw bits in %rdi; %rsi/%rdx point at the r and region slots
mov r1+8(%rsp),%rdi #Restore upper fp arg for remainder_piby2 call
call __remainder_piby2d2f@PLT
jmp 0f
.L__vrs4_sincosf_upper_naninf_higher:
# nan/inf: no reduction is attempted, the argument with bit 51 set is stored as r
mov $0x00008000000000000,%r11
or %r11,%r9
mov %r9,r1+8(%rsp) # r = x | 0x0008000000000000
mov %r10d,region1+4(%rsp) # region =0 (low 32 bits of the exponent mask are zero)
.align 16
0:
jmp .L__vrs4_sincosf_reconstruct
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.L__vrs4_sincosf_reconstruct:
#Results
# Reloads the reduced arguments and regions from the stack, derives the sign
# masks for the sine and cosine results, evaluates both polynomials for all
# four elements and finally dispatches through .Levensin_oddcos_tbl.
# NOTE(review): %r12/%r13 are read here as the "A" (sign) inputs; they are
# presumably set up before this point in the same packed layout as region - confirm.
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign_sin = Sign, ; p_sign_cos = Sign, xmm10 = r, xmm2 = r2
# p_sign1_sin = Sign, ; p_sign1_cos = Sign, xmm1 = r, xmm3 = r2
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movapd r(%rsp),%xmm10 # r for the first pair
movapd r1(%rsp),%xmm1 # r for the second pair
mov region(%rsp),%r8 # two 32-bit regions of the first pair in one 64-bit load
mov region1(%rsp),%r9 # two 32-bit regions of the second pair
mov %r8,%r10
mov %r9,%r11
and .L__reald_one_one(%rip),%r8 #odd/even region for cos/sin
and .L__reald_one_one(%rip),%r9 #odd/even region for cos/sin
# NEW
#ADDED
mov %r10,%rdi # rdi/rsi = region, used for the cosine sign
mov %r11,%rsi
#ADDED
shr $1,%r10 #~AB+A~B, A is sign and B is upper bit of region
shr $1,%r11 #~AB+A~B, A is sign and B is upper bit of region
mov %r10,%rax
mov %r11,%rcx
#ADDED
xor %r10,%rdi # region ^ (region >> 1): cosine sign candidate
xor %r11,%rsi
#ADDED
not %r12 #ADDED TO CHANGE THE LOGIC
not %r13 #ADDED TO CHANGE THE LOGIC
and %r12,%r10 # ~A & B
and %r13,%r11
not %rax # ~B
not %rcx
not %r12 # back to A
not %r13
and %r12,%rax # A & ~B
and %r13,%rcx
#ADDED
and .L__reald_one_one(%rip),%rdi #(~AB+A~B)&1
and .L__reald_one_one(%rip),%rsi #(~AB+A~B)&1
#ADDED
or %rax,%r10 # ~AB + A~B, i.e. A xor B: sine sign
or %rcx,%r11
and .L__reald_one_one(%rip),%r10 #(~AB+A~B)&1
and .L__reald_one_one(%rip),%r11 #(~AB+A~B)&1
mov %r10,%r12 # r12/r13 are reused as scratch from here on
mov %r11,%r13
#ADDED
mov %rdi,%rax
mov %rsi,%rcx
#ADDED
and .L__reald_one_zero(%rip),%r12 #mask out the lower sign bit leaving the upper sign bit
and .L__reald_one_zero(%rip),%r13 #mask out the lower sign bit leaving the upper sign bit
#ADDED
and .L__reald_one_zero(%rip),%rax #mask out the lower sign bit leaving the upper sign bit
and .L__reald_one_zero(%rip),%rcx #mask out the lower sign bit leaving the upper sign bit
#ADDED
shl $63,%r10 #shift lower sign bit left by 63 bits
shl $63,%r11 #shift lower sign bit left by 63 bits
shl $31,%r12 #shift upper sign bit left by 31 bits
shl $31,%r13 #shift upper sign bit left by 31 bits
#ADDED
shl $63,%rdi #shift lower sign bit left by 63 bits
shl $63,%rsi #shift lower sign bit left by 63 bits
shl $31,%rax #shift upper sign bit left by 31 bits
shl $31,%rcx #shift upper sign bit left by 31 bits
#ADDED
# Each 16-byte slot receives one 64-bit sign mask per element (low qword, high qword)
mov %r10,p_sign_sin(%rsp) #write out lower sign bit
mov %r12,p_sign_sin+8(%rsp) #write out upper sign bit
mov %r11,p_sign1_sin(%rsp) #write out lower sign bit
mov %r13,p_sign1_sin+8(%rsp) #write out upper sign bit
mov %rdi,p_sign_cos(%rsp) #write out lower sign bit
mov %rax,p_sign_cos+8(%rsp) #write out upper sign bit
mov %rsi,p_sign1_cos(%rsp) #write out lower sign bit
mov %rcx,p_sign1_cos+8(%rsp) #write out upper sign bit
#NEW
# Build the jump-table index in %rax from the odd/even region bits kept in r8/r9:
# the first pair supplies the two low bits, the second pair the next two (shl $2).
mov %r8,%rax
mov %r9,%rcx
movapd %xmm10,%xmm2
movapd %xmm1,%xmm3
mulpd %xmm10,%xmm2 # r2
mulpd %xmm1,%xmm3 # r2
and .L__reald_zero_one(%rip),%rax
and .L__reald_zero_one(%rip),%rcx
shr $31,%r8 # bring the upper element's bit down next to the lower one
shr $31,%r9
or %r8,%rax
or %r9,%rcx
shl $2,%rcx
or %rcx,%rax
# HARSHA ADDED
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# p_sign_cos = Sign, p_sign_sin = Sign, xmm10 = r, xmm2 = r2
# p_sign1_cos = Sign, p_sign1_sin = Sign, xmm1 = r, xmm3 = r2
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Cosine polynomial accumulates in xmm4/xmm5, sine polynomial in xmm6/xmm7.
movapd %xmm2,%xmm14 # for x3
movapd %xmm3,%xmm15 # for x3
movapd %xmm2,%xmm0 # for r
movapd %xmm3,%xmm11 # for r
movdqa .Lcosarray+0x30(%rip),%xmm4 # c4
movdqa .Lcosarray+0x30(%rip),%xmm5 # c4
movapd .Lcosarray+0x10(%rip),%xmm8 # c2
movapd .Lcosarray+0x10(%rip),%xmm9 # c2
movdqa .Lsinarray+0x30(%rip),%xmm6 # c4
movdqa .Lsinarray+0x30(%rip),%xmm7 # c4
movapd .Lsinarray+0x10(%rip),%xmm12 # c2
movapd .Lsinarray+0x10(%rip),%xmm13 # c2
mulpd .L__real_3fe0000000000000(%rip),%xmm0 # r = 0.5 *x2
mulpd .L__real_3fe0000000000000(%rip),%xmm11 # r = 0.5 *x2
mulpd %xmm10,%xmm14 # x3
mulpd %xmm1,%xmm15 # x3
mulpd %xmm2,%xmm4 # c4*x2
mulpd %xmm3,%xmm5 # c4*x2
mulpd %xmm2,%xmm8 # c2*x2
mulpd %xmm3,%xmm9 # c2*x2
mulpd %xmm2,%xmm6 # c4*x2 (sine)
mulpd %xmm3,%xmm7 # c4*x2 (sine)
mulpd %xmm2,%xmm12 # c2*x2 (sine)
mulpd %xmm3,%xmm13 # c2*x2 (sine)
subpd .L__real_3ff0000000000000(%rip),%xmm0 # -t=r-1.0 ;trash r
subpd .L__real_3ff0000000000000(%rip),%xmm11 # -t=r-1.0 ;trash r
mulpd %xmm2,%xmm2 # x4
mulpd %xmm3,%xmm3 # x4
addpd .Lcosarray+0x20(%rip),%xmm4 # c3+x2c4
addpd .Lcosarray+0x20(%rip),%xmm5 # c3+x2c4
addpd .Lcosarray(%rip),%xmm8 # c1+x2c2
addpd .Lcosarray(%rip),%xmm9 # c1+x2c2
addpd .Lsinarray+0x20(%rip),%xmm6 # c3+x2c4
addpd .Lsinarray+0x20(%rip),%xmm7 # c3+x2c4
addpd .Lsinarray(%rip),%xmm12 # c1+x2c2
addpd .Lsinarray(%rip),%xmm13 # c1+x2c2
mulpd %xmm2,%xmm4 # x4(c3+x2c4)
mulpd %xmm3,%xmm5 # x4(c3+x2c4)
mulpd %xmm2,%xmm6 # x4(c3+x2c4)
mulpd %xmm3,%xmm7 # x4(c3+x2c4)
addpd %xmm8,%xmm4 # zc
addpd %xmm9,%xmm5 # zc
addpd %xmm12,%xmm6 # zs
addpd %xmm13,%xmm7 # zs
mulpd %xmm2,%xmm4 # x4 * zc
mulpd %xmm3,%xmm5 # x4 * zc
mulpd %xmm14,%xmm6 # x3 * zs
mulpd %xmm15,%xmm7 # x3 * zs
subpd %xmm0,%xmm4 # - (-t)  => cos = t + x4*zc
subpd %xmm11,%xmm5 # - (-t)
addpd %xmm10,%xmm6 # +x      => sin = x + x3*zs
addpd %xmm1,%xmm7 # +x
# HARSHA ADDED
lea .Levensin_oddcos_tbl(%rip),%rcx
jmp *(%rcx,%rax,8) #Jmp table for cos/sin calculation based on even/odd region
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.L__vrsa_sincosf_cleanup:
# Common exit of the jump-table targets: xmm4/xmm5 are taken as the cosine
# results and xmm6/xmm7 as the sine results. The sign masks are applied with
# xor and the doubles are narrowed to single precision.
movapd p_sign_cos(%rsp),%xmm10
movapd p_sign1_cos(%rsp),%xmm1
xorpd %xmm4,%xmm10 # Cos term (+) Sign
xorpd %xmm5,%xmm1 # Cos term (+) Sign
cvtpd2ps %xmm10,%xmm0 # two cos floats, first pair
cvtpd2ps %xmm1,%xmm11 # two cos floats, second pair
movapd p_sign_sin(%rsp),%xmm14
movapd p_sign1_sin(%rsp),%xmm15
xorpd %xmm6,%xmm14 # Sin term (+) Sign
xorpd %xmm7,%xmm15 # Sin term (+) Sign
cvtpd2ps %xmm14,%xmm12 # two sin floats, first pair
cvtpd2ps %xmm15,%xmm13 # two sin floats, second pair
.L__vrsa_bottom1:
# store the four sine and four cosine floats through the saved output pointers
mov save_ysa(%rsp),%r8
mov save_yca(%rsp),%r9
movlps %xmm0, (%r9) # save the cos
movlps %xmm12, (%r8) # save the sin
movlps %xmm11, 8(%r9) # save the cos
movlps %xmm13, 8(%r8) # save the sin
prefetch 32(%r8)
prefetch 32(%r9)
add $16,%r8 # advance both output pointers by four floats
add $16,%r9
mov %r8,save_ysa(%rsp) # save y_sinarray pointer
mov %r9,save_yca(%rsp) # save y_cosarray pointer
mov p_iter(%rsp),%rax # get number of iterations
sub $1,%rax
mov %rax,p_iter(%rsp) # save number of iterations
jnz .L__vrsa_top # flags still come from the sub above (mov does not alter them)
# see if we need to do any extras
mov save_nv(%rsp),%rax # get number of values
test %rax,%rax
jnz .L__vrsa_cleanup
.L__final_check:
# epilogue: restore the callee-saved registers used as scratch and release the frame
mov save_r12(%rsp),%r12 # restore r12
mov save_r13(%rsp),%r13 # restore r13
add $0x0298,%rsp
ret
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Tail handling: reached when save_nv reports leftover elements after the
# main loop. Up to three remaining inputs are read through save_xa, padded
# with zeroes to a group of four in p_temp, evaluated by a recursive call,
# and only the requested results are copied out through save_ysa / save_yca.
.align 16
.L__vrsa_cleanup:
mov save_nv(%rsp),%rax # get number of values
test %rax,%rax # are there any values
jz .L__final_check # exit if not
mov save_xa(%rsp),%rsi # rsi = next input element
# fill in a m128 with zeroes and the extra values and then make a recursive call.
xorps %xmm0,%xmm0
movss %xmm0,p_temp+4(%rsp) # zero element 1
movlps %xmm0,p_temp+8(%rsp) # zero elements 2 and 3
mov (%rsi),%ecx # we know there's at least one
mov %ecx,p_temp(%rsp)
cmp $2,%rax
jl .L__vrsacg
mov 4(%rsi),%ecx # do the second value
mov %ecx,p_temp+4(%rsp)
cmp $3,%rax
jl .L__vrsacg
mov 8(%rsi),%ecx # do the third value
mov %ecx,p_temp+8(%rsp)
.L__vrsacg:
mov $4,%rdi # parameter for N
lea p_temp(%rsp),%rsi # &x parameter
lea p_temp2(%rsp),%rdx # &ys parameter
lea p_temp3(%rsp),%rcx # &yc parameter
call vrsa_sincosf@PLT # call recursively to compute four values
# now copy the results to the destination array
# the pointers and the count are (re)loaded only here, after the call;
# %r12 is used as scratch and is restored from save_r12 at .L__final_check
mov save_ysa(%rsp),%rdi
mov save_yca(%rsp),%r12
mov save_nv(%rsp),%rax # get number of values
mov p_temp2(%rsp),%ecx
mov %ecx,(%rdi) # we know there's at least one
mov p_temp3(%rsp),%edx
mov %edx,(%r12) # we know there's at least one
cmp $2,%rax
jl .L__vrsacgf
mov p_temp2+4(%rsp),%ecx
mov %ecx,4(%rdi) # do the second value
mov p_temp3+4(%rsp),%edx
mov %edx,4(%r12) # do the second value
cmp $3,%rax
jl .L__vrsacgf
mov p_temp2+8(%rsp),%ecx
mov %ecx,8(%rdi) # do the third value
mov p_temp3+8(%rsp),%edx
mov %edx,8(%r12) # do the third value
.L__vrsacgf:
jmp .L__final_check
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;JUMP TABLE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lcoscos_coscos_piby4:
# On entry xmm4/xmm5 hold the cosine polynomial results and xmm6/xmm7 the
# sine polynomial results. Both lanes of both pairs are exchanged, one
# register pair at a time, with xmm8/xmm9 as temporaries.
# first pair: xmm4 <-> xmm6
movapd %xmm4,%xmm8
movapd %xmm6,%xmm4
movapd %xmm8,%xmm6
# second pair: xmm5 <-> xmm7
movapd %xmm5,%xmm9
movapd %xmm7,%xmm5
movapd %xmm9,%xmm7
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lcossin_cossin_piby4:
# Exchange only the high lanes between the cos and sin registers of each
# pair; the low lanes stay where they are.
# first pair: high(xmm4) <-> high(xmm6), temporaries xmm8/xmm12
movhlps %xmm4,%xmm8
movhlps %xmm6,%xmm12
movlhps %xmm8,%xmm6
movlhps %xmm12,%xmm4
# second pair: high(xmm5) <-> high(xmm7), temporaries xmm9/xmm13
movhlps %xmm5,%xmm9
movhlps %xmm7,%xmm13
movlhps %xmm9,%xmm7
movlhps %xmm13,%xmm5
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lsincos_cossin_piby4:
# First pair: exchange the high lanes of xmm4 and xmm6.
# Second pair: exchange the low lanes of xmm5 and xmm7.
# first pair, temporaries xmm8/xmm12
movhlps %xmm4,%xmm8
movhlps %xmm6,%xmm12
movlhps %xmm8,%xmm6
movlhps %xmm12,%xmm4
# second pair, temporaries xmm9/xmm13
movsd %xmm5,%xmm9
movsd %xmm7,%xmm13
movsd %xmm9,%xmm7
movsd %xmm13,%xmm5
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lsincos_sincos_piby4:
# Exchange only the low lanes between the cos and sin registers of each
# pair; the high lanes stay where they are.
# first pair: low(xmm4) <-> low(xmm6), temporaries xmm8/xmm12
movsd %xmm4,%xmm8
movsd %xmm6,%xmm12
movsd %xmm8,%xmm6
movsd %xmm12,%xmm4
# second pair: low(xmm5) <-> low(xmm7), temporaries xmm9/xmm13
movsd %xmm5,%xmm9
movsd %xmm7,%xmm13
movsd %xmm9,%xmm7
movsd %xmm13,%xmm5
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lcossin_sincos_piby4:
# First pair: exchange the low lanes of xmm4 and xmm6.
# Second pair: exchange the high lanes of xmm5 and xmm7.
# first pair, temporaries xmm8/xmm12
movsd %xmm4,%xmm8
movsd %xmm6,%xmm12
movsd %xmm8,%xmm6
movsd %xmm12,%xmm4
# second pair, temporaries xmm9/xmm13
movhlps %xmm5,%xmm9
movhlps %xmm7,%xmm13
movlhps %xmm9,%xmm7
movlhps %xmm13,%xmm5
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lcoscos_sinsin_piby4:
# Cos in %xmm5,%xmm4
# Sin in %xmm7,%xmm6
# The second pair is exchanged in full (xmm5 <-> xmm7, via xmm9);
# the first pair (xmm4/xmm6) is left as computed.
movapd %xmm5,%xmm9
movapd %xmm7,%xmm5
movapd %xmm9,%xmm7
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lsinsin_coscos_piby4:
# Cos in %xmm5,%xmm4
# Sin in %xmm7,%xmm6
# The first pair is exchanged in full (xmm4 <-> xmm6, via xmm8);
# the second pair (xmm5/xmm7) is left as computed.
movapd %xmm4,%xmm8
movapd %xmm6,%xmm4
movapd %xmm8,%xmm6
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lcoscos_cossin_piby4:
# Cosine results arrive in xmm4/xmm5, sine results in xmm6/xmm7.
# First pair: exchange the high lanes of xmm4 and xmm6.
# Second pair: exchange xmm5 and xmm7 in full.
# first pair, temporaries xmm8/xmm12
movhlps %xmm4,%xmm8
movhlps %xmm6,%xmm12
movlhps %xmm8,%xmm6
movlhps %xmm12,%xmm4
# second pair, temporary xmm9
movapd %xmm5,%xmm9
movapd %xmm7,%xmm5
movapd %xmm9,%xmm7
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lcoscos_sincos_piby4:
# Cosine results arrive in xmm4/xmm5, sine results in xmm6/xmm7.
# First pair: exchange the low lanes of xmm4 and xmm6.
# Second pair: exchange xmm5 and xmm7 in full.
# first pair, temporaries xmm8/xmm12
movsd %xmm4,%xmm8
movsd %xmm6,%xmm12
movsd %xmm8,%xmm6
movsd %xmm12,%xmm4
# second pair, temporary xmm9
movapd %xmm5,%xmm9
movapd %xmm7,%xmm5
movapd %xmm9,%xmm7
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lcossin_coscos_piby4:
# Cosine results arrive in xmm4/xmm5, sine results in xmm6/xmm7.
# Second pair: exchange the high lanes of xmm5 and xmm7.
# First pair: exchange xmm4 and xmm6 in full.
# second pair, temporaries xmm9/xmm13
movhlps %xmm5,%xmm9
movhlps %xmm7,%xmm13
movlhps %xmm9,%xmm7
movlhps %xmm13,%xmm5
# first pair, temporary xmm8
movapd %xmm4,%xmm8
movapd %xmm6,%xmm4
movapd %xmm8,%xmm6
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lcossin_sinsin_piby4:
# Cosine results arrive in xmm4/xmm5, sine results in xmm6/xmm7.
# Only the high lanes of the second pair (xmm5/xmm7) are exchanged.
# Both high lanes are captured first, then written across.
movhlps %xmm7,%xmm13 # low(xmm13) = high(xmm7)
movhlps %xmm5,%xmm9 # low(xmm9) = high(xmm5)
movlhps %xmm13,%xmm5 # high(xmm5) = former high(xmm7)
movlhps %xmm9,%xmm7 # high(xmm7) = former high(xmm5)
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lsincos_coscos_piby4:
# Cosine results arrive in xmm4/xmm5, sine results in xmm6/xmm7.
# Second pair: exchange the low lanes of xmm5 and xmm7.
# First pair: exchange xmm4 and xmm6 in full.
# second pair, temporaries xmm9/xmm13
movsd %xmm5,%xmm9
movsd %xmm7,%xmm13
movsd %xmm9,%xmm7
movsd %xmm13,%xmm5
# first pair, temporary xmm8
movapd %xmm4,%xmm8
movapd %xmm6,%xmm4
movapd %xmm8,%xmm6
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lsincos_sinsin_piby4:
# Cos in xmm4 and xmm5
# Sin in xmm6 and xmm7
# Only the low lanes of the second pair are exchanged (xmm5 <-> xmm7, via
# xmm9); register-to-register movsd leaves the high lanes untouched.
movsd %xmm5,%xmm9
movsd %xmm7,%xmm5
movsd %xmm9,%xmm7
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lsinsin_cossin_piby4:
# Cosine results arrive in xmm4/xmm5, sine results in xmm6/xmm7.
# Only the high lanes of the first pair (xmm4/xmm6) are exchanged.
# Both high lanes are captured first, then written across.
movhlps %xmm6,%xmm12 # low(xmm12) = high(xmm6)
movhlps %xmm4,%xmm8 # low(xmm8) = high(xmm4)
movlhps %xmm12,%xmm4 # high(xmm4) = former high(xmm6)
movlhps %xmm8,%xmm6 # high(xmm6) = former high(xmm4)
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lsinsin_sincos_piby4:
# Cos in xmm4 and xmm5
# Sin in xmm6 and xmm7
# Only the low lanes of the first pair are exchanged (xmm4 <-> xmm6, via
# xmm8); register-to-register movsd leaves the high lanes untouched.
movsd %xmm4,%xmm8
movsd %xmm6,%xmm4
movsd %xmm8,%xmm6
jmp .L__vrsa_sincosf_cleanup
.align 16
.Lsinsin_sinsin_piby4:
# Cos in xmm4 and xmm5
# Sin in xmm6 and xmm7
# Nothing is exchanged here: the results go to the common exit as computed.
# NOTE(review): the previous comment read "Lower and Upper odd, So Swap",
# which does not match the code (no swap is performed) - confirm the intended
# case against .Levensin_oddcos_tbl.
jmp .L__vrsa_sincosf_cleanup