#
# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
#
# This file is part of libacml_mv.
#
# libacml_mv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# libacml_mv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with libacml_mv. If not, see
# <http://www.gnu.org/licenses/>.
#
#
#
# An implementation of the sincosf function.
#
# Prototype:
#
# void sincosf(float x, float *sinfx, float *cosfx);
#
# Computes sinf(x) and cosf(x).
# It will provide proper C99 return values,
# but may not raise floating point status bits properly.
# Based on the NAG C implementation.
# Author: Harsha Jagasia
# Email: harsha.jagasia@amd.com
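#
# Overview of the flow implemented below (a summary of this file; the step
# names are informal and not taken from the original sources):
#
#   1. Widen the float argument to double and examine its bit pattern.
#   2. |x| <= pi/4: evaluate the sin/cos polynomials on x directly
#      (or return sin(x) = x, cos(x) = 1.0 when |x| < 2^(-13)).
#   3. Otherwise reduce |x| by the nearest multiple of pi/2: a two-stage
#      multiply-and-subtract reduction for |x| < 5e6, or the
#      __amd_remainder_piby2d2f helper for larger arguments, yielding a
#      remainder r and a quadrant index "region".
#   4. Evaluate packed sin/cos polynomials in r, then swap/negate the two
#      results according to region and the sign of x.
#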
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif
.data
.align 32
.L__real_3ff0000000000000: .quad 0x03ff0000000000000 # 1.0
.quad 0 # for alignment
.L__real_3fe0000000000000: .quad 0x03fe0000000000000 # 0.5
.quad 0
.L__real_3fc5555555555555: .quad 0x03fc5555555555555 # 0.166666666666
.quad 0
.L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883 # twobypi
.quad 0
.L__real_3FF921FB54442D18: .quad 0x03FF921FB54442D18 # piby2
.quad 0
.L__real_3ff921fb54400000: .quad 0x03ff921fb54400000 # piby2_1
.quad 0
.L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331 # piby2_1tail
.quad 0
.L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000 # piby2_2
.quad 0
.L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073 # piby2_2tail
.quad 0
.L__real_fffffffff8000000: .quad 0x0fffffffff8000000 # mask for stripping head and tail
.quad 0
.L__real_8000000000000000: .quad 0x08000000000000000 # -0 or signbit
.quad 0
.L__real_411E848000000000: .quad 0x415312d000000000 # 5e6 cutoff (the label keeps the old name of the 5e5 value 0x0411E848000000000)
.quad 0
.align 32
.Lcsarray:
.quad 0x0bfc5555555555555 # -0.166667 s1
.quad 0x03fa5555555555555 # 0.0416667 c1
.quad 0x03f81111111110bb3 # 0.00833333 s2
.quad 0x0bf56c16c16c16967 # -0.00138889 c2
.quad 0x0bf2a01a019e83e5c # -0.000198413 s3
.quad 0x03efa01a019f4ec90 # 2.48016e-005 c3
.quad 0x03ec71de3796cde01 # 2.75573e-006 s4
.quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4
.quad 0x0be5ae600b42fdfa7 # -2.50511e-008 s5
.quad 0x03e21eeb69037ab78 # 2.08761e-009 c5
.quad 0x03de5e0b2f9a43bb8 # 1.59181e-010 s6
.quad 0x0bda907db46cc5e42 # -1.13826e-011 c6
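# Layout note (describing how the table is consumed below): even quadwords
# hold the sin coefficients s1..s6 and odd quadwords the cos coefficients
# c1..c6, so each aligned 16-byte load .Lcsarray+0x10*k picks up the pair
# {s(k+1), c(k+1)} for packed SSE2 evaluation of both polynomials at once.
# Only the first four pairs (offsets 0x00-0x30) are referenced in this routine.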
.text
.align 16
.p2align 4,,15
#include "fn_macros.h"
#define fname FN_PROTOTYPE(sincosf)
#define fname_special _sincosf_special@PLT
# define local variable storage offsets
.equ p_temp, 0x30 # temporary for get/put bits operations; also saves ux across the helper call
.equ p_temp1, 0x40 # temporary used to save ax across the helper call
.equ p_temp2, 0x50 # temporary used to save the sin result pointer (rdi) across the helper call
.equ p_temp3, 0x60 # temporary used to save the cos result pointer (rsi) across the helper call
.equ region, 0x70 # stack slot for region, passed by address to __amd_remainder_piby2d2f
.equ r, 0x80 # stack slot for r, passed by address to __amd_remainder_piby2d2f
.equ stack_size, 0xa8
.globl fname
.type fname,@function
fname:
sub $stack_size, %rsp
xorpd %xmm2,%xmm2
# GET_BITS_DP64(x, ux);
# convert input to double.
cvtss2sd %xmm0,%xmm0
# get the input value to an integer register.
movsd %xmm0,p_temp(%rsp)
mov p_temp(%rsp),%rdx # rdx is ux
## if NaN or inf
mov $0x07ff0000000000000,%rax
mov %rax,%r10
and %rdx,%r10
cmp %rax,%r10
jz .L__sc_naninf
# ax = (ux & ~SIGNBIT_DP64);
mov $0x07fffffffffffffff,%r10
and %rdx,%r10 # r10 is ax
## if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
mov $0x03fe921fb54442d18,%rax
cmp %rax,%r10
jg .L__sc_reduce
## if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
mov $0x3f20000000000000, %rax
cmp %rax, %r10
jge .L__sc_notsmallest
# sin(x) = x, cos(x) = 1.0
movsd .L__real_3ff0000000000000(%rip),%xmm1
jmp .L__sc_cleanup
# *s = sin_piby4(x, 0.0);
# *c = cos_piby4(x, 0.0);
.L__sc_notsmallest:
xor %eax,%eax # region 0
mov %r10,%rdx
movsd .L__real_3fe0000000000000(%rip),%xmm5 # .5
jmp .L__sc_piby4
.L__sc_reduce:
# reduce the argument to be in a range from -pi/4 to +pi/4
# by subtracting multiples of pi/2
# xneg = (ax != ux);
cmp %r10,%rdx
## if (xneg) x = -x;
jz .Lpositive
subsd %xmm0,%xmm2
movsd %xmm2,%xmm0
.align 16
.Lpositive:
## if (x < 5.0e6)
cmp .L__real_411E848000000000(%rip),%r10
jae .Lsincosf_reduce_precise
movsd %xmm0,%xmm2
movsd %xmm0,%xmm4
mulsd .L__real_3fe45f306dc9c883(%rip),%xmm2 # twobypi
movsd .L__real_3fe0000000000000(%rip),%xmm5 # .5
#/* How many multiples of pi/2 does x contain? */
# xexp = ax >> EXPSHIFTBITS_DP64;
mov %r10,%r9
shr $52,%r9 # >> EXPSHIFTBITS_DP64
# npi2 = (int)(x * twobypi + 0.5);
addsd %xmm5,%xmm2 # npi2
movsd .L__real_3ff921fb54400000(%rip),%xmm3 # piby2_1
cvttpd2dq %xmm2,%xmm0 # convert to integer
movsd .L__real_3dd0b4611a626331(%rip),%xmm1 # piby2_1tail
cvtdq2pd %xmm0,%xmm2 # and back to double.
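# Note on the rounding trick used above: adding 0.5 and then truncating
# (cvttpd2dq) rounds the nonnegative product x*2/pi to the nearest integer,
# so npi2 is the count of pi/2 multiples to subtract; it is converted back
# to double to form the products with piby2_1/piby2_1tail below.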
# /* Subtract the multiple from x to get an extra-precision remainder */
# rhead = x - npi2 * piby2_1;
mulsd %xmm2,%xmm3 # use piby2_1
subsd %xmm3,%xmm4 # rhead
# rtail = npi2 * piby2_1tail;
mulsd %xmm2,%xmm1 # rtail
movd %xmm0,%eax
# GET_BITS_DP64(rhead-rtail, uy); ; originally only rhead
movsd %xmm4,%xmm0
subsd %xmm1,%xmm0
movsd .L__real_3dd0b4611a600000(%rip),%xmm3 # piby2_2
movsd .L__real_3ba3198a2e037073(%rip),%xmm5 # piby2_2tail
movd %xmm0,%rcx
# expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
shl $1,%rcx # strip any sign bit
shr $53,%rcx # >> EXPSHIFTBITS_DP64 +1
sub %rcx,%r9 # expdiff
## if (expdiff > 15)
cmp $15,%r9
jle .Lexpdiff15
# /* The remainder is pretty small compared with x, which
# implies that x is a near multiple of pi/2
# (x matches the multiple to at least 15 bits) */
# t = rhead;
movsd %xmm4,%xmm1
# rtail = npi2 * piby2_2;
mulsd %xmm2,%xmm3
# rhead = t - rtail;
mulsd %xmm2,%xmm5 # npi2 * piby2_2tail
subsd %xmm3,%xmm4 # rhead
# rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subsd %xmm4,%xmm1 # t - rhead
subsd %xmm3,%xmm1 # -rtail
subsd %xmm1,%xmm5 # rtail
# r = rhead - rtail;
movsd %xmm4,%xmm0
# xmm1 = rtail
movsd %xmm5,%xmm1
subsd %xmm5,%xmm0
# region = npi2 & 3;
# and $3,%eax # not needed: .L__adjust_region tests only the low two bits of eax
# xmm0=r, xmm4=rhead, xmm1=rtail
.Lexpdiff15:
## if the input was close to a pi/2 multiple
#
cmp $0x03f2,%rcx # rcx holds the biased exponent of r
jge .L__sc_piby4 # |r| >= 2^(-13): use the full Taylor series
cmp $0x03de,%rcx # |r| < 2^(-32)?
jle .Lsinsmall # then sin(r) = r, cos(r) = 1
movsd %xmm0,%xmm2
mulsd %xmm2,%xmm2 # x^2
# use simple low-order polynomials
# *s = x - x*x*x*0.166666666666666666;
movsd .L__real_3fc5555555555555(%rip),%xmm3 #
mulsd %xmm0,%xmm3 # * x
mulsd %xmm2,%xmm3 # * x^2
subsd %xmm3,%xmm0 # xs
# *c = 1.0 - x*x*0.5;
movsd .L__real_3ff0000000000000(%rip),%xmm1 # 1.0
mulsd .L__real_3fe0000000000000(%rip),%xmm2 # 0.5 *x^2
subsd %xmm2,%xmm1
jmp .L__adjust_region
.Lsinsmall: # sin(r) = r (xmm0 already holds r)
movsd .L__real_3ff0000000000000(%rip),%xmm1 # cos(r) = 1.0
jmp .L__adjust_region
# perform Taylor series evaluation to compute sin(x), cos(x)
# COS
# x2 = x * x;
# return (1.0 - 0.5 * x2 + (x2 * x2 *
# (c1 + x2 * (c2 + x2 * (c3 + x2 * c4)))));
# SIN
# zc,zs = (c2 + x2 * (c3 + x2 * c4 ));
# xs = r + x3 * (sc1 + x2 * zs);
# x2 = x * x;
# return (x + x * x2 * (c1 + x2 * (c2 + x2 * (c3 + x2 * c4))));
# done with reducing the argument. Now perform the sin/cos calculations.
.align 16
.L__sc_piby4:
# x2 = r * r;
movsd .L__real_3fe0000000000000(%rip),%xmm5 # .5
movsd %xmm0,%xmm2
mulsd %xmm0,%xmm2 # x2
shufpd $0,%xmm2,%xmm2 # x2,x2
movsd %xmm2,%xmm4
mulsd %xmm4,%xmm4 # x4
shufpd $0,%xmm4,%xmm4 # x4,x4
# x2m = _mm_set1_pd (x2);
# zc,zs = (c2 + x2 * (c3 + x2 * c4 ));
# xs = r + x3 * (sc1 + x2 * zs);
# xc = t + ( x2 * x2 * (cc1 + x2 * zc));
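# How the packing works (describing the instructions that follow): xmm2 is
# broadcast to {x2, x2} and xmm4 to {x4, x4}, while each .Lcsarray load
# supplies a {sin, cos} coefficient pair, so every mulpd/addpd advances the
# sin polynomial in the low half and the cos polynomial in the high half of
# the same register. The two halves are separated again once the polynomial
# terms have been accumulated.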
movapd .Lcsarray+0x30(%rip),%xmm1 # c4
movapd .Lcsarray+0x10(%rip),%xmm3 # c2
mulpd %xmm2,%xmm1 # x2c4
mulpd %xmm2,%xmm3 # x2c2
# rc = 0.5 * x2;
mulsd %xmm2,%xmm5 #rc
mulsd %xmm0,%xmm2 #x3
addpd .Lcsarray+0x20(%rip),%xmm1 # c3 + x2c4
addpd .Lcsarray(%rip),%xmm3 # c1 + x2c2
mulpd %xmm4,%xmm1 # x4(c3 + x2c4)
addpd %xmm3,%xmm1 # c1 + x2c2 + x4(c3 + x2c4)
# -t = rc-1;
subsd .L__real_3ff0000000000000(%rip),%xmm5 # 1.0
# now we have the poly for sin in the low half, and cos in upper half
mulsd %xmm1,%xmm2 # x3(sin poly)
shufpd $3,%xmm1,%xmm1 # get cos poly to low half of register
mulsd %xmm4,%xmm1 # x4(cos poly)
addsd %xmm2,%xmm0 # sin = r+...
subsd %xmm5,%xmm1 # cos = poly-(-t)
.L__adjust_region: # xmm0 is sin, xmm1 is cos
# switch (region)
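# Quadrant identities applied by the two tests below (with the reduced
# argument |x| = npi2*pi/2 + r and region = npi2 & 3):
#   region 0: sin(x) =  sin(r)   cos(x) =  cos(r)
#   region 1: sin(x) =  cos(r)   cos(x) = -sin(r)
#   region 2: sin(x) = -sin(r)   cos(x) = -cos(r)
#   region 3: sin(x) = -cos(r)   cos(x) =  sin(r)
# Bit 0 of region drives the swap/negate, bit 1 negates both results.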
mov %eax,%ecx
and $1,%eax
jz .Lregion02
# region 1 or 3
movsd %xmm0,%xmm2 # swap sin,cos
movsd %xmm1,%xmm0 # sin = cos
xorpd %xmm1,%xmm1
subsd %xmm2,%xmm1 # cos = -sin
.Lregion02:
and $2,%ecx
jz .Lregion23
# region 2 or 3
movsd %xmm0,%xmm2
movsd %xmm1,%xmm3
xorpd %xmm0,%xmm0
xorpd %xmm1,%xmm1
subsd %xmm2,%xmm0 # sin = -sin
subsd %xmm3,%xmm1 # cos = -cos
.Lregion23:
## if (xneg) *s = -*s ;
cmp %r10,%rdx
jz .L__sc_cleanup
movsd %xmm0,%xmm2
xorpd %xmm0,%xmm0
subsd %xmm2,%xmm0 # sin = -sin
.align 16
.L__sc_cleanup:
cvtsd2ss %xmm0,%xmm0 # convert back to floats
cvtsd2ss %xmm1,%xmm1
movss %xmm0,(%rdi) # save the sin
movss %xmm1,(%rsi) # save the cos
add $stack_size,%rsp
ret
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 16
.Lsincosf_reduce_precise:
# /* Reduce abs(x) into range [-pi/4,pi/4] */
# __amd_remainder_piby2(ax, &r, &region);
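# Calling detail (as set up below, presumably matching what
# __amd_remainder_piby2d2f expects): the bit pattern of the already-positive
# argument is moved from xmm0 into rdi, and the addresses of the r and region
# stack slots are passed in rsi and rdx; after the call the remainder is
# reloaded from r(%rsp) and the quadrant index from region(%rsp).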
mov %rdx,p_temp(%rsp) # save ux for use later
mov %r10,p_temp1(%rsp) # save ax for use later
mov %rdi,p_temp2(%rsp) # save the sin result pointer across the call
mov %rsi,p_temp3(%rsp) # save the cos result pointer across the call
movd %xmm0,%rdi
lea r(%rsp),%rsi
lea region(%rsp),%rdx
sub $0x040,%rsp
call __amd_remainder_piby2d2f@PLT
add $0x040,%rsp
mov p_temp(%rsp),%rdx # restore ux
mov p_temp1(%rsp),%r10 # restore ax
mov p_temp2(%rsp),%rdi # restore the sin result pointer
mov p_temp3(%rsp),%rsi # restore the cos result pointer
mov $1,%r8d # for determining region later on
movsd r(%rsp),%xmm0 # r
mov region(%rsp),%eax # region
jmp .L__sc_piby4
.align 16
.L__sc_naninf:
cvtsd2ss %xmm0,%xmm0 # convert back to floats
call fname_special # rdi and rsi are ready for the function call
add $stack_size, %rsp
ret