# blob: 6e7ca03533b017343eeecc0cd5db8abbe1e03c6e [file] [log] [blame]
#
# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
#
# This file is part of libacml_mv.
#
# libacml_mv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# libacml_mv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with libacml_mv. If not, see
# <http://www.gnu.org/licenses/>.
#
#
# Entry-point naming and section setup for the single-precision expm1
# routine. The file is written in AT&T syntax and relies on the C
# preprocessor for the two macros below.
#include "fn_macros.h"
# The exported symbol name is produced by FN_PROTOTYPE (from fn_macros.h).
#define fname FN_PROTOTYPE(expm1f)
# NOTE(review): this PLT alias is not referenced by any instruction visible
# in this file; the handle_error calls further down are commented out.
#define fname_special _expm1f_special@PLT
#ifdef __ELF__
# Empty .note.GNU-stack section, emitted for ELF targets only.
.section .note.GNU-stack,"",@progbits
#endif
.text
.p2align 4
.globl fname
.type fname, @function
fname:
        # Input x arrives in the low lane of xmm0; every exit path leaves the
        # result in the low lane of xmm0.
        # Range dispatch. Each ucomiss compares x against a constant and the
        # branch that follows picks the matching code path.
        ucomiss .L__max_expm1_arg(%rip),%xmm0
        jp      .L__Max_Arg                             #unordered compare: x is a NaN
        ja      .L__Max_Arg                             #x > max_expm1_arg
        ucomiss .L__log_OnePlus_OneByFour(%rip),%xmm0
        jae     .L__Normal_Flow                         #x >= log(1 + 1/4)
        ucomiss .L__log_OneMinus_OneByFour(%rip),%xmm0
        ja      .L__Small_Arg                           #log(1 - 1/4) < x < log(1 + 1/4)
        ucomiss .L__min_expm1_arg(%rip),%xmm0
        jb      .L__Min_Arg                             #x < min_expm1_arg
        # Otherwise min_expm1_arg <= x <= log(1 - 1/4): fall through into the
        # general path that follows.
.p2align 4
.L__Normal_Flow:
        # General path, reached when x >= log(1+1/4) or x <= log(1-1/4).
        # With n = (int)(x*32/ln2 +/- 0.5), j = n & 31 and m = n >> 5, the
        # result is assembled from table entry j, a short polynomial in the
        # reduced argument r, and an exponent adjustment by m.
        # Live on exit to the m-range branches:
        #   ecx = m, xmm2 = s, xmm3 = S_T,S_L, xmm5 = S_T, xmm6 = p
        movaps  %xmm0,%xmm1                     #xmm1 = x
        mulss   .L__thirtyTwo_by_ln2(%rip),%xmm1        #xmm1 = x*thirtyTwo_by_ln2
        movd    %xmm1,%eax                      #eax = bits of x*thirtyTwo_by_ln2
        and     $0x80000000,%eax                #keep only the sign of x*thirtyTwo_by_ln2
        or      $0x3F000000,%eax                #make +/- 0.5
        movd    %eax,%xmm2                      #xmm2 = +/- 0.5
        addss   %xmm2,%xmm1                     #xmm1 = (x*32/ln2) +/- 0.5
        # Truncate with the scalar conversion. The upper lanes of xmm1 still
        # hold whatever the caller left in the upper lanes of xmm0, so a packed
        # conversion could raise a spurious invalid-operation flag on them.
        cvttss2si %xmm1,%ecx                    #ecx = n = (int)(temp)
        movd    %ecx,%xmm2                      #xmm2 = n (upper lanes zero)
        mov     %ecx,%edx
        and     $0x0000001f,%edx                #edx = j = n & 31 (upper half of rdx is zero)
        movd    %edx,%xmm1                      #xmm1 = j (upper lanes zero)
        sarl    $5,%ecx                         #ecx = m = n >> 5
        lea     S_lead_and_trail_table(%rip),%rax
        movsd   (%rax,%rdx,8),%xmm3             #xmm3 = S_T,S_L (S_L in the low lane)
        punpckldq %xmm2,%xmm1                   #xmm1 = n,j
        psubd   %xmm1,%xmm2                     #xmm2 = n1 = n - j (low lane)
        punpcklqdq %xmm2,%xmm1                  #xmm1 = n1,n,j
        cvtdq2ps %xmm1,%xmm1                    #xmm1 = (float)(n1,n,j)
        #r2 = -(n*ln2_by_ThirtyTwo_trail);
        #r1 = (x-n1*ln2_by_ThirtyTwo_lead) - j*ln2_by_ThirtyTwo_lead;
        mulps   .L__Ln2By32_LeadTrailLead(%rip),%xmm1   #lanes: j*lead, n*trail, n1*lead
        movhlps %xmm1,%xmm2                     #xmm2 = n1*ln2/32lead
        movaps  %xmm0,%xmm4                     #xmm4 = x
        subss   %xmm2,%xmm4                     #xmm4 = x - n1*ln2/32lead
        subss   %xmm1,%xmm4                     #xmm4 = r1
        psrldq  $4,%xmm1                        #xmm1 = -r2 (n*trail); sign handled by subtracting
        #r = r1 + r2;
        movaps  %xmm4,%xmm7                     #xmm7 = r1
        subss   %xmm1,%xmm4                     #xmm4 = r = r1-(-r2) = r1 + r2
        #q = r*r*(B1+r*(B2));
        movaps  %xmm4,%xmm6                     #xmm6 = r
        mulss   .L__B2_f(%rip),%xmm6            #xmm6 = r * B2
        addss   .L__B1_f(%rip),%xmm6            #xmm6 = B1 + (r * B2)
        mulss   %xmm4,%xmm6
        mulss   %xmm4,%xmm6                     #xmm6 = q
        #p = (r2+q) + r1;
        subss   %xmm1,%xmm6                     #xmm6 = q - (-r2) = r2 + q
        addss   %xmm7,%xmm6                     #xmm6 = p
        #s = S_L.f32 + S_T.f32;
        movdqa  %xmm3,%xmm2                     #xmm2 = S_T,S_L
        psrldq  $4,%xmm2                        #xmm2 = S_T
        movaps  %xmm2,%xmm5                     #xmm5 = S_T
        addss   %xmm3,%xmm2                     #xmm2 = s
        cmp     $0xfffffff9,%ecx                #Check m < -7
        jl      .L__M_Below_Minus7
        cmp     $23,%ecx                        #Check m > 23
        jg      .L__M_Above_23
        # -8 < m < 24
        #twopm.f32 * ((S_L.f32 - twopmm.f32) + (S_L.f32*p+ S_T.f32 *(1+p)));
        movaps  %xmm3,%xmm2                     #xmm2 = S_L
        mulss   %xmm6,%xmm2                     #xmm2 = S_L * p
        addss   .L__One_f(%rip),%xmm6           #xmm6 = 1+p
        mulss   %xmm5,%xmm6                     #xmm6 = S_T *(1+p)
        addss   %xmm6,%xmm2                     #xmm2 = (S_L.f32*p+ S_T.f32 *(1+p))
        mov     $127,%eax
        sub     %ecx,%eax                       #eax = 127 - m
        shl     $23,%eax                        #eax = bits of 2^-m
        movd    %eax,%xmm1
        subss   %xmm1,%xmm3                     #xmm3 = (S_L.f32 - twopmm.f32)
        addss   %xmm3,%xmm2                     #xmm2 = ((S_L.f32 - twopmm.f32) + (S_L.f32*p+ S_T.f32 *(1+p)))
        shl     $23,%ecx                        #m moved into the exponent field
        movd    %ecx,%xmm0
        paddd   %xmm2,%xmm0                     #scale by 2^m through an integer add on the exponent
        ret
.p2align 4
.L__M_Below_Minus7:
        # Branch taken for m < -7.
        # On entry: ecx = m, xmm2 = s, xmm3 = S_L (low lane), xmm5 = S_T, xmm6 = p.
        # result = twopm.f32 * (S_L.f32 + (s*p + S_T.f32)) - 1
        shl     $23,%ecx                        #m moved into the exponent field
        movd    %ecx,%xmm0                      #xmm0 = m << 23
        mulss   %xmm6,%xmm2                     #xmm2 = s*p
        addss   %xmm5,%xmm2                     #xmm2 = s*p + S_T
        addss   %xmm3,%xmm2                     #xmm2 = S_L + (s*p + S_T)
        paddd   %xmm2,%xmm0                     #scale by 2^m through an integer add on the exponent
        subss   .L__One_f(%rip),%xmm0           #subtract 1
        ret
.p2align 4
.L__M_Above_23:
        # Branch taken for m > 23.
        # On entry: ecx = m, xmm2 = s, xmm3 = S_L (low lane), xmm5 = S_T, xmm6 = p.
        # result = twopm.f32 * (S_L.f32 + (s*p+(S_T.f32 - twopmm.f32)))
        cmp     $128,%ecx                       #m == 128 gets its own path
        je      .L__M_Equals_128
        cmp     $47,%ecx                        #m > 47: the 2^-m term is left out
        ja      .L__M_Above_47
        mov     $127,%eax
        sub     %ecx,%eax                       #eax = 127 - m
        shl     $23,%eax                        #eax = bits of 2^-m
        movd    %eax,%xmm1
        subss   %xmm1,%xmm5                     #xmm5 = S_T.f32 - twopmm.f32
.p2align 4
.L__M_Above_47:
        mulss   %xmm6,%xmm2                     #xmm2 = s*p
        addss   %xmm5,%xmm2                     #xmm2 = s*p + xmm5 (S_T, minus 2^-m when m <= 47)
        addss   %xmm3,%xmm2                     #xmm2 = S_L + xmm2
        shl     $23,%ecx                        #m moved into the exponent field
        movd    %ecx,%xmm0
        paddd   %xmm2,%xmm0                     #scale by 2^m through an integer add on the exponent
        ret
.p2align 4
.L__M_Equals_128:
        # Branch taken for m == 128.
        # On entry: xmm2 = s, xmm3 = S_L (low lane), xmm5 = S_T, xmm6 = p.
        # The value S_L + (s*p + S_T) is scaled by adding 128 to its exponent
        # field; an all-ones exponent afterwards is routed to the overflow exit.
        mulss   %xmm6,%xmm2                     #xmm2 = s*p
        addss   %xmm5,%xmm2                     #xmm2 = s*p + S_T
        addss   %xmm3,%xmm2                     #xmm2 = (S_L.f32 + (s*p + S_T.f32))
        mov     $0x40000000,%ecx                #128 << 23 = 0x3f800000 + 0x00800000
        movd    %ecx,%xmm1
        paddd   %xmm2,%xmm1                     #exponent field raised by 128
        movd    %xmm1,%ecx
        and     $0x7f800000,%ecx                #isolate the exponent field
        cmp     $0x7f800000,%ecx                #all ones: the result reached +inf
        je      .L__Overflow
        movdqa  %xmm1,%xmm0
        ret
.p2align 4
.L__Small_Arg:
        # Branch taken for log(1-1/4) < x < log(1+1/4).
        # result = x + x*x/2 + q, with the final sum carried out in double
        # precision and rounded back to float.
        movd    %xmm0,%eax
        and     $0x7fffffff,%eax                #eax = abs(x); .L__VeryTiny_Arg reads it
        cmp     $0x33000000,%eax                #check abs(x) < 2^-25
        jl      .L__VeryTiny_Arg
        #q = x*x*x*(A1 + x*(A2 + x*(A3 + x*(A4 + x*(A5)))));
        movdqa  %xmm0,%xmm1
        mulss   .L__A5_f(%rip),%xmm1
        addss   .L__A4_f(%rip),%xmm1
        mulss   %xmm0,%xmm1
        addss   .L__A3_f(%rip),%xmm1
        mulss   %xmm0,%xmm1
        addss   .L__A2_f(%rip),%xmm1
        mulss   %xmm0,%xmm1
        addss   .L__A1_f(%rip),%xmm1
        mulss   %xmm0,%xmm1
        mulss   %xmm0,%xmm1
        mulss   %xmm0,%xmm1                     #xmm1 = q
        # Scalar conversions only: the lane above x in xmm0 is whatever the
        # caller left there, and a packed conversion would also convert it and
        # could raise spurious floating-point flags.
        cvtss2sd %xmm0,%xmm2                    #xmm2 = (double)x
        movdqa  %xmm2,%xmm0                     #xmm0 = (double)x
        mulsd   %xmm0,%xmm2                     #xmm2 = x*x
        mulsd   .L__PointFive(%rip),%xmm2       #xmm2 = x*x/2
        addsd   %xmm2,%xmm0                     #xmm0 = x + x*x/2
        cvtss2sd %xmm1,%xmm2                    #xmm2 = (double)q
        addsd   %xmm0,%xmm2                     #xmm2 = x + x*x/2 + q
        cvtsd2ss %xmm2,%xmm0                    #back to float
        ret
.p2align 4
.L__Min_Arg:
        # Branch taken for x < min_expm1_arg: the result is the constant -1.0f.
        # No error handler is invoked here (the call is disabled in the source).
        mov     $0xBF800000,%eax                #bit pattern of -1.0f
        movd    %eax,%xmm0
        ret
.p2align 4
.L__Max_Arg:
        # Reached from the entry dispatch when x > max_expm1_arg or when the
        # compare was unordered (NaN input).
        movd    %xmm0,%eax
        and     $0x7fffffff,%eax                #eax = abs(x)
        cmp     $0x7f800000,%eax                #exponent field all ones?
        jb      .L__Overflow                    #no: finite x above the limit
.L__Nan:
        test    $0x007fffff,%eax                #zero mantissa means infinity, not NaN
        jz      .L__Overflow
        addss   %xmm0,%xmm0                     #NaN input: return x + x
        ret
.L__Overflow:
        # Also the target of the overflow check in .L__M_Equals_128.
        # No error handler is invoked here (the call is disabled in the source).
        mov     $0x7f800000,%eax                #bit pattern of +inf
        movd    %eax,%xmm0
        ret
.p2align 4
.L__VeryTiny_Arg:
        # Branch taken for abs(x) < 2^-25. On entry eax = abs(x) bits, as left
        # by .L__Small_Arg, and xmm0 = x.
        #((twopm.f32 * x + xabs.f32) * twopmm.f32);
        movd    %eax,%xmm1                      #xmm1 = abs(x)
        mov     $0x32000000,%eax                #100 in the exponent field
        movd    %eax,%xmm2
        paddd   %xmm2,%xmm0                     #exponent field of x raised by 100
        addss   %xmm1,%xmm0                     #add abs(x) to the scaled value
        psubd   %xmm2,%xmm0                     #exponent field lowered by 100 again
        ret
        # End of the routine: record its extent in the symbol table so the
        # function symbol carries a size as well as a type.
.size fname, .-fname
# Constants used by the routine above. They are only ever loaded, never
# stored to, so they are placed in .rodata instead of writable .data.
.section .rodata
.align 16
.type S_lead_and_trail_table, @object
.size S_lead_and_trail_table, 256
# Table with one 8-byte entry per j = 0..31. Each entry packs two float bit
# patterns: the low dword is S_L (lead part) and the high dword is S_T
# (trail part). S_L + S_T approximates 2^(j/32).
S_lead_and_trail_table:
.quad 0x000000003F800000
.quad 0x355315853F82CD80
.quad 0x34D9F3123F85AAC0
.quad 0x35E8092E3F889800
.quad 0x3471F5463F8B95C0
.quad 0x36E62D173F8EA400
.quad 0x361B9D593F91C3C0
.quad 0x36BEA3FC3F94F4C0
.quad 0x36C146373F9837C0
.quad 0x36E6E7553F9B8D00
.quad 0x36C982473F9EF500
.quad 0x34C0C3123FA27040
.quad 0x36354D8B3FA5FEC0
.quad 0x3655A7543FA9A140
.quad 0x36FBA90B3FAD5800
.quad 0x36D6074B3FB123C0
.quad 0x36CCCFE73FB504C0
.quad 0x36BD1D8C3FB8FB80
.quad 0x368E7D603FBD0880
.quad 0x35CCA6673FC12C40
.quad 0x36A845543FC56700
.quad 0x36F619B93FC9B980
.quad 0x35C151F83FCE2480
.quad 0x366C8F893FD2A800
.quad 0x36F32B5A3FD744C0
.quad 0x36DE5F6C3FDBFB80
.quad 0x367761553FE0CCC0
.quad 0x355CEF903FE5B900
.quad 0x355CFBA53FEAC0C0
.quad 0x36E66F733FEFE480
.quad 0x36F454923FF52540
.quad 0x36CB6DC93FFA8380
.align 16
# Four float lanes, lowest first: lead, trail, lead, trail parts of ln2/32.
# The general path multiplies (j, n, n1) by the three lowest lanes.
.L__Ln2By32_LeadTrailLead:
.octa 0x333FBE8E3CB17200333FBE8E3CB17200
# Range limits tested by the entry dispatch (float bit patterns).
.L__max_expm1_arg:
.long 0x42B19999        #about 88.8
.L__log_OnePlus_OneByFour:
.long 0x3E647FBF        #about 0.22314 = log(1 + 1/4)
.L__log_OneMinus_OneByFour:
.long 0xBE934B11        #about -0.28768 = log(1 - 1/4)
.L__min_expm1_arg:
.long 0xC18AA122        #about -17.3287
.L__thirtyTwo_by_ln2:
.long 0x4238AA3B        #about 46.1662 = 32/ln2
.align 16
# Polynomial coefficients for the general path: q = r*r*(B1 + r*B2).
.L__B2_f:
.long 0x3E2AAAEC        #about 1/6
.L__B1_f:
.long 0x3F000044        #about 1/2
.L__One_f:
.long 0x3F800000        #1.0f
# Keep the 8-byte double naturally aligned.
.p2align 3
.L__PointFive:
.quad 0x3FE0000000000000        #0.5 as a double
.align 16
# Polynomial coefficients for the small-argument path:
# q = x*x*x*(A1 + x*(A2 + x*(A3 + x*(A4 + x*A5)))).
.L__A1_f:
.long 0x3E2AAAAA        #about 1/6
.L__A2_f:
.long 0x3D2AAAA0        #about 1/24
.L__A3_f:
.long 0x3C0889FF        #about 1/120
.L__A4_f:
.long 0x3AB64DE5        #about 1/720
.L__A5_f:
.long 0x394AB327        #about 1.93e-4