blob: b0e23aa363cfae26549acb828c9db7b045047978 [file] [log] [blame]
#
# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
#
# This file is part of libacml_mv.
#
# libacml_mv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# libacml_mv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with libacml_mv. If not, see
# <http://www.gnu.org/licenses/>.
#
#
#
# __vrs4_expf.s
#
# A vector implementation of the expf libm function.
# This routine implemented in single precision. It is slightly
# less accurate than the double precision version, but it will
# be better for vectorizing.
#
# Prototype:
#
# __m128 __vrs4_expf(__m128 x);
#
# Computes e raised to the x power for 4 floats at a time.
# Does not perform error handling, but does return C99 values for error
# inputs. Denormal results are truncated to 0.
#
#
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif
.text
.align 16
.p2align 4,,15
# define local variable storage offsets
.equ p_temp,0 # temporary for get/put bits operation
.equ p_ux,0x10 # local storage for ux array
.equ p_m,0x20 # local storage for m array
.equ p_j,0x30 # local storage for m array
.equ save_rbx,0x040 #qword
.equ stack_size,0x48
.globl __vrs4_expf
.type __vrs4_expf,@function
__vrs4_expf:
sub $stack_size,%rsp
mov %rbx,save_rbx(%rsp)
movaps %xmm0,p_ux(%rsp)
maxps .L__real_m8192(%rip),%xmm0 # protect against small input values
# /* Find m, z1 and z2 such that exp(x) = 2**m * (z1 + z2) */
# Step 1. Reduce the argument.
# r = x * thirtytwo_by_logbaseof2;
movaps .L__real_thirtytwo_by_log2(%rip),%xmm2 #
mulps %xmm0,%xmm2
xor %rax,%rax
minps .L__real_8192(%rip),%xmm2 # protect against large input values
# /* Set n = nearest integer to r */
cvtps2dq %xmm2,%xmm3
lea .L__two_to_jby32_table(%rip),%rdi
cvtdq2ps %xmm3,%xmm1
# r1 = x - n * logbaseof2_by_32_lead;
movaps .L__real_log2_by_32_head(%rip),%xmm2
mulps %xmm1,%xmm2
subps %xmm2,%xmm0 # r1 in xmm0,
# r2 = - n * logbaseof2_by_32_lead;
mulps .L__real_log2_by_32_tail(%rip),%xmm1
# j = n & 0x0000001f;
movdqa %xmm3,%xmm4
movdqa .L__int_mask_1f(%rip),%xmm2
pand %xmm4,%xmm2
movdqa %xmm2,p_j(%rsp)
# f1 = two_to_jby32_lead_table[j];
# *m = (n - j) / 32;
psubd %xmm2,%xmm4
psrad $5,%xmm4
movdqa %xmm4,p_m(%rsp)
movaps %xmm0,%xmm3
addps %xmm1,%xmm3
mov p_j(%rsp),%eax # get an individual index
mov (%rdi,%rax,4),%edx # get the f1 value
mov %edx,p_j(%rsp) # save the f1 value
# Step 2. Compute the polynomial.
# q = r1 +
# r*r*( 5.00000000000000008883e-01 +
# r*( 1.66666666665260878863e-01 +
# r*( 4.16666666662260795726e-02 +
# r*( 8.33336798434219616221e-03 +
# r*( 1.38889490863777199667e-03 )))));
# q = r + r^2/2 + r^3/6 + r^4/24 + r^5/120 + r^6/720
# q = r + r^2/2 + r^3/6 + r^4/24 good enough for single precision
movaps %xmm3,%xmm4
movaps %xmm3,%xmm2 # x*x
mulps %xmm2,%xmm2
mulps .L__real_1_24(%rip),%xmm4 # /24
mov p_j+4(%rsp),%eax # get an individual index
mov (%rdi,%rax,4),%edx # get the f1 value
mov %edx,p_j+4(%rsp) # save the f1 value
addps .L__real_1_6(%rip),%xmm4 # +1/6
mulps %xmm2,%xmm3 # x^3
mov p_j+8(%rsp),%eax # get an individual index
mov (%rdi,%rax,4),%edx # get the f1 value
mov %edx,p_j+8(%rsp) # save the f1 value
mulps .L__real_half(%rip),%xmm2 # x^2/2
mulps %xmm3,%xmm4 # *x^3
mov p_j+12(%rsp),%eax # get an individual index
mov (%rdi,%rax,4),%edx # get the f1 value
mov %edx,p_j+12(%rsp) # save the f1 value
addps %xmm4,%xmm1 # +r2
addps %xmm2,%xmm1 # + x^2/2
addps %xmm1,%xmm0 # +r1
# deal with infinite or denormal results
movdqa p_m(%rsp),%xmm1
movdqa p_m(%rsp),%xmm2
pcmpgtd .L__int_127(%rip),%xmm2
pminsw .L__int_128(%rip),%xmm1 # ceil at 128
movmskps %xmm2,%eax
test $0x0f,%eax
paddd .L__int_127(%rip),%xmm1 # add bias
# *z2 = f2 + ((f1 + f2) * q);
mulps p_j(%rsp),%xmm0 # * f1
addps p_j(%rsp),%xmm0 # + f1
jnz .L__exp_largef
.L__check1:
pxor %xmm2,%xmm2 # floor at 0
pmaxsw %xmm2,%xmm1
pslld $23,%xmm1 # build 2^n
movaps %xmm1,%xmm2
# check for infinity or nan
movaps p_ux(%rsp),%xmm1
andps .L__real_infinity(%rip),%xmm1
cmpps $0,.L__real_infinity(%rip),%xmm1
movmskps %xmm1,%eax
test $0xf,%eax
# end of splitexp
# /* Scale (z1 + z2) by 2.0**m */
# Step 3. Reconstitute.
mulps %xmm2,%xmm0 # result *= 2^n
# we'd like to avoid a branch, and can use cmp's and and's to
# eliminate them. But it adds cycles for normal cases
# to handle events that are supposed to be exceptions.
# Using this branch with the
# check above results in faster code for the normal cases.
# And branch mispredict penalties should only come into
# play for nans and infinities.
jnz .L__exp_naninf
#
.L__final_check:
mov save_rbx(%rsp),%rbx # restore rbx
add $stack_size,%rsp
ret
# deal with nans and infinities
.L__exp_naninf:
movaps %xmm0,p_temp(%rsp) # save the computed values
mov %eax,%ecx # save the error mask
test $1,%ecx # first value?
jz .__Lni2
mov p_ux(%rsp),%edx # get the input
call .L__naninf
mov %edx,p_temp(%rsp) # save the new result
.__Lni2:
test $2,%ecx # first value?
jz .__Lni3
mov p_ux+4(%rsp),%edx # get the input
call .L__naninf
mov %edx,p_temp+4(%rsp) # save the new result
.__Lni3:
test $4,%ecx # first value?
jz .__Lni4
mov p_ux+8(%rsp),%edx # get the input
call .L__naninf
mov %edx,p_temp+8(%rsp) # save the new result
.__Lni4:
test $8,%ecx # first value?
jz .__Lnie
mov p_ux+12(%rsp),%edx # get the input
call .L__naninf
mov %edx,p_temp+12(%rsp) # save the new result
.__Lnie:
movaps p_temp(%rsp),%xmm0 # get the answers
jmp .L__final_check
# a simple subroutine to check a scalar input value for infinity
# or NaN and return the correct result
# expects input in edx, and returns value in edx. Destroys eax.
.L__naninf:
mov $0x0007FFFFF,%eax
test %eax,%edx
jnz .L__enan # jump if mantissa not zero, so it's a NaN
# inf
mov %edx,%eax
rcl $1,%eax
jnc .L__r # exp(+inf) = inf
xor %edx,%edx # exp(-inf) = 0
jmp .L__r
#NaN
.L__enan:
mov $0x000400000,%eax # convert to quiet
or %eax,%edx
.L__r:
ret
.align 16
# deal with m > 127. In some instances, rounding during calculations
# can result in infinity when it shouldn't. For these cases, we scale
# m down, and scale the mantissa up.
.L__exp_largef:
movdqa %xmm0,p_temp(%rsp) # save the mantissa portion
movdqa %xmm1,p_m(%rsp) # save the exponent portion
mov %eax,%ecx # save the error mask
test $1,%ecx # first value?
jz .L__Lf2
mov p_m(%rsp),%edx # get the exponent
sub $1,%edx # scale it down
mov %edx,p_m(%rsp) # save the exponent
movss p_temp(%rsp),%xmm3 # get the mantissa
mulss .L__real_two(%rip),%xmm3 # scale it up
movss %xmm3,p_temp(%rsp) # save the mantissa
.L__Lf2:
test $2,%ecx # second value?
jz .L__Lf3
mov p_m+4(%rsp),%edx # get the exponent
sub $1,%edx # scale it down
mov %edx,p_m+4(%rsp) # save the exponent
movss p_temp+4(%rsp),%xmm3 # get the mantissa
mulss .L__real_two(%rip),%xmm3 # scale it up
movss %xmm3,p_temp+4(%rsp) # save the mantissa
.L__Lf3:
test $4,%ecx # third value?
jz .L__Lf4
mov p_m+8(%rsp),%edx # get the exponent
sub $1,%edx # scale it down
mov %edx,p_m+8(%rsp) # save the exponent
movss p_temp+8(%rsp),%xmm3 # get the mantissa
mulss .L__real_two(%rip),%xmm3 # scale it up
movss %xmm3,p_temp+8(%rsp) # save the mantissa
.L__Lf4:
test $8,%ecx # fourth value?
jz .L__Lfe
mov p_m+12(%rsp),%edx # get the exponent
sub $1,%edx # scale it down
mov %edx,p_m+12(%rsp) # save the exponent
movss p_temp+12(%rsp),%xmm3 # get the mantissa
mulss .L__real_two(%rip),%xmm3 # scale it up
movss %xmm3,p_temp+12(%rsp) # save the mantissa
.L__Lfe:
movaps p_temp(%rsp),%xmm0 # restore the mantissa portion back
movdqa p_m(%rsp),%xmm1 # restore the exponent portion
jmp .L__check1
.data
.align 64
.L__real_half: .long 0x3f000000 # 1/2
.long 0x3f000000
.long 0x3f000000
.long 0x3f000000
.L__real_two: .long 0x40000000 # 2
.long 0x40000000
.long 0x40000000
.long 0x40000000
.L__real_8192: .long 0x46000000 # 8192, to protect against really large numbers
.long 0x46000000
.long 0x46000000
.long 0x46000000
.L__real_m8192: .long 0xC6000000 # -8192, to protect against really small numbers
.long 0xC6000000
.long 0xC6000000
.long 0xC6000000
.L__real_thirtytwo_by_log2: .long 0x4238AA3B # thirtytwo_by_log2
.long 0x4238AA3B
.long 0x4238AA3B
.long 0x4238AA3B
.L__real_log2_by_32: .long 0x3CB17218 # log2_by_32
.long 0x3CB17218
.long 0x3CB17218
.long 0x3CB17218
.L__real_log2_by_32_head: .long 0x3CB17000 # log2_by_32
.long 0x3CB17000
.long 0x3CB17000
.long 0x3CB17000
.L__real_log2_by_32_tail: .long 0xB585FDF4 # log2_by_32
.long 0xB585FDF4
.long 0xB585FDF4
.long 0xB585FDF4
.L__real_1_6: .long 0x3E2AAAAB # 0.16666666666 used in polynomial
.long 0x3E2AAAAB
.long 0x3E2AAAAB
.long 0x3E2AAAAB
.L__real_1_24: .long 0x3D2AAAAB # 0.041666668 used in polynomial
.long 0x3D2AAAAB
.long 0x3D2AAAAB
.long 0x3D2AAAAB
.L__real_infinity: .long 0x7f800000 # infinity
.long 0x7f800000
.long 0x7f800000
.long 0x7f800000
.L__int_mask_1f: .long 0x00000001f
.long 0x00000001f
.long 0x00000001f
.long 0x00000001f
.L__int_128: .long 0x000000080
.long 0x000000080
.long 0x000000080
.long 0x000000080
.L__int_127: .long 0x00000007f
.long 0x00000007f
.long 0x00000007f
.long 0x00000007f
.L__two_to_jby32_table:
.long 0x3F800000 # 1
.long 0x3F82CD87 # 1.0218972
.long 0x3F85AAC3 # 1.0442737
.long 0x3F88980F # 1.0671405
.long 0x3F8B95C2 # 1.0905077
.long 0x3F8EA43A # 1.1143868
.long 0x3F91C3D3 # 1.1387886
.long 0x3F94F4F0 # 1.1637249
.long 0x3F9837F0 # 1.1892071
.long 0x3F9B8D3A # 1.2152474
.long 0x3F9EF532 # 1.2418578
.long 0x3FA27043 # 1.269051
.long 0x3FA5FED7 # 1.2968396
.long 0x3FA9A15B # 1.3252367
.long 0x3FAD583F # 1.3542556
.long 0x3FB123F6 # 1.3839099
.long 0x3FB504F3 # 1.4142135
.long 0x3FB8FBAF # 1.4451808
.long 0x3FBD08A4 # 1.4768262
.long 0x3FC12C4D # 1.5091645
.long 0x3FC5672A # 1.5422108
.long 0x3FC9B9BE # 1.5759809
.long 0x3FCE248C # 1.6104903
.long 0x3FD2A81E # 1.6457555
.long 0x3FD744FD # 1.6817929
.long 0x3FDBFBB8 # 1.7186193
.long 0x3FE0CCDF # 1.7562522
.long 0x3FE5B907 # 1.7947091
.long 0x3FEAC0C7 # 1.8340081
.long 0x3FEFE4BA # 1.8741677
.long 0x3FF5257D # 1.9152066
.long 0x3FFA83B3 # 1.9571441
.long 0 # for alignment