blob: c1ac20a407a2a743953c7376793f0e19e830b882 [file] [log] [blame]
#
# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
#
# This file is part of libacml_mv.
#
# libacml_mv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# libacml_mv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with libacml_mv. If not, see
# <http://www.gnu.org/licenses/>.
#
#
# round.S
#
# An implementation of the round libm function.
#
# Prototype:
#
# double round(double x);
#
#
# Algorithm: First get the exponent of the input
# double precision number.
# IF exponent is greater than 51 then return the
# input as is.
# IF exponent is less than 0 then force an overflow
# by adding a huge number and subtracting with the
# same number.
# IF exponent is greater than 0 then add 0.5 and
# and shift the mantissa bits based on the exponent
# value to discard the fractional component.
#
#
#include "fn_macros.h"
#define fname FN_PROTOTYPE(round)
#define fname_special _round_special
# local variable storage offsets
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif
.text
.align 16
.p2align 4,,15
.globl fname
.type fname,@function
#in sse5 there is a roundss,roundsd instruction
fname:
movsd .L__2p52_plus_one(%rip),%xmm4
movsd .L__sign_mask_64(%rip),%xmm5
mov $52,%r10
#take 3 copies of the input xmm0
movsd %xmm0,%xmm1
movsd %xmm0,%xmm2
movsd %xmm0,%xmm3
#get the Most signifacnt half word of the input number in r9
pand .L__exp_mask_64(%rip), %xmm1
pextrw $3,%xmm1,%r9
cmp $0X7FF0,%r9
#Check for infinity inputs
jz .L__is_infinity
movsd .L__sign_mask_64(%rip), %xmm1
pandn %xmm2,%xmm1 # xmm1 now stores the sign of the input number
#On shifting r9 and subtracting with 0x3FF
#r9 stores the exponent.
shr $0X4,%r9
sub $0x3FF,%r9
cmp $0x00, %r9
jl .L__number_less_than_zero
#IF exponent is greater than 0
.L__number_greater_than_zero:
cmp $51,%r9
jg .L__is_greater_than_2p52
#IF exponent is greater than 0 and less than 2^52
pand .L__sign_mask_64(%rip),%xmm0
#add with 0.5
addsd .L__zero_point_5(%rip),%xmm0
movsd %xmm0,%xmm5
pand .L__exp_mask_64(%rip),%xmm5
pand .L__mantissa_mask_64(%rip),%xmm0
#r10 = r9(input exponent) - r10(52=mantissa length)
sub %r9,%r10
movd %r10, %xmm2
#do right and left shift by (input exp - mantissa length)
psrlq %xmm2,%xmm0
psllq %xmm2,%xmm0
#OR the input exponent with the input sign
por %xmm1,%xmm5
#finally OR with the matissa
por %xmm5,%xmm0
ret
#IF exponent is less than 0
.L__number_less_than_zero:
pand %xmm5,%xmm3 # xmm3 =abs(input)
addsd %xmm4,%xmm3# add (2^52 + 1)
subsd %xmm4,%xmm3# sub (2^52 + 1)
por %xmm1, %xmm3 # OR with the sign of the input number
movsd %xmm3,%xmm0
ret
#IF the input is infinity
.L__is_infinity:
comisd %xmm4,%xmm0
jnp .L__is_zero #parity flag is raised
#IF one of theinputs is a Nan
.L__is_nan :
por .L__qnan_mask_64(%rip),%xmm0 # set the QNan Bit
.L__is_zero :
.L__is_greater_than_2p52:
ret
.align 16
.L__sign_mask_64: .quad 0x7FFFFFFFFFFFFFFF
.quad 0
.L__qnan_mask_64: .quad 0x0008000000000000
.quad 0
.L__exp_mask_64: .quad 0x7FF0000000000000
.quad 0
.L__mantissa_mask_64: .quad 0x000FFFFFFFFFFFFF
.quad 0
.L__zero: .quad 0x0000000000000000
.quad 0
.L__2p52_plus_one: .quad 0x4330000000000001 # = 4503599627370497.0
.quad 0
.L__zero_point_5: .quad 0x3FE0000000000001 # = 00.5
.quad 0