| |
| # |
| # (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| # |
| # This file is part of libacml_mv. |
| # |
| # libacml_mv is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU Lesser General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # libacml_mv is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # Lesser General Public License for more details. |
| # |
| # You should have received a copy of the GNU Lesser General Public |
| # License along with libacml_mv. If not, see |
| # <http://www.gnu.org/licenses/>. |
| # |
| # |
| |
| |
| # round.S |
| # |
| # An implementation of the round libm function. |
| # |
| # Prototype: |
| # |
| # double round(double x); |
| # |
| |
| # |
| # Algorithm: First get the exponent of the input |
| # double precision number. |
| # IF exponent is greater than 51 then return the |
| # input as is. |
| # IF exponent is less than 0 then force an overflow |
| # by adding a huge number and subtracting with the |
| # same number. |
| # IF exponent is greater than 0 then add 0.5 and |
| # and shift the mantissa bits based on the exponent |
| # value to discard the fractional component. |
| # |
| # |
| |
| #include "fn_macros.h" |
| #define fname FN_PROTOTYPE(round) |
| #define fname_special _round_special |
| |
| |
| # local variable storage offsets |
| |
| |
| #ifdef __ELF__ |
| .section .note.GNU-stack,"",@progbits |
| #endif |
| |
| .text |
| .align 16 |
| .p2align 4,,15 |
| .globl fname |
| .type fname,@function |
| #in sse5 there is a roundss,roundsd instruction |
| fname: |
| movsd .L__2p52_plus_one(%rip),%xmm4 |
| movsd .L__sign_mask_64(%rip),%xmm5 |
| mov $52,%r10 |
| #take 3 copies of the input xmm0 |
| movsd %xmm0,%xmm1 |
| movsd %xmm0,%xmm2 |
| movsd %xmm0,%xmm3 |
| #get the Most signifacnt half word of the input number in r9 |
| pand .L__exp_mask_64(%rip), %xmm1 |
| pextrw $3,%xmm1,%r9 |
| cmp $0X7FF0,%r9 |
| #Check for infinity inputs |
| jz .L__is_infinity |
| movsd .L__sign_mask_64(%rip), %xmm1 |
| pandn %xmm2,%xmm1 # xmm1 now stores the sign of the input number |
| #On shifting r9 and subtracting with 0x3FF |
| #r9 stores the exponent. |
| shr $0X4,%r9 |
| sub $0x3FF,%r9 |
| cmp $0x00, %r9 |
| jl .L__number_less_than_zero |
| |
| #IF exponent is greater than 0 |
| .L__number_greater_than_zero: |
| cmp $51,%r9 |
| jg .L__is_greater_than_2p52 |
| |
| #IF exponent is greater than 0 and less than 2^52 |
| pand .L__sign_mask_64(%rip),%xmm0 |
| #add with 0.5 |
| addsd .L__zero_point_5(%rip),%xmm0 |
| movsd %xmm0,%xmm5 |
| |
| pand .L__exp_mask_64(%rip),%xmm5 |
| pand .L__mantissa_mask_64(%rip),%xmm0 |
| #r10 = r9(input exponent) - r10(52=mantissa length) |
| sub %r9,%r10 |
| movd %r10, %xmm2 |
| #do right and left shift by (input exp - mantissa length) |
| psrlq %xmm2,%xmm0 |
| psllq %xmm2,%xmm0 |
| #OR the input exponent with the input sign |
| por %xmm1,%xmm5 |
| #finally OR with the matissa |
| por %xmm5,%xmm0 |
| ret |
| |
| #IF exponent is less than 0 |
| .L__number_less_than_zero: |
| pand %xmm5,%xmm3 # xmm3 =abs(input) |
| addsd %xmm4,%xmm3# add (2^52 + 1) |
| subsd %xmm4,%xmm3# sub (2^52 + 1) |
| por %xmm1, %xmm3 # OR with the sign of the input number |
| movsd %xmm3,%xmm0 |
| ret |
| |
| #IF the input is infinity |
| .L__is_infinity: |
| comisd %xmm4,%xmm0 |
| jnp .L__is_zero #parity flag is raised |
| #IF one of theinputs is a Nan |
| .L__is_nan : |
| por .L__qnan_mask_64(%rip),%xmm0 # set the QNan Bit |
| .L__is_zero : |
| .L__is_greater_than_2p52: |
| ret |
| |
| .align 16 |
| .L__sign_mask_64: .quad 0x7FFFFFFFFFFFFFFF |
| .quad 0 |
| |
| .L__qnan_mask_64: .quad 0x0008000000000000 |
| .quad 0 |
| .L__exp_mask_64: .quad 0x7FF0000000000000 |
| .quad 0 |
| .L__mantissa_mask_64: .quad 0x000FFFFFFFFFFFFF |
| .quad 0 |
| .L__zero: .quad 0x0000000000000000 |
| .quad 0 |
| .L__2p52_plus_one: .quad 0x4330000000000001 # = 4503599627370497.0 |
| .quad 0 |
| .L__zero_point_5: .quad 0x3FE0000000000001 # = 00.5 |
| .quad 0 |
| |
| |
| |