src/gas/remainderf.S - open64_libacml_mv - Git at Google


 #
 #  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
 #
 #  This file is part of libacml_mv.
 #
 #  libacml_mv is free software; you can redistribute it and/or
 #  modify it under the terms of the GNU Lesser General Public
 #  License as published by the Free Software Foundation; either
 #  version 2.1 of the License, or (at your option) any later version.
 #
 #  libacml_mv is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 #  Lesser General Public License for more details.
 #
 #  You should have received a copy of the GNU Lesser General Public
 #  License along with libacml_mv.  If not, see
 #  <http://www.gnu.org/licenses/>.
 #
 #


 # remainderf.S
 #
 # An implementation of the remainderf libm function.
 #
 # Prototype:
 #
 #     float remainderf(float x,float y);
 #

 #
 #   Algorithm:
 #

 #include "fn_macros.h"
 #define fname FN_PROTOTYPE(remainderf)
 #define fname_special _remainderf_special


 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif

 .text
 .align 16
 .p2align 4,,15
 .globl fname
 .type fname,@function
 #
 # Entry point: float remainderf(float x, float y)
 #   In:   xmm0 = x, xmm1 = y
 #   Out:  xmm0 = result
 #   Writes rax, rcx, rdx, rdi, r8, r9, r11, xmm1-xmm7 and the flags.
 #
 # Outline:
 #   Both operands are widened to double and stripped of their sign,
 #   giving dx = |x| and dy = |y|.
 #   Special cases: x Inf/NaN or y zero gives a NaN built from the bits of x,
 #   y NaN gives a NaN, y Inf returns x, |x| == |y| returns a zero with the
 #   sign of x, and |x| < |y| returns dx (minus dy when dx > 0.5*dy) with the
 #   sign of x.
 #   General case: w starts at dy * 2^(24*ntimes), where ntimes is the
 #   exponent difference divided by 24. Each loop pass removes
 #   trunc(dx/w)*w from dx and multiplies w by 2^-24, so the last step runs
 #   with w = dy. The result is then adjusted to the nearest multiple, with
 #   ties resolved toward the even quotient, and the sign of x is applied.
 #
 fname:
     mov .L__exp_mask_64(%rip), %rdi
     movapd .L__sign_mask_64(%rip),%xmm6
     cvtss2sd %xmm0,%xmm2 # xmm2 = (double)x
     cvtss2sd %xmm1,%xmm3 # xmm3 = (double)y
     pand %xmm6,%xmm2     # xmm2 = dx = |x|
     pand %xmm6,%xmm3     # xmm3 = dy = |y|
     movd %xmm2,%rax
     movd %xmm3,%r8
     mov %rax,%r11        # r11 = bit pattern of dx
     mov %r8,%r9          # r9  = bit pattern of dy
     movsd %xmm2,%xmm4    # xmm4 = working copy of dx
     # Isolate the biased exponent fields of dx and dy.
     and %rdi,%rax
     and %rdi,%r8
     ror $52, %rax        # rax = exponent field of dx
     ror $52, %r8         # r8  = exponent field of dy
     # An all-ones exponent field means the operand is Inf or NaN.
     cmp $0X7FF,%rax
     jz  .L__InputIsNaN
     cmp $0X7FF,%r8
     jz  .L__InputIsNaNOrInf

     # A float widened to double is never subnormal, so a zero exponent
     # field here means y is zero.
     cmp $0,%r8
     jz  .L__Divisor_Is_Zero

     # Non-negative doubles order the same way as their bit patterns.
     cmp %r9, %r11
     jz  .L__Input_Is_Equal   # dx == dy
     jb  .L__ReturnImmediate  # dx <  dy

     xor %rcx,%rcx        # cl = ntimes = 0
     mov $24,%rdx         # dl = 24
     movsd .L__One_64(%rip),%xmm7 # xmm7 = scale = 1.0
     cmp %rax,%r8
     jae .L__y_is_greater # exponent(dy) >= exponent(dx): keep scale = 1.0
     sub %r8,%rax         # rax = exponent(dx) - exponent(dy)
     div %dl       # al = ntimes = difference / 24, ah = remainder
     mov %al,%cl   # cl = ntimes
     and $0xFF,%ax # discard the remainder in ah, keep al
     mul %dl       # ax = 24 * ntimes
     add $1023, %rax
     shl $52,%rax  # rax = bit pattern of the double 2^(24*ntimes)
     movd %rax,%xmm7 # xmm7 = scale
 .L__y_is_greater:
     mulsd %xmm3,%xmm7 # xmm7 = w = scale * dy
     movsd .L__2pminus24_decimal(%rip),%xmm6 # xmm6 = 2^-24

 .align 16
 .L__Start_Loop:
     dec %cl
     js .L__End_Loop       # leave once ntimes passes have run
     divsd %xmm7,%xmm4     # xmm4 = dx / w
     cvttsd2siq %xmm4,%rax
     cvtsi2sdq %rax,%xmm4  # xmm4 = t = (double)((int)(dx / w))
     mulsd  %xmm7,%xmm4    # xmm4 = w*t
     mulsd %xmm6,%xmm7     # w *= 2^-24
     subsd  %xmm4,%xmm2    # xmm2 = dx -= w*t
     movsd %xmm2,%xmm4     # xmm4 = dx
     jmp .L__Start_Loop
 .L__End_Loop:
     divsd %xmm7,%xmm4     # xmm4 = dx / w
     cvttsd2siq %xmm4,%rax
     cvtsi2sdq %rax,%xmm4  # xmm4 = t = (double)((int)(dx / w))
     and $0x01,%rax        # rax = todd = ((int)(dx / w)) & 1
     mulsd  %xmm7,%xmm4    # xmm4 = w*t
     subsd  %xmm4,%xmm2    # xmm2 = dx -= w*t
     movsd  %xmm7,%xmm6    # xmm6 = w
     mulsd .L__Zero_Point_Five64(%rip),%xmm7 # xmm7 = 0.5*w

     cmp $0x01,%rax
     jnz .L__todd_is_even
     comisd %xmm2,%xmm7
     je .L__Subtract_w     # odd quotient and dx == 0.5*w: tie goes to even

 .L__todd_is_even:
     comisd %xmm2,%xmm7
     jnb .L__Dont_Subtract_w # 0.5*w >= dx: dx is already the nearest

 .L__Subtract_w:
     subsd %xmm6,%xmm2     # dx -= w

 .L__Dont_Subtract_w:
     comiss .L__Zero_64(%rip),%xmm0
     jb .L__Negative       # x < 0
     cvtsd2ss %xmm2,%xmm0  # x >= 0: result = (float)dx
     ret
 .L__Negative:
     movsd .L__MinusZero_64(%rip),%xmm0
     subsd %xmm2,%xmm0     # xmm0 = -0.0 - dx, so a zero result stays negative
     cvtsd2ss %xmm0,%xmm0
     ret

 .align 16
 .L__Input_Is_Equal:
     # Inf and NaN operands were dispatched before the branch that leads
     # here, so both operands are finite and the result is a signed zero.
     movsd %xmm0,%xmm1
     pand .L__sign_bit_32(%rip),%xmm1 # keep only the sign bit of x
     movss .L__Zero_64(%rip),%xmm0
     por  %xmm1,%xmm0                 # zero carrying the sign of x
     ret

 .L__InputIsNaNOrInf:
     # x is finite here. ucomiss is used instead of comiss because comiss
     # raises the invalid-operation flag for a quiet NaN as well.
     ucomiss %xmm0,%xmm1
     jp .L__InputIsNaN     # unordered: y is NaN
     ret                   # y is Inf: x is the result
 .L__Divisor_Is_Zero:
 .L__InputIsNaN:
     por .L__exp_mask_32(%rip),%xmm0  # force an all-ones float exponent
     por .L__QNaN_mask_32(%rip),%xmm0 # set the quiet bit
     ret

 # Case when dx < dy
     #xmm2 = dx
 .L__ReturnImmediate:
     movsd %xmm3,%xmm5     # xmm5 = dy
     mulsd .L__Zero_Point_Five64(%rip), %xmm3 # xmm3 = 0.5*dy
     comisd %xmm3,%xmm2    # compare dx with 0.5*dy
     jna .L__Finish_Immediate # dx <= 0.5*dy
     subsd %xmm5,%xmm2     # dx -= dy

 .L__Finish_Immediate:
     comiss .L__Zero_64(%rip),%xmm0
     jz .L__Zero           # x is zero: xmm0 already holds the result
     ja .L__Positive

     movsd .L__Zero_64(%rip),%xmm0
     subsd %xmm2,%xmm0     # x < 0: xmm0 = 0.0 - dx
     cvtsd2ss %xmm0,%xmm0
     ret

 .L__Zero:
     ret

 .L__Positive:
     cvtsd2ss %xmm2,%xmm0
     ret


 .align 32
 # Constant table. Every entry occupies one 16-byte slot: the value sits in
 # the low quadword and the high quadword is zero.
 .L__sign_bit_32:           .quad 0x8000000080000000, 0  # bits 31 and 63: float sign bit positions
 .L__exp_mask_64:           .quad 0x7FF0000000000000, 0  # exponent field of a double
 .L__exp_mask_32:           .quad 0x000000007F800000, 0  # exponent field of a float
 .L__27bit_andingmask_64:   .quad 0xfffffffff8000000, 0  # clears the low 27 bits; not referenced in the code shown here
 .L__2p52_mask_64:          .quad 0x4330000000000000, 0  # double 2^52; not referenced in the code shown here
 .L__One_64:                .quad 0x3FF0000000000000, 0  # double 1.0
 .L__Zero_64:               .quad 0x0000000000000000, 0  # +0.0
 .L__MinusZero_64:          .quad 0x8000000000000000, 0  # double -0.0
 .L__QNaN_mask_32:          .quad 0x0000000000400000, 0  # bit 22: float quiet-NaN bit
 .L__sign_mask_64:          .quad 0x7FFFFFFFFFFFFFFF, 0  # ANDing with this clears the double sign bit
 .L__2pminus24_decimal:     .quad 0x3E70000000000000, 0  # double 2^-24
 .L__Zero_Point_Five64:     .quad 0x3FE0000000000000, 0  # double 0.5

	#
	# (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
	#
	# This file is part of libacml_mv.
	#
	# libacml_mv is free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.
	#
	# libacml_mv is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public
	# License along with libacml_mv. If not, see
	# <http://www.gnu.org/licenses/>.
	#
	#


	# remainderf.S
	#
	# An implementation of the remainderf libm function.
	#
	# Prototype:
	#
	# float remainderf(float x,float y);
	#

	#
	# Algorithm:
	#

	#include "fn_macros.h"
	#define fname FN_PROTOTYPE(remainderf)
	#define fname_special _remainderf_special


	#ifdef __ELF__
	.section .note.GNU-stack,"",@progbits
	#endif

	.text
	.align 16
	.p2align 4,,15
	.globl fname
	.type fname,@function
	#
	# Entry point: float remainderf(float x, float y)
	#   In:   xmm0 = x, xmm1 = y
	#   Out:  xmm0 = result
	#   Writes rax, rcx, rdx, rdi, r8, r9, r11, xmm1-xmm7 and the flags.
	#
	# Outline:
	#   Both operands are widened to double and stripped of their sign,
	#   giving dx = |x| and dy = |y|.
	#   Special cases: x Inf/NaN or y zero gives a NaN built from the bits of x,
	#   y NaN gives a NaN, y Inf returns x, |x| == |y| returns a zero with the
	#   sign of x, and |x| < |y| returns dx (minus dy when dx > 0.5*dy) with the
	#   sign of x.
	#   General case: w starts at dy * 2^(24*ntimes), where ntimes is the
	#   exponent difference divided by 24. Each loop pass removes
	#   trunc(dx/w)*w from dx and multiplies w by 2^-24, so the last step runs
	#   with w = dy. The result is then adjusted to the nearest multiple, with
	#   ties resolved toward the even quotient, and the sign of x is applied.
	#
	fname:
	mov .L__exp_mask_64(%rip), %rdi
	movapd .L__sign_mask_64(%rip),%xmm6
	cvtss2sd %xmm0,%xmm2 # xmm2 = (double)x
	cvtss2sd %xmm1,%xmm3 # xmm3 = (double)y
	pand %xmm6,%xmm2 # xmm2 = dx = |x|
	pand %xmm6,%xmm3 # xmm3 = dy = |y|
	movd %xmm2,%rax
	movd %xmm3,%r8
	mov %rax,%r11 # r11 = bit pattern of dx
	mov %r8,%r9 # r9 = bit pattern of dy
	movsd %xmm2,%xmm4 # xmm4 = working copy of dx
	# Isolate the biased exponent fields of dx and dy.
	and %rdi,%rax
	and %rdi,%r8
	ror $52, %rax # rax = exponent field of dx
	ror $52, %r8 # r8 = exponent field of dy
	# An all-ones exponent field means the operand is Inf or NaN.
	cmp $0X7FF,%rax
	jz .L__InputIsNaN
	cmp $0X7FF,%r8
	jz .L__InputIsNaNOrInf

	# A float widened to double is never subnormal, so a zero exponent
	# field here means y is zero.
	cmp $0,%r8
	jz .L__Divisor_Is_Zero

	# Non-negative doubles order the same way as their bit patterns.
	cmp %r9, %r11
	jz .L__Input_Is_Equal # dx == dy
	jb .L__ReturnImmediate # dx < dy

	xor %rcx,%rcx # cl = ntimes = 0
	mov $24,%rdx # dl = 24
	movsd .L__One_64(%rip),%xmm7 # xmm7 = scale = 1.0
	cmp %rax,%r8
	jae .L__y_is_greater # exponent(dy) >= exponent(dx): keep scale = 1.0
	sub %r8,%rax # rax = exponent(dx) - exponent(dy)
	div %dl # al = ntimes = difference / 24, ah = remainder
	mov %al,%cl # cl = ntimes
	and $0xFF,%ax # discard the remainder in ah, keep al
	mul %dl # ax = 24 * ntimes
	add $1023, %rax
	shl $52,%rax # rax = bit pattern of the double 2^(24*ntimes)
	movd %rax,%xmm7 # xmm7 = scale
	.L__y_is_greater:
	mulsd %xmm3,%xmm7 # xmm7 = w = scale * dy
	movsd .L__2pminus24_decimal(%rip),%xmm6 # xmm6 = 2^-24

	.align 16
	.L__Start_Loop:
	dec %cl
	js .L__End_Loop # leave once ntimes passes have run
	divsd %xmm7,%xmm4 # xmm4 = dx / w
	cvttsd2siq %xmm4,%rax
	cvtsi2sdq %rax,%xmm4 # xmm4 = t = (double)((int)(dx / w))
	mulsd %xmm7,%xmm4 # xmm4 = w*t
	mulsd %xmm6,%xmm7 # w *= 2^-24
	subsd %xmm4,%xmm2 # xmm2 = dx -= w*t
	movsd %xmm2,%xmm4 # xmm4 = dx
	jmp .L__Start_Loop
	.L__End_Loop:
	divsd %xmm7,%xmm4 # xmm4 = dx / w
	cvttsd2siq %xmm4,%rax
	cvtsi2sdq %rax,%xmm4 # xmm4 = t = (double)((int)(dx / w))
	and $0x01,%rax # rax = todd = ((int)(dx / w)) & 1
	mulsd %xmm7,%xmm4 # xmm4 = w*t
	subsd %xmm4,%xmm2 # xmm2 = dx -= w*t
	movsd %xmm7,%xmm6 # xmm6 = w
	mulsd .L__Zero_Point_Five64(%rip),%xmm7 # xmm7 = 0.5*w

	cmp $0x01,%rax
	jnz .L__todd_is_even
	comisd %xmm2,%xmm7
	je .L__Subtract_w # odd quotient and dx == 0.5*w: tie goes to even

	.L__todd_is_even:
	comisd %xmm2,%xmm7
	jnb .L__Dont_Subtract_w # 0.5*w >= dx: dx is already the nearest

	.L__Subtract_w:
	subsd %xmm6,%xmm2 # dx -= w

	.L__Dont_Subtract_w:
	comiss .L__Zero_64(%rip),%xmm0
	jb .L__Negative # x < 0
	cvtsd2ss %xmm2,%xmm0 # x >= 0: result = (float)dx
	ret
	.L__Negative:
	movsd .L__MinusZero_64(%rip),%xmm0
	subsd %xmm2,%xmm0 # xmm0 = -0.0 - dx, so a zero result stays negative
	cvtsd2ss %xmm0,%xmm0
	ret

	.align 16
	.L__Input_Is_Equal:
	# Inf and NaN operands were dispatched before the branch that leads
	# here, so both operands are finite and the result is a signed zero.
	movsd %xmm0,%xmm1
	pand .L__sign_bit_32(%rip),%xmm1 # keep only the sign bit of x
	movss .L__Zero_64(%rip),%xmm0
	por %xmm1,%xmm0 # zero carrying the sign of x
	ret

	.L__InputIsNaNOrInf:
	# x is finite here. ucomiss is used instead of comiss because comiss
	# raises the invalid-operation flag for a quiet NaN as well.
	ucomiss %xmm0,%xmm1
	jp .L__InputIsNaN # unordered: y is NaN
	ret # y is Inf: x is the result
	.L__Divisor_Is_Zero:
	.L__InputIsNaN:
	por .L__exp_mask_32(%rip),%xmm0 # force an all-ones float exponent
	por .L__QNaN_mask_32(%rip),%xmm0 # set the quiet bit
	ret

	# Case when dx < dy
	#xmm2 = dx
	.L__ReturnImmediate:
	movsd %xmm3,%xmm5 # xmm5 = dy
	mulsd .L__Zero_Point_Five64(%rip), %xmm3 # xmm3 = 0.5*dy
	comisd %xmm3,%xmm2 # compare dx with 0.5*dy
	jna .L__Finish_Immediate # dx <= 0.5*dy
	subsd %xmm5,%xmm2 # dx -= dy

	.L__Finish_Immediate:
	comiss .L__Zero_64(%rip),%xmm0
	jz .L__Zero # x is zero: xmm0 already holds the result
	ja .L__Positive

	movsd .L__Zero_64(%rip),%xmm0
	subsd %xmm2,%xmm0 # x < 0: xmm0 = 0.0 - dx
	cvtsd2ss %xmm0,%xmm0
	ret

	.L__Zero:
	ret

	.L__Positive:
	cvtsd2ss %xmm2,%xmm0
	ret



	.align 32
	# Constant table. Every entry occupies one 16-byte slot: the value sits in
	# the low quadword and the high quadword is zero.
	.L__sign_bit_32: .quad 0x8000000080000000, 0 # bits 31 and 63: float sign bit positions
	.L__exp_mask_64: .quad 0x7FF0000000000000, 0 # exponent field of a double
	.L__exp_mask_32: .quad 0x000000007F800000, 0 # exponent field of a float
	.L__27bit_andingmask_64: .quad 0xfffffffff8000000, 0 # clears the low 27 bits; not referenced in the code shown here
	.L__2p52_mask_64: .quad 0x4330000000000000, 0 # double 2^52; not referenced in the code shown here
	.L__One_64: .quad 0x3FF0000000000000, 0 # double 1.0
	.L__Zero_64: .quad 0x0000000000000000, 0 # +0.0
	.L__MinusZero_64: .quad 0x8000000000000000, 0 # double -0.0
	.L__QNaN_mask_32: .quad 0x0000000000400000, 0 # bit 22: float quiet-NaN bit
	.L__sign_mask_64: .quad 0x7FFFFFFFFFFFFFFF, 0 # ANDing with this clears the double sign bit
	.L__2pminus24_decimal: .quad 0x3E70000000000000, 0 # double 2^-24
	.L__Zero_Point_Five64: .quad 0x3FE0000000000000, 0 # double 0.5