mpn/x86_64/div_qr_1n_pi1.asm - gmp - Git at Google

 dnl  x86-64 mpn_div_qr_1n_pi1
 dnl  -- Divide an mpn number by a normalized single-limb number,
 dnl     using a single-limb inverse.

 dnl  Contributed to the GNU project by Niels Möller

 dnl  Copyright 2013 Free Software Foundation, Inc.

 dnl  This file is part of the GNU MP Library.
 dnl
 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
 dnl  it under the terms of either:
 dnl
 dnl    * the GNU Lesser General Public License as published by the Free
 dnl      Software Foundation; either version 3 of the License, or (at your
 dnl      option) any later version.
 dnl
 dnl  or
 dnl
 dnl    * the GNU General Public License as published by the Free Software
 dnl      Foundation; either version 2 of the License, or (at your option) any
 dnl      later version.
 dnl
 dnl  or both in parallel, as here.
 dnl
 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 dnl  for more details.
 dnl
 dnl  You should have received copies of the GNU General Public License and the
 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
 dnl  see https://www.gnu.org/licenses/.

 include(`config.m4')


 C		c/l
 C AMD K8,K9	13
 C AMD K10	13
 C AMD bull	16.5
 C AMD pile	15
 C AMD steam	 ?
 C AMD bobcat	16
 C AMD jaguar	 ?
 C Intel P4	47	poor
 C Intel core	19.25
 C Intel NHM	18
 C Intel SBR	15	poor
 C Intel IBR	13
 C Intel HWL	11.7
 C Intel BWL	 ?
 C Intel atom	52	very poor
 C VIA nano	19


 C INPUT Parameters
 define(`QP', `%rdi')
 define(`UP', `%rsi')
 define(`UN_INPUT', `%rdx')
 define(`U1', `%rcx')	C Also in %rax
 define(`D', `%r8')
 define(`DINV', `%r9')

 C Invariants
 define(`B2', `%rbp')
 define(`B2md', `%rbx')

 C Variables
 define(`UN', `%r8')	C Overlaps D input
 define(`T', `%r10')
 define(`U0', `%r11')
 define(`U2', `%r12')
 define(`Q0', `%r13')
 define(`Q1', `%r14')
 define(`Q2', `%r15')

 ABI_SUPPORT(STD64)

 	ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_div_qr_1n_pi1)
 	FUNC_ENTRY(4)
 IFDOS(`	mov	56(%rsp), %r8	')
 IFDOS(`	mov	64(%rsp), %r9	')
 	dec	UN_INPUT
 	jnz	L(first)

 	C Just a single 2/1 division.
 	C T, U0 are allocated in scratch registers
 	lea	1(U1), T
 	mov	U1, %rax
 	mul	DINV
 	mov	(UP), U0
 	add	U0, %rax
 	adc	T, %rdx
 	mov	%rdx, T
 	imul	D, %rdx
 	sub	%rdx, U0
 	cmp	U0, %rax
 	lea	(U0, D), %rax
 	cmovnc	U0, %rax
 	sbb	$0, T
 	cmp	D, %rax
 	jc	L(single_div_done)
 	sub	D, %rax
 	add	$1, T
 L(single_div_done):
 	mov	T, (QP)
 	FUNC_EXIT()
 	ret
 L(first):
 	C FIXME: Could delay some of these until we enter the loop.
 	push	%r15
 	push	%r14
 	push	%r13
 	push	%r12
 	push	%rbx
 	push	%rbp

 	mov	D, B2
 	imul	DINV, B2
 	neg	B2
 	mov	B2, B2md
 	sub	D, B2md

 	C D not needed until final reduction
 	push	D
 	mov	UN_INPUT, UN	C Clobbers D

 	mov	DINV, %rax
 	mul	U1
 	mov	%rax, Q0
 	add	U1, %rdx
 	mov	%rdx, T

 	mov	B2, %rax
 	mul	U1
 	mov	-8(UP, UN, 8), U0
 	mov	(UP, UN, 8), U1
 	mov	T, (QP, UN, 8)
 	add	%rax, U0
 	adc	%rdx, U1
 	sbb	U2, U2
 	dec	UN
 	mov	U1, %rax
 	jz	L(final)

 	ALIGN(16)

 	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
 	C At entry, %rax holds an extra copy of U1
 L(loop):
 	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
 	C Remains to add in B (U1 + c)
 	mov	DINV, Q1
 	mov	U2, Q2
 	and	U2, Q1
 	neg	Q2
 	mul	DINV
 	add	%rdx, Q1
 	adc	$0, Q2
 	add	Q0, Q1
 	mov	%rax, Q0
 	mov	B2, %rax
 	lea	(B2md, U0), T
 	adc	$0, Q2

 	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
 	mul	U1
 	and	B2, U2
 	add	U2, U0
 	cmovnc	U0, T

 	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
 	adc	U1, Q1
 	mov	-8(UP, UN, 8), U0
 	adc	Q2, 8(QP, UN, 8)
 	jc	L(q_incr)
 L(q_incr_done):
 	add	%rax, U0
 	mov	T, %rax
 	adc	%rdx, %rax
 	mov	Q1, (QP, UN, 8)
 	sbb	U2, U2
 	dec	UN
 	mov	%rax, U1
 	jnz	L(loop)

 L(final):
 	pop	D

 	mov	U2, Q1
 	and	D, U2
 	sub	U2, %rax
 	neg	Q1

 	mov	%rax, U1
 	sub	D, %rax
 	cmovc	U1, %rax
 	sbb	$-1, Q1

 	lea	1(%rax), T
 	mul	DINV
 	add	U0, %rax
 	adc	T, %rdx
 	mov	%rdx, T
 	imul	D, %rdx
 	sub	%rdx, U0
 	cmp	U0, %rax
 	lea	(U0, D), %rax
 	cmovnc	U0, %rax
 	sbb	$0, T
 	cmp	D, %rax
 	jc	L(div_done)
 	sub	D, %rax
 	add	$1, T
 L(div_done):
 	add	T, Q0
 	mov	Q0, (QP)
 	adc	Q1, 8(QP)
 	jnc	L(done)
 L(final_q_incr):
 	addq	$1, 16(QP)
 	lea	8(QP), QP
 	jc	L(final_q_incr)

 L(done):
 	pop	%rbp
 	pop	%rbx
 	pop	%r12
 	pop	%r13
 	pop	%r14
 	pop	%r15
 	FUNC_EXIT()
 	ret

 L(q_incr):
 	C U1 is not live, so use it for indexing
 	lea	16(QP, UN, 8), U1
 L(q_incr_loop):
 	addq	$1, (U1)
 	jnc	L(q_incr_done)
 	lea	8(U1), U1
 	jmp	L(q_incr_loop)
 EPILOGUE()
	dnl x86-64 mpn_div_qr_1n_pi1
	dnl -- Divide an mpn number by a normalized single-limb number,
	dnl using a single-limb inverse.

	dnl Contributed to the GNU project by Niels Möller

	dnl Copyright 2013 Free Software Foundation, Inc.

	dnl This file is part of the GNU MP Library.
	dnl
	dnl The GNU MP Library is free software; you can redistribute it and/or modify
	dnl it under the terms of either:
	dnl
	dnl * the GNU Lesser General Public License as published by the Free
	dnl Software Foundation; either version 3 of the License, or (at your
	dnl option) any later version.
	dnl
	dnl or
	dnl
	dnl * the GNU General Public License as published by the Free Software
	dnl Foundation; either version 2 of the License, or (at your option) any
	dnl later version.
	dnl
	dnl or both in parallel, as here.
	dnl
	dnl The GNU MP Library is distributed in the hope that it will be useful, but
	dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	dnl for more details.
	dnl
	dnl You should have received copies of the GNU General Public License and the
	dnl GNU Lesser General Public License along with the GNU MP Library. If not,
	dnl see https://www.gnu.org/licenses/.

	include(`config.m4')


	C c/l
	C AMD K8,K9 13
	C AMD K10 13
	C AMD bull 16.5
	C AMD pile 15
	C AMD steam ?
	C AMD bobcat 16
	C AMD jaguar ?
	C Intel P4 47 poor
	C Intel core 19.25
	C Intel NHM 18
	C Intel SBR 15 poor
	C Intel IBR 13
	C Intel HWL 11.7
	C Intel BWL ?
	C Intel atom 52 very poor
	C VIA nano 19


	C INPUT Parameters
	define(`QP', `%rdi')
	define(`UP', `%rsi')
	define(`UN_INPUT', `%rdx')
	define(`U1', `%rcx') C Also in %rax
	define(`D', `%r8')
	define(`DINV', `%r9')

	C Invariants
	define(`B2', `%rbp')
	define(`B2md', `%rbx')

	C Variables
	define(`UN', `%r8') C Overlaps D input
	define(`T', `%r10')
	define(`U0', `%r11')
	define(`U2', `%r12')
	define(`Q0', `%r13')
	define(`Q1', `%r14')
	define(`Q2', `%r15')

	ABI_SUPPORT(STD64)

	ASM_START()
	TEXT
	ALIGN(16)
	PROLOGUE(mpn_div_qr_1n_pi1)
	FUNC_ENTRY(4)
	IFDOS(` mov 56(%rsp), %r8 ')
	IFDOS(` mov 64(%rsp), %r9 ')
	dec UN_INPUT
	jnz L(first)

	C Just a single 2/1 division.
	C T, U0 are allocated in scratch registers
	lea 1(U1), T
	mov U1, %rax
	mul DINV
	mov (UP), U0
	add U0, %rax
	adc T, %rdx
	mov %rdx, T
	imul D, %rdx
	sub %rdx, U0
	cmp U0, %rax
	lea (U0, D), %rax
	cmovnc U0, %rax
	sbb $0, T
	cmp D, %rax
	jc L(single_div_done)
	sub D, %rax
	add $1, T
	L(single_div_done):
	mov T, (QP)
	FUNC_EXIT()
	ret
	L(first):
	C FIXME: Could delay some of these until we enter the loop.
	push %r15
	push %r14
	push %r13
	push %r12
	push %rbx
	push %rbp

	mov D, B2
	imul DINV, B2
	neg B2
	mov B2, B2md
	sub D, B2md

	C D not needed until final reduction
	push D
	mov UN_INPUT, UN C Clobbers D

	mov DINV, %rax
	mul U1
	mov %rax, Q0
	add U1, %rdx
	mov %rdx, T

	mov B2, %rax
	mul U1
	mov -8(UP, UN, 8), U0
	mov (UP, UN, 8), U1
	mov T, (QP, UN, 8)
	add %rax, U0
	adc %rdx, U1
	sbb U2, U2
	dec UN
	mov U1, %rax
	jz L(final)

	ALIGN(16)

	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
	C At entry, %rax holds an extra copy of U1
	L(loop):
	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
	C Remains to add in B (U1 + c)
	mov DINV, Q1
	mov U2, Q2
	and U2, Q1
	neg Q2
	mul DINV
	add %rdx, Q1
	adc $0, Q2
	add Q0, Q1
	mov %rax, Q0
	mov B2, %rax
	lea (B2md, U0), T
	adc $0, Q2

	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
	mul U1
	and B2, U2
	add U2, U0
	cmovnc U0, T

	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
	adc U1, Q1
	mov -8(UP, UN, 8), U0
	adc Q2, 8(QP, UN, 8)
	jc L(q_incr)
	L(q_incr_done):
	add %rax, U0
	mov T, %rax
	adc %rdx, %rax
	mov Q1, (QP, UN, 8)
	sbb U2, U2
	dec UN
	mov %rax, U1
	jnz L(loop)

	L(final):
	pop D

	mov U2, Q1
	and D, U2
	sub U2, %rax
	neg Q1

	mov %rax, U1
	sub D, %rax
	cmovc U1, %rax
	sbb $-1, Q1

	lea 1(%rax), T
	mul DINV
	add U0, %rax
	adc T, %rdx
	mov %rdx, T
	imul D, %rdx
	sub %rdx, U0
	cmp U0, %rax
	lea (U0, D), %rax
	cmovnc U0, %rax
	sbb $0, T
	cmp D, %rax
	jc L(div_done)
	sub D, %rax
	add $1, T
	L(div_done):
	add T, Q0
	mov Q0, (QP)
	adc Q1, 8(QP)
	jnc L(done)
	L(final_q_incr):
	addq $1, 16(QP)
	lea 8(QP), QP
	jc L(final_q_incr)

	L(done):
	pop %rbp
	pop %rbx
	pop %r12
	pop %r13
	pop %r14
	pop %r15
	FUNC_EXIT()
	ret

	L(q_incr):
	C U1 is not live, so use it for indexing
	lea 16(QP, UN, 8), U1
	L(q_incr_loop):
	addq $1, (U1)
	jnc L(q_incr_done)
	lea 8(U1), U1
	jmp L(q_incr_loop)
	EPILOGUE()