 dnl  mpn/arm64/divrem_1.asm - gmp - Git at Google

 dnl  ARM64 mpn_divrem_1 and mpn_preinv_divrem_1.

 dnl  Contributed to the GNU project by Torbjörn Granlund.

 dnl  Copyright 2020 Free Software Foundation, Inc.

 dnl  This file is part of the GNU MP Library.
 dnl
 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
 dnl  it under the terms of either:
 dnl
 dnl    * the GNU Lesser General Public License as published by the Free
 dnl      Software Foundation; either version 3 of the License, or (at your
 dnl      option) any later version.
 dnl
 dnl  or
 dnl
 dnl    * the GNU General Public License as published by the Free Software
 dnl      Foundation; either version 2 of the License, or (at your option) any
 dnl      later version.
 dnl
 dnl  or both in parallel, as here.
 dnl
 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 dnl  for more details.
 dnl
 dnl  You should have received copies of the GNU General Public License and the
 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
 dnl  see https://www.gnu.org/licenses/.

 include(`config.m4')

 dnl TODO
 dnl  * Handle the most significant quotient limb for the unnormalised case
 dnl    specially, just like in the C code.  (It is very often 0.)

 dnl  Argument registers on entry, in the order of the prototypes below.
 dnl  dinv_arg and cnt_arg are read only by mpn_preinv_divrem_1.
 define(`qp_arg',   x0)
 define(`fn_arg',   x1)
 define(`np_arg',   x2)
 define(`n_arg',    x3)
 define(`d_arg',    x4)
 define(`dinv_arg', x5)
 define(`cnt_arg',  x6)

 dnl  Working registers.  qp, np, n, d, cnt and fn are placed in x19-x24,
 dnl  which both entry points save in their stack frame and every framed
 dnl  return path restores; the code relies on them keeping their values
 dnl  across the invert_limb call.  dinv shares x0 with qp_arg (x0 is where
 dnl  the invert_limb result is picked up), and tnc is a scratch register.
 define(`qp',   x19)
 define(`np',   x20)
 define(`n',    x21)
 define(`d',    x22)
 define(`fn',   x24)
 define(`dinv', x0)
 define(`cnt',  x23)
 define(`tnc',  x8)

 dnl mp_limb_t
 dnl mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
 dnl               mp_srcptr np, mp_size_t n,
 dnl               mp_limb_t d_unnorm)

 dnl mp_limb_t
 dnl mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
 dnl                      mp_srcptr np, mp_size_t n,
 dnl                      mp_limb_t d_unnorm, mp_limb_t dinv, int cnt)

 ASM_START()

 C mpn_preinv_divrem_1: entry point for callers that pass in the inverse
 C (dinv_arg) and the shift count (cnt_arg) themselves.
 C
 C It sets up the same stack frame and working registers as mpn_divrem_1 and
 C then branches into that function just past its invert_limb calls.  The
 C return paths reached from there restore the saved registers and pop the
 C frame.
 C
 C n_arg = 0 is handled by L(fz), which returns 0 in x0.
 PROLOGUE(mpn_preinv_divrem_1)
 	cbz	n_arg, L(fz)		C no numerator limbs; no frame pushed yet
 	stp	x29, x30, [sp, #-80]!	C 80-byte frame; fp/lr at offset 0
 	mov	x29, sp
 	stp	x19, x20, [sp, #16]	C x19/x20 become qp/np
 	stp	x21, x22, [sp, #32]	C x21/x22 become n/d
 	stp	x23, x24, [sp, #48]	C x23/x24 become cnt/fn; bytes 64-79 unused

 	sub	n, n_arg, #1		C n = index of the top numerator limb
 	add	x7, n, fn_arg		C x7 = index of the top quotient limb
 	add	np, np_arg, n, lsl #3	C np = address of np_arg[n]
 	add	qp, qp_arg, x7, lsl #3	C qp = address of qp_arg[n + fn_arg]
 	mov	fn, fn_arg
 	mov	d, d_arg
 	mov	dinv, dinv_arg		C x0 is free now that qp has been formed
 	tbnz	d_arg, #63, L(nentry)	C top bit set: normalised path, cnt_arg unread
 	mov	cnt, cnt_arg
 	b	L(uentry)
 EPILOGUE()

 C mpn_divrem_1: divide the n_arg-limb number at np_arg by the single limb
 C d_arg, obtaining the inverse from invert_limb.
 C
 C   qp_arg  receives n_arg + fn_arg quotient limbs
 C   fn_arg  count of further quotient limbs developed once the numerator
 C           limbs are used up (zero is fed in as the low limb)
 C   np_arg  numerator, read from index n_arg - 1 downwards
 C   d_arg   divisor; bit 63 selects the normalised or unnormalised path
 C
 C Returns the remainder in x0 (0 when n_arg = 0).  d_arg = 0 is not checked
 C for here.
 C
 C The basic step, used in every loop below, takes the running remainder in
 C x11 and a numerator limb in x1 and produces a quotient limb in x2:
 C   x17:x10 = x11 * dinv
 C   x2      = x11 + 1 + x17 + carry out of (x10 + x1)
 C   x11     = x1 - d * x2
 C followed by up to two corrections of x2 and x11.
 PROLOGUE(mpn_divrem_1)
 	cbz	n_arg, L(fz)		C no numerator limbs; no frame pushed yet
 	stp	x29, x30, [sp, #-80]!	C 80-byte frame; fp/lr at offset 0
 	mov	x29, sp
 	stp	x19, x20, [sp, #16]	C x19/x20 become qp/np
 	stp	x21, x22, [sp, #32]	C x21/x22 become n/d
 	stp	x23, x24, [sp, #48]	C x23/x24 become cnt/fn; bytes 64-79 unused

 	sub	n, n_arg, #1		C n = index of the top numerator limb
 	add	x7, n, fn_arg		C x7 = index of the top quotient limb
 	add	np, np_arg, n, lsl #3	C np = address of np_arg[n]
 	add	qp, qp_arg, x7, lsl #3	C qp = address of qp_arg[n + fn_arg]
 	mov	fn, fn_arg
 	mov	d, d_arg
 	tbnz	d_arg, #63, L(normalised)

 C Unnormalised divisor: d is shifted left by its leading zero count and the
 C numerator limbs are shifted by the same amount as they are read.
 L(unnorm):
 	clz	cnt, d
 	lsl	x0, d, cnt		C argument for invert_limb
 C x0 (dinv) is read as the result of this call; d and cnt are in registers
 C that the code expects to survive it.
 	bl	GSYM_PREFIX`'MPN(invert_limb)
 L(uentry):
 	lsl	d, d, cnt		C shift d up by cnt
 	ldr	x7, [np], #-8		C top limb; np steps down one limb
 	sub	tnc, xzr, cnt		C tnc = -cnt, i.e. 64 - cnt as a shift amount
 	lsr	x11, x7, tnc		C r = the top cnt bits of the top limb
 	lsl	x1, x7, cnt		C remaining bits, moved to the top of x1
 	cbz	n, L(uend)		C single limb: only the final step is needed

 C One quotient limb per iteration, n iterations.  x1 enters holding the
 C leftover bits of the previous limb and is completed from the next limb.
 L(utop):ldr	x7, [np], #-8		C next lower numerator limb
 	add	x2, x11, #1
 	mul	x10, x11, dinv
 	umulh	x17, x11, dinv
 	lsr	x9, x7, tnc
 	orr	x1, x1, x9		C x1 = shifted numerator limb for this step
 	adds	x10, x1, x10
 	adc	x2, x2, x17		C candidate quotient limb
 	msub	x11, d, x2, x1		C candidate remainder
 	lsl	x1, x7, cnt		C leftover bits for the next iteration
 	cmp	x10, x11
 	add	x14, x11, d
 	csel	x11, x14, x11, cc	C x10 < x11: add d back to the remainder
 	sbc	x2, x2, xzr		C ... and take 1 off the quotient
 	cmp	x11, d
 	bcs	L(ufx)			C remainder still >= d: fix up out of line
 L(uok):	str	x2, [qp], #-8		C store quotient limb; qp steps down
 	sub	n, n, #1
 	cbnz	n, L(utop)

 C Last integer quotient limb: x1 is used as it stands since no limb is left
 C to supply low bits, and the second correction is done without a branch.
 L(uend):add	x2, x11, #1
 	mul	x10, x11, dinv
 	umulh	x17, x11, dinv
 	adds	x10, x1, x10
 	adc	x2, x2, x17
 	msub	x11, d, x2, x1
 	cmp	x10, x11
 	add	x14, x11, d
 	csel	x11, x14, x11, cc
 	sbc	x2, x2, xzr
 	subs	x14, x11, d		C carry set when x11 >= d
 	adc	x2, x2, xzr		C then quotient + 1
 	csel	x11, x14, x11, cs	C and remainder - d
 	str	x2, [qp], #-8

 	cbnz	fn, L(ftop)		C fraction limbs wanted; cnt keeps the shift
 	lsr	x0, x11, cnt		C return the remainder shifted back down
 	ldp	x19, x20, [sp, #16]
 	ldp	x21, x22, [sp, #32]
 	ldp	x23, x24, [sp, #48]
 	ldp	x29, x30, [sp], #80
 	ret

 C Second correction for L(utop): quotient + 1, remainder - d.
 L(ufx):	add	x2, x2, #1
 	sub	x11, x11, d
 	b	L(uok)


 C Normalised divisor (bit 63 set): d and the numerator limbs are used
 C without shifting.
 L(normalised):
 	mov	x0, d
 	bl	GSYM_PREFIX`'MPN(invert_limb)
 L(nentry):
 	ldr	x7, [np], #-8		C top limb; np steps down one limb
 	subs	x14, x7, d
 	adc	x2, xzr, xzr		C hi q limb: 1 if top limb >= d, else 0
 	csel	x11, x14, x7, cs	C remainder: top limb, less d if it was >= d
 	b	L(nok)

 L(ntop):ldr	x1, [np], #-8		C next lower numerator limb
 	add	x2, x11, #1
 	mul	x10, x11, dinv
 	umulh	x17, x11, dinv
 	adds	x10, x1, x10
 	adc	x2, x2, x17		C candidate quotient limb
 	msub	x11, d, x2, x1		C candidate remainder
 	cmp	x10, x11
 	add	x14, x11, d
 	csel	x11, x14, x11, cc	C remainder, with d added back if x10 < x11
 	sbc	x2, x2, xzr		C ... and 1 taken off the quotient
 	cmp	x11, d
 	bcs	L(nfx)			C remainder still >= d: fix up out of line
 L(nok):	str	x2, [qp], #-8		C store quotient limb; qp steps down
 	sub	n, n, #1
 	tbz	n, #63, L(ntop)		C loop until n goes negative

 L(nend):cbnz	fn, L(frac)
 	mov	x0, x11			C return the remainder
 	ldp	x19, x20, [sp, #16]
 	ldp	x21, x22, [sp, #32]
 	ldp	x23, x24, [sp, #48]
 	ldp	x29, x30, [sp], #80
 	ret

 C Second correction for L(ntop): quotient + 1, remainder - d.
 L(nfx):	add	x2, x2, #1
 	sub	x11, x11, d
 	b	L(nok)

 C Fraction limbs, shared by both paths and always entered with fn != 0.
 C The step is the basic one with zero in place of the numerator limb, so
 C the additions of x1 drop out and only the first correction is applied.
 C The normalised path comes in through L(frac), which clears cnt so that the
 C final shift leaves the remainder unchanged.
 L(frac):mov	cnt, #0
 L(ftop):add	x2, x11, #1
 	mul	x10, x11, dinv
 	umulh	x17, x11, dinv
 	add	x2, x2, x17
 	msub	x11, d, x2, xzr		C candidate remainder = -(d * x2)
 	cmp	x10, x11
 	add	x14, x11, d
 	csel	x11, x14, x11, cc	C remainder, with d added back if x10 < x11
 	sbc	x2, x2, xzr		C ... and 1 taken off the quotient
 	str	x2, [qp], #-8
 	sub	fn, fn, #1
 	cbnz	fn, L(ftop)

 	lsr	x0, x11, cnt		C return the remainder shifted back down
 	ldp	x19, x20, [sp, #16]
 	ldp	x21, x22, [sp, #32]
 	ldp	x23, x24, [sp, #48]
 	ldp	x29, x30, [sp], #80
 	ret

 C Block zero.  Needed for the degenerate case of n = 0, fn != 0: store
 C fn_arg zero limbs upwards from qp_arg and return 0.  This path is taken
 C before any frame has been pushed, hence the bare ret.
 L(fz):	cbz	fn_arg, L(zend)
 L(ztop):str	xzr, [qp_arg], #8
 	sub	fn_arg, fn_arg, #1
 	cbnz	fn_arg, L(ztop)
 L(zend):mov	x0, #0
 	ret
 EPILOGUE()
	dnl ARM64 mpn_divrem_1 and mpn_preinv_divrem_1.

	dnl Contributed to the GNU project by Torbjörn Granlund.

	dnl Copyright 2020 Free Software Foundation, Inc.

	dnl This file is part of the GNU MP Library.
	dnl
	dnl The GNU MP Library is free software; you can redistribute it and/or modify
	dnl it under the terms of either:
	dnl
	dnl * the GNU Lesser General Public License as published by the Free
	dnl Software Foundation; either version 3 of the License, or (at your
	dnl option) any later version.
	dnl
	dnl or
	dnl
	dnl * the GNU General Public License as published by the Free Software
	dnl Foundation; either version 2 of the License, or (at your option) any
	dnl later version.
	dnl
	dnl or both in parallel, as here.
	dnl
	dnl The GNU MP Library is distributed in the hope that it will be useful, but
	dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	dnl for more details.
	dnl
	dnl You should have received copies of the GNU General Public License and the
	dnl GNU Lesser General Public License along with the GNU MP Library. If not,
	dnl see https://www.gnu.org/licenses/.

	include(`config.m4')

	dnl TODO
	dnl * Handle the most significant quotient limb for the unnormalised case
	dnl specially, just like in the C code. (It is very often 0.)

	dnl Argument registers on entry, in the order of the prototypes below.
	dnl dinv_arg and cnt_arg are read only by mpn_preinv_divrem_1.
	define(`qp_arg', x0)
	define(`fn_arg', x1)
	define(`np_arg', x2)
	define(`n_arg', x3)
	define(`d_arg', x4)
	define(`dinv_arg', x5)
	define(`cnt_arg', x6)

	dnl Working registers. qp, np, n, d, cnt and fn are placed in x19-x24,
	dnl which both entry points save in their stack frame and every framed
	dnl return path restores; the code relies on them keeping their values
	dnl across the invert_limb call. dinv shares x0 with qp_arg (x0 is where
	dnl the invert_limb result is picked up), and tnc is a scratch register.
	define(`qp', x19)
	define(`np', x20)
	define(`n', x21)
	define(`d', x22)
	define(`fn', x24)
	define(`dinv', x0)
	define(`cnt', x23)
	define(`tnc', x8)

	dnl mp_limb_t
	dnl mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
	dnl mp_srcptr np, mp_size_t n,
	dnl mp_limb_t d_unnorm)

	dnl mp_limb_t
	dnl mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
	dnl mp_srcptr np, mp_size_t n,
	dnl mp_limb_t d_unnorm, mp_limb_t dinv, int cnt)

	ASM_START()

	C mpn_preinv_divrem_1: entry point for callers that pass in the inverse
	C (dinv_arg) and the shift count (cnt_arg) themselves.
	C
	C It sets up the same stack frame and working registers as mpn_divrem_1
	C and then branches into that function just past its invert_limb calls.
	C The return paths reached from there restore the saved registers and pop
	C the frame.
	C
	C n_arg = 0 is handled by L(fz), which returns 0 in x0.
	PROLOGUE(mpn_preinv_divrem_1)
	cbz n_arg, L(fz)	C no numerator limbs; no frame pushed yet
	stp x29, x30, [sp, #-80]!	C 80-byte frame; fp/lr at offset 0
	mov x29, sp
	stp x19, x20, [sp, #16]	C x19/x20 become qp/np
	stp x21, x22, [sp, #32]	C x21/x22 become n/d
	stp x23, x24, [sp, #48]	C x23/x24 become cnt/fn; bytes 64-79 unused

	sub n, n_arg, #1	C n = index of the top numerator limb
	add x7, n, fn_arg	C x7 = index of the top quotient limb
	add np, np_arg, n, lsl #3	C np = address of np_arg[n]
	add qp, qp_arg, x7, lsl #3	C qp = address of qp_arg[n + fn_arg]
	mov fn, fn_arg
	mov d, d_arg
	mov dinv, dinv_arg	C x0 is free now that qp has been formed
	tbnz d_arg, #63, L(nentry)	C top bit set: normalised path, cnt_arg unread
	mov cnt, cnt_arg
	b L(uentry)
	EPILOGUE()

	C mpn_divrem_1: divide the n_arg-limb number at np_arg by the single limb
	C d_arg, obtaining the inverse from invert_limb.
	C
	C qp_arg receives n_arg + fn_arg quotient limbs
	C fn_arg count of further quotient limbs developed once the numerator
	C limbs are used up (zero is fed in as the low limb)
	C np_arg numerator, read from index n_arg - 1 downwards
	C d_arg divisor; bit 63 selects the normalised or unnormalised path
	C
	C Returns the remainder in x0 (0 when n_arg = 0). d_arg = 0 is not checked
	C for here.
	C
	C The basic step, used in every loop below, takes the running remainder in
	C x11 and a numerator limb in x1 and produces a quotient limb in x2:
	C x17:x10 = x11 * dinv
	C x2 = x11 + 1 + x17 + carry out of (x10 + x1)
	C x11 = x1 - d * x2
	C followed by up to two corrections of x2 and x11.
	PROLOGUE(mpn_divrem_1)
	cbz n_arg, L(fz)	C no numerator limbs; no frame pushed yet
	stp x29, x30, [sp, #-80]!	C 80-byte frame; fp/lr at offset 0
	mov x29, sp
	stp x19, x20, [sp, #16]	C x19/x20 become qp/np
	stp x21, x22, [sp, #32]	C x21/x22 become n/d
	stp x23, x24, [sp, #48]	C x23/x24 become cnt/fn; bytes 64-79 unused

	sub n, n_arg, #1	C n = index of the top numerator limb
	add x7, n, fn_arg	C x7 = index of the top quotient limb
	add np, np_arg, n, lsl #3	C np = address of np_arg[n]
	add qp, qp_arg, x7, lsl #3	C qp = address of qp_arg[n + fn_arg]
	mov fn, fn_arg
	mov d, d_arg
	tbnz d_arg, #63, L(normalised)

	C Unnormalised divisor: d is shifted left by its leading zero count and
	C the numerator limbs are shifted by the same amount as they are read.
	L(unnorm):
	clz cnt, d
	lsl x0, d, cnt	C argument for invert_limb
	C x0 (dinv) is read as the result of this call; d and cnt are in
	C registers that the code expects to survive it.
	bl GSYM_PREFIX`'MPN(invert_limb)
	L(uentry):
	lsl d, d, cnt	C shift d up by cnt
	ldr x7, [np], #-8	C top limb; np steps down one limb
	sub tnc, xzr, cnt	C tnc = -cnt, i.e. 64 - cnt as a shift amount
	lsr x11, x7, tnc C r = the top cnt bits of the top limb
	lsl x1, x7, cnt	C remaining bits, moved to the top of x1
	cbz n, L(uend)	C single limb: only the final step is needed

	C One quotient limb per iteration, n iterations. x1 enters holding the
	C leftover bits of the previous limb and is completed from the next limb.
	L(utop):ldr x7, [np], #-8	C next lower numerator limb
	add x2, x11, #1
	mul x10, x11, dinv
	umulh x17, x11, dinv
	lsr x9, x7, tnc
	orr x1, x1, x9	C x1 = shifted numerator limb for this step
	adds x10, x1, x10
	adc x2, x2, x17	C candidate quotient limb
	msub x11, d, x2, x1	C candidate remainder
	lsl x1, x7, cnt	C leftover bits for the next iteration
	cmp x10, x11
	add x14, x11, d
	csel x11, x14, x11, cc	C x10 < x11: add d back to the remainder
	sbc x2, x2, xzr	C ... and take 1 off the quotient
	cmp x11, d
	bcs L(ufx)	C remainder still >= d: fix up out of line
	L(uok): str x2, [qp], #-8	C store quotient limb; qp steps down
	sub n, n, #1
	cbnz n, L(utop)

	C Last integer quotient limb: x1 is used as it stands since no limb is
	C left to supply low bits, and the second correction is done without a
	C branch.
	L(uend):add x2, x11, #1
	mul x10, x11, dinv
	umulh x17, x11, dinv
	adds x10, x1, x10
	adc x2, x2, x17
	msub x11, d, x2, x1
	cmp x10, x11
	add x14, x11, d
	csel x11, x14, x11, cc
	sbc x2, x2, xzr
	subs x14, x11, d	C carry set when x11 >= d
	adc x2, x2, xzr	C then quotient + 1
	csel x11, x14, x11, cs	C and remainder - d
	str x2, [qp], #-8

	cbnz fn, L(ftop)	C fraction limbs wanted; cnt keeps the shift
	lsr x0, x11, cnt	C return the remainder shifted back down
	ldp x19, x20, [sp, #16]
	ldp x21, x22, [sp, #32]
	ldp x23, x24, [sp, #48]
	ldp x29, x30, [sp], #80
	ret

	C Second correction for L(utop): quotient + 1, remainder - d.
	L(ufx): add x2, x2, #1
	sub x11, x11, d
	b L(uok)


	C Normalised divisor (bit 63 set): d and the numerator limbs are used
	C without shifting.
	L(normalised):
	mov x0, d
	bl GSYM_PREFIX`'MPN(invert_limb)
	L(nentry):
	ldr x7, [np], #-8	C top limb; np steps down one limb
	subs x14, x7, d
	adc x2, xzr, xzr C hi q limb: 1 if top limb >= d, else 0
	csel x11, x14, x7, cs	C remainder: top limb, less d if it was >= d
	b L(nok)

	L(ntop):ldr x1, [np], #-8	C next lower numerator limb
	add x2, x11, #1
	mul x10, x11, dinv
	umulh x17, x11, dinv
	adds x10, x1, x10
	adc x2, x2, x17	C candidate quotient limb
	msub x11, d, x2, x1	C candidate remainder
	cmp x10, x11
	add x14, x11, d
	csel x11, x14, x11, cc C remainder, with d added back if x10 < x11
	sbc x2, x2, xzr	C ... and 1 taken off the quotient
	cmp x11, d
	bcs L(nfx)	C remainder still >= d: fix up out of line
	L(nok): str x2, [qp], #-8	C store quotient limb; qp steps down
	sub n, n, #1
	tbz n, #63, L(ntop)	C loop until n goes negative

	L(nend):cbnz fn, L(frac)
	mov x0, x11	C return the remainder
	ldp x19, x20, [sp, #16]
	ldp x21, x22, [sp, #32]
	ldp x23, x24, [sp, #48]
	ldp x29, x30, [sp], #80
	ret

	C Second correction for L(ntop): quotient + 1, remainder - d.
	L(nfx): add x2, x2, #1
	sub x11, x11, d
	b L(nok)

	C Fraction limbs, shared by both paths and always entered with fn != 0.
	C The step is the basic one with zero in place of the numerator limb, so
	C the additions of x1 drop out and only the first correction is applied.
	C The normalised path comes in through L(frac), which clears cnt so that
	C the final shift leaves the remainder unchanged.
	L(frac):mov cnt, #0
	L(ftop):add x2, x11, #1
	mul x10, x11, dinv
	umulh x17, x11, dinv
	add x2, x2, x17
	msub x11, d, x2, xzr	C candidate remainder = -(d * x2)
	cmp x10, x11
	add x14, x11, d
	csel x11, x14, x11, cc C remainder, with d added back if x10 < x11
	sbc x2, x2, xzr	C ... and 1 taken off the quotient
	str x2, [qp], #-8
	sub fn, fn, #1
	cbnz fn, L(ftop)

	lsr x0, x11, cnt	C return the remainder shifted back down
	ldp x19, x20, [sp, #16]
	ldp x21, x22, [sp, #32]
	ldp x23, x24, [sp, #48]
	ldp x29, x30, [sp], #80
	ret

	C Block zero. Needed for the degenerate case of n = 0, fn != 0: store
	C fn_arg zero limbs upwards from qp_arg and return 0. This path is taken
	C before any frame has been pushed, hence the bare ret.
	L(fz): cbz fn_arg, L(zend)
	L(ztop):str xzr, [qp_arg], #8
	sub fn_arg, fn_arg, #1
	cbnz fn_arg, L(ztop)
	L(zend):mov x0, #0
	ret
	EPILOGUE()