dnl  AMD64 mpn_mod_1_1p

dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`config.m4')

C	     cycles/limb
C AMD K8,K9	 6
C AMD K10	 6
C Intel P4	26
C Intel core2	12.5
C Intel NHM	11.3
C Intel SBR	 8.4  (slowdown, old code took 8.0)
C Intel atom	26
C VIA nano	13

define(`B2mb',   `%r10')
define(`B2modb', `%r11')
define(`ap',     `%rdi')
define(`n',      `%rsi')
define(`pre',    `%r8')
define(`b',      `%rbx')

define(`r0', `%rbp')	C r1 kept in %rax
define(`r2', `%rcx')	C kept negated. Also used as shift count
define(`t0', `%r9')

C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
C                       %rdi         %rsi         %rdx                %rcx
C The pre array contains bi, cnt, B1modb, B2modb, in that order
C (limb offsets 0, 8, 16, 24 below).
C Note: This implementation needs B1modb only when cnt > 0.

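C A rough sketch of the typical calling sequence (cf. the mpn_mod_1_1
C wrapper in mpn/generic/mod_1.c); note that the b argument passed to
C mpn_mod_1_1p is the divisor pre-shifted by cnt = pre[1]:
C
C	mp_limb_t pre[4];
C	mpn_mod_1_1p_cps (pre, b);
C	r = mpn_mod_1_1p (ap, n, b << pre[1], pre);
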
C The iteration is almost as follows,
C
C	r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
C
C where r2 is a single bit represented as a mask. But to make sure that the
C result fits in two limbs and a bit, carry from the addition
C
C	r_0 + r_2 B2modb
C
C is handled specially. On carry, we subtract b to cancel the carry,
C and instead use the value
C
C	r_0 + B2mb (mod B)
C
C This addition can be issued early since it doesn't depend on r2, and it
C is the source of the cmov in the loop.
C
C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b

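C In C, one iteration of the loop below is roughly the following sketch
C (not GMP's generic code; limb stands for the 64-bit mp_limb_t, u is
C the next limb read from ap, and r2 is kept as a 0/-1 mask):
C
C	unsigned __int128 p = (unsigned __int128) r1 * B2modb;
C	limb s = r0 + (r2 & B2modb);
C	if (s < r0)			/* carry: subtract b to cancel it */
C	  s = r0 + B2mb;		/* B2mb = B2modb - b (mod B) */
C	unsigned __int128 t = (unsigned __int128) u + (limb) p;
C	r0 = (limb) t;
C	t = (t >> 64) + (limb) (p >> 64) + s;
C	r1 = (limb) t;
C	r2 = -(limb) (t >> 64);		/* new carry mask */
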
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
	FUNC_ENTRY(4)
	push	%rbp
	push	%rbx
	mov	%rdx, b
	mov	%rcx, pre

	mov	-8(ap, n, 8), %rax	C %rax = ap[n-1]
	cmp	$3, n
	jnc	L(first)
	mov	-16(ap, n, 8), r0	C n <= 2, r0 = ap[n-2]
	jmp	L(reduce_two)

L(first):
	C First iteration, no r2
	mov	24(pre), B2modb		C precomputed B^2 mod b
	mul	B2modb			C %rdx:%rax = ap[n-1] * B2modb
	mov	-24(ap, n, 8), r0
	add	%rax, r0
	mov	-16(ap, n, 8), %rax
	adc	%rdx, %rax
	sbb	r2, r2
	sub	$4, n
	jc	L(reduce_three)

	mov	B2modb, B2mb
	sub	b, B2mb			C B2mb = B2modb - b (mod B)

	ALIGN(16)
L(top):	and	B2modb, r2	C apply mask: 0 or B2modb
	lea	(B2mb, r0), t0	C speculative r0 + B2modb - b
	mul	B2modb		C %rdx:%rax = r1 * B2modb
	add	r0, r2		C r0 + r2 B2modb, may carry
	mov	(ap, n, 8), r0	C read next limb u
	cmovc	t0, r2		C on carry, use r0 + B2mb instead
	add	%rax, r0
	mov	r2, %rax
	adc	%rdx, %rax
	sbb	r2, r2		C new r2 mask
	sub	$1, n
	jnc	L(top)

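C On exit from the loop, the r2 bit is folded into r1 below. This works
C because the invariant gives r1 < b whenever r2 is set, so in C the
C step is roughly (a sketch, wraparound intended):
C
C	r1 -= r2 & b;		/* r2 set: r1 becomes B + r1 - b < B */
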
L(reduce_three):
	C Eliminate r2
	and	b, r2
	sub	r2, %rax

L(reduce_two):
	mov	8(pre), R32(%rcx)	C %rcx = cnt
	test	R32(%rcx), R32(%rcx)
	jz	L(normalized)

	C Unnormalized, use B1modb to reduce to size < B (b+1)
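	C In C, this reduction and the normalizing shift are roughly the
	C following sketch (p cannot overflow 128 - cnt bits, since
	C r1 * B1modb + r0 < B (b+1) <= 2^(128-cnt)):
	C
	C	unsigned __int128 p = (unsigned __int128) r1 * B1modb + r0;
	C	r1 = (limb) (p >> (64 - cnt));
	C	r0 = (limb) p << cnt;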
	mulq	16(pre)			C %rdx:%rax = r1 * B1modb
	xor	t0, t0
	add	%rax, r0
	adc	%rdx, t0
	mov	t0, %rax

	C Left-shift to normalize
ifdef(`SHLD_SLOW',`
	C Emulate shld: %rax = (%rax << cnt) | (r0 >> (64 - cnt))
	shl	R8(%rcx), %rax
	mov	r0, t0
	neg	R32(%rcx)
	shr	R8(%rcx), t0
	or	t0, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), r0, %rax
')
	shl	R8(%rcx), r0
	jmp	L(udiv)

L(normalized):
	mov	%rax, t0
	sub	b, t0
	cmovnc	t0, %rax	C reduce r1: subtract b unless it borrows

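C The udiv block below computes the remainder of r1 B + r0 divided by b
C using the precomputed inverse bi, essentially the remainder half of
C udiv_qrnnd_preinv from gmp-impl.h. In C, roughly:
C
C	umul_ppmm (q1, q0, r1, bi);
C	add_ssaaaa (q1, q0, q1, q0, r1 + 1, r0);
C	r = r0 - q1 * b;		/* mod B */
C	if (r > q0)			/* quotient estimate one too large */
C	  r += b;
C	if (r >= b)
C	  r -= b;
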
L(udiv):
	lea	1(%rax), t0	C t0 = r1 + 1
	mulq	(pre)		C %rdx:%rax = r1 * bi
	add	r0, %rax
	adc	t0, %rdx	C %rdx = quotient estimate q1
	imul	b, %rdx
	sub	%rdx, r0	C candidate remainder
	cmp	r0, %rax
	lea	(b, r0), %rax	C speculatively add b back
	cmovnc	r0, %rax	C keep r when estimate was not too large
	cmp	b, %rax
	jnc	L(fix)
L(ok):	shr	R8(%rcx), %rax	C undo the normalizing shift

	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
L(fix):	sub	b, %rax
	jmp	L(ok)
EPILOGUE()

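C mpn_mod_1_1p_cps fills in the four precomputed values. In C, what the
C code below computes is roughly the following sketch (invert_limb and
C count_leading_zeros as in gmp-impl.h and longlong.h):
C
C	count_leading_zeros (cnt, b);
C	b1 = b << cnt;			/* normalized divisor */
C	invert_limb (bi, b1);		/* floor((B^2-1)/b1) - B */
C	cps[0] = bi;
C	cps[1] = cnt;
C	cps[3] = -b1 * bi;		/* B2modb, == B^2 mod b1 up to
C					   a multiple of b1 */
C	if (cnt != 0)
C	  cps[2] = (-b1 * ((CNST_LIMB(1) << cnt)
C			   | (bi >> (64 - cnt)))) >> cnt;
C					/* B1modb, == B mod b, possibly
C					   not fully reduced */
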
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)		C %rcx = cnt = count_leading_zeros(b)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)
	sal	R8(%rcx), %r12		C %r12 = b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
IFDOS(`	sub	$32, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	neg	%r12			C %r12 = B - (b << cnt)
	mov	%r12, %r8
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	imul	%rax, %r12		C -(b << cnt) * bi mod B
	mov	%r12, 24(%rbx)		C store B2modb
	mov	R32(%rbp), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(z)

	mov	$1, R32(%rdx)
ifdef(`SHLD_SLOW',`
	C Destroys %rax, unlike shld. Otherwise, we could do B1modb
	C before B2modb, and get rid of the move %r12, %r8 above.

	shl	R8(%rcx), %rdx
	neg	R32(%rcx)
	shr	R8(%rcx), %rax
	or	%rax, %rdx
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rdx	C %rdx = (1 << cnt) | (bi >> (64-cnt))
')
	imul	%rdx, %r8
	shr	R8(%rcx), %r8
	mov	%r8, 16(%rbx)		C store B1modb
L(z):
	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()