| dnl AMD64 mpn_modexact_1_odd -- Hensel norm remainder. |
| |
| dnl Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of either: |
| dnl |
| dnl * the GNU Lesser General Public License as published by the Free |
| dnl Software Foundation; either version 3 of the License, or (at your |
| dnl option) any later version. |
| dnl |
| dnl or |
| dnl |
| dnl * the GNU General Public License as published by the Free Software |
| dnl Foundation; either version 2 of the License, or (at your option) any |
| dnl later version. |
| dnl |
| dnl or both in parallel, as here. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| dnl for more details. |
| dnl |
| dnl You should have received copies of the GNU General Public License and the |
| dnl GNU Lesser General Public License along with the GNU MP Library. If not, |
| dnl see https://www.gnu.org/licenses/. |
| |
| include(`config.m4') |
| |
| |
| C cycles/limb |
| C AMD K8,K9 10 |
| C AMD K10 10 |
| C Intel P4 33 |
| C Intel core2 13 |
| C Intel corei 14.5 |
| C Intel atom 35 |
| C VIA nano ? |
| |
| |
| C The dependent chain in the main loop is |
| C |
| C cycles |
| C sub %rdx, %rax 1 |
| C imul %r9, %rax 4 |
| C mul %r8 5 |
| C ---- |
| C total 10 |
| C |
| C The mov load from src seems to need to be scheduled back before the jz to |
| C achieve this speed, out-of-order execution apparently can't completely hide |
| C the latency otherwise. |
| C |
| C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it |
| C for the first iteration (where there's no cbit). |
| C |
| C The 32-byte alignment of the function entry also seems necessary (the loop |
| C top itself is only 16-byte aligned). Without that the non-PIC case has adc |
| C crossing the 0x60 offset, apparently making it run at 11 cycles instead of 10. |
| |
| |
| ABI_SUPPORT(DOS64) |
| ABI_SUPPORT(STD64) |
| |
| C Two entry points share one body: |
| C |
| C   mpn_modexact_1_odd  (src, size, divisor)         carry forced to 0 |
| C   mpn_modexact_1c_odd (src, size, divisor, carry) |
| C |
| C Registers on arrival at L(ent): rdi src, rsi size, rdx divisor, rcx carry. |
| C The result is returned in rax as climb+cbit after the last limb. |
| C |
| C Outline: |
| C   1. Build an inverse with d*inverse == 1 mod 2^64. An 8-bit starting value |
| C      is read from binvert_limb_table, then three steps of |
| C      inv = 2*inv - inv*inv*d widen it to 16, 32 and 64 bits. |
| C   2. For each limb: l = src[i] - cbit - climb, q = l * inverse, |
| C      climb = high half of q*d, with borrows collected in cbit. |
| C |
| C NOTE(review): src[0] is loaded unconditionally and the counter is tested only |
| C after being incremented, so size is assumed to be >= 1 -- confirm against |
| C callers. |
| C NOTE(review): the table index is taken from bits 1..7 of the divisor, i.e. |
| C bit 0 is dropped; the divisor is assumed odd as the function name says -- |
| C confirm against callers. |
| |
| ASM_START() |
| TEXT |
| ALIGN(32) |
| PROLOGUE(mpn_modexact_1_odd) |
| FUNC_ENTRY(3) |
| C No carry argument on this entry point, so start from carry = 0. |
| mov $0, R32(%rcx) |
| C The jmp lands after FUNC_ENTRY(4) below, so the second entry sequence is |
| C skipped. Presumably IFDOS emits it only for the DOS64 ABI and other builds |
| C simply fall through -- confirm in the macro definitions. |
| IFDOS(` jmp L(ent) ') |
| |
| PROLOGUE(mpn_modexact_1c_odd) |
| FUNC_ENTRY(4) |
| L(ent): |
| C rdi src |
| C rsi size |
| C rdx divisor |
| C rcx carry |
| |
| C Keep the full divisor in r8; rdx is turned into the table index, which is |
| C bits 1..7 of d after the shift and mask. |
| mov %rdx, %r8 C d |
| shr R32(%rdx) C d/2 |
| |
| LEA( binvert_limb_table, %r9) |
| |
| and $127, R32(%rdx) |
| C rcx is reused as scratch below, so park the carry argument in r10. |
| mov %rcx, %r10 C initial carry |
| |
| movzbl (%r9,%rdx), R32(%rdx) C inv 8 bits |
| |
| C Last uses of the src pointer: fetch src[0] and form the end pointer, after |
| C which rdi is overwritten with d. |
| mov (%rdi), %rax C src[0] |
| lea (%rdi,%rsi,8), %r11 C src end |
| mov %r8, %rdi C d, made available to imull |
| |
| C First refinement step, 8 -> 16 bits, in 32-bit registers. The refined |
| C inverse ends up in rcx. |
| lea (%rdx,%rdx), R32(%rcx) C 2*inv |
| imul R32(%rdx), R32(%rdx) C inv*inv |
| |
| C Loop counter setup is interleaved with the inverse calculation. |
| neg %rsi C -size |
| |
| imul R32(%rdi), R32(%rdx) C inv*inv*d |
| |
| sub R32(%rdx), R32(%rcx) C inv = 2*inv - inv*inv*d, 16 bits |
| |
| C Second refinement step, 16 -> 32 bits, still in 32-bit registers. The |
| C refined inverse ends up in rdx. |
| lea (%rcx,%rcx), R32(%rdx) C 2*inv |
| imul R32(%rcx), R32(%rcx) C inv*inv |
| |
| imul R32(%rdi), R32(%rcx) C inv*inv*d |
| |
| sub R32(%rcx), R32(%rdx) C inv = 2*inv - inv*inv*d, 32 bits |
| C rcx is done as scratch. The 32-bit xor clears the whole of rcx, which now |
| C becomes the borrow accumulator. |
| xor R32(%rcx), R32(%rcx) C initial cbit |
| |
| C Third refinement step, 32 -> 64 bits, in full 64-bit registers using d from |
| C r8. The result replaces the table address in r9, which is no longer needed. |
| lea (%rdx,%rdx), %r9 C 2*inv |
| imul %rdx, %rdx C inv*inv |
| |
| imul %r8, %rdx C inv*inv*d |
| |
| sub %rdx, %r9 C inv = 2*inv - inv*inv*d, 64 bits |
| C The carry argument saved in r10 is used as the climb for the first limb. |
| mov %r10, %rdx C initial climb |
| |
| C Self-check of the finished inverse. r10 is free as scratch here since the |
| C carry has already been copied to rdx. Presumably ASSERT expands to nothing |
| C unless assertion checking is enabled -- confirm in the macro definitions. |
| ASSERT(e,` C d*inv == 1 mod 2^64 |
| mov %r8, %r10 |
| imul %r9, %r10 |
| cmp $1, %r10') |
| |
| C rsi = 1 - size. Zero means a single limb, which is handled entirely at |
| C L(one) without entering the loop. |
| inc %rsi |
| jz L(one) |
| |
| |
| ALIGN(16) |
| L(top): |
| C rax l = src[i]-cbit |
| C rcx new cbit, 0 or 1 |
| C rdx climb, high of last product |
| C rsi counter, limbs, negative |
| C rdi |
| C r8 divisor |
| C r9 inverse |
| C r11 src end ptr |
| |
| sub %rdx, %rax C l = src[i]-cbit - climb |
| |
| C Fold the borrow from the sub above into cbit. |
| adc $0, %rcx C more cbit |
| imul %r9, %rax C q = l * inverse |
| |
| C One-operand mul leaves the 128-bit product in rdx:rax. Only the high half |
| C in rdx is used; rax is reloaded straight away. |
| mul %r8 C climb = high (q * d) |
| |
| C rsi is negative here, so this indexes back from the end pointer in r11. |
| mov (%r11,%rsi,8), %rax C src[i+1] |
| sub %rcx, %rax C next l = src[i+1] - cbit |
| C setc writes only the low byte of rcx. The upper bits stay zero because rcx |
| C never holds more than cbit plus one borrow since the xor before the loop. |
| setc R8(%rcx) C new cbit |
| |
| C The counter reaches zero when the limb just loaded is the last one; that |
| C limb is then finished at L(one). |
| inc %rsi |
| jnz L(top) |
| |
| |
| C Last limb, or the only limb when size is 1: the same step as the loop body |
| C but without fetching a further limb. |
| L(one): |
| sub %rdx, %rax C l = src[i]-cbit - climb |
| |
| adc $0, %rcx C more cbit |
| imul %r9, %rax C q = l * inverse |
| |
| mul %r8 C climb = high (q * d) |
| |
| C Return value in rax. |
| lea (%rcx,%rdx), %rax C climb+cbit |
| C Presumably FUNC_EXIT is the counterpart of FUNC_ENTRY -- confirm in the |
| C macro definitions. |
| FUNC_EXIT() |
| ret |
| |
| EPILOGUE(mpn_modexact_1c_odd) |
| EPILOGUE(mpn_modexact_1_odd) |