dnl  AMD64 mpn_modexact_1_odd -- Hensel norm remainder.

dnl  Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13
C Intel corei	14.5
C Intel atom	35
C VIA nano	 ?


C The dependent chain in the main loop is
C
C				cycles
C	sub	%rdx, %rax	   1
C	imul	%r9, %rax	   4
C	mul	%r8		   5
C				----
C	total			  10
C
C The mov load from src seems to need to be scheduled back before the jz to
C achieve this speed; out-of-order execution apparently can't completely hide
C the latency otherwise.
C
C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
C for the first iteration (where there's no cbit).
C
C The 32-byte code alignment used for the loop also seems necessary.  Without
C it the non-PIC case has adc crossing the 0x60 offset, apparently making it
C run at 11 cycles instead of 10.
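
C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
C                               mp_limb_t divisor);
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t carry);
C
C Roughly, the routine follows this C model (a sketch for illustration only,
C not GMP's reference code; "high64" is a stand-in for the upper half of the
C 128-bit product, and "inverse" is d^-1 mod 2^64 as produced by the binvert
C steps in the code below):
C
C	c = carry;                        /* climb and cbit, combined */
C	for (i = 0; i < size; i++)
C	  {
C	    s = src[i];
C	    l = s - c;                    /* borrows when s < c       */
C	    q = l * inverse;              /* so q*d == l mod 2^64     */
C	    c = high64 (q * d) + (s < c); /* new climb plus cbit      */
C	  }
C	return c;
C
C With carry==0 the return value is zero exactly when d divides {src,size},
C which is what divisibility tests rely on.
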
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_modexact_1_odd)
	FUNC_ENTRY(3)
	mov	$0, R32(%rcx)
IFDOS(`	jmp	L(ent)		')

PROLOGUE(mpn_modexact_1c_odd)
	FUNC_ENTRY(4)
L(ent):
	C rdi	src
	C rsi	size
	C rdx	divisor
	C rcx	carry

	mov	%rdx, %r8		C d
	shr	R32(%rdx)		C d/2

	LEA(	binvert_limb_table, %r9)

	and	$127, R32(%rdx)
	mov	%rcx, %r10		C initial carry

	movzbl	(%r9,%rdx), R32(%rdx)	C inv 8 bits

	mov	(%rdi), %rax		C src[0]
	lea	(%rdi,%rsi,8), %r11	C src end
	mov	%r8, %rdi		C d, made available to imull

	lea	(%rdx,%rdx), R32(%rcx)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv

	neg	%rsi			C -size

	imul	R32(%rdi), R32(%rdx)	C inv*inv*d

	sub	R32(%rdx), R32(%rcx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rcx,%rcx), R32(%rdx)	C 2*inv
	imul	R32(%rcx), R32(%rcx)	C inv*inv

	imul	R32(%rdi), R32(%rcx)	C inv*inv*d

	sub	R32(%rcx), R32(%rdx)	C inv = 2*inv - inv*inv*d, 32 bits
	xor	R32(%rcx), R32(%rcx)	C initial cbit

	lea	(%rdx,%rdx), %r9	C 2*inv
	imul	%rdx, %rdx		C inv*inv

	imul	%r8, %rdx		C inv*inv*d

	sub	%rdx, %r9		C inv = 2*inv - inv*inv*d, 64 bits
	mov	%r10, %rdx		C initial climb

	ASSERT(e,`	C d*inv == 1 mod 2^64
	mov	%r8, %r10
	imul	%r9, %r10
	cmp	$1, %r10')

	inc	%rsi
	jz	L(one)


	ALIGN(16)
L(top):
	C rax	l = src[i]-cbit
	C rcx	new cbit, 0 or 1
	C rdx	climb, high of last product
	C rsi	counter, limbs, negative
	C rdi
	C r8	divisor
	C r9	inverse
	C r11	src end ptr

	sub	%rdx, %rax		C l = src[i]-cbit - climb

	adc	$0, %rcx		C more cbit
	imul	%r9, %rax		C q = l * inverse

	mul	%r8			C climb = high (q * d)

	mov	(%r11,%rsi,8), %rax	C src[i+1]
	sub	%rcx, %rax		C next l = src[i+1] - cbit
	setc	R8(%rcx)		C new cbit

	inc	%rsi
	jnz	L(top)


L(one):
	sub	%rdx, %rax		C l = src[i]-cbit - climb

	adc	$0, %rcx		C more cbit
	imul	%r9, %rax		C q = l * inverse

	mul	%r8			C climb = high (q * d)

	lea	(%rcx,%rdx), %rax	C climb+cbit
	FUNC_EXIT()
	ret

EPILOGUE(mpn_modexact_1c_odd)
EPILOGUE(mpn_modexact_1_odd)
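
C The inverse setup above mirrors GMP's binvert_limb: the table byte is
C d^-1 mod 2^8 (indexed by (d>>1) & 127, d odd), and each
C "inv = 2*inv - inv*inv*d" is a Newton step that doubles the number of
C correct low bits, 8 -> 16 -> 32 -> 64.  As a rough C model (a sketch only;
C "table" stands for binvert_limb_table):
C
C	inv = table[(d >> 1) & 127];	/*  8 bits                    */
C	inv = 2*inv - inv*inv*d;	/* 16 bits                    */
C	inv = 2*inv - inv*inv*d;	/* 32 bits                    */
C	inv = 2*inv - inv*inv*d;	/* 64 bits: d*inv == 1 (2^64) */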