1dnl AMD64 mpn_modexact_1_odd -- exact division style remainder. 2 3dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software 4dnl Foundation, Inc. 5dnl 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or 9dnl modify it under the terms of the GNU Lesser General Public License as 10dnl published by the Free Software Foundation; either version 3 of the 11dnl License, or (at your option) any later version. 12dnl 13dnl The GNU MP Library is distributed in the hope that it will be useful, 14dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 15dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16dnl Lesser General Public License for more details. 17dnl 18dnl You should have received a copy of the GNU Lesser General Public License 19dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 20 21include(`../config.m4') 22 23 24C cycles/limb 25C K8,K9: 10 26C K10: 10 27C P4: 33 28C P6 core2: 13 29C P6 corei7: 14.5 30C P6 Atom: 35 31 32 33C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 34C mp_limb_t divisor); 35C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 36C mp_limb_t divisor, mp_limb_t carry); 37C 38C 39C The dependent chain in the main loop is 40C 41C cycles 42C subq %rdx, %rax 1 43C imulq %r9, %rax 4 44C mulq %r8 5 45C ---- 46C total 10 47C 48C The movq load from src seems to need to be scheduled back before the jz to 49C achieve this speed, out-of-order execution apparently can't completely 50C hide the latency otherwise. 51C 52C The l=src[i]-cbit step is rotated back too, since that allows us to avoid 53C it for the first iteration (where there's no cbit). 54C 55C The code alignment used (32-byte) for the loop also seems necessary. 56C Without that the non-PIC case has adcq crossing the 0x60 offset, 57C apparently making it run at 11 cycles instead of 10. 58C 59C Not done: 60C 61C divq for size==1 was measured at about 79 cycles, compared to the inverse 62C at about 25 cycles (both including function call overheads), so that's not 63C used. 64C 65C Enhancements: 66C 67C For PIC, we shouldn't really need the GOT fetch for binvert_limb_table, 68C it'll be in rodata or text in libgmp.so and can be accessed directly %rip 69C relative. This would be for small model only (something we don't 70C presently detect, but which is all that gcc 3.3.3 supports), since 8-byte 71C PC-relative relocations are apparently not available. Some rough 72C experiments with binutils 2.13 looked worrylingly like it might come out 73C with an unwanted text segment relocation though, even with ".protected". 74 75 76ASM_START() 77 TEXT 78 ALIGN(32) 79PROLOGUE(mpn_modexact_1_odd) 80 81 movl $0, %ecx 82 83PROLOGUE(mpn_modexact_1c_odd) 84 85 C rdi src 86 C rsi size 87 C rdx divisor 88 C rcx carry 89 90 movq %rdx, %r8 C d 91 shrl %edx C d/2 92ifdef(`PIC',` 93 movq binvert_limb_table@GOTPCREL(%rip), %r9 94',` 95 movabsq $binvert_limb_table, %r9 96') 97 98 andl $127, %edx 99 movq %rcx, %r10 C initial carry 100 101 movzbl (%r9,%rdx), %edx C inv 8 bits 102 103 movq (%rdi), %rax C src[0] 104 leaq (%rdi,%rsi,8), %r11 C src end 105 movq %r8, %rdi C d, made available to imull 106 107 leal (%rdx,%rdx), %ecx C 2*inv 108 imull %edx, %edx C inv*inv 109 110 negq %rsi C -size 111 112 imull %edi, %edx C inv*inv*d 113 114 subl %edx, %ecx C inv = 2*inv - inv*inv*d, 16 bits 115 116 leal (%rcx,%rcx), %edx C 2*inv 117 imull %ecx, %ecx C inv*inv 118 119 imull %edi, %ecx C inv*inv*d 120 121 subl %ecx, %edx C inv = 2*inv - inv*inv*d, 32 bits 122 xorl %ecx, %ecx C initial cbit 123 124 leaq (%rdx,%rdx), %r9 C 2*inv 125 imulq %rdx, %rdx C inv*inv 126 127 imulq %r8, %rdx C inv*inv*d 128 129 subq %rdx, %r9 C inv = 2*inv - inv*inv*d, 64 bits 130 movq %r10, %rdx C initial climb 131 132 ASSERT(e,` C d*inv == 1 mod 2^64 133 movq %r8, %r10 134 imulq %r9, %r10 135 cmpq $1, %r10') 136 137 incq %rsi 138 jz L(one) 139 140 141 ALIGN(16) 142L(top): 143 C rax l = src[i]-cbit 144 C rcx new cbit, 0 or 1 145 C rdx climb, high of last product 146 C rsi counter, limbs, negative 147 C rdi 148 C r8 divisor 149 C r9 inverse 150 C r11 src end ptr 151 152 subq %rdx, %rax C l = src[i]-cbit - climb 153 154 adcq $0, %rcx C more cbit 155 imulq %r9, %rax C q = l * inverse 156 157 mulq %r8 C climb = high (q * d) 158 159 movq (%r11,%rsi,8), %rax C src[i+1] 160 subq %rcx, %rax C next l = src[i+1] - cbit 161 setc %cl C new cbit 162 163 incq %rsi 164 jnz L(top) 165 166 167L(one): 168 subq %rdx, %rax C l = src[i]-cbit - climb 169 170 adcq $0, %rcx C more cbit 171 imulq %r9, %rax C q = l * inverse 172 173 mulq %r8 C climb = high (q * d) 174 175 leaq (%rcx,%rdx), %rax C climb+cbit 176 ret 177 178EPILOGUE(mpn_modexact_1c_odd) 179EPILOGUE(mpn_modexact_1_odd) 180