dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.

dnl  Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C          cycles/limb
C Athlon:     11.0
C Hammer:      7.0


C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
C                               mp_limb_t divisor);
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t carry);
C
C With the loop running at just 11 cycles it doesn't seem worth bothering to
C check for high<divisor to save one step.
C
C Using a divl for size==1 measures slower than the modexact method, which
C is not too surprising since for the latter it's only about 24 cycles to
C calculate the modular inverse.
C Stack arguments, addressed relative to the stack pointer at entry.
defframe(PARAM_CARRY,  16)
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE,   8)
defframe(PARAM_SRC,    4)

C Save slots for the four registers this routine preserves.  They sit in
C the STACK_SPACE bytes reserved just after L(start_1c).
defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)

deflit(STACK_SPACE, 16)

C Mask applied to divisor/2 to form the binvert_limb_table byte index.
deflit(INV_TABLE_MASK, 127)

	TEXT

C Entry point with an explicit initial carry.  The carry argument is read
C while the stack pointer still has its entry value, then control joins
C the shared body with that carry in ecx.

	ALIGN(16)
PROLOGUE(mpn_modexact_1c_odd)
deflit(`FRAME',0)

	movl	PARAM_CARRY, %ecx
	jmp	L(start_1c)

EPILOGUE()


C Entry point without a carry argument: same as above with carry zero.
C
C Both entries reach L(start_1c) with
C	ecx	initial carry
C	esp	unchanged since entry
C
C The setup below interleaves three jobs: saving esi, edi, ebp and ebx,
C loading the parameters, and building the inverse of the divisor.  The
C inverse starts as a byte from binvert_limb_table and is then refined
C twice with inv = 2*inv - inv*inv*d, after which the ASSERT checks that
C d*inv is 1 in a full limb.

	ALIGN(16)
PROLOGUE(mpn_modexact_1_odd)
deflit(`FRAME',0)

	xorl	%ecx, %ecx		C no incoming carry
L(start_1c):
	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_DIVISOR, %esi	C esi = d, kept for the whole routine

	movl	%edi, SAVE_EDI

	shrl	%eax			C d/2

	andl	$INV_TABLE_MASK, %eax	C table index

ifdef(`PIC',`
	LEA(	binvert_limb_table, %edi)
	movzbl	(%eax,%edi), %edi		C table byte, 8-bit inv
',`
	movzbl	binvert_limb_table(%eax), %edi	C table byte, 8-bit inv
')

	xorl	%edx, %edx		C carry limb starts at zero
	leal	(%edi,%edi), %eax	C 2*inv

	imull	%edi, %edi		C inv*inv

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%ebx, SAVE_EBX
	movl	PARAM_SRC, %ebx

	imull	%esi, %edi		C inv*inv*d

	subl	%edi, %eax		C first step, inv = 2*inv - inv*inv*d
	leal	(%eax,%eax), %edi	C 2*inv

	imull	%eax, %eax		C inv*inv

	imull	%esi, %eax		C inv*inv*d

	leal	(%ebx,%ebp,4), %ebx	C ebx = src + 4*size, one past the end
	negl	%ebp			C ebp = -size, counts up to zero

	subl	%eax, %edi		C second step, inv = 2*inv - inv*inv*d

	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
	movl	%esi, %eax
	imull	%edi, %eax
	cmpl	$1, %eax')


C Loop-carried dependency through edx:
C
C                            cycles
C	subl	%edx, %eax	1
C	imull	%edi, %eax	4
C	mull	%esi		6  (high limb)
C			      ----
C       total		       11
C
C The source limb load is off this chain and out of order execution
C covers its latency, so the loop needs no special scheduling.

L(top):
	C Register use on each pass:
	C eax	limb fetched from src, then scratch
	C ebx	pointer just past the last src limb
	C ecx	carry bit for this limb, 0 or 1 (first pass: the
	C	initial carry, which may be any limb value)
	C edx	carry limb, high half of the previous mull
	C esi	divisor
	C edi	inverse of the divisor
	C ebp	negative count of limbs still to process

	movl	(%ebx,%ebp,4), %eax

	subl	%ecx, %eax		C subtract the carry bit
	movl	$0, %ecx		C movl, not xorl: flags must survive

	setc	%cl			C borrow from the subl above

	subl	%edx, %eax		C subtract the carry limb
	adcl	$0, %ecx		C add in the borrow from that too

	imull	%edi, %eax		C low limb times inverse

	mull	%esi			C times divisor, edx = next carry limb

	incl	%ebp
	jnz	L(top)


	C Return value is carry bit plus carry limb.  Restore the saved
	C registers and release the frame.

	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	leal	(%ecx,%edx), %eax

	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()
ASM_END()