1dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. 2 3dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C odd even divisor 35C P6: 10.0 12.0 cycles/limb 36 37 38C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 39C mp_limb_t divisor); 40C 41C The odd case is basically the same as mpn_modexact_1_odd, just with an 42C extra store, and it runs at the same 10 cycles which is the dependent 43C chain. 44C 45C The shifts for the even case aren't on the dependent chain so in principle 46C it could run the same too, but nothing running at 10 has been found. 47C Perhaps there's too many uops (an extra 4 over the odd case). 48 49defframe(PARAM_DIVISOR,16) 50defframe(PARAM_SIZE, 12) 51defframe(PARAM_SRC, 8) 52defframe(PARAM_DST, 4) 53 54defframe(SAVE_EBX, -4) 55defframe(SAVE_ESI, -8) 56defframe(SAVE_EDI, -12) 57defframe(SAVE_EBP, -16) 58defframe(VAR_INVERSE, -20) 59deflit(STACK_SPACE, 20) 60 61 TEXT 62 63 ALIGN(16) 64PROLOGUE(mpn_divexact_1) 65deflit(`FRAME',0) 66 67 movl PARAM_DIVISOR, %eax 68 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) 69 70 movl %esi, SAVE_ESI 71 movl PARAM_SRC, %esi 72 73 movl %ebx, SAVE_EBX 74 movl PARAM_SIZE, %ebx 75 76 bsfl %eax, %ecx C trailing twos 77 78 movl %ebp, SAVE_EBP 79 80 shrl %cl, %eax C d without twos 81 82 movl %eax, %edx 83 shrl %eax C d/2 without twos 84 85 movl %edx, PARAM_DIVISOR 86 andl $127, %eax 87 88ifdef(`PIC',` 89 LEA( binvert_limb_table, %ebp) 90 movzbl (%eax,%ebp), %ebp C inv 8 bits 91',` 92 movzbl binvert_limb_table(%eax), %ebp C inv 8 bits 93') 94 95 leal (%ebp,%ebp), %eax C 2*inv 96 97 imull %ebp, %ebp C inv*inv 98 99 movl %edi, SAVE_EDI 100 movl PARAM_DST, %edi 101 102 leal (%esi,%ebx,4), %esi C src end 103 104 imull PARAM_DIVISOR, %ebp C inv*inv*d 105 106 subl %ebp, %eax C inv = 2*inv - inv*inv*d 107 leal (%eax,%eax), %ebp C 2*inv 108 109 imull %eax, %eax C inv*inv 110 111 leal (%edi,%ebx,4), %edi C dst end 112 negl %ebx C -size 113 114 movl %edi, PARAM_DST 115 116 imull PARAM_DIVISOR, %eax C inv*inv*d 117 118 subl %eax, %ebp C inv = 2*inv - inv*inv*d 119 120 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 121 movl PARAM_DIVISOR, %eax 122 imull %ebp, %eax 123 cmpl $1, %eax') 124 125 movl %ebp, VAR_INVERSE 126 movl (%esi,%ebx,4), %eax C src[0] 127 128 orl %ecx, %ecx 129 jnz L(even) 130 131 C ecx initial carry is zero 132 jmp L(odd_entry) 133 134 135C The dependent chain here is 136C 137C subl %edx, %eax 1 138C imull %ebp, %eax 4 139C mull PARAM_DIVISOR 5 140C ---- 141C total 10 142C 143C and this is the measured speed. No special scheduling is necessary, out 144C of order execution hides the load latency. 145 146L(odd_top): 147 C eax scratch (src limb) 148 C ebx counter, limbs, negative 149 C ecx carry bit 150 C edx carry limb, high of last product 151 C esi &src[size] 152 C edi &dst[size] 153 C ebp 154 155 mull PARAM_DIVISOR 156 157 movl (%esi,%ebx,4), %eax 158 subl %ecx, %eax 159 160 sbbl %ecx, %ecx 161 subl %edx, %eax 162 163 sbbl $0, %ecx 164 165L(odd_entry): 166 imull VAR_INVERSE, %eax 167 168 movl %eax, (%edi,%ebx,4) 169 negl %ecx 170 171 incl %ebx 172 jnz L(odd_top) 173 174 175 movl SAVE_ESI, %esi 176 177 movl SAVE_EDI, %edi 178 179 movl SAVE_EBP, %ebp 180 181 movl SAVE_EBX, %ebx 182 addl $STACK_SPACE, %esp 183 184 ret 185 186 187L(even): 188 C eax src[0] 189 C ebx counter, limbs, negative 190 C ecx shift 191 C edx 192 C esi 193 C edi 194 C ebp 195 196 xorl %ebp, %ebp C initial carry bit 197 xorl %edx, %edx C initial carry limb (for size==1) 198 199 incl %ebx 200 jz L(even_one) 201 202 movl (%esi,%ebx,4), %edi C src[1] 203 204 shrdl( %cl, %edi, %eax) 205 206 jmp L(even_entry) 207 208 209L(even_top): 210 C eax scratch 211 C ebx counter, limbs, negative 212 C ecx shift 213 C edx scratch 214 C esi &src[size] 215 C edi &dst[size] and scratch 216 C ebp carry bit 217 218 movl (%esi,%ebx,4), %edi 219 220 mull PARAM_DIVISOR 221 222 movl -4(%esi,%ebx,4), %eax 223 shrdl( %cl, %edi, %eax) 224 225 subl %ebp, %eax 226 227 sbbl %ebp, %ebp 228 subl %edx, %eax 229 230 sbbl $0, %ebp 231 232L(even_entry): 233 imull VAR_INVERSE, %eax 234 235 movl PARAM_DST, %edi 236 negl %ebp 237 238 movl %eax, -4(%edi,%ebx,4) 239 incl %ebx 240 jnz L(even_top) 241 242 243 244 mull PARAM_DIVISOR 245 246 movl -4(%esi), %eax 247 248L(even_one): 249 shrl %cl, %eax 250 movl SAVE_ESI, %esi 251 252 subl %ebp, %eax 253 movl SAVE_EBP, %ebp 254 255 subl %edx, %eax 256 movl SAVE_EBX, %ebx 257 258 imull VAR_INVERSE, %eax 259 260 movl %eax, -4(%edi) 261 movl SAVE_EDI, %edi 262 addl $STACK_SPACE, %esp 263 264 ret 265 266EPILOGUE() 267ASM_END() 268