1dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division. 2 3dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C divisor 35C odd even 36C K6: 10.0 12.0 cycles/limb 37C K6-2: 10.0 11.5 38 39 40C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 41C mp_limb_t divisor); 42C 43C A simple divl is used for size==1. This is about 10 cycles faster for an 44C odd divisor or 20 cycles for an even divisor. 45C 46C The loops are quite sensitive to code alignment, speeds should be 47C rechecked (odd and even divisor, pic and non-pic) if contemplating 48C changing anything. 
C Stack-frame offsets of the C parameters (32-bit cdecl: arguments pushed
C right to left, return address at 0(%esp) on entry, FRAME tracks extra
C pushes so the offsets stay valid throughout).
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  re-use parameter space
dnl  PARAM_DST is loaded into %edi before either loop starts, so its stack
dnl  slot is recycled to hold the computed modular inverse of the divisor.
define(VAR_INVERSE,`PARAM_DST')

	TEXT

	ALIGN(32)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

C Exact division: the quotient is computed as
C     dst[i] = (src[i] - borrow) * inverse  mod 2^32
C where inverse = 1/d mod 2^32 for the odd part d of the divisor
C (Jebelean's exact-division method).  This is only valid when the
C divisor exactly divides the src operand, which is the documented
C precondition of mpn_divexact_1.

	movl	PARAM_SIZE, %ecx

	movl	PARAM_SRC, %eax
	xorl	%edx, %edx		C high half 0 for the divl below

	cmpl	$1, %ecx
	jnz	L(two_or_more)

	C size==1: a plain 32/32 divide is cheaper than setting up the
	C inverse (about 10 cycles saved for odd d, 20 for even d).
	movl	(%eax), %eax

	divl	PARAM_DIVISOR

	movl	PARAM_DST, %ecx
	movl	%eax, (%ecx)

	ret


L(two_or_more):
	movl	PARAM_DIVISOR, %eax
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC, %ebx
	pushl	%ebp		FRAME_pushl()

C Strip factors of 2 from the divisor; %edx counts the iterations and so
C ends as shift+1 (the loop always runs at least once).
L(strip_twos):
	shrl	%eax
	incl	%edx			C will get shift+1

	jnc	L(strip_twos)
	pushl	%esi		FRAME_pushl()

	C The final shrl moved the lowest set bit into carry, so %eax now
	C holds floor(d_odd/2); restore d_odd = 2*%eax+1 into %esi.
	leal	1(%eax,%eax), %esi		C d without twos
	andl	$127, %eax			C d/2, 7 bits

C Table lookup gives an inverse of d_odd good to 8 bits:
C binvert_limb_table[d/2 mod 128] * d_odd == 1 mod 2^8.
ifdef(`PIC',`
	LEA(	binvert_limb_table, %ebp)
Zdisp(	movzbl,	0,(%eax,%ebp), %eax)
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')
	pushl	%edi		FRAME_pushl()

C Two Newton iterations inv = 2*inv - inv*inv*d, each doubling the number
C of correct low bits: 8 -> 16 -> 32.  Loads/address setup are interleaved
C with the imuls to hide their latency on K6.
	leal	(%eax,%eax), %ebp		C 2*inv

	imull	%eax, %eax			C inv*inv

	movl	PARAM_DST, %edi

	imull	%esi, %eax			C inv*inv*d

	subl	%eax, %ebp			C inv = 2*inv - inv*inv*d, 16 bits
	leal	(%ebp,%ebp), %eax		C 2*inv

	imull	%ebp, %ebp			C inv*inv

	movl	%esi, PARAM_DIVISOR		C d without twos
	leal	(%ebx,%ecx,4), %ebx		C src end

	imull	%esi, %ebp			C inv*inv*d

	leal	(%edi,%ecx,4), %edi		C dst end
	negl	%ecx				C -size

	subl	%ebp, %eax			C inv = 2*inv - inv*inv*d, 32 bits
	subl	$1, %edx			C shift amount, and clear carry

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE
	jnz	L(even)			C shift!=0 means an even divisor

	movl	(%ebx,%ecx,4), %esi		C src low limb
	jmp	L(odd_entry)


C Odd-divisor main loop: about 10 cycles/limb on K6 (see header).  Both
C the src and dst pointers address through their end, indexed by the
C negative counter %ecx, so one incl serves as induction and exit test.
	ALIGN(16)
	nop	C code alignment
L(odd_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1

	imull	%edx, %esi		C quotient limb = limb * inverse

	movl	PARAM_DIVISOR, %eax
	movl	%esi, -4(%edi,%ecx,4)	C store quotient limb

	mull	%esi			C carry limb in edx

	subl	%ebp, %edx		C apply carry bit
	movl	(%ebx,%ecx,4), %esi

L(odd_entry):
	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1, borrow from the subl above

	incl	%ecx
	jnz	L(odd_top)


	C Last limb: only the low product is needed, no carry propagates out.
	imull	%edx, %esi

	movl	%esi, -4(%edi,%ecx,4)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret


C Even divisor: each source limb must first be shifted right by the
C stripped twos count.  A 64-bit MMX psrlq on a two-limb window supplies
C each limb with its high neighbour's low bits shifted in.
L(even):
	C eax
	C ebx	src end
	C ecx	-size
	C edx	twos
	C esi
	C edi	dst end
	C ebp

	xorl	%ebp, %ebp		C no borrow initially
Zdisp(	movq,	0,(%ebx,%ecx,4), %mm0)	C src[0,1]

	movd	%edx, %mm7		C shift count for psrlq
	movl	VAR_INVERSE, %edx

	addl	$2, %ecx		C consumed two limbs; sets Z if size==2
	psrlq	%mm7, %mm0

	movd	%mm0, %esi		C src[0] shifted down
	jz	L(even_two)		C if only two limbs


C Out-of-order execution is good enough to hide the load/rshift/movd
C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
C been found.  Maybe the fact every second movq is unaligned costs the extra
C 0.5.

L(even_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1
	C
	C mm0	scratch, source limbs
	C mm7	twos

	imull	%edx, %esi		C quotient limb = limb * inverse

	movl	%esi, -8(%edi,%ecx,4)	C store quotient limb
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movq	-4(%ebx,%ecx,4), %mm0	C next two limbs (half-overlapping)
	psrlq	%mm7, %mm0

	movd	%mm0, %esi		C next source limb, shifted
	subl	%ebp, %edx		C apply carry bit

	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1, borrow from the subl above

	incl	%ecx
	jnz	L(even_top)


L(even_two):
	C Final two limbs: the high limb gets only its own shifted bits
	C (nothing above it to shift in), and the last product needs no
	C outgoing carry.
	movd	-4(%ebx), %mm0		C src high limb
	psrlq	%mm7, %mm0

	imull	%edx, %esi

	movl	%esi, -8(%edi)
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movd	%mm0, %esi
	subl	%ebp, %edx		C apply carry bit

	movl	VAR_INVERSE, %eax
	subl	%edx, %esi		C apply carry limb

	imull	%eax, %esi

	movl	%esi, -4(%edi)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	emms_or_femms			C leave FPU state clean for callers

	ret

EPILOGUE()
ASM_END()