1dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division. 2 3dnl Copyright 2001, 2002 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C divisor 24C odd even 25C P54: 24.5 30.5 cycles/limb 26C P55: 23.0 28.0 27 28 29C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 30C mp_limb_t divisor); 31C 32C Plain divl is used for small sizes, since the inverse takes a while to 33C setup. Multiplying works out faster for size>=3 when the divisor is odd, 34C or size>=4 when the divisor is even. Actually on P55 size==2 for odd or 35C size==3 for even are about the same speed for both divl or mul, but the 36C former is used since it will use up less code cache. 37C 38C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as 39C expected. On P54 in the even case the shrdl pairing nonsense (see 40C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a 41C further 1.5 slowdown for both odd and even. 42 43defframe(PARAM_DIVISOR,16) 44defframe(PARAM_SIZE, 12) 45defframe(PARAM_SRC, 8) 46defframe(PARAM_DST, 4) 47 48dnl re-use parameter space 49define(VAR_INVERSE,`PARAM_DST') 50 51 TEXT 52 53 ALIGN(32) 54PROLOGUE(mpn_divexact_1) 55deflit(`FRAME',0) 56 57 movl PARAM_DIVISOR, %eax 58 movl PARAM_SIZE, %ecx 59 60 pushl %esi FRAME_pushl() 61 push %edi FRAME_pushl() 62 63 movl PARAM_SRC, %esi 64 andl $1, %eax 65 66 movl PARAM_DST, %edi 67 addl %ecx, %eax C size if even, size+1 if odd 68 69 cmpl $4, %eax 70 jae L(mul_by_inverse) 71 72 73 xorl %edx, %edx 74L(div_top): 75 movl -4(%esi,%ecx,4), %eax 76 77 divl PARAM_DIVISOR 78 79 movl %eax, -4(%edi,%ecx,4) 80 decl %ecx 81 82 jnz L(div_top) 83 84 popl %edi 85 popl %esi 86 87 ret 88 89 90 91L(mul_by_inverse): 92 movl PARAM_DIVISOR, %eax 93 movl $-1, %ecx 94 95L(strip_twos): 96 ASSERT(nz, `orl %eax, %eax') 97 shrl %eax 98 incl %ecx C shift count 99 100 jnc L(strip_twos) 101 102 leal 1(%eax,%eax), %edx C d 103 andl $127, %eax C d/2, 7 bits 104 105 pushl %ebx FRAME_pushl() 106 pushl %ebp FRAME_pushl() 107 108ifdef(`PIC',` 109 call L(here) 110L(here): 111 popl %ebp C eip 112 113 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp 114 C AGI 115 movl binvert_limb_table@GOT(%ebp), %ebp 116 C AGI 117 movzbl (%eax,%ebp), %eax 118',` 119 120dnl non-PIC 121 movzbl binvert_limb_table(%eax), %eax C inv 8 bits 122') 123 124 movl %eax, %ebp C inv 125 addl %eax, %eax C 2*inv 126 127 imull %ebp, %ebp C inv*inv 128 129 imull %edx, %ebp C inv*inv*d 130 131 subl %ebp, %eax C inv = 2*inv - inv*inv*d 132 movl PARAM_SIZE, %ebx 133 134 movl %eax, %ebp 135 addl %eax, %eax C 2*inv 136 137 imull %ebp, %ebp C inv*inv 138 139 imull %edx, %ebp C inv*inv*d 140 141 subl %ebp, %eax C inv = 2*inv - inv*inv*d 142 movl %edx, PARAM_DIVISOR C d without twos 143 144 leal (%esi,%ebx,4), %esi C src end 145 leal (%edi,%ebx,4), %edi C dst end 146 147 negl %ebx C -size 148 149 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS 150 pushl %eax FRAME_pushl() 151 imull PARAM_DIVISOR, %eax 152 cmpl $1, %eax 153 popl %eax FRAME_popl()') 154 155 movl %eax, VAR_INVERSE 156 xorl %ebp, %ebp C initial carry bit 157 158 movl (%esi,%ebx,4), %eax C src low limb 159 orl %ecx, %ecx C shift 160 161 movl 4(%esi,%ebx,4), %edx C src second limb (for even) 162 jz L(odd_entry) 163 164 shrdl( %cl, %edx, %eax) 165 166 incl %ebx 167 jmp L(even_entry) 168 169 170 ALIGN(8) 171L(odd_top): 172 C eax scratch 173 C ebx counter, limbs, negative 174 C ecx 175 C edx 176 C esi src end 177 C edi dst end 178 C ebp carry bit, 0 or -1 179 180 mull PARAM_DIVISOR 181 182 movl (%esi,%ebx,4), %eax 183 subl %ebp, %edx 184 185 subl %edx, %eax 186 187 sbbl %ebp, %ebp 188 189L(odd_entry): 190 imull VAR_INVERSE, %eax 191 192 movl %eax, (%edi,%ebx,4) 193 194 incl %ebx 195 jnz L(odd_top) 196 197 198 popl %ebp 199 popl %ebx 200 201 popl %edi 202 popl %esi 203 204 ret 205 206 207L(even_top): 208 C eax scratch 209 C ebx counter, limbs, negative 210 C ecx twos 211 C edx 212 C esi src end 213 C edi dst end 214 C ebp carry bit, 0 or -1 215 216 mull PARAM_DIVISOR 217 218 subl %ebp, %edx C carry bit 219 movl -4(%esi,%ebx,4), %eax C src limb 220 221 movl (%esi,%ebx,4), %ebp C and one above it 222 223 shrdl( %cl, %ebp, %eax) 224 225 subl %edx, %eax C carry limb 226 227 sbbl %ebp, %ebp 228 229L(even_entry): 230 imull VAR_INVERSE, %eax 231 232 movl %eax, -4(%edi,%ebx,4) 233 incl %ebx 234 235 jnz L(even_top) 236 237 238 239 mull PARAM_DIVISOR 240 241 movl -4(%esi), %eax C src high limb 242 subl %ebp, %edx 243 244 shrl %cl, %eax 245 246 subl %edx, %eax C no carry if division is exact 247 248 imull VAR_INVERSE, %eax 249 250 movl %eax, -4(%edi) C dst high limb 251 nop C protect against cache bank clash 252 253 popl %ebp 254 popl %ebx 255 256 popl %edi 257 popl %esi 258 259 ret 260 261EPILOGUE() 262