dnl  Intel Pentium mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C         divisor
C       odd   even
C P54:  24.5  30.5   cycles/limb
C P55:  23.0  28.0

MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)

C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
C expected.  On P54 in the even case the shrdl pairing nonsense (see
C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
C further 1.5 slowdown for both odd and even.

C Overview:
C
C Both entry points divide the size-limb number at src by a single limb and
C store size quotient limbs at dst, working from the low limb upwards.  Each
C quotient limb is obtained by multiplying by the inverse of the odd part of
C the divisor modulo 2^GMP_LIMB_BITS, after subtracting the high limb of the
C previous quotient limb times the divisor (plus a borrow bit).
C
C mpn_bdiv_q_1 derives the shift count and the inverse from the divisor and
C then jumps into mpn_pi1_bdiv_q_1 at L(common).  mpn_pi1_bdiv_q_1 takes
C both as arguments.
C
C The instruction order and the blank-line grouping follow the original
C Pentium scheduling and should not be rearranged casually.

C Stack arguments.  The defframe macro comes from the included m4
C definitions; the offsets below are relative to the stack pointer at
C function entry and the FRAME_pushl/FRAME_popl calls keep them valid across
C pushes and pops.  PARAM_INVERSE and PARAM_SHIFT are only read by the
C mpn_pi1_bdiv_q_1 entry point.

defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space
dnl  (the dst pointer is copied to edi before the inverse overwrites its slot)
define(VAR_INVERSE,`PARAM_DST')

	TEXT

	ALIGN(32)
C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C		    mp_limb_t divisor);
C
C Strips the low zero bits from divisor, computes the inverse of the
C remaining odd value, stores that odd value back into the divisor argument
C slot and continues at L(common) with
C	eax = inverse, ebx = size, ecx = number of twos stripped
C and with ebx and ebp already saved on the stack.
C
C divisor must be non-zero, otherwise the strip loop would never see a one
C bit (this is what the ASSERT in the loop checks).
C
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

	movl	$-1, %ecx		C first incl below brings this to 0
	movl	PARAM_DIVISOR, %eax

L(strip_twos):
	ASSERT(nz, `orl	%eax, %eax')
	shrl	%eax			C low bit goes to the carry flag
	incl	%ecx			C shift count

	jnc	L(strip_twos)		C incl leaves the carry flag alone

	C Here eax is the odd divisor with its low one bit shifted out.

	leal	1(%eax,%eax), %edx	C d
	andl	$127, %eax		C d/2, 7 bits

	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()

	C Table lookup, indexed by bits 1 to 7 of d, for the initial inverse.
ifdef(`PIC',`
	call	L(here)
L(here):
	popl	%ebp			C eip

	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
	C AGI
	movl	binvert_limb_table@GOT(%ebp), %ebp
	C AGI
	movzbl	(%eax,%ebp), %eax
',`

dnl non-PIC
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C Two Newton steps inv = 2*inv - inv*inv*d.  Each step doubles the
	C number of correct low bits, taking the 8 bit table value to a full
	C limb, as verified by the ASSERT further down.

	movl	%eax, %ebp		C inv
	addl	%eax, %eax		C 2*inv

	imull	%ebp, %ebp		C inv*inv

	imull	%edx, %ebp		C inv*inv*d

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	movl	PARAM_SIZE, %ebx

	movl	%eax, %ebp
	addl	%eax, %eax		C 2*inv

	imull	%ebp, %ebp		C inv*inv

	imull	%edx, %ebp		C inv*inv*d

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	movl	%edx, PARAM_DIVISOR	C d without twos

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	C Two registers have been pushed here, exactly as on the
	C mpn_pi1_bdiv_q_1 path, so both arrive at L(common) with the same
	C stack frame.
	jmp	L(common)
EPILOGUE()

C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
C
C Same operation as mpn_bdiv_q_1 but with the inverse and the shift count
C supplied by the caller.  The divisor argument is multiplied as it stands,
C so the caller presumably passes it with the twos already stripped, the
C way mpn_bdiv_q_1 leaves it.
C
C ebx, ebp, esi and edi are saved and restored.  No instruction sets up a
C dedicated return value; on return eax still holds the last quotient limb
C stored.
C
	ALIGN(32)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_SHIFT, %ecx

	pushl	%ebx	FRAME_pushl()
	pushl	%ebp	FRAME_pushl()

	movl	PARAM_SIZE, %ebx
	movl	PARAM_INVERSE, %eax

L(common):
	C eax	inverse
	C ebx	size
	C ecx	shift count
	C PARAM_DIVISOR holds the divisor used by the mull instructions below

	pushl	%esi	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi
	movl	%eax, VAR_INVERSE	C overwrites the dst slot, dst is in edi

	leal	(%esi,%ebx,4), %esi	C src end
	leal	(%edi,%ebx,4), %edi	C dst end

	negl	%ebx			C -size

	xorl	%ebp, %ebp		C initial carry bit

	orl	%ecx, %ecx		C shift
	movl	(%esi,%ebx,4), %eax	C src low limb
	jz	L(odd_entry)		C no twos, use the loop without shifting

	xorl	%edx, %edx		C initial carry limb (for even, if one)
	incl	%ebx
	jz	L(one)			C size 1, only the final step is needed

	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
	shrdl(	%cl, %edx, %eax)

	jmp	L(even_entry)


	ALIGN(8)
L(odd_top):
	C eax	previous quotient limb on arrival, then scratch
	C ebx	counter, limbs, negative
	C ecx	shift count, zero on this path and not used
	C edx	scratch, high limb of quotient limb times divisor
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1

	mull	PARAM_DIVISOR

	movl	(%esi,%ebx,4), %eax
	subl	%ebp, %edx		C add the carry bit to the carry limb

	subl	%edx, %eax

	sbbl	%ebp, %ebp		C new carry bit, 0 or -1

L(odd_entry):
	imull	VAR_INVERSE, %eax

	movl	%eax, (%edi,%ebx,4)

	incl	%ebx
	jnz	L(odd_top)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret

L(even_top):
	C eax	previous quotient limb on arrival, then scratch
	C ebx	counter, limbs, negative
	C ecx	twos
	C edx	scratch, high limb of quotient limb times divisor
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1

	mull	PARAM_DIVISOR

	subl	%ebp, %edx		C carry bit
	movl	-4(%esi,%ebx,4), %eax	C src limb

	movl	(%esi,%ebx,4), %ebp	C and one above it

	shrdl(	%cl, %ebp, %eax)

	subl	%edx, %eax		C carry limb

	sbbl	%ebp, %ebp		C new carry bit, 0 or -1

L(even_entry):
	C ebx is one ahead of the limb being produced, hence the -4 offsets
	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi,%ebx,4)
	incl	%ebx

	jnz	L(even_top)

	C Final limb of the even case, there is no limb above it to shift in.

	mull	PARAM_DIVISOR

	movl	-4(%esi), %eax		C src high limb
	subl	%ebp, %edx

L(one):
	shrl	%cl, %eax

	subl	%edx, %eax		C no carry if division is exact

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C dst high limb
	nop				C protect against cache bank clash

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret

EPILOGUE()