dnl  Intel Pentium mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- mpn by limb exact
dnl  division.

dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.

dnl  Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C		   divisor
C		 odd    even
C P54:		24.5    30.5   cycles/limb
C P55:		23.0    28.0

MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)

C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
C expected.  On P54 in the even case the shrdl pairing nonsense (see
C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
C further 1.5 slowdown for both odd and even.

C Stack arguments, as offsets from the return address.  FRAME tracks the
C pushes done so far, so each FRAME_pushl() below must stay attached to its
C push.
defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

dnl  The dst argument slot is recycled to hold the inverse once dst itself
dnl  has been copied into a register.
define(VAR_INVERSE,`PARAM_DST')

	TEXT

	ALIGN(32)
C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C
C Entry taking a plain divisor.  The low zero bits of the divisor are
C counted into ecx and removed, an inverse of the remaining odd part is
C built (8-bit table value refined by two Newton steps), the odd part is
C written back over the divisor argument, and control joins
C mpn_pi1_bdiv_q_1 at L(common) with
C
C	eax	inverse
C	ebx	size
C	ecx	shift count
C
C and with ebx, ebp already saved on the stack.
C
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

	movl	$-1, %ecx		C becomes 0 on the first pass
	movl	PARAM_DIVISOR, %eax

L(count_twos):
	C a zero value here would keep shifting out zeros forever
	ASSERT(nz, `orl %eax, %eax')
	shrl	%eax			C carry = bit shifted out
	incl	%ecx			C shift count, carry is left alone

	jnc	L(count_twos)		C until a 1 bit has been shifted out

	leal	1(%eax,%eax), %edx	C d, the odd part of the divisor
	andl	$127, %eax		C d/2, 7 bits, the table index

	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()

ifdef(`PIC',`
ifdef(`DARWIN',`
	LEA(	binvert_limb_table, %ebp)
	movzbl	(%eax,%ebp), %eax
',`
	call	L(getpc)
L(getpc):
	popl	%ebp			C eip

	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(getpc)], %ebp
	C AGI
	movl	binvert_limb_table@GOT(%ebp), %ebp
	C AGI
	movzbl	(%eax,%ebp), %eax
')
',`

dnl non-PIC
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C First Newton step.
	movl	%eax, %ebp		C inv
	addl	%eax, %eax		C 2*inv

	imull	%ebp, %ebp		C inv*inv

	imull	%edx, %ebp		C inv*inv*d

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	movl	PARAM_SIZE, %ebx	C size, wanted in ebx at L(common)

	C Second Newton step, same recurrence.
	movl	%eax, %ebp
	addl	%eax, %eax		C 2*inv

	imull	%ebp, %ebp		C inv*inv

	imull	%edx, %ebp		C inv*inv*d

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	movl	%edx, PARAM_DIVISOR	C d without twos, read by the loops

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	jmp	L(common)
EPILOGUE()

C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
C
C Entry taking the inverse and shift count as arguments.  The divisor
C argument is used directly as the multiplier in the loops below.
C
	ALIGN(32)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_SHIFT, %ecx

	pushl	%ebx	FRAME_pushl()
	pushl	%ebp	FRAME_pushl()

	movl	PARAM_SIZE, %ebx
	movl	PARAM_INVERSE, %eax

L(common):
	C eax	inverse
	C ebx	size
	C ecx	shift count
	C ebx and ebp are already on the stack

	pushl	%esi	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi		C fetch dst before its slot is reused
	movl	%eax, VAR_INVERSE

	leal	(%esi,%ebx,4), %esi	C src end
	leal	(%edi,%ebx,4), %edi	C dst end

	negl	%ebx			C -size, counts up towards zero

	xorl	%ebp, %ebp		C initial carry bit

	orl	%ecx, %ecx		C shift, sets the zero flag for jz
	movl	(%esi,%ebx,4), %eax	C src low limb, flags untouched
	jz	L(odd_start)		C no shift wanted

	xorl	%edx, %edx		C initial carry limb (for even, if one)
	incl	%ebx
	jz	L(high_limb)		C size is one limb

	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
	shrdl(	%cl, %edx, %eax)	C shifted low limb

	jmp	L(even_start)


C Loop for a zero shift count.  Each pass multiplies the quotient limb just
C stored by the divisor and subtracts the high half of that product, plus
C the pending borrow, from the next source limb.

	ALIGN(8)
L(odd_loop):
	C eax	quotient limb on entry, then scratch
	C ebx	counter, limbs, negative
	C ecx	shift count, zero on this path
	C edx	scratch
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1

	mull	PARAM_DIVISOR		C edx = high half of q*d

	movl	(%esi,%ebx,4), %eax	C next src limb
	subl	%ebp, %edx		C fold in the carry bit

	subl	%edx, %eax		C src limb minus carry limb

	sbbl	%ebp, %ebp		C new carry bit, 0 or -1

L(odd_start):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	%eax, (%edi,%ebx,4)

	incl	%ebx
	jnz	L(odd_loop)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret


C Loop for a nonzero shift count.  The counter runs one limb ahead here, so
C the limb being formed is at index ebx-1 and the limb above it, supplying
C the bits shifted in, is at index ebx.

L(even_loop):
	C eax	quotient limb on entry, then scratch
	C ebx	counter, limbs, negative
	C ecx	shift count
	C edx	scratch
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1

	mull	PARAM_DIVISOR		C edx = high half of q*d

	subl	%ebp, %edx		C carry bit
	movl	-4(%esi,%ebx,4), %eax	C src limb

	movl	(%esi,%ebx,4), %ebp	C and one above it

	shrdl(	%cl, %ebp, %eax)	C shifted src limb

	subl	%edx, %eax		C carry limb

	sbbl	%ebp, %ebp		C new carry bit, 0 or -1

L(even_start):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	%eax, -4(%edi,%ebx,4)
	incl	%ebx

	jnz	L(even_loop)

	C The high limb has nothing above it, so it gets a plain shift.

	mull	PARAM_DIVISOR

	movl	-4(%esi), %eax		C src high limb
	subl	%ebp, %edx		C carry bit

L(high_limb):
	C Also reached directly for a one limb operand, with edx zero.

	shrl	%cl, %eax

	subl	%edx, %eax		C no carry if division is exact

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C dst high limb
	nop				C protect against cache bank clash

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret

EPILOGUE()
ASM_END()