dnl  x86-64 mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 2004, 2005, 2007-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C		norm	unorm	frac
C AMD K8,K9	13	13	12
C AMD K10	13	13	12
C Intel P4	43	44	43
C Intel core2	24.5	24.5	19.5
C Intel corei	20.5	19.5	18
C Intel atom	43	46	36
C VIA nano	25.5	25.5	24

C mp_limb_t
C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
C               mp_srcptr np, mp_size_t nn, mp_limb_t d)

C mp_limb_t
C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
C                      mp_limb_t dinv, int cnt)

C OVERVIEW
C Both entry points share one body: mpn_preinv_divrem_1 jumps into the loops of
C mpn_divrem_1 at L(nent) or L(uent).  Quotient limbs are stored one per step,
C starting at qp + 8*(nn+fn) - 8 and moving downwards: first one limb per
C source limb, then fn further limbs produced by L(ftop).  The running
C remainder is kept in rax; the value left in rax at L(ret) is that remainder
C shifted right by the count in cl.
C
C Three loops are used:
C   L(ntop)  divisor has its top bit set, source limbs are used as they are
C   L(utop)  divisor was shifted left by cnt, source limbs are shifted to match
C   L(ftop)  fraction limbs, the incoming low limb is implicitly zero

C INPUT PARAMETERS
define(`qp',		`%rdi')
define(`fn_param',	`%rsi')
define(`up_param',	`%rdx')
define(`un_param',	`%rcx')
define(`d',		`%r8')
define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
C       shift passed on stack		C only for mpn_preinv_divrem_1

C WORKING REGISTERS (set up right after the pushes in each prologue)
define(`cnt',		`%rcx')
define(`up',		`%rsi')
define(`fn',		`%r12')
define(`un',		`%rbx')


C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
C         cnt         qp      d  dinv

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C CNTOFF is the stack offset from which mpn_preinv_divrem_1 loads its shift
C count after the four pushes of its prologue.
C NOTE(review): the DOS64 value presumably also covers whatever FUNC_ENTRY
C pushes; FUNC_ENTRY is defined outside this file, confirm there.
IFSTD(`define(`CNTOFF',		`40($1)')')
IFDOS(`define(`CNTOFF',		`104($1)')')

ASM_START()
	TEXT
	ALIGN(16)
C ---------------------------------------------------------------------------
C mpn_preinv_divrem_1: same register setup as mpn_divrem_1, but dinv and the
C shift count are supplied by the caller, so mpn_invert_limb is not called.
C NOTE(review): unlike mpn_divrem_1 there is no nn+fn == 0 check here, and the
C L(uent) path decrements un and loads a source limb unconditionally, so this
C entry appears to rely on the caller passing nn >= 1 -- confirm against the
C callers.
C ---------------------------------------------------------------------------
PROLOGUE(mpn_preinv_divrem_1)
	FUNC_ENTRY(4)
C DOS64 only: fetch d and dinv from the stack.
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	xor	R32(%rax), R32(%rax)	C running remainder starts at 0
	push	%r13			C these four are popped again at L(ret)
	push	%r12
	push	%rbp
	push	%rbx

	mov	fn_param, fn
	mov	un_param, un
	add	fn_param, un_param	C un_param = nn + fn
	mov	up_param, up

	lea	-8(qp,un_param,8), qp	C qp -> highest quotient limb

	test	d, d
	js	L(nent)			C top bit of d set: no-shift loop

	mov	CNTOFF(%rsp), R8(cnt)	C cl = shift count from the stack
	shl	R8(cnt), d		C d <<= cnt
	jmp	L(uent)
EPILOGUE()

	ALIGN(16)
C ---------------------------------------------------------------------------
C mpn_divrem_1: computes the shift count (bsr) and calls mpn_invert_limb on
C the (possibly shifted) divisor, then runs the loops described above.
C Returns with rax = 0 immediately when nn + fn == 0.
C ---------------------------------------------------------------------------
PROLOGUE(mpn_divrem_1)
	FUNC_ENTRY(4)
C DOS64 only: fetch d from the stack.
IFDOS(`	mov	56(%rsp), %r8	')
	xor	R32(%rax), R32(%rax)	C running remainder starts at 0
	push	%r13			C these four are popped again at L(ret)
	push	%r12
	push	%rbp
	push	%rbx

	mov	fn_param, fn
	mov	un_param, un
	add	fn_param, un_param	C un_param = nn + fn, sets ZF
	mov	up_param, up		C mov leaves the flags of the add intact
	je	L(ret)			C nn + fn == 0: nothing to do

	lea	-8(qp,un_param,8), qp	C qp -> highest quotient limb
	xor	R32(%rbp), R32(%rbp)	C rbp = 0

	test	d, d
	jns	L(unnormalized)		C top bit of d clear

C Divisor already has its top bit set.  The highest source limb is handled
C with a single compare/subtract, giving a quotient limb of 0 or 1.
L(normalized):
	test	un, un
	je	L(8)			C un == 0
	mov	-8(up,un,8), %rbp	C rbp = highest source limb
	dec	un
	mov	%rbp, %rax
	sub	d, %rbp			C rbp = limb - d, CF set if limb < d
	cmovc	%rax, %rbp		C limb < d: keep the limb as remainder
	sbb	R32(%rax), R32(%rax)	C eax = -CF
	inc	R32(%rax)		C eax = 1 - CF, i.e. 1 iff limb >= d
	mov	%rax, (qp)		C store highest quotient limb
	lea	-8(qp), qp
C Compute dinv = mpn_invert_limb(d).  qp, up (STD64 only) and d are saved
C around the call; the argument goes in rdi (STD64) or rcx (DOS64, which also
C reserves 32 bytes below the saved d).  rbp holds the remainder across the
C call.
L(8):
IFSTD(`	push	%rdi		')
IFSTD(`	push	%rsi		')
	push	%r8
IFSTD(`	mov	d, %rdi		')
IFDOS(`	sub	$32, %rsp	')
IFDOS(`	mov	d, %rcx		')
C Stack alignment check before the call; ASSERT is defined outside this file.
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	pop	%r8
IFSTD(`	pop	%rsi		')
IFSTD(`	pop	%rdi		')

	mov	%rax, dinv
	mov	%rbp, %rax		C rax = running remainder
	jmp	L(nent)

C No-shift loop.  On entry to each iteration rax = remainder r and
C rbp = r + 1.  Per iteration:
C   rdx:rax = r * dinv + ((r + 1) * 2^64 + limb)   (mul, add, adc)
C   rbp = low half, r13 = high half (quotient candidate)
C   r10 = limb - r13 * d
C   if r10 < rbp (unsigned): remainder = r10, r13 unchanged
C   else:                    remainder = r10 + d, r13 -= 1
C   if remainder >= d:       remainder -= d, r13 += 1   (L(nfx))
C The trailing columns are the timing annotations kept from the original.
	ALIGN(16)
L(ntop):mov	(up,un,8), %r10		C	    K8-K10  P6-CNR P6-NHM  P4
	mul	dinv			C	      0,13   0,20   0,18   0,45
	add	%r10, %rax		C	       4      8      3     12
	adc	%rbp, %rdx		C	       5      9     10     13
	mov	%rax, %rbp		C	       5      9      4     13
	mov	%rdx, %r13		C	       6     11     12     23
	imul	d, %rdx			C	       6     11     11     23
	sub	%rdx, %r10		C	      10     16     14     33
	mov	d, %rax			C
	add	%r10, %rax		C	      11     17     15     34
	cmp	%rbp, %r10		C	      11     17     15     34
	cmovc	%r10, %rax		C	      12     18     16     35
	adc	$-1, %r13		C
	cmp	d, %rax			C
	jae	L(nfx)			C
L(nok):	mov	%r13, (qp)		C
	sub	$8, qp			C
L(nent):lea	1(%rax), %rbp		C
	dec	un			C
	jns	L(ntop)			C

	xor	R32(%rcx), R32(%rcx)	C cnt = 0 for the final shr
	jmp	L(frac)

L(nfx):	sub	d, %rax			C remainder was >= d: one more subtract
	inc	%r13
	jmp	L(nok)

C Divisor has its top bit clear.  If the highest source limb is below d, the
C highest quotient limb is 0 (rbp is still 0 here) and that source limb
C becomes the initial remainder in rbp.
L(unnormalized):
	test	un, un
	je	L(44)
	mov	-8(up,un,8), %rax	C rax = highest source limb
	cmp	d, %rax
	jae	L(44)			C limb >= d: no shortcut
	mov	%rbp, (qp)		C store 0 as highest quotient limb
	mov	%rax, %rbp		C rbp = initial remainder
	lea	-8(qp), qp
C NOTE(review): mov and lea do not change flags, so this je still tests the
C cmp above.  It is only reached when the jae fell through (CF set, hence ZF
C clear), so the branch looks like it can never be taken.  Left as is.
	je	L(ret)
	dec	un
C Shift d and the initial remainder left so that d gets its top bit set.
C bsr gives the index of the highest set bit; after not, the low six bits of
C cl are 63 - index, which is what the shifts use.
L(44):
	bsr	d, %rcx
	not	R32(%rcx)
	shl	R8(%rcx), d
	shl	R8(%rcx), %rbp

C Compute dinv = mpn_invert_limb(shifted d).  The shift count, qp, up (STD64
C only) and d are saved around the call; STD64 adds 8 bytes of padding, DOS64
C reserves 40 bytes.  rbp holds the remainder across the call.
	push	%rcx
IFSTD(`	push	%rdi		')
IFSTD(`	push	%rsi		')
	push	%r8
IFSTD(`	sub	$8, %rsp	')
IFSTD(`	mov	d, %rdi		')
IFDOS(`	sub	$40, %rsp	')
IFDOS(`	mov	d, %rcx		')
C Stack alignment check before the call; ASSERT is defined outside this file.
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')
	pop	%r8
IFSTD(`	pop	%rsi		')
IFSTD(`	pop	%rdi		')
	pop	%rcx

	mov	%rax, dinv
	mov	%rbp, %rax		C rax = shifted running remainder
	test	un, un
	je	L(frac)			C no source limbs left

C Entry to the shifting loop: merge the bits shifted out of the top of the
C highest remaining limb into the remainder.  The shift count is negated for
C the right shift and negated back afterwards.
L(uent):dec	un
	mov	(up,un,8), %rbp
	neg	R32(%rcx)
	shr	R8(%rcx), %rbp		C rbp = limb >> (-cnt mod 64)
	neg	R32(%rcx)
	or	%rbp, %rax
	jmp	L(ent)

C Shifting loop.  rbp holds the limb loaded at L(ent), r10 the next lower
C limb; together they form one limb shifted left by cnt.  The division step
C matches L(ntop) with rbp as the limb, r11 as remainder + 1 and then as the
C low product half.
	ALIGN(16)
L(utop):mov	(up,un,8), %r10
	shl	R8(%rcx), %rbp
	neg	R32(%rcx)
	shr	R8(%rcx), %r10
	neg	R32(%rcx)
	or	%r10, %rbp		C rbp = shifted limb
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11		C r11 = low half
	mov	%rdx, %r13		C r13 = quotient candidate
	imul	d, %rdx
	sub	%rdx, %rbp		C rbp = limb - r13 * d
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax		C rbp < r11: remainder = rbp
	adc	$-1, %r13		C otherwise r13 -= 1, remainder = rbp + d
	cmp	d, %rax
	jae	L(ufx)
L(uok):	mov	%r13, (qp)
	sub	$8, qp
L(ent):	mov	(up,un,8), %rbp
	dec	un
	lea	1(%rax), %r11		C r11 = remainder + 1, flags untouched
	jns	L(utop)

C Last source limb: there is no lower limb to merge in, so only the left
C shift is applied before the same division step.
L(uend):shl	R8(%rcx), %rbp
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	d, %rdx
	sub	%rdx, %rbp
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax
	adc	$-1, %r13
	cmp	d, %rax
	jae	L(efx)
L(eok):	mov	%r13, (qp)
	sub	$8, qp
	jmp	L(frac)

L(ufx):	sub	d, %rax			C remainder was >= d: one more subtract
	inc	%r13
	jmp	L(uok)
L(efx):	sub	d, %rax			C same fix-up for the L(uend) step
	inc	%r13
	jmp	L(eok)

C Fraction loop: runs fn times.  rbp = -d, so the imul yields -(r13 * d)
C directly, which is the step of L(ntop) with a zero limb.  There is no
C second remainder >= d check in this loop.
L(frac):mov	d, %rbp
	neg	%rbp
	jmp	L(fent)

	ALIGN(16)			C	    K8-K10  P6-CNR P6-NHM  P4
L(ftop):mul	dinv			C	      0,12   0,17   0,17
	add	%r11, %rdx		C	       5      8     10
	mov	%rax, %r11		C	       4      8      3
	mov	%rdx, %r13		C	       6      9     11
	imul	%rbp, %rdx		C	       6      9     11
	mov	d, %rax			C
	add	%rdx, %rax		C	      10     14     14
	cmp	%r11, %rdx		C	      10     14     14
	cmovc	%rdx, %rax		C	      11     15     15
	adc	$-1, %r13		C
	mov	%r13, (qp)		C
	sub	$8, qp			C
L(fent):lea	1(%rax), %r11		C
	dec	fn			C
	jns	L(ftop)			C

	shr	R8(%rcx), %rax		C undo the shift on the remainder
L(ret):	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	FUNC_EXIT()
	ret
EPILOGUE()