dnl  AMD64 mpn_mul_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.

dnl  Copyright 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.375
C AMD K10	 2.375
C Intel P4	15-16
C Intel core2	 4.45
C Intel corei	 4.35
C Intel atom	 ?
C VIA nano	 4.5

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Use fewer registers.  (how??? I can't see it -- david)
C  * Avoid some "mov $0,r" and instead use "xor r,r".
C  * Can the top of each L(addmul_outer_n) prologue be folded into the
C    mul_1/mul_2 prologues, saving a LEA (%rip)?  It would slow down the
C    case where vn = 1 or 2; is it worth it?
53 54C INPUT PARAMETERS 55define(`rp', `%rdi') 56define(`up', `%rsi') 57define(`un_param',`%rdx') 58define(`vp', `%rcx') 59define(`vn', `%r8') 60 61define(`v0', `%r12') 62define(`v1', `%r9') 63 64define(`w0', `%rbx') 65define(`w1', `%r15') 66define(`w2', `%rbp') 67define(`w3', `%r10') 68 69define(`n', `%r11') 70define(`outer_addr', `%r14') 71define(`un', `%r13') 72 73ABI_SUPPORT(DOS64) 74ABI_SUPPORT(STD64) 75 76ASM_START() 77 TEXT 78 ALIGN(16) 79PROLOGUE(mpn_mul_basecase) 80 FUNC_ENTRY(4) 81IFDOS(` mov 56(%rsp), %r8d ') 82 push %rbx 83 push %rbp 84 push %r12 85 push %r13 86 push %r14 87 push %r15 88 89 xor R32(un), R32(un) 90 mov (up), %rax 91 mov (vp), v0 92 93 sub un_param, un C rdx used by mul 94 mov un, n 95 mov R32(un_param), R32(w0) 96 97 lea (rp,un_param,8), rp 98 lea (up,un_param,8), up 99 100 mul v0 101 102 test $1, R8(vn) 103 jz L(mul_2) 104 105C =========================================================== 106C mul_1 for vp[0] if vn is odd 107 108L(mul_1): 109 and $3, R32(w0) 110 jz L(mul_1_prologue_0) 111 cmp $2, R32(w0) 112 jc L(mul_1_prologue_1) 113 jz L(mul_1_prologue_2) 114 115L(mul_1_prologue_3): 116 add $-1, n 117 lea L(addmul_outer_3)(%rip), outer_addr 118 mov %rax, w3 119 mov %rdx, w0 120 jmp L(mul_1_entry_3) 121 122L(mul_1_prologue_0): 123 mov %rax, w2 124 mov %rdx, w3 C note: already w0 == 0 125 lea L(addmul_outer_0)(%rip), outer_addr 126 jmp L(mul_1_entry_0) 127 128L(mul_1_prologue_1): 129 cmp $-1, un 130 jne 2f 131 mov %rax, -8(rp) 132 mov %rdx, (rp) 133 jmp L(ret) 1342: add $1, n 135 lea L(addmul_outer_1)(%rip), outer_addr 136 mov %rax, w1 137 mov %rdx, w2 138 xor R32(w3), R32(w3) 139 mov (up,n,8), %rax 140 jmp L(mul_1_entry_1) 141 142L(mul_1_prologue_2): 143 add $-2, n 144 lea L(addmul_outer_2)(%rip), outer_addr 145 mov %rax, w0 146 mov %rdx, w1 147 mov 24(up,n,8), %rax 148 xor R32(w2), R32(w2) 149 xor R32(w3), R32(w3) 150 jmp L(mul_1_entry_2) 151 152 153 C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments 154 155 ALIGN(16) 
156L(mul_1_top): 157 mov w0, -16(rp,n,8) 158 add %rax, w1 159 mov (up,n,8), %rax 160 adc %rdx, w2 161L(mul_1_entry_1): 162 xor R32(w0), R32(w0) 163 mul v0 164 mov w1, -8(rp,n,8) 165 add %rax, w2 166 adc %rdx, w3 167L(mul_1_entry_0): 168 mov 8(up,n,8), %rax 169 mul v0 170 mov w2, (rp,n,8) 171 add %rax, w3 172 adc %rdx, w0 173L(mul_1_entry_3): 174 mov 16(up,n,8), %rax 175 mul v0 176 mov w3, 8(rp,n,8) 177 xor R32(w2), R32(w2) C zero 178 mov w2, w3 C zero 179 add %rax, w0 180 mov 24(up,n,8), %rax 181 mov w2, w1 C zero 182 adc %rdx, w1 183L(mul_1_entry_2): 184 mul v0 185 add $4, n 186 js L(mul_1_top) 187 188 mov w0, -16(rp) 189 add %rax, w1 190 mov w1, -8(rp) 191 adc %rdx, w2 192 mov w2, (rp) 193 194 add $-1, vn C vn -= 1 195 jz L(ret) 196 197 mov 8(vp), v0 198 mov 16(vp), v1 199 200 lea 8(vp), vp C vp += 1 201 lea 8(rp), rp C rp += 1 202 203 jmp *outer_addr 204 205C =========================================================== 206C mul_2 for vp[0], vp[1] if vn is even 207 208 ALIGN(16) 209L(mul_2): 210 mov 8(vp), v1 211 212 and $3, R32(w0) 213 jz L(mul_2_prologue_0) 214 cmp $2, R32(w0) 215 jz L(mul_2_prologue_2) 216 jc L(mul_2_prologue_1) 217 218L(mul_2_prologue_3): 219 lea L(addmul_outer_3)(%rip), outer_addr 220 add $2, n 221 mov %rax, -16(rp,n,8) 222 mov %rdx, w2 223 xor R32(w3), R32(w3) 224 xor R32(w0), R32(w0) 225 mov -16(up,n,8), %rax 226 jmp L(mul_2_entry_3) 227 228 ALIGN(16) 229L(mul_2_prologue_0): 230 add $3, n 231 mov %rax, w0 232 mov %rdx, w1 233 xor R32(w2), R32(w2) 234 mov -24(up,n,8), %rax 235 lea L(addmul_outer_0)(%rip), outer_addr 236 jmp L(mul_2_entry_0) 237 238 ALIGN(16) 239L(mul_2_prologue_1): 240 mov %rax, w3 241 mov %rdx, w0 242 xor R32(w1), R32(w1) 243 lea L(addmul_outer_1)(%rip), outer_addr 244 jmp L(mul_2_entry_1) 245 246 ALIGN(16) 247L(mul_2_prologue_2): 248 add $1, n 249 lea L(addmul_outer_2)(%rip), outer_addr 250 mov $0, R32(w0) 251 mov $0, R32(w1) 252 mov %rax, w2 253 mov -8(up,n,8), %rax 254 mov %rdx, w3 255 jmp L(mul_2_entry_2) 256 257 C this 
loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments 258 259 ALIGN(16) 260L(mul_2_top): 261 mov -32(up,n,8), %rax 262 mul v1 263 add %rax, w0 264 adc %rdx, w1 265 mov -24(up,n,8), %rax 266 xor R32(w2), R32(w2) 267 mul v0 268 add %rax, w0 269 mov -24(up,n,8), %rax 270 adc %rdx, w1 271 adc $0, R32(w2) 272L(mul_2_entry_0): 273 mul v1 274 add %rax, w1 275 mov w0, -24(rp,n,8) 276 adc %rdx, w2 277 mov -16(up,n,8), %rax 278 mul v0 279 mov $0, R32(w3) 280 add %rax, w1 281 adc %rdx, w2 282 mov -16(up,n,8), %rax 283 adc $0, R32(w3) 284 mov $0, R32(w0) 285 mov w1, -16(rp,n,8) 286L(mul_2_entry_3): 287 mul v1 288 add %rax, w2 289 mov -8(up,n,8), %rax 290 adc %rdx, w3 291 mov $0, R32(w1) 292 mul v0 293 add %rax, w2 294 mov -8(up,n,8), %rax 295 adc %rdx, w3 296 adc R32(w1), R32(w0) C adc $0, w0 297L(mul_2_entry_2): 298 mul v1 299 add %rax, w3 300 mov w2, -8(rp,n,8) 301 adc %rdx, w0 302 mov (up,n,8), %rax 303 mul v0 304 add %rax, w3 305 adc %rdx, w0 306 adc $0, R32(w1) 307L(mul_2_entry_1): 308 add $4, n 309 mov w3, -32(rp,n,8) 310 js L(mul_2_top) 311 312 mov -32(up,n,8), %rax C FIXME: n is constant 313 mul v1 314 add %rax, w0 315 mov w0, (rp) 316 adc %rdx, w1 317 mov w1, 8(rp) 318 319 add $-2, vn C vn -= 2 320 jz L(ret) 321 322 mov 16(vp), v0 323 mov 24(vp), v1 324 325 lea 16(vp), vp C vp += 2 326 lea 16(rp), rp C rp += 2 327 328 jmp *outer_addr 329 330 331C =========================================================== 332C addmul_2 for remaining vp's 333 334 C in the following prologues, we reuse un to store the 335 C adjusted value of n that is reloaded on each iteration 336 337L(addmul_outer_0): 338 add $3, un 339 lea 0(%rip), outer_addr 340 341 mov un, n 342 mov -24(up,un,8), %rax 343 mul v0 344 mov %rax, w0 345 mov -24(up,un,8), %rax 346 mov %rdx, w1 347 xor R32(w2), R32(w2) 348 jmp L(addmul_entry_0) 349 350L(addmul_outer_1): 351 mov un, n 352 mov (up,un,8), %rax 353 mul v0 354 mov %rax, w3 355 mov (up,un,8), %rax 356 mov %rdx, w0 357 xor R32(w1), R32(w1) 358 jmp 
L(addmul_entry_1) 359 360L(addmul_outer_2): 361 add $1, un 362 lea 0(%rip), outer_addr 363 364 mov un, n 365 mov -8(up,un,8), %rax 366 mul v0 367 xor R32(w0), R32(w0) 368 mov %rax, w2 369 xor R32(w1), R32(w1) 370 mov %rdx, w3 371 mov -8(up,un,8), %rax 372 jmp L(addmul_entry_2) 373 374L(addmul_outer_3): 375 add $2, un 376 lea 0(%rip), outer_addr 377 378 mov un, n 379 mov -16(up,un,8), %rax 380 xor R32(w3), R32(w3) 381 mul v0 382 mov %rax, w1 383 mov -16(up,un,8), %rax 384 mov %rdx, w2 385 jmp L(addmul_entry_3) 386 387 C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments 388 389 ALIGN(16) 390L(addmul_top): 391 add w3, -32(rp,n,8) 392 adc %rax, w0 393 mov -24(up,n,8), %rax 394 adc %rdx, w1 395 xor R32(w2), R32(w2) 396 mul v0 397 add %rax, w0 398 mov -24(up,n,8), %rax 399 adc %rdx, w1 400 adc R32(w2), R32(w2) C adc $0, w2 401L(addmul_entry_0): 402 mul v1 403 xor R32(w3), R32(w3) 404 add w0, -24(rp,n,8) 405 adc %rax, w1 406 mov -16(up,n,8), %rax 407 adc %rdx, w2 408 mul v0 409 add %rax, w1 410 mov -16(up,n,8), %rax 411 adc %rdx, w2 412 adc $0, R32(w3) 413L(addmul_entry_3): 414 mul v1 415 add w1, -16(rp,n,8) 416 adc %rax, w2 417 mov -8(up,n,8), %rax 418 adc %rdx, w3 419 mul v0 420 xor R32(w0), R32(w0) 421 add %rax, w2 422 adc %rdx, w3 423 mov $0, R32(w1) 424 mov -8(up,n,8), %rax 425 adc R32(w1), R32(w0) C adc $0, w0 426L(addmul_entry_2): 427 mul v1 428 add w2, -8(rp,n,8) 429 adc %rax, w3 430 adc %rdx, w0 431 mov (up,n,8), %rax 432 mul v0 433 add %rax, w3 434 mov (up,n,8), %rax 435 adc %rdx, w0 436 adc $0, R32(w1) 437L(addmul_entry_1): 438 mul v1 439 add $4, n 440 js L(addmul_top) 441 442 add w3, -8(rp) 443 adc %rax, w0 444 mov w0, (rp) 445 adc %rdx, w1 446 mov w1, 8(rp) 447 448 add $-2, vn C vn -= 2 449 jz L(ret) 450 451 lea 16(rp), rp C rp += 2 452 lea 16(vp), vp C vp += 2 453 454 mov (vp), v0 455 mov 8(vp), v1 456 457 jmp *outer_addr 458 459 ALIGN(16) 460L(ret): pop %r15 461 pop %r14 462 pop %r13 463 pop %r12 464 pop %rbp 465 pop %rbx 466 FUNC_EXIT() 
467 ret 468 469EPILOGUE() 470