dnl x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.

dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
dnl
dnl Copyright 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
C    4 large loops into one; we could use it for the outer loop branch.
C  * Optimise code outside of inner loops.
C  * Write combined addmul_1 feed-in and wind-down code, and use it when
C    iterating each outer loop.  ("Overlapping software pipelining")
C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
C    all pushes.
C  * Perhaps write special code for n < M, for some small M.
C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
C    with even less pipelined code.
C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
C    Consider breaking out earlier, saving the high cost of short loops.

C void mpn_sqr_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xn);
C
C Compute the 2*xn-limb square {wp,2*xn} = {xp,xn}^2, for small xn.
C
C Overall structure:
C   1. An inlined mul_1 writes the first row up[0]*up[1..n-1] into rp[1..],
C      entered through one of four 4-way-unrolled loops L(lm0)..L(lm3)
C      selected by (xn-1) mod 4, so no in-loop alignment branches are needed.
C   2. Inlined addmul_1 outer loops L(ol0)..L(ol3) (again one copy per
C      alignment residue, chained into each other) accumulate the remaining
C      rows of the triangular product, ending with a 2x1 addmul (L(re2),
C      L(re1)).
C   3. From L(done), a final loop doubles the triangular product and adds
C      the diagonal squares up[i]^2, carrying via adc/rcr through n.

define(`rp', `%edi')
define(`up', `%esi')
define(`n',  `%ecx')

define(`un', `%ebp')

C Register usage:
C   rp  (%edi)  result pointer (advanced through the output)
C   up  (%esi)  source pointer
C   n   (%ecx)  outer loop counter, kept negated; incremented once per pass
C   un  (%ebp)  inner loop counter, negated and stepped by 4 (4-way unroll);
C               also used as a known-zero register in the wind-down code
C   %mm7        invariant U limb of the current mul_1/addmul_1 pass
C   %mm6        running 64-bit accumulator in the mul_1 loops
C               (low word = limb to store, high word = carry)
C   %mm0,%mm1   32x32->64 partial products

	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	push	%edi
	push	%esi
	mov	12(%esp), rp
	mov	16(%esp), up
	mov	20(%esp), n

	lea	4(rp), rp		C write triangular product starting at rp[1]
	dec	n			C n = xn-1 limbs remain above up[0]
	movd	(up), %mm7		C mm7 = up[0], first invariant limb

	jz	L(one)			C xn = 1: single-limb square
	lea	4(up), up
	push	%ebx
	push	%ebp
	mov	n, %eax

	movd	(up), %mm0
	neg	n			C keep n negated throughout
	pmuludq	%mm7, %mm0
	pxor	%mm6, %mm6		C clear accumulator/carry
	mov	n, un

	and	$3, %eax		C dispatch on (xn-1) mod 4
	jz	L(of0)
	cmp	$2, %eax
	jc	L(of1)
	jz	L(of2)

C ================================================================
C mul_1 feed-in for (xn-1) mod 4 == 3: enter at L(m3), loop at L(lm3).
	jmp	L(m3)
	ALIGN(16)
L(lm3):	movd	-4(up), %mm0		C next U limb
	pmuludq	%mm7, %mm0		C u[i] * up[0]
	psrlq	$32, %mm6		C shift carry into low word
	lea	16(rp), rp
	paddq	%mm0, %mm6		C accumulate product + carry
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)		C store one product limb
	psrlq	$32, %mm6
L(m3):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un			C 4 limbs per iteration
	movd	%mm6, 8(rp)
	lea	16(up), up
	js	L(lm3)

	psrlq	$32, %mm6
	movd	%mm6, 12(rp)		C final carry limb of this row

	inc	n
C	jz	L(done)
	lea	-12(up), up
	lea	4(rp), rp
	jmp	L(ol2)

C ================================================================
C mul_1 feed-in for (xn-1) mod 4 == 0: enter at L(of0), loop at L(lm0).
	ALIGN(16)
L(lm0):	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
L(of0):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	12(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 12(rp)
	lea	16(up), up
	js	L(lm0)

	psrlq	$32, %mm6
	movd	%mm6, 16(rp)

	inc	n
C	jz	L(done)
	lea	-8(up), up
	lea	8(rp), rp
	jmp	L(ol3)

C ================================================================
C mul_1 feed-in for (xn-1) mod 4 == 1: enter at L(of1), loop at L(lm1).
	ALIGN(16)
L(lm1):	movd	-12(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -12(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of1):	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, (rp)
	lea	16(up), up
	js	L(lm1)

	psrlq	$32, %mm6
	movd	%mm6, 4(rp)

	inc	n
	jz	L(done)			C goes away when we add special n=2 code
	lea	-20(up), up
	lea	-4(rp), rp
	jmp	L(ol0)

C ================================================================
C mul_1 feed-in for (xn-1) mod 4 == 2: enter at L(of2), loop at L(lm2);
C falls through into outer loop L(ol1).
	ALIGN(16)
L(lm2):	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of2):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 4(rp)
	lea	16(up), up
	js	L(lm2)

	psrlq	$32, %mm6
	movd	%mm6, 8(rp)

	inc	n
C	jz	L(done)
	lea	-16(up), up
C	lea	(rp), rp
C	jmp	L(ol1)

C ================================================================
C addmul_1 outer loops.  Each L(olK) rewinds up/rp, loads the next
C invariant limb into mm7, and runs a 4-way-unrolled addmul_1 whose carry
C is threaded through EFLAGS (adc) in eax/ebx/edx; the loops chain
C ol1 -> ol0 -> ol3 -> ol2 -> ol1 as n shrinks by one each pass.

L(ol1):	lea	4(up,n,4), up
	movd	(up), %mm7		C read next U invariant limb
	lea	8(rp,n,4), rp
	mov	n, un

	movd	4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un			C un = limb count / 4 (negated)
	movd	%mm1, %ebx
	inc	un
	jz	L(re1)			C only 2x1 addmul left

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx		C zero edx and CF
	jmp	L(a1)

L(la1):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
L(a1):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un			C inc preserves CF for the adc chain
	movd	4(up), %mm1
	jnz	L(la1)

	adc	un, %edx		C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)		C store top limb of this row

	inc	n

C ================================================================

L(ol0):	lea	(up,n,4), up
	movd	4(up), %mm7		C read next U invariant limb
	lea	4(rp,n,4), rp
	mov	n, un

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un
	movd	12(up), %mm1
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	xor	%edx, %edx		C zero edx and CF
	jmp	L(a0)

L(la0):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
L(a0):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la0)

	adc	un, %edx		C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol3):	lea	12(up,n,4), up
	movd	-8(up), %mm7		C read next U invariant limb
	lea	(rp,n,4), rp		C put rp back
	mov	n, un

	movd	-4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	movd	(up), %mm0
	xor	%edx, %edx		C zero edx and CF
	jmp	L(a3)

L(la3):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
L(a3):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la3)

	adc	un, %edx		C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol2):	lea	8(up,n,4), up
	movd	-4(up), %mm7		C read next U invariant limb
	lea	12(rp,n,4), rp
	mov	n, un

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx
	sar	$2, un
	movd	4(up), %mm1
	test	un, un			C clear carry
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	inc	un
	jnz	L(a2)
	jmp	L(re2)			C only 3x1 addmul left

L(la2):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
L(a2):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la2)

	adc	un, %edx		C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n
	jmp	L(ol1)

C ================================================================
C Wind-down: last 3x1 addmul (L(re2)) flows into the last 2x1 (L(re1)).
L(re2):	psrlq	$32, %mm0
	movd	(up), %mm7		C read next U invariant limb
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	lea	4(rp), rp
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	movd	4(up), %mm1
	adc	un, %eax
	add	%ebx, (rp)
	pmuludq	%mm7, %mm1
	adc	un, %eax
	mov	%eax, 4(rp)
	movd	%mm1, %ebx

L(re1):	psrlq	$32, %mm1
	add	%ebx, 4(rp)
	movd	%mm1, %eax
	adc	un, %eax
	xor	n, n			C make n zeroness assumption below true
	mov	%eax, 8(rp)

C ================================================================
C Final phase: double the triangular product and add the diagonal squares
C up[i]^2, two source limbs per iteration.  Two carry chains are needed;
C one is parked in n between iterations (adc n,n restores it into CF-order,
C rcr n saves CF back into n).
L(done):				C n is zero here
	mov	24(%esp), up		C reload xp (4 regs pushed: args at 20/24/28)
	mov	28(%esp), %eax		C eax = xn

	movd	(up), %mm0
	inc	%eax
	pmuludq	%mm0, %mm0		C up[0]^2
	lea	4(up), up
	mov	20(%esp), rp		C reload wp
	shr	%eax			C eax = (xn+1)/2; CF tells the parity
	movd	%mm0, (rp)		C wp[0] = low(up[0]^2)
	psrlq	$32, %mm0
	lea	-12(rp), rp
	mov	%eax, 28(%esp)		C reuse the xn stack slot as loop counter
	jnc	L(odd)

	C even xn: handle one extra square here, then enter loop at L(ent)
	movd	%mm0, %ebp
	movd	(up), %mm0
	lea	8(rp), rp
	pmuludq	%mm0, %mm0
	lea	-4(up), up
	add	8(rp), %ebp
	movd	%mm0, %edx
	adc	12(rp), %edx
	rcr	n			C save CF in n
	jmp	L(ent)

C	ALIGN(16)			C alignment seems irrelevant
L(top):	movd	(up), %mm1
	adc	n, n			C restore saved carry
	movd	%mm0, %eax
	pmuludq	%mm1, %mm1		C next diagonal square
	movd	4(up), %mm0
	adc	(rp), %eax
	movd	%mm1, %ebx
	pmuludq	%mm0, %mm0
	psrlq	$32, %mm1
	adc	4(rp), %ebx
	movd	%mm1, %ebp
	movd	%mm0, %edx
	adc	8(rp), %ebp
	adc	12(rp), %edx
	rcr	n			C FIXME: isn't this awfully slow on atom???
	adc	%eax, (rp)		C second add doubles the cross products
	adc	%ebx, 4(rp)
L(ent):	lea	8(up), up
	adc	%ebp, 8(rp)
	psrlq	$32, %mm0
	adc	%edx, 12(rp)
L(odd):	decl	28(%esp)
	lea	16(rp), rp
	jnz	L(top)

L(end):	adc	n, n
	movd	%mm0, %eax
	adc	n, %eax
	mov	%eax, (rp)		C top result limb

L(rtn):	emms
	pop	%ebp
	pop	%ebx
	pop	%esi
	pop	%edi
	ret

L(one):	pmuludq	%mm7, %mm7		C xn = 1: wp[0..1] = up[0]^2
	movq	%mm7, -4(rp)		C rp was advanced by 4, so this is wp[0]
	emms
	pop	%esi
	pop	%edi
	ret
EPILOGUE()