dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
C     product (measured on the speed difference between 17 and 33 limbs,
C     which is roughly the Karatsuba recursing range).


dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
dnl  code supports.  This value is used only by the tune program to know
dnl  what it can go up to.  (An attempt to compile with a bigger value will
dnl  trigger some m4_assert()s in the code, making the build fail.)
dnl
dnl  The value is determined by requiring the displacements in the unrolled
dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)


dnl  Allow a value from the tune program to override config.m4.

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')


dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
dnl
dnl  The first addmul is the biggest, and this takes the second least
dnl  significant limb and multiplies it by the third least significant and
dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1 limbs,
dnl  UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
C
C The code size might look a bit excessive, but not all of it is executed
C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 35x35 that do need all of it will
C at least be getting value for money, because 35x35 spends something like
C 5780 cycles here.
C
C Different values of UNROLL_COUNT give slightly different speeds, between
C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
C This isn't a big difference, but it's presumably some alignment effect
C which, if understood, could give a simple speedup.
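C
C As a rough reference for the structure below, the method can be sketched
C in C as follows, with n the size argument.  mpn_mul_1 and mpn_addmul_1
C have their usual GMP meanings (multiply, or multiply-and-accumulate,
C returning the carry limb); lshift1 and add_diag are hypothetical helpers
C standing for the final two passes coded at the end of this file.
C
C	cy = mpn_mul_1 (dst+1, src+1, n-1, src[0]);   /* cross products, row 0 */
C	dst[n] = cy;
C	for (i = 1; i <= n-2; i++)                    /* rows 1 .. n-2 */
C	  {
C	    cy = mpn_addmul_1 (dst+2*i+1, src+i+1, n-1-i, src[i]);
C	    dst[n+i] = cy;
C	  }
C	dst[2*n-1] = lshift1 (dst+1, 2*n-2);          /* double the cross products */
C	add_diag (dst, src, n);                       /* add src[i]^2 at dst[2*i] */
C
C The code below follows this structure, except that the last two rows are
C handled by the L(corner) code rather than by the unrolled addmul.
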
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax

	cmpl	$2, %ecx
	je	L(two_limbs)

	movl	PARAM_DST, %edx
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src
	C ebx
	C ecx	size
	C edx	dst

	movl	(%eax), %eax
	movl	%edx, %ecx

	mull	%eax

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst

	pushl	%ebx
	movl	%eax, %ebx	C src
deflit(`FRAME',4)

	movl	(%ebx), %eax
	movl	PARAM_DST, %ecx

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)
	movl	4(%ebx), %edx

	mull	%edx		C src[0]*src[1]

	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)

	popl	%ebx
	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)

	ret


C -----------------------------------------------------------------------------
L(three_or_more):
deflit(`FRAME',0)
	cmpl	$4, %ecx
	jae	L(four_or_more)


C -----------------------------------------------------------------------------
C three limbs
	C eax	src
	C ecx	size
	C edx	dst

	pushl	%ebx
	movl	%eax, %ebx	C src

	movl	(%ebx), %eax
	movl	%edx, %ecx	C dst

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)
	pushl	%esi

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax

	movl	%edx, 12(%ecx)
	pushl	%edi

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax

	movl	%edx, 20(%ecx)
	movl	4(%ebx), %edx

	mull	%edx		C src[0] * src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax

	movl	%edx, %edi
	movl	8(%ebx), %edx

	pushl	%ebp
	xorl	%ebp, %ebp

	mull	%edx		C src[0] * src[2]

	addl	%eax, %edi
	movl	4(%ebx), %eax

	adcl	%edx, %ebp

	movl	8(%ebx), %edx

	mull	%edx		C src[1] * src[2]

	addl	%eax, %ebp

	adcl	$0, %edx


	C eax	will be dst[5]
	C ebx
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	xorl	%eax, %eax
	addl	%esi, %esi
	adcl	%edi, %edi
	adcl	%ebp, %ebp
	adcl	%edx, %edx
	adcl	$0, %eax

	addl	%esi, 4(%ecx)
	adcl	%edi, 8(%ecx)
	adcl	%ebp, 12(%ecx)

	popl	%ebp
	popl	%edi

	adcl	%edx, 16(%ecx)

	popl	%esi
	popl	%ebx

	adcl	%eax, 20(%ecx)
	ASSERT(nc)

	ret


C -----------------------------------------------------------------------------

defframe(SAVE_EBX,   -4)
defframe(SAVE_ESI,   -8)
defframe(SAVE_EDI,  -12)
defframe(SAVE_EBP,  -16)
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,   -24)
deflit(STACK_SPACE, 24)

	ALIGN(16)
L(four_or_more):

	C eax	src
	C ebx
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C
C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
C a 5780 cycle operation, which is not surprising since the loop here is 8
C c/l and mpn_mul_1 is 6.25 c/l.
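C
C In C, this first pass is simply the following (an illustrative sketch
C only, with "limb" a 32-bit type and cy playing the role of %esi):
C
C	limb m = src[0], cy = 0;
C	for (j = 1; j < n; j++)
C	  {
C	    uint64_t t = (uint64_t) src[j] * m + cy;
C	    dst[j] = (limb) t;
C	    cy = (limb) (t >> 32);
C	  }
C	dst[n] = cy;	/* stored after the loop, by the movl %esi, (%edi) below */
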
	subl	$STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	leal	4(%edx), %edi

	movl	%ebx, SAVE_EBX
	leal	4(%eax), %ebx

	movl	%esi, SAVE_ESI
	xorl	%esi, %esi

	movl	%ebp, SAVE_EBP

	C eax
	C ebx	src+4
	C ecx	size
	C edx
	C esi
	C edi	dst+4
	C ebp

	movl	(%eax), %ebp	C multiplier
	leal	-1(%ecx), %ecx	C size-1, and pad to a 16 byte boundary


	ALIGN(16)
L(mul_1):
	C eax	scratch
	C ebx	src ptr
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	dst ptr
	C ebp	multiplier

	movl	(%ebx), %eax
	addl	$4, %ebx

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi)
	addl	$4, %edi

	loop	L(mul_1)


C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as in mpn_addmul_1(); see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop counter, running from -(size-4) to -1,
C inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K6 doesn't do any branch prediction on indirect jumps, which is good
C actually because it's a different target each time.  The unrolled addmul
C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
C the indirect jump is quickly recovered.
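C
C To make the jump arithmetic concrete (a gloss on the code, using the
C CODE_BYTES_PER_LIMB = 15 chunk size defined just below): the pass whose
C VAR_COUNTER value is c (negative) must execute 1-c chunks, so it enters
C the unrolled code at
C
C	entry = L(unroll_inner_end) - 15*(1-c)
C
C Stepping VAR_JMP by one chunk per pass gives exactly this.  The initial
C value is formed by the leal below as -16*(size-4) + (size-4), i.e.
C -15*(size-4) from a shift and add, which is the implicit encoding of
C CODE_BYTES_PER_LIMB referred to in the next comment.

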
dnl  This value is also implicitly encoded in a shift and add.
dnl
deflit(CODE_BYTES_PER_LIMB, 15)

dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31.  Above that an offset must be added to them.
dnl
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))

	C eax
	C ebx	&src[size]
	C ecx
	C edx
	C esi	carry
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	movl	%esi, (%edi)

	subl	$4, %ecx
	jz	L(corner)

	movl	%ecx, %edx
ifelse(OFFSET,0,,
`	subl	$OFFSET, %ebx')

	shll	$4, %ecx
ifelse(OFFSET,0,,
`	subl	$OFFSET, %edi')

	negl	%ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
	negl	%edx


	C The calculated jump mustn't be before the start of the available
	C code.  This is the limitation UNROLL_COUNT puts on the src operand
	C size, but checked here using the jump address directly.
	C
	ASSERT(ae,`
	movl_text_address( L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx
	')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	&src[size], constant
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	high limb to store
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER

	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand

	mull	%ebp

	testb	$1, %cl

	movl	%edx, %esi	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx

	movl	%edx, VAR_JMP
	leal	4(%edi), %edi

	C A branch-free version of this using some xors was found to be a
	C touch slower than just a conditional jump, despite the jump
	C switching between taken and not taken on every loop.

ifelse(eval(UNROLL_COUNT%2),0,
	jz,jnz)	L(unroll_noswap)
	movl	%esi, %eax	C high,low carry other way around

	movl	%ecx, %esi
	movl	%eax, %ecx
L(unroll_noswap):

	jmp	*%edx


	C Must be on an even address here so the low bit of the jump address
	C will indicate which way around ecx/esi should start.
	C
	C An attempt was made at padding here to get the end of the unrolled
	C code to come out on a good alignment, to save padding before
	C L(corner).  This worked, but turned out to run slower than just an
	C ALIGN(2).  The reason for this is not clear; it might be related
	C to the different speeds on different UNROLL_COUNTs noted above.

	ALIGN(2)

L(unroll_inner_start):
	C eax	scratch
	C ebx	src
	C ecx	carry low
	C edx	scratch
	C esi	carry high
	C edi	dst
	C ebp	multiplier
	C
	C 15 code bytes each limb
	C ecx/esi swapped on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%ebx), %eax)
	mull	%ebp
Zdisp(	addl,	%esi, disp_dst,(%edi))
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0(	%esi)
',`
	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%ebx), %eax)
	mull	%ebp
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0(	%ecx)
')
')
L(unroll_inner_end):

	addl	%esi, -4+OFFSET(%edi)

	movl	VAR_COUNTER, %edx
	jadcl0(	%ecx)

	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)
	movl	VAR_JMP, %ecx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %ebx
	addl	$OFFSET, %edi
')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(corner):
	C ebx	&src[size]
	C edi	&dst[2*size-5]

	movl	-12(%ebx), %ebp

	movl	-8(%ebx), %eax
	movl	%eax, %ecx

	mull	%ebp

	addl	%eax, -4(%edi)
	adcl	$0, %edx

	movl	-4(%ebx), %eax
	movl	%edx, %esi
	movl	%eax, %ebx

	mull	%ebp

	addl	%esi, %eax
	adcl	$0, %edx

	addl	%eax, (%edi)
	adcl	$0, %edx

	movl	%edx, %esi
	movl	%ebx, %eax

	mull	%ecx

	addl	%esi, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx

	movl	%edx, 8(%edi)


C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
C The loop measures about 6 cycles/iteration, though it looks like it should
C decode in 5.
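C
C In C, this shift and the diagonal pass following it are roughly as below
C (illustrative only, same names as the sketches above; cy is first the
C rcll carry bit, then the carry limb kept in %edx):
C
C	cy = 0;                              /* left shift by 1 bit */
C	for (i = 1; i <= 2*n-2; i++)
C	  {
C	    limb hi = dst[i] >> 31;
C	    dst[i] = (dst[i] << 1) | cy;
C	    cy = hi;
C	  }
C	dst[2*n-1] = cy;
C
C	t = (uint64_t) src[0] * src[0];      /* diagonal squares */
C	dst[0] = (limb) t;                   /* dst[0] not set until now */
C	cy = (limb) (t >> 32);
C	for (i = 1; i < n; i++)
C	  {
C	    t = (uint64_t) src[i] * src[i];
C	    uint64_t s = (uint64_t) dst[2*i-1] + cy;
C	    dst[2*i-1] = (limb) s;
C	    s = (uint64_t) dst[2*i] + (limb) t + (s >> 32);
C	    dst[2*i] = (limb) s;
C	    cy = (limb) (t >> 32) + (limb) (s >> 32);
C	  }
C	dst[2*n-1] += cy;                    /* no carry out of this */
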
L(lshift_start):
	movl	PARAM_SIZE, %ecx

	movl	PARAM_DST, %edi
	subl	$1, %ecx		C size-1 and clear carry

	movl	PARAM_SRC, %ebx
	movl	%ecx, %edx

	xorl	%eax, %eax		C ready for adcl


	ALIGN(16)
L(lshift):
	C eax
	C ebx	src (for later use)
	C ecx	counter, decrementing
	C edx	size-1 (for later use)
	C esi
	C edi	dst, incrementing
	C ebp

	rcll	4(%edi)
	rcll	8(%edi)
	leal	8(%edi), %edi
	loop	L(lshift)


	adcl	%eax, %eax

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%ebx), %eax		C src[0]

	leal	4(%ebx,%edx,4), %ebx	C &src[size]
	subl	%edx, %ecx		C -(size-1)


C -----------------------------------------------------------------------------
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.


	mull	%eax

	movl	%eax, (%edi,%ecx,8)	C dst[0]


	ALIGN(16)
L(diag):
	C eax	scratch
	C ebx	&src[size]
	C ecx	counter, negative
	C edx	carry
	C esi	scratch
	C edi	dst[2*size-2]
	C ebp

	movl	(%ebx,%ecx,4), %eax
	movl	%edx, %esi

	mull	%eax

	addl	%esi, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_EBX, %ebx
	movl	SAVE_ESI, %esi

	addl	%edx, 4(%edi)		C dst most significant limb

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp
	ret


C -----------------------------------------------------------------------------
ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	addl	(%esp), %ecx
	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
	addl	%edx, %ecx
	ret_internal
')


EPILOGUE()