dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
C product (measured on the speed difference between 20 and 40 limbs,
C which is the Karatsuba recursing range).


dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm; see that file for
dnl  a description.  The only difference here is that UNROLL_COUNT can go up
dnl  to 64 (not 63), making SQR_TOOM2_THRESHOLD_MAX 67.

deflit(SQR_TOOM2_THRESHOLD_MAX, 67)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
C
C The code size might look a bit excessive, but not all of it is executed so
C it won't all get into the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 40x40 that do use the full
C unrolling will at least be making good use of it, because 40x40 will take
C something like 7000 cycles.
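
C For orientation, below is a rough C rendition of the flow used in this
C routine (and in mpn/generic/sqr_basecase.c).  It is an illustrative
C sketch only, not part of the build: sqr_sketch is a hypothetical name,
C and it leans on the documented mpn functions plus umul_ppmm from
C longlong.h.  The real code special-cases sizes 1 to 3 and runs the
C addmul rows through unrolled code, but the arithmetic is the same.
C
C	static void
C	sqr_sketch (mp_ptr dst, mp_srcptr src, mp_size_t n)  /* n >= 2 */
C	{
C	  mp_size_t i;
C	  mp_limb_t cy, hi, lo;
C
C	  /* triangle of cross products src[i]*src[j], i < j */
C	  dst[n] = mpn_mul_1 (dst + 1, src + 1, n - 1, src[0]);
C	  for (i = 1; i < n - 1; i++)
C	    dst[n + i] = mpn_addmul_1 (dst + 2 * i + 1, src + i + 1,
C	                               n - 1 - i, src[i]);
C
C	  /* double the triangle; the bit out becomes dst[2*n-1] */
C	  dst[2 * n - 1] = mpn_lshift (dst + 1, dst + 1, 2 * n - 2, 1);
C
C	  /* add the squares src[i]^2 on the diagonal; dst[0] holds
C	     nothing yet and is simply set */
C	  umul_ppmm (cy, dst[0], src[0], src[0]);
C	  for (i = 1; i < n; i++)
C	    {
C	      umul_ppmm (hi, lo, src[i], src[i]);
C	      hi += mpn_add_1 (dst + 2 * i - 1, dst + 2 * i - 1, 2, cy);
C	      hi += mpn_add_1 (dst + 2 * i, dst + 2 * i, 1, lo);
C	      cy = hi;
C	    }
C	  dst[2 * n - 1] += cy;  /* no carry out of this */
C	}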

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %edx

	movl	PARAM_SRC, %eax

	cmpl	$2, %edx
	movl	PARAM_DST, %ecx
	je	L(two_limbs)

	movl	(%eax), %eax
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src limb
	C ebx
	C ecx	dst
	C edx

	mull	%eax

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	ret


C -----------------------------------------------------------------------------
L(two_limbs):
	C eax	src
	C ebx
	C ecx	dst
	C edx

defframe(SAVE_ESI, -4)
defframe(SAVE_EBX, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(`STACK_SPACE',16)

	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	%eax, %esi
	movl	(%eax), %eax

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%esi), %eax

	movl	%ebx, SAVE_EBX
	movl	%edx, %ebx	C dst[1]

	mull	%eax		C src[1]^2

	movl	%edi, SAVE_EDI
	movl	%eax, %edi	C dst[2]
	movl	(%esi), %eax

	movl	%ebp, SAVE_EBP
	movl	%edx, %ebp	C dst[3]

	mull	4(%esi)		C src[0]*src[1]

	addl	%eax, %ebx
	movl	SAVE_ESI, %esi

	adcl	%edx, %edi

	adcl	$0, %ebp
	addl	%ebx, %eax
	movl	SAVE_EBX, %ebx

	adcl	%edi, %edx
	movl	SAVE_EDI, %edi

	adcl	$0, %ebp

	movl	%eax, 4(%ecx)

	movl	%ebp, 12(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%edx, 8(%ecx)
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
L(three_or_more):
	C eax	src low limb
	C ebx
	C ecx	dst
	C edx	size
deflit(`FRAME',0)

	pushl	%esi	defframe_pushl(`SAVE_ESI')
	cmpl	$4, %edx

	movl	PARAM_SRC, %esi
	jae	L(four_or_more)


C -----------------------------------------------------------------------------
C three limbs

	C eax	src low limb
	C ebx
	C ecx	dst
	C edx
	C esi	src
	C edi
	C ebp

	pushl	%ebp	defframe_pushl(`SAVE_EBP')
	pushl	%edi	defframe_pushl(`SAVE_EDI')

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	movl	4(%esi), %eax
	xorl	%ebp, %ebp

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	%edx, 12(%ecx)
	movl	8(%esi), %eax

	pushl	%ebx	defframe_pushl(`SAVE_EBX')

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	%edx, 20(%ecx)

	movl	(%esi), %eax

	mull	4(%esi)		C src[0] * src[1]

	movl	%eax, %ebx
	movl	%edx, %edi

	movl	(%esi), %eax

	mull	8(%esi)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%edx, %ebp

	adcl	$0, %ebp
	movl	4(%esi), %eax

	mull	8(%esi)		C src[1] * src[2]

	xorl	%esi, %esi
	addl	%eax, %ebp

	C eax
	C ebx	dst[1]
	C ecx	dst
	C edx	dst[4]
	C esi	zero, will be dst[5]
	C edi	dst[2]
	C ebp	dst[3]

	adcl	$0, %edx
	addl	%ebx, %ebx

	adcl	%edi, %edi

	adcl	%ebp, %ebp

	adcl	%edx, %edx
	movl	4(%ecx), %eax

	adcl	$0, %esi
	addl	%ebx, %eax

	movl	%eax, 4(%ecx)
	movl	8(%ecx), %eax

	adcl	%edi, %eax
	movl	12(%ecx), %ebx

	adcl	%ebp, %ebx
	movl	16(%ecx), %edi

	movl	%eax, 8(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%ebx, 12(%ecx)
	movl	SAVE_EBX, %ebx

	adcl	%edx, %edi
	movl	20(%ecx), %eax

	movl	%edi, 16(%ecx)
	movl	SAVE_EDI, %edi

	adcl	%esi, %eax	C no carry out of this
	movl	SAVE_ESI, %esi

	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret



C -----------------------------------------------------------------------------
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,    -24)
deflit(`STACK_SPACE',24)

L(four_or_more):
	C eax	src low limb
	C ebx
	C ecx
	C edx	size
	C esi	src
	C edi
	C ebp
deflit(`FRAME',4)	dnl  %esi already pushed

C First multiply src[0]*src[1..size-1] and store at dst[1..size].

	subl	$STACK_SPACE-FRAME, %esp
deflit(`FRAME',STACK_SPACE)
	movl	$1, %ecx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	subl	%edx, %ecx		C -(size-1)

	movl	%ebp, SAVE_EBP
	movl	$0, %ebx		C initial carry

	leal	(%esi,%edx,4), %esi	C &src[size]
	movl	%eax, %ebp		C multiplier

	leal	-4(%edi,%edx,4), %edi	C &dst[size-1]


C This loop runs at just over 6 c/l.

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, limbs, negative, -(size-1) to -1
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp	multiplier

	movl	%ebp, %eax

	mull	(%esi,%ecx,4)

	addl	%ebx, %eax
	movl	$0, %ebx

	adcl	%edx, %ebx
	movl	%eax, 4(%edi,%ecx,4)

	incl	%ecx
	jnz	L(mul_1)


	movl	%ebx, 4(%edi)


C Addmul src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as mpn_addmul_1(); see that routine for
C some comments.  (Hedged C sketches of the per-limb step and of the
C computed-entry scheme are at the end of this file, after EPILOGUE.)
C
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.

dnl  This is also hard-coded in the address calculation below.
deflit(CODE_BYTES_PER_LIMB, 15)

dnl  With &src[size] and &dst[size-1] pointers, the displacements in the
dnl  unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
dnl  that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>32),1,
eval((UNROLL_COUNT-32)*4),
0))

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp

	movl	PARAM_SIZE, %ecx

	subl	$4, %ecx
	jz	L(corner)

	movl	%ecx, %edx
	negl	%ecx

	shll	$4, %ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
	negl	%edx

ifelse(OFFSET,0,,`subl	$OFFSET, %edi')

	C The calculated jump mustn't be before the start of the available
	C code.  This is the limit that UNROLL_COUNT puts on the src operand
	C size, but checked here using the jump address directly.
	ASSERT(ae,
	`movl_text_address( L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, second highest limb of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER

	movl	-8+OFFSET(%esi,%edx,4), %eax	C first limb of multiplicand

	mull	%ebp

define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl

	movl	%edx, %ebx	C high carry
	leal	4(%edi), %edi

	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx

	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse
	movl	%edx, VAR_JMP
	jmp	*%edx


	C Must be on an even address here so the low bit of the jump address
	C will indicate which way around ecx/ebx should start.

	ALIGN(2)

L(unroll_inner_start):
	C eax	scratch
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src pointer
	C edi	dst pointer
	C ebp	multiplier
	C
	C 15 code bytes each limb
	C ecx/ebx reversed on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
Zdisp(	addl,	%ebx, disp_dst,(%edi))
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
',`
	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx
')
')
L(unroll_inner_end):

	addl	%ebx, m4_empty_if_zero(OFFSET)(%edi)

	movl	VAR_COUNTER, %edx
	adcl	$0, %ecx

	movl	%ecx, m4_empty_if_zero(OFFSET+4)(%edi)
	movl	VAR_JMP, %ecx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(corner):
	C eax
	C ebx
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[2*size-5]
	C ebp

	movl	-12(%esi), %eax

	mull	-8(%esi)		C src[size-3]*src[size-2]

	addl	%eax, (%edi)
	movl	-12(%esi), %eax
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)		C src[size-3]*src[size-1]

	addl	%eax, %ebx
	movl	-8(%esi), %eax

	adcl	$0, %edx

	addl	%ebx, 4(%edi)
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)		C src[size-2]*src[size-1]

	movl	PARAM_SIZE, %ecx
	addl	%ebx, %eax

	adcl	$0, %edx

	movl	%eax, 8(%edi)

	movl	%edx, 12(%edi)
	movl	PARAM_DST, %edi


C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
	subl	$1, %ecx		C size-1
	xorl	%eax, %eax		C ready for final adcl, and clear carry

	movl	%ecx, %edx
	movl	PARAM_SRC, %esi


L(lshift):
	C eax
	C ebx
	C ecx	counter, size-1 to 1
	C edx	size-1 (for later use)
	C esi	src (for later use)
	C edi	dst, incrementing
	C ebp

	rcll	4(%edi)
	rcll	8(%edi)

	leal	8(%edi), %edi
	decl	%ecx
	jnz	L(lshift)


	adcl	%eax, %eax

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%esi), %eax		C src[0]

	leal	4(%esi,%edx,4), %esi	C &src[size]
	subl	%edx, %ecx		C -(size-1)


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all, and just gets the
C low limb of src[0]^2.


	mull	%eax

	movl	%eax, (%edi,%ecx,8)	C dst[0]


L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	&src[size]
	C edi	dst[2*size-2]
	C ebp

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, 4(%edi)		C dst most significant limb

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp
	ret



C -----------------------------------------------------------------------------
ifdef(`PIC',`
L(pic_calc):
	addl	(%esp), %ecx
	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
	addl	%edx, %ecx
	ret_internal
')


EPILOGUE()
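

C Each 15-byte chunk of the unrolled inner loop performs one limb-step of
C mpn_addmul_1.  As a reference, here is a hedged C sketch of that step
C (illustrative only; addmul_1_sketch is a hypothetical name, and
C umul_ppmm comes from longlong.h).  The asm keeps the carry in two
C registers, ecx and ebx, whose roles swap on each chunk so no move is
C needed between chunks; that is why the low bit of the computed jump
C address selects the initial orientation via the cmovX pair.
C
C	static mp_limb_t
C	addmul_1_sketch (mp_ptr dst, mp_srcptr src, mp_size_t n,
C	                 mp_limb_t mult)
C	{
C	  mp_limb_t cy = 0, hi, lo;
C	  mp_size_t i;
C
C	  for (i = 0; i < n; i++)
C	    {
C	      umul_ppmm (hi, lo, src[i], mult);  /* mull %ebp */
C	      lo += cy;
C	      cy = hi + (lo < cy);   /* hi <= B-2, cannot overflow */
C	      dst[i] += lo;
C	      cy += (dst[i] < lo);
C	    }
C	  return cy;
C	}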
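
C The computed jump in L(unroll_outer_top) enters the unrolled chunks
C part-way through, so exactly the right number of limb-steps run, and
C VAR_JMP steps forward by CODE_BYTES_PER_LIMB as each addmul row gets
C one limb shorter.  A hypothetical C picture of that entry trick, using
C switch fall-through on a 4-way unrolling (assumes n >= 1; the real code
C unrolls UNROLL_COUNT chunks and jumps to a computed address instead):
C
C	#define STEP(i)                              \
C	  do {                                       \
C	    umul_ppmm (hi, lo, src[i], mult);        \
C	    lo += cy;                                \
C	    cy = hi + (lo < cy);                     \
C	    dst[i] += lo;                            \
C	    cy += (dst[i] < lo);                     \
C	  } while (0)
C
C	static mp_limb_t
C	addmul_1_unrolled (mp_ptr dst, mp_srcptr src, mp_size_t n,
C	                   mp_limb_t mult)
C	{
C	  mp_limb_t cy = 0, hi, lo;
C	  mp_size_t i = 0;
C
C	  switch (n % 4)   /* analogue of "jmp *%edx" */
C	    {
C	      do {
C	  case 0: STEP (i); i++;
C	  case 3: STEP (i); i++;
C	  case 2: STEP (i); i++;
C	  case 1: STEP (i); i++;
C	      } while (i < n);
C	    }
C	  return cy;
C	}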