dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
C     product (measured on the speed difference between 20 and 40 limbs,
C     which is the Karatsuba recursing range).


dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
dnl  a description.  The only difference here is that UNROLL_COUNT can go up
dnl  to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.

deflit(SQR_TOOM2_THRESHOLD_MAX, 67)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Square {src,size} and write the 2*size limb result to {dst,2*size}.
C
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
C
C The code size might look a bit excessive, but not all of it is executed so
C it won't all get into the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 40x40 that do use the full
C unrolling will at least be making good use of it, because 40x40 will take
C something like 7000 cycles.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %edx

	movl	PARAM_SRC, %eax

	C Dispatch on size: ==2 goes to two_limbs, >2 to three_or_more,
	C and size==1 falls through below.
	cmpl	$2, %edx
	movl	PARAM_DST, %ecx
	je	L(two_limbs)

	movl	(%eax), %eax
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src limb
	C ebx
	C ecx	dst
	C edx

	C dst[1,0] = src[0]^2, nothing else to do.
	mull	%eax

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	ret


C -----------------------------------------------------------------------------
L(two_limbs):
	C eax	src
	C ebx
	C ecx	dst
	C edx

defframe(SAVE_ESI, -4)
defframe(SAVE_EBX, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(`STACK_SPACE',16)

	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	%eax, %esi
	movl	(%eax), %eax

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%esi), %eax

	movl	%ebx, SAVE_EBX
	movl	%edx, %ebx	C dst[1]

	mull	%eax		C src[1]^2

	movl	%edi, SAVE_EDI
	movl	%eax, %edi	C dst[2]
	movl	(%esi), %eax

	movl	%ebp, SAVE_EBP
	movl	%edx, %ebp	C dst[3]

	mull	4(%esi)		C src[0]*src[1]

	C The cross product eax:edx is added into the ebx/edi/ebp column
	C twice (once here, once below), which doubles it as the square
	C requires.

	addl	%eax, %ebx
	movl	SAVE_ESI, %esi

	adcl	%edx, %edi

	adcl	$0, %ebp
	addl	%ebx, %eax
	movl	SAVE_EBX, %ebx

	adcl	%edi, %edx
	movl	SAVE_EDI, %edi

	adcl	$0, %ebp

	movl	%eax, 4(%ecx)

	movl	%ebp, 12(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%edx, 8(%ecx)
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
L(three_or_more):
	C eax	src low limb
	C ebx
	C ecx	dst
	C edx	size
deflit(`FRAME',0)

	pushl	%esi	defframe_pushl(`SAVE_ESI')
	cmpl	$4, %edx

	movl	PARAM_SRC, %esi
	jae	L(four_or_more)


C -----------------------------------------------------------------------------
C three limbs

	C eax	src low limb
	C ebx
	C ecx	dst
	C edx
	C esi	src
	C edi
	C ebp

	pushl	%ebp	defframe_pushl(`SAVE_EBP')
	pushl	%edi	defframe_pushl(`SAVE_EDI')

	C First the three squares, stored direct to their dst positions.

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	movl	4(%esi), %eax
	xorl	%ebp, %ebp

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	%edx, 12(%ecx)
	movl	8(%esi), %eax

	pushl	%ebx	defframe_pushl(`SAVE_EBX')

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	%edx, 20(%ecx)

	C Then the three cross products, accumulated in ebx/edi/ebp/edx.

	movl	(%esi), %eax

	mull	4(%esi)		C src[0] * src[1]

	movl	%eax, %ebx
	movl	%edx, %edi

	movl	(%esi), %eax

	mull	8(%esi)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%edx, %ebp

	adcl	$0, %ebp
	movl	4(%esi), %eax

	mull	8(%esi)		C src[1] * src[2]

	xorl	%esi, %esi
	addl	%eax, %ebp

	C eax
	C ebx	dst[1]
	C ecx	dst
	C edx	dst[4]
	C esi	zero, will be dst[5]
	C edi	dst[2]
	C ebp	dst[3]

	C Double the cross products by shifting the chain ebx,edi,ebp,edx
	C left one bit, the bit out going to esi, then add into the squares
	C already stored at dst[1..5].

	adcl	$0, %edx
	addl	%ebx, %ebx

	adcl	%edi, %edi

	adcl	%ebp, %ebp

	adcl	%edx, %edx
	movl	4(%ecx), %eax

	adcl	$0, %esi
	addl	%ebx, %eax

	movl	%eax, 4(%ecx)
	movl	8(%ecx), %eax

	adcl	%edi, %eax
	movl	12(%ecx), %ebx

	adcl	%ebp, %ebx
	movl	16(%ecx), %edi

	movl	%eax, 8(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%ebx, 12(%ecx)
	movl	SAVE_EBX, %ebx

	adcl	%edx, %edi
	movl	20(%ecx), %eax

	movl	%edi, 16(%ecx)
	movl	SAVE_EDI, %edi

	adcl	%esi, %eax	C no carry out of this
	movl	SAVE_ESI, %esi

	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret



C -----------------------------------------------------------------------------
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,    -24)
deflit(`STACK_SPACE',24)

L(four_or_more):
	C eax	src low limb
	C ebx
	C ecx
	C edx	size
	C esi	src
	C edi
	C ebp
deflit(`FRAME',4)	dnl  %esi already pushed

C First multiply src[0]*src[1..size-1] and store at dst[1..size].

	subl	$STACK_SPACE-FRAME, %esp
deflit(`FRAME',STACK_SPACE)
	movl	$1, %ecx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	subl	%edx, %ecx		C -(size-1)

	movl	%ebp, SAVE_EBP
	movl	$0, %ebx		C initial carry

	leal	(%esi,%edx,4), %esi	C &src[size]
	movl	%eax, %ebp		C multiplier

	leal	-4(%edi,%edx,4), %edi	C &dst[size-1]


C This loop runs at just over 6 c/l.

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, limbs, negative, -(size-1) to -1
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp	multiplier

	movl	%ebp, %eax

	mull	(%esi,%ecx,4)

	addl	%ebx, %eax
	movl	$0, %ebx

	adcl	%edx, %ebx
	movl	%eax, 4(%edi,%ecx,4)

	incl	%ecx
	jnz	L(mul_1)


	C Final carry becomes dst[size].
	movl	%ebx, 4(%edi)


C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as mpn_addmul_1(), see that routine for some
C comments.
C
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
dnl  This is also hard-coded in the address calculation below.
deflit(CODE_BYTES_PER_LIMB, 15)

dnl  With &src[size] and &dst[size-1] pointers, the displacements in the
dnl  unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
dnl  that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>32),1,
eval((UNROLL_COUNT-32)*4),
0))

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp

	movl	PARAM_SIZE, %ecx

	subl	$4, %ecx
	jz	L(corner)

	movl	%ecx, %edx
	negl	%ecx

	C ecx = -16*(size-4); the leal below adds back edx = size-4, for a
	C net step of -15*(size-4) = -CODE_BYTES_PER_LIMB*(size-4) bytes
	C back from the end of the unrolled code.
	shll	$4, %ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
	negl	%edx

ifelse(OFFSET,0,,`subl	$OFFSET, %edi')

	C The calculated jump mustn't be before the start of the available
	C code.  This is the limit that UNROLL_COUNT puts on the src operand
	C size, but checked here using the jump address directly.
	ASSERT(ae,
	`movl_text_address( L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, second highest limb of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER

	movl	-8+OFFSET(%esi,%edx,4), %eax	C first limb of multiplicand

	mull	%ebp

dnl  The inner chunks alternate which of ecx/ebx holds the low carry, so
dnl  the right starting order depends on whether an odd or even number of
dnl  chunks will execute, known from the low bit of the jump address
dnl  (tested below).  UNROLL_COUNT parity picks cmovz versus cmovnz so the
dnl  wind-down at L(unroll_inner_end) always sees the same final order.
define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl

	movl	%edx, %ebx	C high carry
	leal	4(%edi), %edi

	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx

	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse
	movl	%edx, VAR_JMP
	jmp	*%edx


	C Must be on an even address here so the low bit of the jump address
	C will indicate which way around ecx/ebx should start.
	ALIGN(2)

L(unroll_inner_start):
	C eax	scratch
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src pointer
	C edi	dst pointer
	C ebp	multiplier
	C
	C 15 code bytes each limb
	C ecx/ebx reversed on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
Zdisp(	addl,	%ebx, disp_dst,(%edi))
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
',`
	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx
')
')
L(unroll_inner_end):

	C Store the final two carry limbs at dst[...] and dst[...+1].
	addl	%ebx, m4_empty_if_zero(OFFSET)(%edi)

	movl	VAR_COUNTER, %edx
	adcl	$0, %ecx

	movl	%ecx, m4_empty_if_zero(OFFSET+4)(%edi)
	movl	VAR_JMP, %ecx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(corner):
	C The bottom-right corner of the product triangle:
	C src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
	C
	C eax
	C ebx
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[2*size-5]
	C ebp

	movl	-12(%esi), %eax

	mull	-8(%esi)		C src[size-3]*src[size-2]

	addl	%eax, (%edi)
	movl	-12(%esi), %eax
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)		C src[size-3]*src[size-1]

	addl	%eax, %ebx
	movl	-8(%esi), %eax

	adcl	$0, %edx

	addl	%ebx, 4(%edi)
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)		C src[size-2]*src[size-1]

	movl	PARAM_SIZE, %ecx
	addl	%ebx, %eax

	adcl	$0, %edx

	movl	%eax, 8(%edi)

	movl	%edx, 12(%edi)
	movl	PARAM_DST, %edi


C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
	subl	$1, %ecx		C size-1
	xorl	%eax, %eax		C ready for final adcl, and clear carry

	movl	%ecx, %edx
	movl	PARAM_SRC, %esi


L(lshift):
	C Shift two limbs per iteration, the carry flag chaining the bit
	C between rcll's and across iterations.
	C
	C eax
	C ebx
	C ecx	counter, size-1 to 1
	C edx	size-1 (for later use)
	C esi	src (for later use)
	C edi	dst, incrementing
	C ebp

	rcll	4(%edi)
	rcll	8(%edi)

	leal	8(%edi), %edi
	decl	%ecx
	jnz	L(lshift)


	adcl	%eax, %eax		C last bit shifted out

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%esi), %eax		C src[0]

	leal	4(%esi,%edx,4), %esi	C &src[size]
	subl	%edx, %ecx		C -(size-1)


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.


	mull	%eax			C src[0]^2

	movl	%eax, (%edi,%ecx,8)	C dst[0]


L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry (high of previous square)
	C esi	&src[size]
	C edi	dst[2*size-2]
	C ebp

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, 4(%edi)		C dst most significant limb

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp
	ret



C -----------------------------------------------------------------------------
ifdef(`PIC',`
L(pic_calc):
	C PIC-safe version of the non-PIC leal above: start from the return
	C address (which is L(here)), then apply the same displacement.
	addl	(%esp), %ecx
	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
	addl	%edx, %ecx
	ret_internal
')


EPILOGUE()