dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.

dnl  Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
C     unrolling).



dnl  K6: UNROLL_COUNT  cycles/product (approx)
dnl           8            9.75
dnl          16            9.3
dnl          32            9.3
dnl  Maximum possible with the current code is 32.
dnl
dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
dnl  might explain its good performance.

deflit(UNROLL_COUNT, 16)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() entry code only
C once.  The saving is about 10-20% on typical sizes coming from the
C Karatsuba multiply code.
C
C Enhancements:
C
C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
C
C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
C would be 256 bytes.
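C
C For reference, the overall structure this file implements is roughly the
C following C, assuming the usual mpn_mul_1/mpn_addmul_1 semantics (an
C illustrative sketch only, not the generic source verbatim):
C
C	void
C	mpn_mul_basecase (mp_ptr wp, mp_srcptr xp, mp_size_t xsize,
C	                  mp_srcptr yp, mp_size_t ysize)
C	{
C	  mp_size_t i;
C	  /* first yp limb: a plain mul_1 pass over xp */
C	  wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
C	  /* remaining yp limbs: ysize-1 addmul_1 passes */
C	  for (i = 1; i < ysize; i++)
C	    wp[xsize+i] = mpn_addmul_1 (wp+i, xp, xsize, yp[i]);
C	}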

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	movl	PARAM_XSIZE, %ecx
	movl	PARAM_YP, %eax

	movl	PARAM_XP, %edx
	movl	(%eax), %eax	C yp low limb

	cmpl	$2, %ecx
	ja	L(xsize_more_than_two_limbs)
	je	L(two_by_something)


	C one limb by one limb

	movl	(%edx), %edx	C xp low limb
	movl	PARAM_WP, %ecx

	mull	%edx

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
L(two_by_something):
	decl	PARAM_YSIZE
	pushl	%ebx
deflit(`FRAME',4)

	movl	PARAM_WP, %ebx
	pushl	%esi
deflit(`FRAME',8)

	movl	%eax, %ecx	C yp low limb
	movl	(%edx), %eax	C xp low limb

	movl	%edx, %esi	C xp
	jnz	L(two_by_two)


	C two limbs by one limb

	mull	%ecx

	movl	%eax, (%ebx)
	movl	4(%esi), %eax

	movl	%edx, %esi	C carry

	mull	%ecx

	addl	%eax, %esi
	movl	%esi, 4(%ebx)

	adcl	$0, %edx

	movl	%edx, 8(%ebx)
	popl	%esi

	popl	%ebx
	ret



C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_by_two):
	C eax	xp low limb
	C ebx	wp
	C ecx	yp low limb
	C edx
	C esi	xp
	C edi
	C ebp
deflit(`FRAME',8)

	mull	%ecx		C xp[0] * yp[0]

	push	%edi
deflit(`FRAME',12)
	movl	%eax, (%ebx)

	movl	4(%esi), %eax
	movl	%edx, %edi	C carry, for wp[1]

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %edi
	movl	PARAM_YP, %ecx

	adcl	$0, %edx

	movl	%edi, 4(%ebx)
	movl	4(%ecx), %ecx	C yp[1]

	movl	4(%esi), %eax	C xp[1]
	movl	%edx, %edi	C carry, for wp[2]

	mull	%ecx		C xp[1] * yp[1]

	addl	%eax, %edi

	adcl	$0, %edx

	movl	(%esi), %eax	C xp[0]
	movl	%edx, %esi	C carry, for wp[3]

	mull	%ecx		C xp[0] * yp[1]

	addl	%eax, 4(%ebx)
	adcl	%edx, %edi
	adcl	$0, %esi

	movl	%edi, 8(%ebx)
	popl	%edi

	movl	%esi, 12(%ebx)
	popl	%esi

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two_limbs):

C The first limb of yp is processed with a simple mpn_mul_1 style loop
C inline.  Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below is run ysize-1 times).  A call to the actual
C mpn_mul_1 will be slowed down by the call and parameter pushing and
C popping, and doesn't seem likely to be worthwhile on the typical 10-20
C limb operations the Karatsuba code calls here with.

	C eax	yp[0]
	C ebx
	C ecx	xsize
	C edx	xp
	C esi
	C edi
	C ebp
deflit(`FRAME',0)

	pushl	%edi		defframe_pushl(SAVE_EDI)
	pushl	%ebp		defframe_pushl(SAVE_EBP)

	movl	PARAM_WP, %edi
	pushl	%esi		defframe_pushl(SAVE_ESI)

	movl	%eax, %ebp
	pushl	%ebx		defframe_pushl(SAVE_EBX)

	leal	(%edx,%ecx,4), %ebx	C xp end
	xorl	%esi, %esi

	leal	(%edi,%ecx,4), %edi	C wp end of mul1
	negl	%ecx


L(mul1):
	C eax	scratch
	C ebx	xp end
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of mul1
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi,%ecx,4)
	incl	%ecx

	jnz	L(mul1)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)	C final carry

	movl	PARAM_XSIZE, %ecx
	decl	%edx

	jnz	L(ysize_more_than_one_limb)

	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


L(ysize_more_than_one_limb):
	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_YP, %eax

	jae	L(unroll)


C -----------------------------------------------------------------------------
C Simple addmul loop.
C
C Using ebx and edi pointing at the ends of their respective locations saves
C a couple of instructions in the outer loop.  The inner loop is still 11
C cycles, the same as the simple loop in aorsmul_1.asm.

	C eax	yp
	C ebx	xp end
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1
	C ebp

	movl	4(%eax), %ebp		C multiplier
	negl	%ecx

	movl	%ecx, PARAM_XSIZE	C -xsize
	xorl	%esi, %esi		C initial carry

	leal	4(%eax,%edx,4), %eax	C yp end
	negl	%edx

	movl	%eax, PARAM_YP
	movl	%edx, PARAM_YSIZE

	jmp	L(simple_outer_entry)


	C aligning here saves a couple of cycles
	ALIGN(16)
L(simple_outer_top):
	C edx	ysize counter, negative

	movl	PARAM_YP, %eax		C yp end
	xorl	%esi, %esi		C carry

	movl	PARAM_XSIZE, %ecx	C -xsize
	movl	%edx, PARAM_YSIZE

	movl	(%eax,%edx,4), %ebp	C yp limb multiplier
L(simple_outer_entry):
	addl	$4, %edi


L(simple_inner):
	C eax	scratch
	C ebx	xp end
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of this addmul
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	$0, %edx
	addl	%eax, (%edi,%ecx,4)
	adcl	%edx, %esi

	incl	%ecx
	jnz	L(simple_inner)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)

	incl	%edx
	jnz	L(simple_outer_top)


	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


C -----------------------------------------------------------------------------
C Unrolled loop.
C
C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
C some comments.
C
C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
C 0, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
C is entered.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.  This can't just be fetched through the xp
C pointer because of the offset applied to it.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
C to give the starting point in the destination for each unrolled loop (this
C point is one limb upwards for each limb of yp processed).
C
C Having PARAM_YSIZE count negative to zero means it's not necessary to
C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
C the stack remain constant and on each loop a leal adjusts them with the
C PARAM_YSIZE counter value.


defframe(VAR_COUNTER,      -20)
defframe(VAR_COUNTER_INIT, -24)
defframe(VAR_JMP,          -28)
defframe(VAR_XP_LOW,       -32)
deflit(VAR_STACK_SPACE, 16)

dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
dnl  slower in this code, hence the defframe empty-if-zero feature is
dnl  disabled.
dnl
dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
dnl  anything isn't clear.
dnl
define(`defframe_empty_if_zero_disabled',1)

L(unroll):
	C eax	yp (not used)
	C ebx	xp end (not used)
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1 (not used)
	C ebp
deflit(`FRAME', 16)

	leal	-2(%ecx), %ebp	C one limb processed at start,
	decl	%ecx		C and ebp is one less

	shrl	$UNROLL_LOG2, %ebp
	negl	%ecx

	subl	$VAR_STACK_SPACE, %esp
deflit(`FRAME', 16+VAR_STACK_SPACE)
	andl	$UNROLL_MASK, %ecx

	movl	%ecx, %esi
	shll	$4, %ecx

	movl	%ebp, VAR_COUNTER_INIT
	negl	%esi

	C 15 code bytes per limb
ifdef(`PIC',`
	call	L(pic_calc)
L(unroll_here):
',`
	leal	L(unroll_entry) (%ecx,%esi,1), %ecx
')

	movl	PARAM_XP, %ebx
	movl	%ebp, VAR_COUNTER

	movl	PARAM_WP, %edi
	movl	%ecx, VAR_JMP

	movl	(%ebx), %eax
	leal	4(%edi,%esi,4), %edi	C wp adjust for unrolling and mul1

	leal	(%ebx,%esi,4), %ebx	C xp adjust for unrolling

	movl	%eax, VAR_XP_LOW

	movl	%ebx, PARAM_XP
	movl	PARAM_YP, %ebx

	leal	(%edi,%edx,4), %ecx	C wp adjust for ysize indexing
	movl	4(%ebx), %ebp		C multiplier (yp second limb)

	leal	4(%ebx,%edx,4), %ebx	C yp adjust for ysize indexing

	movl	%ecx, PARAM_WP

	leal	1(%esi), %ecx	C adjust parity for decl %ecx above

	movl	%ebx, PARAM_YP
	negl	%edx

	movl	%edx, PARAM_YSIZE
	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%ecx,%esi,1), %ecx
	addl	$L(unroll_entry)-L(unroll_here), %ecx
	addl	(%esp), %ecx
	ret_internal
')


C -----------------------------------------------------------------------------
	C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
	C cost any extra space, since the inner unrolled loop below is
	C aligned to 32.
	ALIGN(32)
L(unroll_outer_top):
	C edx	ysize

	movl	PARAM_YP, %eax
	movl	%edx, PARAM_YSIZE	C incremented ysize counter

	movl	PARAM_WP, %edi

	movl	VAR_COUNTER_INIT, %ebx
	movl	(%eax,%edx,4), %ebp	C next multiplier

	movl	PARAM_XSIZE, %ecx
	leal	(%edi,%edx,4), %edi	C adjust wp for where we are in yp

	movl	VAR_XP_LOW, %eax
	movl	%ebx, VAR_COUNTER

L(unroll_outer_entry):
	mull	%ebp

	C using testb is a tiny bit faster than testl
	testb	$1, %cl

	movl	%eax, %ecx	C low carry
	movl	VAR_JMP, %eax

	movl	%edx, %esi	C high carry
	movl	PARAM_XP, %ebx

	jnz	L(unroll_noswap)
	movl	%ecx, %esi	C high,low carry other way around

	movl	%edx, %ecx
L(unroll_noswap):

	jmp	*%eax



C -----------------------------------------------------------------------------
	ALIGN(32)
L(unroll_top):
	C eax	scratch
	C ebx	xp
	C ecx	carry low
	C edx	scratch
	C esi	carry high
	C edi	wp
	C ebp	multiplier
	C VAR_COUNTER loop counter
	C
	C 15 code bytes each limb

	leal	UNROLL_BYTES(%edi), %edi

L(unroll_entry):
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4))
	deflit(`disp1', eval(disp0 + 4))
	deflit(`disp2', eval(disp1 + 4))

	movl	disp1(%ebx), %eax
	mull	%ebp
Zdisp(	addl,	%ecx, disp0,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0(	%ecx)

	movl	disp2(%ebx), %eax
	mull	%ebp
	addl	%esi, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0(	%esi)
')

	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%ebx), %ebx

	jns	L(unroll_top)


	movl	PARAM_YSIZE, %edx
	addl	%ecx, UNROLL_BYTES(%edi)

	adcl	$0, %esi

	incl	%edx
	movl	%esi, UNROLL_BYTES+4(%edi)

	jnz	L(unroll_outer_top)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx

	addl	$FRAME, %esp
	ret

EPILOGUE()
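
C A minimal caller-side sketch (illustrative values only).  As with the
C generic mul_basecase, wp is assumed to have room for xsize+ysize limbs,
C to not overlap xp or yp, and the operands to satisfy xsize >= ysize >= 1:
C
C	mp_limb_t x[3] = { 1, 2, 3 };
C	mp_limb_t y[2] = { 4, 5 };
C	mp_limb_t w[3+2];
C	mpn_mul_basecase (w, x, 3, y, 2);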