dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C
C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
C see that file for some comments.  It's possible what's here can be improved.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The different speeds of the integer and fraction parts mean that using
dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
dnl  for the integer part and a bit lower for the fraction part.  (Or what's
dnl  really wanted is to speed up the integer part!)
dnl
dnl  The threshold is set to make the integer part right.  At 4 limbs the
dnl  div and mul are about the same there, but on the fractional part the
dnl  mul is much faster.
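dnl
dnl  In rough C terms the dispatch below amounts to the following (a sketch
dnl  only, not part of the build):
dnl
dnl      if (size + xsize < MUL_THRESHOLD)
dnl          use the plain divl loops;
dnl      else
dnl          use the multiply-by-inverse loops;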

deflit(MUL_THRESHOLD, 4)


defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,   8)
defframe(PARAM_DST,     4)

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,   -12)
defframe(SAVE_EBP,   -16)

defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC,     -28)
defframe(VAR_DST,     -32)
defframe(VAR_DST_STOP,-36)

deflit(STACK_SPACE, 36)

        TEXT
        ALIGN(16)

PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
        movl    PARAM_XSIZE, %ecx
        subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi

        movl    %ebx, SAVE_EBX
        movl    PARAM_SIZE, %ebx

        movl    %ebp, SAVE_EBP
        movl    PARAM_DIVISOR, %ebp

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edx

        movl    -4(%esi,%ebx,4), %eax   C src high limb
        xorl    %edi, %edi              C initial carry (if can't skip a div)

        C

        leal    8(%edx,%ecx,4), %edx    C &dst[xsize+2]
        xor     %ecx, %ecx

        movl    %edx, VAR_DST_STOP      C &dst[xsize+2]
        cmpl    %ebp, %eax              C high cmp divisor

        cmovc(  %eax, %edi)             C high is carry if high<divisor

        cmovnc( %eax, %ecx)             C 0 if skip div, src high if not
                                        C (the latter in case src==dst)

        movl    %ecx, -12(%edx,%ebx,4)  C dst high limb

        sbbl    $0, %ebx                C skip one division if high<divisor
        movl    PARAM_PREINV_SHIFT, %ecx

        leal    -8(%edx,%ebx,4), %edx   C &dst[xsize+size]
        movl    $32, %eax

        movl    %edx, VAR_DST           C &dst[xsize+size]

        shll    %cl, %ebp               C d normalized
        subl    %ecx, %eax
        movl    %ecx, VAR_NORM

        movd    %eax, %mm7              C rshift
        movl    PARAM_PREINV_INVERSE, %eax
        jmp     L(start_preinv)

EPILOGUE()



        ALIGN(16)

PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
        movl    PARAM_CARRY, %edx

        movl    PARAM_SIZE, %ecx
        subl    $STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

        movl    %ebx, SAVE_EBX
        movl    PARAM_XSIZE, %ebx

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edi

        movl    %ebp, SAVE_EBP
        movl    PARAM_DIVISOR, %ebp

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi

        leal    -4(%edi,%ebx,4), %edi
        jmp     L(start_1c)

EPILOGUE()


        C offset 0x31, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %ecx
        movl    $0, %edx                C initial carry (if can't skip a div)
        subl    $STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

        movl    %ebp, SAVE_EBP
        movl    PARAM_DIVISOR, %ebp

        movl    %ebx, SAVE_EBX
        movl    PARAM_XSIZE, %ebx

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi
        orl     %ecx, %ecx              C size

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edi

        leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
        jz      L(no_skip_div)          C if size==0

        movl    -4(%esi,%ecx,4), %eax   C src high limb
        xorl    %esi, %esi
        cmpl    %ebp, %eax              C high cmp divisor

        cmovc(  %eax, %edx)             C high is carry if high<divisor

        cmovnc( %eax, %esi)             C 0 if skip div, src high if not
                                        C (the latter in case src==dst)

        movl    %esi, (%edi,%ecx,4)     C dst high limb

        sbbl    $0, %ecx                C size-1 if high<divisor
        movl    PARAM_SRC, %esi         C reload
L(no_skip_div):


L(start_1c):
        C eax
        C ebx   xsize
        C ecx   size
        C edx   carry
        C esi   src
        C edi   &dst[xsize-1]
        C ebp   divisor

        leal    (%ebx,%ecx), %eax       C size+xsize
        cmpl    $MUL_THRESHOLD, %eax
        jae     L(mul_by_inverse)
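
C The plain division loops below compute, in rough C terms (a sketch only,
C borrowing the udiv_qrnnd(q,r,nh,nl,d) notation from longlong.h, and with
C "rem" being the carry limb established above),
C
C     for (i = size-1; i >= 0; i--)              C integer part
C         udiv_qrnnd (dst[xsize+i], rem, rem, src[i], divisor);
C     for (i = xsize-1; i >= 0; i--)             C fraction part
C         udiv_qrnnd (dst[i], rem, rem, 0, divisor);
C     return rem;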

        orl     %ecx, %ecx
        jz      L(divide_no_integer)

L(divide_integer):
        C eax   scratch (quotient)
        C ebx   xsize
        C ecx   counter
        C edx   scratch (remainder)
        C esi   src
        C edi   &dst[xsize-1]
        C ebp   divisor

        movl    -4(%esi,%ecx,4), %eax

        divl    %ebp

        movl    %eax, (%edi,%ecx,4)
        decl    %ecx
        jnz     L(divide_integer)


L(divide_no_integer):
        movl    PARAM_DST, %edi
        orl     %ebx, %ebx
        jnz     L(divide_fraction)

L(divide_done):
        movl    SAVE_ESI, %esi

        movl    SAVE_EDI, %edi

        movl    SAVE_EBX, %ebx
        movl    %edx, %eax

        movl    SAVE_EBP, %ebp
        addl    $STACK_SPACE, %esp

        ret


L(divide_fraction):
        C eax   scratch (quotient)
        C ebx   counter
        C ecx
        C edx   scratch (remainder)
        C esi
        C edi   dst
        C ebp   divisor

        movl    $0, %eax

        divl    %ebp

        movl    %eax, -4(%edi,%ebx,4)
        decl    %ebx
        jnz     L(divide_fraction)

        jmp     L(divide_done)



C -----------------------------------------------------------------------------

L(mul_by_inverse):
        C eax
        C ebx   xsize
        C ecx   size
        C edx   carry
        C esi   src
        C edi   &dst[xsize-1]
        C ebp   divisor

        leal    12(%edi), %ebx          C &dst[xsize+2], loop dst stop

        movl    %ebx, VAR_DST_STOP
        leal    4(%edi,%ecx,4), %edi    C &dst[xsize+size]

        movl    %edi, VAR_DST
        movl    %ecx, %ebx              C size

        bsrl    %ebp, %ecx              C 31-l
        movl    %edx, %edi              C carry

        leal    1(%ecx), %eax           C 32-l
        xorl    $31, %ecx               C l

        movl    %ecx, VAR_NORM
        movl    $-1, %edx

        shll    %cl, %ebp               C d normalized
        movd    %eax, %mm7

        movl    $-1, %eax
        subl    %ebp, %edx              C (b-d)-1 giving edx:eax = b*(b-d)-1

        divl    %ebp                    C floor (b*(b-d)-1) / d

L(start_preinv):
        C eax   inverse
        C ebx   size
        C ecx   shift
        C edx
        C esi   src
        C edi   carry
        C ebp   divisor
        C
        C mm7   rshift

        movl    %eax, VAR_INVERSE
        orl     %ebx, %ebx              C size
        leal    -12(%esi,%ebx,4), %eax  C &src[size-3]

        movl    %eax, VAR_SRC
        jz      L(start_zero)

        movl    8(%eax), %esi           C src high limb
        cmpl    $1, %ebx
        jz      L(start_one)

L(start_two_or_more):
        movl    4(%eax), %edx           C src second highest limb

        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l

        shldl(  %cl, %edx, %esi)        C n10 = high,second << l

        cmpl    $2, %ebx
        je      L(integer_two_left)
        jmp     L(integer_top)


L(start_one):
        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l

        shll    %cl, %esi               C n10 = high << l
        jmp     L(integer_one_left)


L(start_zero):
        C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
        C skipped a division.

        shll    %cl, %edi               C n2 = carry << l
        movl    %edi, %eax              C return value for zero_done
        cmpl    $0, PARAM_XSIZE

        je      L(zero_done)
        jmp     L(fraction_some)



C -----------------------------------------------------------------------------
C
C This loop runs at about 25 cycles, which is probably sub-optimal, and
C certainly more than the dependent chain would suggest.  A better loop, or
C a better rough analysis of what's possible, would be welcomed.
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C                        uops
C       n2+n1             1   (addl)
C       mul               5
C       q1+1              3   (addl/adcl)
C       mul               5
C       sub               3   (subl/sbbl)
C       addback           2   (cmov)
C                        ---
C                        19
C
C Lack of registers hinders explicit scheduling and it might be that the
C normal out of order execution isn't able to hide enough under the mul
C latencies.
C
C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
C combination was tried for the addback (despite the fact it would lengthen
C the dependent chain) but found to be no faster.
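C
C For reference, each iteration computes roughly the following (a C-style
C sketch only, not part of the build; b=2^32, d is the normalized divisor,
C m is the inverse floor((b*(b-d)-1)/d), and n2,n10 are the current two
C numerator limbs):
C
C     n1   = n10 >> 31;                  C high bit of n10
C     nadj = n10 + (d & -n1);            C low limb, overflow ignored
C     q1   = high (n2*b + m*(n2+n1) + nadj);
C     r    = (n2*b + n10) - (q1+1)*d;    C two-limb subtract
C     if (that subtract borrowed)        C q1+1 was one too many
C         { r += d;  q = q1; }
C     else
C         q = q1+1;
C
C r becomes n2 for the next iteration and the next (shifted) src limb
C becomes n10.  The case of q1+1 wrapping to zero is split out to q1_ff
C below.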

        ALIGN(16)
L(integer_top):
        C eax   scratch
        C ebx   scratch (nadj, q1)
        C ecx   scratch (src, dst)
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   d
        C
        C mm0   scratch (src qword)
        C mm7   rshift for normalization

        movl    %esi, %eax
        movl    %ebp, %ebx

        sarl    $31, %eax               C -n1
        movl    VAR_SRC, %ecx

        andl    %eax, %ebx              C -n1 & d
        negl    %eax                    C n1

        addl    %esi, %ebx              C nadj = n10 + (-n1 & d), ignoring overflow
        addl    %edi, %eax              C n2+n1
        movq    (%ecx), %mm0            C next src limb and the one below it

        mull    VAR_INVERSE             C m*(n2+n1)

        subl    $4, %ecx

        movl    %ecx, VAR_SRC

        C

        C

        addl    %ebx, %eax              C m*(n2+n1) + nadj, low giving carry flag
        movl    %ebp, %eax              C d
        leal    1(%edi), %ebx           C n2+1

        adcl    %edx, %ebx              C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
        jz      L(q1_ff)

        mull    %ebx                    C (q1+1)*d

        movl    VAR_DST, %ecx
        psrlq   %mm7, %mm0

        C

        C

        C

        subl    %eax, %esi
        movl    VAR_DST_STOP, %eax

        sbbl    %edx, %edi              C n - (q1+1)*d
        movl    %esi, %edi              C remainder -> n2
        leal    (%ebp,%esi), %edx

        cmovc(  %edx, %edi)             C n - q1*d if underflow from using q1+1
        movd    %mm0, %esi

        sbbl    $0, %ebx                C q
        subl    $4, %ecx

        movl    %ebx, (%ecx)
        cmpl    %eax, %ecx

        movl    %ecx, VAR_DST
        jne     L(integer_top)


L(integer_loop_done):


C -----------------------------------------------------------------------------
C
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case.  This makes the code a bit smaller and simpler, and
C costs only 2 cycles (each).
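C
C In rough C terms (a sketch only) that adcl/sbbl pair computes
C
C     q1plus1 = q1 + 1;
C     if (q1plus1 == 0)           C q1 was 0xFFFFFFFF and the add wrapped
C         q1plus1 = 0xFFFFFFFF;   C sbbl $0 undoes the wrap, giving q1
C
C so the following mull multiplies by q1 rather than q1+1 in that rare case.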

L(integer_two_left):
        C eax   scratch
        C ebx   scratch (nadj, q1)
        C ecx   scratch (src, dst)
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   divisor
        C
        C mm7   rshift


        movl    %esi, %eax
        movl    %ebp, %ebx

        sarl    $31, %eax               C -n1
        movl    PARAM_SRC, %ecx

        andl    %eax, %ebx              C -n1 & d
        negl    %eax                    C n1

        addl    %esi, %ebx              C nadj = n10 + (-n1 & d), ignoring overflow
        addl    %edi, %eax              C n2+n1

        mull    VAR_INVERSE             C m*(n2+n1)

        movd    (%ecx), %mm0            C src low limb

        movl    VAR_DST_STOP, %ecx

        C

        C

        addl    %ebx, %eax              C m*(n2+n1) + nadj, low giving carry flag
        leal    1(%edi), %ebx           C n2+1
        movl    %ebp, %eax              C d

        adcl    %edx, %ebx              C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

        sbbl    $0, %ebx

        mull    %ebx                    C (q1+1)*d

        psllq   $32, %mm0

        psrlq   %mm7, %mm0

        C

        C

        subl    %eax, %esi

        sbbl    %edx, %edi              C n - (q1+1)*d
        movl    %esi, %edi              C remainder -> n2
        leal    (%ebp,%esi), %edx

        cmovc(  %edx, %edi)             C n - q1*d if underflow from using q1+1
        movd    %mm0, %esi

        sbbl    $0, %ebx                C q

        movl    %ebx, -4(%ecx)


C -----------------------------------------------------------------------------
L(integer_one_left):
        C eax   scratch
        C ebx   scratch (nadj, q1)
        C ecx   scratch (dst)
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   divisor
        C
        C mm7   rshift


        movl    %esi, %eax
        movl    %ebp, %ebx

        sarl    $31, %eax               C -n1
        movl    VAR_DST_STOP, %ecx

        andl    %eax, %ebx              C -n1 & d
        negl    %eax                    C n1

        addl    %esi, %ebx              C nadj = n10 + (-n1 & d), ignoring overflow
        addl    %edi, %eax              C n2+n1

        mull    VAR_INVERSE             C m*(n2+n1)

        C

        C

        C

        addl    %ebx, %eax              C m*(n2+n1) + nadj, low giving carry flag
        leal    1(%edi), %ebx           C n2+1
        movl    %ebp, %eax              C d

        C

        adcl    %edx, %ebx              C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

        sbbl    $0, %ebx                C q1 if q1+1 overflowed

        mull    %ebx

        C

        C

        C

        C

        subl    %eax, %esi
        movl    PARAM_XSIZE, %eax

        sbbl    %edx, %edi              C n - (q1+1)*d
        movl    %esi, %edi              C remainder -> n2
        leal    (%ebp,%esi), %edx

        cmovc(  %edx, %edi)             C n - q1*d if underflow from using q1+1

        sbbl    $0, %ebx                C q

        movl    %ebx, -8(%ecx)
        subl    $8, %ecx



        orl     %eax, %eax              C xsize
        jnz     L(fraction_some)

        movl    %edi, %eax
L(fraction_done):
        movl    VAR_NORM, %ecx
L(zero_done):
        movl    SAVE_EBP, %ebp

        movl    SAVE_EDI, %edi

        movl    SAVE_ESI, %esi

        movl    SAVE_EBX, %ebx
        addl    $STACK_SPACE, %esp

        shrl    %cl, %eax
        emms

        ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d

L(q1_ff):
        C eax   (divisor)
        C ebx   (q1+1 == 0)
        C ecx
        C edx
        C esi   n10
        C edi   n2
        C ebp   divisor

        movl    VAR_DST, %ecx
        movl    VAR_DST_STOP, %edx
        subl    $4, %ecx

        movl    %ecx, VAR_DST
        psrlq   %mm7, %mm0
        leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2

        movl    $-1, (%ecx)
        movd    %mm0, %esi              C next n10

        cmpl    %ecx, %edx
        jne     L(integer_top)

        jmp     L(integer_loop_done)



C -----------------------------------------------------------------------------
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C                        uops
C       mul               5
C       q1+1              1   (addl)
C       mul               5
C       sub               3   (negl/sbbl)
C       addback           2   (cmov)
C                        ---
C                        16
C
C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
C the addback was found to be a touch slower.
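C
C Each fraction limb is the same division step with n10 = 0, so the nadj
C and n2+n1 adjustments drop out.  In rough C terms (a sketch only):
C
C     q1 = high (n2*b + m*n2);
C     r  = n2*b - (q1+1)*d;              C two-limb subtract
C     if (that subtract borrowed)        C q1+1 was one too many
C         { r += d;  q = q1; }
C     else
C         q = q1+1;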

        ALIGN(16)
L(fraction_some):
        C eax
        C ebx
        C ecx
        C edx
        C esi
        C edi   carry
        C ebp   divisor

        movl    PARAM_DST, %esi
        movl    VAR_DST_STOP, %ecx      C &dst[xsize+2]
        movl    %edi, %eax

        subl    $8, %ecx                C &dst[xsize]


        ALIGN(16)
L(fraction_top):
        C eax   n2, then scratch
        C ebx   scratch (nadj, q1)
        C ecx   dst, decrementing
        C edx   scratch
        C esi   dst stop point
        C edi   n2
        C ebp   divisor

        mull    VAR_INVERSE             C m*n2

        movl    %ebp, %eax              C d
        subl    $4, %ecx                C dst
        leal    1(%edi), %ebx

        C

        C

        C

        addl    %edx, %ebx              C 1 + high(n2<<32 + m*n2) = q1+1

        mull    %ebx                    C (q1+1)*d

        C

        C

        C

        C

        negl    %eax                    C low of n - (q1+1)*d

        sbbl    %edx, %edi              C high of n - (q1+1)*d, caring only about carry
        leal    (%ebp,%eax), %edx

        cmovc(  %edx, %eax)             C n - q1*d if underflow from using q1+1

        sbbl    $0, %ebx                C q
        movl    %eax, %edi              C remainder->n2
        cmpl    %esi, %ecx

        movl    %ebx, (%ecx)            C previous q
        jne     L(fraction_top)


        jmp     L(fraction_done)

EPILOGUE()