dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999-2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C
C Algorithm:
C
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, reference in
C gmp.texi.
C
C "m" is written for what is m' in the paper, and "d" for d_norm, which
C won't cause any confusion since it's only the normalized divisor that's of
C any use in the code.  "b" is written for 2^N, the size of a limb, N being
C 32 here.
C
C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
C us have just a psubq on the dependent chain.
C
C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
C
C Notes:
C
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
C carry, since in normal circumstances that will be a very rare event.
C
C The test for skipping a division is branch free (once size>=1 is tested).
C The store to the destination high limb is 0 when a divide is skipped, or
C if it's not skipped then a copy of the src high limb is stored.  The
C latter is in case src==dst.
C
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
C
C Enhancements:
C
C The loop measures 32 cycles, but the dependent chain would suggest it
C could be done with 30.  Not sure where to start looking for the extras.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case.  It doesn't seem worth checking for this though, since it
C probably occurs infrequently.
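C As a scalar C model of the setup (an illustrative sketch only, not part
C of the build; assumes a 32-bit mp_limb_t, a 64-bit uint64_t, and a
C hypothetical count_leading_zeros helper):
C
C	l = count_leading_zeros (divisor);
C	d = divisor << l;                          /* normalized divisor */
C	m = (mp_limb_t) (((uint64_t) ~d << 32 | 0xFFFFFFFF) / d);
C	                                           /* floor((b*(b-d)-1)/d) */
C
C The same m is what mpn_preinv_divrem_1 receives ready-made as its
C "inverse" argument, together with the shift l.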
dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The inverse takes about 80-90 cycles to calculate, but after that the
dnl  multiply is 32 c/l versus division at about 58 c/l.
dnl
dnl  At 4 limbs the div is a touch faster than the mul (and of course
dnl  simpler), so start the mul from 5 limbs.

deflit(MUL_THRESHOLD, 5)


defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)  dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,   8)
defframe(PARAM_DST,     4)

dnl  re-use parameter space
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
define(SAVE_EDI,`PARAM_DIVISOR')
define(SAVE_EBX,`PARAM_DST')

	TEXT

	ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C carry if can't skip a div

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	-4(%esi,%ecx,4), %eax	C src high limb

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movd	PARAM_PREINV_INVERSE, %mm4

	movd	PARAM_PREINV_SHIFT, %mm7  C l
	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edx)		C high is carry if high<divisor
	movd	%edx, %mm0		C carry

	movd	%edx, %mm1		C carry
	movl	$0, %edx

	movd	%ebp, %mm5		C d
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	movl	%edx, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C skip one division if high<divisor
	movl	$32, %eax

	subl	PARAM_PREINV_SHIFT, %eax
	psllq	%mm7, %mm5		C d normalized
	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]

	movd	%eax, %mm6		C 32-l
	jmp	L(start_preinv)

EPILOGUE()


	ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)

	movl	PARAM_CARRY, %edx

	movl	PARAM_SIZE, %ecx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()


	ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C initial carry (if can't skip a div)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	orl	%ecx, %ecx		C size
	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb

	cmpl	%ebp, %eax		C high cmp divisor

	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
	movl	%edx, (%edi,%ecx,4)	C dst high limb

	movl	$0, %edx
	cmovc(	%eax, %edx)		C high is carry if high<divisor

	sbbl	$0, %ecx		C size-1 if high<divisor
L(no_skip_div):
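C For reference, the branch-free division skip done with cmov/sbb above
C looks like this in scalar C (an illustrative sketch only, assuming a
C 32-bit mp_limb_t):
C
C	mp_limb_t high = src[size-1];
C	int       skip = (high < divisor);
C	dst[xsize+size-1] = skip ? 0 : high;   /* cmovnc and store */
C	carry = skip ? high : 0;               /* cmovc */
C	size -= skip;                          /* sbbl $0, %ecx */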
L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]

	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)


	orl	%ecx, %ecx
	jz	L(divide_no_integer)	C if size==0

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	carry
	C esi	src, decrementing
	C edi	dst, decrementing
	C ebp	divisor

	movl	(%esi), %eax
	subl	$4, %esi

	divl	%ebp

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	orl	%ebx, %ebx
	jnz	L(divide_fraction)	C if xsize!=0

L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	movl	%edx, %eax
	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	carry
	C esi
	C edi	dst, decrementing
	C ebp	divisor

	movl	$0, %eax

	divl	%ebp

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)



C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l
	movd	%edx, %mm0		C carry
	movd	%edx, %mm1		C carry
	movl	%ecx, %edx		C size
	movl	$31, %ecx

	C

	xorl	%eax, %ecx		C l = leading zeros on d
	addl	$1, %eax

	shll	%cl, %ebp		C d normalized
	movd	%ecx, %mm7		C l
	movl	%edx, %ecx		C size

	movd	%eax, %mm6		C 32-l
	movl	$-1, %edx
	movl	$-1, %eax

	C

	subl	%ebp, %edx		C (b-d)-1 so edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1 / d)
	movd	%ebp, %mm5		C d

	C

	movd	%eax, %mm4		C m


L(start_preinv):
	C eax	inverse
	C ebx	xsize
	C ecx	size
	C edx
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp
	C
	C mm0	carry
	C mm1	carry
	C mm2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	psllq	%mm7, %mm0		C n2 = carry << l, for size==0

	subl	$1, %ecx
	jb	L(integer_none)

	movd	(%esi), %mm0		C src high limb
	punpckldq %mm1, %mm0
	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
	jz	L(integer_last)


C The dependent chain here consists of
C
C	2   paddd	n1+n2
C	8   pmuludq	m*(n1+n2)
C	2   paddq	n2:nadj + m*(n1+n2)
C	2   psrlq	q1
C	8   pmuludq	d*q1
C	2   psubq	(n-d)-q1*d
C	2   psrlq	high n-(q1+1)*d mask
C	2   pand	d masked
C	2   paddd	n2+d addback
C      --
C      30
C
C But it seems to run at 32 cycles, so presumably there's something else
C going on.
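C For clarity, one iteration of the loop below in scalar C (an
C illustrative sketch only; assumes a 32-bit mp_limb_t and a 64-bit
C uint64_t, with n2 the running remainder and n10 the next shifted-in
C source limb):
C
C	mp_limb_t n1 = n10 >> 31;                  /* high bit of n10 */
C	mp_limb_t nadj = n10 + (-n1 & d);          /* ignore overflow */
C	mp_limb_t q1 = (((uint64_t) n2 << 32 | nadj)
C	                + (uint64_t) m * (n2 + n1)) >> 32;
C	uint64_t  r64 = ((uint64_t) n2 << 32 | n10)
C	                - d - (uint64_t) q1 * d;   /* n-d - q1*d */
C	mp_limb_t mask = (mp_limb_t) (r64 >> 32);  /* 0 or 0xFFFFFFFF */
C	*dst = q1 + 1 + mask;                      /* q1+1, or q1 */
C	n2 = (mp_limb_t) r64 + (mask & d);         /* addback if needed */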
	ALIGN(16)
L(integer_top):
	C eax
	C ebx
	C ecx	counter, size-1 to 0
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	movd	%mm0, %eax
	movd	%mm5, %edx
	cmpl	%edx, %eax')

	movd	-4(%esi), %mm1		C next src limbs
	movd	(%esi), %mm2
	leal	-4(%esi), %esi

	punpckldq %mm2, %mm1
	psrlq	%mm6, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))

	paddd	%mm1, %mm2		C q1+1
	pmuludq	%mm5, %mm1		C q1*d

	punpckldq %mm0, %mm3		C n = n2:n10
	pxor	%mm0, %mm0

	psubq	%mm5, %mm3		C n - d

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C copy remainder -> new n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	movd	%mm3, %eax
	addl	$1, %eax
	cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ecx
	ja	L(integer_top)


L(integer_last):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi	&src[0]
	C edi	&dst[xsize]
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6
	C mm7	l

	ASSERT(b,`C n2<d
	movd	%mm0, %eax
	movd	%mm5, %edx
	cmpl	%edx, %eax')

	movd	(%esi), %mm1		C src[0]
	psllq	%mm7, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
	paddd	%mm1, %mm2		C q1+1

	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n
	psubq	%mm5, %mm3		C n - d
	pxor	%mm0, %mm0

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C remainder -> n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	movd	%mm3, %eax
	addl	$1, %eax
	cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi


L(integer_none):
	C eax
	C ebx	xsize

	orl	%ebx, %ebx
	jnz	L(fraction_some)	C if xsize!=0


L(fraction_done):
	movl	SAVE_EBP, %ebp
	psrld	%mm7, %mm0		C remainder

	movl	SAVE_EDI, %edi
	movd	%mm0, %eax

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	emms
	ret



C -----------------------------------------------------------------------------
C
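C In the fraction loop below the low limb n10 is always zero, so nadj=0
C and the step simplifies.  In scalar C (an illustrative sketch only,
C same assumptions as the integer-loop sketch above):
C
C	mp_limb_t q1 = n2 + (mp_limb_t) (((uint64_t) m * n2) >> 32);
C	uint64_t  r64 = ((uint64_t) n2 << 32)
C	                - d - (uint64_t) q1 * d;   /* n-d - q1*d */
C	mp_limb_t mask = (mp_limb_t) (r64 >> 32);  /* 0 or 0xFFFFFFFF */
C	*dst = q1 + 1 + mask;                      /* q1+1, or q1 */
C	n2 = (mp_limb_t) r64 + (mask & d);         /* addback if needed */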
L(fraction_some):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi
	C edi	&dst[xsize-1]
	C ebp


L(fraction_top):
	C eax
	C ebx	counter, xsize iterations
	C ecx
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	movd	%mm0, %eax
	movd	%mm5, %edx
	cmpl	%edx, %eax')

	movq	%mm0, %mm1		C n2
	pmuludq	%mm4, %mm0		C m*n2

	pcmpeqd	%mm2, %mm2
	psrlq	$63, %mm2

	C

	psrlq	$32, %mm0		C high(m*n2)

	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)

	paddd	%mm0, %mm2		C q1+1
	pmuludq	%mm5, %mm0		C q1*d

	psllq	$32, %mm1		C n = n2:0
	psubq	%mm5, %mm1		C n - d

	C

	psubq	%mm0, %mm1		C r = n - (q1+1)*d
	pxor	%mm0, %mm0

	por	%mm1, %mm0		C r -> n2
	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	movd	%mm1, %eax
	addl	$1, %eax
	cmpl	$1, %eax')

	paddd	%mm1, %mm2		C q
	pand	%mm5, %mm1		C mask & d

	paddd	%mm1, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ebx
	jne	L(fraction_top)


	jmp	L(fraction_done)

EPILOGUE()
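C For reference, a usage sketch of mpn_divrem_1 (illustrative only):
C dividing the 2-limb value {np,2} by 7 while developing 1 fraction limb
C gives (np[1]*b + np[0])*b = Q*7 + r, with Q spread over qp[0..2]:
C
C	mp_limb_t np[2] = { 3, 5 };
C	mp_limb_t qp[3];
C	mp_limb_t r = mpn_divrem_1 (qp, 1, np, 2, 7);
C	/* qp[2],qp[1] = integer quotient, qp[0] = fraction limb,
C	   0 <= r < 7 */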