.rdata
.asciiz "mips3.s, Version 1.2"
.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

.text
# $1 ($at) is used explicitly as a scratch register by the routines below.
.set noat

# bn_mul_add_words
#   In:  $4 = pointer to 64-bit words that are read, accumulated into and
#             written back
#        $5 = pointer to 64-bit source words
#        $6 = word count (tested as signed; <= 0 returns at once)
#        $7 = 64-bit multiplier
#   Out: $2 = carry word left after the last processed word (0 if $6 <= 0);
#        $4 receives a copy of $2 on return.
#   Per word: mem[$4] = low64(mem[$4] + mem[$5]*$7 + carry), carry = the rest.
.align 5
.globl bn_mul_add_words
.ent bn_mul_add_words
bn_mul_add_words:
	.set noreorder
	bgtz $6,bn_mul_add_words_internal
	move $2,$0	# delay slot: carry starts at 0 on both paths
	jr $31
	move $4,$2	# delay slot
.end bn_mul_add_words

.align 5
.ent bn_mul_add_words_internal
bn_mul_add_words_internal:
	.set reorder
	li $3,-4	# mask that clears the two low bits of the count
	and $8,$6,$3
	beqz $8,.L_bn_mul_add_words_tail	# no full group of four words

	# Main loop: four words per iteration. Each dmultu is issued early and
	# its HI/LO are collected while the previous word is being finished.
.L_bn_mul_add_words_loop:
	ld $12,0($5)
	dmultu $12,$7
	ld $13,0($4)
	ld $14,8($5)
	ld $15,8($4)
	ld $8,2*8($5)
	ld $9,2*8($4)
	daddu $13,$2
	sltu $2,$13,$2	# All manuals say it "compares 32-bit
			# values", but it seems to work fine
			# even on 64-bit registers.
	mflo $1
	mfhi $12
	daddu $13,$1
	daddu $2,$12
	dmultu $14,$7
	sltu $1,$13,$1
	sd $13,0($4)
	daddu $2,$1

	ld $10,3*8($5)
	ld $11,3*8($4)
	daddu $15,$2
	sltu $2,$15,$2
	mflo $1
	mfhi $14
	daddu $15,$1
	daddu $2,$14
	dmultu $8,$7
	sltu $1,$15,$1
	sd $15,8($4)
	daddu $2,$1

	subu $6,4
	daddu $4,4*8	# pointers advance here; later stores use negative offsets
	daddu $5,4*8
	daddu $9,$2
	sltu $2,$9,$2
	mflo $1
	mfhi $8
	daddu $9,$1
	daddu $2,$8
	dmultu $10,$7
	sltu $1,$9,$1
	sd $9,-2*8($4)
	daddu $2,$1


	and $8,$6,$3	# remaining count with low two bits cleared
	daddu $11,$2
	sltu $2,$11,$2
	mflo $1
	mfhi $10
	daddu $11,$1
	daddu $2,$10
	sltu $1,$11,$1
	sd $11,-8($4)
	.set noreorder
	bgtz $8,.L_bn_mul_add_words_loop
	daddu $2,$1	# delay slot: fold the last carry bit in

	beqz $6,.L_bn_mul_add_words_return
	nop

	# Tail: up to three remaining words, one at a time.
.L_bn_mul_add_words_tail:
	.set reorder
	ld $12,0($5)
	dmultu $12,$7
	ld $13,0($4)
	subu $6,1
	daddu $13,$2
	sltu $2,$13,$2
	mflo $1
	mfhi $12
	daddu $13,$1
	daddu $2,$12
	sltu $1,$13,$1
	sd $13,0($4)
	daddu $2,$1
	beqz $6,.L_bn_mul_add_words_return

	ld $12,8($5)
	dmultu $12,$7
	ld $13,8($4)
	subu $6,1
	daddu $13,$2
	sltu $2,$13,$2
	mflo $1
	mfhi $12
	daddu $13,$1
	daddu $2,$12
	sltu $1,$13,$1
	sd $13,8($4)
	daddu $2,$1
	beqz $6,.L_bn_mul_add_words_return

	ld $12,2*8($5)
	dmultu $12,$7
	ld $13,2*8($4)
	daddu $13,$2
	sltu $2,$13,$2
	mflo $1
	mfhi $12
	daddu $13,$1
	daddu $2,$12
	sltu $1,$13,$1
	sd $13,2*8($4)
	daddu $2,$1

.L_bn_mul_add_words_return:
	.set noreorder
	jr $31
	move $4,$2	# delay slot
.end bn_mul_add_words_internal

# bn_mul_words
#   In:  $4 = pointer to 64-bit destination words
#        $5 = pointer to 64-bit source words
#        $6 = word count (tested as signed; <= 0 returns at once)
#        $7 = 64-bit multiplier
#   Out: $2 = carry word left after the last processed word (0 if $6 <= 0);
#        $4 receives a copy of $2 on return.
#   Per word: mem[$4] = low64(mem[$5]*$7 + carry), carry = the rest.
.align 5
.globl bn_mul_words
.ent bn_mul_words
bn_mul_words:
	.set noreorder
	bgtz $6,bn_mul_words_internal
	move $2,$0	# delay slot: carry starts at 0 on both paths
	jr $31
	move $4,$2	# delay slot
.end bn_mul_words

.align 5
.ent bn_mul_words_internal
bn_mul_words_internal:
	.set reorder
	li $3,-4	# mask that clears the two low bits of the count
	and $8,$6,$3
	beqz $8,.L_bn_mul_words_tail	# no full group of four words

	# Main loop: four words per iteration.
.L_bn_mul_words_loop:
	ld $12,0($5)
	dmultu $12,$7
	ld $14,8($5)
	ld $8,2*8($5)
	ld $10,3*8($5)
	mflo $1
	mfhi $12
	daddu $2,$1	# carry + low half
	sltu $13,$2,$1	# overflow of that addition
	dmultu $14,$7
	sd $2,0($4)
	daddu $2,$13,$12	# next carry = high half + overflow

	subu $6,4
	daddu $4,4*8	# pointers advance here; later stores use negative offsets
	daddu $5,4*8
	mflo $1
	mfhi $14
	daddu $2,$1
	sltu $15,$2,$1
	dmultu $8,$7
	sd $2,-3*8($4)
	daddu $2,$15,$14

	mflo $1
	mfhi $8
	daddu $2,$1
	sltu $9,$2,$1
	dmultu $10,$7
	sd $2,-2*8($4)
	daddu $2,$9,$8

	and $8,$6,$3	# remaining count with low two bits cleared
	mflo $1
	mfhi $10
	daddu $2,$1
	sltu $11,$2,$1
	sd $2,-8($4)
	.set noreorder
	bgtz $8,.L_bn_mul_words_loop
	daddu $2,$11,$10	# delay slot: carry for the next word

	beqz $6,.L_bn_mul_words_return
	nop

	# Tail: up to three remaining words, one at a time.
.L_bn_mul_words_tail:
	.set reorder
	ld $12,0($5)
	dmultu $12,$7
	subu $6,1
	mflo $1
	mfhi $12
	daddu $2,$1
	sltu $13,$2,$1
	sd $2,0($4)
	daddu $2,$13,$12
	beqz $6,.L_bn_mul_words_return

	ld $12,8($5)
	dmultu $12,$7
	subu $6,1
	mflo $1
	mfhi $12
	daddu $2,$1
	sltu $13,$2,$1
	sd $2,8($4)
	daddu $2,$13,$12
	beqz $6,.L_bn_mul_words_return

	ld $12,2*8($5)
	dmultu $12,$7
	mflo $1
	mfhi $12
	daddu $2,$1
	sltu $13,$2,$1
	sd $2,2*8($4)
	daddu $2,$13,$12

.L_bn_mul_words_return:
	.set noreorder
	jr $31
	move $4,$2	# delay slot
.end bn_mul_words_internal

# bn_sqr_words
#   In:  $4 = pointer to destination (two 64-bit words written per input word)
#        $5 = pointer to 64-bit source words
#        $6 = word count (tested as signed; <= 0 returns at once)
#   Out: $2 = 0 (set in the entry stub and not written afterwards);
#        $4 receives a copy of $2 on return.
#   Per word: low half of the square goes to the even slot, high half to the
#   following slot.
.align 5
.globl bn_sqr_words
.ent bn_sqr_words
bn_sqr_words:
	.set noreorder
	bgtz $6,bn_sqr_words_internal
	move $2,$0	# delay slot
	jr $31
	move $4,$2	# delay slot
.end bn_sqr_words

.align 5
.ent bn_sqr_words_internal
bn_sqr_words_internal:
	.set reorder
	li $3,-4	# mask that clears the two low bits of the count
	and $8,$6,$3
	beqz $8,.L_bn_sqr_words_tail	# no full group of four words

	# Main loop: four input words (eight output words) per iteration.
.L_bn_sqr_words_loop:
	ld $12,0($5)
	dmultu $12,$12
	ld $14,8($5)
	ld $8,2*8($5)
	ld $10,3*8($5)
	mflo $13
	mfhi $12
	sd $13,0($4)
	sd $12,8($4)

	dmultu $14,$14
	subu $6,4
	daddu $4,8*8	# destination advances twice as fast as the source
	daddu $5,4*8
	mflo $15
	mfhi $14
	sd $15,-6*8($4)
	sd $14,-5*8($4)

	dmultu $8,$8
	mflo $9
	mfhi $8
	sd $9,-4*8($4)
	sd $8,-3*8($4)


	dmultu $10,$10
	and $8,$6,$3	# remaining count with low two bits cleared
	mflo $11
	mfhi $10
	sd $11,-2*8($4)

	.set noreorder
	sd $10,-8($4)
	bgtz $8,.L_bn_sqr_words_loop
	nop

	beqz $6,.L_bn_sqr_words_return
	nop

	# Tail: up to three remaining input words, one at a time.
.L_bn_sqr_words_tail:
	.set reorder
	ld $12,0($5)
	dmultu $12,$12
	subu $6,1
	mflo $13
	mfhi $12
	sd $13,0($4)
	sd $12,8($4)
	beqz $6,.L_bn_sqr_words_return

	ld $12,8($5)
	dmultu $12,$12
	subu $6,1
	mflo $13
	mfhi $12
	sd $13,2*8($4)
	sd $12,3*8($4)
	beqz $6,.L_bn_sqr_words_return

	ld $12,2*8($5)
	dmultu $12,$12
	mflo $13
	mfhi $12
	sd $13,4*8($4)
	sd $12,5*8($4)

.L_bn_sqr_words_return:
	.set noreorder
	jr $31
	move $4,$2	# delay slot

.end bn_sqr_words_internal

# bn_add_words
#   In:  $4 = pointer to 64-bit destination words
#        $5 = pointer to first 64-bit source array
#        $6 = pointer to second 64-bit source array
#        $7 = word count (tested as signed; <= 0 returns at once)
#   Out: $2 = carry out of the last processed word (0 if $7 <= 0);
#        $4 receives a copy of $2 on return.
#   Per word: mem[$4] = mem[$5] + mem[$6] + carry.
.align 5
.globl bn_add_words
.ent bn_add_words
bn_add_words:
	.set noreorder
	bgtz $7,bn_add_words_internal
	move $2,$0	# delay slot: carry starts at 0 on both paths
	jr $31
	move $4,$2	# delay slot
.end bn_add_words

.align 5
.ent bn_add_words_internal
bn_add_words_internal:
	.set reorder
	li $3,-4	# mask that clears the two low bits of the count
	and $1,$7,$3
	beqz $1,.L_bn_add_words_tail	# no full group of four words

	# Main loop: four words per iteration. Pointer increments are interleaved
	# with the loads, so later loads/stores use negative offsets.
.L_bn_add_words_loop:
	ld $12,0($5)
	ld $8,0($6)
	subu $7,4
	ld $13,8($5)
	and $1,$7,$3	# remaining count with low two bits cleared
	ld $14,2*8($5)
	daddu $6,4*8
	ld $15,3*8($5)
	daddu $4,4*8
	ld $9,-3*8($6)
	daddu $5,4*8
	ld $10,-2*8($6)
	ld $11,-8($6)
	daddu $8,$12	# sum of the two source words
	sltu $24,$8,$12	# carry from that addition
	daddu $12,$8,$2	# add incoming carry
	sltu $2,$12,$8	# carry from adding the incoming carry
	sd $12,-4*8($4)
	daddu $2,$24	# combined carry out

	daddu $9,$13
	sltu $25,$9,$13
	daddu $13,$9,$2
	sltu $2,$13,$9
	sd $13,-3*8($4)
	daddu $2,$25

	daddu $10,$14
	sltu $24,$10,$14
	daddu $14,$10,$2
	sltu $2,$14,$10
	sd $14,-2*8($4)
	daddu $2,$24

	daddu $11,$15
	sltu $25,$11,$15
	daddu $15,$11,$2
	sltu $2,$15,$11
	sd $15,-8($4)

	.set noreorder
	bgtz $1,.L_bn_add_words_loop
	daddu $2,$25	# delay slot: combined carry out of the fourth word

	beqz $7,.L_bn_add_words_return
	nop

	# Tail: up to three remaining words, one at a time.
.L_bn_add_words_tail:
	.set reorder
	ld $12,0($5)
	ld $8,0($6)
	daddu $8,$12
	subu $7,1
	sltu $24,$8,$12
	daddu $12,$8,$2
	sltu $2,$12,$8
	sd $12,0($4)
	daddu $2,$24
	beqz $7,.L_bn_add_words_return

	ld $13,8($5)
	ld $9,8($6)
	daddu $9,$13
	subu $7,1
	sltu $25,$9,$13
	daddu $13,$9,$2
	sltu $2,$13,$9
	sd $13,8($4)
	daddu $2,$25
	beqz $7,.L_bn_add_words_return

	ld $14,2*8($5)
	ld $10,2*8($6)
	daddu $10,$14
	sltu $24,$10,$14
	daddu $14,$10,$2
	sltu $2,$14,$10
	sd $14,2*8($4)
	daddu $2,$24

.L_bn_add_words_return:
	.set noreorder
	jr $31
	move $4,$2	# delay slot

.end bn_add_words_internal

# bn_sub_words
#   In:  $4 = pointer to 64-bit destination words
#        $5 = pointer to minuend words
#        $6 = pointer to subtrahend words
#        $7 = word count (tested as signed; <= 0 returns at once)
#   Out: $2 = borrow out of the last processed word (0 if $7 <= 0);
#        $4 receives a copy of that value on return.
#   Per word: mem[$4] = mem[$5] - mem[$6] - borrow.
.align 5
.globl bn_sub_words
.ent bn_sub_words
bn_sub_words:
	.set noreorder
	bgtz $7,bn_sub_words_internal
	move $2,$0	# delay slot: borrow starts at 0 on both paths
	jr $31
	move $4,$0	# delay slot
.end bn_sub_words

.align 5
.ent bn_sub_words_internal
bn_sub_words_internal:
	.set reorder
	li $3,-4	# mask that clears the two low bits of the count
	and $1,$7,$3
	beqz $1,.L_bn_sub_words_tail	# no full group of four words

	# Main loop: four words per iteration. Pointer increments are interleaved
	# with the loads, so later loads/stores use negative offsets.
.L_bn_sub_words_loop:
	ld $12,0($5)
	ld $8,0($6)
	subu $7,4
	ld $13,8($5)
	and $1,$7,$3	# remaining count with low two bits cleared
	ld $14,2*8($5)
	daddu $6,4*8
	ld $15,3*8($5)
	daddu $4,4*8
	ld $9,-3*8($6)
	daddu $5,4*8
	ld $10,-2*8($6)
	ld $11,-8($6)
	sltu $24,$12,$8	# borrow from the word subtraction
	dsubu $8,$12,$8
	dsubu $12,$8,$2	# subtract incoming borrow
	sgtu $2,$12,$8	# borrow from subtracting the incoming borrow
	sd $12,-4*8($4)
	daddu $2,$24	# combined borrow out

	sltu $25,$13,$9
	dsubu $9,$13,$9
	dsubu $13,$9,$2
	sgtu $2,$13,$9
	sd $13,-3*8($4)
	daddu $2,$25


	sltu $24,$14,$10
	dsubu $10,$14,$10
	dsubu $14,$10,$2
	sgtu $2,$14,$10
	sd $14,-2*8($4)
	daddu $2,$24

	sltu $25,$15,$11
	dsubu $11,$15,$11
	dsubu $15,$11,$2
	sgtu $2,$15,$11
	sd $15,-8($4)

	.set noreorder
	bgtz $1,.L_bn_sub_words_loop
	daddu $2,$25	# delay slot: combined borrow out of the fourth word

	beqz $7,.L_bn_sub_words_return
	nop

	# Tail: up to three remaining words, one at a time.
.L_bn_sub_words_tail:
	.set reorder
	ld $12,0($5)
	ld $8,0($6)
	subu $7,1
	sltu $24,$12,$8
	dsubu $8,$12,$8
	dsubu $12,$8,$2
	sgtu $2,$12,$8
	sd $12,0($4)
	daddu $2,$24
	beqz $7,.L_bn_sub_words_return

	ld $13,8($5)
	subu $7,1
	ld $9,8($6)
	sltu $25,$13,$9
	dsubu $9,$13,$9
	dsubu $13,$9,$2
	sgtu $2,$13,$9
	sd $13,8($4)
	daddu $2,$25
	beqz $7,.L_bn_sub_words_return

	ld $14,2*8($5)
	ld $10,2*8($6)
	sltu $24,$14,$10
	dsubu $10,$14,$10
	dsubu $14,$10,$2
	sgtu $2,$14,$10
	sd $14,2*8($4)
	daddu $2,$24

.L_bn_sub_words_return:
	.set noreorder
	jr $31
	move $4,$2	# delay slot
.end bn_sub_words_internal

# bn_div_3_words
#   In:  $4 = pointer; the words at 0($4), -8($4) and -2*8($4) are read
#        $5 = 64-bit value kept in $10 for the multiply after the division
#        $6 = 64-bit value compared with 0($4) and passed on as the divisor
#   Out: $2 = -1 if the word at 0($4) equals $6; otherwise control passes to
#        bn_div_3_words_internal with $4 = word at 0($7), $5 = word at -8($7).
.align 5
.globl bn_div_3_words
.ent bn_div_3_words
bn_div_3_words:
	.set noreorder
	move $7,$4	# we know that bn_div_words does not
			# touch $7, $10, $11 and preserves $6
			# so that we can save two arguments
			# and return address in registers
			# instead of stack:-)

	ld $4,($7)
	move $10,$5
	ld $5,-8($7)
	bne $4,$6,bn_div_3_words_internal
	nop
	li $2,-1
	jr $31
	move $4,$2	# delay slot
.end bn_div_3_words

.align 5
.ent bn_div_3_words_internal
# bn_div_3_words_internal
#   Entered with $4/$5 = two words to divide, $6 = divisor, $7 = pointer,
#   $10 = saved multiplier word. Calls bn_div_words_internal for a trial
#   quotient ($2) and remainder ($5), then lowers the quotient while
#   $10 * quotient exceeds the remainder combined with the word at -2*8($7).
#   NOTE(review): this looks like the usual long-division quotient-estimate
#   correction step - confirm against the caller.
#   Out: $2 = adjusted quotient; $4 receives a copy of $2 on return.
bn_div_3_words_internal:
	.set reorder
	move $11,$31	# keep the return address in a register across the call
	bal bn_div_words_internal
	move $31,$11
	dmultu $10,$2
	ld $14,-2*8($7)
	move $8,$0
	mfhi $13
	mflo $12
	sltu $24,$13,$5	# product high word below remainder -> nothing to fix
.L_bn_div_3_words_inner_loop:
	bnez $24,.L_bn_div_3_words_inner_loop_done
	sgeu $1,$14,$12
	seq $25,$13,$5
	and $1,$25	# $1 = high words equal and low word does not exceed $14
	sltu $15,$12,$10	# borrow for the two-word subtraction below
	daddu $5,$6	# remainder += divisor
	dsubu $13,$15
	dsubu $12,$10	# product -= $10
	sltu $24,$13,$5
	sltu $8,$5,$6	# remainder addition wrapped around
	or $24,$8
	.set noreorder
	beqz $1,.L_bn_div_3_words_inner_loop
	dsubu $2,1	# delay slot: executed whether or not the branch is taken
	daddu $2,1	# fall-through only: undo the decrement above
	.set reorder
.L_bn_div_3_words_inner_loop_done:
	.set noreorder
	jr $31
	move $4,$2	# delay slot
.end bn_div_3_words_internal

# bn_div_words
#   In:  $4 = high word of the dividend, $5 = low word, $6 = divisor
#   Out: $2 = -1 if $6 is zero; otherwise see bn_div_words_internal.
.align 5
.globl bn_div_words
.ent bn_div_words
bn_div_words:
	.set noreorder
	bnez $6,bn_div_words_internal
	li $2,-1	# I would rather signal div-by-zero
			# which can be done with 'break 7'
	jr $31
	move $4,$2	# delay slot
.end bn_div_words

# bn_div_words_internal
#   In:  $4 = high word of the dividend, $5 = low word, $6 = divisor
#   Out: $2 = quotient (copy in $4), $3 = remainder (copy in $5);
#        $6 is shifted back to its entry value before returning.
#   Executes 'break 6' if normalising would shift set bits out of $4.
#   The quotient is produced as two 32-bit halves, each estimated with ddivu
#   and then corrected downwards in an inner loop.
#   Scratch: $1, $8, $9, $12-$15, $24, $25.
.align 5
.ent bn_div_words_internal
bn_div_words_internal:
	# Entered in noreorder mode: the instruction after each branch below is
	# its delay slot.
	move $3,$0
	bltz $6,.L_bn_div_words_body	# top bit already set, no shift needed
	move $25,$3	# delay slot: shift count = 0
	dsll $6,1
	bgtz $6,.-4	# keep shifting until the top bit of $6 is set
	addu $25,1	# delay slot: count the shift

	.set reorder
	negu $13,$25
	li $14,-1
	dsll $14,$13
	and $14,$4	# bits of $4 that the left shift would discard
	dsrl $1,$5,$13	# bits of $5 that move up into $4
	.set noreorder
	beqz $14,.+12
	nop
	break 6	# signal overflow
	.set reorder
	dsll $4,$25
	dsll $5,$25
	or $4,$1
.L_bn_div_words_body:
	dsrl $3,$6,4*8	# bits
	sgeu $1,$4,$6
	.set noreorder
	beqz $1,.+12	# skip the subtraction when $4 < $6
	nop
	dsubu $4,$6
	.set reorder

	# Upper quotient half.
	li $8,-1
	dsrl $9,$4,4*8	# bits
	dsrl $8,4*8	# q=0xffffffff
	beq $3,$9,.L_bn_div_words_skip_div1
	ddivu $0,$4,$3
	mflo $8
.L_bn_div_words_skip_div1:
	dmultu $6,$8
	dsll $15,$4,4*8	# bits
	dsrl $1,$5,4*8	# bits
	or $15,$1
	mflo $12
	mfhi $13
.L_bn_div_words_inner_loop1:
	sltu $14,$15,$12
	seq $24,$9,$13
	sltu $1,$9,$13
	and $14,$24
	sltu $2,$12,$6	# borrow for the two-word subtraction below
	or $1,$14	# $1 != 0 while the product is still too large
	.set noreorder
	beqz $1,.L_bn_div_words_inner_loop1_done
	dsubu $13,$2	# delay slot
	dsubu $12,$6
	b .L_bn_div_words_inner_loop1
	dsubu $8,1	# delay slot: lower the quotient estimate
	.set reorder
.L_bn_div_words_inner_loop1_done:

	dsll $5,4*8	# bits
	dsubu $4,$15,$12
	dsll $2,$8,4*8	# bits

	# Lower quotient half.
	li $8,-1
	dsrl $9,$4,4*8	# bits
	dsrl $8,4*8	# q=0xffffffff
	beq $3,$9,.L_bn_div_words_skip_div2
	ddivu $0,$4,$3
	mflo $8
.L_bn_div_words_skip_div2:
	dmultu $6,$8
	dsll $15,$4,4*8	# bits
	dsrl $1,$5,4*8	# bits
	or $15,$1
	mflo $12
	mfhi $13
.L_bn_div_words_inner_loop2:
	sltu $14,$15,$12
	seq $24,$9,$13
	sltu $1,$9,$13
	and $14,$24
	sltu $3,$12,$6	# borrow for the two-word subtraction below
	or $1,$14	# $1 != 0 while the product is still too large
	.set noreorder
	beqz $1,.L_bn_div_words_inner_loop2_done
	dsubu $13,$3	# delay slot
	dsubu $12,$6
	b .L_bn_div_words_inner_loop2
	dsubu $8,1	# delay slot: lower the quotient estimate
	.set reorder
.L_bn_div_words_inner_loop2_done:

	dsubu $4,$15,$12
	or $2,$8
	dsrl $3,$4,$25	# $3 contains remainder if anybody wants it
	dsrl $6,$25	# restore $6

	.set noreorder
	move $5,$3
	jr $31
	move $4,$2	# delay slot
.end bn_div_words_internal

# bn_mul_comba8
#   In:  $4 = r, pointer to 16 64-bit result words
#        $5 = a, pointer to 8 64-bit words
#        $6 = b, pointer to 8 64-bit words
#   Saves $16-$21 in a 6*8-byte stack frame and restores them before return.
#   Register use after the loads:
#        a[0..7] = $12,$13,$14,$15,$16,$18,$20,$5
#        b[0..7] = $8,$9,$10,$11,$17,$19,$21,$6
#        $2/$3/$7 rotate as the three column accumulators (c1,c2,c3 in the
#        comments); $24/$25 hold LO/HI of a product; $1 is carry scratch.
#   The comment on each dmultu names the product being started; the
#   instructions around it accumulate the previously started product.
.align 5
.globl bn_mul_comba8
.ent bn_mul_comba8
bn_mul_comba8:
	.set noreorder
	.frame $29,6*8,$31
	.mask 0x003f0000,-8
	dsubu $29,6*8
	sd $21,5*8($29)
	sd $20,4*8($29)
	sd $19,3*8($29)
	sd $18,2*8($29)
	sd $17,1*8($29)
	sd $16,0*8($29)

	.set reorder
	ld $12,0($5)	# If compiled with -mips3 option on
			# R5000 box assembler barks on this
			# line with "should not have mult/div
			# as last instruction in bb (R10K
			# bug)" warning. If anybody out there
			# has a clue about how to circumvent
			# this do send me a note.
			#	<appro@fy.chalmers.se>

	ld $8,0($6)
	ld $13,8($5)
	ld $14,2*8($5)
	dmultu $12,$8	# mul_add_c(a[0],b[0],c1,c2,c3);
	ld $15,3*8($5)
	ld $9,8($6)
	ld $10,2*8($6)
	ld $11,3*8($6)
	mflo $2
	mfhi $3

	ld $16,4*8($5)
	ld $18,5*8($5)
	dmultu $12,$9	# mul_add_c(a[0],b[1],c2,c3,c1);
	ld $20,6*8($5)
	ld $5,7*8($5)	# $5 now holds a[7]; the a pointer is no longer needed
	ld $17,4*8($6)
	ld $19,5*8($6)
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $13,$8	# mul_add_c(a[1],b[0],c2,c3,c1);
	daddu $7,$25,$1
	ld $21,6*8($6)
	ld $6,7*8($6)	# $6 now holds b[7]; the b pointer is no longer needed
	sd $2,0($4)	# r[0]=c1;
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $14,$8	# mul_add_c(a[2],b[0],c3,c1,c2);
	daddu $25,$1
	daddu $7,$25
	sltu $2,$7,$25
	sd $3,8($4)	# r[1]=c2;

	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $13,$9	# mul_add_c(a[1],b[1],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $12,$10	# mul_add_c(a[0],b[2],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $3,$2,$25
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $12,$11	# mul_add_c(a[0],b[3],c1,c2,c3);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	sd $7,2*8($4)	# r[2]=c3;

	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $13,$10	# mul_add_c(a[1],b[2],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $7,$3,$25
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $14,$9	# mul_add_c(a[2],b[1],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $15,$8	# mul_add_c(a[3],b[0],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $16,$8	# mul_add_c(a[4],b[0],c2,c3,c1);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	sd $2,3*8($4)	# r[3]=c1;

	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $15,$9	# mul_add_c(a[3],b[1],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $2,$7,$25
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $14,$10	# mul_add_c(a[2],b[2],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $13,$11	# mul_add_c(a[1],b[3],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $12,$17	# mul_add_c(a[0],b[4],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $12,$19	# mul_add_c(a[0],b[5],c3,c1,c2);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	sd $3,4*8($4)	# r[4]=c2;

	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $13,$17	# mul_add_c(a[1],b[4],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $3,$2,$25
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $14,$11	# mul_add_c(a[2],b[3],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $15,$10	# mul_add_c(a[3],b[2],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $16,$9	# mul_add_c(a[4],b[1],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $18,$8	# mul_add_c(a[5],b[0],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $20,$8	# mul_add_c(a[6],b[0],c1,c2,c3);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	sd $7,5*8($4)	# r[5]=c3;

	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $18,$9	# mul_add_c(a[5],b[1],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $7,$3,$25
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $16,$10	# mul_add_c(a[4],b[2],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $15,$11	# mul_add_c(a[3],b[3],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $14,$17	# mul_add_c(a[2],b[4],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $13,$19	# mul_add_c(a[1],b[5],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $12,$21	# mul_add_c(a[0],b[6],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $12,$6	# mul_add_c(a[0],b[7],c2,c3,c1);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	sd $2,6*8($4)	# r[6]=c1;

	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $13,$21	# mul_add_c(a[1],b[6],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $2,$7,$25
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $14,$19	# mul_add_c(a[2],b[5],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $15,$17	# mul_add_c(a[3],b[4],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $16,$11	# mul_add_c(a[4],b[3],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $18,$10	# mul_add_c(a[5],b[2],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $20,$9	# mul_add_c(a[6],b[1],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $5,$8	# mul_add_c(a[7],b[0],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $5,$9	# mul_add_c(a[7],b[1],c3,c1,c2);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	sd $3,7*8($4)	# r[7]=c2;

	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $20,$10	# mul_add_c(a[6],b[2],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $3,$2,$25
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $18,$11	# mul_add_c(a[5],b[3],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $16,$17	# mul_add_c(a[4],b[4],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $15,$19	# mul_add_c(a[3],b[5],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $14,$21	# mul_add_c(a[2],b[6],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $13,$6	# mul_add_c(a[1],b[7],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $14,$6	# mul_add_c(a[2],b[7],c1,c2,c3);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	sd $7,8*8($4)	# r[8]=c3;

	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $15,$21	# mul_add_c(a[3],b[6],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $7,$3,$25
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $16,$19	# mul_add_c(a[4],b[5],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $18,$17	# mul_add_c(a[5],b[4],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $20,$11	# mul_add_c(a[6],b[3],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $5,$10	# mul_add_c(a[7],b[2],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $5,$11	# mul_add_c(a[7],b[3],c2,c3,c1);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	sd $2,9*8($4)	# r[9]=c1;

	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $20,$17	# mul_add_c(a[6],b[4],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $2,$7,$25
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $18,$19	# mul_add_c(a[5],b[5],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $16,$21	# mul_add_c(a[4],b[6],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $15,$6	# mul_add_c(a[3],b[7],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $16,$6	# mul_add_c(a[4],b[7],c3,c1,c2);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	sd $3,10*8($4)	# r[10]=c2;

	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $18,$21	# mul_add_c(a[5],b[6],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $3,$2,$25
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $20,$19	# mul_add_c(a[6],b[5],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $5,$17	# mul_add_c(a[7],b[4],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $5,$19	# mul_add_c(a[7],b[5],c1,c2,c3);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	sd $7,11*8($4)	# r[11]=c3;

	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $20,$21	# mul_add_c(a[6],b[6],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $7,$3,$25
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $18,$6	# mul_add_c(a[5],b[7],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $20,$6	# mul_add_c(a[6],b[7],c2,c3,c1);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	sd $2,12*8($4)	# r[12]=c1;

	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $5,$21	# mul_add_c(a[7],b[6],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $2,$7,$25
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $5,$6	# mul_add_c(a[7],b[7],c3,c1,c2);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	sd $3,13*8($4)	# r[13]=c2;

	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	daddu $25,$1
	daddu $2,$25
	sd $7,14*8($4)	# r[14]=c3;
	sd $2,15*8($4)	# r[15]=c1;

	.set noreorder
	ld $21,5*8($29)
	ld $20,4*8($29)
	ld $19,3*8($29)
	ld $18,2*8($29)
	ld $17,1*8($29)
	ld $16,0*8($29)
	jr $31
	daddu $29,6*8	# delay slot: release the stack frame
.end bn_mul_comba8

# bn_mul_comba4
#   In:  $4 = r, pointer to 8 64-bit result words
#        $5 = a, pointer to 4 64-bit words
#        $6 = b, pointer to 4 64-bit words
#   Register use: a[0..3] = $12,$13,$14,$15; b[0..3] = $8,$9,$10,$11;
#   $2/$3/$7 rotate as the three column accumulators (c1,c2,c3 in the
#   comments); $24/$25 hold LO/HI of a product; $1 is carry scratch.
#   No stack frame is used. The comment on each dmultu names the product
#   being started; the surrounding instructions accumulate the previous one.
.align 5
.globl bn_mul_comba4
.ent bn_mul_comba4
bn_mul_comba4:
	.set reorder
	ld $12,0($5)
	ld $8,0($6)
	ld $13,8($5)
	ld $14,2*8($5)
	dmultu $12,$8	# mul_add_c(a[0],b[0],c1,c2,c3);
	ld $15,3*8($5)
	ld $9,8($6)
	ld $10,2*8($6)
	ld $11,3*8($6)
	mflo $2
	mfhi $3
	sd $2,0($4)

	dmultu $12,$9	# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $13,$8	# mul_add_c(a[1],b[0],c2,c3,c1);
	daddu $7,$25,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $14,$8	# mul_add_c(a[2],b[0],c3,c1,c2);
	daddu $25,$1
	daddu $7,$25
	sltu $2,$7,$25
	sd $3,8($4)

	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $13,$9	# mul_add_c(a[1],b[1],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $12,$10	# mul_add_c(a[0],b[2],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $3,$2,$25
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $12,$11	# mul_add_c(a[0],b[3],c1,c2,c3);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	sd $7,2*8($4)

	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $13,$10	# mul_add_c(a[1],b[2],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $7,$3,$25
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $14,$9	# mul_add_c(a[2],b[1],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $15,$8	# mul_add_c(a[3],b[0],c1,c2,c3);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	dmultu $15,$9	# mul_add_c(a[3],b[1],c2,c3,c1);
	daddu $25,$1
	daddu $3,$25
	sltu $1,$3,$25
	daddu $7,$1
	sd $2,3*8($4)

	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $14,$10	# mul_add_c(a[2],b[2],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $2,$7,$25
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $13,$11	# mul_add_c(a[1],b[3],c2,c3,c1);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	mflo $24
	mfhi $25
	daddu $3,$24
	sltu $1,$3,$24
	dmultu $14,$11	# mul_add_c(a[2],b[3],c3,c1,c2);
	daddu $25,$1
	daddu $7,$25
	sltu $1,$7,$25
	daddu $2,$1
	sd $3,4*8($4)

	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $15,$10	# mul_add_c(a[3],b[2],c3,c1,c2);
	daddu $25,$1
	daddu $2,$25
	sltu $3,$2,$25
	mflo $24
	mfhi $25
	daddu $7,$24
	sltu $1,$7,$24
	dmultu $15,$11	# mul_add_c(a[3],b[3],c1,c2,c3);
	daddu $25,$1
	daddu $2,$25
	sltu $1,$2,$25
	daddu $3,$1
	sd $7,5*8($4)

	mflo $24
	mfhi $25
	daddu $2,$24
	sltu $1,$2,$24
	daddu $25,$1
	daddu $3,$25
	sd $2,6*8($4)
	sd $3,7*8($4)

	.set noreorder
	jr $31
	nop
.end bn_mul_comba4

.align 5
.globl bn_sqr_comba8
.ent bn_sqr_comba8
bn_sqr_comba8:
	# Square an 8-word integer, column by column ("comba" order).
	#
	#   $5  -> a[0..7]   input,  eight 64-bit words, a[0] least significant
	#   $4  -> r[0..15]  output, sixteen 64-bit words, r[0] least significant
	#
	# Register usage inside the routine:
	#   $12,$13,$14,$15 = a[0],a[1],a[2],a[3]
	#   $8,$9,$10,$11   = a[4],a[5],a[6],a[7]
	#   $24 / $25       = low / high half of the most recently fetched product
	#   $2,$3,$7        = three-word column accumulator; after each column is
	#                     stored the roles rotate (low word stored, the other
	#                     two become the low/middle words of the next column)
	#   $1 ($at), $6    = carry scratch
	# HI/LO and every register listed above are clobbered; nothing is returned.
	#
	# Each dmultu is issued ahead of time: its result is picked up by the
	# *next* mflo/mfhi pair, so the multiplier runs while the previous product
	# is still being accumulated.
	#
	# Off-diagonal products a[i]*a[j] (i != j) count twice in a square.  For
	# column 1 the product is doubled by shifting; for all later columns the
	# (hi:lo) pair is simply added to the accumulator twice, with explicit
	# carry tracking via sltu.  Diagonal products a[i]*a[i] are added once.
	.set	reorder
	ld	$12,0($5)		# a[0]
	ld	$13,8($5)		# a[1]
	ld	$14,2*8($5)		# a[2]
	ld	$15,3*8($5)		# a[3]

	# ---- column 0: a[0]*a[0] ----
	dmultu	$12,$12			# issue a[0]*a[0]
	ld	$8,4*8($5)		# a[4]
	ld	$9,5*8($5)		# a[5]
	ld	$10,6*8($5)		# a[6]
	ld	$11,7*8($5)		# a[7]
	mflo	$2
	mfhi	$3			# $3 carries into column 1
	sd	$2,0($4)		# r[0]

	# ---- column 1: 2*a[0]*a[1], doubled by a 128-bit left shift ----
	dmultu	$12,$13			# issue a[0]*a[1]
	mflo	$24
	mfhi	$25
	slt	$2,$25,$0		# $2 = bit shifted out of hi (top word)
	dsll	$25,1
	dmultu	$14,$12			# issue a[2]*a[0] (column 2)
	slt	$6,$24,$0		# $6 = bit shifted out of lo
	daddu	$25,$6			# hi = 2*hi + msb(lo)
	dsll	$24,1			# lo = 2*lo
	daddu	$3,$24
	sltu	$1,$3,$24		# carry out of the low word
	daddu	$7,$25,$1		# middle word = doubled hi + carry
	sd	$3,8($4)		# r[1]
	# The addition above wraps to zero when the doubled hi is all ones and
	# the low-word carry is 1 (e.g. a[0]=2^64-2, a[1]=2^63+1).  Detect the
	# wrap while $25 still holds the doubled hi and push it into the top word.
	sltu	$1,$7,$25
	daddu	$2,$1

	# ---- column 2: 2*a[2]*a[0] + a[1]*a[1] ----
	mflo	$24			# (hi:lo) = a[2]*a[0], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$13,$13			# issue a[1]*a[1] (column 2)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$3,$2,$1		# fresh top word: first carry of this column
	daddu	$2,$25
	sltu	$25,$2,$25
	daddu	$3,$25
	mflo	$24			# (hi:lo) = a[1]*a[1], added once
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$12,$15			# issue a[0]*a[3] (column 3)
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	sd	$7,2*8($4)		# r[2]

	# ---- column 3: 2*a[0]*a[3] + 2*a[1]*a[2] ----
	mflo	$24			# (hi:lo) = a[0]*a[3], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$13,$14			# issue a[1]*a[2] (column 3)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$7,$3,$1		# fresh top word
	daddu	$3,$25
	sltu	$25,$3,$25
	daddu	$7,$25
	mflo	$24			# (hi:lo) = a[1]*a[2], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$8,$12			# issue a[4]*a[0] (column 4)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$1,$3,$1
	daddu	$3,$25
	daddu	$7,$1
	sltu	$25,$3,$25
	daddu	$7,$25
	sd	$2,3*8($4)		# r[3]

	# ---- column 4: 2*a[4]*a[0] + 2*a[3]*a[1] + a[2]*a[2] ----
	mflo	$24			# (hi:lo) = a[4]*a[0], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$15,$13			# issue a[3]*a[1] (column 4)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$2,$7,$1		# fresh top word
	daddu	$7,$25
	sltu	$25,$7,$25
	daddu	$2,$25
	mflo	$24			# (hi:lo) = a[3]*a[1], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$14,$14			# issue a[2]*a[2] (column 4)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$1,$7,$1
	daddu	$7,$25
	daddu	$2,$1
	sltu	$25,$7,$25
	daddu	$2,$25
	mflo	$24			# (hi:lo) = a[2]*a[2], added once
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$12,$9			# issue a[0]*a[5] (column 5)
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	sd	$3,4*8($4)		# r[4]

	# ---- column 5: 2*a[0]*a[5] + 2*a[1]*a[4] + 2*a[2]*a[3] ----
	mflo	$24			# (hi:lo) = a[0]*a[5], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$13,$8			# issue a[1]*a[4] (column 5)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$3,$2,$1		# fresh top word
	daddu	$2,$25
	sltu	$25,$2,$25
	daddu	$3,$25
	mflo	$24			# (hi:lo) = a[1]*a[4], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$14,$15			# issue a[2]*a[3] (column 5)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$1,$2,$1
	daddu	$2,$25
	daddu	$3,$1
	sltu	$25,$2,$25
	daddu	$3,$25
	mflo	$24			# (hi:lo) = a[2]*a[3], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$10,$12			# issue a[6]*a[0] (column 6)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$1,$2,$1
	daddu	$2,$25
	daddu	$3,$1
	sltu	$25,$2,$25
	daddu	$3,$25
	sd	$7,5*8($4)		# r[5]

	# ---- column 6: 2*a[6]*a[0] + 2*a[5]*a[1] + 2*a[4]*a[2] + a[3]*a[3] ----
	mflo	$24			# (hi:lo) = a[6]*a[0], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$9,$13			# issue a[5]*a[1] (column 6)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$7,$3,$1		# fresh top word
	daddu	$3,$25
	sltu	$25,$3,$25
	daddu	$7,$25
	mflo	$24			# (hi:lo) = a[5]*a[1], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$8,$14			# issue a[4]*a[2] (column 6)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$1,$3,$1
	daddu	$3,$25
	daddu	$7,$1
	sltu	$25,$3,$25
	daddu	$7,$25
	mflo	$24			# (hi:lo) = a[4]*a[2], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$15,$15			# issue a[3]*a[3] (column 6)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$1,$3,$1
	daddu	$3,$25
	daddu	$7,$1
	sltu	$25,$3,$25
	daddu	$7,$25
	mflo	$24			# (hi:lo) = a[3]*a[3], added once
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$12,$11			# issue a[0]*a[7] (column 7)
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	sd	$2,6*8($4)		# r[6]

	# ---- column 7: 2*a[0]*a[7] + 2*a[1]*a[6] + 2*a[2]*a[5] + 2*a[3]*a[4] ----
	mflo	$24			# (hi:lo) = a[0]*a[7], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$13,$10			# issue a[1]*a[6] (column 7)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$2,$7,$1		# fresh top word
	daddu	$7,$25
	sltu	$25,$7,$25
	daddu	$2,$25
	mflo	$24			# (hi:lo) = a[1]*a[6], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$14,$9			# issue a[2]*a[5] (column 7)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$1,$7,$1
	daddu	$7,$25
	daddu	$2,$1
	sltu	$25,$7,$25
	daddu	$2,$25
	mflo	$24			# (hi:lo) = a[2]*a[5], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$15,$8			# issue a[3]*a[4] (column 7)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$1,$7,$1
	daddu	$7,$25
	daddu	$2,$1
	sltu	$25,$7,$25
	daddu	$2,$25
	mflo	$24			# (hi:lo) = a[3]*a[4], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$11,$13			# issue a[7]*a[1] (column 8)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$1,$7,$1
	daddu	$7,$25
	daddu	$2,$1
	sltu	$25,$7,$25
	daddu	$2,$25
	sd	$3,7*8($4)		# r[7]

	# ---- column 8: 2*a[7]*a[1] + 2*a[6]*a[2] + 2*a[5]*a[3] + a[4]*a[4] ----
	mflo	$24			# (hi:lo) = a[7]*a[1], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$10,$14			# issue a[6]*a[2] (column 8)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$3,$2,$1		# fresh top word
	daddu	$2,$25
	sltu	$25,$2,$25
	daddu	$3,$25
	mflo	$24			# (hi:lo) = a[6]*a[2], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$9,$15			# issue a[5]*a[3] (column 8)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$1,$2,$1
	daddu	$2,$25
	daddu	$3,$1
	sltu	$25,$2,$25
	daddu	$3,$25
	mflo	$24			# (hi:lo) = a[5]*a[3], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$8,$8			# issue a[4]*a[4] (column 8)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$1,$2,$1
	daddu	$2,$25
	daddu	$3,$1
	sltu	$25,$2,$25
	daddu	$3,$25
	mflo	$24			# (hi:lo) = a[4]*a[4], added once
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$14,$11			# issue a[2]*a[7] (column 9)
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	sd	$7,8*8($4)		# r[8]

	# ---- column 9: 2*a[2]*a[7] + 2*a[3]*a[6] + 2*a[4]*a[5] ----
	mflo	$24			# (hi:lo) = a[2]*a[7], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$15,$10			# issue a[3]*a[6] (column 9)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$7,$3,$1		# fresh top word
	daddu	$3,$25
	sltu	$25,$3,$25
	daddu	$7,$25
	mflo	$24			# (hi:lo) = a[3]*a[6], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$8,$9			# issue a[4]*a[5] (column 9)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$1,$3,$1
	daddu	$3,$25
	daddu	$7,$1
	sltu	$25,$3,$25
	daddu	$7,$25
	mflo	$24			# (hi:lo) = a[4]*a[5], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$11,$15			# issue a[7]*a[3] (column 10)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$1,$3,$1
	daddu	$3,$25
	daddu	$7,$1
	sltu	$25,$3,$25
	daddu	$7,$25
	sd	$2,9*8($4)		# r[9]

	# ---- column 10: 2*a[7]*a[3] + 2*a[6]*a[4] + a[5]*a[5] ----
	mflo	$24			# (hi:lo) = a[7]*a[3], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$10,$8			# issue a[6]*a[4] (column 10)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$2,$7,$1		# fresh top word
	daddu	$7,$25
	sltu	$25,$7,$25
	daddu	$2,$25
	mflo	$24			# (hi:lo) = a[6]*a[4], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$9,$9			# issue a[5]*a[5] (column 10)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$1,$7,$1
	daddu	$7,$25
	daddu	$2,$1
	sltu	$25,$7,$25
	daddu	$2,$25
	mflo	$24			# (hi:lo) = a[5]*a[5], added once
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$8,$11			# issue a[4]*a[7] (column 11)
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	sd	$3,10*8($4)		# r[10]

	# ---- column 11: 2*a[4]*a[7] + 2*a[5]*a[6] ----
	mflo	$24			# (hi:lo) = a[4]*a[7], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$9,$10			# issue a[5]*a[6] (column 11)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$3,$2,$1		# fresh top word
	daddu	$2,$25
	sltu	$25,$2,$25
	daddu	$3,$25
	mflo	$24			# (hi:lo) = a[5]*a[6], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$11,$9			# issue a[7]*a[5] (column 12)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$1,$2,$1
	daddu	$2,$25
	daddu	$3,$1
	sltu	$25,$2,$25
	daddu	$3,$25
	sd	$7,11*8($4)		# r[11]

	# ---- column 12: 2*a[7]*a[5] + a[6]*a[6] ----
	mflo	$24			# (hi:lo) = a[7]*a[5], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$10,$10			# issue a[6]*a[6] (column 12)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$7,$3,$1		# fresh top word
	daddu	$3,$25
	sltu	$25,$3,$25
	daddu	$7,$25
	mflo	$24			# (hi:lo) = a[6]*a[6], added once
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$10,$11			# issue a[6]*a[7] (column 13)
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	sd	$2,12*8($4)		# r[12]

	# ---- column 13: 2*a[6]*a[7] ----
	mflo	$24			# (hi:lo) = a[6]*a[7], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$11,$11			# issue a[7]*a[7] (column 14)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$2,$7,$1		# fresh top word
	daddu	$7,$25
	sltu	$25,$7,$25
	daddu	$2,$25
	sd	$3,13*8($4)		# r[13]

	# ---- columns 14 and 15: a[7]*a[7] plus the remaining carries ----
	mflo	$24			# (hi:lo) = a[7]*a[7], added once
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	daddu	$25,$1
	daddu	$2,$25
	sd	$7,14*8($4)		# r[14]
	sd	$2,15*8($4)		# r[15]

	.set	noreorder
	jr	$31			# return
	nop				# branch delay slot
.end	bn_sqr_comba8

# bn_sqr_comba4: square a 4-word integer, column by column ("comba" order).
#
#   $5  -> a[0..3]  input,  four 64-bit words, a[0] least significant
#   $4  -> r[0..7]  output, eight 64-bit words, r[0] least significant
#
# Register usage inside the routine:
#   $12,$13,$14,$15 = a[0],a[1],a[2],a[3]
#   $24 / $25       = low / high half of the most recently fetched product
#   $2,$3,$7        = three-word column accumulator; after each column is
#                     stored the roles rotate
#   $1 ($at), $6    = carry scratch
# HI/LO and every register listed above are clobbered; nothing is returned.
#
# Each dmultu is issued ahead of time and its result is picked up by the
# *next* mflo/mfhi pair.  Off-diagonal products a[i]*a[j] (i != j) count
# twice: column 1 doubles by shifting, later columns add (hi:lo) twice with
# explicit carry tracking.  Diagonal products a[i]*a[i] are added once.
.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
	.set	reorder
	ld	$12,0($5)		# a[0]
	ld	$13,8($5)		# a[1]

	# ---- column 0: a[0]*a[0] ----
	dmultu	$12,$12			# issue a[0]*a[0]
	ld	$14,2*8($5)		# a[2]
	ld	$15,3*8($5)		# a[3]
	mflo	$2
	mfhi	$3			# $3 carries into column 1
	sd	$2,0($4)		# r[0]

	# ---- column 1: 2*a[0]*a[1], doubled by a 128-bit left shift ----
	dmultu	$12,$13			# issue a[0]*a[1]
	mflo	$24
	mfhi	$25
	slt	$2,$25,$0		# $2 = bit shifted out of hi (top word)
	dsll	$25,1
	dmultu	$14,$12			# issue a[2]*a[0] (column 2)
	slt	$6,$24,$0		# $6 = bit shifted out of lo
	daddu	$25,$6			# hi = 2*hi + msb(lo)
	dsll	$24,1			# lo = 2*lo
	daddu	$3,$24
	sltu	$1,$3,$24		# carry out of the low word
	daddu	$7,$25,$1		# middle word = doubled hi + carry
	sd	$3,8($4)		# r[1]
	# The addition above wraps to zero when the doubled hi is all ones and
	# the low-word carry is 1 (e.g. a[0]=2^64-2, a[1]=2^63+1).  Detect the
	# wrap while $25 still holds the doubled hi and push it into the top word.
	sltu	$1,$7,$25
	daddu	$2,$1

	# ---- column 2: 2*a[2]*a[0] + a[1]*a[1] ----
	mflo	$24			# (hi:lo) = a[2]*a[0], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$13,$13			# issue a[1]*a[1] (column 2)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$3,$2,$1		# fresh top word: first carry of this column
	daddu	$2,$25
	sltu	$25,$2,$25
	daddu	$3,$25
	mflo	$24			# (hi:lo) = a[1]*a[1], added once
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$12,$15			# issue a[0]*a[3] (column 3)
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	sd	$7,2*8($4)		# r[2]

	# ---- column 3: 2*a[0]*a[3] + 2*a[1]*a[2] ----
	mflo	$24			# (hi:lo) = a[0]*a[3], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$13,$14			# issue a[1]*a[2] (column 3)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$7,$3,$1		# fresh top word
	daddu	$3,$25
	sltu	$25,$3,$25
	daddu	$7,$25
	mflo	$24			# (hi:lo) = a[1]*a[2], added twice
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	$15,$13			# issue a[3]*a[1] (column 4)
	daddu	$2,$24
	daddu	$1,$25
	sltu	$24,$2,$24
	daddu	$3,$1
	daddu	$25,$24
	sltu	$1,$3,$1
	daddu	$3,$25
	daddu	$7,$1
	sltu	$25,$3,$25
	daddu	$7,$25
	sd	$2,3*8($4)		# r[3]

	# ---- column 4: 2*a[3]*a[1] + a[2]*a[2] ----
	mflo	$24			# (hi:lo) = a[3]*a[1], added twice
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$14,$14			# issue a[2]*a[2] (column 4)
	daddu	$3,$24
	daddu	$1,$25
	sltu	$24,$3,$24
	daddu	$7,$1
	daddu	$25,$24
	sltu	$2,$7,$1		# fresh top word
	daddu	$7,$25
	sltu	$25,$7,$25
	daddu	$2,$25
	mflo	$24			# (hi:lo) = a[2]*a[2], added once
	mfhi	$25
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	$14,$15			# issue a[2]*a[3] (column 5)
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	sd	$3,4*8($4)		# r[4]

	# ---- column 5: 2*a[2]*a[3] ----
	mflo	$24			# (hi:lo) = a[2]*a[3], added twice
	mfhi	$25
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	$15,$15			# issue a[3]*a[3] (column 6)
	daddu	$7,$24
	daddu	$1,$25
	sltu	$24,$7,$24
	daddu	$2,$1
	daddu	$25,$24
	sltu	$3,$2,$1		# fresh top word
	daddu	$2,$25
	sltu	$25,$2,$25
	daddu	$3,$25
	sd	$7,5*8($4)		# r[5]

	# ---- columns 6 and 7: a[3]*a[3] plus the remaining carries ----
	mflo	$24			# (hi:lo) = a[3]*a[3], added once
	mfhi	$25
	daddu	$2,$24
	sltu	$1,$2,$24
	daddu	$25,$1
	daddu	$3,$25
	sd	$2,6*8($4)		# r[6]
	sd	$3,7*8($4)		# r[7]

	.set	noreorder
	jr	$31			# return
	nop				# branch delay slot
.end	bn_sqr_comba4