// ============================================================================
// Montgomery multiplication for ARMv8 (AArch64), GNU as syntax.
// CRYPTOGAMS-style code (see the trailing .byte string, which decodes to
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>").
//
// Entry point:
//   bn_mul_mont(rp, ap, bp, np, n0p, num)
//     x0 = rp   : result, num 64-bit words
//     x1 = ap   : multiplicand a[]
//     x2 = bp   : multiplier b[]
//     x3 = np   : modulus n[]
//     x4 = n0p  : pointer to n0 = -n[0]^-1 mod 2^64 (dereferenced on entry)
//     x5 = num  : number of 64-bit words
//   Returns x0 = 1 (see "mov x0,#1" in each epilogue).
//   NOTE(review): argument roles above are read off the register usage below;
//   the exact C prototype lives in the caller's header — confirm there.
//
// Dispatch: num % 8 == 0 -> __bn_sqr8x_mont (which itself falls back to
// __bn_mul4x_mont when ap != bp); num % 4 == 0 -> __bn_mul4x_mont;
// otherwise the generic 1-word-at-a-time loop below.
//
// Temporary storage t[num] ("tp") is carved off the stack with sp kept
// 16-byte aligned as AAPCS64 requires.  Carry-flag scheduling is
// "modulo-scheduled": an adc issued at the top of a loop iteration consumes
// the carry produced at the bottom of the previous one — do NOT reorder
// instructions here, the flag liveness is exact.
// ============================================================================
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont		// num divisible by 8
	tst	x5,#3
	b.eq	__bn_mul4x_mont		// num divisible by 4
.Lmul_mont:
	// Prologue: x29/x30 plus callee-saved x19-x24 (AAPCS64).
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3		// x22 = sp - num*8, future tp
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3		// num *= 8 (byte count from here on)
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so (16-byte align tp/sp)
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0  (m1, the Montgomery factor)
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*) sets C iff x6 != 0
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

// First outer iteration (i=0): tp[] = ap[]*bp[0] + m1*np[], scaled down
// one word per step; the adds/adc pairs propagate the two carry chains.
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

// Outer loop over remaining b[i] words: tp[] = (tp[] + ap[]*bp[i] + m1*np[])/2^64.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4		// m1 = tp[0]*n0 for this iteration
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*) same carry trick as above
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Branch-free conditional copy (csel on the borrow flag) keeps this
	// constant-time with respect to the compare-against-modulus result;
	// tp is wiped with zeros as it is consumed.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	// Epilogue: restore callee-saved regs, release the alloca'd tp.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont

// ----------------------------------------------------------------------------
// __bn_sqr8x_mont: squaring-optimized path, taken when num % 8 == 0 and
// ap == bp (checked below; otherwise falls through to __bn_mul4x_mont).
// Strategy: compute all cross products a[i]*a[j] (i<j) once, double the
// accumulated result by shifting, add the diagonal a[i]^2 terms, then run
// a 512-bits-per-iteration Montgomery reduction, and finish with the usual
// subtract-modulus / conditional-copy step.
// Register roles (established by the code below): x2 walks the t[] window on
// the stack, x3 = &a[num] sentinel, x4 = n0 (also offloaded at [x29,#112]),
// x19-x26 = 8-word accumulator, x27 = loop counter, x28 = modulo-scheduled
// carry, x30 = top-most carry across reduction windows.
// ----------------------------------------------------------------------------
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont		// not a squaring after all
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// t[] needs 2*num words
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

// Zero the 2*num-word t[] area, 16 words per iteration.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// x3 = &a[num] (end sentinel)
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	//                                                 a[1]a[0]	(i)
	//                                             a[2]a[0]
	//                                         a[3]a[0]
	//                                     a[4]a[0]
	//                                 a[5]a[0]
	//                             a[6]a[0]
	//                         a[7]a[0]
	//                                         a[2]a[1]		(ii)
	//                                     a[3]a[1]
	//                                 a[4]a[1]
	//                             a[5]a[1]
	//                         a[6]a[1]
	//                     a[7]a[1]
	//                                 a[3]a[2]			(iii)
	//                             a[4]a[2]
	//                         a[5]a[2]
	//                     a[6]a[2]
	//                 a[7]a[2]
	//                         a[4]a[3]				(iv)
	//                     a[5]a[3]
	//                 a[6]a[3]
	//             a[7]a[3]
	//                 a[5]a[4]					(v)
	//             a[6]a[4]
	//         a[7]a[4]
	//         a[6]a[5]						(vi)
	//     a[7]a[5]
	// a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	// The doubling is done with extr (double-word shift-left by 1 across
	// word boundaries) interleaved with the diagonal squares.
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*) same carry-deduction trick as in .Lmul_mont
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Branch-free conditional copy on the borrow flag; also wipes the
	// stack temporaries with zeros as it goes.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

// Short path for num==8: the whole result is still in registers.
.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

// ----------------------------------------------------------------------------
// __bn_mul4x_mont: 4-words-at-a-time Montgomery multiplication, used when
// num % 4 == 0 (and as fallback from __bn_sqr8x_mont when ap != bp).
// Processes b[] one word at a time but keeps a 4-word accumulator column
// (x19-x22, overflow in x23/x0) in registers; x28 cycles 0,8,16,24,0,...
// ("and x28,x28,#31") to index the current b[i]/stashed t[0]*n0 within a
// 4-word group.  x30 carries the top-most overflow between outer passes.
// ----------------------------------------------------------------------------
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca: t[num] + 4 scratch words

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = &a[num] sentinel
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// x28 wraps 0..24, group-local index
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) same carry trick: C iff x19 != 0
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// bank the top-most carry in x30
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30		// fold in banked top-most carry
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Branch-free conditional copy on the borrow flag; wipes t[] with
	// zeros as it is consumed.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

// Short path for num==4: the whole result is still in registers.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont

// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4