// Montgomery multiplication for ARMv8 (AArch64), GNU as syntax.
//
// bn_mul_mont(rp, ap, bp, np, n0, num):
//   x0 = rp  (result vector, num 64-bit limbs)
//   x1 = ap, x2 = bp (multiplicands), x3 = np (modulus)
//   x4 = &n0 (Montgomery constant, loaded with ldr x4,[x4] below)
//   x5 = num (limb count)
// Returns 1 in x0.  The num-limb temporary tp[] lives on the stack
// (sp is lowered below and restored from x29 on exit).
// NOTE(review): argument roles are taken from the in-line comments
// (bp[0], ap[0..1], *n0, np[0..1], rp[j]) — confirm against the C
// prototype of bn_mul_mont.
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	// Dispatch: limb counts divisible by 8 take the 8x path (which
	// squares when ap==bp), counts divisible by 4 take the 4x path,
	// everything else falls through to the generic 1x loop.
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!	// frame; then save callee-saved x19-x24
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3		// x22 = tp = sp - num*8
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	// i=0 outer iteration: ap[]*bp[0] fused with the reduction pass.
	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*) mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*) adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

.L1st:	// inner j-loop of i=0: tp[j-1] = ap[j]*bp[0] + np[j]*m1 + carries
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

.Louter:
	// Remaining outer iterations: accumulate ap[]*bp[i] into tp[]
	// and reduce; the running top overflow bit stays in x19.
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// Same first-step elision as in the i=0 pass (see (*) note above).
	// (*) mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*) adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Branch-free select between tp[] and the subtracted rp[] based
	// on the borrow flag, wiping the stack copy of tp[] as we go.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release the tp[] allocation
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont

// Internal 8-limb path, used when num%8==0.  Squares when ap==bp (x1==x2),
// otherwise tail-calls the 4x multiply path.
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2			// squaring requires ap==bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	// __bn_sqr8x_mont, continued: finish the prologue (x29/x30 were
	// pushed just above), save callee-saved registers, and stash the
	// arguments that must survive register recycling.
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]		// a[0..7] live in x6-x13
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// 2*num limbs of scratch below sp
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	// Zero the whole t[] scratch area, 16 limbs per pass.
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// &a[num]
	add	x1,x1,#8*8
	mov	x19,xzr			// t[0..7] accumulators start at zero
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	// Cross-product schedule for one 8-limb window:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	// More windows remain: fold previously stored t[] into the
	// accumulators and continue with the next 8 limbs of a[].
	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
	// note that carry flag is guaranteed
	// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63		// shift t[] left by one bit via extr
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*) mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*) adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
	// note that carry flag is guaranteed
	// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Branch-free select between t[] and rp[] on the borrow flag,
	// wiping the stack scratch as we go.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	// num==8 special case: whole result is still in registers.
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]	// finish wiping the stack scratch

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release stack scratch
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

// Internal 4-limb-at-a-time Montgomery multiply, used when num%4==0
// (and as the fallback of the 8x path when ap!=bp).  Same register
// arguments as bn_mul_mont.
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// num limbs of t[] scratch
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr			// t[0..3] accumulators
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0			// b-index, cycles 8,16,24,0 (and #31)
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*) mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*) adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num==4: result in registers

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	// Advance to the next b[] window and restart from a[0]/n[0].
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// x30 accumulates the top-most carry
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*) mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*) adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Branch-free select between t[] and rp[] on the borrow flag,
	// wiping the stack scratch as we go.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	// num==4 special case: whole result is still in registers.
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release stack scratch
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// ASCII banner: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4