1#ifndef __KERNEL__ 2# include "arm_arch.h" 3 4.hidden OPENSSL_armv8_rsa_neonized 5#endif 6.text 7 8.globl bn_mul_mont 9.type bn_mul_mont,%function 10.align 5 11bn_mul_mont: 12.Lbn_mul_mont: 13 tst x5,#3 14 b.ne .Lmul_mont 15 cmp x5,#32 16 b.le .Lscalar_impl 17#ifndef __KERNEL__ 18 adrp x17,OPENSSL_armv8_rsa_neonized 19 ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] 20 cbnz w17, bn_mul8x_mont_neon 21#endif 22 23.Lscalar_impl: 24 tst x5,#7 25 b.eq __bn_sqr8x_mont 26 tst x5,#3 27 b.eq __bn_mul4x_mont 28 29.Lmul_mont: 30 stp x29,x30,[sp,#-64]! 31 add x29,sp,#0 32 stp x19,x20,[sp,#16] 33 stp x21,x22,[sp,#32] 34 stp x23,x24,[sp,#48] 35 36 ldr x9,[x2],#8 // bp[0] 37 sub x22,sp,x5,lsl#3 38 ldp x7,x8,[x1],#16 // ap[0..1] 39 lsl x5,x5,#3 40 ldr x4,[x4] // *n0 41 and x22,x22,#-16 // ABI says so 42 ldp x13,x14,[x3],#16 // np[0..1] 43 44 mul x6,x7,x9 // ap[0]*bp[0] 45 sub x21,x5,#16 // j=num-2 46 umulh x7,x7,x9 47 mul x10,x8,x9 // ap[1]*bp[0] 48 umulh x11,x8,x9 49 50 mul x15,x6,x4 // "tp[0]"*n0 51 mov sp,x22 // alloca 52 53 // (*) mul x12,x13,x15 // np[0]*m1 54 umulh x13,x13,x15 55 mul x16,x14,x15 // np[1]*m1 56 // (*) adds x12,x12,x6 // discarded 57 // (*) As for removal of first multiplication and addition 58 // instructions. The outcome of first addition is 59 // guaranteed to be zero, which leaves two computationally 60 // significant outcomes: it either carries or not. Then 61 // question is when does it carry? Is there alternative 62 // way to deduce it? If you follow operations, you can 63 // observe that condition for carry is quite simple: 64 // x6 being non-zero. So that carry can be calculated 65 // by adding -1 to x6. That's what next instruction does. 66 subs xzr,x6,#1 // (*) 67 umulh x17,x14,x15 68 adc x13,x13,xzr 69 cbz x21,.L1st_skip 70 71.L1st: 72 ldr x8,[x1],#8 73 adds x6,x10,x7 74 sub x21,x21,#8 // j-- 75 adc x7,x11,xzr 76 77 ldr x14,[x3],#8 78 adds x12,x16,x13 79 mul x10,x8,x9 // ap[j]*bp[0] 80 adc x13,x17,xzr 81 umulh x11,x8,x9 82 83 adds x12,x12,x6 84 mul x16,x14,x15 // np[j]*m1 85 adc x13,x13,xzr 86 umulh x17,x14,x15 87 str x12,[x22],#8 // tp[j-1] 88 cbnz x21,.L1st 89 90.L1st_skip: 91 adds x6,x10,x7 92 sub x1,x1,x5 // rewind x1 93 adc x7,x11,xzr 94 95 adds x12,x16,x13 96 sub x3,x3,x5 // rewind x3 97 adc x13,x17,xzr 98 99 adds x12,x12,x6 100 sub x20,x5,#8 // i=num-1 101 adcs x13,x13,x7 102 103 adc x19,xzr,xzr // upmost overflow bit 104 stp x12,x13,[x22] 105 106.Louter: 107 ldr x9,[x2],#8 // bp[i] 108 ldp x7,x8,[x1],#16 109 ldr x23,[sp] // tp[0] 110 add x22,sp,#8 111 112 mul x6,x7,x9 // ap[0]*bp[i] 113 sub x21,x5,#16 // j=num-2 114 umulh x7,x7,x9 115 ldp x13,x14,[x3],#16 116 mul x10,x8,x9 // ap[1]*bp[i] 117 adds x6,x6,x23 118 umulh x11,x8,x9 119 adc x7,x7,xzr 120 121 mul x15,x6,x4 122 sub x20,x20,#8 // i-- 123 124 // (*) mul x12,x13,x15 // np[0]*m1 125 umulh x13,x13,x15 126 mul x16,x14,x15 // np[1]*m1 127 // (*) adds x12,x12,x6 128 subs xzr,x6,#1 // (*) 129 umulh x17,x14,x15 130 cbz x21,.Linner_skip 131 132.Linner: 133 ldr x8,[x1],#8 134 adc x13,x13,xzr 135 ldr x23,[x22],#8 // tp[j] 136 adds x6,x10,x7 137 sub x21,x21,#8 // j-- 138 adc x7,x11,xzr 139 140 adds x12,x16,x13 141 ldr x14,[x3],#8 142 adc x13,x17,xzr 143 144 mul x10,x8,x9 // ap[j]*bp[i] 145 adds x6,x6,x23 146 umulh x11,x8,x9 147 adc x7,x7,xzr 148 149 mul x16,x14,x15 // np[j]*m1 150 adds x12,x12,x6 151 umulh x17,x14,x15 152 stur x12,[x22,#-16] // tp[j-1] 153 cbnz x21,.Linner 154 155.Linner_skip: 156 ldr x23,[x22],#8 // tp[j] 157 adc x13,x13,xzr 158 adds x6,x10,x7 159 sub x1,x1,x5 // rewind x1 160 adc x7,x11,xzr 161 162 adds x12,x16,x13 163 sub x3,x3,x5 // rewind x3 164 adcs x13,x17,x19 165 adc x19,xzr,xzr 166 167 adds x6,x6,x23 168 adc x7,x7,xzr 169 170 adds x12,x12,x6 171 adcs x13,x13,x7 172 adc x19,x19,xzr // upmost overflow bit 173 stp x12,x13,[x22,#-16] 174 175 cbnz x20,.Louter 176 177 // Final step. We see if result is larger than modulus, and 178 // if it is, subtract the modulus. But comparison implies 179 // subtraction. So we subtract modulus, see if it borrowed, 180 // and conditionally copy original value. 181 ldr x23,[sp] // tp[0] 182 add x22,sp,#8 183 ldr x14,[x3],#8 // np[0] 184 subs x21,x5,#8 // j=num-1 and clear borrow 185 mov x1,x0 186.Lsub: 187 sbcs x8,x23,x14 // tp[j]-np[j] 188 ldr x23,[x22],#8 189 sub x21,x21,#8 // j-- 190 ldr x14,[x3],#8 191 str x8,[x1],#8 // rp[j]=tp[j]-np[j] 192 cbnz x21,.Lsub 193 194 sbcs x8,x23,x14 195 sbcs x19,x19,xzr // did it borrow? 196 str x8,[x1],#8 // rp[num-1] 197 198 ldr x23,[sp] // tp[0] 199 add x22,sp,#8 200 ldr x8,[x0],#8 // rp[0] 201 sub x5,x5,#8 // num-- 202 nop 203.Lcond_copy: 204 sub x5,x5,#8 // num-- 205 csel x14,x23,x8,lo // did it borrow? 206 ldr x23,[x22],#8 207 ldr x8,[x0],#8 208 stur xzr,[x22,#-16] // wipe tp 209 stur x14,[x0,#-16] 210 cbnz x5,.Lcond_copy 211 212 csel x14,x23,x8,lo 213 stur xzr,[x22,#-8] // wipe tp 214 stur x14,[x0,#-8] 215 216 ldp x19,x20,[x29,#16] 217 mov sp,x29 218 ldp x21,x22,[x29,#32] 219 mov x0,#1 220 ldp x23,x24,[x29,#48] 221 ldr x29,[sp],#64 222 ret 223.size bn_mul_mont,.-bn_mul_mont 224.type bn_mul8x_mont_neon,%function 225.align 5 226bn_mul8x_mont_neon: 227 stp x29,x30,[sp,#-80]! 228 mov x16,sp 229 stp d8,d9,[sp,#16] 230 stp d10,d11,[sp,#32] 231 stp d12,d13,[sp,#48] 232 stp d14,d15,[sp,#64] 233 lsl x5,x5,#1 234 eor v14.16b,v14.16b,v14.16b 235 236.align 4 237.LNEON_8n: 238 eor v6.16b,v6.16b,v6.16b 239 sub x7,sp,#128 240 eor v7.16b,v7.16b,v7.16b 241 sub x7,x7,x5,lsl#4 242 eor v8.16b,v8.16b,v8.16b 243 and x7,x7,#-64 244 eor v9.16b,v9.16b,v9.16b 245 mov sp,x7 // alloca 246 eor v10.16b,v10.16b,v10.16b 247 add x7,x7,#256 248 eor v11.16b,v11.16b,v11.16b 249 sub x8,x5,#8 250 eor v12.16b,v12.16b,v12.16b 251 eor v13.16b,v13.16b,v13.16b 252 253.LNEON_8n_init: 254 st1 {v6.2d,v7.2d},[x7],#32 255 subs x8,x8,#8 256 st1 {v8.2d,v9.2d},[x7],#32 257 st1 {v10.2d,v11.2d},[x7],#32 258 st1 {v12.2d,v13.2d},[x7],#32 259 bne .LNEON_8n_init 260 261 add x6,sp,#256 262 ld1 {v0.4s,v1.4s},[x1],#32 263 add x10,sp,#8 264 ldr s30,[x4],#4 265 mov x9,x5 266 b .LNEON_8n_outer 267 268.align 4 269.LNEON_8n_outer: 270 ldr s28,[x2],#4 // *b++ 271 uxtl v28.4s,v28.4h 272 add x7,sp,#128 273 ld1 {v2.4s,v3.4s},[x3],#32 274 275 umlal v6.2d,v28.2s,v0.s[0] 276 umlal v7.2d,v28.2s,v0.s[1] 277 umlal v8.2d,v28.2s,v0.s[2] 278 shl v29.2d,v6.2d,#16 279 ext v29.16b,v29.16b,v29.16b,#8 280 umlal v9.2d,v28.2s,v0.s[3] 281 add v29.2d,v29.2d,v6.2d 282 umlal v10.2d,v28.2s,v1.s[0] 283 mul v29.2s,v29.2s,v30.2s 284 umlal v11.2d,v28.2s,v1.s[1] 285 st1 {v28.2s},[sp] // put aside smashed b[8*i+0] 286 umlal v12.2d,v28.2s,v1.s[2] 287 uxtl v29.4s,v29.4h 288 umlal v13.2d,v28.2s,v1.s[3] 289 ldr s28,[x2],#4 // *b++ 290 umlal v6.2d,v29.2s,v2.s[0] 291 umlal v7.2d,v29.2s,v2.s[1] 292 uxtl v28.4s,v28.4h 293 umlal v8.2d,v29.2s,v2.s[2] 294 ushr v15.2d,v6.2d,#16 295 umlal v9.2d,v29.2s,v2.s[3] 296 umlal v10.2d,v29.2s,v3.s[0] 297 ext v6.16b,v6.16b,v6.16b,#8 298 add v6.2d,v6.2d,v15.2d 299 umlal v11.2d,v29.2s,v3.s[1] 300 ushr v6.2d,v6.2d,#16 301 umlal v12.2d,v29.2s,v3.s[2] 302 umlal v13.2d,v29.2s,v3.s[3] 303 add v16.2d,v7.2d,v6.2d 304 ins v7.d[0],v16.d[0] 305 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0] 306 umlal v7.2d,v28.2s,v0.s[0] 307 ld1 {v6.2d},[x6],#16 308 umlal v8.2d,v28.2s,v0.s[1] 309 umlal v9.2d,v28.2s,v0.s[2] 310 shl v29.2d,v7.2d,#16 311 ext v29.16b,v29.16b,v29.16b,#8 312 umlal v10.2d,v28.2s,v0.s[3] 313 add v29.2d,v29.2d,v7.2d 314 umlal v11.2d,v28.2s,v1.s[0] 315 mul v29.2s,v29.2s,v30.2s 316 umlal v12.2d,v28.2s,v1.s[1] 317 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1] 318 umlal v13.2d,v28.2s,v1.s[2] 319 uxtl v29.4s,v29.4h 320 umlal v6.2d,v28.2s,v1.s[3] 321 ldr s28,[x2],#4 // *b++ 322 umlal v7.2d,v29.2s,v2.s[0] 323 umlal v8.2d,v29.2s,v2.s[1] 324 uxtl v28.4s,v28.4h 325 umlal v9.2d,v29.2s,v2.s[2] 326 ushr v15.2d,v7.2d,#16 327 umlal v10.2d,v29.2s,v2.s[3] 328 umlal v11.2d,v29.2s,v3.s[0] 329 ext v7.16b,v7.16b,v7.16b,#8 330 add v7.2d,v7.2d,v15.2d 331 umlal v12.2d,v29.2s,v3.s[1] 332 ushr v7.2d,v7.2d,#16 333 umlal v13.2d,v29.2s,v3.s[2] 334 umlal v6.2d,v29.2s,v3.s[3] 335 add v16.2d,v8.2d,v7.2d 336 ins v8.d[0],v16.d[0] 337 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1] 338 umlal v8.2d,v28.2s,v0.s[0] 339 ld1 {v7.2d},[x6],#16 340 umlal v9.2d,v28.2s,v0.s[1] 341 umlal v10.2d,v28.2s,v0.s[2] 342 shl v29.2d,v8.2d,#16 343 ext v29.16b,v29.16b,v29.16b,#8 344 umlal v11.2d,v28.2s,v0.s[3] 345 add v29.2d,v29.2d,v8.2d 346 umlal v12.2d,v28.2s,v1.s[0] 347 mul v29.2s,v29.2s,v30.2s 348 umlal v13.2d,v28.2s,v1.s[1] 349 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2] 350 umlal v6.2d,v28.2s,v1.s[2] 351 uxtl v29.4s,v29.4h 352 umlal v7.2d,v28.2s,v1.s[3] 353 ldr s28,[x2],#4 // *b++ 354 umlal v8.2d,v29.2s,v2.s[0] 355 umlal v9.2d,v29.2s,v2.s[1] 356 uxtl v28.4s,v28.4h 357 umlal v10.2d,v29.2s,v2.s[2] 358 ushr v15.2d,v8.2d,#16 359 umlal v11.2d,v29.2s,v2.s[3] 360 umlal v12.2d,v29.2s,v3.s[0] 361 ext v8.16b,v8.16b,v8.16b,#8 362 add v8.2d,v8.2d,v15.2d 363 umlal v13.2d,v29.2s,v3.s[1] 364 ushr v8.2d,v8.2d,#16 365 umlal v6.2d,v29.2s,v3.s[2] 366 umlal v7.2d,v29.2s,v3.s[3] 367 add v16.2d,v9.2d,v8.2d 368 ins v9.d[0],v16.d[0] 369 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2] 370 umlal v9.2d,v28.2s,v0.s[0] 371 ld1 {v8.2d},[x6],#16 372 umlal v10.2d,v28.2s,v0.s[1] 373 umlal v11.2d,v28.2s,v0.s[2] 374 shl v29.2d,v9.2d,#16 375 ext v29.16b,v29.16b,v29.16b,#8 376 umlal v12.2d,v28.2s,v0.s[3] 377 add v29.2d,v29.2d,v9.2d 378 umlal v13.2d,v28.2s,v1.s[0] 379 mul v29.2s,v29.2s,v30.2s 380 umlal v6.2d,v28.2s,v1.s[1] 381 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3] 382 umlal v7.2d,v28.2s,v1.s[2] 383 uxtl v29.4s,v29.4h 384 umlal v8.2d,v28.2s,v1.s[3] 385 ldr s28,[x2],#4 // *b++ 386 umlal v9.2d,v29.2s,v2.s[0] 387 umlal v10.2d,v29.2s,v2.s[1] 388 uxtl v28.4s,v28.4h 389 umlal v11.2d,v29.2s,v2.s[2] 390 ushr v15.2d,v9.2d,#16 391 umlal v12.2d,v29.2s,v2.s[3] 392 umlal v13.2d,v29.2s,v3.s[0] 393 ext v9.16b,v9.16b,v9.16b,#8 394 add v9.2d,v9.2d,v15.2d 395 umlal v6.2d,v29.2s,v3.s[1] 396 ushr v9.2d,v9.2d,#16 397 umlal v7.2d,v29.2s,v3.s[2] 398 umlal v8.2d,v29.2s,v3.s[3] 399 add v16.2d,v10.2d,v9.2d 400 ins v10.d[0],v16.d[0] 401 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3] 402 umlal v10.2d,v28.2s,v0.s[0] 403 ld1 {v9.2d},[x6],#16 404 umlal v11.2d,v28.2s,v0.s[1] 405 umlal v12.2d,v28.2s,v0.s[2] 406 shl v29.2d,v10.2d,#16 407 ext v29.16b,v29.16b,v29.16b,#8 408 umlal v13.2d,v28.2s,v0.s[3] 409 add v29.2d,v29.2d,v10.2d 410 umlal v6.2d,v28.2s,v1.s[0] 411 mul v29.2s,v29.2s,v30.2s 412 umlal v7.2d,v28.2s,v1.s[1] 413 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4] 414 umlal v8.2d,v28.2s,v1.s[2] 415 uxtl v29.4s,v29.4h 416 umlal v9.2d,v28.2s,v1.s[3] 417 ldr s28,[x2],#4 // *b++ 418 umlal v10.2d,v29.2s,v2.s[0] 419 umlal v11.2d,v29.2s,v2.s[1] 420 uxtl v28.4s,v28.4h 421 umlal v12.2d,v29.2s,v2.s[2] 422 ushr v15.2d,v10.2d,#16 423 umlal v13.2d,v29.2s,v2.s[3] 424 umlal v6.2d,v29.2s,v3.s[0] 425 ext v10.16b,v10.16b,v10.16b,#8 426 add v10.2d,v10.2d,v15.2d 427 umlal v7.2d,v29.2s,v3.s[1] 428 ushr v10.2d,v10.2d,#16 429 umlal v8.2d,v29.2s,v3.s[2] 430 umlal v9.2d,v29.2s,v3.s[3] 431 add v16.2d,v11.2d,v10.2d 432 ins v11.d[0],v16.d[0] 433 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4] 434 umlal v11.2d,v28.2s,v0.s[0] 435 ld1 {v10.2d},[x6],#16 436 umlal v12.2d,v28.2s,v0.s[1] 437 umlal v13.2d,v28.2s,v0.s[2] 438 shl v29.2d,v11.2d,#16 439 ext v29.16b,v29.16b,v29.16b,#8 440 umlal v6.2d,v28.2s,v0.s[3] 441 add v29.2d,v29.2d,v11.2d 442 umlal v7.2d,v28.2s,v1.s[0] 443 mul v29.2s,v29.2s,v30.2s 444 umlal v8.2d,v28.2s,v1.s[1] 445 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5] 446 umlal v9.2d,v28.2s,v1.s[2] 447 uxtl v29.4s,v29.4h 448 umlal v10.2d,v28.2s,v1.s[3] 449 ldr s28,[x2],#4 // *b++ 450 umlal v11.2d,v29.2s,v2.s[0] 451 umlal v12.2d,v29.2s,v2.s[1] 452 uxtl v28.4s,v28.4h 453 umlal v13.2d,v29.2s,v2.s[2] 454 ushr v15.2d,v11.2d,#16 455 umlal v6.2d,v29.2s,v2.s[3] 456 umlal v7.2d,v29.2s,v3.s[0] 457 ext v11.16b,v11.16b,v11.16b,#8 458 add v11.2d,v11.2d,v15.2d 459 umlal v8.2d,v29.2s,v3.s[1] 460 ushr v11.2d,v11.2d,#16 461 umlal v9.2d,v29.2s,v3.s[2] 462 umlal v10.2d,v29.2s,v3.s[3] 463 add v16.2d,v12.2d,v11.2d 464 ins v12.d[0],v16.d[0] 465 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5] 466 umlal v12.2d,v28.2s,v0.s[0] 467 ld1 {v11.2d},[x6],#16 468 umlal v13.2d,v28.2s,v0.s[1] 469 umlal v6.2d,v28.2s,v0.s[2] 470 shl v29.2d,v12.2d,#16 471 ext v29.16b,v29.16b,v29.16b,#8 472 umlal v7.2d,v28.2s,v0.s[3] 473 add v29.2d,v29.2d,v12.2d 474 umlal v8.2d,v28.2s,v1.s[0] 475 mul v29.2s,v29.2s,v30.2s 476 umlal v9.2d,v28.2s,v1.s[1] 477 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6] 478 umlal v10.2d,v28.2s,v1.s[2] 479 uxtl v29.4s,v29.4h 480 umlal v11.2d,v28.2s,v1.s[3] 481 ldr s28,[x2],#4 // *b++ 482 umlal v12.2d,v29.2s,v2.s[0] 483 umlal v13.2d,v29.2s,v2.s[1] 484 uxtl v28.4s,v28.4h 485 umlal v6.2d,v29.2s,v2.s[2] 486 ushr v15.2d,v12.2d,#16 487 umlal v7.2d,v29.2s,v2.s[3] 488 umlal v8.2d,v29.2s,v3.s[0] 489 ext v12.16b,v12.16b,v12.16b,#8 490 add v12.2d,v12.2d,v15.2d 491 umlal v9.2d,v29.2s,v3.s[1] 492 ushr v12.2d,v12.2d,#16 493 umlal v10.2d,v29.2s,v3.s[2] 494 umlal v11.2d,v29.2s,v3.s[3] 495 add v16.2d,v13.2d,v12.2d 496 ins v13.d[0],v16.d[0] 497 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6] 498 umlal v13.2d,v28.2s,v0.s[0] 499 ld1 {v12.2d},[x6],#16 500 umlal v6.2d,v28.2s,v0.s[1] 501 umlal v7.2d,v28.2s,v0.s[2] 502 shl v29.2d,v13.2d,#16 503 ext v29.16b,v29.16b,v29.16b,#8 504 umlal v8.2d,v28.2s,v0.s[3] 505 add v29.2d,v29.2d,v13.2d 506 umlal v9.2d,v28.2s,v1.s[0] 507 mul v29.2s,v29.2s,v30.2s 508 umlal v10.2d,v28.2s,v1.s[1] 509 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7] 510 umlal v11.2d,v28.2s,v1.s[2] 511 uxtl v29.4s,v29.4h 512 umlal v12.2d,v28.2s,v1.s[3] 513 ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 514 umlal v13.2d,v29.2s,v2.s[0] 515 ld1 {v0.4s,v1.4s},[x1],#32 516 umlal v6.2d,v29.2s,v2.s[1] 517 umlal v7.2d,v29.2s,v2.s[2] 518 mov v5.16b,v13.16b 519 ushr v5.2d,v5.2d,#16 520 ext v13.16b,v13.16b,v13.16b,#8 521 umlal v8.2d,v29.2s,v2.s[3] 522 umlal v9.2d,v29.2s,v3.s[0] 523 add v13.2d,v13.2d,v5.2d 524 umlal v10.2d,v29.2s,v3.s[1] 525 ushr v13.2d,v13.2d,#16 526 eor v15.16b,v15.16b,v15.16b 527 ins v13.d[1],v15.d[0] 528 umlal v11.2d,v29.2s,v3.s[2] 529 umlal v12.2d,v29.2s,v3.s[3] 530 add v6.2d,v6.2d,v13.2d 531 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7] 532 add x10,sp,#8 // rewind 533 sub x8,x5,#8 534 b .LNEON_8n_inner 535 536.align 4 537.LNEON_8n_inner: 538 subs x8,x8,#8 539 umlal v6.2d,v28.2s,v0.s[0] 540 ld1 {v13.2d},[x6] 541 umlal v7.2d,v28.2s,v0.s[1] 542 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0] 543 umlal v8.2d,v28.2s,v0.s[2] 544 ld1 {v2.4s,v3.4s},[x3],#32 545 umlal v9.2d,v28.2s,v0.s[3] 546 b.eq .LInner_jump 547 add x6,x6,#16 // don't advance in last iteration 548.LInner_jump: 549 umlal v10.2d,v28.2s,v1.s[0] 550 umlal v11.2d,v28.2s,v1.s[1] 551 umlal v12.2d,v28.2s,v1.s[2] 552 umlal v13.2d,v28.2s,v1.s[3] 553 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1] 554 umlal v6.2d,v29.2s,v2.s[0] 555 umlal v7.2d,v29.2s,v2.s[1] 556 umlal v8.2d,v29.2s,v2.s[2] 557 umlal v9.2d,v29.2s,v2.s[3] 558 umlal v10.2d,v29.2s,v3.s[0] 559 umlal v11.2d,v29.2s,v3.s[1] 560 umlal v12.2d,v29.2s,v3.s[2] 561 umlal v13.2d,v29.2s,v3.s[3] 562 st1 {v6.2d},[x7],#16 563 umlal v7.2d,v28.2s,v0.s[0] 564 ld1 {v6.2d},[x6] 565 umlal v8.2d,v28.2s,v0.s[1] 566 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1] 567 umlal v9.2d,v28.2s,v0.s[2] 568 b.eq .LInner_jump1 569 add x6,x6,#16 // don't advance in last iteration 570.LInner_jump1: 571 umlal v10.2d,v28.2s,v0.s[3] 572 umlal v11.2d,v28.2s,v1.s[0] 573 umlal v12.2d,v28.2s,v1.s[1] 574 umlal v13.2d,v28.2s,v1.s[2] 575 umlal v6.2d,v28.2s,v1.s[3] 576 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2] 577 umlal v7.2d,v29.2s,v2.s[0] 578 umlal v8.2d,v29.2s,v2.s[1] 579 umlal v9.2d,v29.2s,v2.s[2] 580 umlal v10.2d,v29.2s,v2.s[3] 581 umlal v11.2d,v29.2s,v3.s[0] 582 umlal v12.2d,v29.2s,v3.s[1] 583 umlal v13.2d,v29.2s,v3.s[2] 584 umlal v6.2d,v29.2s,v3.s[3] 585 st1 {v7.2d},[x7],#16 586 umlal v8.2d,v28.2s,v0.s[0] 587 ld1 {v7.2d},[x6] 588 umlal v9.2d,v28.2s,v0.s[1] 589 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2] 590 umlal v10.2d,v28.2s,v0.s[2] 591 b.eq .LInner_jump2 592 add x6,x6,#16 // don't advance in last iteration 593.LInner_jump2: 594 umlal v11.2d,v28.2s,v0.s[3] 595 umlal v12.2d,v28.2s,v1.s[0] 596 umlal v13.2d,v28.2s,v1.s[1] 597 umlal v6.2d,v28.2s,v1.s[2] 598 umlal v7.2d,v28.2s,v1.s[3] 599 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3] 600 umlal v8.2d,v29.2s,v2.s[0] 601 umlal v9.2d,v29.2s,v2.s[1] 602 umlal v10.2d,v29.2s,v2.s[2] 603 umlal v11.2d,v29.2s,v2.s[3] 604 umlal v12.2d,v29.2s,v3.s[0] 605 umlal v13.2d,v29.2s,v3.s[1] 606 umlal v6.2d,v29.2s,v3.s[2] 607 umlal v7.2d,v29.2s,v3.s[3] 608 st1 {v8.2d},[x7],#16 609 umlal v9.2d,v28.2s,v0.s[0] 610 ld1 {v8.2d},[x6] 611 umlal v10.2d,v28.2s,v0.s[1] 612 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3] 613 umlal v11.2d,v28.2s,v0.s[2] 614 b.eq .LInner_jump3 615 add x6,x6,#16 // don't advance in last iteration 616.LInner_jump3: 617 umlal v12.2d,v28.2s,v0.s[3] 618 umlal v13.2d,v28.2s,v1.s[0] 619 umlal v6.2d,v28.2s,v1.s[1] 620 umlal v7.2d,v28.2s,v1.s[2] 621 umlal v8.2d,v28.2s,v1.s[3] 622 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4] 623 umlal v9.2d,v29.2s,v2.s[0] 624 umlal v10.2d,v29.2s,v2.s[1] 625 umlal v11.2d,v29.2s,v2.s[2] 626 umlal v12.2d,v29.2s,v2.s[3] 627 umlal v13.2d,v29.2s,v3.s[0] 628 umlal v6.2d,v29.2s,v3.s[1] 629 umlal v7.2d,v29.2s,v3.s[2] 630 umlal v8.2d,v29.2s,v3.s[3] 631 st1 {v9.2d},[x7],#16 632 umlal v10.2d,v28.2s,v0.s[0] 633 ld1 {v9.2d},[x6] 634 umlal v11.2d,v28.2s,v0.s[1] 635 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4] 636 umlal v12.2d,v28.2s,v0.s[2] 637 b.eq .LInner_jump4 638 add x6,x6,#16 // don't advance in last iteration 639.LInner_jump4: 640 umlal v13.2d,v28.2s,v0.s[3] 641 umlal v6.2d,v28.2s,v1.s[0] 642 umlal v7.2d,v28.2s,v1.s[1] 643 umlal v8.2d,v28.2s,v1.s[2] 644 umlal v9.2d,v28.2s,v1.s[3] 645 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5] 646 umlal v10.2d,v29.2s,v2.s[0] 647 umlal v11.2d,v29.2s,v2.s[1] 648 umlal v12.2d,v29.2s,v2.s[2] 649 umlal v13.2d,v29.2s,v2.s[3] 650 umlal v6.2d,v29.2s,v3.s[0] 651 umlal v7.2d,v29.2s,v3.s[1] 652 umlal v8.2d,v29.2s,v3.s[2] 653 umlal v9.2d,v29.2s,v3.s[3] 654 st1 {v10.2d},[x7],#16 655 umlal v11.2d,v28.2s,v0.s[0] 656 ld1 {v10.2d},[x6] 657 umlal v12.2d,v28.2s,v0.s[1] 658 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5] 659 umlal v13.2d,v28.2s,v0.s[2] 660 b.eq .LInner_jump5 661 add x6,x6,#16 // don't advance in last iteration 662.LInner_jump5: 663 umlal v6.2d,v28.2s,v0.s[3] 664 umlal v7.2d,v28.2s,v1.s[0] 665 umlal v8.2d,v28.2s,v1.s[1] 666 umlal v9.2d,v28.2s,v1.s[2] 667 umlal v10.2d,v28.2s,v1.s[3] 668 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6] 669 umlal v11.2d,v29.2s,v2.s[0] 670 umlal v12.2d,v29.2s,v2.s[1] 671 umlal v13.2d,v29.2s,v2.s[2] 672 umlal v6.2d,v29.2s,v2.s[3] 673 umlal v7.2d,v29.2s,v3.s[0] 674 umlal v8.2d,v29.2s,v3.s[1] 675 umlal v9.2d,v29.2s,v3.s[2] 676 umlal v10.2d,v29.2s,v3.s[3] 677 st1 {v11.2d},[x7],#16 678 umlal v12.2d,v28.2s,v0.s[0] 679 ld1 {v11.2d},[x6] 680 umlal v13.2d,v28.2s,v0.s[1] 681 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6] 682 umlal v6.2d,v28.2s,v0.s[2] 683 b.eq .LInner_jump6 684 add x6,x6,#16 // don't advance in last iteration 685.LInner_jump6: 686 umlal v7.2d,v28.2s,v0.s[3] 687 umlal v8.2d,v28.2s,v1.s[0] 688 umlal v9.2d,v28.2s,v1.s[1] 689 umlal v10.2d,v28.2s,v1.s[2] 690 umlal v11.2d,v28.2s,v1.s[3] 691 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7] 692 umlal v12.2d,v29.2s,v2.s[0] 693 umlal v13.2d,v29.2s,v2.s[1] 694 umlal v6.2d,v29.2s,v2.s[2] 695 umlal v7.2d,v29.2s,v2.s[3] 696 umlal v8.2d,v29.2s,v3.s[0] 697 umlal v9.2d,v29.2s,v3.s[1] 698 umlal v10.2d,v29.2s,v3.s[2] 699 umlal v11.2d,v29.2s,v3.s[3] 700 st1 {v12.2d},[x7],#16 701 umlal v13.2d,v28.2s,v0.s[0] 702 ld1 {v12.2d},[x6] 703 umlal v6.2d,v28.2s,v0.s[1] 704 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7] 705 umlal v7.2d,v28.2s,v0.s[2] 706 b.eq .LInner_jump7 707 add x6,x6,#16 // don't advance in last iteration 708.LInner_jump7: 709 umlal v8.2d,v28.2s,v0.s[3] 710 umlal v9.2d,v28.2s,v1.s[0] 711 umlal v10.2d,v28.2s,v1.s[1] 712 umlal v11.2d,v28.2s,v1.s[2] 713 umlal v12.2d,v28.2s,v1.s[3] 714 b.ne .LInner_after_rewind8 715 sub x1,x1,x5,lsl#2 // rewind 716.LInner_after_rewind8: 717 umlal v13.2d,v29.2s,v2.s[0] 718 ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 719 umlal v6.2d,v29.2s,v2.s[1] 720 ld1 {v0.4s,v1.4s},[x1],#32 721 umlal v7.2d,v29.2s,v2.s[2] 722 add x10,sp,#8 // rewind 723 umlal v8.2d,v29.2s,v2.s[3] 724 umlal v9.2d,v29.2s,v3.s[0] 725 umlal v10.2d,v29.2s,v3.s[1] 726 umlal v11.2d,v29.2s,v3.s[2] 727 st1 {v13.2d},[x7],#16 728 umlal v12.2d,v29.2s,v3.s[3] 729 730 bne .LNEON_8n_inner 731 add x6,sp,#128 732 st1 {v6.2d,v7.2d},[x7],#32 733 eor v2.16b,v2.16b,v2.16b // v2 734 st1 {v8.2d,v9.2d},[x7],#32 735 eor v3.16b,v3.16b,v3.16b // v3 736 st1 {v10.2d,v11.2d},[x7],#32 737 st1 {v12.2d},[x7] 738 739 subs x9,x9,#8 740 ld1 {v6.2d,v7.2d},[x6],#32 741 ld1 {v8.2d,v9.2d},[x6],#32 742 ld1 {v10.2d,v11.2d},[x6],#32 743 ld1 {v12.2d,v13.2d},[x6],#32 744 745 b.eq .LInner_8n_jump_2steps 746 sub x3,x3,x5,lsl#2 // rewind 747 b .LNEON_8n_outer 748 749.LInner_8n_jump_2steps: 750 add x7,sp,#128 751 st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame 752 mov v5.16b,v6.16b 753 ushr v15.2d,v6.2d,#16 754 ext v6.16b,v6.16b,v6.16b,#8 755 st1 {v2.2d,v3.2d}, [sp],#32 756 add v6.2d,v6.2d,v15.2d 757 st1 {v2.2d,v3.2d}, [sp],#32 758 ushr v15.2d,v6.2d,#16 759 st1 {v2.2d,v3.2d}, [sp],#32 760 zip1 v6.4h,v5.4h,v6.4h 761 ins v15.d[1],v14.d[0] 762 763 mov x8,x5 764 b .LNEON_tail_entry 765 766.align 4 767.LNEON_tail: 768 add v6.2d,v6.2d,v15.2d 769 mov v5.16b,v6.16b 770 ushr v15.2d,v6.2d,#16 771 ext v6.16b,v6.16b,v6.16b,#8 772 ld1 {v8.2d,v9.2d}, [x6],#32 773 add v6.2d,v6.2d,v15.2d 774 ld1 {v10.2d,v11.2d}, [x6],#32 775 ushr v15.2d,v6.2d,#16 776 ld1 {v12.2d,v13.2d}, [x6],#32 777 zip1 v6.4h,v5.4h,v6.4h 778 ins v15.d[1],v14.d[0] 779 780.LNEON_tail_entry: 781 add v7.2d,v7.2d,v15.2d 782 st1 {v6.s}[0], [x7],#4 783 ushr v15.2d,v7.2d,#16 784 mov v5.16b,v7.16b 785 ext v7.16b,v7.16b,v7.16b,#8 786 add v7.2d,v7.2d,v15.2d 787 ushr v15.2d,v7.2d,#16 788 zip1 v7.4h,v5.4h,v7.4h 789 ins v15.d[1],v14.d[0] 790 add v8.2d,v8.2d,v15.2d 791 st1 {v7.s}[0], [x7],#4 792 ushr v15.2d,v8.2d,#16 793 mov v5.16b,v8.16b 794 ext v8.16b,v8.16b,v8.16b,#8 795 add v8.2d,v8.2d,v15.2d 796 ushr v15.2d,v8.2d,#16 797 zip1 v8.4h,v5.4h,v8.4h 798 ins v15.d[1],v14.d[0] 799 add v9.2d,v9.2d,v15.2d 800 st1 {v8.s}[0], [x7],#4 801 ushr v15.2d,v9.2d,#16 802 mov v5.16b,v9.16b 803 ext v9.16b,v9.16b,v9.16b,#8 804 add v9.2d,v9.2d,v15.2d 805 ushr v15.2d,v9.2d,#16 806 zip1 v9.4h,v5.4h,v9.4h 807 ins v15.d[1],v14.d[0] 808 add v10.2d,v10.2d,v15.2d 809 st1 {v9.s}[0], [x7],#4 810 ushr v15.2d,v10.2d,#16 811 mov v5.16b,v10.16b 812 ext v10.16b,v10.16b,v10.16b,#8 813 add v10.2d,v10.2d,v15.2d 814 ushr v15.2d,v10.2d,#16 815 zip1 v10.4h,v5.4h,v10.4h 816 ins v15.d[1],v14.d[0] 817 add v11.2d,v11.2d,v15.2d 818 st1 {v10.s}[0], [x7],#4 819 ushr v15.2d,v11.2d,#16 820 mov v5.16b,v11.16b 821 ext v11.16b,v11.16b,v11.16b,#8 822 add v11.2d,v11.2d,v15.2d 823 ushr v15.2d,v11.2d,#16 824 zip1 v11.4h,v5.4h,v11.4h 825 ins v15.d[1],v14.d[0] 826 add v12.2d,v12.2d,v15.2d 827 st1 {v11.s}[0], [x7],#4 828 ushr v15.2d,v12.2d,#16 829 mov v5.16b,v12.16b 830 ext v12.16b,v12.16b,v12.16b,#8 831 add v12.2d,v12.2d,v15.2d 832 ushr v15.2d,v12.2d,#16 833 zip1 v12.4h,v5.4h,v12.4h 834 ins v15.d[1],v14.d[0] 835 add v13.2d,v13.2d,v15.2d 836 st1 {v12.s}[0], [x7],#4 837 ushr v15.2d,v13.2d,#16 838 mov v5.16b,v13.16b 839 ext v13.16b,v13.16b,v13.16b,#8 840 add v13.2d,v13.2d,v15.2d 841 ushr v15.2d,v13.2d,#16 842 zip1 v13.4h,v5.4h,v13.4h 843 ins v15.d[1],v14.d[0] 844 ld1 {v6.2d,v7.2d}, [x6],#32 845 subs x8,x8,#8 846 st1 {v13.s}[0], [x7],#4 847 bne .LNEON_tail 848 849 st1 {v15.s}[0], [x7],#4 // top-most bit 850 sub x3,x3,x5,lsl#2 // rewind x3 851 subs x1,sp,#0 // clear carry flag 852 add x2,sp,x5,lsl#2 853 854.LNEON_sub: 855 ldp w4,w5,[x1],#8 856 ldp w6,w7,[x1],#8 857 ldp w8,w9,[x3],#8 858 ldp w10,w11,[x3],#8 859 sbcs w8,w4,w8 860 sbcs w9,w5,w9 861 sbcs w10,w6,w10 862 sbcs w11,w7,w11 863 sub x17,x2,x1 864 stp w8,w9,[x0],#8 865 stp w10,w11,[x0],#8 866 cbnz x17,.LNEON_sub 867 868 ldr w10, [x1] // load top-most bit 869 mov x11,sp 870 eor v0.16b,v0.16b,v0.16b 871 sub x11,x2,x11 // this is num*4 872 eor v1.16b,v1.16b,v1.16b 873 mov x1,sp 874 sub x0,x0,x11 // rewind x0 875 mov x3,x2 // second 3/4th of frame 876 sbcs w10,w10,wzr // result is carry flag 877 878.LNEON_copy_n_zap: 879 ldp w4,w5,[x1],#8 880 ldp w6,w7,[x1],#8 881 ldp w8,w9,[x0],#8 882 ldp w10,w11,[x0] 883 sub x0,x0,#8 884 b.cs .LCopy_1 885 mov w8,w4 886 mov w9,w5 887 mov w10,w6 888 mov w11,w7 889.LCopy_1: 890 st1 {v0.2d,v1.2d}, [x3],#32 // wipe 891 st1 {v0.2d,v1.2d}, [x3],#32 // wipe 892 ldp w4,w5,[x1],#8 893 ldp w6,w7,[x1],#8 894 stp w8,w9,[x0],#8 895 stp w10,w11,[x0],#8 896 sub x1,x1,#32 897 ldp w8,w9,[x0],#8 898 ldp w10,w11,[x0] 899 sub x0,x0,#8 900 b.cs .LCopy_2 901 mov w8, w4 902 mov w9, w5 903 mov w10, w6 904 mov w11, w7 905.LCopy_2: 906 st1 {v0.2d,v1.2d}, [x1],#32 // wipe 907 st1 {v0.2d,v1.2d}, [x3],#32 // wipe 908 sub x17,x2,x1 // preserves carry 909 stp w8,w9,[x0],#8 910 stp w10,w11,[x0],#8 911 cbnz x17,.LNEON_copy_n_zap 912 913 mov sp,x16 914 ldp d14,d15,[sp,#64] 915 ldp d12,d13,[sp,#48] 916 ldp d10,d11,[sp,#32] 917 ldp d8,d9,[sp,#16] 918 ldr x29,[sp],#80 919 ret // bx lr 920 921.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon 922.type __bn_sqr8x_mont,%function 923.align 5 924__bn_sqr8x_mont: 925 cmp x1,x2 926 b.ne __bn_mul4x_mont 927.Lsqr8x_mont: 928.inst 0xd503233f // paciasp 929 stp x29,x30,[sp,#-128]! 930 add x29,sp,#0 931 stp x19,x20,[sp,#16] 932 stp x21,x22,[sp,#32] 933 stp x23,x24,[sp,#48] 934 stp x25,x26,[sp,#64] 935 stp x27,x28,[sp,#80] 936 stp x0,x3,[sp,#96] // offload rp and np 937 938 ldp x6,x7,[x1,#8*0] 939 ldp x8,x9,[x1,#8*2] 940 ldp x10,x11,[x1,#8*4] 941 ldp x12,x13,[x1,#8*6] 942 943 sub x2,sp,x5,lsl#4 944 lsl x5,x5,#3 945 ldr x4,[x4] // *n0 946 mov sp,x2 // alloca 947 sub x27,x5,#8*8 948 b .Lsqr8x_zero_start 949 950.Lsqr8x_zero: 951 sub x27,x27,#8*8 952 stp xzr,xzr,[x2,#8*0] 953 stp xzr,xzr,[x2,#8*2] 954 stp xzr,xzr,[x2,#8*4] 955 stp xzr,xzr,[x2,#8*6] 956.Lsqr8x_zero_start: 957 stp xzr,xzr,[x2,#8*8] 958 stp xzr,xzr,[x2,#8*10] 959 stp xzr,xzr,[x2,#8*12] 960 stp xzr,xzr,[x2,#8*14] 961 add x2,x2,#8*16 962 cbnz x27,.Lsqr8x_zero 963 964 add x3,x1,x5 965 add x1,x1,#8*8 966 mov x19,xzr 967 mov x20,xzr 968 mov x21,xzr 969 mov x22,xzr 970 mov x23,xzr 971 mov x24,xzr 972 mov x25,xzr 973 mov x26,xzr 974 mov x2,sp 975 str x4,[x29,#112] // offload n0 976 977 // Multiply everything but a[i]*a[i] 978.align 4 979.Lsqr8x_outer_loop: 980 // a[1]a[0] (i) 981 // a[2]a[0] 982 // a[3]a[0] 983 // a[4]a[0] 984 // a[5]a[0] 985 // a[6]a[0] 986 // a[7]a[0] 987 // a[2]a[1] (ii) 988 // a[3]a[1] 989 // a[4]a[1] 990 // a[5]a[1] 991 // a[6]a[1] 992 // a[7]a[1] 993 // a[3]a[2] (iii) 994 // a[4]a[2] 995 // a[5]a[2] 996 // a[6]a[2] 997 // a[7]a[2] 998 // a[4]a[3] (iv) 999 // a[5]a[3] 1000 // a[6]a[3] 1001 // a[7]a[3] 1002 // a[5]a[4] (v) 1003 // a[6]a[4] 1004 // a[7]a[4] 1005 // a[6]a[5] (vi) 1006 // a[7]a[5] 1007 // a[7]a[6] (vii) 1008 1009 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) 1010 mul x15,x8,x6 1011 mul x16,x9,x6 1012 mul x17,x10,x6 1013 adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) 1014 mul x14,x11,x6 1015 adcs x21,x21,x15 1016 mul x15,x12,x6 1017 adcs x22,x22,x16 1018 mul x16,x13,x6 1019 adcs x23,x23,x17 1020 umulh x17,x7,x6 // hi(a[1..7]*a[0]) 1021 adcs x24,x24,x14 1022 umulh x14,x8,x6 1023 adcs x25,x25,x15 1024 umulh x15,x9,x6 1025 adcs x26,x26,x16 1026 umulh x16,x10,x6 1027 stp x19,x20,[x2],#8*2 // t[0..1] 1028 adc x19,xzr,xzr // t[8] 1029 adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) 1030 umulh x17,x11,x6 1031 adcs x22,x22,x14 1032 umulh x14,x12,x6 1033 adcs x23,x23,x15 1034 umulh x15,x13,x6 1035 adcs x24,x24,x16 1036 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) 1037 adcs x25,x25,x17 1038 mul x17,x9,x7 1039 adcs x26,x26,x14 1040 mul x14,x10,x7 1041 adc x19,x19,x15 1042 1043 mul x15,x11,x7 1044 adds x22,x22,x16 1045 mul x16,x12,x7 1046 adcs x23,x23,x17 1047 mul x17,x13,x7 1048 adcs x24,x24,x14 1049 umulh x14,x8,x7 // hi(a[2..7]*a[1]) 1050 adcs x25,x25,x15 1051 umulh x15,x9,x7 1052 adcs x26,x26,x16 1053 umulh x16,x10,x7 1054 adcs x19,x19,x17 1055 umulh x17,x11,x7 1056 stp x21,x22,[x2],#8*2 // t[2..3] 1057 adc x20,xzr,xzr // t[9] 1058 adds x23,x23,x14 1059 umulh x14,x12,x7 1060 adcs x24,x24,x15 1061 umulh x15,x13,x7 1062 adcs x25,x25,x16 1063 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) 1064 adcs x26,x26,x17 1065 mul x17,x10,x8 1066 adcs x19,x19,x14 1067 mul x14,x11,x8 1068 adc x20,x20,x15 1069 1070 mul x15,x12,x8 1071 adds x24,x24,x16 1072 mul x16,x13,x8 1073 adcs x25,x25,x17 1074 umulh x17,x9,x8 // hi(a[3..7]*a[2]) 1075 adcs x26,x26,x14 1076 umulh x14,x10,x8 1077 adcs x19,x19,x15 1078 umulh x15,x11,x8 1079 adcs x20,x20,x16 1080 umulh x16,x12,x8 1081 stp x23,x24,[x2],#8*2 // t[4..5] 1082 adc x21,xzr,xzr // t[10] 1083 adds x25,x25,x17 1084 umulh x17,x13,x8 1085 adcs x26,x26,x14 1086 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) 1087 adcs x19,x19,x15 1088 mul x15,x11,x9 1089 adcs x20,x20,x16 1090 mul x16,x12,x9 1091 adc x21,x21,x17 1092 1093 mul x17,x13,x9 1094 adds x26,x26,x14 1095 umulh x14,x10,x9 // hi(a[4..7]*a[3]) 1096 adcs x19,x19,x15 1097 umulh x15,x11,x9 1098 adcs x20,x20,x16 1099 umulh x16,x12,x9 1100 adcs x21,x21,x17 1101 umulh x17,x13,x9 1102 stp x25,x26,[x2],#8*2 // t[6..7] 1103 adc x22,xzr,xzr // t[11] 1104 adds x19,x19,x14 1105 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) 1106 adcs x20,x20,x15 1107 mul x15,x12,x10 1108 adcs x21,x21,x16 1109 mul x16,x13,x10 1110 adc x22,x22,x17 1111 1112 umulh x17,x11,x10 // hi(a[5..7]*a[4]) 1113 adds x20,x20,x14 1114 umulh x14,x12,x10 1115 adcs x21,x21,x15 1116 umulh x15,x13,x10 1117 adcs x22,x22,x16 1118 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) 1119 adc x23,xzr,xzr // t[12] 1120 adds x21,x21,x17 1121 mul x17,x13,x11 1122 adcs x22,x22,x14 1123 umulh x14,x12,x11 // hi(a[6..7]*a[5]) 1124 adc x23,x23,x15 1125 1126 umulh x15,x13,x11 1127 adds x22,x22,x16 1128 mul x16,x13,x12 // lo(a[7]*a[6]) (vii) 1129 adcs x23,x23,x17 1130 umulh x17,x13,x12 // hi(a[7]*a[6]) 1131 adc x24,xzr,xzr // t[13] 1132 adds x23,x23,x14 1133 sub x27,x3,x1 // done yet? 1134 adc x24,x24,x15 1135 1136 adds x24,x24,x16 1137 sub x14,x3,x5 // rewinded ap 1138 adc x25,xzr,xzr // t[14] 1139 add x25,x25,x17 1140 1141 cbz x27,.Lsqr8x_outer_break 1142 1143 mov x4,x6 1144 ldp x6,x7,[x2,#8*0] 1145 ldp x8,x9,[x2,#8*2] 1146 ldp x10,x11,[x2,#8*4] 1147 ldp x12,x13,[x2,#8*6] 1148 adds x19,x19,x6 1149 adcs x20,x20,x7 1150 ldp x6,x7,[x1,#8*0] 1151 adcs x21,x21,x8 1152 adcs x22,x22,x9 1153 ldp x8,x9,[x1,#8*2] 1154 adcs x23,x23,x10 1155 adcs x24,x24,x11 1156 ldp x10,x11,[x1,#8*4] 1157 adcs x25,x25,x12 1158 mov x0,x1 1159 adcs x26,xzr,x13 1160 ldp x12,x13,[x1,#8*6] 1161 add x1,x1,#8*8 1162 //adc x28,xzr,xzr // moved below 1163 mov x27,#-8*8 1164 1165 // a[8]a[0] 1166 // a[9]a[0] 1167 // a[a]a[0] 1168 // a[b]a[0] 1169 // a[c]a[0] 1170 // a[d]a[0] 1171 // a[e]a[0] 1172 // a[f]a[0] 1173 // a[8]a[1] 1174 // a[f]a[1]........................ 1175 // a[8]a[2] 1176 // a[f]a[2]........................ 1177 // a[8]a[3] 1178 // a[f]a[3]........................ 1179 // a[8]a[4] 1180 // a[f]a[4]........................ 1181 // a[8]a[5] 1182 // a[f]a[5]........................ 1183 // a[8]a[6] 1184 // a[f]a[6]........................ 1185 // a[8]a[7] 1186 // a[f]a[7]........................ 1187.Lsqr8x_mul: 1188 mul x14,x6,x4 1189 adc x28,xzr,xzr // carry bit, modulo-scheduled 1190 mul x15,x7,x4 1191 add x27,x27,#8 1192 mul x16,x8,x4 1193 mul x17,x9,x4 1194 adds x19,x19,x14 1195 mul x14,x10,x4 1196 adcs x20,x20,x15 1197 mul x15,x11,x4 1198 adcs x21,x21,x16 1199 mul x16,x12,x4 1200 adcs x22,x22,x17 1201 mul x17,x13,x4 1202 adcs x23,x23,x14 1203 umulh x14,x6,x4 1204 adcs x24,x24,x15 1205 umulh x15,x7,x4 1206 adcs x25,x25,x16 1207 umulh x16,x8,x4 1208 adcs x26,x26,x17 1209 umulh x17,x9,x4 1210 adc x28,x28,xzr 1211 str x19,[x2],#8 1212 adds x19,x20,x14 1213 umulh x14,x10,x4 1214 adcs x20,x21,x15 1215 umulh x15,x11,x4 1216 adcs x21,x22,x16 1217 umulh x16,x12,x4 1218 adcs x22,x23,x17 1219 umulh x17,x13,x4 1220 ldr x4,[x0,x27] 1221 adcs x23,x24,x14 1222 adcs x24,x25,x15 1223 adcs x25,x26,x16 1224 adcs x26,x28,x17 1225 //adc x28,xzr,xzr // moved above 1226 cbnz x27,.Lsqr8x_mul 1227 // note that carry flag is guaranteed 1228 // to be zero at this point 1229 cmp x1,x3 // done yet? 1230 b.eq .Lsqr8x_break 1231 1232 ldp x6,x7,[x2,#8*0] 1233 ldp x8,x9,[x2,#8*2] 1234 ldp x10,x11,[x2,#8*4] 1235 ldp x12,x13,[x2,#8*6] 1236 adds x19,x19,x6 1237 ldur x4,[x0,#-8*8] 1238 adcs x20,x20,x7 1239 ldp x6,x7,[x1,#8*0] 1240 adcs x21,x21,x8 1241 adcs x22,x22,x9 1242 ldp x8,x9,[x1,#8*2] 1243 adcs x23,x23,x10 1244 adcs x24,x24,x11 1245 ldp x10,x11,[x1,#8*4] 1246 adcs x25,x25,x12 1247 mov x27,#-8*8 1248 adcs x26,x26,x13 1249 ldp x12,x13,[x1,#8*6] 1250 add x1,x1,#8*8 1251 //adc x28,xzr,xzr // moved above 1252 b .Lsqr8x_mul 1253 1254.align 4 1255.Lsqr8x_break: 1256 ldp x6,x7,[x0,#8*0] 1257 add x1,x0,#8*8 1258 ldp x8,x9,[x0,#8*2] 1259 sub x14,x3,x1 // is it last iteration? 1260 ldp x10,x11,[x0,#8*4] 1261 sub x15,x2,x14 1262 ldp x12,x13,[x0,#8*6] 1263 cbz x14,.Lsqr8x_outer_loop 1264 1265 stp x19,x20,[x2,#8*0] 1266 ldp x19,x20,[x15,#8*0] 1267 stp x21,x22,[x2,#8*2] 1268 ldp x21,x22,[x15,#8*2] 1269 stp x23,x24,[x2,#8*4] 1270 ldp x23,x24,[x15,#8*4] 1271 stp x25,x26,[x2,#8*6] 1272 mov x2,x15 1273 ldp x25,x26,[x15,#8*6] 1274 b .Lsqr8x_outer_loop 1275 1276.align 4 1277.Lsqr8x_outer_break: 1278 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] 1279 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] 1280 ldp x15,x16,[sp,#8*1] 1281 ldp x11,x13,[x14,#8*2] 1282 add x1,x14,#8*4 1283 ldp x17,x14,[sp,#8*3] 1284 1285 stp x19,x20,[x2,#8*0] 1286 mul x19,x7,x7 1287 stp x21,x22,[x2,#8*2] 1288 umulh x7,x7,x7 1289 stp x23,x24,[x2,#8*4] 1290 mul x8,x9,x9 1291 stp x25,x26,[x2,#8*6] 1292 mov x2,sp 1293 umulh x9,x9,x9 1294 adds x20,x7,x15,lsl#1 1295 extr x15,x16,x15,#63 1296 sub x27,x5,#8*4 1297 1298.Lsqr4x_shift_n_add: 1299 adcs x21,x8,x15 1300 extr x16,x17,x16,#63 1301 sub x27,x27,#8*4 1302 adcs x22,x9,x16 1303 ldp x15,x16,[x2,#8*5] 1304 mul x10,x11,x11 1305 ldp x7,x9,[x1],#8*2 1306 umulh x11,x11,x11 1307 mul x12,x13,x13 1308 umulh x13,x13,x13 1309 extr x17,x14,x17,#63 1310 stp x19,x20,[x2,#8*0] 1311 adcs x23,x10,x17 1312 extr x14,x15,x14,#63 1313 stp x21,x22,[x2,#8*2] 1314 adcs x24,x11,x14 1315 ldp x17,x14,[x2,#8*7] 1316 extr x15,x16,x15,#63 1317 adcs x25,x12,x15 1318 extr x16,x17,x16,#63 1319 adcs x26,x13,x16 1320 ldp x15,x16,[x2,#8*9] 1321 mul x6,x7,x7 1322 ldp x11,x13,[x1],#8*2 1323 umulh x7,x7,x7 1324 mul x8,x9,x9 1325 umulh x9,x9,x9 1326 stp x23,x24,[x2,#8*4] 1327 extr x17,x14,x17,#63 1328 stp x25,x26,[x2,#8*6] 1329 add x2,x2,#8*8 1330 adcs x19,x6,x17 1331 extr x14,x15,x14,#63 1332 adcs x20,x7,x14 1333 ldp x17,x14,[x2,#8*3] 1334 extr x15,x16,x15,#63 1335 cbnz x27,.Lsqr4x_shift_n_add 1336 ldp x1,x4,[x29,#104] // pull np and n0 1337 1338 adcs x21,x8,x15 1339 extr x16,x17,x16,#63 1340 adcs x22,x9,x16 1341 ldp x15,x16,[x2,#8*5] 1342 mul x10,x11,x11 1343 umulh x11,x11,x11 1344 stp x19,x20,[x2,#8*0] 1345 mul x12,x13,x13 1346 umulh x13,x13,x13 1347 stp x21,x22,[x2,#8*2] 1348 extr x17,x14,x17,#63 1349 adcs x23,x10,x17 1350 extr x14,x15,x14,#63 1351 ldp x19,x20,[sp,#8*0] 1352 adcs x24,x11,x14 1353 extr x15,x16,x15,#63 1354 ldp x6,x7,[x1,#8*0] 1355 adcs x25,x12,x15 1356 extr x16,xzr,x16,#63 1357 ldp x8,x9,[x1,#8*2] 1358 adc x26,x13,x16 1359 ldp x10,x11,[x1,#8*4] 1360 1361 // Reduce by 512 bits per iteration 1362 mul x28,x4,x19 // t[0]*n0 1363 ldp x12,x13,[x1,#8*6] 1364 add x3,x1,x5 1365 ldp x21,x22,[sp,#8*2] 1366 stp x23,x24,[x2,#8*4] 1367 ldp x23,x24,[sp,#8*4] 1368 stp x25,x26,[x2,#8*6] 1369 ldp x25,x26,[sp,#8*6] 1370 add x1,x1,#8*8 1371 mov x30,xzr // initial top-most carry 1372 mov x2,sp 1373 mov x27,#8 1374 1375.Lsqr8x_reduction: 1376 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) 1377 mul x15,x7,x28 1378 sub x27,x27,#1 1379 mul x16,x8,x28 1380 str x28,[x2],#8 // put aside t[0]*n0 for tail processing 1381 mul x17,x9,x28 1382 // (*) adds xzr,x19,x14 1383 subs xzr,x19,#1 // (*) 1384 mul x14,x10,x28 1385 adcs x19,x20,x15 1386 mul x15,x11,x28 1387 adcs x20,x21,x16 1388 mul x16,x12,x28 1389 adcs x21,x22,x17 1390 mul x17,x13,x28 1391 adcs x22,x23,x14 1392 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) 1393 adcs x23,x24,x15 1394 umulh x15,x7,x28 1395 adcs x24,x25,x16 1396 umulh x16,x8,x28 1397 adcs x25,x26,x17 1398 umulh x17,x9,x28 1399 adc x26,xzr,xzr 1400 adds x19,x19,x14 1401 umulh x14,x10,x28 1402 adcs x20,x20,x15 1403 umulh x15,x11,x28 1404 adcs x21,x21,x16 1405 umulh x16,x12,x28 1406 adcs x22,x22,x17 1407 umulh x17,x13,x28 1408 mul x28,x4,x19 // next t[0]*n0 1409 adcs x23,x23,x14 1410 adcs x24,x24,x15 1411 adcs x25,x25,x16 1412 adc x26,x26,x17 1413 cbnz x27,.Lsqr8x_reduction 1414 1415 ldp x14,x15,[x2,#8*0] 1416 ldp x16,x17,[x2,#8*2] 1417 mov x0,x2 1418 sub x27,x3,x1 // done yet? 1419 adds x19,x19,x14 1420 adcs x20,x20,x15 1421 ldp x14,x15,[x2,#8*4] 1422 adcs x21,x21,x16 1423 adcs x22,x22,x17 1424 ldp x16,x17,[x2,#8*6] 1425 adcs x23,x23,x14 1426 adcs x24,x24,x15 1427 adcs x25,x25,x16 1428 adcs x26,x26,x17 1429 //adc x28,xzr,xzr // moved below 1430 cbz x27,.Lsqr8x8_post_condition 1431 1432 ldur x4,[x2,#-8*8] 1433 ldp x6,x7,[x1,#8*0] 1434 ldp x8,x9,[x1,#8*2] 1435 ldp x10,x11,[x1,#8*4] 1436 mov x27,#-8*8 1437 ldp x12,x13,[x1,#8*6] 1438 add x1,x1,#8*8 1439 1440.Lsqr8x_tail: 1441 mul x14,x6,x4 1442 adc x28,xzr,xzr // carry bit, modulo-scheduled 1443 mul x15,x7,x4 1444 add x27,x27,#8 1445 mul x16,x8,x4 1446 mul x17,x9,x4 1447 adds x19,x19,x14 1448 mul x14,x10,x4 1449 adcs x20,x20,x15 1450 mul x15,x11,x4 1451 adcs x21,x21,x16 1452 mul x16,x12,x4 1453 adcs x22,x22,x17 1454 mul x17,x13,x4 1455 adcs x23,x23,x14 1456 umulh x14,x6,x4 1457 adcs x24,x24,x15 1458 umulh x15,x7,x4 1459 adcs x25,x25,x16 1460 umulh x16,x8,x4 1461 adcs x26,x26,x17 1462 umulh x17,x9,x4 1463 adc x28,x28,xzr 1464 str x19,[x2],#8 1465 adds x19,x20,x14 1466 umulh x14,x10,x4 1467 adcs x20,x21,x15 1468 umulh x15,x11,x4 1469 adcs x21,x22,x16 1470 umulh x16,x12,x4 1471 adcs x22,x23,x17 1472 umulh x17,x13,x4 1473 ldr x4,[x0,x27] 1474 adcs x23,x24,x14 1475 adcs x24,x25,x15 1476 adcs x25,x26,x16 1477 adcs x26,x28,x17 1478 //adc x28,xzr,xzr // moved above 1479 cbnz x27,.Lsqr8x_tail 1480 // note that carry flag is guaranteed 1481 // to be zero at this point 1482 ldp x6,x7,[x2,#8*0] 1483 sub x27,x3,x1 // done yet? 1484 sub x16,x3,x5 // rewinded np 1485 ldp x8,x9,[x2,#8*2] 1486 ldp x10,x11,[x2,#8*4] 1487 ldp x12,x13,[x2,#8*6] 1488 cbz x27,.Lsqr8x_tail_break 1489 1490 ldur x4,[x0,#-8*8] 1491 adds x19,x19,x6 1492 adcs x20,x20,x7 1493 ldp x6,x7,[x1,#8*0] 1494 adcs x21,x21,x8 1495 adcs x22,x22,x9 1496 ldp x8,x9,[x1,#8*2] 1497 adcs x23,x23,x10 1498 adcs x24,x24,x11 1499 ldp x10,x11,[x1,#8*4] 1500 adcs x25,x25,x12 1501 mov x27,#-8*8 1502 adcs x26,x26,x13 1503 ldp x12,x13,[x1,#8*6] 1504 add x1,x1,#8*8 1505 //adc x28,xzr,xzr // moved above 1506 b .Lsqr8x_tail 1507 1508.align 4 1509.Lsqr8x_tail_break: 1510 ldr x4,[x29,#112] // pull n0 1511 add x27,x2,#8*8 // end of current t[num] window 1512 1513 subs xzr,x30,#1 // "move" top-most carry to carry bit 1514 adcs x14,x19,x6 1515 adcs x15,x20,x7 1516 ldp x19,x20,[x0,#8*0] 1517 adcs x21,x21,x8 1518 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] 1519 adcs x22,x22,x9 1520 ldp x8,x9,[x16,#8*2] 1521 adcs x23,x23,x10 1522 adcs x24,x24,x11 1523 ldp x10,x11,[x16,#8*4] 1524 adcs x25,x25,x12 1525 adcs x26,x26,x13 1526 ldp x12,x13,[x16,#8*6] 1527 add x1,x16,#8*8 1528 adc x30,xzr,xzr // top-most carry 1529 mul x28,x4,x19 1530 stp x14,x15,[x2,#8*0] 1531 stp x21,x22,[x2,#8*2] 1532 ldp x21,x22,[x0,#8*2] 1533 stp x23,x24,[x2,#8*4] 1534 ldp x23,x24,[x0,#8*4] 1535 cmp x27,x29 // did we hit the bottom? 1536 stp x25,x26,[x2,#8*6] 1537 mov x2,x0 // slide the window 1538 ldp x25,x26,[x0,#8*6] 1539 mov x27,#8 1540 b.ne .Lsqr8x_reduction 1541 1542 // Final step. We see if result is larger than modulus, and 1543 // if it is, subtract the modulus. But comparison implies 1544 // subtraction. So we subtract modulus, see if it borrowed, 1545 // and conditionally copy original value. 1546 ldr x0,[x29,#96] // pull rp 1547 add x2,x2,#8*8 1548 subs x14,x19,x6 1549 sbcs x15,x20,x7 1550 sub x27,x5,#8*8 1551 mov x3,x0 // x0 copy 1552 1553.Lsqr8x_sub: 1554 sbcs x16,x21,x8 1555 ldp x6,x7,[x1,#8*0] 1556 sbcs x17,x22,x9 1557 stp x14,x15,[x0,#8*0] 1558 sbcs x14,x23,x10 1559 ldp x8,x9,[x1,#8*2] 1560 sbcs x15,x24,x11 1561 stp x16,x17,[x0,#8*2] 1562 sbcs x16,x25,x12 1563 ldp x10,x11,[x1,#8*4] 1564 sbcs x17,x26,x13 1565 ldp x12,x13,[x1,#8*6] 1566 add x1,x1,#8*8 1567 ldp x19,x20,[x2,#8*0] 1568 sub x27,x27,#8*8 1569 ldp x21,x22,[x2,#8*2] 1570 ldp x23,x24,[x2,#8*4] 1571 ldp x25,x26,[x2,#8*6] 1572 add x2,x2,#8*8 1573 stp x14,x15,[x0,#8*4] 1574 sbcs x14,x19,x6 1575 stp x16,x17,[x0,#8*6] 1576 add x0,x0,#8*8 1577 sbcs x15,x20,x7 1578 cbnz x27,.Lsqr8x_sub 1579 1580 sbcs x16,x21,x8 1581 mov x2,sp 1582 add x1,sp,x5 1583 ldp x6,x7,[x3,#8*0] 1584 sbcs x17,x22,x9 1585 stp x14,x15,[x0,#8*0] 1586 sbcs x14,x23,x10 1587 ldp x8,x9,[x3,#8*2] 1588 sbcs x15,x24,x11 1589 stp x16,x17,[x0,#8*2] 1590 sbcs x16,x25,x12 1591 ldp x19,x20,[x1,#8*0] 1592 sbcs x17,x26,x13 1593 ldp x21,x22,[x1,#8*2] 1594 sbcs xzr,x30,xzr // did it borrow? 1595 ldr x30,[x29,#8] // pull return address 1596 stp x14,x15,[x0,#8*4] 1597 stp x16,x17,[x0,#8*6] 1598 1599 sub x27,x5,#8*4 1600.Lsqr4x_cond_copy: 1601 sub x27,x27,#8*4 1602 csel x14,x19,x6,lo 1603 stp xzr,xzr,[x2,#8*0] 1604 csel x15,x20,x7,lo 1605 ldp x6,x7,[x3,#8*4] 1606 ldp x19,x20,[x1,#8*4] 1607 csel x16,x21,x8,lo 1608 stp xzr,xzr,[x2,#8*2] 1609 add x2,x2,#8*4 1610 csel x17,x22,x9,lo 1611 ldp x8,x9,[x3,#8*6] 1612 ldp x21,x22,[x1,#8*6] 1613 add x1,x1,#8*4 1614 stp x14,x15,[x3,#8*0] 1615 stp x16,x17,[x3,#8*2] 1616 add x3,x3,#8*4 1617 stp xzr,xzr,[x1,#8*0] 1618 stp xzr,xzr,[x1,#8*2] 1619 cbnz x27,.Lsqr4x_cond_copy 1620 1621 csel x14,x19,x6,lo 1622 stp xzr,xzr,[x2,#8*0] 1623 csel x15,x20,x7,lo 1624 stp xzr,xzr,[x2,#8*2] 1625 csel x16,x21,x8,lo 1626 csel x17,x22,x9,lo 1627 stp x14,x15,[x3,#8*0] 1628 stp x16,x17,[x3,#8*2] 1629 1630 b .Lsqr8x_done 1631 1632.align 4 1633.Lsqr8x8_post_condition: 1634 adc x28,xzr,xzr 1635 ldr x30,[x29,#8] // pull return address 1636 // x19-7,x28 hold result, x6-7 hold modulus 1637 subs x6,x19,x6 1638 ldr x1,[x29,#96] // pull rp 1639 sbcs x7,x20,x7 1640 stp xzr,xzr,[sp,#8*0] 1641 sbcs x8,x21,x8 1642 stp xzr,xzr,[sp,#8*2] 1643 sbcs x9,x22,x9 1644 stp xzr,xzr,[sp,#8*4] 1645 sbcs x10,x23,x10 1646 stp xzr,xzr,[sp,#8*6] 1647 sbcs x11,x24,x11 1648 stp xzr,xzr,[sp,#8*8] 1649 sbcs x12,x25,x12 1650 stp xzr,xzr,[sp,#8*10] 1651 sbcs x13,x26,x13 1652 stp xzr,xzr,[sp,#8*12] 1653 sbcs x28,x28,xzr // did it borrow? 1654 stp xzr,xzr,[sp,#8*14] 1655 1656 // x6-7 hold result-modulus 1657 csel x6,x19,x6,lo 1658 csel x7,x20,x7,lo 1659 csel x8,x21,x8,lo 1660 csel x9,x22,x9,lo 1661 stp x6,x7,[x1,#8*0] 1662 csel x10,x23,x10,lo 1663 csel x11,x24,x11,lo 1664 stp x8,x9,[x1,#8*2] 1665 csel x12,x25,x12,lo 1666 csel x13,x26,x13,lo 1667 stp x10,x11,[x1,#8*4] 1668 stp x12,x13,[x1,#8*6] 1669 1670.Lsqr8x_done: 1671 ldp x19,x20,[x29,#16] 1672 mov sp,x29 1673 ldp x21,x22,[x29,#32] 1674 mov x0,#1 1675 ldp x23,x24,[x29,#48] 1676 ldp x25,x26,[x29,#64] 1677 ldp x27,x28,[x29,#80] 1678 ldr x29,[sp],#128 1679.inst 0xd50323bf // autiasp 1680 ret 1681.size __bn_sqr8x_mont,.-__bn_sqr8x_mont 1682.type __bn_mul4x_mont,%function 1683.align 5 1684__bn_mul4x_mont: 1685.inst 0xd503233f // paciasp 1686 stp x29,x30,[sp,#-128]! 1687 add x29,sp,#0 1688 stp x19,x20,[sp,#16] 1689 stp x21,x22,[sp,#32] 1690 stp x23,x24,[sp,#48] 1691 stp x25,x26,[sp,#64] 1692 stp x27,x28,[sp,#80] 1693 1694 sub x26,sp,x5,lsl#3 1695 lsl x5,x5,#3 1696 ldr x4,[x4] // *n0 1697 sub sp,x26,#8*4 // alloca 1698 1699 add x10,x2,x5 1700 add x27,x1,x5 1701 stp x0,x10,[x29,#96] // offload rp and &b[num] 1702 1703 ldr x24,[x2,#8*0] // b[0] 1704 ldp x6,x7,[x1,#8*0] // a[0..3] 1705 ldp x8,x9,[x1,#8*2] 1706 add x1,x1,#8*4 1707 mov x19,xzr 1708 mov x20,xzr 1709 mov x21,xzr 1710 mov x22,xzr 1711 ldp x14,x15,[x3,#8*0] // n[0..3] 1712 ldp x16,x17,[x3,#8*2] 1713 adds x3,x3,#8*4 // clear carry bit 1714 mov x0,xzr 1715 mov x28,#0 1716 mov x26,sp 1717 1718.Loop_mul4x_1st_reduction: 1719 mul x10,x6,x24 // lo(a[0..3]*b[0]) 1720 adc x0,x0,xzr // modulo-scheduled 1721 mul x11,x7,x24 1722 add x28,x28,#8 1723 mul x12,x8,x24 1724 and x28,x28,#31 1725 mul x13,x9,x24 1726 adds x19,x19,x10 1727 umulh x10,x6,x24 // hi(a[0..3]*b[0]) 1728 adcs x20,x20,x11 1729 mul x25,x19,x4 // t[0]*n0 1730 adcs x21,x21,x12 1731 umulh x11,x7,x24 1732 adcs x22,x22,x13 1733 umulh x12,x8,x24 1734 adc x23,xzr,xzr 1735 umulh x13,x9,x24 1736 ldr x24,[x2,x28] // next b[i] (or b[0]) 1737 adds x20,x20,x10 1738 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) 1739 str x25,[x26],#8 // put aside t[0]*n0 for tail processing 1740 adcs x21,x21,x11 1741 mul x11,x15,x25 1742 adcs x22,x22,x12 1743 mul x12,x16,x25 1744 adc x23,x23,x13 // can't overflow 1745 mul x13,x17,x25 1746 // (*) adds xzr,x19,x10 1747 subs xzr,x19,#1 // (*) 1748 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) 1749 adcs x19,x20,x11 1750 umulh x11,x15,x25 1751 adcs x20,x21,x12 1752 umulh x12,x16,x25 1753 adcs x21,x22,x13 1754 umulh x13,x17,x25 1755 adcs x22,x23,x0 1756 adc x0,xzr,xzr 1757 adds x19,x19,x10 1758 sub x10,x27,x1 1759 adcs x20,x20,x11 1760 adcs x21,x21,x12 1761 adcs x22,x22,x13 1762 //adc x0,x0,xzr 1763 cbnz x28,.Loop_mul4x_1st_reduction 1764 1765 cbz x10,.Lmul4x4_post_condition 1766 1767 ldp x6,x7,[x1,#8*0] // a[4..7] 1768 ldp x8,x9,[x1,#8*2] 1769 add x1,x1,#8*4 1770 ldr x25,[sp] // a[0]*n0 1771 ldp x14,x15,[x3,#8*0] // n[4..7] 1772 ldp x16,x17,[x3,#8*2] 1773 add x3,x3,#8*4 1774 1775.Loop_mul4x_1st_tail: 1776 mul x10,x6,x24 // lo(a[4..7]*b[i]) 1777 adc x0,x0,xzr // modulo-scheduled 1778 mul x11,x7,x24 1779 add x28,x28,#8 1780 mul x12,x8,x24 1781 and x28,x28,#31 1782 mul x13,x9,x24 1783 adds x19,x19,x10 1784 umulh x10,x6,x24 // hi(a[4..7]*b[i]) 1785 adcs x20,x20,x11 1786 umulh x11,x7,x24 1787 adcs x21,x21,x12 1788 umulh x12,x8,x24 1789 adcs x22,x22,x13 1790 umulh x13,x9,x24 1791 adc x23,xzr,xzr 1792 ldr x24,[x2,x28] // next b[i] (or b[0]) 1793 adds x20,x20,x10 1794 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) 1795 adcs x21,x21,x11 1796 mul x11,x15,x25 1797 adcs x22,x22,x12 1798 mul x12,x16,x25 1799 adc x23,x23,x13 // can't overflow 1800 mul x13,x17,x25 1801 adds x19,x19,x10 1802 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) 1803 adcs x20,x20,x11 1804 umulh x11,x15,x25 1805 adcs x21,x21,x12 1806 umulh x12,x16,x25 1807 adcs x22,x22,x13 1808 adcs x23,x23,x0 1809 umulh x13,x17,x25 1810 adc x0,xzr,xzr 1811 ldr x25,[sp,x28] // next t[0]*n0 1812 str x19,[x26],#8 // result!!! 1813 adds x19,x20,x10 1814 sub x10,x27,x1 // done yet? 1815 adcs x20,x21,x11 1816 adcs x21,x22,x12 1817 adcs x22,x23,x13 1818 //adc x0,x0,xzr 1819 cbnz x28,.Loop_mul4x_1st_tail 1820 1821 sub x11,x27,x5 // rewinded x1 1822 cbz x10,.Lmul4x_proceed 1823 1824 ldp x6,x7,[x1,#8*0] 1825 ldp x8,x9,[x1,#8*2] 1826 add x1,x1,#8*4 1827 ldp x14,x15,[x3,#8*0] 1828 ldp x16,x17,[x3,#8*2] 1829 add x3,x3,#8*4 1830 b .Loop_mul4x_1st_tail 1831 1832.align 5 1833.Lmul4x_proceed: 1834 ldr x24,[x2,#8*4]! // *++b 1835 adc x30,x0,xzr 1836 ldp x6,x7,[x11,#8*0] // a[0..3] 1837 sub x3,x3,x5 // rewind np 1838 ldp x8,x9,[x11,#8*2] 1839 add x1,x11,#8*4 1840 1841 stp x19,x20,[x26,#8*0] // result!!! 1842 ldp x19,x20,[sp,#8*4] // t[0..3] 1843 stp x21,x22,[x26,#8*2] // result!!! 1844 ldp x21,x22,[sp,#8*6] 1845 1846 ldp x14,x15,[x3,#8*0] // n[0..3] 1847 mov x26,sp 1848 ldp x16,x17,[x3,#8*2] 1849 adds x3,x3,#8*4 // clear carry bit 1850 mov x0,xzr 1851 1852.align 4 1853.Loop_mul4x_reduction: 1854 mul x10,x6,x24 // lo(a[0..3]*b[4]) 1855 adc x0,x0,xzr // modulo-scheduled 1856 mul x11,x7,x24 1857 add x28,x28,#8 1858 mul x12,x8,x24 1859 and x28,x28,#31 1860 mul x13,x9,x24 1861 adds x19,x19,x10 1862 umulh x10,x6,x24 // hi(a[0..3]*b[4]) 1863 adcs x20,x20,x11 1864 mul x25,x19,x4 // t[0]*n0 1865 adcs x21,x21,x12 1866 umulh x11,x7,x24 1867 adcs x22,x22,x13 1868 umulh x12,x8,x24 1869 adc x23,xzr,xzr 1870 umulh x13,x9,x24 1871 ldr x24,[x2,x28] // next b[i] 1872 adds x20,x20,x10 1873 // (*) mul x10,x14,x25 1874 str x25,[x26],#8 // put aside t[0]*n0 for tail processing 1875 adcs x21,x21,x11 1876 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 1877 adcs x22,x22,x12 1878 mul x12,x16,x25 1879 adc x23,x23,x13 // can't overflow 1880 mul x13,x17,x25 1881 // (*) adds xzr,x19,x10 1882 subs xzr,x19,#1 // (*) 1883 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 1884 adcs x19,x20,x11 1885 umulh x11,x15,x25 1886 adcs x20,x21,x12 1887 umulh x12,x16,x25 1888 adcs x21,x22,x13 1889 umulh x13,x17,x25 1890 adcs x22,x23,x0 1891 adc x0,xzr,xzr 1892 adds x19,x19,x10 1893 adcs x20,x20,x11 1894 adcs x21,x21,x12 1895 adcs x22,x22,x13 1896 //adc x0,x0,xzr 1897 cbnz x28,.Loop_mul4x_reduction 1898 1899 adc x0,x0,xzr 1900 ldp x10,x11,[x26,#8*4] // t[4..7] 1901 ldp x12,x13,[x26,#8*6] 1902 ldp x6,x7,[x1,#8*0] // a[4..7] 1903 ldp x8,x9,[x1,#8*2] 1904 add x1,x1,#8*4 1905 adds x19,x19,x10 1906 adcs x20,x20,x11 1907 adcs x21,x21,x12 1908 adcs x22,x22,x13 1909 //adc x0,x0,xzr 1910 1911 ldr x25,[sp] // t[0]*n0 1912 ldp x14,x15,[x3,#8*0] // n[4..7] 1913 ldp x16,x17,[x3,#8*2] 1914 add x3,x3,#8*4 1915 1916.align 4 1917.Loop_mul4x_tail: 1918 mul x10,x6,x24 // lo(a[4..7]*b[4]) 1919 adc x0,x0,xzr // modulo-scheduled 1920 mul x11,x7,x24 1921 add x28,x28,#8 1922 mul x12,x8,x24 1923 and x28,x28,#31 1924 mul x13,x9,x24 1925 adds x19,x19,x10 1926 umulh x10,x6,x24 // hi(a[4..7]*b[4]) 1927 adcs x20,x20,x11 1928 umulh x11,x7,x24 1929 adcs x21,x21,x12 1930 umulh x12,x8,x24 1931 adcs x22,x22,x13 1932 umulh x13,x9,x24 1933 adc x23,xzr,xzr 1934 ldr x24,[x2,x28] // next b[i] 1935 adds x20,x20,x10 1936 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) 1937 adcs x21,x21,x11 1938 mul x11,x15,x25 1939 adcs x22,x22,x12 1940 mul x12,x16,x25 1941 adc x23,x23,x13 // can't overflow 1942 mul x13,x17,x25 1943 adds x19,x19,x10 1944 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) 1945 adcs x20,x20,x11 1946 umulh x11,x15,x25 1947 adcs x21,x21,x12 1948 umulh x12,x16,x25 1949 adcs x22,x22,x13 1950 umulh x13,x17,x25 1951 adcs x23,x23,x0 1952 ldr x25,[sp,x28] // next a[0]*n0 1953 adc x0,xzr,xzr 1954 str x19,[x26],#8 // result!!! 1955 adds x19,x20,x10 1956 sub x10,x27,x1 // done yet? 1957 adcs x20,x21,x11 1958 adcs x21,x22,x12 1959 adcs x22,x23,x13 1960 //adc x0,x0,xzr 1961 cbnz x28,.Loop_mul4x_tail 1962 1963 sub x11,x3,x5 // rewinded np? 1964 adc x0,x0,xzr 1965 cbz x10,.Loop_mul4x_break 1966 1967 ldp x10,x11,[x26,#8*4] 1968 ldp x12,x13,[x26,#8*6] 1969 ldp x6,x7,[x1,#8*0] 1970 ldp x8,x9,[x1,#8*2] 1971 add x1,x1,#8*4 1972 adds x19,x19,x10 1973 adcs x20,x20,x11 1974 adcs x21,x21,x12 1975 adcs x22,x22,x13 1976 //adc x0,x0,xzr 1977 ldp x14,x15,[x3,#8*0] 1978 ldp x16,x17,[x3,#8*2] 1979 add x3,x3,#8*4 1980 b .Loop_mul4x_tail 1981 1982.align 4 1983.Loop_mul4x_break: 1984 ldp x12,x13,[x29,#96] // pull rp and &b[num] 1985 adds x19,x19,x30 1986 add x2,x2,#8*4 // bp++ 1987 adcs x20,x20,xzr 1988 sub x1,x1,x5 // rewind ap 1989 adcs x21,x21,xzr 1990 stp x19,x20,[x26,#8*0] // result!!! 1991 adcs x22,x22,xzr 1992 ldp x19,x20,[sp,#8*4] // t[0..3] 1993 adc x30,x0,xzr 1994 stp x21,x22,[x26,#8*2] // result!!! 1995 cmp x2,x13 // done yet? 1996 ldp x21,x22,[sp,#8*6] 1997 ldp x14,x15,[x11,#8*0] // n[0..3] 1998 ldp x16,x17,[x11,#8*2] 1999 add x3,x11,#8*4 2000 b.eq .Lmul4x_post 2001 2002 ldr x24,[x2] 2003 ldp x6,x7,[x1,#8*0] // a[0..3] 2004 ldp x8,x9,[x1,#8*2] 2005 adds x1,x1,#8*4 // clear carry bit 2006 mov x0,xzr 2007 mov x26,sp 2008 b .Loop_mul4x_reduction 2009 2010.align 4 2011.Lmul4x_post: 2012 // Final step. We see if result is larger than modulus, and 2013 // if it is, subtract the modulus. But comparison implies 2014 // subtraction. So we subtract modulus, see if it borrowed, 2015 // and conditionally copy original value. 2016 mov x0,x12 2017 mov x27,x12 // x0 copy 2018 subs x10,x19,x14 2019 add x26,sp,#8*8 2020 sbcs x11,x20,x15 2021 sub x28,x5,#8*4 2022 2023.Lmul4x_sub: 2024 sbcs x12,x21,x16 2025 ldp x14,x15,[x3,#8*0] 2026 sub x28,x28,#8*4 2027 ldp x19,x20,[x26,#8*0] 2028 sbcs x13,x22,x17 2029 ldp x16,x17,[x3,#8*2] 2030 add x3,x3,#8*4 2031 ldp x21,x22,[x26,#8*2] 2032 add x26,x26,#8*4 2033 stp x10,x11,[x0,#8*0] 2034 sbcs x10,x19,x14 2035 stp x12,x13,[x0,#8*2] 2036 add x0,x0,#8*4 2037 sbcs x11,x20,x15 2038 cbnz x28,.Lmul4x_sub 2039 2040 sbcs x12,x21,x16 2041 mov x26,sp 2042 add x1,sp,#8*4 2043 ldp x6,x7,[x27,#8*0] 2044 sbcs x13,x22,x17 2045 stp x10,x11,[x0,#8*0] 2046 ldp x8,x9,[x27,#8*2] 2047 stp x12,x13,[x0,#8*2] 2048 ldp x19,x20,[x1,#8*0] 2049 ldp x21,x22,[x1,#8*2] 2050 sbcs xzr,x30,xzr // did it borrow? 2051 ldr x30,[x29,#8] // pull return address 2052 2053 sub x28,x5,#8*4 2054.Lmul4x_cond_copy: 2055 sub x28,x28,#8*4 2056 csel x10,x19,x6,lo 2057 stp xzr,xzr,[x26,#8*0] 2058 csel x11,x20,x7,lo 2059 ldp x6,x7,[x27,#8*4] 2060 ldp x19,x20,[x1,#8*4] 2061 csel x12,x21,x8,lo 2062 stp xzr,xzr,[x26,#8*2] 2063 add x26,x26,#8*4 2064 csel x13,x22,x9,lo 2065 ldp x8,x9,[x27,#8*6] 2066 ldp x21,x22,[x1,#8*6] 2067 add x1,x1,#8*4 2068 stp x10,x11,[x27,#8*0] 2069 stp x12,x13,[x27,#8*2] 2070 add x27,x27,#8*4 2071 cbnz x28,.Lmul4x_cond_copy 2072 2073 csel x10,x19,x6,lo 2074 stp xzr,xzr,[x26,#8*0] 2075 csel x11,x20,x7,lo 2076 stp xzr,xzr,[x26,#8*2] 2077 csel x12,x21,x8,lo 2078 stp xzr,xzr,[x26,#8*3] 2079 csel x13,x22,x9,lo 2080 stp xzr,xzr,[x26,#8*4] 2081 stp x10,x11,[x27,#8*0] 2082 stp x12,x13,[x27,#8*2] 2083 2084 b .Lmul4x_done 2085 2086.align 4 2087.Lmul4x4_post_condition: 2088 adc x0,x0,xzr 2089 ldr x1,[x29,#96] // pull rp 2090 // x19-3,x0 hold result, x14-7 hold modulus 2091 subs x6,x19,x14 2092 ldr x30,[x29,#8] // pull return address 2093 sbcs x7,x20,x15 2094 stp xzr,xzr,[sp,#8*0] 2095 sbcs x8,x21,x16 2096 stp xzr,xzr,[sp,#8*2] 2097 sbcs x9,x22,x17 2098 stp xzr,xzr,[sp,#8*4] 2099 sbcs xzr,x0,xzr // did it borrow? 2100 stp xzr,xzr,[sp,#8*6] 2101 2102 // x6-3 hold result-modulus 2103 csel x6,x19,x6,lo 2104 csel x7,x20,x7,lo 2105 csel x8,x21,x8,lo 2106 csel x9,x22,x9,lo 2107 stp x6,x7,[x1,#8*0] 2108 stp x8,x9,[x1,#8*2] 2109 2110.Lmul4x_done: 2111 ldp x19,x20,[x29,#16] 2112 mov sp,x29 2113 ldp x21,x22,[x29,#32] 2114 mov x0,#1 2115 ldp x23,x24,[x29,#48] 2116 ldp x25,x26,[x29,#64] 2117 ldp x27,x28,[x29,#80] 2118 ldr x29,[sp],#128 2119.inst 0xd50323bf // autiasp 2120 ret 2121.size __bn_mul4x_mont,.-__bn_mul4x_mont 2122.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 2123.align 2 2124.align 4 2125