#include "arm_asm.h"
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult
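
// The NEON code below keeps the hash and the powers of r in base 2^26,
// five 26-bit limbs per value, so that 26x26-bit lane products can be
// accumulated in 64-bit vector lanes with room to spare.  poly1305_splat
// converts one base 2^64 value (in x4:x5:x6) to that representation and
// also stores 5*r_i, used by the terms that wrap around 2^130
// (see the d0-d4 formulas in .Loop_neon).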

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
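	// Powers r^1..r^4 are precomputed because the NEON loop below
	// processes four blocks per iteration in two interleaved streams
	// (see the dataflow comment at .Loop_neon).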
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	// ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	// ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
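	//
	// (The 5*r_i terms above fold the reduction modulo 2^130-5 into
	// the multiplication: any partial product whose weight reaches
	// 2^130 wraps around multiplied by 5.)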

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
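	//
	// Carries are propagated only once per iteration:
	// h3->h4, h0->h1, h1->h2, h4->h0 (times 5, as 2^130 = 5 mod p),
	// h2->h3, then h0->h1 and h3->h4 once more, leaving the value
	// only partially reduced; the canonical reduction is deferred
	// to poly1305_emit.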

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

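	// v19-v23 each hold two 64-bit partial sums at this point, one per
	// interleaved stream; .Lshort_tail folds the two lanes together
	// with addp and runs one more carry pass before storing the
	// five-limb base 2^26 result.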
.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	.inst	0xd50323bf		// autiasp
	ldr	x29,[sp],#80
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2