#include "arm_asm.h"
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
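	// 2^130 is congruent to 5 mod 2^130-5, so everything in h above
	// bit 129, i.e. x14>>2, folds back into the low limb as
	// 5*(x14>>2) = (x14&-4) + (x14>>2); only bits 1:0 of x14 survive
	// as the new h2.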
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
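	// copy r into x4:x5:x6 (x6=0) and point x0 at the r^1 column of
	// the power table; poly1305_splat stores each power as five 26-bit
	// limbs plus the 5*r_i values, each poly1305_mult below multiplies
	// the running power by r, and the sub x0,x0,#4 shifts the
	// destination one 32-bit lane down, so every 16-byte table row
	// ends up holding the same limb of r^4,r^3,r^2,r^1 in lanes 0..3.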
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
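	//
	// Each iteration therefore consumes 64 bytes (four blocks); the
	// two partial hashes are carried in the two 32-bit lanes of the
	// v24-v28 limbs and are only folded together by the horizontal
	// add in .Lshort_tail.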
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2