#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//      ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//      ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
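////////////////////////////////////////////////////////////////
// Implementation notes, summarizing the code above (layout and
// naming inferred from the loads/stores, so treat as an informal
// sketch rather than an authoritative description):
//
// * Scalar reduction identity: 2^130 == 5 (mod 2^130-5), so the bits
//   of the top limb above bit 1 are folded back as (h2>>2)*5 -- the
//   "and #-4 / and #3 / add ...,lsr#2" sequences in poly1305_blocks
//   and poly1305_mult.
//
// * Context layout (byte offsets from x0):
//	 0	h0,h1,h2	hash accumulator, base 2^64 (the NEON
//				path keeps five base 2^26 limbs in the
//				first 20 bytes instead)
//	24	is_base2_26	non-zero while the hash is held in base 2^26
//	32	r0,r1		clamped key
//	48	r^1..r^4	base 2^26 power-of-r table, interleaved
//				with the 5*r_i multiples, filled by
//				poly1305_splat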