1#include "arm_arch.h" 2 3.text 4#if defined(__thumb2__) 5.syntax unified 6.thumb 7#else 8.code 32 9#endif 10 11.globl poly1305_emit 12.globl poly1305_blocks 13.globl poly1305_init 14.type poly1305_init,%function 15.align 5 16poly1305_init: 17.Lpoly1305_init: 18 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} 19 20 eor r3,r3,r3 21 cmp r1,#0 22 str r3,[r0,#0] @ zero hash value 23 str r3,[r0,#4] 24 str r3,[r0,#8] 25 str r3,[r0,#12] 26 str r3,[r0,#16] 27 str r3,[r0,#36] @ is_base2_26 28 add r0,r0,#20 29 30#ifdef __thumb2__ 31 it eq 32#endif 33 moveq r0,#0 34 beq .Lno_key 35 36#if __ARM_MAX_ARCH__>=7 37 adr r11,.Lpoly1305_init 38 ldr r12,.LOPENSSL_armcap 39#endif 40 ldrb r4,[r1,#0] 41 mov r10,#0x0fffffff 42 ldrb r5,[r1,#1] 43 and r3,r10,#-4 @ 0x0ffffffc 44 ldrb r6,[r1,#2] 45 ldrb r7,[r1,#3] 46 orr r4,r4,r5,lsl#8 47 ldrb r5,[r1,#4] 48 orr r4,r4,r6,lsl#16 49 ldrb r6,[r1,#5] 50 orr r4,r4,r7,lsl#24 51 ldrb r7,[r1,#6] 52 and r4,r4,r10 53 54#if __ARM_MAX_ARCH__>=7 55 ldr r12,[r11,r12] @ OPENSSL_armcap_P 56# ifdef __APPLE__ 57 ldr r12,[r12] 58# endif 59#endif 60 ldrb r8,[r1,#7] 61 orr r5,r5,r6,lsl#8 62 ldrb r6,[r1,#8] 63 orr r5,r5,r7,lsl#16 64 ldrb r7,[r1,#9] 65 orr r5,r5,r8,lsl#24 66 ldrb r8,[r1,#10] 67 and r5,r5,r3 68 69#if __ARM_MAX_ARCH__>=7 70 tst r12,#ARMV7_NEON @ check for NEON 71# ifdef __APPLE__ 72 adr r9,poly1305_blocks_neon 73 adr r11,poly1305_blocks 74# ifdef __thumb2__ 75 it ne 76# endif 77 movne r11,r9 78 adr r12,poly1305_emit 79 adr r10,poly1305_emit_neon 80# ifdef __thumb2__ 81 it ne 82# endif 83 movne r12,r10 84# else 85# ifdef __thumb2__ 86 itete eq 87# endif 88 addeq r12,r11,#(poly1305_emit-.Lpoly1305_init) 89 addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) 90 addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init) 91 addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) 92# endif 93# ifdef __thumb2__ 94 orr r12,r12,#1 @ thumb-ify address 95 orr r11,r11,#1 96# endif 97#endif 98 ldrb r9,[r1,#11] 99 orr r6,r6,r7,lsl#8 100 ldrb r7,[r1,#12] 101 orr r6,r6,r8,lsl#16 102 ldrb r8,[r1,#13] 103 orr r6,r6,r9,lsl#24 104 ldrb r9,[r1,#14] 105 and r6,r6,r3 106 107 ldrb r10,[r1,#15] 108 orr r7,r7,r8,lsl#8 109 str r4,[r0,#0] 110 orr r7,r7,r9,lsl#16 111 str r5,[r0,#4] 112 orr r7,r7,r10,lsl#24 113 str r6,[r0,#8] 114 and r7,r7,r3 115 str r7,[r0,#12] 116#if __ARM_MAX_ARCH__>=7 117 stmia r2,{r11,r12} @ fill functions table 118 mov r0,#1 119#else 120 mov r0,#0 121#endif 122.Lno_key: 123 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} 124#if __ARM_ARCH__>=5 125 bx lr @ bx lr 126#else 127 tst lr,#1 128 moveq pc,lr @ be binary compatible with V4, yet 129.word 0xe12fff1e @ interoperable with Thumb ISA:-) 130#endif 131.size poly1305_init,.-poly1305_init 132.type poly1305_blocks,%function 133.align 5 134poly1305_blocks: 135 stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} 136 137 ands r2,r2,#-16 138 beq .Lno_data 139 140 cmp r3,#0 141 add r2,r2,r1 @ end pointer 142 sub sp,sp,#32 143 144 ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12} @ load context 145 146 str r0,[sp,#12] @ offload stuff 147 mov lr,r1 148 str r2,[sp,#16] 149 str r10,[sp,#20] 150 str r11,[sp,#24] 151 str r12,[sp,#28] 152 b .Loop 153 154.Loop: 155#if __ARM_ARCH__<7 156 ldrb r0,[lr],#16 @ load input 157# ifdef __thumb2__ 158 it hi 159# endif 160 addhi r8,r8,#1 @ 1<<128 161 ldrb r1,[lr,#-15] 162 ldrb r2,[lr,#-14] 163 ldrb r3,[lr,#-13] 164 orr r1,r0,r1,lsl#8 165 ldrb r0,[lr,#-12] 166 orr r2,r1,r2,lsl#16 167 ldrb r1,[lr,#-11] 168 orr r3,r2,r3,lsl#24 169 ldrb r2,[lr,#-10] 170 adds r4,r4,r3 @ accumulate input 171 172 ldrb r3,[lr,#-9] 173 orr r1,r0,r1,lsl#8 174 ldrb r0,[lr,#-8] 175 orr 
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}	@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
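	@ If bit 2 of r7 is set, h+5 carried into bit 130, i.e. h >= 2^130-5,
	@ so the reduced value (h+5) mod 2^130 held in r8-r11 is selected
	@ below; otherwise the original h is already the residue. The 128-bit
	@ nonce is then added modulo 2^128 and the tag is stored little-endian.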

#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if	__ARM_ARCH__>=7
# ifdef	__ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr			@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2			@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an n+1-bit number,
	@ an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, the sum of three
	@ or four is n+2. The sum of 2^m (n-m)-bit numbers and one n-bit
	@ number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the amount of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at triple as many addends...
	@
	@ In the key setup procedure pre-reduced H0 is limited by 5*4+1 and
	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The vmlal.u32
	@ instruction accepts 2x32-bit input and accumulates a 2x64-bit result.
	@ This means that the result of reduction has to be compressed upon
	@ loop wrap-around. This can be done in the process of reduction
	@ to minimize the amount of instructions [as well as the amount of
	@ 128-bit instructions, which benefits low-end processors], but
	@ one has to watch for H2 (which is narrower than H0) and 5*H4
	@ not being wider than 58 bits, so that the result of the right
	@ shift by 26 bits fits in 32 bits. This is also useful on x86,
	@ because it allows the use of paddd in place of paddq, which
	@ benefits Atom, where paddq is ridiculously slow.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30	@ h4 -> h0
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10		@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10		@ r^4:r^3
	vshl.u32	d2,d12,#2		@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr				@ bx lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	poly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   \___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   \___________________/ \____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on reduction in previous iteration.
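	@
	@ Equivalently, for four blocks m[0..3] Horner's rule
	@   h = ((((h+m[0])*r + m[1])*r + m[2])*r + m[3])*r
	@ is split into an even and an odd lane,
	@   h = ((h+m[0])*r^2 + m[2])*r^2 + (m[1]*r^2 + m[3])*r,
	@ so both lanes are multiplied by r^2 on every iteration and only
	@ the last multipliers differ: r^2 for the even lane, r for the odd.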
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
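	@
	@ The carry chain below is h3->h4, h0->h1, h4->h0, h1->h2, h2->h3,
	@ h0->h1, h3->h4. The carry out of h4 is multiplied by 5 before it
	@ is folded into h0, because 2^130 = 5 (mod 2^130-5); the *5 is
	@ computed as c + (c<<2).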

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26		@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr				@ bx lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2	@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
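	@ r3-r7 now hold h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104
	@ recombined into four 32-bit words plus a top word, with anything
	@ at or above 2^130 already folded back in times 5. As in
	@ poly1305_emit above, bit 2 of r7 set means h+5 reached 2^130,
	@ so the reduced copy in r8-r11 is selected before adding the nonce.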

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef	__ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr				@ bx lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif