#include "arm_asm.h"
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# ifdef	__APPLE__
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__APPLE__
	adr	r9,poly1305_blocks_neon
	adr	r11,poly1305_blocks
# ifdef	__thumb2__
	it	ne
# endif
	movne	r11,r9
	adr	r12,poly1305_emit
	adr	r10,poly1305_emit_neon
# ifdef	__thumb2__
	it	ne
# endif
	movne	r12,r10
# else
# ifdef	__thumb2__
	itete	eq
# endif
	addeq	r12,r11,#(poly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(poly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef	__thumb2__
	orr	r12,r12,#1		@ thumb-ify address
	orr	r11,r11,#1
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	RET				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}	@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
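	@ the three remaining 32-bit words are assembled byte by byte
	@ the same way below and accumulated into h1-h3 with carry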
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}	@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
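	@ branch-free final reduction: r8-r11 hold h+5; if h >= 2^130-5
	@ the addition above carried into bit 130 (bit 2 of the top
	@ limb, tested by tst), in which case h+5 mod 2^128 is selected
	@ below, otherwise the original limbs are kept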

#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if	__ARM_ARCH__>=7
# ifdef	__ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	RET				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2			@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ Result of multiplication of n-bit number by m-bit number is
	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
	@ m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
	@ one is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that *sum*
	@ of the products with these values can still be viewed as sum
	@ of 52-bit numbers as long as the amount of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at triple as many addends...
	@
	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
	@ This means that result of reduction has to be compressed upon
	@ loop wrap-around. This can be done in the process of reduction
	@ to minimize amount of instructions [as well as amount of
	@ 128-bit instructions, which benefits low-end processors], but
	@ one has to watch for H2 (which is narrower than H0) and 5*H4
	@ not being wider than 58 bits, so that result of right shift
	@ by 26 bits fits in 32 bits. This is also useful on x86,
	@ because it allows one to use paddd in place of paddq, which
	@ benefits Atom, where paddq is ridiculously slow.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000		@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30		@ h4 -> h0
	vadd.i32	d16,d16,d8		@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30		@ h0 -> h1
	vadd.i32	d18,d18,d8		@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10			@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2	@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10			@ r^4:r^3
	vshl.u32	d2,d12,#2	@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	RET				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
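	@ odd number of 16-byte blocks: fold one block into the hash
	@ up front so the remainder can be handled two at a time; r3
	@ carries the padbit pre-shifted to bit 24, its position in
	@ the top base-2^26 limb (128 = 4*26+24)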
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8	@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@ ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@ ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on reduction in previous iteration.
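	@
	@ Lane assignment: lane 1 of d0-d8 holds the lower power of r
	@ (r^2 here) and lane 0 the higher one (r^4), so each vmlal.u32
	@ below advances both interleaved streams at once.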
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
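	@
	@ Carries propagate h3->h4 and h0->h1 first; the h4 carry then
	@ wraps around to h0 multiplied by 5, since 2^130 = 5 mod
	@ 2^130-5 (the vshl by 2 plus add below computes 4*c+c = 5*c).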

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15	@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8	@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4	@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4	@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26	@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15	@ h3 -> h4
	vadd.i64	q6,q6,q4	@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4	@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15	@ h4 -> h0
	vadd.i64	q8,q8,q4	@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15	@ h0 -> h1
	vadd.i64	q9,q9,q4	@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	RET				@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26		@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24		@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2		@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
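	@ from here the flow mirrors the scalar poly1305_emit above:
	@ select h or h+5, add the nonce and store little-endian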

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef	__ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	RET				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif