#include "arm_arch.h"

// ChaCha20 keystream/XOR routines for AArch64 (GNU as syntax, fed through
// the C preprocessor).  Comment character is "//".

.text



// Constant pool, reached PC-relatively with adr.
.align	5
.Lsigma:
// The 16 bytes of these two quads spell the ASCII text "expand 32-byte k"
// when laid out least-significant byte first.
.quad	0x3320646e61707865,0x6b20657479622d32	// endian-neutral
.Lone:
// Vector {1,0,0,0}: added to the counter row to step lane 0 by one block.
.long	1,0,0,0
.LOPENSSL_armcap_P:
// Distance from this location to OPENSSL_armcap_P, so the capability word
// can be loaded as [offset + address of this slot].
#ifdef __ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// NUL-terminated ASCII identification string; it begins
// "ChaCha20 for ARMv8, CRYPTOGAMS by".
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

//----------------------------------------------------------------------
// ChaCha20_ctr32
//
// XORs the input with ChaCha20 keystream and writes the result.
//
// Registers on entry, as consumed by the code below:
//   x0  output pointer
//   x1  input pointer
//   x2  length in bytes; zero returns at once without touching memory
//   x3  pointer to 32 bytes loaded as the key
//   x4  pointer to 16 bytes loaded as the counter block (read only; the
//       advanced counter is never stored back)
// NOTE(review): presumably called from C as (out, in, len, key, counter);
// confirm the prototype against the caller.
//
// Dispatch: lengths of 192 bytes or more go to ChaCha20_neon when the
// ARMV7_NEON bit is set in OPENSSL_armcap_P; everything else stays on the
// scalar path at .Lshort.
//
// Scalar state layout (sixteen 32-bit words, x18 deliberately skipped):
//   w5  w6  w7  w8     sigma
//   w9  w10 w11 w12    key words 0..3
//   w13 w14 w15 w16    key words 4..7
//   w17 w19 w20 w21    counter block words 0..3
// x22..x28 and x30 keep the packed (two words per register) input block
// for the whole call.  x30 is used as a data register and is reloaded
// from the frame before ret.
//
// Stack: 96-byte frame for x29,x30,x19..x28 plus 64 bytes of scratch
// at sp that holds the last keystream block when a partial block remains.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	x2,.Labort			// nothing to do for len == 0
	adr	x5,.LOPENSSL_armcap_P
	cmp	x2,#192
	b.lo	.Lshort				// short inputs never use NEON
#ifdef __ILP32__
	ldrsw	x6,[x5]
#else
	ldr	x6,[x5]
#endif
	ldr	w17,[x6,x5]			// w17 = OPENSSL_armcap_P
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0			// x29 = frame base, used by the epilogue loads

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// 64-byte scratch for a partial final block

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef __ARMEB__
	// Big-endian build: swap the two 32-bit halves of each 64-bit load.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

	// One pass of .Loop_outer produces one 64-byte keystream block.
.Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations, two half-rounds each
	// The flags set here are consumed by b.lo/b.hi after the loop; nothing
	// in between is a flag-setting instruction.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// First half: quarter-rounds on the four columns.  Right-rotates by
	// 16/20/24/25 are left-rotates by 16/12/8/7.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Second half: same quarter-round with the register pairing rotated,
	// i.e. applied along the diagonals.
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the original input block back in.  Even words use 32-bit adds
	// (upper half cleared); odd words add the upper half of the packed
	// register, and any excess high bits are shifted out when packing.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail				// fewer than 64 bytes were left (subs borrowed)

	// Full block: pack word pairs into 64-bit registers while the loads
	// of the 64 input bytes reuse the odd-word registers just consumed.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer			// more input remains (x2 > 0 after the subs)

	// Exactly consumed: restore callee-saved registers and return.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	ret

.align	4
.Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes still to process
	// .Less_than_64 is also entered from ChaCha20_neon with the scalar
	// block still unpacked in w5..w21 and x2 = remaining bytes (below 64).
.Less_than_64:
	// Point x1/x0/x4 at the END of input/output/scratch and index with a
	// negative x2 that counts up to zero.  x0 is biased by -1 because the
	// store in the loop happens after x2 has been incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the keystream block to the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Wipe the keystream copy from the stack before returning.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

//----------------------------------------------------------------------
// ChaCha20_neon
//
// Reached by branch from ChaCha20_ctr32 (not exported with .globl here);
// takes the same x0..x4 arguments.  Inputs of 512 bytes or more are
// handed on to .L512_or_more_neon after the shared register saves.
//
// Each pass of .Loop_outer_neon produces 256 bytes: one block in the
// scalar registers (same layout as ChaCha20_ctr32) interleaved with three
// blocks in NEON registers, one state row per vector:
//   block A  v0  v1  v2  v3      counter row taken from v27
//   block B  v4  v5  v6  v7      counter row taken from v28
//   block C  v16 v17 v18 v19     counter row taken from v29
//   v24 = sigma row, v25/v26 = key rows, v31 = counter increment,
//   v20..v23 = rotate temporaries and input staging.
// Output order per pass is: scalar block, A, B, C.
// Only v0..v7 and v16..v31 are touched on this path.
//----------------------------------------------------------------------
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon		// frame is already set up for that path

	sub	sp,sp,#64			// 64-byte scratch for a partial final block

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16		// post-increment leaves x5 at .Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]			// v31 = {1,0,0,0}
#ifdef __ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	// The scalar block uses the counter as loaded; the NEON blocks use
	// counter+1, +2 and +3.
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2			// 1 -> 4

.Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations, two half-rounds each
	// Flags from this subs survive the whole round loop (no flag-setting
	// instruction follows) and drive b.lo/b.hi further down.
	subs	x2,x2,#256
.Loop_neon:
	sub	x4,x4,#1
	// First half.  Scalar and NEON instructions are interleaved; the
	// two streams are independent of each other.
	// NEON rotates: rev32 on .8h lanes rotates each word by 16;
	// each ushr #n / sli #(32-n) pair rotates each word right by n.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate the lanes of rows 1..3 (by 1, 2 and 3 words) so the next
	// half works on diagonals with the same column-wise instructions.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	// Second half (diagonals).
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Undo the lane rotation (rows 1 and 3 now rotate the other way).
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	// Add the input block back into all four states.  Each NEON block
	// gets its own counter row (v27/v28/v29).
	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	.Ltail_neon			// fewer than 256 bytes were left

	// Full 256 bytes: scalar block first, then NEON blocks A, B, C.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64	// input for block A
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64	// input for block B

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	// v0..v3 are free once stored; reuse them for the block C input.
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	.Loop_outer_neon		// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

	// Fewer than 256 bytes remain.  Keystream blocks are consumed in the
	// order scalar, A, B, C; the first block that is only partly needed
	// is spilled to the stack scratch and XORed bytewise.
.Ltail_neon:
	add	x2,x2,#256			// undo the subs: x2 = bytes still to process
	cmp	x2,#64
	b.lo	.Less_than_64			// scalar block is the partial one: shared tail in ChaCha20_ctr32

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	.Ldone_neon			// flags still from cmp above: exactly 64 bytes were left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	// What is left (below 64 bytes) is covered by block C.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]	// partial block comes from A
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]	// partial block comes from B
	b	.Last_neon

.align	4
.Last_neon:
	// Same end-relative, negative-index byte loop as .Less_than_64.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Wipe the spilled keystream block.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
// ChaCha20_512_neon starts here; its body continues below and contains
// the .L512_or_more_neon entry point used by ChaCha20_neon.
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	stp	x29,x30,[sp,#-96]!
806 add x29,sp,#0 807 808 adr x5,.Lsigma 809 stp x19,x20,[sp,#16] 810 stp x21,x22,[sp,#32] 811 stp x23,x24,[sp,#48] 812 stp x25,x26,[sp,#64] 813 stp x27,x28,[sp,#80] 814 815.L512_or_more_neon: 816 sub sp,sp,#128+64 817 818 ldp x22,x23,[x5] // load sigma 819 ld1 {v24.4s},[x5],#16 820 ldp x24,x25,[x3] // load key 821 ldp x26,x27,[x3,#16] 822 ld1 {v25.4s,v26.4s},[x3] 823 ldp x28,x30,[x4] // load counter 824 ld1 {v27.4s},[x4] 825 ld1 {v31.4s},[x5] 826#ifdef __ARMEB__ 827 rev64 v24.4s,v24.4s 828 ror x24,x24,#32 829 ror x25,x25,#32 830 ror x26,x26,#32 831 ror x27,x27,#32 832 ror x28,x28,#32 833 ror x30,x30,#32 834#endif 835 add v27.4s,v27.4s,v31.4s // += 1 836 stp q24,q25,[sp,#0] // off-load key block, invariant part 837 add v27.4s,v27.4s,v31.4s // not typo 838 str q26,[sp,#32] 839 add v28.4s,v27.4s,v31.4s 840 add v29.4s,v28.4s,v31.4s 841 add v30.4s,v29.4s,v31.4s 842 shl v31.4s,v31.4s,#2 // 1 -> 4 843 844 stp d8,d9,[sp,#128+0] // meet ABI requirements 845 stp d10,d11,[sp,#128+16] 846 stp d12,d13,[sp,#128+32] 847 stp d14,d15,[sp,#128+48] 848 849 sub x2,x2,#512 // not typo 850 851.Loop_outer_512_neon: 852 mov v0.16b,v24.16b 853 mov v4.16b,v24.16b 854 mov v8.16b,v24.16b 855 mov v12.16b,v24.16b 856 mov v16.16b,v24.16b 857 mov v20.16b,v24.16b 858 mov v1.16b,v25.16b 859 mov w5,w22 // unpack key block 860 mov v5.16b,v25.16b 861 lsr x6,x22,#32 862 mov v9.16b,v25.16b 863 mov w7,w23 864 mov v13.16b,v25.16b 865 lsr x8,x23,#32 866 mov v17.16b,v25.16b 867 mov w9,w24 868 mov v21.16b,v25.16b 869 lsr x10,x24,#32 870 mov v3.16b,v27.16b 871 mov w11,w25 872 mov v7.16b,v28.16b 873 lsr x12,x25,#32 874 mov v11.16b,v29.16b 875 mov w13,w26 876 mov v15.16b,v30.16b 877 lsr x14,x26,#32 878 mov v2.16b,v26.16b 879 mov w15,w27 880 mov v6.16b,v26.16b 881 lsr x16,x27,#32 882 add v19.4s,v3.4s,v31.4s // +4 883 mov w17,w28 884 add v23.4s,v7.4s,v31.4s // +4 885 lsr x19,x28,#32 886 mov v10.16b,v26.16b 887 mov w20,w30 888 mov v14.16b,v26.16b 889 lsr x21,x30,#32 890 mov v18.16b,v26.16b 891 stp 
q27,q28,[sp,#48] // off-load key block, variable part 892 mov v22.16b,v26.16b 893 str q29,[sp,#80] 894 895 mov x4,#5 896 subs x2,x2,#512 897.Loop_upper_neon: 898 sub x4,x4,#1 899 add v0.4s,v0.4s,v1.4s 900 add w5,w5,w9 901 add v4.4s,v4.4s,v5.4s 902 add w6,w6,w10 903 add v8.4s,v8.4s,v9.4s 904 add w7,w7,w11 905 add v12.4s,v12.4s,v13.4s 906 add w8,w8,w12 907 add v16.4s,v16.4s,v17.4s 908 eor w17,w17,w5 909 add v20.4s,v20.4s,v21.4s 910 eor w19,w19,w6 911 eor v3.16b,v3.16b,v0.16b 912 eor w20,w20,w7 913 eor v7.16b,v7.16b,v4.16b 914 eor w21,w21,w8 915 eor v11.16b,v11.16b,v8.16b 916 ror w17,w17,#16 917 eor v15.16b,v15.16b,v12.16b 918 ror w19,w19,#16 919 eor v19.16b,v19.16b,v16.16b 920 ror w20,w20,#16 921 eor v23.16b,v23.16b,v20.16b 922 ror w21,w21,#16 923 rev32 v3.8h,v3.8h 924 add w13,w13,w17 925 rev32 v7.8h,v7.8h 926 add w14,w14,w19 927 rev32 v11.8h,v11.8h 928 add w15,w15,w20 929 rev32 v15.8h,v15.8h 930 add w16,w16,w21 931 rev32 v19.8h,v19.8h 932 eor w9,w9,w13 933 rev32 v23.8h,v23.8h 934 eor w10,w10,w14 935 add v2.4s,v2.4s,v3.4s 936 eor w11,w11,w15 937 add v6.4s,v6.4s,v7.4s 938 eor w12,w12,w16 939 add v10.4s,v10.4s,v11.4s 940 ror w9,w9,#20 941 add v14.4s,v14.4s,v15.4s 942 ror w10,w10,#20 943 add v18.4s,v18.4s,v19.4s 944 ror w11,w11,#20 945 add v22.4s,v22.4s,v23.4s 946 ror w12,w12,#20 947 eor v24.16b,v1.16b,v2.16b 948 add w5,w5,w9 949 eor v25.16b,v5.16b,v6.16b 950 add w6,w6,w10 951 eor v26.16b,v9.16b,v10.16b 952 add w7,w7,w11 953 eor v27.16b,v13.16b,v14.16b 954 add w8,w8,w12 955 eor v28.16b,v17.16b,v18.16b 956 eor w17,w17,w5 957 eor v29.16b,v21.16b,v22.16b 958 eor w19,w19,w6 959 ushr v1.4s,v24.4s,#20 960 eor w20,w20,w7 961 ushr v5.4s,v25.4s,#20 962 eor w21,w21,w8 963 ushr v9.4s,v26.4s,#20 964 ror w17,w17,#24 965 ushr v13.4s,v27.4s,#20 966 ror w19,w19,#24 967 ushr v17.4s,v28.4s,#20 968 ror w20,w20,#24 969 ushr v21.4s,v29.4s,#20 970 ror w21,w21,#24 971 sli v1.4s,v24.4s,#12 972 add w13,w13,w17 973 sli v5.4s,v25.4s,#12 974 add w14,w14,w19 975 sli v9.4s,v26.4s,#12 976 add 
w15,w15,w20 977 sli v13.4s,v27.4s,#12 978 add w16,w16,w21 979 sli v17.4s,v28.4s,#12 980 eor w9,w9,w13 981 sli v21.4s,v29.4s,#12 982 eor w10,w10,w14 983 add v0.4s,v0.4s,v1.4s 984 eor w11,w11,w15 985 add v4.4s,v4.4s,v5.4s 986 eor w12,w12,w16 987 add v8.4s,v8.4s,v9.4s 988 ror w9,w9,#25 989 add v12.4s,v12.4s,v13.4s 990 ror w10,w10,#25 991 add v16.4s,v16.4s,v17.4s 992 ror w11,w11,#25 993 add v20.4s,v20.4s,v21.4s 994 ror w12,w12,#25 995 eor v24.16b,v3.16b,v0.16b 996 add w5,w5,w10 997 eor v25.16b,v7.16b,v4.16b 998 add w6,w6,w11 999 eor v26.16b,v11.16b,v8.16b 1000 add w7,w7,w12 1001 eor v27.16b,v15.16b,v12.16b 1002 add w8,w8,w9 1003 eor v28.16b,v19.16b,v16.16b 1004 eor w21,w21,w5 1005 eor v29.16b,v23.16b,v20.16b 1006 eor w17,w17,w6 1007 ushr v3.4s,v24.4s,#24 1008 eor w19,w19,w7 1009 ushr v7.4s,v25.4s,#24 1010 eor w20,w20,w8 1011 ushr v11.4s,v26.4s,#24 1012 ror w21,w21,#16 1013 ushr v15.4s,v27.4s,#24 1014 ror w17,w17,#16 1015 ushr v19.4s,v28.4s,#24 1016 ror w19,w19,#16 1017 ushr v23.4s,v29.4s,#24 1018 ror w20,w20,#16 1019 sli v3.4s,v24.4s,#8 1020 add w15,w15,w21 1021 sli v7.4s,v25.4s,#8 1022 add w16,w16,w17 1023 sli v11.4s,v26.4s,#8 1024 add w13,w13,w19 1025 sli v15.4s,v27.4s,#8 1026 add w14,w14,w20 1027 sli v19.4s,v28.4s,#8 1028 eor w10,w10,w15 1029 sli v23.4s,v29.4s,#8 1030 eor w11,w11,w16 1031 add v2.4s,v2.4s,v3.4s 1032 eor w12,w12,w13 1033 add v6.4s,v6.4s,v7.4s 1034 eor w9,w9,w14 1035 add v10.4s,v10.4s,v11.4s 1036 ror w10,w10,#20 1037 add v14.4s,v14.4s,v15.4s 1038 ror w11,w11,#20 1039 add v18.4s,v18.4s,v19.4s 1040 ror w12,w12,#20 1041 add v22.4s,v22.4s,v23.4s 1042 ror w9,w9,#20 1043 eor v24.16b,v1.16b,v2.16b 1044 add w5,w5,w10 1045 eor v25.16b,v5.16b,v6.16b 1046 add w6,w6,w11 1047 eor v26.16b,v9.16b,v10.16b 1048 add w7,w7,w12 1049 eor v27.16b,v13.16b,v14.16b 1050 add w8,w8,w9 1051 eor v28.16b,v17.16b,v18.16b 1052 eor w21,w21,w5 1053 eor v29.16b,v21.16b,v22.16b 1054 eor w17,w17,w6 1055 ushr v1.4s,v24.4s,#25 1056 eor w19,w19,w7 1057 ushr v5.4s,v25.4s,#25 1058 eor 
w20,w20,w8 1059 ushr v9.4s,v26.4s,#25 1060 ror w21,w21,#24 1061 ushr v13.4s,v27.4s,#25 1062 ror w17,w17,#24 1063 ushr v17.4s,v28.4s,#25 1064 ror w19,w19,#24 1065 ushr v21.4s,v29.4s,#25 1066 ror w20,w20,#24 1067 sli v1.4s,v24.4s,#7 1068 add w15,w15,w21 1069 sli v5.4s,v25.4s,#7 1070 add w16,w16,w17 1071 sli v9.4s,v26.4s,#7 1072 add w13,w13,w19 1073 sli v13.4s,v27.4s,#7 1074 add w14,w14,w20 1075 sli v17.4s,v28.4s,#7 1076 eor w10,w10,w15 1077 sli v21.4s,v29.4s,#7 1078 eor w11,w11,w16 1079 ext v2.16b,v2.16b,v2.16b,#8 1080 eor w12,w12,w13 1081 ext v6.16b,v6.16b,v6.16b,#8 1082 eor w9,w9,w14 1083 ext v10.16b,v10.16b,v10.16b,#8 1084 ror w10,w10,#25 1085 ext v14.16b,v14.16b,v14.16b,#8 1086 ror w11,w11,#25 1087 ext v18.16b,v18.16b,v18.16b,#8 1088 ror w12,w12,#25 1089 ext v22.16b,v22.16b,v22.16b,#8 1090 ror w9,w9,#25 1091 ext v3.16b,v3.16b,v3.16b,#12 1092 ext v7.16b,v7.16b,v7.16b,#12 1093 ext v11.16b,v11.16b,v11.16b,#12 1094 ext v15.16b,v15.16b,v15.16b,#12 1095 ext v19.16b,v19.16b,v19.16b,#12 1096 ext v23.16b,v23.16b,v23.16b,#12 1097 ext v1.16b,v1.16b,v1.16b,#4 1098 ext v5.16b,v5.16b,v5.16b,#4 1099 ext v9.16b,v9.16b,v9.16b,#4 1100 ext v13.16b,v13.16b,v13.16b,#4 1101 ext v17.16b,v17.16b,v17.16b,#4 1102 ext v21.16b,v21.16b,v21.16b,#4 1103 add v0.4s,v0.4s,v1.4s 1104 add w5,w5,w9 1105 add v4.4s,v4.4s,v5.4s 1106 add w6,w6,w10 1107 add v8.4s,v8.4s,v9.4s 1108 add w7,w7,w11 1109 add v12.4s,v12.4s,v13.4s 1110 add w8,w8,w12 1111 add v16.4s,v16.4s,v17.4s 1112 eor w17,w17,w5 1113 add v20.4s,v20.4s,v21.4s 1114 eor w19,w19,w6 1115 eor v3.16b,v3.16b,v0.16b 1116 eor w20,w20,w7 1117 eor v7.16b,v7.16b,v4.16b 1118 eor w21,w21,w8 1119 eor v11.16b,v11.16b,v8.16b 1120 ror w17,w17,#16 1121 eor v15.16b,v15.16b,v12.16b 1122 ror w19,w19,#16 1123 eor v19.16b,v19.16b,v16.16b 1124 ror w20,w20,#16 1125 eor v23.16b,v23.16b,v20.16b 1126 ror w21,w21,#16 1127 rev32 v3.8h,v3.8h 1128 add w13,w13,w17 1129 rev32 v7.8h,v7.8h 1130 add w14,w14,w19 1131 rev32 v11.8h,v11.8h 1132 add w15,w15,w20 1133 rev32 v15.8h,v15.8h 
1134 add w16,w16,w21 1135 rev32 v19.8h,v19.8h 1136 eor w9,w9,w13 1137 rev32 v23.8h,v23.8h 1138 eor w10,w10,w14 1139 add v2.4s,v2.4s,v3.4s 1140 eor w11,w11,w15 1141 add v6.4s,v6.4s,v7.4s 1142 eor w12,w12,w16 1143 add v10.4s,v10.4s,v11.4s 1144 ror w9,w9,#20 1145 add v14.4s,v14.4s,v15.4s 1146 ror w10,w10,#20 1147 add v18.4s,v18.4s,v19.4s 1148 ror w11,w11,#20 1149 add v22.4s,v22.4s,v23.4s 1150 ror w12,w12,#20 1151 eor v24.16b,v1.16b,v2.16b 1152 add w5,w5,w9 1153 eor v25.16b,v5.16b,v6.16b 1154 add w6,w6,w10 1155 eor v26.16b,v9.16b,v10.16b 1156 add w7,w7,w11 1157 eor v27.16b,v13.16b,v14.16b 1158 add w8,w8,w12 1159 eor v28.16b,v17.16b,v18.16b 1160 eor w17,w17,w5 1161 eor v29.16b,v21.16b,v22.16b 1162 eor w19,w19,w6 1163 ushr v1.4s,v24.4s,#20 1164 eor w20,w20,w7 1165 ushr v5.4s,v25.4s,#20 1166 eor w21,w21,w8 1167 ushr v9.4s,v26.4s,#20 1168 ror w17,w17,#24 1169 ushr v13.4s,v27.4s,#20 1170 ror w19,w19,#24 1171 ushr v17.4s,v28.4s,#20 1172 ror w20,w20,#24 1173 ushr v21.4s,v29.4s,#20 1174 ror w21,w21,#24 1175 sli v1.4s,v24.4s,#12 1176 add w13,w13,w17 1177 sli v5.4s,v25.4s,#12 1178 add w14,w14,w19 1179 sli v9.4s,v26.4s,#12 1180 add w15,w15,w20 1181 sli v13.4s,v27.4s,#12 1182 add w16,w16,w21 1183 sli v17.4s,v28.4s,#12 1184 eor w9,w9,w13 1185 sli v21.4s,v29.4s,#12 1186 eor w10,w10,w14 1187 add v0.4s,v0.4s,v1.4s 1188 eor w11,w11,w15 1189 add v4.4s,v4.4s,v5.4s 1190 eor w12,w12,w16 1191 add v8.4s,v8.4s,v9.4s 1192 ror w9,w9,#25 1193 add v12.4s,v12.4s,v13.4s 1194 ror w10,w10,#25 1195 add v16.4s,v16.4s,v17.4s 1196 ror w11,w11,#25 1197 add v20.4s,v20.4s,v21.4s 1198 ror w12,w12,#25 1199 eor v24.16b,v3.16b,v0.16b 1200 add w5,w5,w10 1201 eor v25.16b,v7.16b,v4.16b 1202 add w6,w6,w11 1203 eor v26.16b,v11.16b,v8.16b 1204 add w7,w7,w12 1205 eor v27.16b,v15.16b,v12.16b 1206 add w8,w8,w9 1207 eor v28.16b,v19.16b,v16.16b 1208 eor w21,w21,w5 1209 eor v29.16b,v23.16b,v20.16b 1210 eor w17,w17,w6 1211 ushr v3.4s,v24.4s,#24 1212 eor w19,w19,w7 1213 ushr v7.4s,v25.4s,#24 1214 eor w20,w20,w8 1215 ushr 
v11.4s,v26.4s,#24 1216 ror w21,w21,#16 1217 ushr v15.4s,v27.4s,#24 1218 ror w17,w17,#16 1219 ushr v19.4s,v28.4s,#24 1220 ror w19,w19,#16 1221 ushr v23.4s,v29.4s,#24 1222 ror w20,w20,#16 1223 sli v3.4s,v24.4s,#8 1224 add w15,w15,w21 1225 sli v7.4s,v25.4s,#8 1226 add w16,w16,w17 1227 sli v11.4s,v26.4s,#8 1228 add w13,w13,w19 1229 sli v15.4s,v27.4s,#8 1230 add w14,w14,w20 1231 sli v19.4s,v28.4s,#8 1232 eor w10,w10,w15 1233 sli v23.4s,v29.4s,#8 1234 eor w11,w11,w16 1235 add v2.4s,v2.4s,v3.4s 1236 eor w12,w12,w13 1237 add v6.4s,v6.4s,v7.4s 1238 eor w9,w9,w14 1239 add v10.4s,v10.4s,v11.4s 1240 ror w10,w10,#20 1241 add v14.4s,v14.4s,v15.4s 1242 ror w11,w11,#20 1243 add v18.4s,v18.4s,v19.4s 1244 ror w12,w12,#20 1245 add v22.4s,v22.4s,v23.4s 1246 ror w9,w9,#20 1247 eor v24.16b,v1.16b,v2.16b 1248 add w5,w5,w10 1249 eor v25.16b,v5.16b,v6.16b 1250 add w6,w6,w11 1251 eor v26.16b,v9.16b,v10.16b 1252 add w7,w7,w12 1253 eor v27.16b,v13.16b,v14.16b 1254 add w8,w8,w9 1255 eor v28.16b,v17.16b,v18.16b 1256 eor w21,w21,w5 1257 eor v29.16b,v21.16b,v22.16b 1258 eor w17,w17,w6 1259 ushr v1.4s,v24.4s,#25 1260 eor w19,w19,w7 1261 ushr v5.4s,v25.4s,#25 1262 eor w20,w20,w8 1263 ushr v9.4s,v26.4s,#25 1264 ror w21,w21,#24 1265 ushr v13.4s,v27.4s,#25 1266 ror w17,w17,#24 1267 ushr v17.4s,v28.4s,#25 1268 ror w19,w19,#24 1269 ushr v21.4s,v29.4s,#25 1270 ror w20,w20,#24 1271 sli v1.4s,v24.4s,#7 1272 add w15,w15,w21 1273 sli v5.4s,v25.4s,#7 1274 add w16,w16,w17 1275 sli v9.4s,v26.4s,#7 1276 add w13,w13,w19 1277 sli v13.4s,v27.4s,#7 1278 add w14,w14,w20 1279 sli v17.4s,v28.4s,#7 1280 eor w10,w10,w15 1281 sli v21.4s,v29.4s,#7 1282 eor w11,w11,w16 1283 ext v2.16b,v2.16b,v2.16b,#8 1284 eor w12,w12,w13 1285 ext v6.16b,v6.16b,v6.16b,#8 1286 eor w9,w9,w14 1287 ext v10.16b,v10.16b,v10.16b,#8 1288 ror w10,w10,#25 1289 ext v14.16b,v14.16b,v14.16b,#8 1290 ror w11,w11,#25 1291 ext v18.16b,v18.16b,v18.16b,#8 1292 ror w12,w12,#25 1293 ext v22.16b,v22.16b,v22.16b,#8 1294 ror w9,w9,#25 1295 ext 
v3.16b,v3.16b,v3.16b,#4 1296 ext v7.16b,v7.16b,v7.16b,#4 1297 ext v11.16b,v11.16b,v11.16b,#4 1298 ext v15.16b,v15.16b,v15.16b,#4 1299 ext v19.16b,v19.16b,v19.16b,#4 1300 ext v23.16b,v23.16b,v23.16b,#4 1301 ext v1.16b,v1.16b,v1.16b,#12 1302 ext v5.16b,v5.16b,v5.16b,#12 1303 ext v9.16b,v9.16b,v9.16b,#12 1304 ext v13.16b,v13.16b,v13.16b,#12 1305 ext v17.16b,v17.16b,v17.16b,#12 1306 ext v21.16b,v21.16b,v21.16b,#12 1307 cbnz x4,.Loop_upper_neon 1308 1309 add w5,w5,w22 // accumulate key block 1310 add x6,x6,x22,lsr#32 1311 add w7,w7,w23 1312 add x8,x8,x23,lsr#32 1313 add w9,w9,w24 1314 add x10,x10,x24,lsr#32 1315 add w11,w11,w25 1316 add x12,x12,x25,lsr#32 1317 add w13,w13,w26 1318 add x14,x14,x26,lsr#32 1319 add w15,w15,w27 1320 add x16,x16,x27,lsr#32 1321 add w17,w17,w28 1322 add x19,x19,x28,lsr#32 1323 add w20,w20,w30 1324 add x21,x21,x30,lsr#32 1325 1326 add x5,x5,x6,lsl#32 // pack 1327 add x7,x7,x8,lsl#32 1328 ldp x6,x8,[x1,#0] // load input 1329 add x9,x9,x10,lsl#32 1330 add x11,x11,x12,lsl#32 1331 ldp x10,x12,[x1,#16] 1332 add x13,x13,x14,lsl#32 1333 add x15,x15,x16,lsl#32 1334 ldp x14,x16,[x1,#32] 1335 add x17,x17,x19,lsl#32 1336 add x20,x20,x21,lsl#32 1337 ldp x19,x21,[x1,#48] 1338 add x1,x1,#64 1339#ifdef __ARMEB__ 1340 rev x5,x5 1341 rev x7,x7 1342 rev x9,x9 1343 rev x11,x11 1344 rev x13,x13 1345 rev x15,x15 1346 rev x17,x17 1347 rev x20,x20 1348#endif 1349 eor x5,x5,x6 1350 eor x7,x7,x8 1351 eor x9,x9,x10 1352 eor x11,x11,x12 1353 eor x13,x13,x14 1354 eor x15,x15,x16 1355 eor x17,x17,x19 1356 eor x20,x20,x21 1357 1358 stp x5,x7,[x0,#0] // store output 1359 add x28,x28,#1 // increment counter 1360 mov w5,w22 // unpack key block 1361 lsr x6,x22,#32 1362 stp x9,x11,[x0,#16] 1363 mov w7,w23 1364 lsr x8,x23,#32 1365 stp x13,x15,[x0,#32] 1366 mov w9,w24 1367 lsr x10,x24,#32 1368 stp x17,x20,[x0,#48] 1369 add x0,x0,#64 1370 mov w11,w25 1371 lsr x12,x25,#32 1372 mov w13,w26 1373 lsr x14,x26,#32 1374 mov w15,w27 1375 lsr x16,x27,#32 1376 mov w17,w28 1377 lsr 
x19,x28,#32 1378 mov w20,w30 1379 lsr x21,x30,#32 1380 1381 mov x4,#5 1382.Loop_lower_neon: 1383 sub x4,x4,#1 1384 add v0.4s,v0.4s,v1.4s 1385 add w5,w5,w9 1386 add v4.4s,v4.4s,v5.4s 1387 add w6,w6,w10 1388 add v8.4s,v8.4s,v9.4s 1389 add w7,w7,w11 1390 add v12.4s,v12.4s,v13.4s 1391 add w8,w8,w12 1392 add v16.4s,v16.4s,v17.4s 1393 eor w17,w17,w5 1394 add v20.4s,v20.4s,v21.4s 1395 eor w19,w19,w6 1396 eor v3.16b,v3.16b,v0.16b 1397 eor w20,w20,w7 1398 eor v7.16b,v7.16b,v4.16b 1399 eor w21,w21,w8 1400 eor v11.16b,v11.16b,v8.16b 1401 ror w17,w17,#16 1402 eor v15.16b,v15.16b,v12.16b 1403 ror w19,w19,#16 1404 eor v19.16b,v19.16b,v16.16b 1405 ror w20,w20,#16 1406 eor v23.16b,v23.16b,v20.16b 1407 ror w21,w21,#16 1408 rev32 v3.8h,v3.8h 1409 add w13,w13,w17 1410 rev32 v7.8h,v7.8h 1411 add w14,w14,w19 1412 rev32 v11.8h,v11.8h 1413 add w15,w15,w20 1414 rev32 v15.8h,v15.8h 1415 add w16,w16,w21 1416 rev32 v19.8h,v19.8h 1417 eor w9,w9,w13 1418 rev32 v23.8h,v23.8h 1419 eor w10,w10,w14 1420 add v2.4s,v2.4s,v3.4s 1421 eor w11,w11,w15 1422 add v6.4s,v6.4s,v7.4s 1423 eor w12,w12,w16 1424 add v10.4s,v10.4s,v11.4s 1425 ror w9,w9,#20 1426 add v14.4s,v14.4s,v15.4s 1427 ror w10,w10,#20 1428 add v18.4s,v18.4s,v19.4s 1429 ror w11,w11,#20 1430 add v22.4s,v22.4s,v23.4s 1431 ror w12,w12,#20 1432 eor v24.16b,v1.16b,v2.16b 1433 add w5,w5,w9 1434 eor v25.16b,v5.16b,v6.16b 1435 add w6,w6,w10 1436 eor v26.16b,v9.16b,v10.16b 1437 add w7,w7,w11 1438 eor v27.16b,v13.16b,v14.16b 1439 add w8,w8,w12 1440 eor v28.16b,v17.16b,v18.16b 1441 eor w17,w17,w5 1442 eor v29.16b,v21.16b,v22.16b 1443 eor w19,w19,w6 1444 ushr v1.4s,v24.4s,#20 1445 eor w20,w20,w7 1446 ushr v5.4s,v25.4s,#20 1447 eor w21,w21,w8 1448 ushr v9.4s,v26.4s,#20 1449 ror w17,w17,#24 1450 ushr v13.4s,v27.4s,#20 1451 ror w19,w19,#24 1452 ushr v17.4s,v28.4s,#20 1453 ror w20,w20,#24 1454 ushr v21.4s,v29.4s,#20 1455 ror w21,w21,#24 1456 sli v1.4s,v24.4s,#12 1457 add w13,w13,w17 1458 sli v5.4s,v25.4s,#12 1459 add w14,w14,w19 1460 sli v9.4s,v26.4s,#12 
1461 add w15,w15,w20 1462 sli v13.4s,v27.4s,#12 1463 add w16,w16,w21 1464 sli v17.4s,v28.4s,#12 1465 eor w9,w9,w13 1466 sli v21.4s,v29.4s,#12 1467 eor w10,w10,w14 1468 add v0.4s,v0.4s,v1.4s 1469 eor w11,w11,w15 1470 add v4.4s,v4.4s,v5.4s 1471 eor w12,w12,w16 1472 add v8.4s,v8.4s,v9.4s 1473 ror w9,w9,#25 1474 add v12.4s,v12.4s,v13.4s 1475 ror w10,w10,#25 1476 add v16.4s,v16.4s,v17.4s 1477 ror w11,w11,#25 1478 add v20.4s,v20.4s,v21.4s 1479 ror w12,w12,#25 1480 eor v24.16b,v3.16b,v0.16b 1481 add w5,w5,w10 1482 eor v25.16b,v7.16b,v4.16b 1483 add w6,w6,w11 1484 eor v26.16b,v11.16b,v8.16b 1485 add w7,w7,w12 1486 eor v27.16b,v15.16b,v12.16b 1487 add w8,w8,w9 1488 eor v28.16b,v19.16b,v16.16b 1489 eor w21,w21,w5 1490 eor v29.16b,v23.16b,v20.16b 1491 eor w17,w17,w6 1492 ushr v3.4s,v24.4s,#24 1493 eor w19,w19,w7 1494 ushr v7.4s,v25.4s,#24 1495 eor w20,w20,w8 1496 ushr v11.4s,v26.4s,#24 1497 ror w21,w21,#16 1498 ushr v15.4s,v27.4s,#24 1499 ror w17,w17,#16 1500 ushr v19.4s,v28.4s,#24 1501 ror w19,w19,#16 1502 ushr v23.4s,v29.4s,#24 1503 ror w20,w20,#16 1504 sli v3.4s,v24.4s,#8 1505 add w15,w15,w21 1506 sli v7.4s,v25.4s,#8 1507 add w16,w16,w17 1508 sli v11.4s,v26.4s,#8 1509 add w13,w13,w19 1510 sli v15.4s,v27.4s,#8 1511 add w14,w14,w20 1512 sli v19.4s,v28.4s,#8 1513 eor w10,w10,w15 1514 sli v23.4s,v29.4s,#8 1515 eor w11,w11,w16 1516 add v2.4s,v2.4s,v3.4s 1517 eor w12,w12,w13 1518 add v6.4s,v6.4s,v7.4s 1519 eor w9,w9,w14 1520 add v10.4s,v10.4s,v11.4s 1521 ror w10,w10,#20 1522 add v14.4s,v14.4s,v15.4s 1523 ror w11,w11,#20 1524 add v18.4s,v18.4s,v19.4s 1525 ror w12,w12,#20 1526 add v22.4s,v22.4s,v23.4s 1527 ror w9,w9,#20 1528 eor v24.16b,v1.16b,v2.16b 1529 add w5,w5,w10 1530 eor v25.16b,v5.16b,v6.16b 1531 add w6,w6,w11 1532 eor v26.16b,v9.16b,v10.16b 1533 add w7,w7,w12 1534 eor v27.16b,v13.16b,v14.16b 1535 add w8,w8,w9 1536 eor v28.16b,v17.16b,v18.16b 1537 eor w21,w21,w5 1538 eor v29.16b,v21.16b,v22.16b 1539 eor w17,w17,w6 1540 ushr v1.4s,v24.4s,#25 1541 eor w19,w19,w7 1542 ushr 
v5.4s,v25.4s,#25 1543 eor w20,w20,w8 1544 ushr v9.4s,v26.4s,#25 1545 ror w21,w21,#24 1546 ushr v13.4s,v27.4s,#25 1547 ror w17,w17,#24 1548 ushr v17.4s,v28.4s,#25 1549 ror w19,w19,#24 1550 ushr v21.4s,v29.4s,#25 1551 ror w20,w20,#24 1552 sli v1.4s,v24.4s,#7 1553 add w15,w15,w21 1554 sli v5.4s,v25.4s,#7 1555 add w16,w16,w17 1556 sli v9.4s,v26.4s,#7 1557 add w13,w13,w19 1558 sli v13.4s,v27.4s,#7 1559 add w14,w14,w20 1560 sli v17.4s,v28.4s,#7 1561 eor w10,w10,w15 1562 sli v21.4s,v29.4s,#7 1563 eor w11,w11,w16 1564 ext v2.16b,v2.16b,v2.16b,#8 1565 eor w12,w12,w13 1566 ext v6.16b,v6.16b,v6.16b,#8 1567 eor w9,w9,w14 1568 ext v10.16b,v10.16b,v10.16b,#8 1569 ror w10,w10,#25 1570 ext v14.16b,v14.16b,v14.16b,#8 1571 ror w11,w11,#25 1572 ext v18.16b,v18.16b,v18.16b,#8 1573 ror w12,w12,#25 1574 ext v22.16b,v22.16b,v22.16b,#8 1575 ror w9,w9,#25 1576 ext v3.16b,v3.16b,v3.16b,#12 1577 ext v7.16b,v7.16b,v7.16b,#12 1578 ext v11.16b,v11.16b,v11.16b,#12 1579 ext v15.16b,v15.16b,v15.16b,#12 1580 ext v19.16b,v19.16b,v19.16b,#12 1581 ext v23.16b,v23.16b,v23.16b,#12 1582 ext v1.16b,v1.16b,v1.16b,#4 1583 ext v5.16b,v5.16b,v5.16b,#4 1584 ext v9.16b,v9.16b,v9.16b,#4 1585 ext v13.16b,v13.16b,v13.16b,#4 1586 ext v17.16b,v17.16b,v17.16b,#4 1587 ext v21.16b,v21.16b,v21.16b,#4 1588 add v0.4s,v0.4s,v1.4s 1589 add w5,w5,w9 1590 add v4.4s,v4.4s,v5.4s 1591 add w6,w6,w10 1592 add v8.4s,v8.4s,v9.4s 1593 add w7,w7,w11 1594 add v12.4s,v12.4s,v13.4s 1595 add w8,w8,w12 1596 add v16.4s,v16.4s,v17.4s 1597 eor w17,w17,w5 1598 add v20.4s,v20.4s,v21.4s 1599 eor w19,w19,w6 1600 eor v3.16b,v3.16b,v0.16b 1601 eor w20,w20,w7 1602 eor v7.16b,v7.16b,v4.16b 1603 eor w21,w21,w8 1604 eor v11.16b,v11.16b,v8.16b 1605 ror w17,w17,#16 1606 eor v15.16b,v15.16b,v12.16b 1607 ror w19,w19,#16 1608 eor v19.16b,v19.16b,v16.16b 1609 ror w20,w20,#16 1610 eor v23.16b,v23.16b,v20.16b 1611 ror w21,w21,#16 1612 rev32 v3.8h,v3.8h 1613 add w13,w13,w17 1614 rev32 v7.8h,v7.8h 1615 add w14,w14,w19 1616 rev32 v11.8h,v11.8h 1617 add 
w15,w15,w20 1618 rev32 v15.8h,v15.8h 1619 add w16,w16,w21 1620 rev32 v19.8h,v19.8h 1621 eor w9,w9,w13 1622 rev32 v23.8h,v23.8h 1623 eor w10,w10,w14 1624 add v2.4s,v2.4s,v3.4s 1625 eor w11,w11,w15 1626 add v6.4s,v6.4s,v7.4s 1627 eor w12,w12,w16 1628 add v10.4s,v10.4s,v11.4s 1629 ror w9,w9,#20 1630 add v14.4s,v14.4s,v15.4s 1631 ror w10,w10,#20 1632 add v18.4s,v18.4s,v19.4s 1633 ror w11,w11,#20 1634 add v22.4s,v22.4s,v23.4s 1635 ror w12,w12,#20 1636 eor v24.16b,v1.16b,v2.16b 1637 add w5,w5,w9 1638 eor v25.16b,v5.16b,v6.16b 1639 add w6,w6,w10 1640 eor v26.16b,v9.16b,v10.16b 1641 add w7,w7,w11 1642 eor v27.16b,v13.16b,v14.16b 1643 add w8,w8,w12 1644 eor v28.16b,v17.16b,v18.16b 1645 eor w17,w17,w5 1646 eor v29.16b,v21.16b,v22.16b 1647 eor w19,w19,w6 1648 ushr v1.4s,v24.4s,#20 1649 eor w20,w20,w7 1650 ushr v5.4s,v25.4s,#20 1651 eor w21,w21,w8 1652 ushr v9.4s,v26.4s,#20 1653 ror w17,w17,#24 1654 ushr v13.4s,v27.4s,#20 1655 ror w19,w19,#24 1656 ushr v17.4s,v28.4s,#20 1657 ror w20,w20,#24 1658 ushr v21.4s,v29.4s,#20 1659 ror w21,w21,#24 1660 sli v1.4s,v24.4s,#12 1661 add w13,w13,w17 1662 sli v5.4s,v25.4s,#12 1663 add w14,w14,w19 1664 sli v9.4s,v26.4s,#12 1665 add w15,w15,w20 1666 sli v13.4s,v27.4s,#12 1667 add w16,w16,w21 1668 sli v17.4s,v28.4s,#12 1669 eor w9,w9,w13 1670 sli v21.4s,v29.4s,#12 1671 eor w10,w10,w14 1672 add v0.4s,v0.4s,v1.4s 1673 eor w11,w11,w15 1674 add v4.4s,v4.4s,v5.4s 1675 eor w12,w12,w16 1676 add v8.4s,v8.4s,v9.4s 1677 ror w9,w9,#25 1678 add v12.4s,v12.4s,v13.4s 1679 ror w10,w10,#25 1680 add v16.4s,v16.4s,v17.4s 1681 ror w11,w11,#25 1682 add v20.4s,v20.4s,v21.4s 1683 ror w12,w12,#25 1684 eor v24.16b,v3.16b,v0.16b 1685 add w5,w5,w10 1686 eor v25.16b,v7.16b,v4.16b 1687 add w6,w6,w11 1688 eor v26.16b,v11.16b,v8.16b 1689 add w7,w7,w12 1690 eor v27.16b,v15.16b,v12.16b 1691 add w8,w8,w9 1692 eor v28.16b,v19.16b,v16.16b 1693 eor w21,w21,w5 1694 eor v29.16b,v23.16b,v20.16b 1695 eor w17,w17,w6 1696 ushr v3.4s,v24.4s,#24 1697 eor w19,w19,w7 1698 ushr 
v7.4s,v25.4s,#24 1699 eor w20,w20,w8 1700 ushr v11.4s,v26.4s,#24 1701 ror w21,w21,#16 1702 ushr v15.4s,v27.4s,#24 1703 ror w17,w17,#16 1704 ushr v19.4s,v28.4s,#24 1705 ror w19,w19,#16 1706 ushr v23.4s,v29.4s,#24 1707 ror w20,w20,#16 1708 sli v3.4s,v24.4s,#8 1709 add w15,w15,w21 1710 sli v7.4s,v25.4s,#8 1711 add w16,w16,w17 1712 sli v11.4s,v26.4s,#8 1713 add w13,w13,w19 1714 sli v15.4s,v27.4s,#8 1715 add w14,w14,w20 1716 sli v19.4s,v28.4s,#8 1717 eor w10,w10,w15 1718 sli v23.4s,v29.4s,#8 1719 eor w11,w11,w16 1720 add v2.4s,v2.4s,v3.4s 1721 eor w12,w12,w13 1722 add v6.4s,v6.4s,v7.4s 1723 eor w9,w9,w14 1724 add v10.4s,v10.4s,v11.4s 1725 ror w10,w10,#20 1726 add v14.4s,v14.4s,v15.4s 1727 ror w11,w11,#20 1728 add v18.4s,v18.4s,v19.4s 1729 ror w12,w12,#20 1730 add v22.4s,v22.4s,v23.4s 1731 ror w9,w9,#20 1732 eor v24.16b,v1.16b,v2.16b 1733 add w5,w5,w10 1734 eor v25.16b,v5.16b,v6.16b 1735 add w6,w6,w11 1736 eor v26.16b,v9.16b,v10.16b 1737 add w7,w7,w12 1738 eor v27.16b,v13.16b,v14.16b 1739 add w8,w8,w9 1740 eor v28.16b,v17.16b,v18.16b 1741 eor w21,w21,w5 1742 eor v29.16b,v21.16b,v22.16b 1743 eor w17,w17,w6 1744 ushr v1.4s,v24.4s,#25 1745 eor w19,w19,w7 1746 ushr v5.4s,v25.4s,#25 1747 eor w20,w20,w8 1748 ushr v9.4s,v26.4s,#25 1749 ror w21,w21,#24 1750 ushr v13.4s,v27.4s,#25 1751 ror w17,w17,#24 1752 ushr v17.4s,v28.4s,#25 1753 ror w19,w19,#24 1754 ushr v21.4s,v29.4s,#25 1755 ror w20,w20,#24 1756 sli v1.4s,v24.4s,#7 1757 add w15,w15,w21 1758 sli v5.4s,v25.4s,#7 1759 add w16,w16,w17 1760 sli v9.4s,v26.4s,#7 1761 add w13,w13,w19 1762 sli v13.4s,v27.4s,#7 1763 add w14,w14,w20 1764 sli v17.4s,v28.4s,#7 1765 eor w10,w10,w15 1766 sli v21.4s,v29.4s,#7 1767 eor w11,w11,w16 1768 ext v2.16b,v2.16b,v2.16b,#8 1769 eor w12,w12,w13 1770 ext v6.16b,v6.16b,v6.16b,#8 1771 eor w9,w9,w14 1772 ext v10.16b,v10.16b,v10.16b,#8 1773 ror w10,w10,#25 1774 ext v14.16b,v14.16b,v14.16b,#8 1775 ror w11,w11,#25 1776 ext v18.16b,v18.16b,v18.16b,#8 1777 ror w12,w12,#25 1778 ext v22.16b,v22.16b,v22.16b,#8 
1779 ror w9,w9,#25 1780 ext v3.16b,v3.16b,v3.16b,#4 1781 ext v7.16b,v7.16b,v7.16b,#4 1782 ext v11.16b,v11.16b,v11.16b,#4 1783 ext v15.16b,v15.16b,v15.16b,#4 1784 ext v19.16b,v19.16b,v19.16b,#4 1785 ext v23.16b,v23.16b,v23.16b,#4 1786 ext v1.16b,v1.16b,v1.16b,#12 1787 ext v5.16b,v5.16b,v5.16b,#12 1788 ext v9.16b,v9.16b,v9.16b,#12 1789 ext v13.16b,v13.16b,v13.16b,#12 1790 ext v17.16b,v17.16b,v17.16b,#12 1791 ext v21.16b,v21.16b,v21.16b,#12 1792 cbnz x4,.Loop_lower_neon 1793 1794 add w5,w5,w22 // accumulate key block 1795 ldp q24,q25,[sp,#0] 1796 add x6,x6,x22,lsr#32 1797 ldp q26,q27,[sp,#32] 1798 add w7,w7,w23 1799 ldp q28,q29,[sp,#64] 1800 add x8,x8,x23,lsr#32 1801 add v0.4s,v0.4s,v24.4s 1802 add w9,w9,w24 1803 add v4.4s,v4.4s,v24.4s 1804 add x10,x10,x24,lsr#32 1805 add v8.4s,v8.4s,v24.4s 1806 add w11,w11,w25 1807 add v12.4s,v12.4s,v24.4s 1808 add x12,x12,x25,lsr#32 1809 add v16.4s,v16.4s,v24.4s 1810 add w13,w13,w26 1811 add v20.4s,v20.4s,v24.4s 1812 add x14,x14,x26,lsr#32 1813 add v2.4s,v2.4s,v26.4s 1814 add w15,w15,w27 1815 add v6.4s,v6.4s,v26.4s 1816 add x16,x16,x27,lsr#32 1817 add v10.4s,v10.4s,v26.4s 1818 add w17,w17,w28 1819 add v14.4s,v14.4s,v26.4s 1820 add x19,x19,x28,lsr#32 1821 add v18.4s,v18.4s,v26.4s 1822 add w20,w20,w30 1823 add v22.4s,v22.4s,v26.4s 1824 add x21,x21,x30,lsr#32 1825 add v19.4s,v19.4s,v31.4s // +4 1826 add x5,x5,x6,lsl#32 // pack 1827 add v23.4s,v23.4s,v31.4s // +4 1828 add x7,x7,x8,lsl#32 1829 add v3.4s,v3.4s,v27.4s 1830 ldp x6,x8,[x1,#0] // load input 1831 add v7.4s,v7.4s,v28.4s 1832 add x9,x9,x10,lsl#32 1833 add v11.4s,v11.4s,v29.4s 1834 add x11,x11,x12,lsl#32 1835 add v15.4s,v15.4s,v30.4s 1836 ldp x10,x12,[x1,#16] 1837 add v19.4s,v19.4s,v27.4s 1838 add x13,x13,x14,lsl#32 1839 add v23.4s,v23.4s,v28.4s 1840 add x15,x15,x16,lsl#32 1841 add v1.4s,v1.4s,v25.4s 1842 ldp x14,x16,[x1,#32] 1843 add v5.4s,v5.4s,v25.4s 1844 add x17,x17,x19,lsl#32 1845 add v9.4s,v9.4s,v25.4s 1846 add x20,x20,x21,lsl#32 1847 add v13.4s,v13.4s,v25.4s 1848 ldp 
x19,x21,[x1,#48] 1849 add v17.4s,v17.4s,v25.4s 1850 add x1,x1,#64 1851 add v21.4s,v21.4s,v25.4s 1852 1853#ifdef __ARMEB__ 1854 rev x5,x5 1855 rev x7,x7 1856 rev x9,x9 1857 rev x11,x11 1858 rev x13,x13 1859 rev x15,x15 1860 rev x17,x17 1861 rev x20,x20 1862#endif 1863 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1864 eor x5,x5,x6 1865 eor x7,x7,x8 1866 eor x9,x9,x10 1867 eor x11,x11,x12 1868 eor x13,x13,x14 1869 eor v0.16b,v0.16b,v24.16b 1870 eor x15,x15,x16 1871 eor v1.16b,v1.16b,v25.16b 1872 eor x17,x17,x19 1873 eor v2.16b,v2.16b,v26.16b 1874 eor x20,x20,x21 1875 eor v3.16b,v3.16b,v27.16b 1876 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1877 1878 stp x5,x7,[x0,#0] // store output 1879 add x28,x28,#7 // increment counter 1880 stp x9,x11,[x0,#16] 1881 stp x13,x15,[x0,#32] 1882 stp x17,x20,[x0,#48] 1883 add x0,x0,#64 1884 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1885 1886 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1887 eor v4.16b,v4.16b,v24.16b 1888 eor v5.16b,v5.16b,v25.16b 1889 eor v6.16b,v6.16b,v26.16b 1890 eor v7.16b,v7.16b,v27.16b 1891 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1892 1893 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1894 eor v8.16b,v8.16b,v0.16b 1895 ldp q24,q25,[sp,#0] 1896 eor v9.16b,v9.16b,v1.16b 1897 ldp q26,q27,[sp,#32] 1898 eor v10.16b,v10.16b,v2.16b 1899 eor v11.16b,v11.16b,v3.16b 1900 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1901 1902 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1903 eor v12.16b,v12.16b,v4.16b 1904 eor v13.16b,v13.16b,v5.16b 1905 eor v14.16b,v14.16b,v6.16b 1906 eor v15.16b,v15.16b,v7.16b 1907 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1908 1909 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1910 eor v16.16b,v16.16b,v8.16b 1911 eor v17.16b,v17.16b,v9.16b 1912 eor v18.16b,v18.16b,v10.16b 1913 eor v19.16b,v19.16b,v11.16b 1914 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1915 1916 shl v0.4s,v31.4s,#1 // 4 -> 8 1917 eor v20.16b,v20.16b,v12.16b 1918 eor v21.16b,v21.16b,v13.16b 1919 eor v22.16b,v22.16b,v14.16b 
1920 eor v23.16b,v23.16b,v15.16b 1921 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1922 1923 add v27.4s,v27.4s,v0.4s // += 8 1924 add v28.4s,v28.4s,v0.4s 1925 add v29.4s,v29.4s,v0.4s 1926 add v30.4s,v30.4s,v0.4s 1927 1928 b.hs .Loop_outer_512_neon 1929 1930 adds x2,x2,#512 1931 ushr v0.4s,v31.4s,#2 // 4 -> 1 1932 1933 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1934 ldp d10,d11,[sp,#128+16] 1935 ldp d12,d13,[sp,#128+32] 1936 ldp d14,d15,[sp,#128+48] 1937 1938 stp q24,q31,[sp,#0] // wipe off-load area 1939 stp q24,q31,[sp,#32] 1940 stp q24,q31,[sp,#64] 1941 1942 b.eq .Ldone_512_neon 1943 1944 cmp x2,#192 1945 sub v27.4s,v27.4s,v0.4s // -= 1 1946 sub v28.4s,v28.4s,v0.4s 1947 sub v29.4s,v29.4s,v0.4s 1948 add sp,sp,#128 1949 b.hs .Loop_outer_neon 1950 1951 eor v25.16b,v25.16b,v25.16b 1952 eor v26.16b,v26.16b,v26.16b 1953 eor v27.16b,v27.16b,v27.16b 1954 eor v28.16b,v28.16b,v28.16b 1955 eor v29.16b,v29.16b,v29.16b 1956 eor v30.16b,v30.16b,v30.16b 1957 b .Loop_outer 1958 1959.Ldone_512_neon: 1960 ldp x19,x20,[x29,#16] 1961 add sp,sp,#128+64 1962 ldp x21,x22,[x29,#32] 1963 ldp x23,x24,[x29,#48] 1964 ldp x25,x26,[x29,#64] 1965 ldp x27,x28,[x29,#80] 1966 ldp x29,x30,[sp],#96 1967 ret 1968.size ChaCha20_512_neon,.-ChaCha20_512_neon 1969