1#include <machine/asm.h> 2.text 3 4 5 6.align 64 7.Lzero: 8.long 0,0,0,0 9.Lone: 10.long 1,0,0,0 11.Linc: 12.long 0,1,2,3 13.Lfour: 14.long 4,4,4,4 15.Lincy: 16.long 0,2,4,6,1,3,5,7 17.Leight: 18.long 8,8,8,8,8,8,8,8 19.Lrot16: 20.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd 21.Lrot24: 22.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe 23.Ltwoy: 24.long 2,0,0,0, 2,0,0,0 25.align 64 26.Lzeroz: 27.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 28.Lfourz: 29.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 30.Lincz: 31.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 32.Lsixteen: 33.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 34.Lsigma: 35.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 36.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 37.globl ChaCha20_ctr32 38.type ChaCha20_ctr32,@function 39.align 64 40ChaCha20_ctr32: 41.cfi_startproc 42 cmpq $0,%rdx 43 je .Lno_data 44 movq OPENSSL_ia32cap_P+4(%rip),%r10 45 btq $48,%r10 46 jc .LChaCha20_avx512 47 testq %r10,%r10 48 js .LChaCha20_avx512vl 49 testl $512,%r10d 50 jnz .LChaCha20_ssse3 51 52 pushq %rbx 53.cfi_adjust_cfa_offset 8 54.cfi_offset %rbx,-16 55 pushq %rbp 56.cfi_adjust_cfa_offset 8 57.cfi_offset %rbp,-24 58 pushq %r12 59.cfi_adjust_cfa_offset 8 60.cfi_offset %r12,-32 61 pushq %r13 62.cfi_adjust_cfa_offset 8 63.cfi_offset %r13,-40 64 pushq %r14 65.cfi_adjust_cfa_offset 8 66.cfi_offset %r14,-48 67 pushq %r15 68.cfi_adjust_cfa_offset 8 69.cfi_offset %r15,-56 70 subq $64+24,%rsp 71.cfi_adjust_cfa_offset 64+24 72.Lctr32_body: 73 74 75 movdqu (%rcx),%xmm1 76 movdqu 16(%rcx),%xmm2 77 movdqu (%r8),%xmm3 78 movdqa .Lone(%rip),%xmm4 79 80 81 movdqa %xmm1,16(%rsp) 82 movdqa %xmm2,32(%rsp) 83 movdqa %xmm3,48(%rsp) 84 movq %rdx,%rbp 85 jmp .Loop_outer 86 87.align 32 88.Loop_outer: 89 movl $0x61707865,%eax 90 movl $0x3320646e,%ebx 91 movl $0x79622d32,%ecx 92 movl $0x6b206574,%edx 93 movl 16(%rsp),%r8d 94 movl 20(%rsp),%r9d 95 movl 24(%rsp),%r10d 96 movl 28(%rsp),%r11d 97 movd %xmm3,%r12d 98 movl 52(%rsp),%r13d 99 movl 56(%rsp),%r14d 100 movl 60(%rsp),%r15d 101 102 movq %rbp,64+0(%rsp) 103 movl $10,%ebp 104 movq %rsi,64+8(%rsp) 105.byte 102,72,15,126,214 106 movq %rdi,64+16(%rsp) 107 movq %rsi,%rdi 108 shrq $32,%rdi 109 jmp .Loop 110 111.align 32 112.Loop: 113 addl %r8d,%eax 114 xorl %eax,%r12d 115 roll $16,%r12d 116 addl %r9d,%ebx 117 xorl %ebx,%r13d 118 roll $16,%r13d 119 addl %r12d,%esi 120 xorl %esi,%r8d 121 roll $12,%r8d 122 addl %r13d,%edi 123 xorl %edi,%r9d 124 roll $12,%r9d 125 addl %r8d,%eax 126 xorl %eax,%r12d 127 roll $8,%r12d 128 addl %r9d,%ebx 129 xorl %ebx,%r13d 130 roll $8,%r13d 131 addl %r12d,%esi 132 xorl %esi,%r8d 133 roll $7,%r8d 134 addl %r13d,%edi 135 xorl %edi,%r9d 136 roll $7,%r9d 137 movl %esi,32(%rsp) 138 movl %edi,36(%rsp) 139 movl 40(%rsp),%esi 140 movl 44(%rsp),%edi 141 addl %r10d,%ecx 142 xorl %ecx,%r14d 143 roll $16,%r14d 144 addl %r11d,%edx 145 xorl %edx,%r15d 146 roll $16,%r15d 147 addl %r14d,%esi 148 xorl %esi,%r10d 149 roll $12,%r10d 150 addl %r15d,%edi 151 xorl %edi,%r11d 152 roll $12,%r11d 153 addl %r10d,%ecx 154 xorl %ecx,%r14d 155 roll $8,%r14d 156 addl %r11d,%edx 157 xorl %edx,%r15d 158 roll $8,%r15d 159 addl %r14d,%esi 160 xorl %esi,%r10d 161 roll $7,%r10d 162 addl %r15d,%edi 163 xorl %edi,%r11d 164 roll $7,%r11d 165 addl %r9d,%eax 166 xorl %eax,%r15d 167 roll $16,%r15d 168 addl %r10d,%ebx 169 xorl %ebx,%r12d 170 roll $16,%r12d 171 addl %r15d,%esi 172 xorl %esi,%r9d 173 roll $12,%r9d 174 addl %r12d,%edi 175 xorl %edi,%r10d 176 roll $12,%r10d 177 addl %r9d,%eax 178 xorl %eax,%r15d 179 roll $8,%r15d 180 addl %r10d,%ebx 181 xorl %ebx,%r12d 182 roll $8,%r12d 183 addl %r15d,%esi 184 xorl %esi,%r9d 185 roll $7,%r9d 186 addl %r12d,%edi 187 xorl %edi,%r10d 188 roll $7,%r10d 189 movl %esi,40(%rsp) 190 movl %edi,44(%rsp) 191 movl 32(%rsp),%esi 192 movl 36(%rsp),%edi 193 addl %r11d,%ecx 194 xorl %ecx,%r13d 195 roll $16,%r13d 196 addl %r8d,%edx 197 xorl %edx,%r14d 198 roll $16,%r14d 199 addl %r13d,%esi 200 xorl %esi,%r11d 201 roll $12,%r11d 202 addl %r14d,%edi 203 xorl %edi,%r8d 204 roll $12,%r8d 205 addl %r11d,%ecx 206 xorl %ecx,%r13d 207 roll $8,%r13d 208 addl %r8d,%edx 209 xorl %edx,%r14d 210 roll $8,%r14d 211 addl %r13d,%esi 212 xorl %esi,%r11d 213 roll $7,%r11d 214 addl %r14d,%edi 215 xorl %edi,%r8d 216 roll $7,%r8d 217 decl %ebp 218 jnz .Loop 219 movl %edi,36(%rsp) 220 movl %esi,32(%rsp) 221 movq 64(%rsp),%rbp 222 movdqa %xmm2,%xmm1 223 movq 64+8(%rsp),%rsi 224 paddd %xmm4,%xmm3 225 movq 64+16(%rsp),%rdi 226 227 addl $0x61707865,%eax 228 addl $0x3320646e,%ebx 229 addl $0x79622d32,%ecx 230 addl $0x6b206574,%edx 231 addl 16(%rsp),%r8d 232 addl 20(%rsp),%r9d 233 addl 24(%rsp),%r10d 234 addl 28(%rsp),%r11d 235 addl 48(%rsp),%r12d 236 addl 52(%rsp),%r13d 237 addl 56(%rsp),%r14d 238 addl 60(%rsp),%r15d 239 paddd 32(%rsp),%xmm1 240 241 cmpq $64,%rbp 242 jb .Ltail 243 244 xorl 0(%rsi),%eax 245 xorl 4(%rsi),%ebx 246 xorl 8(%rsi),%ecx 247 xorl 12(%rsi),%edx 248 xorl 16(%rsi),%r8d 249 xorl 20(%rsi),%r9d 250 xorl 24(%rsi),%r10d 251 xorl 28(%rsi),%r11d 252 movdqu 32(%rsi),%xmm0 253 xorl 48(%rsi),%r12d 254 xorl 52(%rsi),%r13d 255 xorl 56(%rsi),%r14d 256 xorl 60(%rsi),%r15d 257 leaq 64(%rsi),%rsi 258 pxor %xmm1,%xmm0 259 260 movdqa %xmm2,32(%rsp) 261 movd %xmm3,48(%rsp) 262 263 movl %eax,0(%rdi) 264 movl %ebx,4(%rdi) 265 movl %ecx,8(%rdi) 266 movl %edx,12(%rdi) 267 movl %r8d,16(%rdi) 268 movl %r9d,20(%rdi) 269 movl %r10d,24(%rdi) 270 movl %r11d,28(%rdi) 271 movdqu %xmm0,32(%rdi) 272 movl %r12d,48(%rdi) 273 movl %r13d,52(%rdi) 274 movl %r14d,56(%rdi) 275 movl %r15d,60(%rdi) 276 leaq 64(%rdi),%rdi 277 278 subq $64,%rbp 279 jnz .Loop_outer 280 281 jmp .Ldone 282 283.align 16 284.Ltail: 285 movl %eax,0(%rsp) 286 movl %ebx,4(%rsp) 287 xorq %rbx,%rbx 288 movl %ecx,8(%rsp) 289 movl %edx,12(%rsp) 290 movl %r8d,16(%rsp) 291 movl %r9d,20(%rsp) 292 movl %r10d,24(%rsp) 293 movl %r11d,28(%rsp) 294 movdqa %xmm1,32(%rsp) 295 movl %r12d,48(%rsp) 296 movl %r13d,52(%rsp) 297 movl %r14d,56(%rsp) 298 movl %r15d,60(%rsp) 299 300.Loop_tail: 301 movzbl (%rsi,%rbx,1),%eax 302 movzbl (%rsp,%rbx,1),%edx 303 leaq 1(%rbx),%rbx 304 xorl %edx,%eax 305 movb %al,-1(%rdi,%rbx,1) 306 decq %rbp 307 jnz .Loop_tail 308 309.Ldone: 310 leaq 64+24+48(%rsp),%rsi 311.cfi_def_cfa %rsi,8 312 movq -48(%rsi),%r15 313.cfi_restore %r15 314 movq -40(%rsi),%r14 315.cfi_restore %r14 316 movq -32(%rsi),%r13 317.cfi_restore %r13 318 movq -24(%rsi),%r12 319.cfi_restore %r12 320 movq -16(%rsi),%rbp 321.cfi_restore %rbp 322 movq -8(%rsi),%rbx 323.cfi_restore %rbx 324 leaq (%rsi),%rsp 325.cfi_def_cfa_register %rsp 326.Lno_data: 327 .byte 0xf3,0xc3 328.cfi_endproc 329.size ChaCha20_ctr32,.-ChaCha20_ctr32 330.type ChaCha20_ssse3,@function 331.align 32 332ChaCha20_ssse3: 333.cfi_startproc 334.LChaCha20_ssse3: 335 movq %rsp,%r9 336.cfi_def_cfa_register %r9 337 testl $2048,%r10d 338 jnz .LChaCha20_4xop 339 cmpq $128,%rdx 340 je .LChaCha20_128 341 ja .LChaCha20_4x 342 343.Ldo_sse3_after_all: 344 subq $64+8,%rsp 345 movdqa .Lsigma(%rip),%xmm0 346 movdqu (%rcx),%xmm1 347 movdqu 16(%rcx),%xmm2 348 movdqu (%r8),%xmm3 349 movdqa .Lrot16(%rip),%xmm6 350 movdqa .Lrot24(%rip),%xmm7 351 352 movdqa %xmm0,0(%rsp) 353 movdqa %xmm1,16(%rsp) 354 movdqa %xmm2,32(%rsp) 355 movdqa %xmm3,48(%rsp) 356 movq $10,%r8 357 jmp .Loop_ssse3 358 359.align 32 360.Loop_outer_ssse3: 361 movdqa .Lone(%rip),%xmm3 362 movdqa 0(%rsp),%xmm0 363 movdqa 16(%rsp),%xmm1 364 movdqa 32(%rsp),%xmm2 365 paddd 48(%rsp),%xmm3 366 movq $10,%r8 367 movdqa %xmm3,48(%rsp) 368 jmp .Loop_ssse3 369 370.align 32 371.Loop_ssse3: 372 paddd %xmm1,%xmm0 373 pxor %xmm0,%xmm3 374.byte 102,15,56,0,222 375 paddd %xmm3,%xmm2 376 pxor %xmm2,%xmm1 377 movdqa %xmm1,%xmm4 378 psrld $20,%xmm1 379 pslld $12,%xmm4 380 por %xmm4,%xmm1 381 paddd %xmm1,%xmm0 382 pxor %xmm0,%xmm3 383.byte 102,15,56,0,223 384 paddd %xmm3,%xmm2 385 pxor %xmm2,%xmm1 386 movdqa %xmm1,%xmm4 387 psrld $25,%xmm1 388 pslld $7,%xmm4 389 por %xmm4,%xmm1 390 pshufd $78,%xmm2,%xmm2 391 pshufd $57,%xmm1,%xmm1 392 pshufd $147,%xmm3,%xmm3 393 nop 394 paddd %xmm1,%xmm0 395 pxor %xmm0,%xmm3 396.byte 102,15,56,0,222 397 paddd %xmm3,%xmm2 398 pxor %xmm2,%xmm1 399 movdqa %xmm1,%xmm4 400 psrld $20,%xmm1 401 pslld $12,%xmm4 402 por %xmm4,%xmm1 403 paddd %xmm1,%xmm0 404 pxor %xmm0,%xmm3 405.byte 102,15,56,0,223 406 paddd %xmm3,%xmm2 407 pxor %xmm2,%xmm1 408 movdqa %xmm1,%xmm4 409 psrld $25,%xmm1 410 pslld $7,%xmm4 411 por %xmm4,%xmm1 412 pshufd $78,%xmm2,%xmm2 413 pshufd $147,%xmm1,%xmm1 414 pshufd $57,%xmm3,%xmm3 415 decq %r8 416 jnz .Loop_ssse3 417 paddd 0(%rsp),%xmm0 418 paddd 16(%rsp),%xmm1 419 paddd 32(%rsp),%xmm2 420 paddd 48(%rsp),%xmm3 421 422 cmpq $64,%rdx 423 jb .Ltail_ssse3 424 425 movdqu 0(%rsi),%xmm4 426 movdqu 16(%rsi),%xmm5 427 pxor %xmm4,%xmm0 428 movdqu 32(%rsi),%xmm4 429 pxor %xmm5,%xmm1 430 movdqu 48(%rsi),%xmm5 431 leaq 64(%rsi),%rsi 432 pxor %xmm4,%xmm2 433 pxor %xmm5,%xmm3 434 435 movdqu %xmm0,0(%rdi) 436 movdqu %xmm1,16(%rdi) 437 movdqu %xmm2,32(%rdi) 438 movdqu %xmm3,48(%rdi) 439 leaq 64(%rdi),%rdi 440 441 subq $64,%rdx 442 jnz .Loop_outer_ssse3 443 444 jmp .Ldone_ssse3 445 446.align 16 447.Ltail_ssse3: 448 movdqa %xmm0,0(%rsp) 449 movdqa %xmm1,16(%rsp) 450 movdqa %xmm2,32(%rsp) 451 movdqa %xmm3,48(%rsp) 452 xorq %r8,%r8 453 454.Loop_tail_ssse3: 455 movzbl (%rsi,%r8,1),%eax 456 movzbl (%rsp,%r8,1),%ecx 457 leaq 1(%r8),%r8 458 xorl %ecx,%eax 459 movb %al,-1(%rdi,%r8,1) 460 decq %rdx 461 jnz .Loop_tail_ssse3 462 463.Ldone_ssse3: 464 leaq (%r9),%rsp 465.cfi_def_cfa_register %rsp 466.Lssse3_epilogue: 467 .byte 0xf3,0xc3 468.cfi_endproc 469.size ChaCha20_ssse3,.-ChaCha20_ssse3 470.type ChaCha20_128,@function 471.align 32 472ChaCha20_128: 473.cfi_startproc 474.LChaCha20_128: 475 movq %rsp,%r9 476.cfi_def_cfa_register %r9 477 subq $64+8,%rsp 478 movdqa .Lsigma(%rip),%xmm8 479 movdqu (%rcx),%xmm9 480 movdqu 16(%rcx),%xmm2 481 movdqu (%r8),%xmm3 482 movdqa .Lone(%rip),%xmm1 483 movdqa .Lrot16(%rip),%xmm6 484 movdqa .Lrot24(%rip),%xmm7 485 486 movdqa %xmm8,%xmm10 487 movdqa %xmm8,0(%rsp) 488 movdqa %xmm9,%xmm11 489 movdqa %xmm9,16(%rsp) 490 movdqa %xmm2,%xmm0 491 movdqa %xmm2,32(%rsp) 492 paddd %xmm3,%xmm1 493 movdqa %xmm3,48(%rsp) 494 movq $10,%r8 495 jmp .Loop_128 496 497.align 32 498.Loop_128: 499 paddd %xmm9,%xmm8 500 pxor %xmm8,%xmm3 501 paddd %xmm11,%xmm10 502 pxor %xmm10,%xmm1 503.byte 102,15,56,0,222 504.byte 102,15,56,0,206 505 paddd %xmm3,%xmm2 506 paddd %xmm1,%xmm0 507 pxor %xmm2,%xmm9 508 pxor %xmm0,%xmm11 509 movdqa %xmm9,%xmm4 510 psrld $20,%xmm9 511 movdqa %xmm11,%xmm5 512 pslld $12,%xmm4 513 psrld $20,%xmm11 514 por %xmm4,%xmm9 515 pslld $12,%xmm5 516 por %xmm5,%xmm11 517 paddd %xmm9,%xmm8 518 pxor %xmm8,%xmm3 519 paddd %xmm11,%xmm10 520 pxor %xmm10,%xmm1 521.byte 102,15,56,0,223 522.byte 102,15,56,0,207 523 paddd %xmm3,%xmm2 524 paddd %xmm1,%xmm0 525 pxor %xmm2,%xmm9 526 pxor %xmm0,%xmm11 527 movdqa %xmm9,%xmm4 528 psrld $25,%xmm9 529 movdqa %xmm11,%xmm5 530 pslld $7,%xmm4 531 psrld $25,%xmm11 532 por %xmm4,%xmm9 533 pslld $7,%xmm5 534 por %xmm5,%xmm11 535 pshufd $78,%xmm2,%xmm2 536 pshufd $57,%xmm9,%xmm9 537 pshufd $147,%xmm3,%xmm3 538 pshufd $78,%xmm0,%xmm0 539 pshufd $57,%xmm11,%xmm11 540 pshufd $147,%xmm1,%xmm1 541 paddd %xmm9,%xmm8 542 pxor %xmm8,%xmm3 543 paddd %xmm11,%xmm10 544 pxor %xmm10,%xmm1 545.byte 102,15,56,0,222 546.byte 102,15,56,0,206 547 paddd %xmm3,%xmm2 548 paddd %xmm1,%xmm0 549 pxor %xmm2,%xmm9 550 pxor %xmm0,%xmm11 551 movdqa %xmm9,%xmm4 552 psrld $20,%xmm9 553 movdqa %xmm11,%xmm5 554 pslld $12,%xmm4 555 psrld $20,%xmm11 556 por %xmm4,%xmm9 557 pslld $12,%xmm5 558 por %xmm5,%xmm11 559 paddd %xmm9,%xmm8 560 pxor %xmm8,%xmm3 561 paddd %xmm11,%xmm10 562 pxor %xmm10,%xmm1 563.byte 102,15,56,0,223 564.byte 102,15,56,0,207 565 paddd %xmm3,%xmm2 566 paddd %xmm1,%xmm0 567 pxor %xmm2,%xmm9 568 pxor %xmm0,%xmm11 569 movdqa %xmm9,%xmm4 570 psrld $25,%xmm9 571 movdqa %xmm11,%xmm5 572 pslld $7,%xmm4 573 psrld $25,%xmm11 574 por %xmm4,%xmm9 575 pslld $7,%xmm5 576 por %xmm5,%xmm11 577 pshufd $78,%xmm2,%xmm2 578 pshufd $147,%xmm9,%xmm9 579 pshufd $57,%xmm3,%xmm3 580 pshufd $78,%xmm0,%xmm0 581 pshufd $147,%xmm11,%xmm11 582 pshufd $57,%xmm1,%xmm1 583 decq %r8 584 jnz .Loop_128 585 paddd 0(%rsp),%xmm8 586 paddd 16(%rsp),%xmm9 587 paddd 32(%rsp),%xmm2 588 paddd 48(%rsp),%xmm3 589 paddd .Lone(%rip),%xmm1 590 paddd 0(%rsp),%xmm10 591 paddd 16(%rsp),%xmm11 592 paddd 32(%rsp),%xmm0 593 paddd 48(%rsp),%xmm1 594 595 movdqu 0(%rsi),%xmm4 596 movdqu 16(%rsi),%xmm5 597 pxor %xmm4,%xmm8 598 movdqu 32(%rsi),%xmm4 599 pxor %xmm5,%xmm9 600 movdqu 48(%rsi),%xmm5 601 pxor %xmm4,%xmm2 602 movdqu 64(%rsi),%xmm4 603 pxor %xmm5,%xmm3 604 movdqu 80(%rsi),%xmm5 605 pxor %xmm4,%xmm10 606 movdqu 96(%rsi),%xmm4 607 pxor %xmm5,%xmm11 608 movdqu 112(%rsi),%xmm5 609 pxor %xmm4,%xmm0 610 pxor %xmm5,%xmm1 611 612 movdqu %xmm8,0(%rdi) 613 movdqu %xmm9,16(%rdi) 614 movdqu %xmm2,32(%rdi) 615 movdqu %xmm3,48(%rdi) 616 movdqu %xmm10,64(%rdi) 617 movdqu %xmm11,80(%rdi) 618 movdqu %xmm0,96(%rdi) 619 movdqu %xmm1,112(%rdi) 620 leaq (%r9),%rsp 621.cfi_def_cfa_register %rsp 622.L128_epilogue: 623 .byte 0xf3,0xc3 624.cfi_endproc 625.size ChaCha20_128,.-ChaCha20_128 626.type ChaCha20_4x,@function 627.align 32 628ChaCha20_4x: 629.cfi_startproc 630.LChaCha20_4x: 631 movq %rsp,%r9 632.cfi_def_cfa_register %r9 633 movq %r10,%r11 634 shrq $32,%r10 635 testq $32,%r10 636 jnz .LChaCha20_8x 637 cmpq $192,%rdx 638 ja .Lproceed4x 639 640 andq $71303168,%r11 641 cmpq $4194304,%r11 642 je .Ldo_sse3_after_all 643 644.Lproceed4x: 645 subq $0x140+8,%rsp 646 movdqa .Lsigma(%rip),%xmm11 647 movdqu (%rcx),%xmm15 648 movdqu 16(%rcx),%xmm7 649 movdqu (%r8),%xmm3 650 leaq 256(%rsp),%rcx 651 leaq .Lrot16(%rip),%r10 652 leaq .Lrot24(%rip),%r11 653 654 pshufd $0x00,%xmm11,%xmm8 655 pshufd $0x55,%xmm11,%xmm9 656 movdqa %xmm8,64(%rsp) 657 pshufd $0xaa,%xmm11,%xmm10 658 movdqa %xmm9,80(%rsp) 659 pshufd $0xff,%xmm11,%xmm11 660 movdqa %xmm10,96(%rsp) 661 movdqa %xmm11,112(%rsp) 662 663 pshufd $0x00,%xmm15,%xmm12 664 pshufd $0x55,%xmm15,%xmm13 665 movdqa %xmm12,128-256(%rcx) 666 pshufd $0xaa,%xmm15,%xmm14 667 movdqa %xmm13,144-256(%rcx) 668 pshufd $0xff,%xmm15,%xmm15 669 movdqa %xmm14,160-256(%rcx) 670 movdqa %xmm15,176-256(%rcx) 671 672 pshufd $0x00,%xmm7,%xmm4 673 pshufd $0x55,%xmm7,%xmm5 674 movdqa %xmm4,192-256(%rcx) 675 pshufd $0xaa,%xmm7,%xmm6 676 movdqa %xmm5,208-256(%rcx) 677 pshufd $0xff,%xmm7,%xmm7 678 movdqa %xmm6,224-256(%rcx) 679 movdqa %xmm7,240-256(%rcx) 680 681 pshufd $0x00,%xmm3,%xmm0 682 pshufd $0x55,%xmm3,%xmm1 683 paddd .Linc(%rip),%xmm0 684 pshufd $0xaa,%xmm3,%xmm2 685 movdqa %xmm1,272-256(%rcx) 686 pshufd $0xff,%xmm3,%xmm3 687 movdqa %xmm2,288-256(%rcx) 688 movdqa %xmm3,304-256(%rcx) 689 690 jmp .Loop_enter4x 691 692.align 32 693.Loop_outer4x: 694 movdqa 64(%rsp),%xmm8 695 movdqa 80(%rsp),%xmm9 696 movdqa 96(%rsp),%xmm10 697 movdqa 112(%rsp),%xmm11 698 movdqa 128-256(%rcx),%xmm12 699 movdqa 144-256(%rcx),%xmm13 700 movdqa 160-256(%rcx),%xmm14 701 movdqa 176-256(%rcx),%xmm15 702 movdqa 192-256(%rcx),%xmm4 703 movdqa 208-256(%rcx),%xmm5 704 movdqa 224-256(%rcx),%xmm6 705 movdqa 240-256(%rcx),%xmm7 706 movdqa 256-256(%rcx),%xmm0 707 movdqa 272-256(%rcx),%xmm1 708 movdqa 288-256(%rcx),%xmm2 709 movdqa 304-256(%rcx),%xmm3 710 paddd .Lfour(%rip),%xmm0 711 712.Loop_enter4x: 713 movdqa %xmm6,32(%rsp) 714 movdqa %xmm7,48(%rsp) 715 movdqa (%r10),%xmm7 716 movl $10,%eax 717 movdqa %xmm0,256-256(%rcx) 718 jmp .Loop4x 719 720.align 32 721.Loop4x: 722 paddd %xmm12,%xmm8 723 paddd %xmm13,%xmm9 724 pxor %xmm8,%xmm0 725 pxor %xmm9,%xmm1 726.byte 102,15,56,0,199 727.byte 102,15,56,0,207 728 paddd %xmm0,%xmm4 729 paddd %xmm1,%xmm5 730 pxor %xmm4,%xmm12 731 pxor %xmm5,%xmm13 732 movdqa %xmm12,%xmm6 733 pslld $12,%xmm12 734 psrld $20,%xmm6 735 movdqa %xmm13,%xmm7 736 pslld $12,%xmm13 737 por %xmm6,%xmm12 738 psrld $20,%xmm7 739 movdqa (%r11),%xmm6 740 por %xmm7,%xmm13 741 paddd %xmm12,%xmm8 742 paddd %xmm13,%xmm9 743 pxor %xmm8,%xmm0 744 pxor %xmm9,%xmm1 745.byte 102,15,56,0,198 746.byte 102,15,56,0,206 747 paddd %xmm0,%xmm4 748 paddd %xmm1,%xmm5 749 pxor %xmm4,%xmm12 750 pxor %xmm5,%xmm13 751 movdqa %xmm12,%xmm7 752 pslld $7,%xmm12 753 psrld $25,%xmm7 754 movdqa %xmm13,%xmm6 755 pslld $7,%xmm13 756 por %xmm7,%xmm12 757 psrld $25,%xmm6 758 movdqa (%r10),%xmm7 759 por %xmm6,%xmm13 760 movdqa %xmm4,0(%rsp) 761 movdqa %xmm5,16(%rsp) 762 movdqa 32(%rsp),%xmm4 763 movdqa 48(%rsp),%xmm5 764 paddd %xmm14,%xmm10 765 paddd %xmm15,%xmm11 766 pxor %xmm10,%xmm2 767 pxor %xmm11,%xmm3 768.byte 102,15,56,0,215 769.byte 102,15,56,0,223 770 paddd %xmm2,%xmm4 771 paddd %xmm3,%xmm5 772 pxor %xmm4,%xmm14 773 pxor %xmm5,%xmm15 774 movdqa %xmm14,%xmm6 775 pslld $12,%xmm14 776 psrld $20,%xmm6 777 movdqa %xmm15,%xmm7 778 pslld $12,%xmm15 779 por %xmm6,%xmm14 780 psrld $20,%xmm7 781 movdqa (%r11),%xmm6 782 por %xmm7,%xmm15 783 paddd %xmm14,%xmm10 784 paddd %xmm15,%xmm11 785 pxor %xmm10,%xmm2 786 pxor %xmm11,%xmm3 787.byte 102,15,56,0,214 788.byte 102,15,56,0,222 789 paddd %xmm2,%xmm4 790 paddd %xmm3,%xmm5 791 pxor %xmm4,%xmm14 792 pxor %xmm5,%xmm15 793 movdqa %xmm14,%xmm7 794 pslld $7,%xmm14 795 psrld $25,%xmm7 796 movdqa %xmm15,%xmm6 797 pslld $7,%xmm15 798 por %xmm7,%xmm14 799 psrld $25,%xmm6 800 movdqa (%r10),%xmm7 801 por %xmm6,%xmm15 802 paddd %xmm13,%xmm8 803 paddd %xmm14,%xmm9 804 pxor %xmm8,%xmm3 805 pxor %xmm9,%xmm0 806.byte 102,15,56,0,223 807.byte 102,15,56,0,199 808 paddd %xmm3,%xmm4 809 paddd %xmm0,%xmm5 810 pxor %xmm4,%xmm13 811 pxor %xmm5,%xmm14 812 movdqa %xmm13,%xmm6 813 pslld $12,%xmm13 814 psrld $20,%xmm6 815 movdqa %xmm14,%xmm7 816 pslld $12,%xmm14 817 por %xmm6,%xmm13 818 psrld $20,%xmm7 819 movdqa (%r11),%xmm6 820 por %xmm7,%xmm14 821 paddd %xmm13,%xmm8 822 paddd %xmm14,%xmm9 823 pxor %xmm8,%xmm3 824 pxor %xmm9,%xmm0 825.byte 102,15,56,0,222 826.byte 102,15,56,0,198 827 paddd %xmm3,%xmm4 828 paddd %xmm0,%xmm5 829 pxor %xmm4,%xmm13 830 pxor %xmm5,%xmm14 831 movdqa %xmm13,%xmm7 832 pslld $7,%xmm13 833 psrld $25,%xmm7 834 movdqa %xmm14,%xmm6 835 pslld $7,%xmm14 836 por %xmm7,%xmm13 837 psrld $25,%xmm6 838 movdqa (%r10),%xmm7 839 por %xmm6,%xmm14 840 movdqa %xmm4,32(%rsp) 841 movdqa %xmm5,48(%rsp) 842 movdqa 0(%rsp),%xmm4 843 movdqa 16(%rsp),%xmm5 844 paddd %xmm15,%xmm10 845 paddd %xmm12,%xmm11 846 pxor %xmm10,%xmm1 847 pxor %xmm11,%xmm2 848.byte 102,15,56,0,207 849.byte 102,15,56,0,215 850 paddd %xmm1,%xmm4 851 paddd %xmm2,%xmm5 852 pxor %xmm4,%xmm15 853 pxor %xmm5,%xmm12 854 movdqa %xmm15,%xmm6 855 pslld $12,%xmm15 856 psrld $20,%xmm6 857 movdqa %xmm12,%xmm7 858 pslld $12,%xmm12 859 por %xmm6,%xmm15 860 psrld $20,%xmm7 861 movdqa (%r11),%xmm6 862 por %xmm7,%xmm12 863 paddd %xmm15,%xmm10 864 paddd %xmm12,%xmm11 865 pxor %xmm10,%xmm1 866 pxor %xmm11,%xmm2 867.byte 102,15,56,0,206 868.byte 102,15,56,0,214 869 paddd %xmm1,%xmm4 870 paddd %xmm2,%xmm5 871 pxor %xmm4,%xmm15 872 pxor %xmm5,%xmm12 873 movdqa %xmm15,%xmm7 874 pslld $7,%xmm15 875 psrld $25,%xmm7 876 movdqa %xmm12,%xmm6 877 pslld $7,%xmm12 878 por %xmm7,%xmm15 879 psrld $25,%xmm6 880 movdqa (%r10),%xmm7 881 por %xmm6,%xmm12 882 decl %eax 883 jnz .Loop4x 884 885 paddd 64(%rsp),%xmm8 886 paddd 80(%rsp),%xmm9 887 paddd 96(%rsp),%xmm10 888 paddd 112(%rsp),%xmm11 889 890 movdqa %xmm8,%xmm6 891 punpckldq %xmm9,%xmm8 892 movdqa %xmm10,%xmm7 893 punpckldq %xmm11,%xmm10 894 punpckhdq %xmm9,%xmm6 895 punpckhdq %xmm11,%xmm7 896 movdqa %xmm8,%xmm9 897 punpcklqdq %xmm10,%xmm8 898 movdqa %xmm6,%xmm11 899 punpcklqdq %xmm7,%xmm6 900 punpckhqdq %xmm10,%xmm9 901 punpckhqdq %xmm7,%xmm11 902 paddd 128-256(%rcx),%xmm12 903 paddd 144-256(%rcx),%xmm13 904 paddd 160-256(%rcx),%xmm14 905 paddd 176-256(%rcx),%xmm15 906 907 movdqa %xmm8,0(%rsp) 908 movdqa %xmm9,16(%rsp) 909 movdqa 32(%rsp),%xmm8 910 movdqa 48(%rsp),%xmm9 911 912 movdqa %xmm12,%xmm10 913 punpckldq %xmm13,%xmm12 914 movdqa %xmm14,%xmm7 915 punpckldq %xmm15,%xmm14 916 punpckhdq %xmm13,%xmm10 917 punpckhdq %xmm15,%xmm7 918 movdqa %xmm12,%xmm13 919 punpcklqdq %xmm14,%xmm12 920 movdqa %xmm10,%xmm15 921 punpcklqdq %xmm7,%xmm10 922 punpckhqdq %xmm14,%xmm13 923 punpckhqdq %xmm7,%xmm15 924 paddd 192-256(%rcx),%xmm4 925 paddd 208-256(%rcx),%xmm5 926 paddd 224-256(%rcx),%xmm8 927 paddd 240-256(%rcx),%xmm9 928 929 movdqa %xmm6,32(%rsp) 930 movdqa %xmm11,48(%rsp) 931 932 movdqa %xmm4,%xmm14 933 punpckldq %xmm5,%xmm4 934 movdqa %xmm8,%xmm7 935 punpckldq %xmm9,%xmm8 936 punpckhdq %xmm5,%xmm14 937 punpckhdq %xmm9,%xmm7 938 movdqa %xmm4,%xmm5 939 punpcklqdq %xmm8,%xmm4 940 movdqa %xmm14,%xmm9 941 punpcklqdq %xmm7,%xmm14 942 punpckhqdq %xmm8,%xmm5 943 punpckhqdq %xmm7,%xmm9 944 paddd 256-256(%rcx),%xmm0 945 paddd 272-256(%rcx),%xmm1 946 paddd 288-256(%rcx),%xmm2 947 paddd 304-256(%rcx),%xmm3 948 949 movdqa %xmm0,%xmm8 950 punpckldq %xmm1,%xmm0 951 movdqa %xmm2,%xmm7 952 punpckldq %xmm3,%xmm2 953 punpckhdq %xmm1,%xmm8 954 punpckhdq %xmm3,%xmm7 955 movdqa %xmm0,%xmm1 956 punpcklqdq %xmm2,%xmm0 957 movdqa %xmm8,%xmm3 958 punpcklqdq %xmm7,%xmm8 959 punpckhqdq %xmm2,%xmm1 960 punpckhqdq %xmm7,%xmm3 961 cmpq $256,%rdx 962 jb .Ltail4x 963 964 movdqu 0(%rsi),%xmm6 965 movdqu 16(%rsi),%xmm11 966 movdqu 32(%rsi),%xmm2 967 movdqu 48(%rsi),%xmm7 968 pxor 0(%rsp),%xmm6 969 pxor %xmm12,%xmm11 970 pxor %xmm4,%xmm2 971 pxor %xmm0,%xmm7 972 973 movdqu %xmm6,0(%rdi) 974 movdqu 64(%rsi),%xmm6 975 movdqu %xmm11,16(%rdi) 976 movdqu 80(%rsi),%xmm11 977 movdqu %xmm2,32(%rdi) 978 movdqu 96(%rsi),%xmm2 979 movdqu %xmm7,48(%rdi) 980 movdqu 112(%rsi),%xmm7 981 leaq 128(%rsi),%rsi 982 pxor 16(%rsp),%xmm6 983 pxor %xmm13,%xmm11 984 pxor %xmm5,%xmm2 985 pxor %xmm1,%xmm7 986 987 movdqu %xmm6,64(%rdi) 988 movdqu 0(%rsi),%xmm6 989 movdqu %xmm11,80(%rdi) 990 movdqu 16(%rsi),%xmm11 991 movdqu %xmm2,96(%rdi) 992 movdqu 32(%rsi),%xmm2 993 movdqu %xmm7,112(%rdi) 994 leaq 128(%rdi),%rdi 995 movdqu 48(%rsi),%xmm7 996 pxor 32(%rsp),%xmm6 997 pxor %xmm10,%xmm11 998 pxor %xmm14,%xmm2 999 pxor %xmm8,%xmm7 1000 1001 movdqu %xmm6,0(%rdi) 1002 movdqu 64(%rsi),%xmm6 1003 movdqu %xmm11,16(%rdi) 1004 movdqu 80(%rsi),%xmm11 1005 movdqu %xmm2,32(%rdi) 1006 movdqu 96(%rsi),%xmm2 1007 movdqu %xmm7,48(%rdi) 1008 movdqu 112(%rsi),%xmm7 1009 leaq 128(%rsi),%rsi 1010 pxor 48(%rsp),%xmm6 1011 pxor %xmm15,%xmm11 1012 pxor %xmm9,%xmm2 1013 pxor %xmm3,%xmm7 1014 movdqu %xmm6,64(%rdi) 1015 movdqu %xmm11,80(%rdi) 1016 movdqu %xmm2,96(%rdi) 1017 movdqu %xmm7,112(%rdi) 1018 leaq 128(%rdi),%rdi 1019 1020 subq $256,%rdx 1021 jnz .Loop_outer4x 1022 1023 jmp .Ldone4x 1024 1025.Ltail4x: 1026 cmpq $192,%rdx 1027 jae .L192_or_more4x 1028 cmpq $128,%rdx 1029 jae .L128_or_more4x 1030 cmpq $64,%rdx 1031 jae .L64_or_more4x 1032 1033 1034 xorq %r10,%r10 1035 1036 movdqa %xmm12,16(%rsp) 1037 movdqa %xmm4,32(%rsp) 1038 movdqa %xmm0,48(%rsp) 1039 jmp .Loop_tail4x 1040 1041.align 32 1042.L64_or_more4x: 1043 movdqu 0(%rsi),%xmm6 1044 movdqu 16(%rsi),%xmm11 1045 movdqu 32(%rsi),%xmm2 1046 movdqu 48(%rsi),%xmm7 1047 pxor 0(%rsp),%xmm6 1048 pxor %xmm12,%xmm11 1049 pxor %xmm4,%xmm2 1050 pxor %xmm0,%xmm7 1051 movdqu %xmm6,0(%rdi) 1052 movdqu %xmm11,16(%rdi) 1053 movdqu %xmm2,32(%rdi) 1054 movdqu %xmm7,48(%rdi) 1055 je .Ldone4x 1056 1057 movdqa 16(%rsp),%xmm6 1058 leaq 64(%rsi),%rsi 1059 xorq %r10,%r10 1060 movdqa %xmm6,0(%rsp) 1061 movdqa %xmm13,16(%rsp) 1062 leaq 64(%rdi),%rdi 1063 movdqa %xmm5,32(%rsp) 1064 subq $64,%rdx 1065 movdqa %xmm1,48(%rsp) 1066 jmp .Loop_tail4x 1067 1068.align 32 1069.L128_or_more4x: 1070 movdqu 0(%rsi),%xmm6 1071 movdqu 16(%rsi),%xmm11 1072 movdqu 32(%rsi),%xmm2 1073 movdqu 48(%rsi),%xmm7 1074 pxor 0(%rsp),%xmm6 1075 pxor %xmm12,%xmm11 1076 pxor %xmm4,%xmm2 1077 pxor %xmm0,%xmm7 1078 1079 movdqu %xmm6,0(%rdi) 1080 movdqu 64(%rsi),%xmm6 1081 movdqu %xmm11,16(%rdi) 1082 movdqu 80(%rsi),%xmm11 1083 movdqu %xmm2,32(%rdi) 1084 movdqu 96(%rsi),%xmm2 1085 movdqu %xmm7,48(%rdi) 1086 movdqu 112(%rsi),%xmm7 1087 pxor 16(%rsp),%xmm6 1088 pxor %xmm13,%xmm11 1089 pxor %xmm5,%xmm2 1090 pxor %xmm1,%xmm7 1091 movdqu %xmm6,64(%rdi) 1092 movdqu %xmm11,80(%rdi) 1093 movdqu %xmm2,96(%rdi) 1094 movdqu %xmm7,112(%rdi) 1095 je .Ldone4x 1096 1097 movdqa 32(%rsp),%xmm6 1098 leaq 128(%rsi),%rsi 1099 xorq %r10,%r10 1100 movdqa %xmm6,0(%rsp) 1101 movdqa %xmm10,16(%rsp) 1102 leaq 128(%rdi),%rdi 1103 movdqa %xmm14,32(%rsp) 1104 subq $128,%rdx 1105 movdqa %xmm8,48(%rsp) 1106 jmp .Loop_tail4x 1107 1108.align 32 1109.L192_or_more4x: 1110 movdqu 0(%rsi),%xmm6 1111 movdqu 16(%rsi),%xmm11 1112 movdqu 32(%rsi),%xmm2 1113 movdqu 48(%rsi),%xmm7 1114 pxor 0(%rsp),%xmm6 1115 pxor %xmm12,%xmm11 1116 pxor %xmm4,%xmm2 1117 pxor %xmm0,%xmm7 1118 1119 movdqu %xmm6,0(%rdi) 1120 movdqu 64(%rsi),%xmm6 1121 movdqu %xmm11,16(%rdi) 1122 movdqu 80(%rsi),%xmm11 1123 movdqu %xmm2,32(%rdi) 1124 movdqu 96(%rsi),%xmm2 1125 movdqu %xmm7,48(%rdi) 1126 movdqu 112(%rsi),%xmm7 1127 leaq 128(%rsi),%rsi 1128 pxor 16(%rsp),%xmm6 1129 pxor %xmm13,%xmm11 1130 pxor %xmm5,%xmm2 1131 pxor %xmm1,%xmm7 1132 1133 movdqu %xmm6,64(%rdi) 1134 movdqu 0(%rsi),%xmm6 1135 movdqu %xmm11,80(%rdi) 1136 movdqu 16(%rsi),%xmm11 1137 movdqu %xmm2,96(%rdi) 1138 movdqu 32(%rsi),%xmm2 1139 movdqu %xmm7,112(%rdi) 1140 leaq 128(%rdi),%rdi 1141 movdqu 48(%rsi),%xmm7 1142 pxor 32(%rsp),%xmm6 1143 pxor %xmm10,%xmm11 1144 pxor %xmm14,%xmm2 1145 pxor %xmm8,%xmm7 1146 movdqu %xmm6,0(%rdi) 1147 movdqu %xmm11,16(%rdi) 1148 movdqu %xmm2,32(%rdi) 1149 movdqu %xmm7,48(%rdi) 1150 je .Ldone4x 1151 1152 movdqa 48(%rsp),%xmm6 1153 leaq 64(%rsi),%rsi 1154 xorq %r10,%r10 1155 movdqa %xmm6,0(%rsp) 1156 movdqa %xmm15,16(%rsp) 1157 leaq 64(%rdi),%rdi 1158 movdqa %xmm9,32(%rsp) 1159 subq $192,%rdx 1160 movdqa %xmm3,48(%rsp) 1161 1162.Loop_tail4x: 1163 movzbl (%rsi,%r10,1),%eax 1164 movzbl (%rsp,%r10,1),%ecx 1165 leaq 1(%r10),%r10 1166 xorl %ecx,%eax 1167 movb %al,-1(%rdi,%r10,1) 1168 decq %rdx 1169 jnz .Loop_tail4x 1170 1171.Ldone4x: 1172 leaq (%r9),%rsp 1173.cfi_def_cfa_register %rsp 1174.L4x_epilogue: 1175 .byte 0xf3,0xc3 1176.cfi_endproc 1177.size ChaCha20_4x,.-ChaCha20_4x 1178.type ChaCha20_4xop,@function 1179.align 32 1180ChaCha20_4xop: 1181.cfi_startproc 1182.LChaCha20_4xop: 1183 movq %rsp,%r9 1184.cfi_def_cfa_register %r9 1185 subq $0x140+8,%rsp 1186 vzeroupper 1187 1188 vmovdqa .Lsigma(%rip),%xmm11 1189 vmovdqu (%rcx),%xmm3 1190 vmovdqu 16(%rcx),%xmm15 1191 vmovdqu (%r8),%xmm7 1192 leaq 256(%rsp),%rcx 1193 1194 vpshufd $0x00,%xmm11,%xmm8 1195 vpshufd $0x55,%xmm11,%xmm9 1196 vmovdqa %xmm8,64(%rsp) 1197 vpshufd $0xaa,%xmm11,%xmm10 1198 vmovdqa %xmm9,80(%rsp) 1199 vpshufd $0xff,%xmm11,%xmm11 1200 vmovdqa %xmm10,96(%rsp) 1201 vmovdqa %xmm11,112(%rsp) 1202 1203 vpshufd $0x00,%xmm3,%xmm0 1204 vpshufd $0x55,%xmm3,%xmm1 1205 vmovdqa %xmm0,128-256(%rcx) 1206 vpshufd $0xaa,%xmm3,%xmm2 1207 vmovdqa %xmm1,144-256(%rcx) 1208 vpshufd $0xff,%xmm3,%xmm3 1209 vmovdqa %xmm2,160-256(%rcx) 1210 vmovdqa %xmm3,176-256(%rcx) 1211 1212 vpshufd $0x00,%xmm15,%xmm12 1213 vpshufd $0x55,%xmm15,%xmm13 1214 vmovdqa %xmm12,192-256(%rcx) 1215 vpshufd $0xaa,%xmm15,%xmm14 1216 vmovdqa %xmm13,208-256(%rcx) 1217 vpshufd $0xff,%xmm15,%xmm15 1218 vmovdqa %xmm14,224-256(%rcx) 1219 vmovdqa %xmm15,240-256(%rcx) 1220 1221 vpshufd $0x00,%xmm7,%xmm4 1222 vpshufd $0x55,%xmm7,%xmm5 1223 vpaddd .Linc(%rip),%xmm4,%xmm4 1224 vpshufd $0xaa,%xmm7,%xmm6 1225 vmovdqa %xmm5,272-256(%rcx) 1226 vpshufd $0xff,%xmm7,%xmm7 1227 vmovdqa %xmm6,288-256(%rcx) 1228 vmovdqa %xmm7,304-256(%rcx) 1229 1230 jmp .Loop_enter4xop 1231 1232.align 32 1233.Loop_outer4xop: 1234 vmovdqa 64(%rsp),%xmm8 1235 vmovdqa 80(%rsp),%xmm9 1236 vmovdqa 96(%rsp),%xmm10 1237 vmovdqa 112(%rsp),%xmm11 1238 vmovdqa 128-256(%rcx),%xmm0 1239 vmovdqa 144-256(%rcx),%xmm1 1240 vmovdqa 160-256(%rcx),%xmm2 1241 vmovdqa 176-256(%rcx),%xmm3 1242 vmovdqa 192-256(%rcx),%xmm12 1243 vmovdqa 208-256(%rcx),%xmm13 1244 vmovdqa 224-256(%rcx),%xmm14 1245 vmovdqa 240-256(%rcx),%xmm15 1246 vmovdqa 256-256(%rcx),%xmm4 1247 vmovdqa 272-256(%rcx),%xmm5 1248 vmovdqa 288-256(%rcx),%xmm6 1249 vmovdqa 304-256(%rcx),%xmm7 1250 vpaddd .Lfour(%rip),%xmm4,%xmm4 1251 1252.Loop_enter4xop: 1253 movl $10,%eax 1254 vmovdqa %xmm4,256-256(%rcx) 1255 jmp .Loop4xop 1256 1257.align 32 1258.Loop4xop: 1259 vpaddd %xmm0,%xmm8,%xmm8 1260 vpaddd %xmm1,%xmm9,%xmm9 1261 vpaddd %xmm2,%xmm10,%xmm10 1262 vpaddd %xmm3,%xmm11,%xmm11 1263 vpxor %xmm4,%xmm8,%xmm4 1264 vpxor %xmm5,%xmm9,%xmm5 1265 vpxor %xmm6,%xmm10,%xmm6 1266 vpxor %xmm7,%xmm11,%xmm7 1267.byte 143,232,120,194,228,16 1268.byte 143,232,120,194,237,16 1269.byte 143,232,120,194,246,16 1270.byte 143,232,120,194,255,16 1271 vpaddd %xmm4,%xmm12,%xmm12 1272 vpaddd %xmm5,%xmm13,%xmm13 1273 vpaddd %xmm6,%xmm14,%xmm14 1274 vpaddd %xmm7,%xmm15,%xmm15 1275 vpxor %xmm0,%xmm12,%xmm0 1276 vpxor %xmm1,%xmm13,%xmm1 1277 vpxor %xmm14,%xmm2,%xmm2 1278 vpxor %xmm15,%xmm3,%xmm3 1279.byte 143,232,120,194,192,12 1280.byte 143,232,120,194,201,12 1281.byte 143,232,120,194,210,12 1282.byte 143,232,120,194,219,12 1283 vpaddd %xmm8,%xmm0,%xmm8 1284 vpaddd %xmm9,%xmm1,%xmm9 1285 vpaddd %xmm2,%xmm10,%xmm10 1286 vpaddd %xmm3,%xmm11,%xmm11 1287 vpxor %xmm4,%xmm8,%xmm4 1288 vpxor %xmm5,%xmm9,%xmm5 1289 vpxor %xmm6,%xmm10,%xmm6 1290 vpxor %xmm7,%xmm11,%xmm7 1291.byte 143,232,120,194,228,8 1292.byte 143,232,120,194,237,8 1293.byte 143,232,120,194,246,8 1294.byte 143,232,120,194,255,8 1295 vpaddd %xmm4,%xmm12,%xmm12 1296 vpaddd %xmm5,%xmm13,%xmm13 1297 vpaddd %xmm6,%xmm14,%xmm14 1298 vpaddd %xmm7,%xmm15,%xmm15 1299 vpxor %xmm0,%xmm12,%xmm0 1300 vpxor %xmm1,%xmm13,%xmm1 1301 vpxor %xmm14,%xmm2,%xmm2 1302 vpxor %xmm15,%xmm3,%xmm3 1303.byte 143,232,120,194,192,7 1304.byte 143,232,120,194,201,7 1305.byte 143,232,120,194,210,7 1306.byte 143,232,120,194,219,7 1307 vpaddd %xmm1,%xmm8,%xmm8 1308 vpaddd %xmm2,%xmm9,%xmm9 1309 vpaddd %xmm3,%xmm10,%xmm10 1310 vpaddd %xmm0,%xmm11,%xmm11 1311 vpxor %xmm7,%xmm8,%xmm7 1312 vpxor %xmm4,%xmm9,%xmm4 1313 vpxor %xmm5,%xmm10,%xmm5 1314 vpxor %xmm6,%xmm11,%xmm6 1315.byte 143,232,120,194,255,16 1316.byte 143,232,120,194,228,16 1317.byte 143,232,120,194,237,16 1318.byte 143,232,120,194,246,16 1319 vpaddd %xmm7,%xmm14,%xmm14 1320 vpaddd %xmm4,%xmm15,%xmm15 1321 vpaddd %xmm5,%xmm12,%xmm12 1322 vpaddd %xmm6,%xmm13,%xmm13 1323 vpxor %xmm1,%xmm14,%xmm1 1324 vpxor %xmm2,%xmm15,%xmm2 1325 vpxor %xmm12,%xmm3,%xmm3 1326 vpxor %xmm13,%xmm0,%xmm0 1327.byte 143,232,120,194,201,12 1328.byte 143,232,120,194,210,12 1329.byte 143,232,120,194,219,12 1330.byte 143,232,120,194,192,12 1331 vpaddd %xmm8,%xmm1,%xmm8 1332 vpaddd %xmm9,%xmm2,%xmm9 1333 vpaddd %xmm3,%xmm10,%xmm10 1334 vpaddd %xmm0,%xmm11,%xmm11 1335 vpxor %xmm7,%xmm8,%xmm7 1336 vpxor %xmm4,%xmm9,%xmm4 1337 vpxor %xmm5,%xmm10,%xmm5 1338 vpxor %xmm6,%xmm11,%xmm6 1339.byte 143,232,120,194,255,8 1340.byte 143,232,120,194,228,8 1341.byte 143,232,120,194,237,8 1342.byte 143,232,120,194,246,8 1343 vpaddd %xmm7,%xmm14,%xmm14 1344 vpaddd %xmm4,%xmm15,%xmm15 1345 vpaddd %xmm5,%xmm12,%xmm12 1346 vpaddd %xmm6,%xmm13,%xmm13 1347 vpxor %xmm1,%xmm14,%xmm1 1348 vpxor %xmm2,%xmm15,%xmm2 1349 vpxor %xmm12,%xmm3,%xmm3 1350 vpxor %xmm13,%xmm0,%xmm0 1351.byte 143,232,120,194,201,7 1352.byte 143,232,120,194,210,7 1353.byte 143,232,120,194,219,7 1354.byte 143,232,120,194,192,7 1355 decl %eax 1356 jnz .Loop4xop 1357 1358 vpaddd 64(%rsp),%xmm8,%xmm8 1359 vpaddd 80(%rsp),%xmm9,%xmm9 1360 vpaddd 96(%rsp),%xmm10,%xmm10 1361 vpaddd 112(%rsp),%xmm11,%xmm11 1362 1363 vmovdqa %xmm14,32(%rsp) 1364 vmovdqa %xmm15,48(%rsp) 1365 1366 vpunpckldq %xmm9,%xmm8,%xmm14 1367 vpunpckldq %xmm11,%xmm10,%xmm15 1368 vpunpckhdq %xmm9,%xmm8,%xmm8 1369 vpunpckhdq %xmm11,%xmm10,%xmm10 1370 vpunpcklqdq %xmm15,%xmm14,%xmm9 1371 vpunpckhqdq %xmm15,%xmm14,%xmm14 1372 vpunpcklqdq %xmm10,%xmm8,%xmm11 1373 vpunpckhqdq %xmm10,%xmm8,%xmm8 1374 vpaddd 128-256(%rcx),%xmm0,%xmm0 1375 vpaddd 144-256(%rcx),%xmm1,%xmm1 1376 vpaddd 160-256(%rcx),%xmm2,%xmm2 1377 vpaddd 176-256(%rcx),%xmm3,%xmm3 1378 1379 vmovdqa %xmm9,0(%rsp) 1380 vmovdqa %xmm14,16(%rsp) 1381 vmovdqa 32(%rsp),%xmm9 1382 vmovdqa 48(%rsp),%xmm14 1383 1384 vpunpckldq %xmm1,%xmm0,%xmm10 1385 vpunpckldq %xmm3,%xmm2,%xmm15 1386 vpunpckhdq %xmm1,%xmm0,%xmm0 1387 vpunpckhdq %xmm3,%xmm2,%xmm2 1388 vpunpcklqdq %xmm15,%xmm10,%xmm1 1389 vpunpckhqdq %xmm15,%xmm10,%xmm10 1390 vpunpcklqdq %xmm2,%xmm0,%xmm3 1391 vpunpckhqdq %xmm2,%xmm0,%xmm0 1392 vpaddd 192-256(%rcx),%xmm12,%xmm12 1393 vpaddd 208-256(%rcx),%xmm13,%xmm13 1394 vpaddd 224-256(%rcx),%xmm9,%xmm9 1395 vpaddd 240-256(%rcx),%xmm14,%xmm14 1396 1397 vpunpckldq %xmm13,%xmm12,%xmm2 1398 vpunpckldq %xmm14,%xmm9,%xmm15 1399 vpunpckhdq %xmm13,%xmm12,%xmm12 1400 vpunpckhdq %xmm14,%xmm9,%xmm9 1401 vpunpcklqdq %xmm15,%xmm2,%xmm13 1402 vpunpckhqdq %xmm15,%xmm2,%xmm2 1403 vpunpcklqdq %xmm9,%xmm12,%xmm14 1404 vpunpckhqdq %xmm9,%xmm12,%xmm12 1405 vpaddd 256-256(%rcx),%xmm4,%xmm4 1406 vpaddd 272-256(%rcx),%xmm5,%xmm5 1407 vpaddd 288-256(%rcx),%xmm6,%xmm6 1408 vpaddd 304-256(%rcx),%xmm7,%xmm7 1409 1410 vpunpckldq %xmm5,%xmm4,%xmm9 1411 vpunpckldq %xmm7,%xmm6,%xmm15 1412 vpunpckhdq %xmm5,%xmm4,%xmm4 1413 vpunpckhdq %xmm7,%xmm6,%xmm6 1414 vpunpcklqdq %xmm15,%xmm9,%xmm5 1415 vpunpckhqdq %xmm15,%xmm9,%xmm9 1416 vpunpcklqdq %xmm6,%xmm4,%xmm7 1417 vpunpckhqdq %xmm6,%xmm4,%xmm4 1418 vmovdqa 0(%rsp),%xmm6 1419 vmovdqa 16(%rsp),%xmm15 1420 1421 cmpq $256,%rdx 1422 jb .Ltail4xop 1423 1424 vpxor 0(%rsi),%xmm6,%xmm6 1425 vpxor 16(%rsi),%xmm1,%xmm1 1426 vpxor 32(%rsi),%xmm13,%xmm13 1427 vpxor 48(%rsi),%xmm5,%xmm5 1428 vpxor 64(%rsi),%xmm15,%xmm15 1429 vpxor 80(%rsi),%xmm10,%xmm10 1430 vpxor 96(%rsi),%xmm2,%xmm2 1431 vpxor 112(%rsi),%xmm9,%xmm9 1432 leaq 128(%rsi),%rsi 1433 vpxor 0(%rsi),%xmm11,%xmm11 1434 vpxor 16(%rsi),%xmm3,%xmm3 1435 vpxor 32(%rsi),%xmm14,%xmm14 1436 vpxor 48(%rsi),%xmm7,%xmm7 1437 vpxor 64(%rsi),%xmm8,%xmm8 1438 vpxor 80(%rsi),%xmm0,%xmm0 1439 vpxor 96(%rsi),%xmm12,%xmm12 1440 vpxor 112(%rsi),%xmm4,%xmm4 1441 leaq 128(%rsi),%rsi 1442 1443 vmovdqu %xmm6,0(%rdi) 1444 vmovdqu %xmm1,16(%rdi) 1445 vmovdqu %xmm13,32(%rdi) 1446 vmovdqu %xmm5,48(%rdi) 1447 vmovdqu %xmm15,64(%rdi) 1448 vmovdqu %xmm10,80(%rdi) 1449 vmovdqu %xmm2,96(%rdi) 1450 vmovdqu %xmm9,112(%rdi) 1451 leaq 128(%rdi),%rdi 1452 vmovdqu %xmm11,0(%rdi) 1453 vmovdqu %xmm3,16(%rdi) 1454 vmovdqu %xmm14,32(%rdi) 1455 vmovdqu %xmm7,48(%rdi) 1456 vmovdqu %xmm8,64(%rdi) 1457 vmovdqu %xmm0,80(%rdi) 1458 vmovdqu %xmm12,96(%rdi) 1459 vmovdqu %xmm4,112(%rdi) 1460 leaq 128(%rdi),%rdi 1461 1462 subq $256,%rdx 1463 jnz .Loop_outer4xop 1464 1465 jmp .Ldone4xop 1466 1467.align 32 1468.Ltail4xop: 1469 cmpq $192,%rdx 1470 jae .L192_or_more4xop 1471 cmpq $128,%rdx 1472 jae .L128_or_more4xop 1473 cmpq $64,%rdx 1474 jae .L64_or_more4xop 1475 1476 xorq %r10,%r10 1477 vmovdqa %xmm6,0(%rsp) 1478 vmovdqa %xmm1,16(%rsp) 1479 vmovdqa %xmm13,32(%rsp) 1480 vmovdqa %xmm5,48(%rsp) 1481 jmp .Loop_tail4xop 1482 1483.align 32 1484.L64_or_more4xop: 1485 vpxor 0(%rsi),%xmm6,%xmm6 1486 vpxor 16(%rsi),%xmm1,%xmm1 1487 vpxor 32(%rsi),%xmm13,%xmm13 1488 vpxor 48(%rsi),%xmm5,%xmm5 1489 vmovdqu %xmm6,0(%rdi) 1490 vmovdqu %xmm1,16(%rdi) 1491 vmovdqu %xmm13,32(%rdi) 1492 vmovdqu %xmm5,48(%rdi) 1493 je .Ldone4xop 1494 1495 leaq 64(%rsi),%rsi 1496 vmovdqa %xmm15,0(%rsp) 1497 xorq %r10,%r10 1498 vmovdqa %xmm10,16(%rsp) 1499 leaq 64(%rdi),%rdi 1500 vmovdqa %xmm2,32(%rsp) 1501 subq $64,%rdx 1502 vmovdqa %xmm9,48(%rsp) 1503 jmp .Loop_tail4xop 1504 1505.align 32 1506.L128_or_more4xop: 1507 vpxor 0(%rsi),%xmm6,%xmm6 1508 vpxor 16(%rsi),%xmm1,%xmm1 1509 vpxor 32(%rsi),%xmm13,%xmm13 1510 vpxor 48(%rsi),%xmm5,%xmm5 1511 vpxor 64(%rsi),%xmm15,%xmm15 1512 vpxor 80(%rsi),%xmm10,%xmm10 1513 vpxor 96(%rsi),%xmm2,%xmm2 1514 vpxor 112(%rsi),%xmm9,%xmm9 1515 1516 vmovdqu %xmm6,0(%rdi) 1517 vmovdqu %xmm1,16(%rdi) 1518 vmovdqu %xmm13,32(%rdi) 1519 vmovdqu %xmm5,48(%rdi) 1520 vmovdqu %xmm15,64(%rdi) 1521 vmovdqu %xmm10,80(%rdi) 1522 vmovdqu %xmm2,96(%rdi) 1523 vmovdqu %xmm9,112(%rdi) 1524 je .Ldone4xop 1525 1526 leaq 128(%rsi),%rsi 1527 vmovdqa %xmm11,0(%rsp) 1528 xorq %r10,%r10 1529 vmovdqa %xmm3,16(%rsp) 1530 leaq 128(%rdi),%rdi 1531 vmovdqa %xmm14,32(%rsp) 1532 subq $128,%rdx 1533 vmovdqa %xmm7,48(%rsp) 1534 jmp .Loop_tail4xop 1535 1536.align 32 1537.L192_or_more4xop: 1538 vpxor 0(%rsi),%xmm6,%xmm6 1539 vpxor 16(%rsi),%xmm1,%xmm1 1540 vpxor 32(%rsi),%xmm13,%xmm13 1541 vpxor 48(%rsi),%xmm5,%xmm5 1542 vpxor 64(%rsi),%xmm15,%xmm15 1543 vpxor 80(%rsi),%xmm10,%xmm10 1544 vpxor 96(%rsi),%xmm2,%xmm2 1545 vpxor 112(%rsi),%xmm9,%xmm9 1546 leaq 128(%rsi),%rsi 1547 vpxor 0(%rsi),%xmm11,%xmm11 1548 vpxor 16(%rsi),%xmm3,%xmm3 1549 vpxor 32(%rsi),%xmm14,%xmm14 1550 vpxor 48(%rsi),%xmm7,%xmm7 1551 1552 vmovdqu %xmm6,0(%rdi) 1553 vmovdqu %xmm1,16(%rdi) 1554 vmovdqu %xmm13,32(%rdi) 1555 vmovdqu %xmm5,48(%rdi) 1556 vmovdqu %xmm15,64(%rdi) 1557 vmovdqu %xmm10,80(%rdi) 1558 vmovdqu %xmm2,96(%rdi) 1559 vmovdqu %xmm9,112(%rdi) 1560 leaq 128(%rdi),%rdi 1561 vmovdqu %xmm11,0(%rdi) 1562 vmovdqu %xmm3,16(%rdi) 1563 vmovdqu %xmm14,32(%rdi) 1564 vmovdqu %xmm7,48(%rdi) 1565 je .Ldone4xop 1566 1567 leaq 64(%rsi),%rsi 1568 vmovdqa %xmm8,0(%rsp) 1569 xorq %r10,%r10 1570 vmovdqa %xmm0,16(%rsp) 1571 leaq 64(%rdi),%rdi 1572 vmovdqa %xmm12,32(%rsp) 1573 subq $192,%rdx 1574 vmovdqa %xmm4,48(%rsp) 1575 1576.Loop_tail4xop: 1577 movzbl (%rsi,%r10,1),%eax 1578 movzbl (%rsp,%r10,1),%ecx 1579 leaq 1(%r10),%r10 1580 xorl %ecx,%eax 1581 movb %al,-1(%rdi,%r10,1) 1582 decq %rdx 1583 jnz .Loop_tail4xop 1584 1585.Ldone4xop: 1586 vzeroupper 1587 leaq (%r9),%rsp 1588.cfi_def_cfa_register %rsp 1589.L4xop_epilogue: 1590 .byte 0xf3,0xc3 1591.cfi_endproc 1592.size ChaCha20_4xop,.-ChaCha20_4xop 1593.type ChaCha20_8x,@function 1594.align 32 1595ChaCha20_8x: 1596.cfi_startproc 1597.LChaCha20_8x: 1598 movq %rsp,%r9 1599.cfi_def_cfa_register %r9 1600 subq $0x280+8,%rsp 1601 andq $-32,%rsp 1602 vzeroupper 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 vbroadcasti128 .Lsigma(%rip),%ymm11 1614 vbroadcasti128 (%rcx),%ymm3 1615 vbroadcasti128 16(%rcx),%ymm15 1616 vbroadcasti128 (%r8),%ymm7 1617 leaq 256(%rsp),%rcx 1618 leaq 512(%rsp),%rax 1619 leaq .Lrot16(%rip),%r10 1620 leaq .Lrot24(%rip),%r11 1621 1622 vpshufd $0x00,%ymm11,%ymm8 1623 vpshufd $0x55,%ymm11,%ymm9 1624 vmovdqa %ymm8,128-256(%rcx) 1625 vpshufd $0xaa,%ymm11,%ymm10 1626 vmovdqa %ymm9,160-256(%rcx) 1627 vpshufd $0xff,%ymm11,%ymm11 1628 vmovdqa %ymm10,192-256(%rcx) 1629 vmovdqa %ymm11,224-256(%rcx) 1630 1631 vpshufd $0x00,%ymm3,%ymm0 1632 vpshufd $0x55,%ymm3,%ymm1 1633 vmovdqa %ymm0,256-256(%rcx) 1634 vpshufd $0xaa,%ymm3,%ymm2 1635 vmovdqa %ymm1,288-256(%rcx) 1636 vpshufd $0xff,%ymm3,%ymm3 1637 vmovdqa %ymm2,320-256(%rcx) 1638 vmovdqa %ymm3,352-256(%rcx) 1639 1640 vpshufd $0x00,%ymm15,%ymm12 1641 vpshufd $0x55,%ymm15,%ymm13 1642 vmovdqa %ymm12,384-512(%rax) 1643 vpshufd $0xaa,%ymm15,%ymm14 1644 vmovdqa %ymm13,416-512(%rax) 1645 vpshufd $0xff,%ymm15,%ymm15 1646 vmovdqa %ymm14,448-512(%rax) 1647 vmovdqa %ymm15,480-512(%rax) 1648 1649 vpshufd $0x00,%ymm7,%ymm4 1650 vpshufd $0x55,%ymm7,%ymm5 1651 vpaddd .Lincy(%rip),%ymm4,%ymm4 1652 vpshufd $0xaa,%ymm7,%ymm6 1653 vmovdqa %ymm5,544-512(%rax) 1654 vpshufd $0xff,%ymm7,%ymm7 1655 vmovdqa %ymm6,576-512(%rax) 1656 vmovdqa %ymm7,608-512(%rax) 1657 1658 jmp .Loop_enter8x 1659 1660.align 32 1661.Loop_outer8x: 1662 vmovdqa 128-256(%rcx),%ymm8 1663 vmovdqa 160-256(%rcx),%ymm9 1664 vmovdqa 192-256(%rcx),%ymm10 1665 vmovdqa 224-256(%rcx),%ymm11 1666 vmovdqa 256-256(%rcx),%ymm0 1667 vmovdqa 288-256(%rcx),%ymm1 1668 vmovdqa 320-256(%rcx),%ymm2 1669 vmovdqa 352-256(%rcx),%ymm3 1670 vmovdqa 384-512(%rax),%ymm12 1671 vmovdqa 416-512(%rax),%ymm13 1672 vmovdqa 448-512(%rax),%ymm14 1673 vmovdqa 480-512(%rax),%ymm15 1674 vmovdqa 512-512(%rax),%ymm4 1675 vmovdqa 544-512(%rax),%ymm5 1676 vmovdqa 576-512(%rax),%ymm6 1677 vmovdqa 608-512(%rax),%ymm7 1678 vpaddd .Leight(%rip),%ymm4,%ymm4 1679 1680.Loop_enter8x: 1681 vmovdqa %ymm14,64(%rsp) 1682 vmovdqa %ymm15,96(%rsp) 1683 vbroadcasti128 (%r10),%ymm15 1684 vmovdqa %ymm4,512-512(%rax) 1685 movl $10,%eax 1686 jmp .Loop8x 1687 1688.align 32 1689.Loop8x: 1690 vpaddd %ymm0,%ymm8,%ymm8 1691 vpxor %ymm4,%ymm8,%ymm4 1692 vpshufb %ymm15,%ymm4,%ymm4 1693 vpaddd %ymm1,%ymm9,%ymm9 1694 vpxor %ymm5,%ymm9,%ymm5 1695 vpshufb %ymm15,%ymm5,%ymm5 1696 vpaddd %ymm4,%ymm12,%ymm12 1697 vpxor %ymm0,%ymm12,%ymm0 1698 vpslld $12,%ymm0,%ymm14 1699 vpsrld $20,%ymm0,%ymm0 1700 vpor %ymm0,%ymm14,%ymm0 1701 vbroadcasti128 (%r11),%ymm14 1702 vpaddd %ymm5,%ymm13,%ymm13 1703 vpxor %ymm1,%ymm13,%ymm1 1704 vpslld $12,%ymm1,%ymm15 1705 vpsrld $20,%ymm1,%ymm1 1706 vpor %ymm1,%ymm15,%ymm1 1707 vpaddd %ymm0,%ymm8,%ymm8 1708 vpxor %ymm4,%ymm8,%ymm4 1709 vpshufb %ymm14,%ymm4,%ymm4 1710 vpaddd %ymm1,%ymm9,%ymm9 1711 vpxor %ymm5,%ymm9,%ymm5 1712 vpshufb %ymm14,%ymm5,%ymm5 1713 vpaddd %ymm4,%ymm12,%ymm12 1714 vpxor %ymm0,%ymm12,%ymm0 1715 vpslld $7,%ymm0,%ymm15 1716 vpsrld $25,%ymm0,%ymm0 1717 vpor %ymm0,%ymm15,%ymm0 1718 vbroadcasti128 (%r10),%ymm15 1719 vpaddd %ymm5,%ymm13,%ymm13 1720 vpxor %ymm1,%ymm13,%ymm1 1721 vpslld $7,%ymm1,%ymm14 1722 vpsrld $25,%ymm1,%ymm1 1723 vpor %ymm1,%ymm14,%ymm1 1724 vmovdqa %ymm12,0(%rsp) 1725 vmovdqa %ymm13,32(%rsp) 1726 vmovdqa 64(%rsp),%ymm12 1727 vmovdqa 96(%rsp),%ymm13 1728 vpaddd %ymm2,%ymm10,%ymm10 1729 vpxor %ymm6,%ymm10,%ymm6 1730 vpshufb %ymm15,%ymm6,%ymm6 1731 vpaddd %ymm3,%ymm11,%ymm11 1732 vpxor %ymm7,%ymm11,%ymm7 1733 vpshufb %ymm15,%ymm7,%ymm7 1734 vpaddd %ymm6,%ymm12,%ymm12 1735 vpxor %ymm2,%ymm12,%ymm2 1736 vpslld $12,%ymm2,%ymm14 1737 vpsrld $20,%ymm2,%ymm2 1738 vpor %ymm2,%ymm14,%ymm2 1739 vbroadcasti128 (%r11),%ymm14 1740 vpaddd %ymm7,%ymm13,%ymm13 1741 vpxor %ymm3,%ymm13,%ymm3 1742 vpslld $12,%ymm3,%ymm15 1743 vpsrld $20,%ymm3,%ymm3 1744 vpor %ymm3,%ymm15,%ymm3 1745 vpaddd %ymm2,%ymm10,%ymm10 1746 vpxor %ymm6,%ymm10,%ymm6 1747 vpshufb %ymm14,%ymm6,%ymm6 1748 vpaddd %ymm3,%ymm11,%ymm11 1749 vpxor %ymm7,%ymm11,%ymm7 1750 vpshufb %ymm14,%ymm7,%ymm7 1751 vpaddd %ymm6,%ymm12,%ymm12 1752 vpxor %ymm2,%ymm12,%ymm2 1753 vpslld $7,%ymm2,%ymm15 1754 vpsrld $25,%ymm2,%ymm2 1755 vpor %ymm2,%ymm15,%ymm2 1756 vbroadcasti128 (%r10),%ymm15 1757 vpaddd %ymm7,%ymm13,%ymm13 1758 vpxor %ymm3,%ymm13,%ymm3 1759 vpslld $7,%ymm3,%ymm14 1760 vpsrld $25,%ymm3,%ymm3 1761 vpor %ymm3,%ymm14,%ymm3 1762 vpaddd %ymm1,%ymm8,%ymm8 1763 vpxor %ymm7,%ymm8,%ymm7 1764 vpshufb %ymm15,%ymm7,%ymm7 1765 vpaddd %ymm2,%ymm9,%ymm9 1766 vpxor %ymm4,%ymm9,%ymm4 1767 vpshufb %ymm15,%ymm4,%ymm4 1768 vpaddd %ymm7,%ymm12,%ymm12 1769 vpxor %ymm1,%ymm12,%ymm1 1770 vpslld $12,%ymm1,%ymm14 1771 vpsrld $20,%ymm1,%ymm1 1772 vpor %ymm1,%ymm14,%ymm1 1773 vbroadcasti128 (%r11),%ymm14 1774 vpaddd %ymm4,%ymm13,%ymm13 1775 vpxor %ymm2,%ymm13,%ymm2 1776 vpslld $12,%ymm2,%ymm15 1777 vpsrld $20,%ymm2,%ymm2 1778 vpor %ymm2,%ymm15,%ymm2 1779 vpaddd %ymm1,%ymm8,%ymm8 1780 vpxor %ymm7,%ymm8,%ymm7 1781 vpshufb %ymm14,%ymm7,%ymm7 1782 vpaddd %ymm2,%ymm9,%ymm9 1783 vpxor %ymm4,%ymm9,%ymm4 1784 vpshufb %ymm14,%ymm4,%ymm4 1785 vpaddd %ymm7,%ymm12,%ymm12 1786 vpxor %ymm1,%ymm12,%ymm1 1787 vpslld $7,%ymm1,%ymm15 1788 vpsrld $25,%ymm1,%ymm1 1789 vpor %ymm1,%ymm15,%ymm1 1790 vbroadcasti128 (%r10),%ymm15 1791 vpaddd %ymm4,%ymm13,%ymm13 1792 vpxor %ymm2,%ymm13,%ymm2 1793 vpslld $7,%ymm2,%ymm14 1794 vpsrld $25,%ymm2,%ymm2 1795 vpor %ymm2,%ymm14,%ymm2 1796 vmovdqa %ymm12,64(%rsp) 1797 vmovdqa %ymm13,96(%rsp) 1798 vmovdqa 0(%rsp),%ymm12 1799 vmovdqa 32(%rsp),%ymm13 1800 vpaddd %ymm3,%ymm10,%ymm10 1801 vpxor %ymm5,%ymm10,%ymm5 1802 vpshufb %ymm15,%ymm5,%ymm5 1803 vpaddd %ymm0,%ymm11,%ymm11 1804 vpxor %ymm6,%ymm11,%ymm6 1805 vpshufb %ymm15,%ymm6,%ymm6 1806 vpaddd %ymm5,%ymm12,%ymm12 1807 vpxor %ymm3,%ymm12,%ymm3 1808 vpslld $12,%ymm3,%ymm14 1809 vpsrld $20,%ymm3,%ymm3 1810 vpor %ymm3,%ymm14,%ymm3 1811 vbroadcasti128 (%r11),%ymm14 1812 vpaddd %ymm6,%ymm13,%ymm13 1813 vpxor %ymm0,%ymm13,%ymm0 1814 vpslld $12,%ymm0,%ymm15 1815 vpsrld $20,%ymm0,%ymm0 1816 vpor %ymm0,%ymm15,%ymm0 1817 vpaddd %ymm3,%ymm10,%ymm10 1818 vpxor %ymm5,%ymm10,%ymm5 1819 vpshufb %ymm14,%ymm5,%ymm5 1820 vpaddd %ymm0,%ymm11,%ymm11 1821 vpxor %ymm6,%ymm11,%ymm6 1822 vpshufb %ymm14,%ymm6,%ymm6 1823 vpaddd %ymm5,%ymm12,%ymm12 1824 vpxor %ymm3,%ymm12,%ymm3 1825 vpslld $7,%ymm3,%ymm15 1826 vpsrld $25,%ymm3,%ymm3 1827 vpor %ymm3,%ymm15,%ymm3 1828 vbroadcasti128 (%r10),%ymm15 1829 vpaddd %ymm6,%ymm13,%ymm13 1830 vpxor %ymm0,%ymm13,%ymm0 1831 vpslld $7,%ymm0,%ymm14 1832 vpsrld $25,%ymm0,%ymm0 1833 vpor %ymm0,%ymm14,%ymm0 1834 decl %eax 1835 jnz .Loop8x 1836 1837 leaq 512(%rsp),%rax 1838 vpaddd 128-256(%rcx),%ymm8,%ymm8 1839 vpaddd 160-256(%rcx),%ymm9,%ymm9 1840 vpaddd 192-256(%rcx),%ymm10,%ymm10 1841 vpaddd 224-256(%rcx),%ymm11,%ymm11 1842 1843 vpunpckldq %ymm9,%ymm8,%ymm14 1844 vpunpckldq %ymm11,%ymm10,%ymm15 1845 vpunpckhdq %ymm9,%ymm8,%ymm8 1846 vpunpckhdq %ymm11,%ymm10,%ymm10 1847 vpunpcklqdq %ymm15,%ymm14,%ymm9 1848 vpunpckhqdq %ymm15,%ymm14,%ymm14 1849 vpunpcklqdq %ymm10,%ymm8,%ymm11 1850 vpunpckhqdq %ymm10,%ymm8,%ymm8 1851 vpaddd 256-256(%rcx),%ymm0,%ymm0 1852 vpaddd 288-256(%rcx),%ymm1,%ymm1 1853 vpaddd 320-256(%rcx),%ymm2,%ymm2 1854 vpaddd 352-256(%rcx),%ymm3,%ymm3 1855 1856 vpunpckldq %ymm1,%ymm0,%ymm10 1857 vpunpckldq %ymm3,%ymm2,%ymm15 1858 vpunpckhdq %ymm1,%ymm0,%ymm0 1859 vpunpckhdq %ymm3,%ymm2,%ymm2 1860 vpunpcklqdq %ymm15,%ymm10,%ymm1 1861 vpunpckhqdq %ymm15,%ymm10,%ymm10 1862 vpunpcklqdq %ymm2,%ymm0,%ymm3 1863 vpunpckhqdq %ymm2,%ymm0,%ymm0 1864 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1865 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1866 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1867 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1868 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1869 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1870 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1871 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1872 vmovdqa %ymm15,0(%rsp) 1873 vmovdqa %ymm9,32(%rsp) 1874 vmovdqa 64(%rsp),%ymm15 1875 vmovdqa 96(%rsp),%ymm9 1876 1877 vpaddd 384-512(%rax),%ymm12,%ymm12 1878 vpaddd 416-512(%rax),%ymm13,%ymm13 1879 vpaddd 448-512(%rax),%ymm15,%ymm15 1880 vpaddd 480-512(%rax),%ymm9,%ymm9 1881 1882 vpunpckldq %ymm13,%ymm12,%ymm2 1883 vpunpckldq %ymm9,%ymm15,%ymm8 1884 vpunpckhdq %ymm13,%ymm12,%ymm12 1885 vpunpckhdq %ymm9,%ymm15,%ymm15 1886 vpunpcklqdq %ymm8,%ymm2,%ymm13 1887 vpunpckhqdq %ymm8,%ymm2,%ymm2 1888 vpunpcklqdq %ymm15,%ymm12,%ymm9 1889 vpunpckhqdq %ymm15,%ymm12,%ymm12 1890 vpaddd 512-512(%rax),%ymm4,%ymm4 1891 vpaddd 544-512(%rax),%ymm5,%ymm5 1892 vpaddd 576-512(%rax),%ymm6,%ymm6 1893 vpaddd 608-512(%rax),%ymm7,%ymm7 1894 1895 vpunpckldq %ymm5,%ymm4,%ymm15 1896 vpunpckldq %ymm7,%ymm6,%ymm8 1897 vpunpckhdq %ymm5,%ymm4,%ymm4 1898 vpunpckhdq %ymm7,%ymm6,%ymm6 1899 vpunpcklqdq %ymm8,%ymm15,%ymm5 1900 vpunpckhqdq %ymm8,%ymm15,%ymm15 1901 vpunpcklqdq %ymm6,%ymm4,%ymm7 1902 vpunpckhqdq %ymm6,%ymm4,%ymm4 1903 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1904 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1905 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1906 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1907 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1908 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1909 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1910 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1911 vmovdqa 0(%rsp),%ymm6 1912 vmovdqa 32(%rsp),%ymm12 1913 1914 cmpq $512,%rdx 1915 jb .Ltail8x 1916 1917 vpxor 0(%rsi),%ymm6,%ymm6 1918 vpxor 32(%rsi),%ymm8,%ymm8 1919 vpxor 64(%rsi),%ymm1,%ymm1 1920 vpxor 96(%rsi),%ymm5,%ymm5 1921 leaq 128(%rsi),%rsi 1922 vmovdqu %ymm6,0(%rdi) 1923 vmovdqu %ymm8,32(%rdi) 1924 vmovdqu %ymm1,64(%rdi) 1925 vmovdqu %ymm5,96(%rdi) 1926 leaq 128(%rdi),%rdi 1927 1928 vpxor 0(%rsi),%ymm12,%ymm12 1929 vpxor 32(%rsi),%ymm13,%ymm13 1930 vpxor 64(%rsi),%ymm10,%ymm10 1931 vpxor 96(%rsi),%ymm15,%ymm15 1932 leaq 128(%rsi),%rsi 1933 vmovdqu %ymm12,0(%rdi) 1934 vmovdqu %ymm13,32(%rdi) 1935 vmovdqu %ymm10,64(%rdi) 1936 vmovdqu %ymm15,96(%rdi) 1937 leaq 128(%rdi),%rdi 1938 1939 vpxor 0(%rsi),%ymm14,%ymm14 1940 vpxor 32(%rsi),%ymm2,%ymm2 1941 vpxor 64(%rsi),%ymm3,%ymm3 1942 vpxor 96(%rsi),%ymm7,%ymm7 1943 leaq 128(%rsi),%rsi 1944 vmovdqu %ymm14,0(%rdi) 1945 vmovdqu %ymm2,32(%rdi) 1946 vmovdqu %ymm3,64(%rdi) 1947 vmovdqu %ymm7,96(%rdi) 1948 leaq 128(%rdi),%rdi 1949 1950 vpxor 0(%rsi),%ymm11,%ymm11 1951 vpxor 32(%rsi),%ymm9,%ymm9 1952 vpxor 64(%rsi),%ymm0,%ymm0 1953 vpxor 96(%rsi),%ymm4,%ymm4 1954 leaq 128(%rsi),%rsi 1955 vmovdqu %ymm11,0(%rdi) 1956 vmovdqu %ymm9,32(%rdi) 1957 vmovdqu %ymm0,64(%rdi) 1958 vmovdqu %ymm4,96(%rdi) 1959 leaq 128(%rdi),%rdi 1960 1961 subq $512,%rdx 1962 jnz .Loop_outer8x 1963 1964 jmp .Ldone8x 1965 1966.Ltail8x: 1967 cmpq $448,%rdx 1968 jae .L448_or_more8x 1969 cmpq $384,%rdx 1970 jae .L384_or_more8x 1971 cmpq $320,%rdx 1972 jae .L320_or_more8x 1973 cmpq $256,%rdx 1974 jae .L256_or_more8x 1975 cmpq $192,%rdx 1976 jae .L192_or_more8x 1977 cmpq $128,%rdx 1978 jae .L128_or_more8x 1979 cmpq $64,%rdx 1980 jae .L64_or_more8x 1981 1982 xorq %r10,%r10 1983 vmovdqa %ymm6,0(%rsp) 1984 vmovdqa %ymm8,32(%rsp) 1985 jmp .Loop_tail8x 1986 1987.align 32 1988.L64_or_more8x: 1989 vpxor 0(%rsi),%ymm6,%ymm6 1990 vpxor 32(%rsi),%ymm8,%ymm8 1991 vmovdqu %ymm6,0(%rdi) 1992 vmovdqu %ymm8,32(%rdi) 1993 je .Ldone8x 1994 1995 leaq 64(%rsi),%rsi 1996 xorq %r10,%r10 1997 vmovdqa %ymm1,0(%rsp) 1998 leaq 64(%rdi),%rdi 1999 subq $64,%rdx 2000 vmovdqa %ymm5,32(%rsp) 2001 jmp .Loop_tail8x 2002 2003.align 32 2004.L128_or_more8x: 2005 vpxor 0(%rsi),%ymm6,%ymm6 2006 vpxor 32(%rsi),%ymm8,%ymm8 2007 vpxor 64(%rsi),%ymm1,%ymm1 2008 vpxor 96(%rsi),%ymm5,%ymm5 2009 vmovdqu %ymm6,0(%rdi) 2010 vmovdqu %ymm8,32(%rdi) 2011 vmovdqu %ymm1,64(%rdi) 2012 vmovdqu %ymm5,96(%rdi) 2013 je .Ldone8x 2014 2015 leaq 128(%rsi),%rsi 2016 xorq %r10,%r10 2017 vmovdqa %ymm12,0(%rsp) 2018 leaq 128(%rdi),%rdi 2019 subq $128,%rdx 2020 vmovdqa %ymm13,32(%rsp) 2021 jmp .Loop_tail8x 2022 2023.align 32 2024.L192_or_more8x: 2025 vpxor 0(%rsi),%ymm6,%ymm6 2026 vpxor 32(%rsi),%ymm8,%ymm8 2027 vpxor 64(%rsi),%ymm1,%ymm1 2028 vpxor 96(%rsi),%ymm5,%ymm5 2029 vpxor 128(%rsi),%ymm12,%ymm12 2030 vpxor 160(%rsi),%ymm13,%ymm13 2031 vmovdqu %ymm6,0(%rdi) 2032 vmovdqu %ymm8,32(%rdi) 2033 vmovdqu %ymm1,64(%rdi) 2034 vmovdqu %ymm5,96(%rdi) 2035 vmovdqu %ymm12,128(%rdi) 2036 vmovdqu %ymm13,160(%rdi) 2037 je .Ldone8x 2038 2039 leaq 192(%rsi),%rsi 2040 xorq %r10,%r10 2041 vmovdqa %ymm10,0(%rsp) 2042 leaq 192(%rdi),%rdi 2043 subq $192,%rdx 2044 vmovdqa %ymm15,32(%rsp) 2045 jmp .Loop_tail8x 2046 2047.align 32 2048.L256_or_more8x: 2049 vpxor 0(%rsi),%ymm6,%ymm6 2050 vpxor 32(%rsi),%ymm8,%ymm8 2051 vpxor 64(%rsi),%ymm1,%ymm1 2052 vpxor 96(%rsi),%ymm5,%ymm5 2053 vpxor 128(%rsi),%ymm12,%ymm12 2054 vpxor 160(%rsi),%ymm13,%ymm13 2055 vpxor 192(%rsi),%ymm10,%ymm10 2056 vpxor 224(%rsi),%ymm15,%ymm15 2057 vmovdqu %ymm6,0(%rdi) 2058 vmovdqu %ymm8,32(%rdi) 2059 vmovdqu %ymm1,64(%rdi) 2060 vmovdqu %ymm5,96(%rdi) 2061 vmovdqu %ymm12,128(%rdi) 2062 vmovdqu %ymm13,160(%rdi) 2063 vmovdqu %ymm10,192(%rdi) 2064 vmovdqu %ymm15,224(%rdi) 2065 je .Ldone8x 2066 2067 leaq 256(%rsi),%rsi 2068 xorq %r10,%r10 2069 vmovdqa %ymm14,0(%rsp) 2070 leaq 256(%rdi),%rdi 2071 subq $256,%rdx 2072 vmovdqa %ymm2,32(%rsp) 2073 jmp .Loop_tail8x 2074 2075.align 32 2076.L320_or_more8x: 2077 vpxor 0(%rsi),%ymm6,%ymm6 2078 vpxor 32(%rsi),%ymm8,%ymm8 2079 vpxor 64(%rsi),%ymm1,%ymm1 2080 vpxor 96(%rsi),%ymm5,%ymm5 2081 vpxor 128(%rsi),%ymm12,%ymm12 2082 vpxor 160(%rsi),%ymm13,%ymm13 2083 vpxor 192(%rsi),%ymm10,%ymm10 2084 vpxor 224(%rsi),%ymm15,%ymm15 2085 vpxor 256(%rsi),%ymm14,%ymm14 2086 vpxor 288(%rsi),%ymm2,%ymm2 2087 vmovdqu %ymm6,0(%rdi) 2088 vmovdqu %ymm8,32(%rdi) 2089 vmovdqu %ymm1,64(%rdi) 2090 vmovdqu %ymm5,96(%rdi) 2091 vmovdqu %ymm12,128(%rdi) 2092 vmovdqu %ymm13,160(%rdi) 2093 vmovdqu %ymm10,192(%rdi) 2094 vmovdqu %ymm15,224(%rdi) 2095 vmovdqu %ymm14,256(%rdi) 2096 vmovdqu %ymm2,288(%rdi) 2097 je .Ldone8x 2098 2099 leaq 320(%rsi),%rsi 2100 xorq %r10,%r10 2101 vmovdqa %ymm3,0(%rsp) 2102 leaq 320(%rdi),%rdi 2103 subq $320,%rdx 2104 vmovdqa %ymm7,32(%rsp) 2105 jmp .Loop_tail8x 2106 2107.align 32 2108.L384_or_more8x: 2109 vpxor 0(%rsi),%ymm6,%ymm6 2110 vpxor 32(%rsi),%ymm8,%ymm8 2111 vpxor 64(%rsi),%ymm1,%ymm1 2112 vpxor 96(%rsi),%ymm5,%ymm5 2113 vpxor 128(%rsi),%ymm12,%ymm12 2114 vpxor 160(%rsi),%ymm13,%ymm13 2115 vpxor 192(%rsi),%ymm10,%ymm10 2116 vpxor 224(%rsi),%ymm15,%ymm15 2117 vpxor 256(%rsi),%ymm14,%ymm14 2118 vpxor 288(%rsi),%ymm2,%ymm2 2119 vpxor 320(%rsi),%ymm3,%ymm3 2120 vpxor 352(%rsi),%ymm7,%ymm7 2121 vmovdqu %ymm6,0(%rdi) 2122 vmovdqu %ymm8,32(%rdi) 2123 vmovdqu %ymm1,64(%rdi) 2124 vmovdqu %ymm5,96(%rdi) 2125 vmovdqu %ymm12,128(%rdi) 2126 vmovdqu %ymm13,160(%rdi) 2127 vmovdqu %ymm10,192(%rdi) 2128 vmovdqu %ymm15,224(%rdi) 2129 vmovdqu %ymm14,256(%rdi) 2130 vmovdqu %ymm2,288(%rdi) 2131 vmovdqu %ymm3,320(%rdi) 2132 vmovdqu %ymm7,352(%rdi) 2133 je .Ldone8x 2134 2135 leaq 384(%rsi),%rsi 2136 xorq %r10,%r10 2137 vmovdqa %ymm11,0(%rsp) 2138 leaq 384(%rdi),%rdi 2139 subq $384,%rdx 2140 vmovdqa %ymm9,32(%rsp) 2141 jmp .Loop_tail8x 2142 2143.align 32 2144.L448_or_more8x: 2145 vpxor 0(%rsi),%ymm6,%ymm6 2146 vpxor 32(%rsi),%ymm8,%ymm8 2147 vpxor 64(%rsi),%ymm1,%ymm1 2148 vpxor 96(%rsi),%ymm5,%ymm5 2149 vpxor 128(%rsi),%ymm12,%ymm12 2150 vpxor 160(%rsi),%ymm13,%ymm13 2151 vpxor 192(%rsi),%ymm10,%ymm10 2152 vpxor 224(%rsi),%ymm15,%ymm15 2153 vpxor 256(%rsi),%ymm14,%ymm14 2154 vpxor 288(%rsi),%ymm2,%ymm2 2155 vpxor 320(%rsi),%ymm3,%ymm3 2156 vpxor 352(%rsi),%ymm7,%ymm7 2157 vpxor 384(%rsi),%ymm11,%ymm11 2158 vpxor 416(%rsi),%ymm9,%ymm9 2159 vmovdqu %ymm6,0(%rdi) 2160 vmovdqu %ymm8,32(%rdi) 2161 vmovdqu %ymm1,64(%rdi) 2162 vmovdqu %ymm5,96(%rdi) 2163 vmovdqu %ymm12,128(%rdi) 2164 vmovdqu %ymm13,160(%rdi) 2165 vmovdqu %ymm10,192(%rdi) 2166 vmovdqu %ymm15,224(%rdi) 2167 vmovdqu %ymm14,256(%rdi) 2168 vmovdqu %ymm2,288(%rdi) 2169 vmovdqu %ymm3,320(%rdi) 2170 vmovdqu %ymm7,352(%rdi) 2171 vmovdqu %ymm11,384(%rdi) 2172 vmovdqu %ymm9,416(%rdi) 2173 je .Ldone8x 2174 2175 leaq 448(%rsi),%rsi 2176 xorq %r10,%r10 2177 vmovdqa %ymm0,0(%rsp) 2178 leaq 448(%rdi),%rdi 2179 subq $448,%rdx 2180 vmovdqa %ymm4,32(%rsp) 2181 2182.Loop_tail8x: 2183 movzbl (%rsi,%r10,1),%eax 2184 movzbl (%rsp,%r10,1),%ecx 2185 leaq 1(%r10),%r10 2186 xorl %ecx,%eax 2187 movb %al,-1(%rdi,%r10,1) 2188 decq %rdx 2189 jnz .Loop_tail8x 2190 2191.Ldone8x: 2192 vzeroall 2193 leaq (%r9),%rsp 2194.cfi_def_cfa_register %rsp 2195.L8x_epilogue: 2196 .byte 0xf3,0xc3 2197.cfi_endproc 2198.size ChaCha20_8x,.-ChaCha20_8x 2199.type ChaCha20_avx512,@function 2200.align 32 2201ChaCha20_avx512: 2202.cfi_startproc 2203.LChaCha20_avx512: 2204 movq %rsp,%r9 2205.cfi_def_cfa_register %r9 2206 cmpq $512,%rdx 2207 ja .LChaCha20_16x 2208 2209 subq $64+8,%rsp 2210 vbroadcasti32x4 .Lsigma(%rip),%zmm0 2211 vbroadcasti32x4 (%rcx),%zmm1 2212 vbroadcasti32x4 16(%rcx),%zmm2 2213 vbroadcasti32x4 (%r8),%zmm3 2214 2215 vmovdqa32 %zmm0,%zmm16 2216 vmovdqa32 %zmm1,%zmm17 2217 vmovdqa32 %zmm2,%zmm18 2218 vpaddd .Lzeroz(%rip),%zmm3,%zmm3 2219 vmovdqa32 .Lfourz(%rip),%zmm20 2220 movq $10,%r8 2221 vmovdqa32 %zmm3,%zmm19 2222 jmp .Loop_avx512 2223 2224.align 16 2225.Loop_outer_avx512: 2226 vmovdqa32 %zmm16,%zmm0 2227 vmovdqa32 %zmm17,%zmm1 2228 vmovdqa32 %zmm18,%zmm2 2229 vpaddd %zmm20,%zmm19,%zmm3 2230 movq $10,%r8 2231 vmovdqa32 %zmm3,%zmm19 2232 jmp .Loop_avx512 2233 2234.align 32 2235.Loop_avx512: 2236 vpaddd %zmm1,%zmm0,%zmm0 2237 vpxord %zmm0,%zmm3,%zmm3 2238 vprold $16,%zmm3,%zmm3 2239 vpaddd %zmm3,%zmm2,%zmm2 2240 vpxord %zmm2,%zmm1,%zmm1 2241 vprold $12,%zmm1,%zmm1 2242 vpaddd %zmm1,%zmm0,%zmm0 2243 vpxord %zmm0,%zmm3,%zmm3 2244 vprold $8,%zmm3,%zmm3 2245 vpaddd %zmm3,%zmm2,%zmm2 2246 vpxord %zmm2,%zmm1,%zmm1 2247 vprold $7,%zmm1,%zmm1 2248 vpshufd $78,%zmm2,%zmm2 2249 vpshufd $57,%zmm1,%zmm1 2250 vpshufd $147,%zmm3,%zmm3 2251 vpaddd %zmm1,%zmm0,%zmm0 2252 vpxord %zmm0,%zmm3,%zmm3 2253 vprold $16,%zmm3,%zmm3 2254 vpaddd %zmm3,%zmm2,%zmm2 2255 vpxord %zmm2,%zmm1,%zmm1 2256 vprold $12,%zmm1,%zmm1 2257 vpaddd %zmm1,%zmm0,%zmm0 2258 vpxord %zmm0,%zmm3,%zmm3 2259 vprold $8,%zmm3,%zmm3 2260 vpaddd %zmm3,%zmm2,%zmm2 2261 vpxord %zmm2,%zmm1,%zmm1 2262 vprold $7,%zmm1,%zmm1 2263 vpshufd $78,%zmm2,%zmm2 2264 vpshufd $147,%zmm1,%zmm1 2265 vpshufd $57,%zmm3,%zmm3 2266 decq %r8 2267 jnz .Loop_avx512 2268 vpaddd %zmm16,%zmm0,%zmm0 2269 vpaddd %zmm17,%zmm1,%zmm1 2270 vpaddd %zmm18,%zmm2,%zmm2 2271 vpaddd %zmm19,%zmm3,%zmm3 2272 2273 subq $64,%rdx 2274 jb .Ltail64_avx512 2275 2276 vpxor 0(%rsi),%xmm0,%xmm4 2277 vpxor 16(%rsi),%xmm1,%xmm5 2278 vpxor 32(%rsi),%xmm2,%xmm6 2279 vpxor 48(%rsi),%xmm3,%xmm7 2280 leaq 64(%rsi),%rsi 2281 2282 vmovdqu %xmm4,0(%rdi) 2283 vmovdqu %xmm5,16(%rdi) 2284 vmovdqu %xmm6,32(%rdi) 2285 vmovdqu %xmm7,48(%rdi) 2286 leaq 64(%rdi),%rdi 2287 2288 jz .Ldone_avx512 2289 2290 vextracti32x4 $1,%zmm0,%xmm4 2291 vextracti32x4 $1,%zmm1,%xmm5 2292 vextracti32x4 $1,%zmm2,%xmm6 2293 vextracti32x4 $1,%zmm3,%xmm7 2294 2295 subq $64,%rdx 2296 jb .Ltail_avx512 2297 2298 vpxor 0(%rsi),%xmm4,%xmm4 2299 vpxor 16(%rsi),%xmm5,%xmm5 2300 vpxor 32(%rsi),%xmm6,%xmm6 2301 vpxor 48(%rsi),%xmm7,%xmm7 2302 leaq 64(%rsi),%rsi 2303 2304 vmovdqu %xmm4,0(%rdi) 2305 vmovdqu %xmm5,16(%rdi) 2306 vmovdqu %xmm6,32(%rdi) 2307 vmovdqu %xmm7,48(%rdi) 2308 leaq 64(%rdi),%rdi 2309 2310 jz .Ldone_avx512 2311 2312 vextracti32x4 $2,%zmm0,%xmm4 2313 vextracti32x4 $2,%zmm1,%xmm5 2314 vextracti32x4 $2,%zmm2,%xmm6 2315 vextracti32x4 $2,%zmm3,%xmm7 2316 2317 subq $64,%rdx 2318 jb .Ltail_avx512 2319 2320 vpxor 0(%rsi),%xmm4,%xmm4 2321 vpxor 16(%rsi),%xmm5,%xmm5 2322 vpxor 32(%rsi),%xmm6,%xmm6 2323 vpxor 48(%rsi),%xmm7,%xmm7 2324 leaq 64(%rsi),%rsi 2325 2326 vmovdqu %xmm4,0(%rdi) 2327 vmovdqu %xmm5,16(%rdi) 2328 vmovdqu %xmm6,32(%rdi) 2329 vmovdqu %xmm7,48(%rdi) 2330 leaq 64(%rdi),%rdi 2331 2332 jz .Ldone_avx512 2333 2334 vextracti32x4 $3,%zmm0,%xmm4 2335 vextracti32x4 $3,%zmm1,%xmm5 2336 vextracti32x4 $3,%zmm2,%xmm6 2337 vextracti32x4 $3,%zmm3,%xmm7 2338 2339 subq $64,%rdx 2340 jb .Ltail_avx512 2341 2342 vpxor 0(%rsi),%xmm4,%xmm4 2343 vpxor 16(%rsi),%xmm5,%xmm5 2344 vpxor 32(%rsi),%xmm6,%xmm6 2345 vpxor 48(%rsi),%xmm7,%xmm7 2346 leaq 64(%rsi),%rsi 2347 2348 vmovdqu %xmm4,0(%rdi) 2349 vmovdqu %xmm5,16(%rdi) 2350 vmovdqu %xmm6,32(%rdi) 2351 vmovdqu %xmm7,48(%rdi) 2352 leaq 64(%rdi),%rdi 2353 2354 jnz .Loop_outer_avx512 2355 2356 jmp .Ldone_avx512 2357 2358.align 16 2359.Ltail64_avx512: 2360 vmovdqa %xmm0,0(%rsp) 2361 vmovdqa %xmm1,16(%rsp) 2362 vmovdqa %xmm2,32(%rsp) 2363 vmovdqa %xmm3,48(%rsp) 2364 addq $64,%rdx 2365 jmp .Loop_tail_avx512 2366 2367.align 16 2368.Ltail_avx512: 2369 vmovdqa %xmm4,0(%rsp) 2370 vmovdqa %xmm5,16(%rsp) 2371 vmovdqa %xmm6,32(%rsp) 2372 vmovdqa %xmm7,48(%rsp) 2373 addq $64,%rdx 2374 2375.Loop_tail_avx512: 2376 movzbl (%rsi,%r8,1),%eax 2377 movzbl (%rsp,%r8,1),%ecx 2378 leaq 1(%r8),%r8 2379 xorl %ecx,%eax 2380 movb %al,-1(%rdi,%r8,1) 2381 decq %rdx 2382 jnz .Loop_tail_avx512 2383 2384 vmovdqu32 %zmm16,0(%rsp) 2385 2386.Ldone_avx512: 2387 vzeroall 2388 leaq (%r9),%rsp 2389.cfi_def_cfa_register %rsp 2390.Lavx512_epilogue: 2391 .byte 0xf3,0xc3 2392.cfi_endproc 2393.size ChaCha20_avx512,.-ChaCha20_avx512 2394.type ChaCha20_avx512vl,@function 2395.align 32 2396ChaCha20_avx512vl: 2397.cfi_startproc 2398.LChaCha20_avx512vl: 2399 movq %rsp,%r9 2400.cfi_def_cfa_register %r9 2401 cmpq $128,%rdx 2402 ja .LChaCha20_8xvl 2403 2404 subq $64+8,%rsp 2405 vbroadcasti128 .Lsigma(%rip),%ymm0 2406 vbroadcasti128 (%rcx),%ymm1 2407 vbroadcasti128 16(%rcx),%ymm2 2408 vbroadcasti128 (%r8),%ymm3 2409 2410 vmovdqa32 %ymm0,%ymm16 2411 vmovdqa32 %ymm1,%ymm17 2412 vmovdqa32 %ymm2,%ymm18 2413 vpaddd .Lzeroz(%rip),%ymm3,%ymm3 2414 vmovdqa32 .Ltwoy(%rip),%ymm20 2415 movq $10,%r8 2416 vmovdqa32 %ymm3,%ymm19 2417 jmp .Loop_avx512vl 2418 2419.align 16 2420.Loop_outer_avx512vl: 2421 vmovdqa32 %ymm18,%ymm2 2422 vpaddd %ymm20,%ymm19,%ymm3 2423 movq $10,%r8 2424 vmovdqa32 %ymm3,%ymm19 2425 jmp .Loop_avx512vl 2426 2427.align 32 2428.Loop_avx512vl: 2429 vpaddd %ymm1,%ymm0,%ymm0 2430 vpxor %ymm0,%ymm3,%ymm3 2431 vprold $16,%ymm3,%ymm3 2432 vpaddd %ymm3,%ymm2,%ymm2 2433 vpxor %ymm2,%ymm1,%ymm1 2434 vprold $12,%ymm1,%ymm1 2435 vpaddd %ymm1,%ymm0,%ymm0 2436 vpxor %ymm0,%ymm3,%ymm3 2437 vprold $8,%ymm3,%ymm3 2438 vpaddd %ymm3,%ymm2,%ymm2 2439 vpxor %ymm2,%ymm1,%ymm1 2440 vprold $7,%ymm1,%ymm1 2441 vpshufd $78,%ymm2,%ymm2 2442 vpshufd $57,%ymm1,%ymm1 2443 vpshufd $147,%ymm3,%ymm3 2444 vpaddd %ymm1,%ymm0,%ymm0 2445 vpxor %ymm0,%ymm3,%ymm3 2446 vprold $16,%ymm3,%ymm3 2447 vpaddd %ymm3,%ymm2,%ymm2 2448 vpxor %ymm2,%ymm1,%ymm1 2449 vprold $12,%ymm1,%ymm1 2450 vpaddd %ymm1,%ymm0,%ymm0 2451 vpxor %ymm0,%ymm3,%ymm3 2452 vprold $8,%ymm3,%ymm3 2453 vpaddd %ymm3,%ymm2,%ymm2 2454 vpxor %ymm2,%ymm1,%ymm1 2455 vprold $7,%ymm1,%ymm1 2456 vpshufd $78,%ymm2,%ymm2 2457 vpshufd $147,%ymm1,%ymm1 2458 vpshufd $57,%ymm3,%ymm3 2459 decq %r8 2460 jnz .Loop_avx512vl 2461 vpaddd %ymm16,%ymm0,%ymm0 2462 vpaddd %ymm17,%ymm1,%ymm1 2463 vpaddd %ymm18,%ymm2,%ymm2 2464 vpaddd %ymm19,%ymm3,%ymm3 2465 2466 subq $64,%rdx 2467 jb .Ltail64_avx512vl 2468 2469 vpxor 0(%rsi),%xmm0,%xmm4 2470 vpxor 16(%rsi),%xmm1,%xmm5 2471 vpxor 32(%rsi),%xmm2,%xmm6 2472 vpxor 48(%rsi),%xmm3,%xmm7 2473 leaq 64(%rsi),%rsi 2474 2475 vmovdqu %xmm4,0(%rdi) 2476 vmovdqu %xmm5,16(%rdi) 2477 vmovdqu %xmm6,32(%rdi) 2478 vmovdqu %xmm7,48(%rdi) 2479 leaq 64(%rdi),%rdi 2480 2481 jz .Ldone_avx512vl 2482 2483 vextracti128 $1,%ymm0,%xmm4 2484 vextracti128 $1,%ymm1,%xmm5 2485 vextracti128 $1,%ymm2,%xmm6 2486 vextracti128 $1,%ymm3,%xmm7 2487 2488 subq $64,%rdx 2489 jb .Ltail_avx512vl 2490 2491 vpxor 0(%rsi),%xmm4,%xmm4 2492 vpxor 16(%rsi),%xmm5,%xmm5 2493 vpxor 32(%rsi),%xmm6,%xmm6 2494 vpxor 48(%rsi),%xmm7,%xmm7 2495 leaq 64(%rsi),%rsi 2496 2497 vmovdqu %xmm4,0(%rdi) 2498 vmovdqu %xmm5,16(%rdi) 2499 vmovdqu %xmm6,32(%rdi) 2500 vmovdqu %xmm7,48(%rdi) 2501 leaq 64(%rdi),%rdi 2502 2503 vmovdqa32 %ymm16,%ymm0 2504 vmovdqa32 %ymm17,%ymm1 2505 jnz .Loop_outer_avx512vl 2506 2507 jmp .Ldone_avx512vl 2508 2509.align 16 2510.Ltail64_avx512vl: 2511 vmovdqa %xmm0,0(%rsp) 2512 vmovdqa %xmm1,16(%rsp) 2513 vmovdqa %xmm2,32(%rsp) 2514 vmovdqa %xmm3,48(%rsp) 2515 addq $64,%rdx 2516 jmp .Loop_tail_avx512vl 2517 2518.align 16 2519.Ltail_avx512vl: 2520 vmovdqa %xmm4,0(%rsp) 2521 vmovdqa %xmm5,16(%rsp) 2522 vmovdqa %xmm6,32(%rsp) 2523 vmovdqa %xmm7,48(%rsp) 2524 addq $64,%rdx 2525 2526.Loop_tail_avx512vl: 2527 movzbl (%rsi,%r8,1),%eax 2528 movzbl (%rsp,%r8,1),%ecx 2529 leaq 1(%r8),%r8 2530 xorl %ecx,%eax 2531 movb %al,-1(%rdi,%r8,1) 2532 decq %rdx 2533 jnz .Loop_tail_avx512vl 2534 2535 vmovdqu32 %ymm16,0(%rsp) 2536 vmovdqu32 %ymm16,32(%rsp) 2537 2538.Ldone_avx512vl: 2539 vzeroall 2540 leaq (%r9),%rsp 2541.cfi_def_cfa_register %rsp 2542.Lavx512vl_epilogue: 2543 .byte 0xf3,0xc3 2544.cfi_endproc 2545.size ChaCha20_avx512vl,.-ChaCha20_avx512vl 2546.type ChaCha20_16x,@function 2547.align 32 2548ChaCha20_16x: 2549.cfi_startproc 2550.LChaCha20_16x: 2551 movq %rsp,%r9 2552.cfi_def_cfa_register %r9 2553 subq $64+8,%rsp 2554 andq $-64,%rsp 2555 vzeroupper 2556 2557 leaq .Lsigma(%rip),%r10 2558 vbroadcasti32x4 (%r10),%zmm3 2559 vbroadcasti32x4 (%rcx),%zmm7 2560 vbroadcasti32x4 16(%rcx),%zmm11 2561 vbroadcasti32x4 (%r8),%zmm15 2562 2563 vpshufd $0x00,%zmm3,%zmm0 2564 vpshufd $0x55,%zmm3,%zmm1 2565 vpshufd $0xaa,%zmm3,%zmm2 2566 vpshufd $0xff,%zmm3,%zmm3 2567 vmovdqa64 %zmm0,%zmm16 2568 vmovdqa64 %zmm1,%zmm17 2569 vmovdqa64 %zmm2,%zmm18 2570 vmovdqa64 %zmm3,%zmm19 2571 2572 vpshufd $0x00,%zmm7,%zmm4 2573 vpshufd $0x55,%zmm7,%zmm5 2574 vpshufd $0xaa,%zmm7,%zmm6 2575 vpshufd $0xff,%zmm7,%zmm7 2576 vmovdqa64 %zmm4,%zmm20 2577 vmovdqa64 %zmm5,%zmm21 2578 vmovdqa64 %zmm6,%zmm22 2579 vmovdqa64 %zmm7,%zmm23 2580 2581 vpshufd $0x00,%zmm11,%zmm8 2582 vpshufd $0x55,%zmm11,%zmm9 2583 vpshufd $0xaa,%zmm11,%zmm10 2584 vpshufd $0xff,%zmm11,%zmm11 2585 vmovdqa64 %zmm8,%zmm24 2586 vmovdqa64 %zmm9,%zmm25 2587 vmovdqa64 %zmm10,%zmm26 2588 vmovdqa64 %zmm11,%zmm27 2589 2590 vpshufd $0x00,%zmm15,%zmm12 2591 vpshufd $0x55,%zmm15,%zmm13 2592 vpshufd $0xaa,%zmm15,%zmm14 2593 vpshufd $0xff,%zmm15,%zmm15 2594 vpaddd .Lincz(%rip),%zmm12,%zmm12 2595 vmovdqa64 %zmm12,%zmm28 2596 vmovdqa64 %zmm13,%zmm29 2597 vmovdqa64 %zmm14,%zmm30 2598 vmovdqa64 %zmm15,%zmm31 2599 2600 movl $10,%eax 2601 jmp .Loop16x 2602 2603.align 32 2604.Loop_outer16x: 2605 vpbroadcastd 0(%r10),%zmm0 2606 vpbroadcastd 4(%r10),%zmm1 2607 vpbroadcastd 8(%r10),%zmm2 2608 vpbroadcastd 12(%r10),%zmm3 2609 vpaddd .Lsixteen(%rip),%zmm28,%zmm28 2610 vmovdqa64 %zmm20,%zmm4 2611 vmovdqa64 %zmm21,%zmm5 2612 vmovdqa64 %zmm22,%zmm6 2613 vmovdqa64 %zmm23,%zmm7 2614 vmovdqa64 %zmm24,%zmm8 2615 vmovdqa64 %zmm25,%zmm9 2616 vmovdqa64 %zmm26,%zmm10 2617 vmovdqa64 %zmm27,%zmm11 2618 vmovdqa64 %zmm28,%zmm12 2619 vmovdqa64 %zmm29,%zmm13 2620 vmovdqa64 %zmm30,%zmm14 2621 vmovdqa64 %zmm31,%zmm15 2622 2623 vmovdqa64 %zmm0,%zmm16 2624 vmovdqa64 %zmm1,%zmm17 2625 vmovdqa64 %zmm2,%zmm18 2626 vmovdqa64 %zmm3,%zmm19 2627 2628 movl $10,%eax 2629 jmp .Loop16x 2630 2631.align 32 2632.Loop16x: 2633 vpaddd %zmm4,%zmm0,%zmm0 2634 vpaddd %zmm5,%zmm1,%zmm1 2635 vpaddd %zmm6,%zmm2,%zmm2 2636 vpaddd %zmm7,%zmm3,%zmm3 2637 vpxord %zmm0,%zmm12,%zmm12 2638 vpxord %zmm1,%zmm13,%zmm13 2639 vpxord %zmm2,%zmm14,%zmm14 2640 vpxord %zmm3,%zmm15,%zmm15 2641 vprold $16,%zmm12,%zmm12 2642 vprold $16,%zmm13,%zmm13 2643 vprold $16,%zmm14,%zmm14 2644 vprold $16,%zmm15,%zmm15 2645 vpaddd %zmm12,%zmm8,%zmm8 2646 vpaddd %zmm13,%zmm9,%zmm9 2647 vpaddd %zmm14,%zmm10,%zmm10 2648 vpaddd %zmm15,%zmm11,%zmm11 2649 vpxord %zmm8,%zmm4,%zmm4 2650 vpxord %zmm9,%zmm5,%zmm5 2651 vpxord %zmm10,%zmm6,%zmm6 2652 vpxord %zmm11,%zmm7,%zmm7 2653 vprold $12,%zmm4,%zmm4 2654 vprold $12,%zmm5,%zmm5 2655 vprold $12,%zmm6,%zmm6 2656 vprold $12,%zmm7,%zmm7 2657 vpaddd %zmm4,%zmm0,%zmm0 2658 vpaddd %zmm5,%zmm1,%zmm1 2659 vpaddd %zmm6,%zmm2,%zmm2 2660 vpaddd %zmm7,%zmm3,%zmm3 2661 vpxord %zmm0,%zmm12,%zmm12 2662 vpxord %zmm1,%zmm13,%zmm13 2663 vpxord %zmm2,%zmm14,%zmm14 2664 vpxord %zmm3,%zmm15,%zmm15 2665 vprold $8,%zmm12,%zmm12 2666 vprold $8,%zmm13,%zmm13 2667 vprold $8,%zmm14,%zmm14 2668 vprold $8,%zmm15,%zmm15 2669 vpaddd %zmm12,%zmm8,%zmm8 2670 vpaddd %zmm13,%zmm9,%zmm9 2671 vpaddd %zmm14,%zmm10,%zmm10 2672 vpaddd %zmm15,%zmm11,%zmm11 2673 vpxord %zmm8,%zmm4,%zmm4 2674 vpxord %zmm9,%zmm5,%zmm5 2675 vpxord %zmm10,%zmm6,%zmm6 2676 vpxord %zmm11,%zmm7,%zmm7 2677 vprold $7,%zmm4,%zmm4 2678 vprold $7,%zmm5,%zmm5 2679 vprold $7,%zmm6,%zmm6 2680 vprold $7,%zmm7,%zmm7 2681 vpaddd %zmm5,%zmm0,%zmm0 2682 vpaddd %zmm6,%zmm1,%zmm1 2683 vpaddd %zmm7,%zmm2,%zmm2 2684 vpaddd %zmm4,%zmm3,%zmm3 2685 vpxord %zmm0,%zmm15,%zmm15 2686 vpxord %zmm1,%zmm12,%zmm12 2687 vpxord %zmm2,%zmm13,%zmm13 2688 vpxord %zmm3,%zmm14,%zmm14 2689 vprold $16,%zmm15,%zmm15 2690 vprold $16,%zmm12,%zmm12 2691 vprold $16,%zmm13,%zmm13 2692 vprold $16,%zmm14,%zmm14 2693 vpaddd %zmm15,%zmm10,%zmm10 2694 vpaddd %zmm12,%zmm11,%zmm11 2695 vpaddd %zmm13,%zmm8,%zmm8 2696 vpaddd %zmm14,%zmm9,%zmm9 2697 vpxord %zmm10,%zmm5,%zmm5 2698 vpxord %zmm11,%zmm6,%zmm6 2699 vpxord %zmm8,%zmm7,%zmm7 2700 vpxord %zmm9,%zmm4,%zmm4 2701 vprold $12,%zmm5,%zmm5 2702 vprold $12,%zmm6,%zmm6 2703 vprold $12,%zmm7,%zmm7 2704 vprold $12,%zmm4,%zmm4 2705 vpaddd %zmm5,%zmm0,%zmm0 2706 vpaddd %zmm6,%zmm1,%zmm1 2707 vpaddd %zmm7,%zmm2,%zmm2 2708 vpaddd %zmm4,%zmm3,%zmm3 2709 vpxord %zmm0,%zmm15,%zmm15 2710 vpxord %zmm1,%zmm12,%zmm12 2711 vpxord %zmm2,%zmm13,%zmm13 2712 vpxord %zmm3,%zmm14,%zmm14 2713 vprold $8,%zmm15,%zmm15 2714 vprold $8,%zmm12,%zmm12 2715 vprold $8,%zmm13,%zmm13 2716 vprold $8,%zmm14,%zmm14 2717 vpaddd %zmm15,%zmm10,%zmm10 2718 vpaddd %zmm12,%zmm11,%zmm11 2719 vpaddd %zmm13,%zmm8,%zmm8 2720 vpaddd %zmm14,%zmm9,%zmm9 2721 vpxord %zmm10,%zmm5,%zmm5 2722 vpxord %zmm11,%zmm6,%zmm6 2723 vpxord %zmm8,%zmm7,%zmm7 2724 vpxord %zmm9,%zmm4,%zmm4 2725 vprold $7,%zmm5,%zmm5 2726 vprold $7,%zmm6,%zmm6 2727 vprold $7,%zmm7,%zmm7 2728 vprold $7,%zmm4,%zmm4 2729 decl %eax 2730 jnz .Loop16x 2731 2732 vpaddd %zmm16,%zmm0,%zmm0 2733 vpaddd %zmm17,%zmm1,%zmm1 2734 vpaddd %zmm18,%zmm2,%zmm2 2735 vpaddd %zmm19,%zmm3,%zmm3 2736 2737 vpunpckldq %zmm1,%zmm0,%zmm18 2738 vpunpckldq %zmm3,%zmm2,%zmm19 2739 vpunpckhdq %zmm1,%zmm0,%zmm0 2740 vpunpckhdq %zmm3,%zmm2,%zmm2 2741 vpunpcklqdq %zmm19,%zmm18,%zmm1 2742 vpunpckhqdq %zmm19,%zmm18,%zmm18 2743 vpunpcklqdq %zmm2,%zmm0,%zmm3 2744 vpunpckhqdq %zmm2,%zmm0,%zmm0 2745 vpaddd %zmm20,%zmm4,%zmm4 2746 vpaddd %zmm21,%zmm5,%zmm5 2747 vpaddd %zmm22,%zmm6,%zmm6 2748 vpaddd %zmm23,%zmm7,%zmm7 2749 2750 vpunpckldq %zmm5,%zmm4,%zmm2 2751 vpunpckldq %zmm7,%zmm6,%zmm19 2752 vpunpckhdq %zmm5,%zmm4,%zmm4 2753 vpunpckhdq %zmm7,%zmm6,%zmm6 2754 vpunpcklqdq %zmm19,%zmm2,%zmm5 2755 vpunpckhqdq %zmm19,%zmm2,%zmm2 2756 vpunpcklqdq %zmm6,%zmm4,%zmm7 2757 vpunpckhqdq %zmm6,%zmm4,%zmm4 2758 vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19 2759 vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5 2760 vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1 2761 vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2 2762 vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18 2763 vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7 2764 vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3 2765 vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4 2766 vpaddd %zmm24,%zmm8,%zmm8 2767 vpaddd %zmm25,%zmm9,%zmm9 2768 vpaddd %zmm26,%zmm10,%zmm10 2769 vpaddd %zmm27,%zmm11,%zmm11 2770 2771 vpunpckldq %zmm9,%zmm8,%zmm6 2772 vpunpckldq %zmm11,%zmm10,%zmm0 2773 vpunpckhdq %zmm9,%zmm8,%zmm8 2774 vpunpckhdq %zmm11,%zmm10,%zmm10 2775 vpunpcklqdq %zmm0,%zmm6,%zmm9 2776 vpunpckhqdq %zmm0,%zmm6,%zmm6 2777 vpunpcklqdq %zmm10,%zmm8,%zmm11 2778 vpunpckhqdq %zmm10,%zmm8,%zmm8 2779 vpaddd %zmm28,%zmm12,%zmm12 2780 vpaddd %zmm29,%zmm13,%zmm13 2781 vpaddd %zmm30,%zmm14,%zmm14 2782 vpaddd %zmm31,%zmm15,%zmm15 2783 2784 vpunpckldq %zmm13,%zmm12,%zmm10 2785 vpunpckldq %zmm15,%zmm14,%zmm0 2786 vpunpckhdq %zmm13,%zmm12,%zmm12 2787 vpunpckhdq %zmm15,%zmm14,%zmm14 2788 vpunpcklqdq %zmm0,%zmm10,%zmm13 2789 vpunpckhqdq %zmm0,%zmm10,%zmm10 2790 vpunpcklqdq %zmm14,%zmm12,%zmm15 2791 vpunpckhqdq %zmm14,%zmm12,%zmm12 2792 vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0 2793 vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13 2794 vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9 2795 vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10 2796 vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6 2797 vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15 2798 vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11 2799 vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12 2800 vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16 2801 vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19 2802 vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0 2803 vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13 2804 vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17 2805 vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1 2806 vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9 2807 vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10 2808 vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14 2809 vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18 2810 vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6 2811 vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15 2812 vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8 2813 vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3 2814 vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11 2815 vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12 2816 cmpq $1024,%rdx 2817 jb .Ltail16x 2818 2819 vpxord 0(%rsi),%zmm16,%zmm16 2820 vpxord 64(%rsi),%zmm17,%zmm17 2821 vpxord 128(%rsi),%zmm14,%zmm14 2822 vpxord 192(%rsi),%zmm8,%zmm8 2823 vmovdqu32 %zmm16,0(%rdi) 2824 vmovdqu32 %zmm17,64(%rdi) 2825 vmovdqu32 %zmm14,128(%rdi) 2826 vmovdqu32 %zmm8,192(%rdi) 2827 2828 vpxord 256(%rsi),%zmm19,%zmm19 2829 vpxord 320(%rsi),%zmm1,%zmm1 2830 vpxord 384(%rsi),%zmm18,%zmm18 2831 vpxord 448(%rsi),%zmm3,%zmm3 2832 vmovdqu32 %zmm19,256(%rdi) 2833 vmovdqu32 %zmm1,320(%rdi) 2834 vmovdqu32 %zmm18,384(%rdi) 2835 vmovdqu32 %zmm3,448(%rdi) 2836 2837 vpxord 512(%rsi),%zmm0,%zmm0 2838 vpxord 576(%rsi),%zmm9,%zmm9 2839 vpxord 640(%rsi),%zmm6,%zmm6 2840 vpxord 704(%rsi),%zmm11,%zmm11 2841 vmovdqu32 %zmm0,512(%rdi) 2842 vmovdqu32 %zmm9,576(%rdi) 2843 vmovdqu32 %zmm6,640(%rdi) 2844 vmovdqu32 %zmm11,704(%rdi) 2845 2846 vpxord 768(%rsi),%zmm13,%zmm13 2847 vpxord 832(%rsi),%zmm10,%zmm10 2848 vpxord 896(%rsi),%zmm15,%zmm15 2849 vpxord 960(%rsi),%zmm12,%zmm12 2850 leaq 1024(%rsi),%rsi 2851 vmovdqu32 %zmm13,768(%rdi) 2852 vmovdqu32 %zmm10,832(%rdi) 2853 vmovdqu32 %zmm15,896(%rdi) 2854 vmovdqu32 %zmm12,960(%rdi) 2855 leaq 1024(%rdi),%rdi 2856 2857 subq $1024,%rdx 2858 jnz .Loop_outer16x 2859 2860 jmp .Ldone16x 2861 2862.align 32 2863.Ltail16x: 2864 xorq %r10,%r10 2865 subq %rsi,%rdi 2866 cmpq $64,%rdx 2867 jb .Less_than_64_16x 2868 vpxord (%rsi),%zmm16,%zmm16 2869 vmovdqu32 %zmm16,(%rdi,%rsi,1) 2870 je .Ldone16x 2871 vmovdqa32 %zmm17,%zmm16 2872 leaq 64(%rsi),%rsi 2873 2874 cmpq $128,%rdx 2875 jb .Less_than_64_16x 2876 vpxord (%rsi),%zmm17,%zmm17 2877 vmovdqu32 %zmm17,(%rdi,%rsi,1) 2878 je .Ldone16x 2879 vmovdqa32 %zmm14,%zmm16 2880 leaq 64(%rsi),%rsi 2881 2882 cmpq $192,%rdx 2883 jb .Less_than_64_16x 2884 vpxord (%rsi),%zmm14,%zmm14 2885 vmovdqu32 %zmm14,(%rdi,%rsi,1) 2886 je .Ldone16x 2887 vmovdqa32 %zmm8,%zmm16 2888 leaq 64(%rsi),%rsi 2889 2890 cmpq $256,%rdx 2891 jb .Less_than_64_16x 2892 vpxord (%rsi),%zmm8,%zmm8 2893 vmovdqu32 %zmm8,(%rdi,%rsi,1) 2894 je .Ldone16x 2895 vmovdqa32 %zmm19,%zmm16 2896 leaq 64(%rsi),%rsi 2897 2898 cmpq $320,%rdx 2899 jb .Less_than_64_16x 2900 vpxord (%rsi),%zmm19,%zmm19 2901 vmovdqu32 %zmm19,(%rdi,%rsi,1) 2902 je .Ldone16x 2903 vmovdqa32 %zmm1,%zmm16 2904 leaq 64(%rsi),%rsi 2905 2906 cmpq $384,%rdx 2907 jb .Less_than_64_16x 2908 vpxord (%rsi),%zmm1,%zmm1 2909 vmovdqu32 %zmm1,(%rdi,%rsi,1) 2910 je .Ldone16x 2911 vmovdqa32 %zmm18,%zmm16 2912 leaq 64(%rsi),%rsi 2913 2914 cmpq $448,%rdx 2915 jb .Less_than_64_16x 2916 vpxord (%rsi),%zmm18,%zmm18 2917 vmovdqu32 %zmm18,(%rdi,%rsi,1) 2918 je .Ldone16x 2919 vmovdqa32 %zmm3,%zmm16 2920 leaq 64(%rsi),%rsi 2921 2922 cmpq $512,%rdx 2923 jb .Less_than_64_16x 2924 vpxord (%rsi),%zmm3,%zmm3 2925 vmovdqu32 %zmm3,(%rdi,%rsi,1) 2926 je .Ldone16x 2927 vmovdqa32 %zmm0,%zmm16 2928 leaq 64(%rsi),%rsi 2929 2930 cmpq $576,%rdx 2931 jb .Less_than_64_16x 2932 vpxord (%rsi),%zmm0,%zmm0 2933 vmovdqu32 %zmm0,(%rdi,%rsi,1) 2934 je .Ldone16x 2935 vmovdqa32 %zmm9,%zmm16 2936 leaq 64(%rsi),%rsi 2937 2938 cmpq $640,%rdx 2939 jb .Less_than_64_16x 2940 vpxord (%rsi),%zmm9,%zmm9 2941 vmovdqu32 %zmm9,(%rdi,%rsi,1) 2942 je .Ldone16x 2943 vmovdqa32 %zmm6,%zmm16 2944 leaq 64(%rsi),%rsi 2945 2946 cmpq $704,%rdx 2947 jb .Less_than_64_16x 2948 vpxord (%rsi),%zmm6,%zmm6 2949 vmovdqu32 %zmm6,(%rdi,%rsi,1) 2950 je .Ldone16x 2951 vmovdqa32 %zmm11,%zmm16 2952 leaq 64(%rsi),%rsi 2953 2954 cmpq $768,%rdx 2955 jb .Less_than_64_16x 2956 vpxord (%rsi),%zmm11,%zmm11 2957 vmovdqu32 %zmm11,(%rdi,%rsi,1) 2958 je .Ldone16x 2959 vmovdqa32 %zmm13,%zmm16 2960 leaq 64(%rsi),%rsi 2961 2962 cmpq $832,%rdx 2963 jb .Less_than_64_16x 2964 vpxord (%rsi),%zmm13,%zmm13 2965 vmovdqu32 %zmm13,(%rdi,%rsi,1) 2966 je .Ldone16x 2967 vmovdqa32 %zmm10,%zmm16 2968 leaq 64(%rsi),%rsi 2969 2970 cmpq $896,%rdx 2971 jb .Less_than_64_16x 2972 vpxord (%rsi),%zmm10,%zmm10 2973 vmovdqu32 %zmm10,(%rdi,%rsi,1) 2974 je .Ldone16x 2975 vmovdqa32 %zmm15,%zmm16 2976 leaq 64(%rsi),%rsi 2977 2978 cmpq $960,%rdx 2979 jb .Less_than_64_16x 2980 vpxord (%rsi),%zmm15,%zmm15 2981 vmovdqu32 %zmm15,(%rdi,%rsi,1) 2982 je .Ldone16x 2983 vmovdqa32 %zmm12,%zmm16 2984 leaq 64(%rsi),%rsi 2985 2986.Less_than_64_16x: 2987 vmovdqa32 %zmm16,0(%rsp) 2988 leaq (%rdi,%rsi,1),%rdi 2989 andq $63,%rdx 2990 2991.Loop_tail16x: 2992 movzbl (%rsi,%r10,1),%eax 2993 movzbl (%rsp,%r10,1),%ecx 2994 leaq 1(%r10),%r10 2995 xorl %ecx,%eax 2996 movb %al,-1(%rdi,%r10,1) 2997 decq %rdx 2998 jnz .Loop_tail16x 2999 3000 vpxord %zmm16,%zmm16,%zmm16 3001 vmovdqa32 %zmm16,0(%rsp) 3002 3003.Ldone16x: 3004 vzeroall 3005 leaq (%r9),%rsp 3006.cfi_def_cfa_register %rsp 3007.L16x_epilogue: 3008 .byte 0xf3,0xc3 3009.cfi_endproc 3010.size ChaCha20_16x,.-ChaCha20_16x 3011.type ChaCha20_8xvl,@function 3012.align 32 3013ChaCha20_8xvl: 3014.cfi_startproc 3015.LChaCha20_8xvl: 3016 movq %rsp,%r9 3017.cfi_def_cfa_register %r9 3018 subq $64+8,%rsp 3019 andq $-64,%rsp 3020 vzeroupper 3021 3022 leaq .Lsigma(%rip),%r10 3023 vbroadcasti128 (%r10),%ymm3 3024 vbroadcasti128 (%rcx),%ymm7 3025 vbroadcasti128 16(%rcx),%ymm11 3026 vbroadcasti128 (%r8),%ymm15 3027 3028 vpshufd $0x00,%ymm3,%ymm0 3029 vpshufd $0x55,%ymm3,%ymm1 3030 vpshufd $0xaa,%ymm3,%ymm2 3031 vpshufd $0xff,%ymm3,%ymm3 3032 vmovdqa64 %ymm0,%ymm16 3033 vmovdqa64 %ymm1,%ymm17 3034 vmovdqa64 %ymm2,%ymm18 3035 vmovdqa64 %ymm3,%ymm19 3036 3037 vpshufd $0x00,%ymm7,%ymm4 3038 vpshufd $0x55,%ymm7,%ymm5 3039 vpshufd $0xaa,%ymm7,%ymm6 3040 vpshufd $0xff,%ymm7,%ymm7 3041 vmovdqa64 %ymm4,%ymm20 3042 vmovdqa64 %ymm5,%ymm21 3043 vmovdqa64 %ymm6,%ymm22 3044 vmovdqa64 %ymm7,%ymm23 3045 3046 vpshufd $0x00,%ymm11,%ymm8 3047 vpshufd $0x55,%ymm11,%ymm9 3048 vpshufd $0xaa,%ymm11,%ymm10 3049 vpshufd $0xff,%ymm11,%ymm11 3050 vmovdqa64 %ymm8,%ymm24 3051 vmovdqa64 %ymm9,%ymm25 3052 vmovdqa64 %ymm10,%ymm26 3053 vmovdqa64 %ymm11,%ymm27 3054 3055 vpshufd $0x00,%ymm15,%ymm12 3056 vpshufd $0x55,%ymm15,%ymm13 3057 vpshufd $0xaa,%ymm15,%ymm14 3058 vpshufd $0xff,%ymm15,%ymm15 3059 vpaddd .Lincy(%rip),%ymm12,%ymm12 3060 vmovdqa64 %ymm12,%ymm28 3061 vmovdqa64 %ymm13,%ymm29 3062 vmovdqa64 %ymm14,%ymm30 3063 vmovdqa64 %ymm15,%ymm31 3064 3065 movl $10,%eax 3066 jmp .Loop8xvl 3067 3068.align 32 3069.Loop_outer8xvl: 3070 3071 3072 vpbroadcastd 8(%r10),%ymm2 3073 vpbroadcastd 12(%r10),%ymm3 3074 vpaddd .Leight(%rip),%ymm28,%ymm28 3075 vmovdqa64 %ymm20,%ymm4 3076 vmovdqa64 %ymm21,%ymm5 3077 vmovdqa64 %ymm22,%ymm6 3078 vmovdqa64 %ymm23,%ymm7 3079 vmovdqa64 %ymm24,%ymm8 3080 vmovdqa64 %ymm25,%ymm9 3081 vmovdqa64 %ymm26,%ymm10 3082 vmovdqa64 %ymm27,%ymm11 3083 vmovdqa64 %ymm28,%ymm12 3084 vmovdqa64 %ymm29,%ymm13 3085 vmovdqa64 %ymm30,%ymm14 3086 vmovdqa64 %ymm31,%ymm15 3087 3088 vmovdqa64 %ymm0,%ymm16 3089 vmovdqa64 %ymm1,%ymm17 3090 vmovdqa64 %ymm2,%ymm18 3091 vmovdqa64 %ymm3,%ymm19 3092 3093 movl $10,%eax 3094 jmp .Loop8xvl 3095 3096.align 32 3097.Loop8xvl: 3098 vpaddd %ymm4,%ymm0,%ymm0 3099 vpaddd %ymm5,%ymm1,%ymm1 3100 vpaddd %ymm6,%ymm2,%ymm2 3101 vpaddd %ymm7,%ymm3,%ymm3 3102 vpxor %ymm0,%ymm12,%ymm12 3103 vpxor %ymm1,%ymm13,%ymm13 3104 vpxor %ymm2,%ymm14,%ymm14 3105 vpxor %ymm3,%ymm15,%ymm15 3106 vprold $16,%ymm12,%ymm12 3107 vprold $16,%ymm13,%ymm13 3108 vprold $16,%ymm14,%ymm14 3109 vprold $16,%ymm15,%ymm15 3110 vpaddd %ymm12,%ymm8,%ymm8 3111 vpaddd %ymm13,%ymm9,%ymm9 3112 vpaddd %ymm14,%ymm10,%ymm10 3113 vpaddd %ymm15,%ymm11,%ymm11 3114 vpxor %ymm8,%ymm4,%ymm4 3115 vpxor %ymm9,%ymm5,%ymm5 3116 vpxor %ymm10,%ymm6,%ymm6 3117 vpxor %ymm11,%ymm7,%ymm7 3118 vprold $12,%ymm4,%ymm4 3119 vprold $12,%ymm5,%ymm5 3120 vprold $12,%ymm6,%ymm6 3121 vprold $12,%ymm7,%ymm7 3122 vpaddd %ymm4,%ymm0,%ymm0 3123 vpaddd %ymm5,%ymm1,%ymm1 3124 vpaddd %ymm6,%ymm2,%ymm2 3125 vpaddd %ymm7,%ymm3,%ymm3 3126 vpxor %ymm0,%ymm12,%ymm12 3127 vpxor %ymm1,%ymm13,%ymm13 3128 vpxor %ymm2,%ymm14,%ymm14 3129 vpxor %ymm3,%ymm15,%ymm15 3130 vprold $8,%ymm12,%ymm12 3131 vprold $8,%ymm13,%ymm13 3132 vprold $8,%ymm14,%ymm14 3133 vprold $8,%ymm15,%ymm15 3134 vpaddd %ymm12,%ymm8,%ymm8 3135 vpaddd %ymm13,%ymm9,%ymm9 3136 vpaddd %ymm14,%ymm10,%ymm10 3137 vpaddd %ymm15,%ymm11,%ymm11 3138 vpxor %ymm8,%ymm4,%ymm4 3139 vpxor %ymm9,%ymm5,%ymm5 3140 vpxor %ymm10,%ymm6,%ymm6 3141 vpxor %ymm11,%ymm7,%ymm7 3142 vprold $7,%ymm4,%ymm4 3143 vprold $7,%ymm5,%ymm5 3144 vprold $7,%ymm6,%ymm6 3145 vprold $7,%ymm7,%ymm7 3146 vpaddd %ymm5,%ymm0,%ymm0 3147 vpaddd %ymm6,%ymm1,%ymm1 3148 vpaddd %ymm7,%ymm2,%ymm2 3149 vpaddd %ymm4,%ymm3,%ymm3 3150 vpxor %ymm0,%ymm15,%ymm15 3151 vpxor %ymm1,%ymm12,%ymm12 3152 vpxor %ymm2,%ymm13,%ymm13 3153 vpxor %ymm3,%ymm14,%ymm14 3154 vprold $16,%ymm15,%ymm15 3155 vprold $16,%ymm12,%ymm12 3156 vprold $16,%ymm13,%ymm13 3157 vprold $16,%ymm14,%ymm14 3158 vpaddd %ymm15,%ymm10,%ymm10 3159 vpaddd %ymm12,%ymm11,%ymm11 3160 vpaddd %ymm13,%ymm8,%ymm8 3161 vpaddd %ymm14,%ymm9,%ymm9 3162 vpxor %ymm10,%ymm5,%ymm5 3163 vpxor %ymm11,%ymm6,%ymm6 3164 vpxor %ymm8,%ymm7,%ymm7 3165 vpxor %ymm9,%ymm4,%ymm4 3166 vprold $12,%ymm5,%ymm5 3167 vprold $12,%ymm6,%ymm6 3168 vprold $12,%ymm7,%ymm7 3169 vprold $12,%ymm4,%ymm4 3170 vpaddd %ymm5,%ymm0,%ymm0 3171 vpaddd %ymm6,%ymm1,%ymm1 3172 vpaddd %ymm7,%ymm2,%ymm2 3173 vpaddd %ymm4,%ymm3,%ymm3 3174 vpxor %ymm0,%ymm15,%ymm15 3175 vpxor %ymm1,%ymm12,%ymm12 3176 vpxor %ymm2,%ymm13,%ymm13 3177 vpxor %ymm3,%ymm14,%ymm14 3178 vprold $8,%ymm15,%ymm15 3179 vprold $8,%ymm12,%ymm12 3180 vprold $8,%ymm13,%ymm13 3181 vprold $8,%ymm14,%ymm14 3182 vpaddd %ymm15,%ymm10,%ymm10 3183 vpaddd %ymm12,%ymm11,%ymm11 3184 vpaddd %ymm13,%ymm8,%ymm8 3185 vpaddd %ymm14,%ymm9,%ymm9 3186 vpxor %ymm10,%ymm5,%ymm5 3187 vpxor %ymm11,%ymm6,%ymm6 3188 vpxor %ymm8,%ymm7,%ymm7 3189 vpxor %ymm9,%ymm4,%ymm4 3190 vprold $7,%ymm5,%ymm5 3191 vprold $7,%ymm6,%ymm6 3192 vprold $7,%ymm7,%ymm7 3193 vprold $7,%ymm4,%ymm4 3194 decl %eax 3195 jnz .Loop8xvl 3196 3197 vpaddd %ymm16,%ymm0,%ymm0 3198 vpaddd %ymm17,%ymm1,%ymm1 3199 vpaddd %ymm18,%ymm2,%ymm2 3200 vpaddd %ymm19,%ymm3,%ymm3 3201 3202 vpunpckldq %ymm1,%ymm0,%ymm18 3203 vpunpckldq %ymm3,%ymm2,%ymm19 3204 vpunpckhdq %ymm1,%ymm0,%ymm0 3205 vpunpckhdq %ymm3,%ymm2,%ymm2 3206 vpunpcklqdq %ymm19,%ymm18,%ymm1 3207 vpunpckhqdq %ymm19,%ymm18,%ymm18 3208 vpunpcklqdq %ymm2,%ymm0,%ymm3 3209 vpunpckhqdq %ymm2,%ymm0,%ymm0 3210 vpaddd %ymm20,%ymm4,%ymm4 3211 vpaddd %ymm21,%ymm5,%ymm5 3212 vpaddd %ymm22,%ymm6,%ymm6 3213 vpaddd %ymm23,%ymm7,%ymm7 3214 3215 vpunpckldq %ymm5,%ymm4,%ymm2 3216 vpunpckldq %ymm7,%ymm6,%ymm19 3217 vpunpckhdq %ymm5,%ymm4,%ymm4 3218 vpunpckhdq %ymm7,%ymm6,%ymm6 3219 vpunpcklqdq %ymm19,%ymm2,%ymm5 3220 vpunpckhqdq %ymm19,%ymm2,%ymm2 3221 vpunpcklqdq %ymm6,%ymm4,%ymm7 3222 vpunpckhqdq %ymm6,%ymm4,%ymm4 3223 vshufi32x4 $0,%ymm5,%ymm1,%ymm19 3224 vshufi32x4 $3,%ymm5,%ymm1,%ymm5 3225 vshufi32x4 $0,%ymm2,%ymm18,%ymm1 3226 vshufi32x4 $3,%ymm2,%ymm18,%ymm2 3227 vshufi32x4 $0,%ymm7,%ymm3,%ymm18 3228 vshufi32x4 $3,%ymm7,%ymm3,%ymm7 3229 vshufi32x4 $0,%ymm4,%ymm0,%ymm3 3230 vshufi32x4 $3,%ymm4,%ymm0,%ymm4 3231 vpaddd %ymm24,%ymm8,%ymm8 3232 vpaddd %ymm25,%ymm9,%ymm9 3233 vpaddd %ymm26,%ymm10,%ymm10 3234 vpaddd %ymm27,%ymm11,%ymm11 3235 3236 vpunpckldq %ymm9,%ymm8,%ymm6 3237 vpunpckldq %ymm11,%ymm10,%ymm0 3238 vpunpckhdq %ymm9,%ymm8,%ymm8 3239 vpunpckhdq %ymm11,%ymm10,%ymm10 3240 vpunpcklqdq %ymm0,%ymm6,%ymm9 3241 vpunpckhqdq %ymm0,%ymm6,%ymm6 3242 vpunpcklqdq %ymm10,%ymm8,%ymm11 3243 vpunpckhqdq %ymm10,%ymm8,%ymm8 3244 vpaddd %ymm28,%ymm12,%ymm12 3245 vpaddd %ymm29,%ymm13,%ymm13 3246 vpaddd %ymm30,%ymm14,%ymm14 3247 vpaddd %ymm31,%ymm15,%ymm15 3248 3249 vpunpckldq %ymm13,%ymm12,%ymm10 3250 vpunpckldq %ymm15,%ymm14,%ymm0 3251 vpunpckhdq %ymm13,%ymm12,%ymm12 3252 vpunpckhdq %ymm15,%ymm14,%ymm14 3253 vpunpcklqdq %ymm0,%ymm10,%ymm13 3254 vpunpckhqdq %ymm0,%ymm10,%ymm10 3255 vpunpcklqdq %ymm14,%ymm12,%ymm15 3256 vpunpckhqdq %ymm14,%ymm12,%ymm12 3257 vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 3258 vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 3259 vperm2i128 $0x20,%ymm10,%ymm6,%ymm9 3260 vperm2i128 $0x31,%ymm10,%ymm6,%ymm10 3261 vperm2i128 $0x20,%ymm15,%ymm11,%ymm6 3262 vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 3263 vperm2i128 $0x20,%ymm12,%ymm8,%ymm11 3264 vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 3265 cmpq $512,%rdx 3266 jb .Ltail8xvl 3267 3268 movl $0x80,%eax 3269 vpxord 0(%rsi),%ymm19,%ymm19 3270 vpxor 32(%rsi),%ymm0,%ymm0 3271 vpxor 64(%rsi),%ymm5,%ymm5 3272 vpxor 96(%rsi),%ymm13,%ymm13 3273 leaq (%rsi,%rax,1),%rsi 3274 vmovdqu32 %ymm19,0(%rdi) 3275 vmovdqu %ymm0,32(%rdi) 3276 vmovdqu %ymm5,64(%rdi) 3277 vmovdqu %ymm13,96(%rdi) 3278 leaq (%rdi,%rax,1),%rdi 3279 3280 vpxor 0(%rsi),%ymm1,%ymm1 3281 vpxor 32(%rsi),%ymm9,%ymm9 3282 vpxor 64(%rsi),%ymm2,%ymm2 3283 vpxor 96(%rsi),%ymm10,%ymm10 3284 leaq (%rsi,%rax,1),%rsi 3285 vmovdqu %ymm1,0(%rdi) 3286 vmovdqu %ymm9,32(%rdi) 3287 vmovdqu %ymm2,64(%rdi) 3288 vmovdqu %ymm10,96(%rdi) 3289 leaq (%rdi,%rax,1),%rdi 3290 3291 vpxord 0(%rsi),%ymm18,%ymm18 3292 vpxor 32(%rsi),%ymm6,%ymm6 3293 vpxor 64(%rsi),%ymm7,%ymm7 3294 vpxor 96(%rsi),%ymm15,%ymm15 3295 leaq (%rsi,%rax,1),%rsi 3296 vmovdqu32 %ymm18,0(%rdi) 3297 vmovdqu %ymm6,32(%rdi) 3298 vmovdqu %ymm7,64(%rdi) 3299 vmovdqu %ymm15,96(%rdi) 3300 leaq (%rdi,%rax,1),%rdi 3301 3302 vpxor 0(%rsi),%ymm3,%ymm3 3303 vpxor 32(%rsi),%ymm11,%ymm11 3304 vpxor 64(%rsi),%ymm4,%ymm4 3305 vpxor 96(%rsi),%ymm12,%ymm12 3306 leaq (%rsi,%rax,1),%rsi 3307 vmovdqu %ymm3,0(%rdi) 3308 vmovdqu %ymm11,32(%rdi) 3309 vmovdqu %ymm4,64(%rdi) 3310 vmovdqu %ymm12,96(%rdi) 3311 leaq (%rdi,%rax,1),%rdi 3312 3313 vpbroadcastd 0(%r10),%ymm0 3314 vpbroadcastd 4(%r10),%ymm1 3315 3316 subq $512,%rdx 3317 jnz .Loop_outer8xvl 3318 3319 jmp .Ldone8xvl 3320 3321.align 32 3322.Ltail8xvl: 3323 vmovdqa64 %ymm19,%ymm8 3324 xorq %r10,%r10 3325 subq %rsi,%rdi 3326 cmpq $64,%rdx 3327 jb .Less_than_64_8xvl 3328 vpxor 0(%rsi),%ymm8,%ymm8 3329 vpxor 32(%rsi),%ymm0,%ymm0 3330 vmovdqu %ymm8,0(%rdi,%rsi,1) 3331 vmovdqu %ymm0,32(%rdi,%rsi,1) 3332 je .Ldone8xvl 3333 vmovdqa %ymm5,%ymm8 3334 vmovdqa %ymm13,%ymm0 3335 leaq 64(%rsi),%rsi 3336 3337 cmpq $128,%rdx 3338 jb .Less_than_64_8xvl 3339 vpxor 0(%rsi),%ymm5,%ymm5 3340 vpxor 32(%rsi),%ymm13,%ymm13 3341 vmovdqu %ymm5,0(%rdi,%rsi,1) 3342 vmovdqu %ymm13,32(%rdi,%rsi,1) 3343 je .Ldone8xvl 3344 vmovdqa %ymm1,%ymm8 3345 vmovdqa %ymm9,%ymm0 3346 leaq 64(%rsi),%rsi 3347 3348 cmpq $192,%rdx 3349 jb .Less_than_64_8xvl 3350 vpxor 0(%rsi),%ymm1,%ymm1 3351 vpxor 32(%rsi),%ymm9,%ymm9 3352 vmovdqu %ymm1,0(%rdi,%rsi,1) 3353 vmovdqu %ymm9,32(%rdi,%rsi,1) 3354 je .Ldone8xvl 3355 vmovdqa %ymm2,%ymm8 3356 vmovdqa %ymm10,%ymm0 3357 leaq 64(%rsi),%rsi 3358 3359 cmpq $256,%rdx 3360 jb .Less_than_64_8xvl 3361 vpxor 0(%rsi),%ymm2,%ymm2 3362 vpxor 32(%rsi),%ymm10,%ymm10 3363 vmovdqu %ymm2,0(%rdi,%rsi,1) 3364 vmovdqu %ymm10,32(%rdi,%rsi,1) 3365 je .Ldone8xvl 3366 vmovdqa32 %ymm18,%ymm8 3367 vmovdqa %ymm6,%ymm0 3368 leaq 64(%rsi),%rsi 3369 3370 cmpq $320,%rdx 3371 jb .Less_than_64_8xvl 3372 vpxord 0(%rsi),%ymm18,%ymm18 3373 vpxor 32(%rsi),%ymm6,%ymm6 3374 vmovdqu32 %ymm18,0(%rdi,%rsi,1) 3375 vmovdqu %ymm6,32(%rdi,%rsi,1) 3376 je .Ldone8xvl 3377 vmovdqa %ymm7,%ymm8 3378 vmovdqa %ymm15,%ymm0 3379 leaq 64(%rsi),%rsi 3380 3381 cmpq $384,%rdx 3382 jb .Less_than_64_8xvl 3383 vpxor 0(%rsi),%ymm7,%ymm7 3384 vpxor 32(%rsi),%ymm15,%ymm15 3385 vmovdqu %ymm7,0(%rdi,%rsi,1) 3386 vmovdqu %ymm15,32(%rdi,%rsi,1) 3387 je .Ldone8xvl 3388 vmovdqa %ymm3,%ymm8 3389 vmovdqa %ymm11,%ymm0 3390 leaq 64(%rsi),%rsi 3391 3392 cmpq $448,%rdx 3393 jb .Less_than_64_8xvl 3394 vpxor 0(%rsi),%ymm3,%ymm3 3395 vpxor 32(%rsi),%ymm11,%ymm11 3396 vmovdqu %ymm3,0(%rdi,%rsi,1) 3397 vmovdqu %ymm11,32(%rdi,%rsi,1) 3398 je .Ldone8xvl 3399 vmovdqa %ymm4,%ymm8 3400 vmovdqa %ymm12,%ymm0 3401 leaq 64(%rsi),%rsi 3402 3403.Less_than_64_8xvl: 3404 vmovdqa %ymm8,0(%rsp) 3405 vmovdqa %ymm0,32(%rsp) 3406 leaq (%rdi,%rsi,1),%rdi 3407 andq $63,%rdx 3408 3409.Loop_tail8xvl: 3410 movzbl (%rsi,%r10,1),%eax 3411 movzbl (%rsp,%r10,1),%ecx 3412 leaq 1(%r10),%r10 3413 xorl %ecx,%eax 3414 movb %al,-1(%rdi,%r10,1) 3415 decq %rdx 3416 jnz .Loop_tail8xvl 3417 3418 vpxor %ymm8,%ymm8,%ymm8 3419 vmovdqa %ymm8,0(%rsp) 3420 vmovdqa %ymm8,32(%rsp) 3421 3422.Ldone8xvl: 3423 vzeroall 3424 leaq (%r9),%rsp 3425.cfi_def_cfa_register %rsp 3426.L8xvl_epilogue: 3427 .byte 0xf3,0xc3 3428.cfi_endproc 3429.size ChaCha20_8xvl,.-ChaCha20_8xvl 3430