/* xref: /netbsd-src/crypto/external/bsd/openssl.old/lib/libcrypto/arch/x86_64/chacha-x86_64.S (revision 4724848cf0da353df257f730694b7882798e5daf) */
#include <machine/asm.h>
/*
 * ChaCha20 for x86_64, AT&T syntax, SysV AMD64 ABI.
 * Machine-generated (OpenSSL perlasm, chacha-x86_64.pl) — do not hand-edit
 * the instruction stream; regenerate from the perl source instead.
 *
 * Constant pool shared by the scalar, SSSE3, 4x, XOP and AVX-512 code paths.
 * All data is read-only and accessed RIP-relative.
 */
.text


/* 64-byte alignment so 16/32/64-byte vector loads of these constants are aligned. */
.align	64
.Lzero:
.long	0,0,0,0
/* +1 in lane 0: per-block counter increment for the 1-block paths. */
.Lone:
.long	1,0,0,0
/* 0,1,2,3: per-lane counter offsets used to set up the 4x path. */
.Linc:
.long	0,1,2,3
/* +4 in every lane: counter step for the 4-blocks-per-iteration path. */
.Lfour:
.long	4,4,4,4
/* 8-lane (AVX2) variants of .Linc/.Lfour; note interleaved lane order. */
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
/* pshufb masks: byte permutations that rotate each 32-bit lane left by 16/24. */
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
/* +2 counter step (two 128-bit lanes) for the 2-block AVX path. */
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
/* AVX-512 counter layouts: per-128-bit-lane offsets/steps and 16-lane inc. */
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
/* ASCII "expand 32-byte k" — the ChaCha20 sigma constant (NUL-terminated here). */
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
/* Builder identification string: "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
/*
 * void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
 *                     size_t len, const unsigned int key[8],
 *                     const unsigned int counter[4]);
 * In:  rdi = out, rsi = inp, rdx = len (bytes), rcx = 256-bit key,
 *      r8  = 128-bit counter||nonce block  (SysV AMD64).
 * Entry point for all code paths: dispatches on OPENSSL_ia32cap_P to the
 * AVX-512 / AVX-512VL / SSSE3 variants, otherwise runs the scalar
 * integer-register implementation below.
 * Clobbers: caller-saved GPRs, xmm0-xmm4; callee-saved GPRs are preserved.
 */
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data			/* len == 0: nothing to do */
	/* capability dispatch on the second 64 bits of OPENSSL_ia32cap_P */
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	btq	$48,%r10
	jc	.LChaCha20_avx512
	testq	%r10,%r10
	js	.LChaCha20_avx512vl
	testl	$512,%r10d			/* SSSE3 feature bit */
	jnz	.LChaCha20_ssse3

	/* scalar path: save all callee-saved GPRs, they hold live state */
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	/* 64 bytes for the state matrix + 24 for spilled len/inp/out */
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:

	/* load key (xmm1:xmm2) and counter block (xmm3); xmm4 = counter step */
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4

	/* park key/counter rows of the state matrix at 16..63(%rsp) */
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp			/* rbp = remaining length */
	jmp	.Loop_outer

.align	32
.Loop_outer:
	/* row 0: the "expand 32-byte k" sigma constant, as immediates */
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	/* row 1: first half of the key */
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	/* row 3: block counter (kept live in xmm3) and nonce words */
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	/* spill len/inp/out; repurpose rbp as round counter, rsi/rdi as
	   state words 8/9 (second half of key, low/high of xmm2's low qword) */
	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp			/* 10 double-rounds = 20 rounds */
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214			/* movq %xmm2,%rsi (hand-encoded) */
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi			/* rdi = state word 9 */
	jmp	.Loop

.align	32
.Loop:
	/* Each pass below is two ChaCha quarter-rounds (add/xor/rol by
	   16,12,8,7).  State words 10/11 are kept at 40/44(%rsp) and swapped
	   in and out of esi/edi because only 14 GPRs are available. */
	/* column quarter-rounds on words (0,4,8,12) and (1,5,9,13) */
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	/* swap: park words 8/9, pick up words 10/11 */
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	/* column quarter-rounds on words (2,6,10,14) and (3,7,11,15) */
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	/* diagonal quarter-rounds on (0,5,10,15) and (1,6,11,12) */
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	/* swap back: park words 10/11, pick up words 8/9 */
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	/* diagonal quarter-rounds on (2,7,8,13) and (3,4,9,14) */
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	/* rounds done: store words 8/9, restore len/inp/out */
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3			/* bump block counter for next block */
	movq	64+16(%rsp),%rdi

	/* feed-forward: add the original input state back in */
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1			/* words 8..11 finished in xmm1 */

	cmpq	$64,%rbp
	jb	.Ltail				/* partial final block */

	/* full block: XOR keystream with input, store to output */
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	/* refresh rows 2/3 of the saved state for the next outer iteration */
	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	/* <64 bytes left: lay the keystream block out at 0(%rsp) and XOR
	   it with the input one byte at a time */
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx			/* rbx = byte index */
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	/* restore callee-saved registers and deallocate the frame */
	leaq	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	.byte	0xf3,0xc3			/* hand-encoded ret (rep ret idiom) */
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
/*
 * SSSE3 path: one 64-byte block per iteration, whole state in xmm0-xmm3
 * (one matrix row per register).  Same arguments/ABI as ChaCha20_ctr32;
 * reached only via the dispatcher above.  Rotates by 16/24 use pshufb
 * with the .Lrot16/.Lrot24 masks; rotates by 12/7 use shift+or.
 * Secondary dispatch: XOP CPUs go to 4xop, len==128 to the 2-block
 * routine, len>128 to the 4-way routine.
 */
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq	%rsp,%r9			/* r9 = frame pointer for epilogue */
.cfi_def_cfa_register	%r9
	testl	$2048,%r10d			/* XOP feature bit */
	jnz	.LChaCha20_4xop
	cmpq	$128,%rdx
	je	.LChaCha20_128
	ja	.LChaCha20_4x

.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	/* build the state: sigma | key[0..3] | key[4..7] | counter,nonce */
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	/* save input state for the feed-forward addition */
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8				/* 10 double-rounds */
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	/* reload saved state, bump the 32-bit counter by one */
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
	/* column round: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12; ... */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 (rot 16) */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1			/* b <<<= 12 */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 (rot 24) */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1			/* b <<<= 7 */
	/* diagonalize: rotate rows so the diagonal round is columnar */
	pshufd	$78,%xmm2,%xmm2			/* 0x4e: rotate by 2 lanes */
	pshufd	$57,%xmm1,%xmm1			/* 0x39: rotate by 1 lane */
	pshufd	$147,%xmm3,%xmm3		/* 0x93: rotate by 3 lanes */
	nop
	/* diagonal round, then un-diagonalize */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 (rot 16) */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 (rot 24) */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	/* feed-forward: add saved input state */
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	/* full block: XOR keystream with input */
	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	/* partial block: park keystream on the stack, XOR byte-by-byte */
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp			/* restore caller's stack pointer */
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	.byte	0xf3,0xc3			/* hand-encoded ret (rep ret idiom) */
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
/*
 * Exactly-128-byte path: two 64-byte blocks computed in parallel.
 * Block A lives in {xmm8,xmm9,xmm2,xmm3}, block B in {xmm10,xmm11,xmm0,xmm1};
 * B's counter row is A's plus .Lone.  The round structure mirrors
 * ChaCha20_ssse3 with every step duplicated for the second block.
 * Reached only from the len==128 check in ChaCha20_ssse3.
 */
.type	ChaCha20_128,@function
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq	%rsp,%r9			/* frame pointer for epilogue */
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm8
	movdqu	(%rcx),%xmm9
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm1
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	/* duplicate the state for block B and save block A's input state */
	movdqa	%xmm8,%xmm10
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,%xmm11
	movdqa	%xmm9,16(%rsp)
	movdqa	%xmm2,%xmm0
	movdqa	%xmm2,32(%rsp)
	paddd	%xmm3,%xmm1			/* block B counter = counter + 1 */
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8				/* 10 double-rounds */
	jmp	.Loop_128

.align	32
.Loop_128:
	/* column round, interleaved over both blocks */
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 (rot 16) */
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 (rot 16) */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9			/* b <<<= 12 (block A) */
	pslld	$12,%xmm5
	por	%xmm5,%xmm11			/* b <<<= 12 (block B) */
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 (rot 24) */
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 (rot 24) */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9			/* b <<<= 7 (block A) */
	pslld	$7,%xmm5
	por	%xmm5,%xmm11			/* b <<<= 7 (block B) */
	/* diagonalize both blocks (lane rotations 0x4e/0x39/0x93) */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm9,%xmm9
	pshufd	$147,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$57,%xmm11,%xmm11
	pshufd	$147,%xmm1,%xmm1
	/* diagonal round, interleaved over both blocks */
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 (rot 16) */
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 (rot 16) */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 (rot 24) */
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 (rot 24) */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	/* un-diagonalize both blocks */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm9,%xmm9
	pshufd	$57,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$147,%xmm11,%xmm11
	pshufd	$57,%xmm1,%xmm1
	decq	%r8
	jnz	.Loop_128
	/* feed-forward both blocks (block B's counter row adds .Lone extra) */
	paddd	0(%rsp),%xmm8
	paddd	16(%rsp),%xmm9
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	paddd	.Lone(%rip),%xmm1
	paddd	0(%rsp),%xmm10
	paddd	16(%rsp),%xmm11
	paddd	32(%rsp),%xmm0
	paddd	48(%rsp),%xmm1

	/* XOR all 128 bytes of keystream with the input */
	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm8
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm9
	movdqu	48(%rsi),%xmm5
	pxor	%xmm4,%xmm2
	movdqu	64(%rsi),%xmm4
	pxor	%xmm5,%xmm3
	movdqu	80(%rsi),%xmm5
	pxor	%xmm4,%xmm10
	movdqu	96(%rsi),%xmm4
	pxor	%xmm5,%xmm11
	movdqu	112(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm1

	movdqu	%xmm8,0(%rdi)
	movdqu	%xmm9,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	movdqu	%xmm10,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm0,96(%rdi)
	movdqu	%xmm1,112(%rdi)
	leaq	(%r9),%rsp			/* restore caller's stack pointer */
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	.byte	0xf3,0xc3			/* hand-encoded ret (rep ret idiom) */
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
/*
 * 4-way SSE path: four 64-byte blocks per iteration in "transposed" form —
 * each xmm register holds the same state word from all four blocks
 * (word i of blocks 0..3 in the four lanes).  The broadcast input state is
 * kept at 64(%rsp) (sigma rows) and 128..304-256(%rcx) (key/counter rows,
 * rcx = %rsp+256).  Counters get per-lane offsets from .Linc and step by
 * .Lfour.  Only 16 xmm regs exist for 16 state words + temporaries, so
 * words 6/7 and the b-row scratch rotate through 0..48(%rsp).
 * Reached from ChaCha20_ssse3 when len > 128.
 */
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	movq	%rsp,%r9			/* frame pointer for epilogue */
.cfi_def_cfa_register	%r9
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10			/* AVX2 bit in upper capability word */
	jnz	.LChaCha20_8x
	cmpq	$192,%rdx
	ja	.Lproceed4x

	/* short input on certain CPU families (capability mask 0x04400000
	   vs 0x00400000): 1x SSSE3 is the better choice */
	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	.Ldo_sse3_after_all

.Lproceed4x:
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx			/* rcx now = key/counter save area */
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	/* broadcast each sigma word across 4 lanes; save at 64..112(%rsp) */
	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	/* broadcast key words 0..3 */
	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	/* broadcast key words 4..7 */
	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	/* broadcast counter/nonce; counters get lane offsets 0,1,2,3 */
	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0		/* don't save xmm0: saved at .Loop_enter4x */
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	/* reload the broadcast input state; step counters by 4 */
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0		/* counters += 4 */

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)			/* spill words 6/7; xmm7 = rot16 mask */
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax			/* 10 double-rounds */
	movdqa	%xmm0,256-256(%rcx)		/* save stepped counters */
	jmp	.Loop4x

.align	32
.Loop4x:
	/* Column round.  First half: quarter-rounds on word columns
	   (0,4,8,12) and (1,5,9,13); xmm7/xmm6 alternate between holding
	   the rot16 (%r10) and rot24 (%r11) pshufb masks and b-row scratch. */
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199				/* pshufb %xmm7,%xmm0 (rot 16) */
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 (rot 16) */
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12			/* b <<<= 12 */
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6			/* xmm6 = rot24 mask */
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198				/* pshufb %xmm6,%xmm0 (rot 24) */
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 (rot 24) */
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12			/* b <<<= 7 */
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7			/* xmm7 = rot16 mask */
	por	%xmm6,%xmm13
	/* swap c-rows: park words 8/9, pick up words 10/11 */
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	/* second half of column round: columns (2,6,10,14) and (3,7,11,15) */
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215				/* pshufb %xmm7,%xmm2 (rot 16) */
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 (rot 16) */
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214				/* pshufb %xmm6,%xmm2 (rot 24) */
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 (rot 24) */
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	/* Diagonal round, first half: quarter-rounds on (0,5,10,15)
	   and (1,6,11,12) */
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 (rot 16) */
.byte	102,15,56,0,199				/* pshufb %xmm7,%xmm0 (rot 16) */
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 (rot 24) */
.byte	102,15,56,0,198				/* pshufb %xmm6,%xmm0 (rot 24) */
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	/* swap c-rows back: park words 10/11, pick up words 8/9 */
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	/* diagonal round, second half: (2,7,8,13) and (3,4,9,14) */
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 (rot 16) */
.byte	102,15,56,0,215				/* pshufb %xmm7,%xmm2 (rot 16) */
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 (rot 24) */
.byte	102,15,56,0,214				/* pshufb %xmm6,%xmm2 (rot 24) */
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	/* feed-forward words 0..3, then transpose 4x4 dword lanes so each
	   register again holds one contiguous 16-byte quarter of one block */
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	/* feed-forward words 4..7 */
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8			/* reload spilled words 10/11 */
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	/* feed-forward words 8..11 */
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	/* feed-forward words 12..15 (counter/nonce row) */
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x			/* less than 4 full blocks left */

	/* XOR 256 bytes of keystream with input, 64 bytes at a time;
	   block n's quarters come from {n-th transpose outputs} */
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	/* 1..255 bytes left: consume as many whole 64-byte blocks as fit,
	   then stage the next block's keystream at 0(%rsp) for byte XOR */
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10			/* r10 = byte index */

	movdqa	%xmm12,16(%rsp)			/* block 0 keystream to 0..63(%rsp) */
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x			/* exactly 64 bytes: finished */

	movdqa	16(%rsp),%xmm6			/* stage block-1 keystream */
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x			/* exactly 128 bytes: finished */

	movdqa	32(%rsp),%xmm6			/* stage block-2 keystream */
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x			/* exactly 192 bytes: finished */

	movdqa	48(%rsp),%xmm6			/* stage block-3 keystream */
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	/* byte-at-a-time XOR of the staged keystream with the remainder */
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp			/* restore caller's stack pointer */
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	.byte	0xf3,0xc3			/* hand-encoded ret (rep ret idiom) */
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
1178.type	ChaCha20_4xop,@function
1179.align	32
1180ChaCha20_4xop:
1181.cfi_startproc
1182.LChaCha20_4xop:
1183	movq	%rsp,%r9
1184.cfi_def_cfa_register	%r9
1185	subq	$0x140+8,%rsp
1186	vzeroupper
1187
1188	vmovdqa	.Lsigma(%rip),%xmm11
1189	vmovdqu	(%rcx),%xmm3
1190	vmovdqu	16(%rcx),%xmm15
1191	vmovdqu	(%r8),%xmm7
1192	leaq	256(%rsp),%rcx
1193
1194	vpshufd	$0x00,%xmm11,%xmm8
1195	vpshufd	$0x55,%xmm11,%xmm9
1196	vmovdqa	%xmm8,64(%rsp)
1197	vpshufd	$0xaa,%xmm11,%xmm10
1198	vmovdqa	%xmm9,80(%rsp)
1199	vpshufd	$0xff,%xmm11,%xmm11
1200	vmovdqa	%xmm10,96(%rsp)
1201	vmovdqa	%xmm11,112(%rsp)
1202
1203	vpshufd	$0x00,%xmm3,%xmm0
1204	vpshufd	$0x55,%xmm3,%xmm1
1205	vmovdqa	%xmm0,128-256(%rcx)
1206	vpshufd	$0xaa,%xmm3,%xmm2
1207	vmovdqa	%xmm1,144-256(%rcx)
1208	vpshufd	$0xff,%xmm3,%xmm3
1209	vmovdqa	%xmm2,160-256(%rcx)
1210	vmovdqa	%xmm3,176-256(%rcx)
1211
1212	vpshufd	$0x00,%xmm15,%xmm12
1213	vpshufd	$0x55,%xmm15,%xmm13
1214	vmovdqa	%xmm12,192-256(%rcx)
1215	vpshufd	$0xaa,%xmm15,%xmm14
1216	vmovdqa	%xmm13,208-256(%rcx)
1217	vpshufd	$0xff,%xmm15,%xmm15
1218	vmovdqa	%xmm14,224-256(%rcx)
1219	vmovdqa	%xmm15,240-256(%rcx)
1220
1221	vpshufd	$0x00,%xmm7,%xmm4
1222	vpshufd	$0x55,%xmm7,%xmm5
1223	vpaddd	.Linc(%rip),%xmm4,%xmm4
1224	vpshufd	$0xaa,%xmm7,%xmm6
1225	vmovdqa	%xmm5,272-256(%rcx)
1226	vpshufd	$0xff,%xmm7,%xmm7
1227	vmovdqa	%xmm6,288-256(%rcx)
1228	vmovdqa	%xmm7,304-256(%rcx)
1229
1230	jmp	.Loop_enter4xop
1231
1232.align	32
1233.Loop_outer4xop:
1234	vmovdqa	64(%rsp),%xmm8
1235	vmovdqa	80(%rsp),%xmm9
1236	vmovdqa	96(%rsp),%xmm10
1237	vmovdqa	112(%rsp),%xmm11
1238	vmovdqa	128-256(%rcx),%xmm0
1239	vmovdqa	144-256(%rcx),%xmm1
1240	vmovdqa	160-256(%rcx),%xmm2
1241	vmovdqa	176-256(%rcx),%xmm3
1242	vmovdqa	192-256(%rcx),%xmm12
1243	vmovdqa	208-256(%rcx),%xmm13
1244	vmovdqa	224-256(%rcx),%xmm14
1245	vmovdqa	240-256(%rcx),%xmm15
1246	vmovdqa	256-256(%rcx),%xmm4
1247	vmovdqa	272-256(%rcx),%xmm5
1248	vmovdqa	288-256(%rcx),%xmm6
1249	vmovdqa	304-256(%rcx),%xmm7
1250	vpaddd	.Lfour(%rip),%xmm4,%xmm4
1251
1252.Loop_enter4xop:
1253	movl	$10,%eax
1254	vmovdqa	%xmm4,256-256(%rcx)
1255	jmp	.Loop4xop
1256
1257.align	32
1258.Loop4xop:
1259	vpaddd	%xmm0,%xmm8,%xmm8
1260	vpaddd	%xmm1,%xmm9,%xmm9
1261	vpaddd	%xmm2,%xmm10,%xmm10
1262	vpaddd	%xmm3,%xmm11,%xmm11
1263	vpxor	%xmm4,%xmm8,%xmm4
1264	vpxor	%xmm5,%xmm9,%xmm5
1265	vpxor	%xmm6,%xmm10,%xmm6
1266	vpxor	%xmm7,%xmm11,%xmm7
1267.byte	143,232,120,194,228,16
1268.byte	143,232,120,194,237,16
1269.byte	143,232,120,194,246,16
1270.byte	143,232,120,194,255,16
1271	vpaddd	%xmm4,%xmm12,%xmm12
1272	vpaddd	%xmm5,%xmm13,%xmm13
1273	vpaddd	%xmm6,%xmm14,%xmm14
1274	vpaddd	%xmm7,%xmm15,%xmm15
1275	vpxor	%xmm0,%xmm12,%xmm0
1276	vpxor	%xmm1,%xmm13,%xmm1
1277	vpxor	%xmm14,%xmm2,%xmm2
1278	vpxor	%xmm15,%xmm3,%xmm3
1279.byte	143,232,120,194,192,12
1280.byte	143,232,120,194,201,12
1281.byte	143,232,120,194,210,12
1282.byte	143,232,120,194,219,12
1283	vpaddd	%xmm8,%xmm0,%xmm8
1284	vpaddd	%xmm9,%xmm1,%xmm9
1285	vpaddd	%xmm2,%xmm10,%xmm10
1286	vpaddd	%xmm3,%xmm11,%xmm11
1287	vpxor	%xmm4,%xmm8,%xmm4
1288	vpxor	%xmm5,%xmm9,%xmm5
1289	vpxor	%xmm6,%xmm10,%xmm6
1290	vpxor	%xmm7,%xmm11,%xmm7
1291.byte	143,232,120,194,228,8
1292.byte	143,232,120,194,237,8
1293.byte	143,232,120,194,246,8
1294.byte	143,232,120,194,255,8
1295	vpaddd	%xmm4,%xmm12,%xmm12
1296	vpaddd	%xmm5,%xmm13,%xmm13
1297	vpaddd	%xmm6,%xmm14,%xmm14
1298	vpaddd	%xmm7,%xmm15,%xmm15
1299	vpxor	%xmm0,%xmm12,%xmm0
1300	vpxor	%xmm1,%xmm13,%xmm1
1301	vpxor	%xmm14,%xmm2,%xmm2
1302	vpxor	%xmm15,%xmm3,%xmm3
1303.byte	143,232,120,194,192,7
1304.byte	143,232,120,194,201,7
1305.byte	143,232,120,194,210,7
1306.byte	143,232,120,194,219,7
1307	vpaddd	%xmm1,%xmm8,%xmm8
1308	vpaddd	%xmm2,%xmm9,%xmm9
1309	vpaddd	%xmm3,%xmm10,%xmm10
1310	vpaddd	%xmm0,%xmm11,%xmm11
1311	vpxor	%xmm7,%xmm8,%xmm7
1312	vpxor	%xmm4,%xmm9,%xmm4
1313	vpxor	%xmm5,%xmm10,%xmm5
1314	vpxor	%xmm6,%xmm11,%xmm6
1315.byte	143,232,120,194,255,16
1316.byte	143,232,120,194,228,16
1317.byte	143,232,120,194,237,16
1318.byte	143,232,120,194,246,16
1319	vpaddd	%xmm7,%xmm14,%xmm14
1320	vpaddd	%xmm4,%xmm15,%xmm15
1321	vpaddd	%xmm5,%xmm12,%xmm12
1322	vpaddd	%xmm6,%xmm13,%xmm13
1323	vpxor	%xmm1,%xmm14,%xmm1
1324	vpxor	%xmm2,%xmm15,%xmm2
1325	vpxor	%xmm12,%xmm3,%xmm3
1326	vpxor	%xmm13,%xmm0,%xmm0
1327.byte	143,232,120,194,201,12
1328.byte	143,232,120,194,210,12
1329.byte	143,232,120,194,219,12
1330.byte	143,232,120,194,192,12
1331	vpaddd	%xmm8,%xmm1,%xmm8
1332	vpaddd	%xmm9,%xmm2,%xmm9
1333	vpaddd	%xmm3,%xmm10,%xmm10
1334	vpaddd	%xmm0,%xmm11,%xmm11
1335	vpxor	%xmm7,%xmm8,%xmm7
1336	vpxor	%xmm4,%xmm9,%xmm4
1337	vpxor	%xmm5,%xmm10,%xmm5
1338	vpxor	%xmm6,%xmm11,%xmm6
1339.byte	143,232,120,194,255,8
1340.byte	143,232,120,194,228,8
1341.byte	143,232,120,194,237,8
1342.byte	143,232,120,194,246,8
1343	vpaddd	%xmm7,%xmm14,%xmm14
1344	vpaddd	%xmm4,%xmm15,%xmm15
1345	vpaddd	%xmm5,%xmm12,%xmm12
1346	vpaddd	%xmm6,%xmm13,%xmm13
1347	vpxor	%xmm1,%xmm14,%xmm1
1348	vpxor	%xmm2,%xmm15,%xmm2
1349	vpxor	%xmm12,%xmm3,%xmm3
1350	vpxor	%xmm13,%xmm0,%xmm0
1351.byte	143,232,120,194,201,7
1352.byte	143,232,120,194,210,7
1353.byte	143,232,120,194,219,7
1354.byte	143,232,120,194,192,7
1355	decl	%eax
1356	jnz	.Loop4xop
1357
1358	vpaddd	64(%rsp),%xmm8,%xmm8
1359	vpaddd	80(%rsp),%xmm9,%xmm9
1360	vpaddd	96(%rsp),%xmm10,%xmm10
1361	vpaddd	112(%rsp),%xmm11,%xmm11
1362
1363	vmovdqa	%xmm14,32(%rsp)
1364	vmovdqa	%xmm15,48(%rsp)
1365
1366	vpunpckldq	%xmm9,%xmm8,%xmm14
1367	vpunpckldq	%xmm11,%xmm10,%xmm15
1368	vpunpckhdq	%xmm9,%xmm8,%xmm8
1369	vpunpckhdq	%xmm11,%xmm10,%xmm10
1370	vpunpcklqdq	%xmm15,%xmm14,%xmm9
1371	vpunpckhqdq	%xmm15,%xmm14,%xmm14
1372	vpunpcklqdq	%xmm10,%xmm8,%xmm11
1373	vpunpckhqdq	%xmm10,%xmm8,%xmm8
1374	vpaddd	128-256(%rcx),%xmm0,%xmm0
1375	vpaddd	144-256(%rcx),%xmm1,%xmm1
1376	vpaddd	160-256(%rcx),%xmm2,%xmm2
1377	vpaddd	176-256(%rcx),%xmm3,%xmm3
1378
1379	vmovdqa	%xmm9,0(%rsp)
1380	vmovdqa	%xmm14,16(%rsp)
1381	vmovdqa	32(%rsp),%xmm9
1382	vmovdqa	48(%rsp),%xmm14
1383
1384	vpunpckldq	%xmm1,%xmm0,%xmm10
1385	vpunpckldq	%xmm3,%xmm2,%xmm15
1386	vpunpckhdq	%xmm1,%xmm0,%xmm0
1387	vpunpckhdq	%xmm3,%xmm2,%xmm2
1388	vpunpcklqdq	%xmm15,%xmm10,%xmm1
1389	vpunpckhqdq	%xmm15,%xmm10,%xmm10
1390	vpunpcklqdq	%xmm2,%xmm0,%xmm3
1391	vpunpckhqdq	%xmm2,%xmm0,%xmm0
1392	vpaddd	192-256(%rcx),%xmm12,%xmm12
1393	vpaddd	208-256(%rcx),%xmm13,%xmm13
1394	vpaddd	224-256(%rcx),%xmm9,%xmm9
1395	vpaddd	240-256(%rcx),%xmm14,%xmm14
1396
1397	vpunpckldq	%xmm13,%xmm12,%xmm2
1398	vpunpckldq	%xmm14,%xmm9,%xmm15
1399	vpunpckhdq	%xmm13,%xmm12,%xmm12
1400	vpunpckhdq	%xmm14,%xmm9,%xmm9
1401	vpunpcklqdq	%xmm15,%xmm2,%xmm13
1402	vpunpckhqdq	%xmm15,%xmm2,%xmm2
1403	vpunpcklqdq	%xmm9,%xmm12,%xmm14
1404	vpunpckhqdq	%xmm9,%xmm12,%xmm12
1405	vpaddd	256-256(%rcx),%xmm4,%xmm4
1406	vpaddd	272-256(%rcx),%xmm5,%xmm5
1407	vpaddd	288-256(%rcx),%xmm6,%xmm6
1408	vpaddd	304-256(%rcx),%xmm7,%xmm7
1409
1410	vpunpckldq	%xmm5,%xmm4,%xmm9
1411	vpunpckldq	%xmm7,%xmm6,%xmm15
1412	vpunpckhdq	%xmm5,%xmm4,%xmm4
1413	vpunpckhdq	%xmm7,%xmm6,%xmm6
1414	vpunpcklqdq	%xmm15,%xmm9,%xmm5
1415	vpunpckhqdq	%xmm15,%xmm9,%xmm9
1416	vpunpcklqdq	%xmm6,%xmm4,%xmm7
1417	vpunpckhqdq	%xmm6,%xmm4,%xmm4
1418	vmovdqa	0(%rsp),%xmm6
1419	vmovdqa	16(%rsp),%xmm15
1420
1421	cmpq	$256,%rdx
1422	jb	.Ltail4xop
1423
1424	vpxor	0(%rsi),%xmm6,%xmm6
1425	vpxor	16(%rsi),%xmm1,%xmm1
1426	vpxor	32(%rsi),%xmm13,%xmm13
1427	vpxor	48(%rsi),%xmm5,%xmm5
1428	vpxor	64(%rsi),%xmm15,%xmm15
1429	vpxor	80(%rsi),%xmm10,%xmm10
1430	vpxor	96(%rsi),%xmm2,%xmm2
1431	vpxor	112(%rsi),%xmm9,%xmm9
1432	leaq	128(%rsi),%rsi
1433	vpxor	0(%rsi),%xmm11,%xmm11
1434	vpxor	16(%rsi),%xmm3,%xmm3
1435	vpxor	32(%rsi),%xmm14,%xmm14
1436	vpxor	48(%rsi),%xmm7,%xmm7
1437	vpxor	64(%rsi),%xmm8,%xmm8
1438	vpxor	80(%rsi),%xmm0,%xmm0
1439	vpxor	96(%rsi),%xmm12,%xmm12
1440	vpxor	112(%rsi),%xmm4,%xmm4
1441	leaq	128(%rsi),%rsi
1442
1443	vmovdqu	%xmm6,0(%rdi)
1444	vmovdqu	%xmm1,16(%rdi)
1445	vmovdqu	%xmm13,32(%rdi)
1446	vmovdqu	%xmm5,48(%rdi)
1447	vmovdqu	%xmm15,64(%rdi)
1448	vmovdqu	%xmm10,80(%rdi)
1449	vmovdqu	%xmm2,96(%rdi)
1450	vmovdqu	%xmm9,112(%rdi)
1451	leaq	128(%rdi),%rdi
1452	vmovdqu	%xmm11,0(%rdi)
1453	vmovdqu	%xmm3,16(%rdi)
1454	vmovdqu	%xmm14,32(%rdi)
1455	vmovdqu	%xmm7,48(%rdi)
1456	vmovdqu	%xmm8,64(%rdi)
1457	vmovdqu	%xmm0,80(%rdi)
1458	vmovdqu	%xmm12,96(%rdi)
1459	vmovdqu	%xmm4,112(%rdi)
1460	leaq	128(%rdi),%rdi
1461
1462	subq	$256,%rdx
1463	jnz	.Loop_outer4xop
1464
1465	jmp	.Ldone4xop
1466
1467.align	32
1468.Ltail4xop:
1469	cmpq	$192,%rdx
1470	jae	.L192_or_more4xop
1471	cmpq	$128,%rdx
1472	jae	.L128_or_more4xop
1473	cmpq	$64,%rdx
1474	jae	.L64_or_more4xop
1475
1476	xorq	%r10,%r10
1477	vmovdqa	%xmm6,0(%rsp)
1478	vmovdqa	%xmm1,16(%rsp)
1479	vmovdqa	%xmm13,32(%rsp)
1480	vmovdqa	%xmm5,48(%rsp)
1481	jmp	.Loop_tail4xop
1482
1483.align	32
1484.L64_or_more4xop:
1485	vpxor	0(%rsi),%xmm6,%xmm6
1486	vpxor	16(%rsi),%xmm1,%xmm1
1487	vpxor	32(%rsi),%xmm13,%xmm13
1488	vpxor	48(%rsi),%xmm5,%xmm5
1489	vmovdqu	%xmm6,0(%rdi)
1490	vmovdqu	%xmm1,16(%rdi)
1491	vmovdqu	%xmm13,32(%rdi)
1492	vmovdqu	%xmm5,48(%rdi)
1493	je	.Ldone4xop
1494
1495	leaq	64(%rsi),%rsi
1496	vmovdqa	%xmm15,0(%rsp)
1497	xorq	%r10,%r10
1498	vmovdqa	%xmm10,16(%rsp)
1499	leaq	64(%rdi),%rdi
1500	vmovdqa	%xmm2,32(%rsp)
1501	subq	$64,%rdx
1502	vmovdqa	%xmm9,48(%rsp)
1503	jmp	.Loop_tail4xop
1504
1505.align	32
1506.L128_or_more4xop:
1507	vpxor	0(%rsi),%xmm6,%xmm6
1508	vpxor	16(%rsi),%xmm1,%xmm1
1509	vpxor	32(%rsi),%xmm13,%xmm13
1510	vpxor	48(%rsi),%xmm5,%xmm5
1511	vpxor	64(%rsi),%xmm15,%xmm15
1512	vpxor	80(%rsi),%xmm10,%xmm10
1513	vpxor	96(%rsi),%xmm2,%xmm2
1514	vpxor	112(%rsi),%xmm9,%xmm9
1515
1516	vmovdqu	%xmm6,0(%rdi)
1517	vmovdqu	%xmm1,16(%rdi)
1518	vmovdqu	%xmm13,32(%rdi)
1519	vmovdqu	%xmm5,48(%rdi)
1520	vmovdqu	%xmm15,64(%rdi)
1521	vmovdqu	%xmm10,80(%rdi)
1522	vmovdqu	%xmm2,96(%rdi)
1523	vmovdqu	%xmm9,112(%rdi)
1524	je	.Ldone4xop
1525
1526	leaq	128(%rsi),%rsi
1527	vmovdqa	%xmm11,0(%rsp)
1528	xorq	%r10,%r10
1529	vmovdqa	%xmm3,16(%rsp)
1530	leaq	128(%rdi),%rdi
1531	vmovdqa	%xmm14,32(%rsp)
1532	subq	$128,%rdx
1533	vmovdqa	%xmm7,48(%rsp)
1534	jmp	.Loop_tail4xop
1535
1536.align	32
1537.L192_or_more4xop:
1538	vpxor	0(%rsi),%xmm6,%xmm6
1539	vpxor	16(%rsi),%xmm1,%xmm1
1540	vpxor	32(%rsi),%xmm13,%xmm13
1541	vpxor	48(%rsi),%xmm5,%xmm5
1542	vpxor	64(%rsi),%xmm15,%xmm15
1543	vpxor	80(%rsi),%xmm10,%xmm10
1544	vpxor	96(%rsi),%xmm2,%xmm2
1545	vpxor	112(%rsi),%xmm9,%xmm9
1546	leaq	128(%rsi),%rsi
1547	vpxor	0(%rsi),%xmm11,%xmm11
1548	vpxor	16(%rsi),%xmm3,%xmm3
1549	vpxor	32(%rsi),%xmm14,%xmm14
1550	vpxor	48(%rsi),%xmm7,%xmm7
1551
1552	vmovdqu	%xmm6,0(%rdi)
1553	vmovdqu	%xmm1,16(%rdi)
1554	vmovdqu	%xmm13,32(%rdi)
1555	vmovdqu	%xmm5,48(%rdi)
1556	vmovdqu	%xmm15,64(%rdi)
1557	vmovdqu	%xmm10,80(%rdi)
1558	vmovdqu	%xmm2,96(%rdi)
1559	vmovdqu	%xmm9,112(%rdi)
1560	leaq	128(%rdi),%rdi
1561	vmovdqu	%xmm11,0(%rdi)
1562	vmovdqu	%xmm3,16(%rdi)
1563	vmovdqu	%xmm14,32(%rdi)
1564	vmovdqu	%xmm7,48(%rdi)
1565	je	.Ldone4xop
1566
1567	leaq	64(%rsi),%rsi
1568	vmovdqa	%xmm8,0(%rsp)
1569	xorq	%r10,%r10
1570	vmovdqa	%xmm0,16(%rsp)
1571	leaq	64(%rdi),%rdi
1572	vmovdqa	%xmm12,32(%rsp)
1573	subq	$192,%rdx
1574	vmovdqa	%xmm4,48(%rsp)
1575
1576.Loop_tail4xop:
1577	movzbl	(%rsi,%r10,1),%eax
1578	movzbl	(%rsp,%r10,1),%ecx
1579	leaq	1(%r10),%r10
1580	xorl	%ecx,%eax
1581	movb	%al,-1(%rdi,%r10,1)
1582	decq	%rdx
1583	jnz	.Loop_tail4xop
1584
1585.Ldone4xop:
1586	vzeroupper
1587	leaq	(%r9),%rsp
1588.cfi_def_cfa_register	%rsp
1589.L4xop_epilogue:
1590	.byte	0xf3,0xc3
1591.cfi_endproc
1592.size	ChaCha20_4xop,.-ChaCha20_4xop
.type	ChaCha20_8x,@function
.align	32
/*
 * ChaCha20_8x -- AVX2 path: processes 8 ChaCha20 blocks (512 bytes) per
 * outer iteration by holding each of the 16 state words as an 8-lane
 * %ymm vector (one lane per block, counters 0..7 apart).
 *
 * Arguments (SysV AMD64, same as ChaCha20_ctr32):
 *   %rdi = out, %rsi = inp, %rdx = len, %rcx = key, %r8 = counter/nonce
 * %r9 saves the caller's %rsp (CFA register) across the aligned frame.
 * Stack frame: 0x280 bytes, 32-byte aligned; holds the pre-broadcast
 * state (addressed via %rcx-256 and %rax-512) plus four 32-byte spill
 * slots at 0/32/64/96(%rsp) -- only 16 %ymm registers are available for
 * 16 state vectors plus temporaries, so one row is always spilled.
 */
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper

	/*
	 * Broadcast the four 16-byte state rows (sigma constant, key lo,
	 * key hi, counter/nonce) into %ymm registers, then transpose each
	 * word into its own 8-lane vector and park the result in the
	 * frame.  %r10/%r11 point at the rot16/rot24 vpshufb tables used
	 * as the fast 16- and 8-bit rotates inside the rounds.
	 */


	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	/* .Lincy = {0,2,4,6,1,3,5,7}: gives each lane its own counter. */
	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	/* Reload the full saved state; bump all 8 counters by 8. */
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	/* Spill two of the x12..x15 row vectors; %eax = 10 double rounds. */
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

.align	32
/*
 * Main round loop: one iteration = one ChaCha double round (column +
 * diagonal quarter-rounds) over all 8 blocks.  Rotates by 16 and 8 are
 * done with vpshufb through the tables at (%r10)/(%r11); rotates by 12
 * and 7 with shift/shift/or.  %ymm14/%ymm15 double as the rot tables
 * and as temporaries, hence the repeated vbroadcasti128 reloads, and
 * the "d" row halves are swapped through 0/32/64/96(%rsp).
 */
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	/* Swap active d-row halves through the stack spill slots. */
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	/* Diagonal quarter-rounds follow (b/c/d operands rotated). */
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

	/*
	 * Rounds done.  Add the saved input state back to each vector,
	 * then transpose lane-per-block vectors into contiguous 64-byte
	 * keystream blocks with vpunpck{l,h}{dq,qdq} + vperm2i128.
	 */
	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	/* Full 512-byte chunk?  If not, fall into the tail dispatcher. */
	cmpq	$512,%rdx
	jb	.Ltail8x

	/* XOR keystream with input, 128 bytes at a time. */
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

/*
 * Tail handling: binary-search the remaining length into 64-byte
 * buckets.  Each case XORs/stores the whole 64-byte groups, then
 * parks the next two keystream vectors at 0/32(%rsp) so the final
 * byte-by-byte loop (.Loop_tail8x) can consume the partial block.
 */
.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

/* Byte loop for the final partial block; %r10 indexes, %rdx counts. */
.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	/* vzeroall clears all vector registers (key material hygiene). */
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
.type	ChaCha20_avx512,@function
.align	32
/*
 * ChaCha20_avx512 -- AVX-512F path for short inputs (len <= 512 bytes);
 * longer inputs tail-jump to the 16-block ChaCha20_16x routine.
 *
 * Strategy: one %zmm register holds FOUR copies of each 16-byte state
 * row (vbroadcasti32x4), so the round loop computes 4 ChaCha blocks at
 * once with plain vector ops; .Lzeroz gives lanes counters +0..+3 and
 * .Lfourz advances all four counters by 4 per outer iteration.
 * AVX-512 vprold replaces the shift/shuffle rotate idioms.
 *
 * In: %rdi=out, %rsi=inp, %rdx=len, %rcx=key, %r8=counter (then reused
 * as the round counter / tail index).  %r9 preserves caller %rsp.
 * zmm16..zmm20 persist the input state across the round loop.
 */
ChaCha20_avx512:
.cfi_startproc
.LChaCha20_avx512:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	cmpq	$512,%rdx
	ja	.LChaCha20_16x

	subq	$64+8,%rsp
	vbroadcasti32x4	.Lsigma(%rip),%zmm0
	vbroadcasti32x4	(%rcx),%zmm1
	vbroadcasti32x4	16(%rcx),%zmm2
	vbroadcasti32x4	(%r8),%zmm3

	vmovdqa32	%zmm0,%zmm16
	vmovdqa32	%zmm1,%zmm17
	vmovdqa32	%zmm2,%zmm18
	vpaddd	.Lzeroz(%rip),%zmm3,%zmm3
	vmovdqa32	.Lfourz(%rip),%zmm20
	movq	$10,%r8
	vmovdqa32	%zmm3,%zmm19
	jmp	.Loop_avx512

.align	16
.Loop_outer_avx512:
	/* Restore the a/b/c rows; bump all four lane counters by 4. */
	vmovdqa32	%zmm16,%zmm0
	vmovdqa32	%zmm17,%zmm1
	vmovdqa32	%zmm18,%zmm2
	vpaddd	%zmm20,%zmm19,%zmm3
	movq	$10,%r8
	vmovdqa32	%zmm3,%zmm19
	jmp	.Loop_avx512

.align	32
/*
 * One double round per iteration; %r8 counts 10 (20 rounds total).
 * vpshufd rotates the b/c/d rows within each 128-bit lane so the
 * diagonal quarter-round can reuse the column-round code shape.
 */
.Loop_avx512:
	vpaddd	%zmm1,%zmm0,%zmm0
	vpxord	%zmm0,%zmm3,%zmm3
	vprold	$16,%zmm3,%zmm3
	vpaddd	%zmm3,%zmm2,%zmm2
	vpxord	%zmm2,%zmm1,%zmm1
	vprold	$12,%zmm1,%zmm1
	vpaddd	%zmm1,%zmm0,%zmm0
	vpxord	%zmm0,%zmm3,%zmm3
	vprold	$8,%zmm3,%zmm3
	vpaddd	%zmm3,%zmm2,%zmm2
	vpxord	%zmm2,%zmm1,%zmm1
	vprold	$7,%zmm1,%zmm1
	vpshufd	$78,%zmm2,%zmm2
	vpshufd	$57,%zmm1,%zmm1
	vpshufd	$147,%zmm3,%zmm3
	vpaddd	%zmm1,%zmm0,%zmm0
	vpxord	%zmm0,%zmm3,%zmm3
	vprold	$16,%zmm3,%zmm3
	vpaddd	%zmm3,%zmm2,%zmm2
	vpxord	%zmm2,%zmm1,%zmm1
	vprold	$12,%zmm1,%zmm1
	vpaddd	%zmm1,%zmm0,%zmm0
	vpxord	%zmm0,%zmm3,%zmm3
	vprold	$8,%zmm3,%zmm3
	vpaddd	%zmm3,%zmm2,%zmm2
	vpxord	%zmm2,%zmm1,%zmm1
	vprold	$7,%zmm1,%zmm1
	vpshufd	$78,%zmm2,%zmm2
	vpshufd	$147,%zmm1,%zmm1
	vpshufd	$57,%zmm3,%zmm3
	decq	%r8
	jnz	.Loop_avx512
	/* Feed-forward: add the saved input state back in. */
	vpaddd	%zmm16,%zmm0,%zmm0
	vpaddd	%zmm17,%zmm1,%zmm1
	vpaddd	%zmm18,%zmm2,%zmm2
	vpaddd	%zmm19,%zmm3,%zmm3

	/*
	 * Emit up to four 64-byte blocks: block 0 lives in the low 128
	 * bits of zmm0..zmm3; blocks 1..3 are pulled out with
	 * vextracti32x4 as needed.  Each subq/jb pair detects a partial
	 * final block and diverts to the byte-wise tail.
	 */
	subq	$64,%rdx
	jb	.Ltail64_avx512

	vpxor	0(%rsi),%xmm0,%xmm4
	vpxor	16(%rsi),%xmm1,%xmm5
	vpxor	32(%rsi),%xmm2,%xmm6
	vpxor	48(%rsi),%xmm3,%xmm7
	leaq	64(%rsi),%rsi

	vmovdqu	%xmm4,0(%rdi)
	vmovdqu	%xmm5,16(%rdi)
	vmovdqu	%xmm6,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	leaq	64(%rdi),%rdi

	jz	.Ldone_avx512

	vextracti32x4	$1,%zmm0,%xmm4
	vextracti32x4	$1,%zmm1,%xmm5
	vextracti32x4	$1,%zmm2,%xmm6
	vextracti32x4	$1,%zmm3,%xmm7

	subq	$64,%rdx
	jb	.Ltail_avx512

	vpxor	0(%rsi),%xmm4,%xmm4
	vpxor	16(%rsi),%xmm5,%xmm5
	vpxor	32(%rsi),%xmm6,%xmm6
	vpxor	48(%rsi),%xmm7,%xmm7
	leaq	64(%rsi),%rsi

	vmovdqu	%xmm4,0(%rdi)
	vmovdqu	%xmm5,16(%rdi)
	vmovdqu	%xmm6,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	leaq	64(%rdi),%rdi

	jz	.Ldone_avx512

	vextracti32x4	$2,%zmm0,%xmm4
	vextracti32x4	$2,%zmm1,%xmm5
	vextracti32x4	$2,%zmm2,%xmm6
	vextracti32x4	$2,%zmm3,%xmm7

	subq	$64,%rdx
	jb	.Ltail_avx512

	vpxor	0(%rsi),%xmm4,%xmm4
	vpxor	16(%rsi),%xmm5,%xmm5
	vpxor	32(%rsi),%xmm6,%xmm6
	vpxor	48(%rsi),%xmm7,%xmm7
	leaq	64(%rsi),%rsi

	vmovdqu	%xmm4,0(%rdi)
	vmovdqu	%xmm5,16(%rdi)
	vmovdqu	%xmm6,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	leaq	64(%rdi),%rdi

	jz	.Ldone_avx512

	vextracti32x4	$3,%zmm0,%xmm4
	vextracti32x4	$3,%zmm1,%xmm5
	vextracti32x4	$3,%zmm2,%xmm6
	vextracti32x4	$3,%zmm3,%xmm7

	subq	$64,%rdx
	jb	.Ltail_avx512

	vpxor	0(%rsi),%xmm4,%xmm4
	vpxor	16(%rsi),%xmm5,%xmm5
	vpxor	32(%rsi),%xmm6,%xmm6
	vpxor	48(%rsi),%xmm7,%xmm7
	leaq	64(%rsi),%rsi

	vmovdqu	%xmm4,0(%rdi)
	vmovdqu	%xmm5,16(%rdi)
	vmovdqu	%xmm6,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	leaq	64(%rdi),%rdi

	jnz	.Loop_outer_avx512

	jmp	.Ldone_avx512

.align	16
.Ltail64_avx512:
	/* Partial first block: spill keystream block 0 to the stack. */
	vmovdqa	%xmm0,0(%rsp)
	vmovdqa	%xmm1,16(%rsp)
	vmovdqa	%xmm2,32(%rsp)
	vmovdqa	%xmm3,48(%rsp)
	addq	$64,%rdx
	jmp	.Loop_tail_avx512

.align	16
.Ltail_avx512:
	/* Partial later block: spill the extracted block to the stack. */
	vmovdqa	%xmm4,0(%rsp)
	vmovdqa	%xmm5,16(%rsp)
	vmovdqa	%xmm6,32(%rsp)
	vmovdqa	%xmm7,48(%rsp)
	addq	$64,%rdx

/* Byte-by-byte XOR of the spilled keystream; %r8 is the index. */
.Loop_tail_avx512:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_avx512

	/* Scrub leftover keystream bytes from the stack buffer by
	   overwriting with the public sigma constant. */
	vmovdqu32	%zmm16,0(%rsp)

.Ldone_avx512:
	/* vzeroall clears vector registers holding key material. */
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lavx512_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_avx512,.-ChaCha20_avx512
.type	ChaCha20_avx512vl,@function
.align	32
/*
 * ChaCha20_avx512vl -- AVX-512VL path for short inputs (len <= 128
 * bytes); longer inputs tail-jump to the 8-block ChaCha20_8xvl routine.
 *
 * Strategy: each %ymm register holds TWO copies of a 16-byte state row
 * (vbroadcasti128), so the round loop computes 2 ChaCha blocks at once;
 * .Lzeroz provides lane counters +0/+1 and .Ltwoy advances both by 2.
 * AVX-512VL vprold on %ymm does the 16/12/8/7-bit rotates directly.
 *
 * In: %rdi=out, %rsi=inp, %rdx=len, %rcx=key, %r8=counter (then reused
 * as round counter / tail index).  %r9 preserves caller %rsp.
 * ymm16..ymm20 persist the input state across the round loop.
 */
ChaCha20_avx512vl:
.cfi_startproc
.LChaCha20_avx512vl:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	cmpq	$128,%rdx
	ja	.LChaCha20_8xvl

	subq	$64+8,%rsp
	vbroadcasti128	.Lsigma(%rip),%ymm0
	vbroadcasti128	(%rcx),%ymm1
	vbroadcasti128	16(%rcx),%ymm2
	vbroadcasti128	(%r8),%ymm3

	vmovdqa32	%ymm0,%ymm16
	vmovdqa32	%ymm1,%ymm17
	vmovdqa32	%ymm2,%ymm18
	vpaddd	.Lzeroz(%rip),%ymm3,%ymm3
	vmovdqa32	.Ltwoy(%rip),%ymm20
	movq	$10,%r8
	vmovdqa32	%ymm3,%ymm19
	jmp	.Loop_avx512vl

.align	16
.Loop_outer_avx512vl:
	/* Only ymm2 is restored here; ymm0/ymm1 were already reloaded on
	   the main path just before the jnz that reaches this label. */
	vmovdqa32	%ymm18,%ymm2
	vpaddd	%ymm20,%ymm19,%ymm3
	movq	$10,%r8
	vmovdqa32	%ymm3,%ymm19
	jmp	.Loop_avx512vl

.align	32
/*
 * One double round per iteration; %r8 counts 10 (20 rounds).  vpshufd
 * rotates b/c/d within each 128-bit lane between the column and
 * diagonal halves, exactly as in the AVX-512F path above.
 */
.Loop_avx512vl:
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1
	vpshufd	$78,%ymm2,%ymm2
	vpshufd	$57,%ymm1,%ymm1
	vpshufd	$147,%ymm3,%ymm3
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1
	vpshufd	$78,%ymm2,%ymm2
	vpshufd	$147,%ymm1,%ymm1
	vpshufd	$57,%ymm3,%ymm3
	decq	%r8
	jnz	.Loop_avx512vl
	/* Feed-forward: add the saved input state back in. */
	vpaddd	%ymm16,%ymm0,%ymm0
	vpaddd	%ymm17,%ymm1,%ymm1
	vpaddd	%ymm18,%ymm2,%ymm2
	vpaddd	%ymm19,%ymm3,%ymm3

	/* Block 0 is the low 128 bits of ymm0..ymm3. */
	subq	$64,%rdx
	jb	.Ltail64_avx512vl

	vpxor	0(%rsi),%xmm0,%xmm4
	vpxor	16(%rsi),%xmm1,%xmm5
	vpxor	32(%rsi),%xmm2,%xmm6
	vpxor	48(%rsi),%xmm3,%xmm7
	leaq	64(%rsi),%rsi

	vmovdqu	%xmm4,0(%rdi)
	vmovdqu	%xmm5,16(%rdi)
	vmovdqu	%xmm6,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	leaq	64(%rdi),%rdi

	jz	.Ldone_avx512vl

	/* Block 1 is the high 128 bits. */
	vextracti128	$1,%ymm0,%xmm4
	vextracti128	$1,%ymm1,%xmm5
	vextracti128	$1,%ymm2,%xmm6
	vextracti128	$1,%ymm3,%xmm7

	subq	$64,%rdx
	jb	.Ltail_avx512vl

	vpxor	0(%rsi),%xmm4,%xmm4
	vpxor	16(%rsi),%xmm5,%xmm5
	vpxor	32(%rsi),%xmm6,%xmm6
	vpxor	48(%rsi),%xmm7,%xmm7
	leaq	64(%rsi),%rsi

	vmovdqu	%xmm4,0(%rdi)
	vmovdqu	%xmm5,16(%rdi)
	vmovdqu	%xmm6,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	leaq	64(%rdi),%rdi

	/* Reload a/b rows before a possible next outer iteration (the
	   outer label itself only restores ymm2; see above). */
	vmovdqa32	%ymm16,%ymm0
	vmovdqa32	%ymm17,%ymm1
	jnz	.Loop_outer_avx512vl

	jmp	.Ldone_avx512vl

.align	16
.Ltail64_avx512vl:
	/* Partial first block: spill keystream block 0 to the stack. */
	vmovdqa	%xmm0,0(%rsp)
	vmovdqa	%xmm1,16(%rsp)
	vmovdqa	%xmm2,32(%rsp)
	vmovdqa	%xmm3,48(%rsp)
	addq	$64,%rdx
	jmp	.Loop_tail_avx512vl

.align	16
.Ltail_avx512vl:
	/* Partial second block: spill the extracted block to the stack. */
	vmovdqa	%xmm4,0(%rsp)
	vmovdqa	%xmm5,16(%rsp)
	vmovdqa	%xmm6,32(%rsp)
	vmovdqa	%xmm7,48(%rsp)
	addq	$64,%rdx

/* Byte-by-byte XOR of the spilled keystream; %r8 is the index. */
.Loop_tail_avx512vl:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_avx512vl

	/* Scrub leftover keystream bytes from the 64-byte stack buffer
	   by overwriting with the public sigma constant (twice). */
	vmovdqu32	%ymm16,0(%rsp)
	vmovdqu32	%ymm16,32(%rsp)

.Ldone_avx512vl:
	/* vzeroall clears vector registers holding key material. */
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lavx512vl_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_avx512vl,.-ChaCha20_avx512vl
#-----------------------------------------------------------------------
# ChaCha20_16x — AVX-512F path: 16 ChaCha20 blocks (1024 bytes of
# keystream) per outer iteration, one block per 32-bit lane of a zmm
# register.
# ABI (SysV AMD64): %rdi = out, %rsi = inp, %rdx = len (bytes),
#                   %rcx = 256-bit key, %r8 = 16-byte counter||nonce.
# %r9 preserves the caller's %rsp; 64 bytes of 64-byte-aligned stack
# scratch are used only for a sub-64-byte tail.  Clobbers all vector
# registers (vzeroall on exit), %rax, %r10, flags, and advances
# %rsi/%rdi/%rdx.  Generated by perlasm (chacha-x86_64.pl).
#-----------------------------------------------------------------------
.type	ChaCha20_16x,@function
.align	32
ChaCha20_16x:
.cfi_startproc
.LChaCha20_16x:
	movq	%rsp,%r9		# remember caller's stack pointer
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp		# 64-byte tail scratch (+slack)
	andq	$-64,%rsp		# cache-line align the scratch
	vzeroupper

	leaq	.Lsigma(%rip),%r10
	# Load the four 128-bit state rows, then splat each 32-bit state
	# word across all 16 lanes (lane i = word for block i).  The
	# splatted input state is parked in zmm16-31 so it can be
	# re-added after the rounds.
	vbroadcasti32x4	(%r10),%zmm3	# "expand 32-byte k" constants
	vbroadcasti32x4	(%rcx),%zmm7	# key words 0-3
	vbroadcasti32x4	16(%rcx),%zmm11	# key words 4-7
	vbroadcasti32x4	(%r8),%zmm15	# counter + nonce

	vpshufd	$0x00,%zmm3,%zmm0
	vpshufd	$0x55,%zmm3,%zmm1
	vpshufd	$0xaa,%zmm3,%zmm2
	vpshufd	$0xff,%zmm3,%zmm3
	vmovdqa64	%zmm0,%zmm16
	vmovdqa64	%zmm1,%zmm17
	vmovdqa64	%zmm2,%zmm18
	vmovdqa64	%zmm3,%zmm19

	vpshufd	$0x00,%zmm7,%zmm4
	vpshufd	$0x55,%zmm7,%zmm5
	vpshufd	$0xaa,%zmm7,%zmm6
	vpshufd	$0xff,%zmm7,%zmm7
	vmovdqa64	%zmm4,%zmm20
	vmovdqa64	%zmm5,%zmm21
	vmovdqa64	%zmm6,%zmm22
	vmovdqa64	%zmm7,%zmm23

	vpshufd	$0x00,%zmm11,%zmm8
	vpshufd	$0x55,%zmm11,%zmm9
	vpshufd	$0xaa,%zmm11,%zmm10
	vpshufd	$0xff,%zmm11,%zmm11
	vmovdqa64	%zmm8,%zmm24
	vmovdqa64	%zmm9,%zmm25
	vmovdqa64	%zmm10,%zmm26
	vmovdqa64	%zmm11,%zmm27

	vpshufd	$0x00,%zmm15,%zmm12
	vpshufd	$0x55,%zmm15,%zmm13
	vpshufd	$0xaa,%zmm15,%zmm14
	vpshufd	$0xff,%zmm15,%zmm15
	vpaddd	.Lincz(%rip),%zmm12,%zmm12	# per-lane counters +0..+15
	vmovdqa64	%zmm12,%zmm28
	vmovdqa64	%zmm13,%zmm29
	vmovdqa64	%zmm14,%zmm30
	vmovdqa64	%zmm15,%zmm31

	movl	$10,%eax		# 10 double-rounds = 20 rounds
	jmp	.Loop16x

.align	32
.Loop_outer16x:
	# Refresh working state for the next batch of 16 blocks: the
	# sigma row is re-broadcast, the saved key/counter splats are
	# copied back, and the block counters advance by 16.
	vpbroadcastd	0(%r10),%zmm0
	vpbroadcastd	4(%r10),%zmm1
	vpbroadcastd	8(%r10),%zmm2
	vpbroadcastd	12(%r10),%zmm3
	vpaddd	.Lsixteen(%rip),%zmm28,%zmm28	# counter += 16
	vmovdqa64	%zmm20,%zmm4
	vmovdqa64	%zmm21,%zmm5
	vmovdqa64	%zmm22,%zmm6
	vmovdqa64	%zmm23,%zmm7
	vmovdqa64	%zmm24,%zmm8
	vmovdqa64	%zmm25,%zmm9
	vmovdqa64	%zmm26,%zmm10
	vmovdqa64	%zmm27,%zmm11
	vmovdqa64	%zmm28,%zmm12
	vmovdqa64	%zmm29,%zmm13
	vmovdqa64	%zmm30,%zmm14
	vmovdqa64	%zmm31,%zmm15

	vmovdqa64	%zmm0,%zmm16
	vmovdqa64	%zmm1,%zmm17
	vmovdqa64	%zmm2,%zmm18
	vmovdqa64	%zmm3,%zmm19

	movl	$10,%eax
	jmp	.Loop16x

# One iteration = one column round followed by one diagonal round
# (RFC 8439 quarter-rounds on a/b/c/d = zmm0-3/4-7/8-11/12-15), with
# the 16/12/8/7-bit rotates done via AVX-512 vprold.
.align	32
.Loop16x:
	vpaddd	%zmm4,%zmm0,%zmm0
	vpaddd	%zmm5,%zmm1,%zmm1
	vpaddd	%zmm6,%zmm2,%zmm2
	vpaddd	%zmm7,%zmm3,%zmm3
	vpxord	%zmm0,%zmm12,%zmm12
	vpxord	%zmm1,%zmm13,%zmm13
	vpxord	%zmm2,%zmm14,%zmm14
	vpxord	%zmm3,%zmm15,%zmm15
	vprold	$16,%zmm12,%zmm12
	vprold	$16,%zmm13,%zmm13
	vprold	$16,%zmm14,%zmm14
	vprold	$16,%zmm15,%zmm15
	vpaddd	%zmm12,%zmm8,%zmm8
	vpaddd	%zmm13,%zmm9,%zmm9
	vpaddd	%zmm14,%zmm10,%zmm10
	vpaddd	%zmm15,%zmm11,%zmm11
	vpxord	%zmm8,%zmm4,%zmm4
	vpxord	%zmm9,%zmm5,%zmm5
	vpxord	%zmm10,%zmm6,%zmm6
	vpxord	%zmm11,%zmm7,%zmm7
	vprold	$12,%zmm4,%zmm4
	vprold	$12,%zmm5,%zmm5
	vprold	$12,%zmm6,%zmm6
	vprold	$12,%zmm7,%zmm7
	vpaddd	%zmm4,%zmm0,%zmm0
	vpaddd	%zmm5,%zmm1,%zmm1
	vpaddd	%zmm6,%zmm2,%zmm2
	vpaddd	%zmm7,%zmm3,%zmm3
	vpxord	%zmm0,%zmm12,%zmm12
	vpxord	%zmm1,%zmm13,%zmm13
	vpxord	%zmm2,%zmm14,%zmm14
	vpxord	%zmm3,%zmm15,%zmm15
	vprold	$8,%zmm12,%zmm12
	vprold	$8,%zmm13,%zmm13
	vprold	$8,%zmm14,%zmm14
	vprold	$8,%zmm15,%zmm15
	vpaddd	%zmm12,%zmm8,%zmm8
	vpaddd	%zmm13,%zmm9,%zmm9
	vpaddd	%zmm14,%zmm10,%zmm10
	vpaddd	%zmm15,%zmm11,%zmm11
	vpxord	%zmm8,%zmm4,%zmm4
	vpxord	%zmm9,%zmm5,%zmm5
	vpxord	%zmm10,%zmm6,%zmm6
	vpxord	%zmm11,%zmm7,%zmm7
	vprold	$7,%zmm4,%zmm4
	vprold	$7,%zmm5,%zmm5
	vprold	$7,%zmm6,%zmm6
	vprold	$7,%zmm7,%zmm7
	# diagonal round: operand pairing is rotated instead of the data
	vpaddd	%zmm5,%zmm0,%zmm0
	vpaddd	%zmm6,%zmm1,%zmm1
	vpaddd	%zmm7,%zmm2,%zmm2
	vpaddd	%zmm4,%zmm3,%zmm3
	vpxord	%zmm0,%zmm15,%zmm15
	vpxord	%zmm1,%zmm12,%zmm12
	vpxord	%zmm2,%zmm13,%zmm13
	vpxord	%zmm3,%zmm14,%zmm14
	vprold	$16,%zmm15,%zmm15
	vprold	$16,%zmm12,%zmm12
	vprold	$16,%zmm13,%zmm13
	vprold	$16,%zmm14,%zmm14
	vpaddd	%zmm15,%zmm10,%zmm10
	vpaddd	%zmm12,%zmm11,%zmm11
	vpaddd	%zmm13,%zmm8,%zmm8
	vpaddd	%zmm14,%zmm9,%zmm9
	vpxord	%zmm10,%zmm5,%zmm5
	vpxord	%zmm11,%zmm6,%zmm6
	vpxord	%zmm8,%zmm7,%zmm7
	vpxord	%zmm9,%zmm4,%zmm4
	vprold	$12,%zmm5,%zmm5
	vprold	$12,%zmm6,%zmm6
	vprold	$12,%zmm7,%zmm7
	vprold	$12,%zmm4,%zmm4
	vpaddd	%zmm5,%zmm0,%zmm0
	vpaddd	%zmm6,%zmm1,%zmm1
	vpaddd	%zmm7,%zmm2,%zmm2
	vpaddd	%zmm4,%zmm3,%zmm3
	vpxord	%zmm0,%zmm15,%zmm15
	vpxord	%zmm1,%zmm12,%zmm12
	vpxord	%zmm2,%zmm13,%zmm13
	vpxord	%zmm3,%zmm14,%zmm14
	vprold	$8,%zmm15,%zmm15
	vprold	$8,%zmm12,%zmm12
	vprold	$8,%zmm13,%zmm13
	vprold	$8,%zmm14,%zmm14
	vpaddd	%zmm15,%zmm10,%zmm10
	vpaddd	%zmm12,%zmm11,%zmm11
	vpaddd	%zmm13,%zmm8,%zmm8
	vpaddd	%zmm14,%zmm9,%zmm9
	vpxord	%zmm10,%zmm5,%zmm5
	vpxord	%zmm11,%zmm6,%zmm6
	vpxord	%zmm8,%zmm7,%zmm7
	vpxord	%zmm9,%zmm4,%zmm4
	vprold	$7,%zmm5,%zmm5
	vprold	$7,%zmm6,%zmm6
	vprold	$7,%zmm7,%zmm7
	vprold	$7,%zmm4,%zmm4
	decl	%eax
	jnz	.Loop16x

	# Feed-forward: add the saved input state, then transpose the
	# lane-sliced layout back into 16 contiguous 64-byte blocks via
	# dword/qword unpacks followed by 128-bit-lane shuffles.
	vpaddd	%zmm16,%zmm0,%zmm0
	vpaddd	%zmm17,%zmm1,%zmm1
	vpaddd	%zmm18,%zmm2,%zmm2
	vpaddd	%zmm19,%zmm3,%zmm3

	vpunpckldq	%zmm1,%zmm0,%zmm18
	vpunpckldq	%zmm3,%zmm2,%zmm19
	vpunpckhdq	%zmm1,%zmm0,%zmm0
	vpunpckhdq	%zmm3,%zmm2,%zmm2
	vpunpcklqdq	%zmm19,%zmm18,%zmm1
	vpunpckhqdq	%zmm19,%zmm18,%zmm18
	vpunpcklqdq	%zmm2,%zmm0,%zmm3
	vpunpckhqdq	%zmm2,%zmm0,%zmm0
	vpaddd	%zmm20,%zmm4,%zmm4
	vpaddd	%zmm21,%zmm5,%zmm5
	vpaddd	%zmm22,%zmm6,%zmm6
	vpaddd	%zmm23,%zmm7,%zmm7

	vpunpckldq	%zmm5,%zmm4,%zmm2
	vpunpckldq	%zmm7,%zmm6,%zmm19
	vpunpckhdq	%zmm5,%zmm4,%zmm4
	vpunpckhdq	%zmm7,%zmm6,%zmm6
	vpunpcklqdq	%zmm19,%zmm2,%zmm5
	vpunpckhqdq	%zmm19,%zmm2,%zmm2
	vpunpcklqdq	%zmm6,%zmm4,%zmm7
	vpunpckhqdq	%zmm6,%zmm4,%zmm4
	vshufi32x4	$0x44,%zmm5,%zmm1,%zmm19
	vshufi32x4	$0xee,%zmm5,%zmm1,%zmm5
	vshufi32x4	$0x44,%zmm2,%zmm18,%zmm1
	vshufi32x4	$0xee,%zmm2,%zmm18,%zmm2
	vshufi32x4	$0x44,%zmm7,%zmm3,%zmm18
	vshufi32x4	$0xee,%zmm7,%zmm3,%zmm7
	vshufi32x4	$0x44,%zmm4,%zmm0,%zmm3
	vshufi32x4	$0xee,%zmm4,%zmm0,%zmm4
	vpaddd	%zmm24,%zmm8,%zmm8
	vpaddd	%zmm25,%zmm9,%zmm9
	vpaddd	%zmm26,%zmm10,%zmm10
	vpaddd	%zmm27,%zmm11,%zmm11

	vpunpckldq	%zmm9,%zmm8,%zmm6
	vpunpckldq	%zmm11,%zmm10,%zmm0
	vpunpckhdq	%zmm9,%zmm8,%zmm8
	vpunpckhdq	%zmm11,%zmm10,%zmm10
	vpunpcklqdq	%zmm0,%zmm6,%zmm9
	vpunpckhqdq	%zmm0,%zmm6,%zmm6
	vpunpcklqdq	%zmm10,%zmm8,%zmm11
	vpunpckhqdq	%zmm10,%zmm8,%zmm8
	vpaddd	%zmm28,%zmm12,%zmm12
	vpaddd	%zmm29,%zmm13,%zmm13
	vpaddd	%zmm30,%zmm14,%zmm14
	vpaddd	%zmm31,%zmm15,%zmm15

	vpunpckldq	%zmm13,%zmm12,%zmm10
	vpunpckldq	%zmm15,%zmm14,%zmm0
	vpunpckhdq	%zmm13,%zmm12,%zmm12
	vpunpckhdq	%zmm15,%zmm14,%zmm14
	vpunpcklqdq	%zmm0,%zmm10,%zmm13
	vpunpckhqdq	%zmm0,%zmm10,%zmm10
	vpunpcklqdq	%zmm14,%zmm12,%zmm15
	vpunpckhqdq	%zmm14,%zmm12,%zmm12
	vshufi32x4	$0x44,%zmm13,%zmm9,%zmm0
	vshufi32x4	$0xee,%zmm13,%zmm9,%zmm13
	vshufi32x4	$0x44,%zmm10,%zmm6,%zmm9
	vshufi32x4	$0xee,%zmm10,%zmm6,%zmm10
	vshufi32x4	$0x44,%zmm15,%zmm11,%zmm6
	vshufi32x4	$0xee,%zmm15,%zmm11,%zmm15
	vshufi32x4	$0x44,%zmm12,%zmm8,%zmm11
	vshufi32x4	$0xee,%zmm12,%zmm8,%zmm12
	vshufi32x4	$0x88,%zmm0,%zmm19,%zmm16
	vshufi32x4	$0xdd,%zmm0,%zmm19,%zmm19
	vshufi32x4	$0x88,%zmm13,%zmm5,%zmm0
	vshufi32x4	$0xdd,%zmm13,%zmm5,%zmm13
	vshufi32x4	$0x88,%zmm9,%zmm1,%zmm17
	vshufi32x4	$0xdd,%zmm9,%zmm1,%zmm1
	vshufi32x4	$0x88,%zmm10,%zmm2,%zmm9
	vshufi32x4	$0xdd,%zmm10,%zmm2,%zmm10
	vshufi32x4	$0x88,%zmm6,%zmm18,%zmm14
	vshufi32x4	$0xdd,%zmm6,%zmm18,%zmm18
	vshufi32x4	$0x88,%zmm15,%zmm7,%zmm6
	vshufi32x4	$0xdd,%zmm15,%zmm7,%zmm15
	vshufi32x4	$0x88,%zmm11,%zmm3,%zmm8
	vshufi32x4	$0xdd,%zmm11,%zmm3,%zmm3
	vshufi32x4	$0x88,%zmm12,%zmm4,%zmm11
	vshufi32x4	$0xdd,%zmm12,%zmm4,%zmm12
	cmpq	$1024,%rdx
	jb	.Ltail16x		# less than a full 16-block batch left

	# Fast path: XOR a full 1024 bytes of input with the keystream
	# and store, 4 x 64 bytes at a time, in block order.
	vpxord	0(%rsi),%zmm16,%zmm16
	vpxord	64(%rsi),%zmm17,%zmm17
	vpxord	128(%rsi),%zmm14,%zmm14
	vpxord	192(%rsi),%zmm8,%zmm8
	vmovdqu32	%zmm16,0(%rdi)
	vmovdqu32	%zmm17,64(%rdi)
	vmovdqu32	%zmm14,128(%rdi)
	vmovdqu32	%zmm8,192(%rdi)

	vpxord	256(%rsi),%zmm19,%zmm19
	vpxord	320(%rsi),%zmm1,%zmm1
	vpxord	384(%rsi),%zmm18,%zmm18
	vpxord	448(%rsi),%zmm3,%zmm3
	vmovdqu32	%zmm19,256(%rdi)
	vmovdqu32	%zmm1,320(%rdi)
	vmovdqu32	%zmm18,384(%rdi)
	vmovdqu32	%zmm3,448(%rdi)

	vpxord	512(%rsi),%zmm0,%zmm0
	vpxord	576(%rsi),%zmm9,%zmm9
	vpxord	640(%rsi),%zmm6,%zmm6
	vpxord	704(%rsi),%zmm11,%zmm11
	vmovdqu32	%zmm0,512(%rdi)
	vmovdqu32	%zmm9,576(%rdi)
	vmovdqu32	%zmm6,640(%rdi)
	vmovdqu32	%zmm11,704(%rdi)

	vpxord	768(%rsi),%zmm13,%zmm13
	vpxord	832(%rsi),%zmm10,%zmm10
	vpxord	896(%rsi),%zmm15,%zmm15
	vpxord	960(%rsi),%zmm12,%zmm12
	leaq	1024(%rsi),%rsi
	vmovdqu32	%zmm13,768(%rdi)
	vmovdqu32	%zmm10,832(%rdi)
	vmovdqu32	%zmm15,896(%rdi)
	vmovdqu32	%zmm12,960(%rdi)
	leaq	1024(%rdi),%rdi

	subq	$1024,%rdx
	jnz	.Loop_outer16x

	jmp	.Ldone16x

# Tail: consume whole 64-byte blocks one at a time, in the same block
# order as the fast path.  Each stanza XOR/stores one block, exits on
# an exact multiple of 64 (je), otherwise stages the NEXT block in
# zmm16 for the byte-wise loop below.  %rdi is turned into the
# out-minus-in delta so (%rdi,%rsi,1) addresses the output.
.align	32
.Ltail16x:
	xorq	%r10,%r10
	subq	%rsi,%rdi		# %rdi = out - in
	cmpq	$64,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm16,%zmm16
	vmovdqu32	%zmm16,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm17,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$128,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm17,%zmm17
	vmovdqu32	%zmm17,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm14,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$192,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm14,%zmm14
	vmovdqu32	%zmm14,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm8,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$256,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm8,%zmm8
	vmovdqu32	%zmm8,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm19,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$320,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm19,%zmm19
	vmovdqu32	%zmm19,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm1,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$384,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm1,%zmm1
	vmovdqu32	%zmm1,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm18,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$448,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm18,%zmm18
	vmovdqu32	%zmm18,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm3,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$512,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm3,%zmm3
	vmovdqu32	%zmm3,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm0,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$576,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm0,%zmm0
	vmovdqu32	%zmm0,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm9,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$640,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm9,%zmm9
	vmovdqu32	%zmm9,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm6,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$704,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm6,%zmm6
	vmovdqu32	%zmm6,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm11,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$768,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm11,%zmm11
	vmovdqu32	%zmm11,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm13,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$832,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm13,%zmm13
	vmovdqu32	%zmm13,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm10,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$896,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm10,%zmm10
	vmovdqu32	%zmm10,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm15,%zmm16
	leaq	64(%rsi),%rsi

	cmpq	$960,%rdx
	jb	.Less_than_64_16x
	vpxord	(%rsi),%zmm15,%zmm15
	vmovdqu32	%zmm15,(%rdi,%rsi,1)
	je	.Ldone16x
	vmovdqa32	%zmm12,%zmm16
	leaq	64(%rsi),%rsi

# Final partial block: spill the staged keystream block to the stack
# and XOR the remaining 1..63 bytes one at a time.
.Less_than_64_16x:
	vmovdqa32	%zmm16,0(%rsp)
	leaq	(%rdi,%rsi,1),%rdi
	andq	$63,%rdx

.Loop_tail16x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail16x

	vpxord	%zmm16,%zmm16,%zmm16
	vmovdqa32	%zmm16,0(%rsp)	# wipe keystream from the stack

.Ldone16x:
	vzeroall			# scrub all vector registers
	leaq	(%r9),%rsp		# restore caller's stack pointer
.cfi_def_cfa_register	%rsp
.L16x_epilogue:
	.byte	0xf3,0xc3		# repz ret
.cfi_endproc
.size	ChaCha20_16x,.-ChaCha20_16x
#-----------------------------------------------------------------------
# ChaCha20_8xvl — AVX-512VL path: 8 ChaCha20 blocks (512 bytes of
# keystream) per outer iteration, using 256-bit ymm registers with
# AVX-512 features (vprold, ymm16-31, vpxord/vmovdqu32 on the high
# register file).
# ABI (SysV AMD64): %rdi = out, %rsi = inp, %rdx = len (bytes),
#                   %rcx = 256-bit key, %r8 = 16-byte counter||nonce.
# %r9 preserves the caller's %rsp; 64 bytes of aligned stack scratch
# (two ymm slots) are used only for a sub-64-byte tail.  Clobbers all
# vector registers (vzeroall on exit), %rax, %r10, flags, and advances
# %rsi/%rdi/%rdx.  Generated by perlasm (chacha-x86_64.pl).
#-----------------------------------------------------------------------
.type	ChaCha20_8xvl,@function
.align	32
ChaCha20_8xvl:
.cfi_startproc
.LChaCha20_8xvl:
	movq	%rsp,%r9		# remember caller's stack pointer
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp		# 64-byte tail scratch (+slack)
	andq	$-64,%rsp		# cache-line align the scratch
	vzeroupper

	leaq	.Lsigma(%rip),%r10
	# Splat each 32-bit state word across 8 lanes (lane i = word for
	# block i); ymm16-31 keep the pristine input state for the
	# feed-forward after the rounds.
	vbroadcasti128	(%r10),%ymm3	# "expand 32-byte k" constants
	vbroadcasti128	(%rcx),%ymm7	# key words 0-3
	vbroadcasti128	16(%rcx),%ymm11	# key words 4-7
	vbroadcasti128	(%r8),%ymm15	# counter + nonce

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vpshufd	$0xaa,%ymm3,%ymm2
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpshufd	$0xaa,%ymm7,%ymm6
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vpshufd	$0xaa,%ymm11,%ymm10
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vpshufd	$0xaa,%ymm15,%ymm14
	vpshufd	$0xff,%ymm15,%ymm15
	vpaddd	.Lincy(%rip),%ymm12,%ymm12	# per-lane counters +0..+7
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31

	movl	$10,%eax		# 10 double-rounds = 20 rounds
	jmp	.Loop8xvl

.align	32
.Loop_outer8xvl:
	# ymm0/ymm1 (sigma words 0-1) were already re-broadcast at the
	# end of the bulk store path, so only words 2-3 are reloaded.


	vpbroadcastd	8(%r10),%ymm2
	vpbroadcastd	12(%r10),%ymm3
	vpaddd	.Leight(%rip),%ymm28,%ymm28	# counter += 8
	vmovdqa64	%ymm20,%ymm4
	vmovdqa64	%ymm21,%ymm5
	vmovdqa64	%ymm22,%ymm6
	vmovdqa64	%ymm23,%ymm7
	vmovdqa64	%ymm24,%ymm8
	vmovdqa64	%ymm25,%ymm9
	vmovdqa64	%ymm26,%ymm10
	vmovdqa64	%ymm27,%ymm11
	vmovdqa64	%ymm28,%ymm12
	vmovdqa64	%ymm29,%ymm13
	vmovdqa64	%ymm30,%ymm14
	vmovdqa64	%ymm31,%ymm15

	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19

	movl	$10,%eax
	jmp	.Loop8xvl

# One iteration = one column round followed by one diagonal round
# (RFC 8439 quarter-rounds on a/b/c/d = ymm0-3/4-7/8-11/12-15), with
# rotates done via AVX-512VL vprold.
.align	32
.Loop8xvl:
	vpaddd	%ymm4,%ymm0,%ymm0
	vpaddd	%ymm5,%ymm1,%ymm1
	vpaddd	%ymm6,%ymm2,%ymm2
	vpaddd	%ymm7,%ymm3,%ymm3
	vpxor	%ymm0,%ymm12,%ymm12
	vpxor	%ymm1,%ymm13,%ymm13
	vpxor	%ymm2,%ymm14,%ymm14
	vpxor	%ymm3,%ymm15,%ymm15
	vprold	$16,%ymm12,%ymm12
	vprold	$16,%ymm13,%ymm13
	vprold	$16,%ymm14,%ymm14
	vprold	$16,%ymm15,%ymm15
	vpaddd	%ymm12,%ymm8,%ymm8
	vpaddd	%ymm13,%ymm9,%ymm9
	vpaddd	%ymm14,%ymm10,%ymm10
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm8,%ymm4,%ymm4
	vpxor	%ymm9,%ymm5,%ymm5
	vpxor	%ymm10,%ymm6,%ymm6
	vpxor	%ymm11,%ymm7,%ymm7
	vprold	$12,%ymm4,%ymm4
	vprold	$12,%ymm5,%ymm5
	vprold	$12,%ymm6,%ymm6
	vprold	$12,%ymm7,%ymm7
	vpaddd	%ymm4,%ymm0,%ymm0
	vpaddd	%ymm5,%ymm1,%ymm1
	vpaddd	%ymm6,%ymm2,%ymm2
	vpaddd	%ymm7,%ymm3,%ymm3
	vpxor	%ymm0,%ymm12,%ymm12
	vpxor	%ymm1,%ymm13,%ymm13
	vpxor	%ymm2,%ymm14,%ymm14
	vpxor	%ymm3,%ymm15,%ymm15
	vprold	$8,%ymm12,%ymm12
	vprold	$8,%ymm13,%ymm13
	vprold	$8,%ymm14,%ymm14
	vprold	$8,%ymm15,%ymm15
	vpaddd	%ymm12,%ymm8,%ymm8
	vpaddd	%ymm13,%ymm9,%ymm9
	vpaddd	%ymm14,%ymm10,%ymm10
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm8,%ymm4,%ymm4
	vpxor	%ymm9,%ymm5,%ymm5
	vpxor	%ymm10,%ymm6,%ymm6
	vpxor	%ymm11,%ymm7,%ymm7
	vprold	$7,%ymm4,%ymm4
	vprold	$7,%ymm5,%ymm5
	vprold	$7,%ymm6,%ymm6
	vprold	$7,%ymm7,%ymm7
	# diagonal round: operand pairing is rotated instead of the data
	vpaddd	%ymm5,%ymm0,%ymm0
	vpaddd	%ymm6,%ymm1,%ymm1
	vpaddd	%ymm7,%ymm2,%ymm2
	vpaddd	%ymm4,%ymm3,%ymm3
	vpxor	%ymm0,%ymm15,%ymm15
	vpxor	%ymm1,%ymm12,%ymm12
	vpxor	%ymm2,%ymm13,%ymm13
	vpxor	%ymm3,%ymm14,%ymm14
	vprold	$16,%ymm15,%ymm15
	vprold	$16,%ymm12,%ymm12
	vprold	$16,%ymm13,%ymm13
	vprold	$16,%ymm14,%ymm14
	vpaddd	%ymm15,%ymm10,%ymm10
	vpaddd	%ymm12,%ymm11,%ymm11
	vpaddd	%ymm13,%ymm8,%ymm8
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm10,%ymm5,%ymm5
	vpxor	%ymm11,%ymm6,%ymm6
	vpxor	%ymm8,%ymm7,%ymm7
	vpxor	%ymm9,%ymm4,%ymm4
	vprold	$12,%ymm5,%ymm5
	vprold	$12,%ymm6,%ymm6
	vprold	$12,%ymm7,%ymm7
	vprold	$12,%ymm4,%ymm4
	vpaddd	%ymm5,%ymm0,%ymm0
	vpaddd	%ymm6,%ymm1,%ymm1
	vpaddd	%ymm7,%ymm2,%ymm2
	vpaddd	%ymm4,%ymm3,%ymm3
	vpxor	%ymm0,%ymm15,%ymm15
	vpxor	%ymm1,%ymm12,%ymm12
	vpxor	%ymm2,%ymm13,%ymm13
	vpxor	%ymm3,%ymm14,%ymm14
	vprold	$8,%ymm15,%ymm15
	vprold	$8,%ymm12,%ymm12
	vprold	$8,%ymm13,%ymm13
	vprold	$8,%ymm14,%ymm14
	vpaddd	%ymm15,%ymm10,%ymm10
	vpaddd	%ymm12,%ymm11,%ymm11
	vpaddd	%ymm13,%ymm8,%ymm8
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm10,%ymm5,%ymm5
	vpxor	%ymm11,%ymm6,%ymm6
	vpxor	%ymm8,%ymm7,%ymm7
	vpxor	%ymm9,%ymm4,%ymm4
	vprold	$7,%ymm5,%ymm5
	vprold	$7,%ymm6,%ymm6
	vprold	$7,%ymm7,%ymm7
	vprold	$7,%ymm4,%ymm4
	decl	%eax
	jnz	.Loop8xvl

	# Feed-forward: add the saved input state, then transpose the
	# lane-sliced layout back into 8 contiguous 64-byte blocks via
	# dword/qword unpacks and 128-bit-lane shuffles/permutes.
	vpaddd	%ymm16,%ymm0,%ymm0
	vpaddd	%ymm17,%ymm1,%ymm1
	vpaddd	%ymm18,%ymm2,%ymm2
	vpaddd	%ymm19,%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm18
	vpunpckldq	%ymm3,%ymm2,%ymm19
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm19,%ymm18,%ymm1
	vpunpckhqdq	%ymm19,%ymm18,%ymm18
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vpaddd	%ymm20,%ymm4,%ymm4
	vpaddd	%ymm21,%ymm5,%ymm5
	vpaddd	%ymm22,%ymm6,%ymm6
	vpaddd	%ymm23,%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm2
	vpunpckldq	%ymm7,%ymm6,%ymm19
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm19,%ymm2,%ymm5
	vpunpckhqdq	%ymm19,%ymm2,%ymm2
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vshufi32x4	$0,%ymm5,%ymm1,%ymm19
	vshufi32x4	$3,%ymm5,%ymm1,%ymm5
	vshufi32x4	$0,%ymm2,%ymm18,%ymm1
	vshufi32x4	$3,%ymm2,%ymm18,%ymm2
	vshufi32x4	$0,%ymm7,%ymm3,%ymm18
	vshufi32x4	$3,%ymm7,%ymm3,%ymm7
	vshufi32x4	$0,%ymm4,%ymm0,%ymm3
	vshufi32x4	$3,%ymm4,%ymm0,%ymm4
	vpaddd	%ymm24,%ymm8,%ymm8
	vpaddd	%ymm25,%ymm9,%ymm9
	vpaddd	%ymm26,%ymm10,%ymm10
	vpaddd	%ymm27,%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm6
	vpunpckldq	%ymm11,%ymm10,%ymm0
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm0,%ymm6,%ymm9
	vpunpckhqdq	%ymm0,%ymm6,%ymm6
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	%ymm28,%ymm12,%ymm12
	vpaddd	%ymm29,%ymm13,%ymm13
	vpaddd	%ymm30,%ymm14,%ymm14
	vpaddd	%ymm31,%ymm15,%ymm15

	vpunpckldq	%ymm13,%ymm12,%ymm10
	vpunpckldq	%ymm15,%ymm14,%ymm0
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm0,%ymm10,%ymm13
	vpunpckhqdq	%ymm0,%ymm10,%ymm10
	vpunpcklqdq	%ymm14,%ymm12,%ymm15
	vpunpckhqdq	%ymm14,%ymm12,%ymm12
	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
	vperm2i128	$0x20,%ymm10,%ymm6,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm6,%ymm10
	vperm2i128	$0x20,%ymm15,%ymm11,%ymm6
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
	vperm2i128	$0x20,%ymm12,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
	cmpq	$512,%rdx
	jb	.Ltail8xvl		# less than a full 8-block batch left

	# Fast path: XOR a full 512 bytes, 128 bytes per group.  Mixed
	# vpxord/vmovdqu32 vs vpxor/vmovdqu forms are equivalent here;
	# the EVEX forms are required for ymm16-19.
	movl	$0x80,%eax
	vpxord	0(%rsi),%ymm19,%ymm19
	vpxor	32(%rsi),%ymm0,%ymm0
	vpxor	64(%rsi),%ymm5,%ymm5
	vpxor	96(%rsi),%ymm13,%ymm13
	leaq	(%rsi,%rax,1),%rsi
	vmovdqu32	%ymm19,0(%rdi)
	vmovdqu	%ymm0,32(%rdi)
	vmovdqu	%ymm5,64(%rdi)
	vmovdqu	%ymm13,96(%rdi)
	leaq	(%rdi,%rax,1),%rdi

	vpxor	0(%rsi),%ymm1,%ymm1
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm2,%ymm2
	vpxor	96(%rsi),%ymm10,%ymm10
	leaq	(%rsi,%rax,1),%rsi
	vmovdqu	%ymm1,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm2,64(%rdi)
	vmovdqu	%ymm10,96(%rdi)
	leaq	(%rdi,%rax,1),%rdi

	vpxord	0(%rsi),%ymm18,%ymm18
	vpxor	32(%rsi),%ymm6,%ymm6
	vpxor	64(%rsi),%ymm7,%ymm7
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	(%rsi,%rax,1),%rsi
	vmovdqu32	%ymm18,0(%rdi)
	vmovdqu	%ymm6,32(%rdi)
	vmovdqu	%ymm7,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	(%rdi,%rax,1),%rdi

	vpxor	0(%rsi),%ymm3,%ymm3
	vpxor	32(%rsi),%ymm11,%ymm11
	vpxor	64(%rsi),%ymm4,%ymm4
	vpxor	96(%rsi),%ymm12,%ymm12
	leaq	(%rsi,%rax,1),%rsi
	vmovdqu	%ymm3,0(%rdi)
	vmovdqu	%ymm11,32(%rdi)
	vmovdqu	%ymm4,64(%rdi)
	vmovdqu	%ymm12,96(%rdi)
	leaq	(%rdi,%rax,1),%rdi

	# pre-load sigma words 0-1 for .Loop_outer8xvl (words 2-3 are
	# reloaded there)
	vpbroadcastd	0(%r10),%ymm0
	vpbroadcastd	4(%r10),%ymm1

	subq	$512,%rdx
	jnz	.Loop_outer8xvl

	jmp	.Ldone8xvl

# Tail: consume whole 64-byte blocks (two ymm each), in output order.
# Each stanza XOR/stores one block, exits on an exact multiple of 64
# (je), otherwise stages the NEXT block in ymm8/ymm0 for the byte-wise
# loop below.  %rdi is turned into the out-minus-in delta so
# (%rdi,%rsi,1) addresses the output.
.align	32
.Ltail8xvl:
	vmovdqa64	%ymm19,%ymm8	# stage first half-block in ymm8/ymm0
	xorq	%r10,%r10
	subq	%rsi,%rdi		# %rdi = out - in
	cmpq	$64,%rdx
	jb	.Less_than_64_8xvl
	vpxor	0(%rsi),%ymm8,%ymm8
	vpxor	32(%rsi),%ymm0,%ymm0
	vmovdqu	%ymm8,0(%rdi,%rsi,1)
	vmovdqu	%ymm0,32(%rdi,%rsi,1)
	je	.Ldone8xvl
	vmovdqa	%ymm5,%ymm8
	vmovdqa	%ymm13,%ymm0
	leaq	64(%rsi),%rsi

	cmpq	$128,%rdx
	jb	.Less_than_64_8xvl
	vpxor	0(%rsi),%ymm5,%ymm5
	vpxor	32(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm5,0(%rdi,%rsi,1)
	vmovdqu	%ymm13,32(%rdi,%rsi,1)
	je	.Ldone8xvl
	vmovdqa	%ymm1,%ymm8
	vmovdqa	%ymm9,%ymm0
	leaq	64(%rsi),%rsi

	cmpq	$192,%rdx
	jb	.Less_than_64_8xvl
	vpxor	0(%rsi),%ymm1,%ymm1
	vpxor	32(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm1,0(%rdi,%rsi,1)
	vmovdqu	%ymm9,32(%rdi,%rsi,1)
	je	.Ldone8xvl
	vmovdqa	%ymm2,%ymm8
	vmovdqa	%ymm10,%ymm0
	leaq	64(%rsi),%rsi

	cmpq	$256,%rdx
	jb	.Less_than_64_8xvl
	vpxor	0(%rsi),%ymm2,%ymm2
	vpxor	32(%rsi),%ymm10,%ymm10
	vmovdqu	%ymm2,0(%rdi,%rsi,1)
	vmovdqu	%ymm10,32(%rdi,%rsi,1)
	je	.Ldone8xvl
	vmovdqa32	%ymm18,%ymm8	# EVEX form needed: source is ymm18
	vmovdqa	%ymm6,%ymm0
	leaq	64(%rsi),%rsi

	cmpq	$320,%rdx
	jb	.Less_than_64_8xvl
	vpxord	0(%rsi),%ymm18,%ymm18
	vpxor	32(%rsi),%ymm6,%ymm6
	vmovdqu32	%ymm18,0(%rdi,%rsi,1)
	vmovdqu	%ymm6,32(%rdi,%rsi,1)
	je	.Ldone8xvl
	vmovdqa	%ymm7,%ymm8
	vmovdqa	%ymm15,%ymm0
	leaq	64(%rsi),%rsi

	cmpq	$384,%rdx
	jb	.Less_than_64_8xvl
	vpxor	0(%rsi),%ymm7,%ymm7
	vpxor	32(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm7,0(%rdi,%rsi,1)
	vmovdqu	%ymm15,32(%rdi,%rsi,1)
	je	.Ldone8xvl
	vmovdqa	%ymm3,%ymm8
	vmovdqa	%ymm11,%ymm0
	leaq	64(%rsi),%rsi

	cmpq	$448,%rdx
	jb	.Less_than_64_8xvl
	vpxor	0(%rsi),%ymm3,%ymm3
	vpxor	32(%rsi),%ymm11,%ymm11
	vmovdqu	%ymm3,0(%rdi,%rsi,1)
	vmovdqu	%ymm11,32(%rdi,%rsi,1)
	je	.Ldone8xvl
	vmovdqa	%ymm4,%ymm8
	vmovdqa	%ymm12,%ymm0
	leaq	64(%rsi),%rsi

# Final partial block: spill the staged keystream block (two ymm) to
# the stack and XOR the remaining 1..63 bytes one at a time.
.Less_than_64_8xvl:
	vmovdqa	%ymm8,0(%rsp)
	vmovdqa	%ymm0,32(%rsp)
	leaq	(%rdi,%rsi,1),%rdi
	andq	$63,%rdx

.Loop_tail8xvl:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8xvl

	vpxor	%ymm8,%ymm8,%ymm8
	vmovdqa	%ymm8,0(%rsp)	# wipe keystream from the stack
	vmovdqa	%ymm8,32(%rsp)

.Ldone8xvl:
	vzeroall			# scrub all vector registers
	leaq	(%r9),%rsp		# restore caller's stack pointer
.cfi_def_cfa_register	%rsp
.L8xvl_epilogue:
	.byte	0xf3,0xc3		# repz ret
.cfi_endproc
.size	ChaCha20_8xvl,.-ChaCha20_8xvl
3430