xref: /netbsd-src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/chacha-armv8.S (revision c38e7cc395b1472a774ff828e46123de44c628e9)
1#include "arm_arch.h"
2
3.text
4
5
6
// Constant pool referenced PC-relatively (adr) by the routines in this file.
// .align 5 places it on a 32-byte boundary.
7.align	5
8.Lsigma:
// The 16 ASCII bytes "expand 32-byte k", stored as two 64-bit values; they are
// loaded as the first four 32-bit words of every block state.
9.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
10.Lone:
// Vector {1,0,0,0}. The NEON routines load it right after .Lsigma (post-indexed
// ld1) and add it to a counter row so that only lane 0 is stepped.
11.long	1,0,0,0
12.LOPENSSL_armcap_P:
// Distance from this slot to the symbol OPENSSL_armcap_P ("." is the slot's own
// address). ChaCha20_ctr32 adds the slot address back to reach the variable
// without an absolute relocation; the slot is 32 bits wide under __ILP32__ and
// 64 bits wide otherwise, matching the ldrsw/ldr that reads it.
13#ifdef	__ILP32__
14.long	OPENSSL_armcap_P-.
15#else
16.quad	OPENSSL_armcap_P-.
17#endif
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
18.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
19.align	2
20
// ChaCha20_ctr32 -- XOR a buffer with a ChaCha20 keystream, one 64-byte block
// at a time, using general-purpose registers only.
//
// Register roles on entry, as read off the code below:
//   x0 = output pointer (written 64 bytes per block, then byte-wise for a tail)
//   x1 = input pointer  (read the same way)
//   x2 = byte count; zero returns immediately
//   x3 = pointer to 32 bytes loaded as the key (state words 4..11)
//   x4 = pointer to 16 bytes loaded as the counter row (state words 12..15)
// NOTE(review): the C prototype is presumably
//   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
//                       const unsigned int key[8], const unsigned int counter[4]);
// confirm against the caller's declaration.
//
// Dispatch: when the count is at least 192 and the word at OPENSSL_armcap_P has
// the ARMV7_NEON bit set, control transfers to ChaCha20_neon with x0-x4 intact.
//
// Stack: a 96-byte frame holding x29/x30 and x19-x28, plus 64 bytes of scratch
// below it that receives the keystream of a final partial block and is zeroed
// again before returning.
//
// Register allocation inside the block loop:
//   x22-x28,x30  the input state, two 32-bit words packed per register
//                (x30 is free for data because it is reloaded in the epilogue)
//   w5-w17,w19-w21  the 16 working words; x18 is left untouched
//   x4           double-round counter (the counter pointer is dead by then)
21.globl	ChaCha20_ctr32
22.type	ChaCha20_ctr32,%function
23.align	5
24ChaCha20_ctr32:
25	cbz	x2,.Labort		// zero length: return without touching the stack
26	adr	x5,.LOPENSSL_armcap_P
27	cmp	x2,#192
28	b.lo	.Lshort			// fewer than 192 bytes: always take the scalar code
29#ifdef	__ILP32__
30	ldrsw	x6,[x5]
31#else
32	ldr	x6,[x5]
33#endif
34	ldr	w17,[x6,x5]		// slot address + stored offset = &OPENSSL_armcap_P
35	tst	w17,#ARMV7_NEON
36	b.ne	ChaCha20_neon		// bit set: continue in ChaCha20_neon (no return here)
37

38.Lshort:
39	stp	x29,x30,[sp,#-96]!	// open the 96-byte frame
40	add	x29,sp,#0		// x29 = frame base, used by the epilogue restores
41

42	adr	x5,.Lsigma
43	stp	x19,x20,[sp,#16]
44	stp	x21,x22,[sp,#32]
45	stp	x23,x24,[sp,#48]
46	stp	x25,x26,[sp,#64]
47	stp	x27,x28,[sp,#80]
48	sub	sp,sp,#64		// 64-byte scratch for a partial block's keystream
49

50	ldp	x22,x23,[x5]		// load sigma
51	ldp	x24,x25,[x3]		// load key
52	ldp	x26,x27,[x3,#16]
53	ldp	x28,x30,[x4]		// load counter
// Big-endian builds: swap the two 32-bit halves of each loaded register so the
// first word in memory ends up in the low half, as the unpack below expects.
54#ifdef	__ARMEB__
55	ror	x24,x24,#32
56	ror	x25,x25,#32
57	ror	x26,x26,#32
58	ror	x27,x27,#32
59	ror	x28,x28,#32
60	ror	x30,x30,#32
61#endif
62

// One pass of .Loop_outer produces one 64-byte block.
63.Loop_outer:
// Split each packed register into two working words: the w-register mov takes
// the low half (and clears the upper 32 bits), lsr #32 takes the high half.
64	mov	w5,w22			// unpack key block
65	lsr	x6,x22,#32
66	mov	w7,w23
67	lsr	x8,x23,#32
68	mov	w9,w24
69	lsr	x10,x24,#32
70	mov	w11,w25
71	lsr	x12,x25,#32
72	mov	w13,w26
73	lsr	x14,x26,#32
74	mov	w15,w27
75	lsr	x16,x27,#32
76	mov	w17,w28
77	lsr	x19,x28,#32
78	mov	w20,w30
79	lsr	x21,x30,#32
80

81	mov	x4,#10			// 10 iterations, each a column pass plus a diagonal pass
// The flags set by this subs are consumed by "b.lo .Ltail" and "b.hi .Loop_outer"
// further down; nothing in between writes the flags, so do not insert
// flag-setting instructions into the round loop or the output code.
82	subs	x2,x2,#64
83.Loop:
84	sub	x4,x4,#1
// Column pass: four quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
// (w7,w11,w15,w20), (w8,w12,w16,w21), interleaved one step at a time.
// Each "ror #n" is a left-rotate by 32-n: 16, 12, 8, 7.
85	add	w5,w5,w9
86	add	w6,w6,w10
87	add	w7,w7,w11
88	add	w8,w8,w12
89	eor	w17,w17,w5
90	eor	w19,w19,w6
91	eor	w20,w20,w7
92	eor	w21,w21,w8
93	ror	w17,w17,#16
94	ror	w19,w19,#16
95	ror	w20,w20,#16
96	ror	w21,w21,#16
97	add	w13,w13,w17
98	add	w14,w14,w19
99	add	w15,w15,w20
100	add	w16,w16,w21
101	eor	w9,w9,w13
102	eor	w10,w10,w14
103	eor	w11,w11,w15
104	eor	w12,w12,w16
105	ror	w9,w9,#20
106	ror	w10,w10,#20
107	ror	w11,w11,#20
108	ror	w12,w12,#20
109	add	w5,w5,w9
110	add	w6,w6,w10
111	add	w7,w7,w11
112	add	w8,w8,w12
113	eor	w17,w17,w5
114	eor	w19,w19,w6
115	eor	w20,w20,w7
116	eor	w21,w21,w8
117	ror	w17,w17,#24
118	ror	w19,w19,#24
119	ror	w20,w20,#24
120	ror	w21,w21,#24
121	add	w13,w13,w17
122	add	w14,w14,w19
123	add	w15,w15,w20
124	add	w16,w16,w21
125	eor	w9,w9,w13
126	eor	w10,w10,w14
127	eor	w11,w11,w15
128	eor	w12,w12,w16
129	ror	w9,w9,#25
130	ror	w10,w10,#25
131	ror	w11,w11,#25
132	ror	w12,w12,#25
// Diagonal pass: the same quarter-round on (w5,w10,w15,w21), (w6,w11,w16,w17),
// (w7,w12,w13,w19), (w8,w9,w14,w20). The diagonals are selected purely by
// register choice; no data is moved between passes.
133	add	w5,w5,w10
134	add	w6,w6,w11
135	add	w7,w7,w12
136	add	w8,w8,w9
137	eor	w21,w21,w5
138	eor	w17,w17,w6
139	eor	w19,w19,w7
140	eor	w20,w20,w8
141	ror	w21,w21,#16
142	ror	w17,w17,#16
143	ror	w19,w19,#16
144	ror	w20,w20,#16
145	add	w15,w15,w21
146	add	w16,w16,w17
147	add	w13,w13,w19
148	add	w14,w14,w20
149	eor	w10,w10,w15
150	eor	w11,w11,w16
151	eor	w12,w12,w13
152	eor	w9,w9,w14
153	ror	w10,w10,#20
154	ror	w11,w11,#20
155	ror	w12,w12,#20
156	ror	w9,w9,#20
157	add	w5,w5,w10
158	add	w6,w6,w11
159	add	w7,w7,w12
160	add	w8,w8,w9
161	eor	w21,w21,w5
162	eor	w17,w17,w6
163	eor	w19,w19,w7
164	eor	w20,w20,w8
165	ror	w21,w21,#24
166	ror	w17,w17,#24
167	ror	w19,w19,#24
168	ror	w20,w20,#24
169	add	w15,w15,w21
170	add	w16,w16,w17
171	add	w13,w13,w19
172	add	w14,w14,w20
173	eor	w10,w10,w15
174	eor	w11,w11,w16
175	eor	w12,w12,w13
176	eor	w9,w9,w14
177	ror	w10,w10,#25
178	ror	w11,w11,#25
179	ror	w12,w12,#25
180	ror	w9,w9,#25
181	cbnz	x4,.Loop
182

// Add the input state back into the working words. The odd-numbered words use a
// 64-bit add with the high half of the packed register; any carry into bit 32
// is shifted out by the "lsl #32" when the pair is packed.
183	add	w5,w5,w22		// accumulate key block
184	add	x6,x6,x22,lsr#32
185	add	w7,w7,w23
186	add	x8,x8,x23,lsr#32
187	add	w9,w9,w24
188	add	x10,x10,x24,lsr#32
189	add	w11,w11,w25
190	add	x12,x12,x25,lsr#32
191	add	w13,w13,w26
192	add	x14,x14,x26,lsr#32
193	add	w15,w15,w27
194	add	x16,x16,x27,lsr#32
195	add	w17,w17,w28
196	add	x19,x19,x28,lsr#32
197	add	w20,w20,w30
198	add	x21,x21,x30,lsr#32
199

200	b.lo	.Ltail			// flags from subs: fewer than 64 bytes were left
201

// Full block: pack word pairs back into 64-bit registers while loading 64 input
// bytes into the registers that have just been freed.
202	add	x5,x5,x6,lsl#32	// pack
203	add	x7,x7,x8,lsl#32
204	ldp	x6,x8,[x1,#0]		// load input
205	add	x9,x9,x10,lsl#32
206	add	x11,x11,x12,lsl#32
207	ldp	x10,x12,[x1,#16]
208	add	x13,x13,x14,lsl#32
209	add	x15,x15,x16,lsl#32
210	ldp	x14,x16,[x1,#32]
211	add	x17,x17,x19,lsl#32
212	add	x20,x20,x21,lsl#32
213	ldp	x19,x21,[x1,#48]
214	add	x1,x1,#64
// Big-endian builds: byte-reverse the packed keystream before it is XORed with
// the natively loaded input.
215#ifdef	__ARMEB__
216	rev	x5,x5
217	rev	x7,x7
218	rev	x9,x9
219	rev	x11,x11
220	rev	x13,x13
221	rev	x15,x15
222	rev	x17,x17
223	rev	x20,x20
224#endif
225	eor	x5,x5,x6
226	eor	x7,x7,x8
227	eor	x9,x9,x10
228	eor	x11,x11,x12
229	eor	x13,x13,x14
230	eor	x15,x15,x16
231	eor	x17,x17,x19
232	eor	x20,x20,x21
233

234	stp	x5,x7,[x0,#0]		// store output
// 64-bit add on the packed register: the counter word lives in the low half of
// x28, so a wrap of that word would carry into the word held in the upper half.
235	add	x28,x28,#1			// increment counter
236	stp	x9,x11,[x0,#16]
237	stp	x13,x15,[x0,#32]
238	stp	x17,x20,[x0,#48]
239	add	x0,x0,#64
240

241	b.hi	.Loop_outer		// flags from subs: more input remains
242

// Exact multiple of 64 bytes: restore callee-saved registers via x29, drop the
// scratch area, then pop the frame. The scratch was never written on this path.
243	ldp	x19,x20,[x29,#16]
244	add	sp,sp,#64
245	ldp	x21,x22,[x29,#32]
246	ldp	x23,x24,[x29,#48]
247	ldp	x25,x26,[x29,#64]
248	ldp	x27,x28,[x29,#80]
249	ldp	x29,x30,[sp],#96
250.Labort:
251	ret
252

253.align	4
254.Ltail:
255	add	x2,x2,#64		// undo the subs: x2 = bytes left, 1..63
// .Less_than_64 is also entered from .Ltail_neon in ChaCha20_neon, which uses
// the same frame layout and keeps its scalar block in the same w5..w21.
// Expects: x2 = bytes left (below 64), x0/x1 at the partial block, and the
// accumulated (not yet packed) block in the working words.
256.Less_than_64:
// Bias the pointers to the end of the data and walk a negative index up to
// zero: x1 = inp+n, x4 = scratch+n, x2 = -n. x0 = out+n-1 because the store in
// the loop happens after the index has already been incremented.
257	sub	x0,x0,#1
258	add	x1,x1,x2
259	add	x0,x0,x2
260	add	x4,sp,x2
261	neg	x2,x2
262

263	add	x5,x5,x6,lsl#32	// pack
264	add	x7,x7,x8,lsl#32
265	add	x9,x9,x10,lsl#32
266	add	x11,x11,x12,lsl#32
267	add	x13,x13,x14,lsl#32
268	add	x15,x15,x16,lsl#32
269	add	x17,x17,x19,lsl#32
270	add	x20,x20,x21,lsl#32
271#ifdef	__ARMEB__
272	rev	x5,x5
273	rev	x7,x7
274	rev	x9,x9
275	rev	x11,x11
276	rev	x13,x13
277	rev	x15,x15
278	rev	x17,x17
279	rev	x20,x20
280#endif
// Spill the whole 64-byte keystream block to the scratch area so it can be
// consumed one byte at a time.
281	stp	x5,x7,[sp,#0]
282	stp	x9,x11,[sp,#16]
283	stp	x13,x15,[sp,#32]
284	stp	x17,x20,[sp,#48]
285

286.Loop_tail:
287	ldrb	w10,[x1,x2]		// input byte
288	ldrb	w11,[x4,x2]		// keystream byte
289	add	x2,x2,#1
290	eor	w10,w10,w11
291	strb	w10,[x0,x2]		// x0 carries the -1 bias, so this is the matching output byte
292	cbnz	x2,.Loop_tail
293

// Overwrite the spilled keystream with zeros before the stack space is released.
294	stp	xzr,xzr,[sp,#0]
295	stp	xzr,xzr,[sp,#16]
296	stp	xzr,xzr,[sp,#32]
297	stp	xzr,xzr,[sp,#48]
298

299	ldp	x19,x20,[x29,#16]
300	add	sp,sp,#64
301	ldp	x21,x22,[x29,#32]
302	ldp	x23,x24,[x29,#48]
303	ldp	x25,x26,[x29,#64]
304	ldp	x27,x28,[x29,#80]
305	ldp	x29,x30,[sp],#96
306	ret
307.size	ChaCha20_ctr32,.-ChaCha20_ctr32
308
// ChaCha20_neon -- ChaCha20 keystream XOR producing four 64-byte blocks
// (256 bytes) per outer iteration: one block in general-purpose registers,
// interleaved instruction by instruction with three blocks in NEON registers.
//
// No .globl directive for this symbol is visible here; in this file it is
// entered by the "b.ne ChaCha20_neon" in ChaCha20_ctr32 with that routine's
// arguments still in place:
//   x0 = output, x1 = input, x2 = byte count (at least 192 on that path),
//   x3 = key pointer, x4 = counter-row pointer.
//
// Counts of 512 or more leave through .L512_or_more_neon, a label inside
// ChaCha20_512_neon, after the 96-byte frame has been set up here.
//
// Register allocation:
//   x22-x28,x30 / w5-w21   scalar block, same layout as in ChaCha20_ctr32
//   v24         sigma row          v25,v26   key rows
//   v27,v28,v29 counter rows of the three NEON blocks (scalar counter +1,+2,+3)
//   v31         counter step: {1,0,0,0} at first, {4,0,0,0} after the shl
//   v0-v3, v4-v7, v16-v19   working rows of NEON blocks A, B, C
//   v20-v23     temporaries for the rotates, later input staging
// Output order per iteration: scalar block, then A, B, C.
//
// Stack: identical to ChaCha20_ctr32 (96-byte frame plus 64 bytes of scratch),
// which is what allows the jump to .Less_than_64 in that routine.
309.type	ChaCha20_neon,%function
310.align	5
311ChaCha20_neon:
312	stp	x29,x30,[sp,#-96]!
313	add	x29,sp,#0
314

315	adr	x5,.Lsigma
316	stp	x19,x20,[sp,#16]
317	stp	x21,x22,[sp,#32]
318	stp	x23,x24,[sp,#48]
319	stp	x25,x26,[sp,#64]
320	stp	x27,x28,[sp,#80]
321	cmp	x2,#512
322	b.hs	.L512_or_more_neon	// 512 bytes or more: handled in ChaCha20_512_neon
323

324	sub	sp,sp,#64		// 64-byte scratch for a partial block's keystream
325

// Every input is loaded twice: packed into x-registers for the scalar block
// and as 4x32-bit rows for the NEON blocks.
326	ldp	x22,x23,[x5]		// load sigma
327	ld1	{v24.4s},[x5],#16	// post-increment leaves x5 at .Lone
328	ldp	x24,x25,[x3]		// load key
329	ldp	x26,x27,[x3,#16]
330	ld1	{v25.4s,v26.4s},[x3]
331	ldp	x28,x30,[x4]		// load counter
332	ld1	{v27.4s},[x4]
333	ld1	{v31.4s},[x5]		// v31 = {1,0,0,0}
334#ifdef	__ARMEB__
335	rev64	v24.4s,v24.4s
336	ror	x24,x24,#32
337	ror	x25,x25,#32
338	ror	x26,x26,#32
339	ror	x27,x27,#32
340	ror	x28,x28,#32
341	ror	x30,x30,#32
342#endif
// The scalar block keeps the caller's counter value; the NEON blocks take the
// next three values.
343	add	v27.4s,v27.4s,v31.4s		// += 1
344	add	v28.4s,v27.4s,v31.4s
345	add	v29.4s,v28.4s,v31.4s
346	shl	v31.4s,v31.4s,#2			// 1 -> 4
347

348.Loop_outer_neon:
// Initialise the scalar working words and the three NEON working states from
// the saved input rows.
349	mov	w5,w22			// unpack key block
350	lsr	x6,x22,#32
351	mov	v0.16b,v24.16b
352	mov	w7,w23
353	lsr	x8,x23,#32
354	mov	v4.16b,v24.16b
355	mov	w9,w24
356	lsr	x10,x24,#32
357	mov	v16.16b,v24.16b
358	mov	w11,w25
359	mov	v1.16b,v25.16b
360	lsr	x12,x25,#32
361	mov	v5.16b,v25.16b
362	mov	w13,w26
363	mov	v17.16b,v25.16b
364	lsr	x14,x26,#32
365	mov	v3.16b,v27.16b
366	mov	w15,w27
367	mov	v7.16b,v28.16b
368	lsr	x16,x27,#32
369	mov	v19.16b,v29.16b
370	mov	w17,w28
371	mov	v2.16b,v26.16b
372	lsr	x19,x28,#32
373	mov	v6.16b,v26.16b
374	mov	w20,w30
375	mov	v18.16b,v26.16b
376	lsr	x21,x30,#32
377

378	mov	x4,#10			// 10 iterations, each a column pass plus a diagonal pass
// Flags from this subs are consumed by "b.lo .Ltail_neon" and
// "b.hi .Loop_outer_neon"; nothing in between writes the flags.
379	subs	x2,x2,#256
380.Loop_neon:
381	sub	x4,x4,#1
// Column pass. Scalar side: same quarter-round sequence as in ChaCha20_ctr32.
// NEON side, per block: row0 += row1; row3 ^= row0; rotate row3; and so on,
// where rev32 on .8h lanes is a rotate by 16 and each ushr #(32-n) / sli #n
// pair is a left-rotate by n (12, 8, 7) built in v20-v22.
382	add	v0.4s,v0.4s,v1.4s
383	add	w5,w5,w9
384	add	v4.4s,v4.4s,v5.4s
385	add	w6,w6,w10
386	add	v16.4s,v16.4s,v17.4s
387	add	w7,w7,w11
388	eor	v3.16b,v3.16b,v0.16b
389	add	w8,w8,w12
390	eor	v7.16b,v7.16b,v4.16b
391	eor	w17,w17,w5
392	eor	v19.16b,v19.16b,v16.16b
393	eor	w19,w19,w6
394	rev32	v3.8h,v3.8h
395	eor	w20,w20,w7
396	rev32	v7.8h,v7.8h
397	eor	w21,w21,w8
398	rev32	v19.8h,v19.8h
399	ror	w17,w17,#16
400	add	v2.4s,v2.4s,v3.4s
401	ror	w19,w19,#16
402	add	v6.4s,v6.4s,v7.4s
403	ror	w20,w20,#16
404	add	v18.4s,v18.4s,v19.4s
405	ror	w21,w21,#16
406	eor	v20.16b,v1.16b,v2.16b
407	add	w13,w13,w17
408	eor	v21.16b,v5.16b,v6.16b
409	add	w14,w14,w19
410	eor	v22.16b,v17.16b,v18.16b
411	add	w15,w15,w20
412	ushr	v1.4s,v20.4s,#20
413	add	w16,w16,w21
414	ushr	v5.4s,v21.4s,#20
415	eor	w9,w9,w13
416	ushr	v17.4s,v22.4s,#20
417	eor	w10,w10,w14
418	sli	v1.4s,v20.4s,#12
419	eor	w11,w11,w15
420	sli	v5.4s,v21.4s,#12
421	eor	w12,w12,w16
422	sli	v17.4s,v22.4s,#12
423	ror	w9,w9,#20
424	add	v0.4s,v0.4s,v1.4s
425	ror	w10,w10,#20
426	add	v4.4s,v4.4s,v5.4s
427	ror	w11,w11,#20
428	add	v16.4s,v16.4s,v17.4s
429	ror	w12,w12,#20
430	eor	v20.16b,v3.16b,v0.16b
431	add	w5,w5,w9
432	eor	v21.16b,v7.16b,v4.16b
433	add	w6,w6,w10
434	eor	v22.16b,v19.16b,v16.16b
435	add	w7,w7,w11
436	ushr	v3.4s,v20.4s,#24
437	add	w8,w8,w12
438	ushr	v7.4s,v21.4s,#24
439	eor	w17,w17,w5
440	ushr	v19.4s,v22.4s,#24
441	eor	w19,w19,w6
442	sli	v3.4s,v20.4s,#8
443	eor	w20,w20,w7
444	sli	v7.4s,v21.4s,#8
445	eor	w21,w21,w8
446	sli	v19.4s,v22.4s,#8
447	ror	w17,w17,#24
448	add	v2.4s,v2.4s,v3.4s
449	ror	w19,w19,#24
450	add	v6.4s,v6.4s,v7.4s
451	ror	w20,w20,#24
452	add	v18.4s,v18.4s,v19.4s
453	ror	w21,w21,#24
454	eor	v20.16b,v1.16b,v2.16b
455	add	w13,w13,w17
456	eor	v21.16b,v5.16b,v6.16b
457	add	w14,w14,w19
458	eor	v22.16b,v17.16b,v18.16b
459	add	w15,w15,w20
460	ushr	v1.4s,v20.4s,#25
461	add	w16,w16,w21
462	ushr	v5.4s,v21.4s,#25
463	eor	w9,w9,w13
464	ushr	v17.4s,v22.4s,#25
465	eor	w10,w10,w14
466	sli	v1.4s,v20.4s,#7
467	eor	w11,w11,w15
468	sli	v5.4s,v21.4s,#7
469	eor	w12,w12,w16
470	sli	v17.4s,v22.4s,#7
471	ror	w9,w9,#25
// Rotate the NEON rows lane-wise (row2 by 8 bytes, row3 by 12, row1 by 4) so
// the next pass, written exactly like the column pass, works on diagonals.
472	ext	v2.16b,v2.16b,v2.16b,#8
473	ror	w10,w10,#25
474	ext	v6.16b,v6.16b,v6.16b,#8
475	ror	w11,w11,#25
476	ext	v18.16b,v18.16b,v18.16b,#8
477	ror	w12,w12,#25
478	ext	v3.16b,v3.16b,v3.16b,#12
479	ext	v7.16b,v7.16b,v7.16b,#12
480	ext	v19.16b,v19.16b,v19.16b,#12
481	ext	v1.16b,v1.16b,v1.16b,#4
482	ext	v5.16b,v5.16b,v5.16b,#4
483	ext	v17.16b,v17.16b,v17.16b,#4
// Diagonal pass. The scalar side picks its diagonals by register choice
// (w5,w10,w15,w21 and so on); the NEON side reuses the column code on the
// rotated rows.
484	add	v0.4s,v0.4s,v1.4s
485	add	w5,w5,w10
486	add	v4.4s,v4.4s,v5.4s
487	add	w6,w6,w11
488	add	v16.4s,v16.4s,v17.4s
489	add	w7,w7,w12
490	eor	v3.16b,v3.16b,v0.16b
491	add	w8,w8,w9
492	eor	v7.16b,v7.16b,v4.16b
493	eor	w21,w21,w5
494	eor	v19.16b,v19.16b,v16.16b
495	eor	w17,w17,w6
496	rev32	v3.8h,v3.8h
497	eor	w19,w19,w7
498	rev32	v7.8h,v7.8h
499	eor	w20,w20,w8
500	rev32	v19.8h,v19.8h
501	ror	w21,w21,#16
502	add	v2.4s,v2.4s,v3.4s
503	ror	w17,w17,#16
504	add	v6.4s,v6.4s,v7.4s
505	ror	w19,w19,#16
506	add	v18.4s,v18.4s,v19.4s
507	ror	w20,w20,#16
508	eor	v20.16b,v1.16b,v2.16b
509	add	w15,w15,w21
510	eor	v21.16b,v5.16b,v6.16b
511	add	w16,w16,w17
512	eor	v22.16b,v17.16b,v18.16b
513	add	w13,w13,w19
514	ushr	v1.4s,v20.4s,#20
515	add	w14,w14,w20
516	ushr	v5.4s,v21.4s,#20
517	eor	w10,w10,w15
518	ushr	v17.4s,v22.4s,#20
519	eor	w11,w11,w16
520	sli	v1.4s,v20.4s,#12
521	eor	w12,w12,w13
522	sli	v5.4s,v21.4s,#12
523	eor	w9,w9,w14
524	sli	v17.4s,v22.4s,#12
525	ror	w10,w10,#20
526	add	v0.4s,v0.4s,v1.4s
527	ror	w11,w11,#20
528	add	v4.4s,v4.4s,v5.4s
529	ror	w12,w12,#20
530	add	v16.4s,v16.4s,v17.4s
531	ror	w9,w9,#20
532	eor	v20.16b,v3.16b,v0.16b
533	add	w5,w5,w10
534	eor	v21.16b,v7.16b,v4.16b
535	add	w6,w6,w11
536	eor	v22.16b,v19.16b,v16.16b
537	add	w7,w7,w12
538	ushr	v3.4s,v20.4s,#24
539	add	w8,w8,w9
540	ushr	v7.4s,v21.4s,#24
541	eor	w21,w21,w5
542	ushr	v19.4s,v22.4s,#24
543	eor	w17,w17,w6
544	sli	v3.4s,v20.4s,#8
545	eor	w19,w19,w7
546	sli	v7.4s,v21.4s,#8
547	eor	w20,w20,w8
548	sli	v19.4s,v22.4s,#8
549	ror	w21,w21,#24
550	add	v2.4s,v2.4s,v3.4s
551	ror	w17,w17,#24
552	add	v6.4s,v6.4s,v7.4s
553	ror	w19,w19,#24
554	add	v18.4s,v18.4s,v19.4s
555	ror	w20,w20,#24
556	eor	v20.16b,v1.16b,v2.16b
557	add	w15,w15,w21
558	eor	v21.16b,v5.16b,v6.16b
559	add	w16,w16,w17
560	eor	v22.16b,v17.16b,v18.16b
561	add	w13,w13,w19
562	ushr	v1.4s,v20.4s,#25
563	add	w14,w14,w20
564	ushr	v5.4s,v21.4s,#25
565	eor	w10,w10,w15
566	ushr	v17.4s,v22.4s,#25
567	eor	w11,w11,w16
568	sli	v1.4s,v20.4s,#7
569	eor	w12,w12,w13
570	sli	v5.4s,v21.4s,#7
571	eor	w9,w9,w14
572	sli	v17.4s,v22.4s,#7
573	ror	w10,w10,#25
// Undo the lane rotation (row2 by 8 again, row3 by 4, row1 by 12) so the rows
// are back in column order for the next iteration and for the final add.
574	ext	v2.16b,v2.16b,v2.16b,#8
575	ror	w11,w11,#25
576	ext	v6.16b,v6.16b,v6.16b,#8
577	ror	w12,w12,#25
578	ext	v18.16b,v18.16b,v18.16b,#8
579	ror	w9,w9,#25
580	ext	v3.16b,v3.16b,v3.16b,#4
581	ext	v7.16b,v7.16b,v7.16b,#4
582	ext	v19.16b,v19.16b,v19.16b,#4
583	ext	v1.16b,v1.16b,v1.16b,#12
584	ext	v5.16b,v5.16b,v5.16b,#12
585	ext	v17.16b,v17.16b,v17.16b,#12
586	cbnz	x4,.Loop_neon
587

// Add the input state back in: packed x-registers for the scalar block, the
// saved rows v24-v26 plus each block's own counter row for the NEON blocks.
588	add	w5,w5,w22		// accumulate key block
589	add	v0.4s,v0.4s,v24.4s
590	add	x6,x6,x22,lsr#32
591	add	v4.4s,v4.4s,v24.4s
592	add	w7,w7,w23
593	add	v16.4s,v16.4s,v24.4s
594	add	x8,x8,x23,lsr#32
595	add	v2.4s,v2.4s,v26.4s
596	add	w9,w9,w24
597	add	v6.4s,v6.4s,v26.4s
598	add	x10,x10,x24,lsr#32
599	add	v18.4s,v18.4s,v26.4s
600	add	w11,w11,w25
601	add	v3.4s,v3.4s,v27.4s
602	add	x12,x12,x25,lsr#32
603	add	w13,w13,w26
604	add	v7.4s,v7.4s,v28.4s
605	add	x14,x14,x26,lsr#32
606	add	w15,w15,w27
607	add	v19.4s,v19.4s,v29.4s
608	add	x16,x16,x27,lsr#32
609	add	w17,w17,w28
610	add	v1.4s,v1.4s,v25.4s
611	add	x19,x19,x28,lsr#32
612	add	w20,w20,w30
613	add	v5.4s,v5.4s,v25.4s
614	add	x21,x21,x30,lsr#32
615	add	v17.4s,v17.4s,v25.4s
616

617	b.lo	.Ltail_neon		// flags from subs: fewer than 256 bytes were left
618

// Full 256 bytes. Scalar block first: pack, load 64 input bytes, XOR, store.
619	add	x5,x5,x6,lsl#32	// pack
620	add	x7,x7,x8,lsl#32
621	ldp	x6,x8,[x1,#0]		// load input
622	add	x9,x9,x10,lsl#32
623	add	x11,x11,x12,lsl#32
624	ldp	x10,x12,[x1,#16]
625	add	x13,x13,x14,lsl#32
626	add	x15,x15,x16,lsl#32
627	ldp	x14,x16,[x1,#32]
628	add	x17,x17,x19,lsl#32
629	add	x20,x20,x21,lsl#32
630	ldp	x19,x21,[x1,#48]
631	add	x1,x1,#64
632#ifdef	__ARMEB__
633	rev	x5,x5
634	rev	x7,x7
635	rev	x9,x9
636	rev	x11,x11
637	rev	x13,x13
638	rev	x15,x15
639	rev	x17,x17
640	rev	x20,x20
641#endif
642	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64	// input for block A
643	eor	x5,x5,x6
644	eor	x7,x7,x8
645	eor	x9,x9,x10
646	eor	x11,x11,x12
647	eor	x13,x13,x14
648	eor	v0.16b,v0.16b,v20.16b
649	eor	x15,x15,x16
650	eor	v1.16b,v1.16b,v21.16b
651	eor	x17,x17,x19
652	eor	v2.16b,v2.16b,v22.16b
653	eor	x20,x20,x21
654	eor	v3.16b,v3.16b,v23.16b
655	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64	// input for block B
656

657	stp	x5,x7,[x0,#0]		// store output
658	add	x28,x28,#4			// increment counter
659	stp	x9,x11,[x0,#16]
660	add	v27.4s,v27.4s,v31.4s		// += 4
661	stp	x13,x15,[x0,#32]
662	add	v28.4s,v28.4s,v31.4s
663	stp	x17,x20,[x0,#48]
664	add	v29.4s,v29.4s,v31.4s
665	add	x0,x0,#64
666

// Block A is stored first; its registers v0-v3 are then reused to stage the
// input for block C.
667	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
668	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
669

670	eor	v4.16b,v4.16b,v20.16b
671	eor	v5.16b,v5.16b,v21.16b
672	eor	v6.16b,v6.16b,v22.16b
673	eor	v7.16b,v7.16b,v23.16b
674	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
675

676	eor	v16.16b,v16.16b,v0.16b
677	eor	v17.16b,v17.16b,v1.16b
678	eor	v18.16b,v18.16b,v2.16b
679	eor	v19.16b,v19.16b,v3.16b
680	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
681

682	b.hi	.Loop_outer_neon	// flags from subs: more input remains
683

// Exact multiple of 256 bytes: the scratch area was never written, so restore
// and return directly.
684	ldp	x19,x20,[x29,#16]
685	add	sp,sp,#64
686	ldp	x21,x22,[x29,#32]
687	ldp	x23,x24,[x29,#48]
688	ldp	x25,x26,[x29,#64]
689	ldp	x27,x28,[x29,#80]
690	ldp	x29,x30,[sp],#96
691	ret
692

// Final 1..255 bytes. The four keystream blocks are already computed; use them
// in output order (scalar, A, B, C), 64 bytes at a time, and XOR the last
// partial block byte-wise.
693.Ltail_neon:
694	add	x2,x2,#256		// undo the subs: x2 = bytes left
695	cmp	x2,#64
696	b.lo	.Less_than_64		// only part of the scalar block is needed: finish in ChaCha20_ctr32
697

698	add	x5,x5,x6,lsl#32	// pack
699	add	x7,x7,x8,lsl#32
700	ldp	x6,x8,[x1,#0]		// load input
701	add	x9,x9,x10,lsl#32
702	add	x11,x11,x12,lsl#32
703	ldp	x10,x12,[x1,#16]
704	add	x13,x13,x14,lsl#32
705	add	x15,x15,x16,lsl#32
706	ldp	x14,x16,[x1,#32]
707	add	x17,x17,x19,lsl#32
708	add	x20,x20,x21,lsl#32
709	ldp	x19,x21,[x1,#48]
710	add	x1,x1,#64
711#ifdef	__ARMEB__
712	rev	x5,x5
713	rev	x7,x7
714	rev	x9,x9
715	rev	x11,x11
716	rev	x13,x13
717	rev	x15,x15
718	rev	x17,x17
719	rev	x20,x20
720#endif
721	eor	x5,x5,x6
722	eor	x7,x7,x8
723	eor	x9,x9,x10
724	eor	x11,x11,x12
725	eor	x13,x13,x14
726	eor	x15,x15,x16
727	eor	x17,x17,x19
728	eor	x20,x20,x21
729

730	stp	x5,x7,[x0,#0]		// store output
731	add	x28,x28,#4			// increment counter
732	stp	x9,x11,[x0,#16]
733	stp	x13,x15,[x0,#32]
734	stp	x17,x20,[x0,#48]
735	add	x0,x0,#64
// Each b.eq below still sees the flags of the preceding "cmp x2,#64"; the
// instructions in between do not write the flags.
736	b.eq	.Ldone_neon		// exactly 64 bytes were left
737	sub	x2,x2,#64
738	cmp	x2,#64
739	b.lo	.Less_than_128
740

741	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
742	eor	v0.16b,v0.16b,v20.16b
743	eor	v1.16b,v1.16b,v21.16b
744	eor	v2.16b,v2.16b,v22.16b
745	eor	v3.16b,v3.16b,v23.16b
746	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
747	b.eq	.Ldone_neon		// exactly 128 bytes were left
748	sub	x2,x2,#64
749	cmp	x2,#64
750	b.lo	.Less_than_192
751

752	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
753	eor	v4.16b,v4.16b,v20.16b
754	eor	v5.16b,v5.16b,v21.16b
755	eor	v6.16b,v6.16b,v22.16b
756	eor	v7.16b,v7.16b,v23.16b
757	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
758	b.eq	.Ldone_neon		// exactly 192 bytes were left
759	sub	x2,x2,#64
760

// Spill the keystream block that covers the partial tail to the scratch area:
// block C here, block A or B on the two shorter paths below.
761	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
762	b	.Last_neon
763

764.Less_than_128:
765	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
766	b	.Last_neon
767.Less_than_192:
768	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
769	b	.Last_neon
770

771.align	4
// Here x2 = bytes in the partial block (1..63), x0/x1 point at its start and
// [sp] holds its keystream. Same negative-index byte loop as .Loop_tail in
// ChaCha20_ctr32: x0 is biased by -1 because the store follows the increment.
772.Last_neon:
773	sub	x0,x0,#1
774	add	x1,x1,x2
775	add	x0,x0,x2
776	add	x4,sp,x2
777	neg	x2,x2
778

779.Loop_tail_neon:
780	ldrb	w10,[x1,x2]		// input byte
781	ldrb	w11,[x4,x2]		// keystream byte
782	add	x2,x2,#1
783	eor	w10,w10,w11
784	strb	w10,[x0,x2]
785	cbnz	x2,.Loop_tail_neon
786

// Overwrite the spilled keystream with zeros before the stack space is released.
787	stp	xzr,xzr,[sp,#0]
788	stp	xzr,xzr,[sp,#16]
789	stp	xzr,xzr,[sp,#32]
790	stp	xzr,xzr,[sp,#48]
791

// Also reached directly by the b.eq exits above, where nothing was spilled.
792.Ldone_neon:
793	ldp	x19,x20,[x29,#16]
794	add	sp,sp,#64
795	ldp	x21,x22,[x29,#32]
796	ldp	x23,x24,[x29,#48]
797	ldp	x25,x26,[x29,#64]
798	ldp	x27,x28,[x29,#80]
799	ldp	x29,x30,[sp],#96
800	ret
801.size	ChaCha20_neon,.-ChaCha20_neon
802.type	ChaCha20_512_neon,%function
803.align	5
804ChaCha20_512_neon:
805	stp	x29,x30,[sp,#-96]!
806	add	x29,sp,#0
807
808	adr	x5,.Lsigma
809	stp	x19,x20,[sp,#16]
810	stp	x21,x22,[sp,#32]
811	stp	x23,x24,[sp,#48]
812	stp	x25,x26,[sp,#64]
813	stp	x27,x28,[sp,#80]
814
815.L512_or_more_neon:
816	sub	sp,sp,#128+64
817
818	ldp	x22,x23,[x5]		// load sigma
819	ld1	{v24.4s},[x5],#16
820	ldp	x24,x25,[x3]		// load key
821	ldp	x26,x27,[x3,#16]
822	ld1	{v25.4s,v26.4s},[x3]
823	ldp	x28,x30,[x4]		// load counter
824	ld1	{v27.4s},[x4]
825	ld1	{v31.4s},[x5]
826#ifdef	__ARMEB__
827	rev64	v24.4s,v24.4s
828	ror	x24,x24,#32
829	ror	x25,x25,#32
830	ror	x26,x26,#32
831	ror	x27,x27,#32
832	ror	x28,x28,#32
833	ror	x30,x30,#32
834#endif
835	add	v27.4s,v27.4s,v31.4s		// += 1
836	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
837	add	v27.4s,v27.4s,v31.4s		// not typo
838	str	q26,[sp,#32]
839	add	v28.4s,v27.4s,v31.4s
840	add	v29.4s,v28.4s,v31.4s
841	add	v30.4s,v29.4s,v31.4s
842	shl	v31.4s,v31.4s,#2			// 1 -> 4
843
844	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
845	stp	d10,d11,[sp,#128+16]
846	stp	d12,d13,[sp,#128+32]
847	stp	d14,d15,[sp,#128+48]
848
849	sub	x2,x2,#512			// not typo
850
851.Loop_outer_512_neon:
852	mov	v0.16b,v24.16b
853	mov	v4.16b,v24.16b
854	mov	v8.16b,v24.16b
855	mov	v12.16b,v24.16b
856	mov	v16.16b,v24.16b
857	mov	v20.16b,v24.16b
858	mov	v1.16b,v25.16b
859	mov	w5,w22			// unpack key block
860	mov	v5.16b,v25.16b
861	lsr	x6,x22,#32
862	mov	v9.16b,v25.16b
863	mov	w7,w23
864	mov	v13.16b,v25.16b
865	lsr	x8,x23,#32
866	mov	v17.16b,v25.16b
867	mov	w9,w24
868	mov	v21.16b,v25.16b
869	lsr	x10,x24,#32
870	mov	v3.16b,v27.16b
871	mov	w11,w25
872	mov	v7.16b,v28.16b
873	lsr	x12,x25,#32
874	mov	v11.16b,v29.16b
875	mov	w13,w26
876	mov	v15.16b,v30.16b
877	lsr	x14,x26,#32
878	mov	v2.16b,v26.16b
879	mov	w15,w27
880	mov	v6.16b,v26.16b
881	lsr	x16,x27,#32
882	add	v19.4s,v3.4s,v31.4s			// +4
883	mov	w17,w28
884	add	v23.4s,v7.4s,v31.4s			// +4
885	lsr	x19,x28,#32
886	mov	v10.16b,v26.16b
887	mov	w20,w30
888	mov	v14.16b,v26.16b
889	lsr	x21,x30,#32
890	mov	v18.16b,v26.16b
891	stp	q27,q28,[sp,#48]		// off-load key block, variable part
892	mov	v22.16b,v26.16b
893	str	q29,[sp,#80]
894
895	mov	x4,#5
896	subs	x2,x2,#512
897.Loop_upper_neon:
898	sub	x4,x4,#1
899	add	v0.4s,v0.4s,v1.4s
900	add	w5,w5,w9
901	add	v4.4s,v4.4s,v5.4s
902	add	w6,w6,w10
903	add	v8.4s,v8.4s,v9.4s
904	add	w7,w7,w11
905	add	v12.4s,v12.4s,v13.4s
906	add	w8,w8,w12
907	add	v16.4s,v16.4s,v17.4s
908	eor	w17,w17,w5
909	add	v20.4s,v20.4s,v21.4s
910	eor	w19,w19,w6
911	eor	v3.16b,v3.16b,v0.16b
912	eor	w20,w20,w7
913	eor	v7.16b,v7.16b,v4.16b
914	eor	w21,w21,w8
915	eor	v11.16b,v11.16b,v8.16b
916	ror	w17,w17,#16
917	eor	v15.16b,v15.16b,v12.16b
918	ror	w19,w19,#16
919	eor	v19.16b,v19.16b,v16.16b
920	ror	w20,w20,#16
921	eor	v23.16b,v23.16b,v20.16b
922	ror	w21,w21,#16
923	rev32	v3.8h,v3.8h
924	add	w13,w13,w17
925	rev32	v7.8h,v7.8h
926	add	w14,w14,w19
927	rev32	v11.8h,v11.8h
928	add	w15,w15,w20
929	rev32	v15.8h,v15.8h
930	add	w16,w16,w21
931	rev32	v19.8h,v19.8h
932	eor	w9,w9,w13
933	rev32	v23.8h,v23.8h
934	eor	w10,w10,w14
935	add	v2.4s,v2.4s,v3.4s
936	eor	w11,w11,w15
937	add	v6.4s,v6.4s,v7.4s
938	eor	w12,w12,w16
939	add	v10.4s,v10.4s,v11.4s
940	ror	w9,w9,#20
941	add	v14.4s,v14.4s,v15.4s
942	ror	w10,w10,#20
943	add	v18.4s,v18.4s,v19.4s
944	ror	w11,w11,#20
945	add	v22.4s,v22.4s,v23.4s
946	ror	w12,w12,#20
947	eor	v24.16b,v1.16b,v2.16b
948	add	w5,w5,w9
949	eor	v25.16b,v5.16b,v6.16b
950	add	w6,w6,w10
951	eor	v26.16b,v9.16b,v10.16b
952	add	w7,w7,w11
953	eor	v27.16b,v13.16b,v14.16b
954	add	w8,w8,w12
955	eor	v28.16b,v17.16b,v18.16b
956	eor	w17,w17,w5
957	eor	v29.16b,v21.16b,v22.16b
958	eor	w19,w19,w6
959	ushr	v1.4s,v24.4s,#20
960	eor	w20,w20,w7
961	ushr	v5.4s,v25.4s,#20
962	eor	w21,w21,w8
963	ushr	v9.4s,v26.4s,#20
964	ror	w17,w17,#24
965	ushr	v13.4s,v27.4s,#20
966	ror	w19,w19,#24
967	ushr	v17.4s,v28.4s,#20
968	ror	w20,w20,#24
969	ushr	v21.4s,v29.4s,#20
970	ror	w21,w21,#24
971	sli	v1.4s,v24.4s,#12
972	add	w13,w13,w17
973	sli	v5.4s,v25.4s,#12
974	add	w14,w14,w19
975	sli	v9.4s,v26.4s,#12
976	add	w15,w15,w20
977	sli	v13.4s,v27.4s,#12
978	add	w16,w16,w21
979	sli	v17.4s,v28.4s,#12
980	eor	w9,w9,w13
981	sli	v21.4s,v29.4s,#12
982	eor	w10,w10,w14
983	add	v0.4s,v0.4s,v1.4s
984	eor	w11,w11,w15
985	add	v4.4s,v4.4s,v5.4s
986	eor	w12,w12,w16
987	add	v8.4s,v8.4s,v9.4s
988	ror	w9,w9,#25
989	add	v12.4s,v12.4s,v13.4s
990	ror	w10,w10,#25
991	add	v16.4s,v16.4s,v17.4s
992	ror	w11,w11,#25
993	add	v20.4s,v20.4s,v21.4s
994	ror	w12,w12,#25
995	eor	v24.16b,v3.16b,v0.16b
996	add	w5,w5,w10
997	eor	v25.16b,v7.16b,v4.16b
998	add	w6,w6,w11
999	eor	v26.16b,v11.16b,v8.16b
1000	add	w7,w7,w12
1001	eor	v27.16b,v15.16b,v12.16b
1002	add	w8,w8,w9
1003	eor	v28.16b,v19.16b,v16.16b
1004	eor	w21,w21,w5
1005	eor	v29.16b,v23.16b,v20.16b
1006	eor	w17,w17,w6
1007	ushr	v3.4s,v24.4s,#24
1008	eor	w19,w19,w7
1009	ushr	v7.4s,v25.4s,#24
1010	eor	w20,w20,w8
1011	ushr	v11.4s,v26.4s,#24
1012	ror	w21,w21,#16
1013	ushr	v15.4s,v27.4s,#24
1014	ror	w17,w17,#16
1015	ushr	v19.4s,v28.4s,#24
1016	ror	w19,w19,#16
1017	ushr	v23.4s,v29.4s,#24
1018	ror	w20,w20,#16
1019	sli	v3.4s,v24.4s,#8
1020	add	w15,w15,w21
1021	sli	v7.4s,v25.4s,#8
1022	add	w16,w16,w17
1023	sli	v11.4s,v26.4s,#8
1024	add	w13,w13,w19
1025	sli	v15.4s,v27.4s,#8
1026	add	w14,w14,w20
1027	sli	v19.4s,v28.4s,#8
1028	eor	w10,w10,w15
1029	sli	v23.4s,v29.4s,#8
1030	eor	w11,w11,w16
1031	add	v2.4s,v2.4s,v3.4s
1032	eor	w12,w12,w13
1033	add	v6.4s,v6.4s,v7.4s
1034	eor	w9,w9,w14
1035	add	v10.4s,v10.4s,v11.4s
1036	ror	w10,w10,#20
1037	add	v14.4s,v14.4s,v15.4s
1038	ror	w11,w11,#20
1039	add	v18.4s,v18.4s,v19.4s
1040	ror	w12,w12,#20
1041	add	v22.4s,v22.4s,v23.4s
1042	ror	w9,w9,#20
1043	eor	v24.16b,v1.16b,v2.16b
1044	add	w5,w5,w10
1045	eor	v25.16b,v5.16b,v6.16b
1046	add	w6,w6,w11
1047	eor	v26.16b,v9.16b,v10.16b
1048	add	w7,w7,w12
1049	eor	v27.16b,v13.16b,v14.16b
1050	add	w8,w8,w9
1051	eor	v28.16b,v17.16b,v18.16b
1052	eor	w21,w21,w5
1053	eor	v29.16b,v21.16b,v22.16b
1054	eor	w17,w17,w6
1055	ushr	v1.4s,v24.4s,#25
1056	eor	w19,w19,w7
1057	ushr	v5.4s,v25.4s,#25
1058	eor	w20,w20,w8
1059	ushr	v9.4s,v26.4s,#25
1060	ror	w21,w21,#24
1061	ushr	v13.4s,v27.4s,#25
1062	ror	w17,w17,#24
1063	ushr	v17.4s,v28.4s,#25
1064	ror	w19,w19,#24
1065	ushr	v21.4s,v29.4s,#25
1066	ror	w20,w20,#24
1067	sli	v1.4s,v24.4s,#7
1068	add	w15,w15,w21
1069	sli	v5.4s,v25.4s,#7
1070	add	w16,w16,w17
1071	sli	v9.4s,v26.4s,#7
1072	add	w13,w13,w19
1073	sli	v13.4s,v27.4s,#7
1074	add	w14,w14,w20
1075	sli	v17.4s,v28.4s,#7
1076	eor	w10,w10,w15
1077	sli	v21.4s,v29.4s,#7
1078	eor	w11,w11,w16
1079	ext	v2.16b,v2.16b,v2.16b,#8
1080	eor	w12,w12,w13
1081	ext	v6.16b,v6.16b,v6.16b,#8
1082	eor	w9,w9,w14
1083	ext	v10.16b,v10.16b,v10.16b,#8
1084	ror	w10,w10,#25
1085	ext	v14.16b,v14.16b,v14.16b,#8
1086	ror	w11,w11,#25
1087	ext	v18.16b,v18.16b,v18.16b,#8
1088	ror	w12,w12,#25
1089	ext	v22.16b,v22.16b,v22.16b,#8
1090	ror	w9,w9,#25
1091	ext	v3.16b,v3.16b,v3.16b,#12
1092	ext	v7.16b,v7.16b,v7.16b,#12
1093	ext	v11.16b,v11.16b,v11.16b,#12
1094	ext	v15.16b,v15.16b,v15.16b,#12
1095	ext	v19.16b,v19.16b,v19.16b,#12
1096	ext	v23.16b,v23.16b,v23.16b,#12
1097	ext	v1.16b,v1.16b,v1.16b,#4
1098	ext	v5.16b,v5.16b,v5.16b,#4
1099	ext	v9.16b,v9.16b,v9.16b,#4
1100	ext	v13.16b,v13.16b,v13.16b,#4
1101	ext	v17.16b,v17.16b,v17.16b,#4
1102	ext	v21.16b,v21.16b,v21.16b,#4
1103	add	v0.4s,v0.4s,v1.4s
1104	add	w5,w5,w9
1105	add	v4.4s,v4.4s,v5.4s
1106	add	w6,w6,w10
1107	add	v8.4s,v8.4s,v9.4s
1108	add	w7,w7,w11
1109	add	v12.4s,v12.4s,v13.4s
1110	add	w8,w8,w12
1111	add	v16.4s,v16.4s,v17.4s
1112	eor	w17,w17,w5
1113	add	v20.4s,v20.4s,v21.4s
1114	eor	w19,w19,w6
1115	eor	v3.16b,v3.16b,v0.16b
1116	eor	w20,w20,w7
1117	eor	v7.16b,v7.16b,v4.16b
1118	eor	w21,w21,w8
1119	eor	v11.16b,v11.16b,v8.16b
1120	ror	w17,w17,#16
1121	eor	v15.16b,v15.16b,v12.16b
1122	ror	w19,w19,#16
1123	eor	v19.16b,v19.16b,v16.16b
1124	ror	w20,w20,#16
1125	eor	v23.16b,v23.16b,v20.16b
1126	ror	w21,w21,#16
1127	rev32	v3.8h,v3.8h
1128	add	w13,w13,w17
1129	rev32	v7.8h,v7.8h
1130	add	w14,w14,w19
1131	rev32	v11.8h,v11.8h
1132	add	w15,w15,w20
1133	rev32	v15.8h,v15.8h
1134	add	w16,w16,w21
1135	rev32	v19.8h,v19.8h
1136	eor	w9,w9,w13
1137	rev32	v23.8h,v23.8h
1138	eor	w10,w10,w14
1139	add	v2.4s,v2.4s,v3.4s
1140	eor	w11,w11,w15
1141	add	v6.4s,v6.4s,v7.4s
1142	eor	w12,w12,w16
1143	add	v10.4s,v10.4s,v11.4s
1144	ror	w9,w9,#20
1145	add	v14.4s,v14.4s,v15.4s
1146	ror	w10,w10,#20
1147	add	v18.4s,v18.4s,v19.4s
1148	ror	w11,w11,#20
1149	add	v22.4s,v22.4s,v23.4s
1150	ror	w12,w12,#20
1151	eor	v24.16b,v1.16b,v2.16b
1152	add	w5,w5,w9
1153	eor	v25.16b,v5.16b,v6.16b
1154	add	w6,w6,w10
1155	eor	v26.16b,v9.16b,v10.16b
1156	add	w7,w7,w11
1157	eor	v27.16b,v13.16b,v14.16b
1158	add	w8,w8,w12
1159	eor	v28.16b,v17.16b,v18.16b
1160	eor	w17,w17,w5
1161	eor	v29.16b,v21.16b,v22.16b
1162	eor	w19,w19,w6
1163	ushr	v1.4s,v24.4s,#20
1164	eor	w20,w20,w7
1165	ushr	v5.4s,v25.4s,#20
1166	eor	w21,w21,w8
1167	ushr	v9.4s,v26.4s,#20
1168	ror	w17,w17,#24
1169	ushr	v13.4s,v27.4s,#20
1170	ror	w19,w19,#24
1171	ushr	v17.4s,v28.4s,#20
1172	ror	w20,w20,#24
1173	ushr	v21.4s,v29.4s,#20
1174	ror	w21,w21,#24
1175	sli	v1.4s,v24.4s,#12
1176	add	w13,w13,w17
1177	sli	v5.4s,v25.4s,#12
1178	add	w14,w14,w19
1179	sli	v9.4s,v26.4s,#12
1180	add	w15,w15,w20
1181	sli	v13.4s,v27.4s,#12
1182	add	w16,w16,w21
1183	sli	v17.4s,v28.4s,#12
1184	eor	w9,w9,w13
1185	sli	v21.4s,v29.4s,#12
1186	eor	w10,w10,w14
1187	add	v0.4s,v0.4s,v1.4s
1188	eor	w11,w11,w15
1189	add	v4.4s,v4.4s,v5.4s
1190	eor	w12,w12,w16
1191	add	v8.4s,v8.4s,v9.4s
1192	ror	w9,w9,#25
1193	add	v12.4s,v12.4s,v13.4s
1194	ror	w10,w10,#25
1195	add	v16.4s,v16.4s,v17.4s
1196	ror	w11,w11,#25
1197	add	v20.4s,v20.4s,v21.4s
1198	ror	w12,w12,#25
1199	eor	v24.16b,v3.16b,v0.16b
1200	add	w5,w5,w10
1201	eor	v25.16b,v7.16b,v4.16b
1202	add	w6,w6,w11
1203	eor	v26.16b,v11.16b,v8.16b
1204	add	w7,w7,w12
1205	eor	v27.16b,v15.16b,v12.16b
1206	add	w8,w8,w9
1207	eor	v28.16b,v19.16b,v16.16b
1208	eor	w21,w21,w5
1209	eor	v29.16b,v23.16b,v20.16b
1210	eor	w17,w17,w6
1211	ushr	v3.4s,v24.4s,#24
1212	eor	w19,w19,w7
1213	ushr	v7.4s,v25.4s,#24
1214	eor	w20,w20,w8
1215	ushr	v11.4s,v26.4s,#24
1216	ror	w21,w21,#16
1217	ushr	v15.4s,v27.4s,#24
1218	ror	w17,w17,#16
1219	ushr	v19.4s,v28.4s,#24
1220	ror	w19,w19,#16
1221	ushr	v23.4s,v29.4s,#24
1222	ror	w20,w20,#16
1223	sli	v3.4s,v24.4s,#8
1224	add	w15,w15,w21
1225	sli	v7.4s,v25.4s,#8
1226	add	w16,w16,w17
1227	sli	v11.4s,v26.4s,#8
1228	add	w13,w13,w19
1229	sli	v15.4s,v27.4s,#8
1230	add	w14,w14,w20
1231	sli	v19.4s,v28.4s,#8
1232	eor	w10,w10,w15
1233	sli	v23.4s,v29.4s,#8
1234	eor	w11,w11,w16
1235	add	v2.4s,v2.4s,v3.4s
1236	eor	w12,w12,w13
1237	add	v6.4s,v6.4s,v7.4s
1238	eor	w9,w9,w14
1239	add	v10.4s,v10.4s,v11.4s
1240	ror	w10,w10,#20
1241	add	v14.4s,v14.4s,v15.4s
1242	ror	w11,w11,#20
1243	add	v18.4s,v18.4s,v19.4s
1244	ror	w12,w12,#20
1245	add	v22.4s,v22.4s,v23.4s
1246	ror	w9,w9,#20
1247	eor	v24.16b,v1.16b,v2.16b
1248	add	w5,w5,w10
1249	eor	v25.16b,v5.16b,v6.16b
1250	add	w6,w6,w11
1251	eor	v26.16b,v9.16b,v10.16b
1252	add	w7,w7,w12
1253	eor	v27.16b,v13.16b,v14.16b
1254	add	w8,w8,w9
1255	eor	v28.16b,v17.16b,v18.16b
1256	eor	w21,w21,w5
1257	eor	v29.16b,v21.16b,v22.16b
1258	eor	w17,w17,w6
1259	ushr	v1.4s,v24.4s,#25
1260	eor	w19,w19,w7
1261	ushr	v5.4s,v25.4s,#25
1262	eor	w20,w20,w8
1263	ushr	v9.4s,v26.4s,#25
1264	ror	w21,w21,#24
1265	ushr	v13.4s,v27.4s,#25
1266	ror	w17,w17,#24
1267	ushr	v17.4s,v28.4s,#25
1268	ror	w19,w19,#24
1269	ushr	v21.4s,v29.4s,#25
1270	ror	w20,w20,#24
1271	sli	v1.4s,v24.4s,#7
1272	add	w15,w15,w21
1273	sli	v5.4s,v25.4s,#7
1274	add	w16,w16,w17
1275	sli	v9.4s,v26.4s,#7
1276	add	w13,w13,w19
1277	sli	v13.4s,v27.4s,#7
1278	add	w14,w14,w20
1279	sli	v17.4s,v28.4s,#7
1280	eor	w10,w10,w15
1281	sli	v21.4s,v29.4s,#7
1282	eor	w11,w11,w16
1283	ext	v2.16b,v2.16b,v2.16b,#8
1284	eor	w12,w12,w13
1285	ext	v6.16b,v6.16b,v6.16b,#8
1286	eor	w9,w9,w14
1287	ext	v10.16b,v10.16b,v10.16b,#8
1288	ror	w10,w10,#25
1289	ext	v14.16b,v14.16b,v14.16b,#8
1290	ror	w11,w11,#25
1291	ext	v18.16b,v18.16b,v18.16b,#8
1292	ror	w12,w12,#25
1293	ext	v22.16b,v22.16b,v22.16b,#8
1294	ror	w9,w9,#25
1295	ext	v3.16b,v3.16b,v3.16b,#4
1296	ext	v7.16b,v7.16b,v7.16b,#4
1297	ext	v11.16b,v11.16b,v11.16b,#4
1298	ext	v15.16b,v15.16b,v15.16b,#4
1299	ext	v19.16b,v19.16b,v19.16b,#4
1300	ext	v23.16b,v23.16b,v23.16b,#4
1301	ext	v1.16b,v1.16b,v1.16b,#12
1302	ext	v5.16b,v5.16b,v5.16b,#12
1303	ext	v9.16b,v9.16b,v9.16b,#12
1304	ext	v13.16b,v13.16b,v13.16b,#12
1305	ext	v17.16b,v17.16b,v17.16b,#12
1306	ext	v21.16b,v21.16b,v21.16b,#12
1307	cbnz	x4,.Loop_upper_neon
1308
1309	add	w5,w5,w22		// accumulate key block
1310	add	x6,x6,x22,lsr#32
1311	add	w7,w7,w23
1312	add	x8,x8,x23,lsr#32
1313	add	w9,w9,w24
1314	add	x10,x10,x24,lsr#32
1315	add	w11,w11,w25
1316	add	x12,x12,x25,lsr#32
1317	add	w13,w13,w26
1318	add	x14,x14,x26,lsr#32
1319	add	w15,w15,w27
1320	add	x16,x16,x27,lsr#32
1321	add	w17,w17,w28
1322	add	x19,x19,x28,lsr#32
1323	add	w20,w20,w30
1324	add	x21,x21,x30,lsr#32
1325
1326	add	x5,x5,x6,lsl#32	// pack
1327	add	x7,x7,x8,lsl#32
1328	ldp	x6,x8,[x1,#0]		// load input
1329	add	x9,x9,x10,lsl#32
1330	add	x11,x11,x12,lsl#32
1331	ldp	x10,x12,[x1,#16]
1332	add	x13,x13,x14,lsl#32
1333	add	x15,x15,x16,lsl#32
1334	ldp	x14,x16,[x1,#32]
1335	add	x17,x17,x19,lsl#32
1336	add	x20,x20,x21,lsl#32
1337	ldp	x19,x21,[x1,#48]
1338	add	x1,x1,#64
1339#ifdef	__ARMEB__
1340	rev	x5,x5
1341	rev	x7,x7
1342	rev	x9,x9
1343	rev	x11,x11
1344	rev	x13,x13
1345	rev	x15,x15
1346	rev	x17,x17
1347	rev	x20,x20
1348#endif
1349	eor	x5,x5,x6
1350	eor	x7,x7,x8
1351	eor	x9,x9,x10
1352	eor	x11,x11,x12
1353	eor	x13,x13,x14
1354	eor	x15,x15,x16
1355	eor	x17,x17,x19
1356	eor	x20,x20,x21
1357
1358	stp	x5,x7,[x0,#0]		// store output
1359	add	x28,x28,#1			// increment counter
1360	mov	w5,w22			// unpack key block
1361	lsr	x6,x22,#32
1362	stp	x9,x11,[x0,#16]
1363	mov	w7,w23
1364	lsr	x8,x23,#32
1365	stp	x13,x15,[x0,#32]
1366	mov	w9,w24
1367	lsr	x10,x24,#32
1368	stp	x17,x20,[x0,#48]
1369	add	x0,x0,#64
1370	mov	w11,w25
1371	lsr	x12,x25,#32
1372	mov	w13,w26
1373	lsr	x14,x26,#32
1374	mov	w15,w27
1375	lsr	x16,x27,#32
1376	mov	w17,w28
1377	lsr	x19,x28,#32
1378	mov	w20,w30
1379	lsr	x21,x30,#32
1380
1381	mov	x4,#5
// .Loop_lower_neon runs until x4 reaches zero (x4 is set to 5 right before
// entry).  Every pass interleaves two independent instruction streams:
//   scalar: w5-w21, add/eor/ror with rotate amounts 16,20,24,25, executed
//           four times per pass (alternating the register pairing);
//   NEON:   six register groups v0-v3, v4-v7, ... v20-v23 (v24-v29 are
//           temporaries), the same add/eor/rotate sequence executed twice
//           per pass, each followed by ext lane rotations.
1382.Loop_lower_neon:
1383	sub	x4,x4,#1	// loop counter; tested by cbnz at the bottom
1384	add	v0.4s,v0.4s,v1.4s
1385	add	w5,w5,w9
1386	add	v4.4s,v4.4s,v5.4s
1387	add	w6,w6,w10
1388	add	v8.4s,v8.4s,v9.4s
1389	add	w7,w7,w11
1390	add	v12.4s,v12.4s,v13.4s
1391	add	w8,w8,w12
1392	add	v16.4s,v16.4s,v17.4s
1393	eor	w17,w17,w5
1394	add	v20.4s,v20.4s,v21.4s
1395	eor	w19,w19,w6
1396	eor	v3.16b,v3.16b,v0.16b
1397	eor	w20,w20,w7
1398	eor	v7.16b,v7.16b,v4.16b
1399	eor	w21,w21,w8
1400	eor	v11.16b,v11.16b,v8.16b
1401	ror	w17,w17,#16
1402	eor	v15.16b,v15.16b,v12.16b
1403	ror	w19,w19,#16
1404	eor	v19.16b,v19.16b,v16.16b
1405	ror	w20,w20,#16
1406	eor	v23.16b,v23.16b,v20.16b
1407	ror	w21,w21,#16
1408	rev32	v3.8h,v3.8h	// swap 16-bit halves of each 32-bit lane = rotate by 16
1409	add	w13,w13,w17
1410	rev32	v7.8h,v7.8h
1411	add	w14,w14,w19
1412	rev32	v11.8h,v11.8h
1413	add	w15,w15,w20
1414	rev32	v15.8h,v15.8h
1415	add	w16,w16,w21
1416	rev32	v19.8h,v19.8h
1417	eor	w9,w9,w13
1418	rev32	v23.8h,v23.8h
1419	eor	w10,w10,w14
1420	add	v2.4s,v2.4s,v3.4s
1421	eor	w11,w11,w15
1422	add	v6.4s,v6.4s,v7.4s
1423	eor	w12,w12,w16
1424	add	v10.4s,v10.4s,v11.4s
1425	ror	w9,w9,#20
1426	add	v14.4s,v14.4s,v15.4s
1427	ror	w10,w10,#20
1428	add	v18.4s,v18.4s,v19.4s
1429	ror	w11,w11,#20
1430	add	v22.4s,v22.4s,v23.4s
1431	ror	w12,w12,#20
1432	eor	v24.16b,v1.16b,v2.16b	// XOR result goes to a temporary so ushr+sli can rebuild v1
1433	add	w5,w5,w9
1434	eor	v25.16b,v5.16b,v6.16b
1435	add	w6,w6,w10
1436	eor	v26.16b,v9.16b,v10.16b
1437	add	w7,w7,w11
1438	eor	v27.16b,v13.16b,v14.16b
1439	add	w8,w8,w12
1440	eor	v28.16b,v17.16b,v18.16b
1441	eor	w17,w17,w5
1442	eor	v29.16b,v21.16b,v22.16b
1443	eor	w19,w19,w6
1444	ushr	v1.4s,v24.4s,#20	// ushr #20 + sli #12 below = rotate right by 20 (matches scalar ror #20)
1445	eor	w20,w20,w7
1446	ushr	v5.4s,v25.4s,#20
1447	eor	w21,w21,w8
1448	ushr	v9.4s,v26.4s,#20
1449	ror	w17,w17,#24
1450	ushr	v13.4s,v27.4s,#20
1451	ror	w19,w19,#24
1452	ushr	v17.4s,v28.4s,#20
1453	ror	w20,w20,#24
1454	ushr	v21.4s,v29.4s,#20
1455	ror	w21,w21,#24
1456	sli	v1.4s,v24.4s,#12
1457	add	w13,w13,w17
1458	sli	v5.4s,v25.4s,#12
1459	add	w14,w14,w19
1460	sli	v9.4s,v26.4s,#12
1461	add	w15,w15,w20
1462	sli	v13.4s,v27.4s,#12
1463	add	w16,w16,w21
1464	sli	v17.4s,v28.4s,#12
1465	eor	w9,w9,w13
1466	sli	v21.4s,v29.4s,#12
1467	eor	w10,w10,w14
1468	add	v0.4s,v0.4s,v1.4s
1469	eor	w11,w11,w15
1470	add	v4.4s,v4.4s,v5.4s
1471	eor	w12,w12,w16
1472	add	v8.4s,v8.4s,v9.4s
1473	ror	w9,w9,#25
1474	add	v12.4s,v12.4s,v13.4s
1475	ror	w10,w10,#25
1476	add	v16.4s,v16.4s,v17.4s
1477	ror	w11,w11,#25
1478	add	v20.4s,v20.4s,v21.4s
1479	ror	w12,w12,#25
1480	eor	v24.16b,v3.16b,v0.16b
1481	add	w5,w5,w10	// scalar: pairing shifts here, w5-w8 now combine with w10,w11,w12,w9
1482	eor	v25.16b,v7.16b,v4.16b
1483	add	w6,w6,w11
1484	eor	v26.16b,v11.16b,v8.16b
1485	add	w7,w7,w12
1486	eor	v27.16b,v15.16b,v12.16b
1487	add	w8,w8,w9
1488	eor	v28.16b,v19.16b,v16.16b
1489	eor	w21,w21,w5
1490	eor	v29.16b,v23.16b,v20.16b
1491	eor	w17,w17,w6
1492	ushr	v3.4s,v24.4s,#24	// ushr #24 + sli #8 below = rotate right by 24
1493	eor	w19,w19,w7
1494	ushr	v7.4s,v25.4s,#24
1495	eor	w20,w20,w8
1496	ushr	v11.4s,v26.4s,#24
1497	ror	w21,w21,#16
1498	ushr	v15.4s,v27.4s,#24
1499	ror	w17,w17,#16
1500	ushr	v19.4s,v28.4s,#24
1501	ror	w19,w19,#16
1502	ushr	v23.4s,v29.4s,#24
1503	ror	w20,w20,#16
1504	sli	v3.4s,v24.4s,#8
1505	add	w15,w15,w21
1506	sli	v7.4s,v25.4s,#8
1507	add	w16,w16,w17
1508	sli	v11.4s,v26.4s,#8
1509	add	w13,w13,w19
1510	sli	v15.4s,v27.4s,#8
1511	add	w14,w14,w20
1512	sli	v19.4s,v28.4s,#8
1513	eor	w10,w10,w15
1514	sli	v23.4s,v29.4s,#8
1515	eor	w11,w11,w16
1516	add	v2.4s,v2.4s,v3.4s
1517	eor	w12,w12,w13
1518	add	v6.4s,v6.4s,v7.4s
1519	eor	w9,w9,w14
1520	add	v10.4s,v10.4s,v11.4s
1521	ror	w10,w10,#20
1522	add	v14.4s,v14.4s,v15.4s
1523	ror	w11,w11,#20
1524	add	v18.4s,v18.4s,v19.4s
1525	ror	w12,w12,#20
1526	add	v22.4s,v22.4s,v23.4s
1527	ror	w9,w9,#20
1528	eor	v24.16b,v1.16b,v2.16b
1529	add	w5,w5,w10
1530	eor	v25.16b,v5.16b,v6.16b
1531	add	w6,w6,w11
1532	eor	v26.16b,v9.16b,v10.16b
1533	add	w7,w7,w12
1534	eor	v27.16b,v13.16b,v14.16b
1535	add	w8,w8,w9
1536	eor	v28.16b,v17.16b,v18.16b
1537	eor	w21,w21,w5
1538	eor	v29.16b,v21.16b,v22.16b
1539	eor	w17,w17,w6
1540	ushr	v1.4s,v24.4s,#25	// ushr #25 + sli #7 below = rotate right by 25
1541	eor	w19,w19,w7
1542	ushr	v5.4s,v25.4s,#25
1543	eor	w20,w20,w8
1544	ushr	v9.4s,v26.4s,#25
1545	ror	w21,w21,#24
1546	ushr	v13.4s,v27.4s,#25
1547	ror	w17,w17,#24
1548	ushr	v17.4s,v28.4s,#25
1549	ror	w19,w19,#24
1550	ushr	v21.4s,v29.4s,#25
1551	ror	w20,w20,#24
1552	sli	v1.4s,v24.4s,#7
1553	add	w15,w15,w21
1554	sli	v5.4s,v25.4s,#7
1555	add	w16,w16,w17
1556	sli	v9.4s,v26.4s,#7
1557	add	w13,w13,w19
1558	sli	v13.4s,v27.4s,#7
1559	add	w14,w14,w20
1560	sli	v17.4s,v28.4s,#7
1561	eor	w10,w10,w15
1562	sli	v21.4s,v29.4s,#7
1563	eor	w11,w11,w16
1564	ext	v2.16b,v2.16b,v2.16b,#8	// ext with identical sources rotates the vector: 8 bytes = two 32-bit lanes
1565	eor	w12,w12,w13
1566	ext	v6.16b,v6.16b,v6.16b,#8
1567	eor	w9,w9,w14
1568	ext	v10.16b,v10.16b,v10.16b,#8
1569	ror	w10,w10,#25
1570	ext	v14.16b,v14.16b,v14.16b,#8
1571	ror	w11,w11,#25
1572	ext	v18.16b,v18.16b,v18.16b,#8
1573	ror	w12,w12,#25
1574	ext	v22.16b,v22.16b,v22.16b,#8
1575	ror	w9,w9,#25
1576	ext	v3.16b,v3.16b,v3.16b,#12	// first half: v3-type vectors rotated by 12 bytes
1577	ext	v7.16b,v7.16b,v7.16b,#12
1578	ext	v11.16b,v11.16b,v11.16b,#12
1579	ext	v15.16b,v15.16b,v15.16b,#12
1580	ext	v19.16b,v19.16b,v19.16b,#12
1581	ext	v23.16b,v23.16b,v23.16b,#12
1582	ext	v1.16b,v1.16b,v1.16b,#4	// first half: v1-type vectors rotated by 4 bytes
1583	ext	v5.16b,v5.16b,v5.16b,#4
1584	ext	v9.16b,v9.16b,v9.16b,#4
1585	ext	v13.16b,v13.16b,v13.16b,#4
1586	ext	v17.16b,v17.16b,v17.16b,#4
1587	ext	v21.16b,v21.16b,v21.16b,#4
1588	add	v0.4s,v0.4s,v1.4s	// second half of the pass: same sequence on the lane-rotated vectors
1589	add	w5,w5,w9
1590	add	v4.4s,v4.4s,v5.4s
1591	add	w6,w6,w10
1592	add	v8.4s,v8.4s,v9.4s
1593	add	w7,w7,w11
1594	add	v12.4s,v12.4s,v13.4s
1595	add	w8,w8,w12
1596	add	v16.4s,v16.4s,v17.4s
1597	eor	w17,w17,w5
1598	add	v20.4s,v20.4s,v21.4s
1599	eor	w19,w19,w6
1600	eor	v3.16b,v3.16b,v0.16b
1601	eor	w20,w20,w7
1602	eor	v7.16b,v7.16b,v4.16b
1603	eor	w21,w21,w8
1604	eor	v11.16b,v11.16b,v8.16b
1605	ror	w17,w17,#16
1606	eor	v15.16b,v15.16b,v12.16b
1607	ror	w19,w19,#16
1608	eor	v19.16b,v19.16b,v16.16b
1609	ror	w20,w20,#16
1610	eor	v23.16b,v23.16b,v20.16b
1611	ror	w21,w21,#16
1612	rev32	v3.8h,v3.8h
1613	add	w13,w13,w17
1614	rev32	v7.8h,v7.8h
1615	add	w14,w14,w19
1616	rev32	v11.8h,v11.8h
1617	add	w15,w15,w20
1618	rev32	v15.8h,v15.8h
1619	add	w16,w16,w21
1620	rev32	v19.8h,v19.8h
1621	eor	w9,w9,w13
1622	rev32	v23.8h,v23.8h
1623	eor	w10,w10,w14
1624	add	v2.4s,v2.4s,v3.4s
1625	eor	w11,w11,w15
1626	add	v6.4s,v6.4s,v7.4s
1627	eor	w12,w12,w16
1628	add	v10.4s,v10.4s,v11.4s
1629	ror	w9,w9,#20
1630	add	v14.4s,v14.4s,v15.4s
1631	ror	w10,w10,#20
1632	add	v18.4s,v18.4s,v19.4s
1633	ror	w11,w11,#20
1634	add	v22.4s,v22.4s,v23.4s
1635	ror	w12,w12,#20
1636	eor	v24.16b,v1.16b,v2.16b
1637	add	w5,w5,w9
1638	eor	v25.16b,v5.16b,v6.16b
1639	add	w6,w6,w10
1640	eor	v26.16b,v9.16b,v10.16b
1641	add	w7,w7,w11
1642	eor	v27.16b,v13.16b,v14.16b
1643	add	w8,w8,w12
1644	eor	v28.16b,v17.16b,v18.16b
1645	eor	w17,w17,w5
1646	eor	v29.16b,v21.16b,v22.16b
1647	eor	w19,w19,w6
1648	ushr	v1.4s,v24.4s,#20
1649	eor	w20,w20,w7
1650	ushr	v5.4s,v25.4s,#20
1651	eor	w21,w21,w8
1652	ushr	v9.4s,v26.4s,#20
1653	ror	w17,w17,#24
1654	ushr	v13.4s,v27.4s,#20
1655	ror	w19,w19,#24
1656	ushr	v17.4s,v28.4s,#20
1657	ror	w20,w20,#24
1658	ushr	v21.4s,v29.4s,#20
1659	ror	w21,w21,#24
1660	sli	v1.4s,v24.4s,#12
1661	add	w13,w13,w17
1662	sli	v5.4s,v25.4s,#12
1663	add	w14,w14,w19
1664	sli	v9.4s,v26.4s,#12
1665	add	w15,w15,w20
1666	sli	v13.4s,v27.4s,#12
1667	add	w16,w16,w21
1668	sli	v17.4s,v28.4s,#12
1669	eor	w9,w9,w13
1670	sli	v21.4s,v29.4s,#12
1671	eor	w10,w10,w14
1672	add	v0.4s,v0.4s,v1.4s
1673	eor	w11,w11,w15
1674	add	v4.4s,v4.4s,v5.4s
1675	eor	w12,w12,w16
1676	add	v8.4s,v8.4s,v9.4s
1677	ror	w9,w9,#25
1678	add	v12.4s,v12.4s,v13.4s
1679	ror	w10,w10,#25
1680	add	v16.4s,v16.4s,v17.4s
1681	ror	w11,w11,#25
1682	add	v20.4s,v20.4s,v21.4s
1683	ror	w12,w12,#25
1684	eor	v24.16b,v3.16b,v0.16b
1685	add	w5,w5,w10
1686	eor	v25.16b,v7.16b,v4.16b
1687	add	w6,w6,w11
1688	eor	v26.16b,v11.16b,v8.16b
1689	add	w7,w7,w12
1690	eor	v27.16b,v15.16b,v12.16b
1691	add	w8,w8,w9
1692	eor	v28.16b,v19.16b,v16.16b
1693	eor	w21,w21,w5
1694	eor	v29.16b,v23.16b,v20.16b
1695	eor	w17,w17,w6
1696	ushr	v3.4s,v24.4s,#24
1697	eor	w19,w19,w7
1698	ushr	v7.4s,v25.4s,#24
1699	eor	w20,w20,w8
1700	ushr	v11.4s,v26.4s,#24
1701	ror	w21,w21,#16
1702	ushr	v15.4s,v27.4s,#24
1703	ror	w17,w17,#16
1704	ushr	v19.4s,v28.4s,#24
1705	ror	w19,w19,#16
1706	ushr	v23.4s,v29.4s,#24
1707	ror	w20,w20,#16
1708	sli	v3.4s,v24.4s,#8
1709	add	w15,w15,w21
1710	sli	v7.4s,v25.4s,#8
1711	add	w16,w16,w17
1712	sli	v11.4s,v26.4s,#8
1713	add	w13,w13,w19
1714	sli	v15.4s,v27.4s,#8
1715	add	w14,w14,w20
1716	sli	v19.4s,v28.4s,#8
1717	eor	w10,w10,w15
1718	sli	v23.4s,v29.4s,#8
1719	eor	w11,w11,w16
1720	add	v2.4s,v2.4s,v3.4s
1721	eor	w12,w12,w13
1722	add	v6.4s,v6.4s,v7.4s
1723	eor	w9,w9,w14
1724	add	v10.4s,v10.4s,v11.4s
1725	ror	w10,w10,#20
1726	add	v14.4s,v14.4s,v15.4s
1727	ror	w11,w11,#20
1728	add	v18.4s,v18.4s,v19.4s
1729	ror	w12,w12,#20
1730	add	v22.4s,v22.4s,v23.4s
1731	ror	w9,w9,#20
1732	eor	v24.16b,v1.16b,v2.16b
1733	add	w5,w5,w10
1734	eor	v25.16b,v5.16b,v6.16b
1735	add	w6,w6,w11
1736	eor	v26.16b,v9.16b,v10.16b
1737	add	w7,w7,w12
1738	eor	v27.16b,v13.16b,v14.16b
1739	add	w8,w8,w9
1740	eor	v28.16b,v17.16b,v18.16b
1741	eor	w21,w21,w5
1742	eor	v29.16b,v21.16b,v22.16b
1743	eor	w17,w17,w6
1744	ushr	v1.4s,v24.4s,#25
1745	eor	w19,w19,w7
1746	ushr	v5.4s,v25.4s,#25
1747	eor	w20,w20,w8
1748	ushr	v9.4s,v26.4s,#25
1749	ror	w21,w21,#24
1750	ushr	v13.4s,v27.4s,#25
1751	ror	w17,w17,#24
1752	ushr	v17.4s,v28.4s,#25
1753	ror	w19,w19,#24
1754	ushr	v21.4s,v29.4s,#25
1755	ror	w20,w20,#24
1756	sli	v1.4s,v24.4s,#7
1757	add	w15,w15,w21
1758	sli	v5.4s,v25.4s,#7
1759	add	w16,w16,w17
1760	sli	v9.4s,v26.4s,#7
1761	add	w13,w13,w19
1762	sli	v13.4s,v27.4s,#7
1763	add	w14,w14,w20
1764	sli	v17.4s,v28.4s,#7
1765	eor	w10,w10,w15
1766	sli	v21.4s,v29.4s,#7
1767	eor	w11,w11,w16
1768	ext	v2.16b,v2.16b,v2.16b,#8
1769	eor	w12,w12,w13
1770	ext	v6.16b,v6.16b,v6.16b,#8
1771	eor	w9,w9,w14
1772	ext	v10.16b,v10.16b,v10.16b,#8
1773	ror	w10,w10,#25
1774	ext	v14.16b,v14.16b,v14.16b,#8
1775	ror	w11,w11,#25
1776	ext	v18.16b,v18.16b,v18.16b,#8
1777	ror	w12,w12,#25
1778	ext	v22.16b,v22.16b,v22.16b,#8
1779	ror	w9,w9,#25
1780	ext	v3.16b,v3.16b,v3.16b,#4	// second half: 4 here vs 12 above, so the two rotations cancel (16 bytes total)
1781	ext	v7.16b,v7.16b,v7.16b,#4
1782	ext	v11.16b,v11.16b,v11.16b,#4
1783	ext	v15.16b,v15.16b,v15.16b,#4
1784	ext	v19.16b,v19.16b,v19.16b,#4
1785	ext	v23.16b,v23.16b,v23.16b,#4
1786	ext	v1.16b,v1.16b,v1.16b,#12	// second half: 12 here vs 4 above, lanes return to their original order
1787	ext	v5.16b,v5.16b,v5.16b,#12
1788	ext	v9.16b,v9.16b,v9.16b,#12
1789	ext	v13.16b,v13.16b,v13.16b,#12
1790	ext	v17.16b,v17.16b,v17.16b,#12
1791	ext	v21.16b,v21.16b,v21.16b,#12
1792	cbnz	x4,.Loop_lower_neon	// repeat until the counter decremented at the top reaches zero
1793
// After .Loop_lower_neon: add the key block (x22-x28/x30) to the scalar
// block and v24-v30 to the six NEON blocks (v24-v29 are first reloaded from
// the off-load area at sp), XOR with input read through x1, write
// 64 + 6*64 bytes through x0, advance the counters, then choose between
// another 512-byte pass, .Loop_outer_neon, .Loop_outer, or returning.
1794	add	w5,w5,w22		// accumulate key block
1795	ldp	q24,q25,[sp,#0]	// v24-v29 served as temporaries in the loop; reload them from the stack
1796	add	x6,x6,x22,lsr#32
1797	ldp	q26,q27,[sp,#32]
1798	add	w7,w7,w23
1799	ldp	q28,q29,[sp,#64]
1800	add	x8,x8,x23,lsr#32
1801	add	v0.4s,v0.4s,v24.4s	// the same v24 is added to the first vector of all six NEON blocks
1802	add	w9,w9,w24
1803	add	v4.4s,v4.4s,v24.4s
1804	add	x10,x10,x24,lsr#32
1805	add	v8.4s,v8.4s,v24.4s
1806	add	w11,w11,w25
1807	add	v12.4s,v12.4s,v24.4s
1808	add	x12,x12,x25,lsr#32
1809	add	v16.4s,v16.4s,v24.4s
1810	add	w13,w13,w26
1811	add	v20.4s,v20.4s,v24.4s
1812	add	x14,x14,x26,lsr#32
1813	add	v2.4s,v2.4s,v26.4s	// likewise v26 for the third vector of each block
1814	add	w15,w15,w27
1815	add	v6.4s,v6.4s,v26.4s
1816	add	x16,x16,x27,lsr#32
1817	add	v10.4s,v10.4s,v26.4s
1818	add	w17,w17,w28
1819	add	v14.4s,v14.4s,v26.4s
1820	add	x19,x19,x28,lsr#32
1821	add	v18.4s,v18.4s,v26.4s
1822	add	w20,w20,w30
1823	add	v22.4s,v22.4s,v26.4s
1824	add	x21,x21,x30,lsr#32
1825	add	v19.4s,v19.4s,v31.4s			// +4
1826	add	x5,x5,x6,lsl#32	// pack
1827	add	v23.4s,v23.4s,v31.4s			// +4
1828	add	x7,x7,x8,lsl#32
1829	add	v3.4s,v3.4s,v27.4s	// fourth vectors get distinct addends: v27,v28,v29,v30
1830	ldp	x6,x8,[x1,#0]		// load input
1831	add	v7.4s,v7.4s,v28.4s
1832	add	x9,x9,x10,lsl#32
1833	add	v11.4s,v11.4s,v29.4s
1834	add	x11,x11,x12,lsl#32
1835	add	v15.4s,v15.4s,v30.4s
1836	ldp	x10,x12,[x1,#16]
1837	add	v19.4s,v19.4s,v27.4s	// blocks five and six reuse v27/v28 on top of the +4 applied above
1838	add	x13,x13,x14,lsl#32
1839	add	v23.4s,v23.4s,v28.4s
1840	add	x15,x15,x16,lsl#32
1841	add	v1.4s,v1.4s,v25.4s	// v25 for the second vector of each block
1842	ldp	x14,x16,[x1,#32]
1843	add	v5.4s,v5.4s,v25.4s
1844	add	x17,x17,x19,lsl#32
1845	add	v9.4s,v9.4s,v25.4s
1846	add	x20,x20,x21,lsl#32
1847	add	v13.4s,v13.4s,v25.4s
1848	ldp	x19,x21,[x1,#48]
1849	add	v17.4s,v17.4s,v25.4s
1850	add	x1,x1,#64	// input pointer now past the scalar block's 64 bytes
1851	add	v21.4s,v21.4s,v25.4s
1852
1853#ifdef	__ARMEB__
1854	rev	x5,x5	// big-endian build only: byte-reverse the packed scalar words
1855	rev	x7,x7
1856	rev	x9,x9
1857	rev	x11,x11
1858	rev	x13,x13
1859	rev	x15,x15
1860	rev	x17,x17
1861	rev	x20,x20
1862#endif
1863	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64	// post-indexed: 64 bytes of input into v24-v27, x1 += 64
1864	eor	x5,x5,x6
1865	eor	x7,x7,x8
1866	eor	x9,x9,x10
1867	eor	x11,x11,x12
1868	eor	x13,x13,x14
1869	eor	v0.16b,v0.16b,v24.16b
1870	eor	x15,x15,x16
1871	eor	v1.16b,v1.16b,v25.16b
1872	eor	x17,x17,x19
1873	eor	v2.16b,v2.16b,v26.16b
1874	eor	x20,x20,x21
1875	eor	v3.16b,v3.16b,v27.16b
1876	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64	// next 64 input bytes, for v4-v7
1877
1878	stp	x5,x7,[x0,#0]		// store output
1879	add	x28,x28,#7			// increment counter (+1 after the upper loop, +7 here; vectors get += 8 below)
1880	stp	x9,x11,[x0,#16]
1881	stp	x13,x15,[x0,#32]
1882	stp	x17,x20,[x0,#48]
1883	add	x0,x0,#64
1884	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1885
1886	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 were just stored, so reuse them as input buffers
1887	eor	v4.16b,v4.16b,v24.16b
1888	eor	v5.16b,v5.16b,v25.16b
1889	eor	v6.16b,v6.16b,v26.16b
1890	eor	v7.16b,v7.16b,v27.16b
1891	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1892
1893	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1894	eor	v8.16b,v8.16b,v0.16b
1895	ldp	q24,q25,[sp,#0]	// input in v24-v27 has been consumed; restore them from the off-load area
1896	eor	v9.16b,v9.16b,v1.16b
1897	ldp	q26,q27,[sp,#32]
1898	eor	v10.16b,v10.16b,v2.16b
1899	eor	v11.16b,v11.16b,v3.16b
1900	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1901
1902	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1903	eor	v12.16b,v12.16b,v4.16b
1904	eor	v13.16b,v13.16b,v5.16b
1905	eor	v14.16b,v14.16b,v6.16b
1906	eor	v15.16b,v15.16b,v7.16b
1907	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1908
1909	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1910	eor	v16.16b,v16.16b,v8.16b
1911	eor	v17.16b,v17.16b,v9.16b
1912	eor	v18.16b,v18.16b,v10.16b
1913	eor	v19.16b,v19.16b,v11.16b
1914	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1915
1916	shl	v0.4s,v31.4s,#1			// 4 -> 8
1917	eor	v20.16b,v20.16b,v12.16b
1918	eor	v21.16b,v21.16b,v13.16b
1919	eor	v22.16b,v22.16b,v14.16b
1920	eor	v23.16b,v23.16b,v15.16b
1921	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1922
1923	add	v27.4s,v27.4s,v0.4s			// += 8
1924	add	v28.4s,v28.4s,v0.4s
1925	add	v29.4s,v29.4s,v0.4s
1926	add	v30.4s,v30.4s,v0.4s
1927
1928	b.hs	.Loop_outer_512_neon	// NOTE(review): no instruction in this part of the function sets flags before this branch; presumably they come from a subs on x2 at the top of the outer loop - confirm
1929
1930	adds	x2,x2,#512	// Z is set when x2 becomes 0; consumed by b.eq below (nothing in between touches flags)
1931	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1932
1933	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1934	ldp	d10,d11,[sp,#128+16]
1935	ldp	d12,d13,[sp,#128+32]
1936	ldp	d14,d15,[sp,#128+48]
1937
1938	stp	q24,q31,[sp,#0]		// wipe off-load area
1939	stp	q24,q31,[sp,#32]
1940	stp	q24,q31,[sp,#64]
1941
1942	b.eq	.Ldone_512_neon	// x2 == 0 after the adds above: nothing left to process
1943
1944	cmp	x2,#192
1945	sub	v27.4s,v27.4s,v0.4s			// -= 1
1946	sub	v28.4s,v28.4s,v0.4s
1947	sub	v29.4s,v29.4s,v0.4s
1948	add	sp,sp,#128	// release the 128 bytes below the d8-d15 slots
1949	b.hs	.Loop_outer_neon	// x2 >= 192 (cmp above; vector sub and add sp leave flags alone)
1950
1951	eor	v25.16b,v25.16b,v25.16b	// fewer than 192 bytes left: zero v25-v30 before the scalar .Loop_outer
1952	eor	v26.16b,v26.16b,v26.16b
1953	eor	v27.16b,v27.16b,v27.16b
1954	eor	v28.16b,v28.16b,v28.16b
1955	eor	v29.16b,v29.16b,v29.16b
1956	eor	v30.16b,v30.16b,v30.16b
1957	b	.Loop_outer
1958
// Return path of ChaCha20_512_neon: restore x19-x28 from the frame addressed
// by x29, release the stack, restore x29/x30 and return.
// NOTE(review): the matching prologue is outside this part of the file; the
// offsets here presumably mirror it - confirm.
1959.Ldone_512_neon:
1960	ldp	x19,x20,[x29,#16]	// x19-x28 are reloaded relative to x29, so the sp adjustment can be interleaved
1961	add	sp,sp,#128+64	// drop 128+64 bytes of stack still allocated on this path
1962	ldp	x21,x22,[x29,#32]
1963	ldp	x23,x24,[x29,#48]
1964	ldp	x25,x26,[x29,#64]
1965	ldp	x27,x28,[x29,#80]
1966	ldp	x29,x30,[sp],#96	// post-indexed: restore x29/x30 and free the 96-byte frame
1967	ret
1968.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1969