xref: /netbsd-src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/keccak1600-armv8.S (revision 1b3d6f93806f8821fe459e13ad13e605b37c6d43)
// Keccak round-constant table (24 entries of 64 bits each).
//
// Layout is deliberate: .align 8 starts the region on a 256-byte boundary
// (on AArch64 GAS the operand is a power-of-two exponent), the eight zero
// quads pad 64 bytes, and the 24 constants fill the remaining 192 bytes.
// The first byte AFTER the table is therefore 256-byte aligned, which is
// what KeccakF1600_int tests for (low 8 address bits == 0) to end its loop.
// Adding, removing or moving entries breaks that termination test.
1.text
2
3.align	8	// strategic alignment and padding that allows to use
4		// address value as loop termination condition...
5.quad	0,0,0,0,0,0,0,0
6.type	iotas,%object
7iotas:
8.quad	0x0000000000000001
9.quad	0x0000000000008082
10.quad	0x800000000000808a
11.quad	0x8000000080008000
12.quad	0x000000000000808b
13.quad	0x0000000080000001
14.quad	0x8000000080008081
15.quad	0x8000000000008009
16.quad	0x000000000000008a
17.quad	0x0000000000000088
18.quad	0x0000000080008009
19.quad	0x000000008000000a
20.quad	0x000000008000808b
21.quad	0x800000000000008b
22.quad	0x8000000000008089
23.quad	0x8000000000008003
24.quad	0x8000000000008002
25.quad	0x8000000000000080
26.quad	0x000000000000800a
27.quad	0x800000008000000a
28.quad	0x8000000080008081
29.quad	0x8000000000008080
30.quad	0x0000000080000001
31.quad	0x8000000080008008
32.size	iotas,.-iotas
// KeccakF1600_int: register-resident Keccak permutation (internal helper).
//
// Register contract, as used by the callers in this file:
//   In/Out:  the 25 state lanes live in x0-x17, x25, x19-x24, in that
//            order (x25 carries the lane at index 18; x18 is never touched).
//   Scratch: x26, x27, x28 and x30 are overwritten.
//   Flags:   NZCV is clobbered.
//   Stack:   does not move sp; uses the 32 bytes at [sp,#0]..[sp,#31] that
//            the CALLER must have reserved:
//              [sp,#0]  temporary home for x4/x9 during Theta
//              [sp,#16] current position in the iotas table
//              [sp,#24] return address (x30), reloaded just before ret
//
// Each pass through .Loop performs one round (Theta, Rho+Pi, Chi+Iota) and
// consumes one iotas entry. The loop ends when the post-incremented iotas
// pointer reaches a 256-byte boundary, i.e. just past the last entry.
33.type	KeccakF1600_int,%function
34.align	5
35KeccakF1600_int:
36	adr	x28,iotas
37.inst	0xd503233f			// paciasp
38	stp	x28,x30,[sp,#16]		// 32 bytes on top are mine
39	b	.Loop
40.align	4
41.Loop:
42	////////////////////////////////////////// Theta
// Column parities are accumulated in x26, x27, x28, x30 and x4. Because x4
// is also a live state lane, x4 and x9 are parked on the stack first.
43	eor	x26,x0,x5
44	stp	x4,x9,[sp,#0]	// offload pair...
45	eor	x27,x1,x6
46	eor	x28,x2,x7
47	eor	x30,x3,x8
48	eor	x4,x4,x9
49	eor	x26,x26,x10
50	eor	x27,x27,x11
51	eor	x28,x28,x12
52	eor	x30,x30,x13
53	eor	x4,x4,x14
54	eor	x26,x26,x15
55	eor	x27,x27,x16
56	eor	x28,x28,x17
57	eor	x30,x30,x25
58	eor	x4,x4,x19
59	eor	x26,x26,x20
60	eor	x28,x28,x22
61	eor	x27,x27,x21
62	eor	x30,x30,x23
63	eor	x4,x4,x24
64
// ror#63 is a rotate left by one bit; x9 is reused as a temporary here and
// gets its real value back from the stack further down.
65	eor	x9,x26,x28,ror#63
66
67	eor	x1,x1,x9
68	eor	x6,x6,x9
69	eor	x11,x11,x9
70	eor	x16,x16,x9
71	eor	x21,x21,x9
72
73	eor	x9,x27,x30,ror#63
74	eor	x28,x28,x4,ror#63
75	eor	x30,x30,x26,ror#63
76	eor	x4,x4,x27,ror#63
77
// The three-operand eor forms marked "mov" below fold a register copy into
// the Theta update, so Rho+Pi can read x27/x26/x28 instead of x2/x3/x4.
78	eor	x27,   x2,x9		// mov	x27,x2
79	eor	x7,x7,x9
80	eor	x12,x12,x9
81	eor	x17,x17,x9
82	eor	x22,x22,x9
83
84	eor	x0,x0,x4
85	eor	x5,x5,x4
86	eor	x10,x10,x4
87	eor	x15,x15,x4
88	eor	x20,x20,x4
89	ldp	x4,x9,[sp,#0]	// re-load offloaded data
90	eor	x26,   x3,x28		// mov	x26,x3
91	eor	x8,x8,x28
92	eor	x13,x13,x28
93	eor	x25,x25,x28
94	eor	x23,x23,x28
95
96	eor	x28,   x4,x30		// mov	x28,x4
97	eor	x9,x9,x30
98	eor	x14,x14,x30
99	eor	x19,x19,x30
100	eor	x24,x24,x30
101
102	////////////////////////////////////////// Rho+Pi
// Rotations are written as ror by 64-n, i.e. rotate left by n. The order of
// the moves matters: every source register is read before it is overwritten.
103	mov	x30,x1
104	ror	x1,x6,#64-44
105	//mov	x27,x2
106	ror	x2,x12,#64-43
107	//mov	x26,x3
108	ror	x3,x25,#64-21
109	//mov	x28,x4
110	ror	x4,x24,#64-14
111
112	ror	x6,x9,#64-20
113	ror	x12,x13,#64-25
114	ror	x25,x17,#64-15
115	ror	x24,x21,#64-2
116
117	ror	x9,x22,#64-61
118	ror	x13,x19,#64-8
119	ror	x17,x11,#64-10
120	ror	x21,x8,#64-55
121
122	ror	x22,x14,#64-39
123	ror	x19,x23,#64-56
124	ror	x11,x7,#64-6
125	ror	x8,x16,#64-45
126
127	ror	x14,x20,#64-18
128	ror	x23,x15,#64-41
129	ror	x7,x10,#64-3
130	ror	x16,x5,#64-36
131
132	ror	x5,x26,#64-28
133	ror	x10,x30,#64-1
134	ror	x15,x28,#64-27
135	ror	x20,x27,#64-62
136
137	////////////////////////////////////////// Chi+Iota
// Each group of five lanes is combined with bic/eor using x26, x27, x28 and
// x30 as temporaries. The iotas pointer is fetched from [sp,#16] in the
// middle of the first group because no register is free to hold it.
138	bic	x26,x2,x1
139	bic	x27,x3,x2
140	bic	x28,x0,x4
141	bic	x30,x1,x0
142	eor	x0,x0,x26
143	bic	x26,x4,x3
144	eor	x1,x1,x27
145	ldr	x27,[sp,#16]
146	eor	x3,x3,x28
147	eor	x4,x4,x30
148	eor	x2,x2,x26
149	ldr	x30,[x27],#8		// Iota[i++]
150
151	bic	x26,x7,x6
// The flags set by this tst are consumed by the bne at the bottom of the
// loop; only str, bic and eor (none of which write NZCV) sit in between.
152	tst	x27,#255			// are we done?
153	str	x27,[sp,#16]
154	bic	x27,x8,x7
155	bic	x28,x5,x9
156	eor	x0,x0,x30		// A[0][0] ^= Iota
157	bic	x30,x6,x5
158	eor	x5,x5,x26
159	bic	x26,x9,x8
160	eor	x6,x6,x27
161	eor	x8,x8,x28
162	eor	x9,x9,x30
163	eor	x7,x7,x26
164
165	bic	x26,x12,x11
166	bic	x27,x13,x12
167	bic	x28,x10,x14
168	bic	x30,x11,x10
169	eor	x10,x10,x26
170	bic	x26,x14,x13
171	eor	x11,x11,x27
172	eor	x13,x13,x28
173	eor	x14,x14,x30
174	eor	x12,x12,x26
175
176	bic	x26,x17,x16
177	bic	x27,x25,x17
178	bic	x28,x15,x19
179	bic	x30,x16,x15
180	eor	x15,x15,x26
181	bic	x26,x19,x25
182	eor	x16,x16,x27
183	eor	x25,x25,x28
184	eor	x19,x19,x30
185	eor	x17,x17,x26
186
187	bic	x26,x22,x21
188	bic	x27,x23,x22
189	bic	x28,x20,x24
190	bic	x30,x21,x20
191	eor	x20,x20,x26
192	bic	x26,x24,x23
193	eor	x21,x21,x27
194	eor	x23,x23,x28
195	eor	x24,x24,x30
196	eor	x22,x22,x26
197
198	bne	.Loop
199
// x30 served as a temporary throughout the loop; fetch the return address
// saved at entry before authenticating it and returning.
200	ldr	x30,[sp,#24]
201.inst	0xd50323bf			// autiasp
202	ret
203.size	KeccakF1600_int,.-KeccakF1600_int
204
// KeccakF1600: apply the permutation to a state held in memory.
//
//   In:   x0 = pointer to 25 consecutive 64-bit lanes (200 bytes)
//   Out:  the state at that address is updated in place
//   Saves and restores x19-x28, x29 and x30.
//
// Stack frame: 128 bytes for x29/x30 and x19-x28 (addressed through x29),
// plus 48 bytes below that:
//   [sp,#0]..[sp,#31]  scratch area owned by KeccakF1600_int
//   [sp,#32]           copy of the state pointer, since x0 becomes lane 0
//
// No .globl directive is given for this symbol in this file, so it is only
// reachable from the code here.
205.type	KeccakF1600,%function
206.align	5
207KeccakF1600:
208.inst	0xd503233f			// paciasp
209	stp	x29,x30,[sp,#-128]!
210	add	x29,sp,#0
211	stp	x19,x20,[sp,#16]
212	stp	x21,x22,[sp,#32]
213	stp	x23,x24,[sp,#48]
214	stp	x25,x26,[sp,#64]
215	stp	x27,x28,[sp,#80]
216	sub	sp,sp,#48
217
218	str	x0,[sp,#32]			// offload argument
// x26 is the base for the loads because the first ldp overwrites x0. Lane 18
// goes to x25 instead of x18, matching the layout KeccakF1600_int expects.
219	mov	x26,x0
220	ldp	x0,x1,[x0,#16*0]
221	ldp	x2,x3,[x26,#16*1]
222	ldp	x4,x5,[x26,#16*2]
223	ldp	x6,x7,[x26,#16*3]
224	ldp	x8,x9,[x26,#16*4]
225	ldp	x10,x11,[x26,#16*5]
226	ldp	x12,x13,[x26,#16*6]
227	ldp	x14,x15,[x26,#16*7]
228	ldp	x16,x17,[x26,#16*8]
229	ldp	x25,x19,[x26,#16*9]
230	ldp	x20,x21,[x26,#16*10]
231	ldp	x22,x23,[x26,#16*11]
232	ldr	x24,[x26,#16*12]
233
234	bl	KeccakF1600_int
235
// x26 was clobbered by the permutation; recover the state pointer from the
// stack and write all 25 lanes back.
236	ldr	x26,[sp,#32]
237	stp	x0,x1,[x26,#16*0]
238	stp	x2,x3,[x26,#16*1]
239	stp	x4,x5,[x26,#16*2]
240	stp	x6,x7,[x26,#16*3]
241	stp	x8,x9,[x26,#16*4]
242	stp	x10,x11,[x26,#16*5]
243	stp	x12,x13,[x26,#16*6]
244	stp	x14,x15,[x26,#16*7]
245	stp	x16,x17,[x26,#16*8]
246	stp	x25,x19,[x26,#16*9]
247	stp	x20,x21,[x26,#16*10]
248	stp	x22,x23,[x26,#16*11]
249	str	x24,[x26,#16*12]
250
// Callee-saved registers are reloaded relative to x29, so releasing the
// 48-byte scratch area can be interleaved with the restores.
251	ldp	x19,x20,[x29,#16]
252	add	sp,sp,#48
253	ldp	x21,x22,[x29,#32]
254	ldp	x23,x24,[x29,#48]
255	ldp	x25,x26,[x29,#64]
256	ldp	x27,x28,[x29,#80]
257	ldp	x29,x30,[sp],#128
258.inst	0xd50323bf			// autiasp
259	ret
260.size	KeccakF1600,.-KeccakF1600
261
// SHA3_absorb: XOR whole blocks of input into the state, permuting the
// state after each block.
//
//   In:   x0 = uint64_t A[5][5] state (25 lanes)
//         x1 = input pointer
//         x2 = input length in bytes
//         x3 = block size in bytes
//   Out:  x0 = number of trailing input bytes NOT absorbed (less than the
//              block size); the state in memory is updated
//   Saves and restores x19-x28, x29 and x30.
//
// Stack frame: 128 bytes of saved registers (addressed through x29) plus
// 64 bytes below:
//   [sp,#0]..[sp,#31]  scratch area owned by KeccakF1600_int
//   [sp,#32]           state pointer
//   [sp,#40]           current input pointer
//   [sp,#48]           remaining length
//   [sp,#56]           block size
//
// The block size is assumed to be a multiple of 8 no larger than 200 bytes;
// nothing here checks that -- TODO confirm against the callers.
262.globl	SHA3_absorb
263.type	SHA3_absorb,%function
264.align	5
265SHA3_absorb:
266.inst	0xd503233f			// paciasp
267	stp	x29,x30,[sp,#-128]!
268	add	x29,sp,#0
269	stp	x19,x20,[sp,#16]
270	stp	x21,x22,[sp,#32]
271	stp	x23,x24,[sp,#48]
272	stp	x25,x26,[sp,#64]
273	stp	x27,x28,[sp,#80]
274	sub	sp,sp,#64
275
276	stp	x0,x1,[sp,#32]			// offload arguments
277	stp	x2,x3,[sp,#48]
278
// The argument registers are about to become state lanes, so the arguments
// move to x26-x28 and x30, the registers KeccakF1600_int treats as scratch.
279	mov	x26,x0			// uint64_t A[5][5]
280	mov	x27,x1			// const void *inp
281	mov	x28,x2			// size_t len
282	mov	x30,x3			// size_t bsz
283	ldp	x0,x1,[x26,#16*0]
284	ldp	x2,x3,[x26,#16*1]
285	ldp	x4,x5,[x26,#16*2]
286	ldp	x6,x7,[x26,#16*3]
287	ldp	x8,x9,[x26,#16*4]
288	ldp	x10,x11,[x26,#16*5]
289	ldp	x12,x13,[x26,#16*6]
290	ldp	x14,x15,[x26,#16*7]
291	ldp	x16,x17,[x26,#16*8]
292	ldp	x25,x19,[x26,#16*9]
293	ldp	x20,x21,[x26,#16*10]
294	ldp	x22,x23,[x26,#16*11]
295	ldr	x24,[x26,#16*12]
296	b	.Loop_absorb
297
298.align	4
// Loop invariant at this label: x27 = input pointer, x28 = bytes remaining,
// x30 = block size. Leave as soon as less than one full block remains.
299.Loop_absorb:
300	subs	x26,x28,x30		// len - bsz
301	blo	.Labsorbed
302
303	str	x26,[sp,#48]			// save len - bsz
// Unrolled lane ladder. Every "cmp x30,#8*(k+2)" serves two exits: blo leaves
// once lane k has been absorbed, and the beq that follows leaves once lane
// k+1 has been absorbed. The ldr, rev and eor in between do not write the
// flags. On big-endian builds each lane is byte-reversed so that the input
// is always interpreted as little-endian 64-bit words.
304	ldr	x26,[x27],#8		// *inp++
305#ifdef	__AARCH64EB__
306	rev	x26,x26
307#endif
308	eor	x0,x0,x26
309	cmp	x30,#8*(0+2)
310	blo	.Lprocess_block
311	ldr	x26,[x27],#8		// *inp++
312#ifdef	__AARCH64EB__
313	rev	x26,x26
314#endif
315	eor	x1,x1,x26
316	beq	.Lprocess_block
317	ldr	x26,[x27],#8		// *inp++
318#ifdef	__AARCH64EB__
319	rev	x26,x26
320#endif
321	eor	x2,x2,x26
322	cmp	x30,#8*(2+2)
323	blo	.Lprocess_block
324	ldr	x26,[x27],#8		// *inp++
325#ifdef	__AARCH64EB__
326	rev	x26,x26
327#endif
328	eor	x3,x3,x26
329	beq	.Lprocess_block
330	ldr	x26,[x27],#8		// *inp++
331#ifdef	__AARCH64EB__
332	rev	x26,x26
333#endif
334	eor	x4,x4,x26
335	cmp	x30,#8*(4+2)
336	blo	.Lprocess_block
337	ldr	x26,[x27],#8		// *inp++
338#ifdef	__AARCH64EB__
339	rev	x26,x26
340#endif
341	eor	x5,x5,x26
342	beq	.Lprocess_block
343	ldr	x26,[x27],#8		// *inp++
344#ifdef	__AARCH64EB__
345	rev	x26,x26
346#endif
347	eor	x6,x6,x26
348	cmp	x30,#8*(6+2)
349	blo	.Lprocess_block
350	ldr	x26,[x27],#8		// *inp++
351#ifdef	__AARCH64EB__
352	rev	x26,x26
353#endif
354	eor	x7,x7,x26
355	beq	.Lprocess_block
356	ldr	x26,[x27],#8		// *inp++
357#ifdef	__AARCH64EB__
358	rev	x26,x26
359#endif
360	eor	x8,x8,x26
361	cmp	x30,#8*(8+2)
362	blo	.Lprocess_block
363	ldr	x26,[x27],#8		// *inp++
364#ifdef	__AARCH64EB__
365	rev	x26,x26
366#endif
367	eor	x9,x9,x26
368	beq	.Lprocess_block
369	ldr	x26,[x27],#8		// *inp++
370#ifdef	__AARCH64EB__
371	rev	x26,x26
372#endif
373	eor	x10,x10,x26
374	cmp	x30,#8*(10+2)
375	blo	.Lprocess_block
376	ldr	x26,[x27],#8		// *inp++
377#ifdef	__AARCH64EB__
378	rev	x26,x26
379#endif
380	eor	x11,x11,x26
381	beq	.Lprocess_block
382	ldr	x26,[x27],#8		// *inp++
383#ifdef	__AARCH64EB__
384	rev	x26,x26
385#endif
386	eor	x12,x12,x26
387	cmp	x30,#8*(12+2)
388	blo	.Lprocess_block
389	ldr	x26,[x27],#8		// *inp++
390#ifdef	__AARCH64EB__
391	rev	x26,x26
392#endif
393	eor	x13,x13,x26
394	beq	.Lprocess_block
395	ldr	x26,[x27],#8		// *inp++
396#ifdef	__AARCH64EB__
397	rev	x26,x26
398#endif
399	eor	x14,x14,x26
400	cmp	x30,#8*(14+2)
401	blo	.Lprocess_block
402	ldr	x26,[x27],#8		// *inp++
403#ifdef	__AARCH64EB__
404	rev	x26,x26
405#endif
406	eor	x15,x15,x26
407	beq	.Lprocess_block
408	ldr	x26,[x27],#8		// *inp++
409#ifdef	__AARCH64EB__
410	rev	x26,x26
411#endif
412	eor	x16,x16,x26
413	cmp	x30,#8*(16+2)
414	blo	.Lprocess_block
415	ldr	x26,[x27],#8		// *inp++
416#ifdef	__AARCH64EB__
417	rev	x26,x26
418#endif
419	eor	x17,x17,x26
420	beq	.Lprocess_block
421	ldr	x26,[x27],#8		// *inp++
422#ifdef	__AARCH64EB__
423	rev	x26,x26
424#endif
425	eor	x25,x25,x26
426	cmp	x30,#8*(18+2)
427	blo	.Lprocess_block
428	ldr	x26,[x27],#8		// *inp++
429#ifdef	__AARCH64EB__
430	rev	x26,x26
431#endif
432	eor	x19,x19,x26
433	beq	.Lprocess_block
434	ldr	x26,[x27],#8		// *inp++
435#ifdef	__AARCH64EB__
436	rev	x26,x26
437#endif
438	eor	x20,x20,x26
439	cmp	x30,#8*(20+2)
440	blo	.Lprocess_block
441	ldr	x26,[x27],#8		// *inp++
442#ifdef	__AARCH64EB__
443	rev	x26,x26
444#endif
445	eor	x21,x21,x26
446	beq	.Lprocess_block
447	ldr	x26,[x27],#8		// *inp++
448#ifdef	__AARCH64EB__
449	rev	x26,x26
450#endif
451	eor	x22,x22,x26
452	cmp	x30,#8*(22+2)
453	blo	.Lprocess_block
454	ldr	x26,[x27],#8		// *inp++
455#ifdef	__AARCH64EB__
456	rev	x26,x26
457#endif
458	eor	x23,x23,x26
459	beq	.Lprocess_block
460	ldr	x26,[x27],#8		// *inp++
461#ifdef	__AARCH64EB__
462	rev	x26,x26
463#endif
464	eor	x24,x24,x26
465
// KeccakF1600_int destroys x26-x28 and x30, so the input pointer is parked
// on the stack and the length and block size are reloaded after the call.
466.Lprocess_block:
467	str	x27,[sp,#40]			// save inp
468
469	bl	KeccakF1600_int
470
471	ldr	x27,[sp,#40]			// restore arguments
472	ldp	x28,x30,[sp,#48]
473	b	.Loop_absorb
474
475.align	4
// Fewer than bsz bytes remain: write the state back and return the count of
// unconsumed bytes, which is still in x28.
476.Labsorbed:
477	ldr	x27,[sp,#32]
478	stp	x0,x1,[x27,#16*0]
479	stp	x2,x3,[x27,#16*1]
480	stp	x4,x5,[x27,#16*2]
481	stp	x6,x7,[x27,#16*3]
482	stp	x8,x9,[x27,#16*4]
483	stp	x10,x11,[x27,#16*5]
484	stp	x12,x13,[x27,#16*6]
485	stp	x14,x15,[x27,#16*7]
486	stp	x16,x17,[x27,#16*8]
487	stp	x25,x19,[x27,#16*9]
488	stp	x20,x21,[x27,#16*10]
489	stp	x22,x23,[x27,#16*11]
490	str	x24,[x27,#16*12]
491
492	mov	x0,x28			// return value
493	ldp	x19,x20,[x29,#16]
494	add	sp,sp,#64
495	ldp	x21,x22,[x29,#32]
496	ldp	x23,x24,[x29,#48]
497	ldp	x25,x26,[x29,#64]
498	ldp	x27,x28,[x29,#80]
499	ldp	x29,x30,[sp],#128
500.inst	0xd50323bf			// autiasp
501	ret
502.size	SHA3_absorb,.-SHA3_absorb
//-----------------------------------------------------------------------
// SHA3_squeeze: copy output bytes out of the Keccak state, running the
// permutation again every time a full block has been handed out.
//
//   In:   x0 = pointer to the 25-lane state
//         x1 = output pointer
//         x2 = number of output bytes requested
//         x3 = block size in bytes (assumed to be a non-zero multiple of 8;
//              not checked here -- TODO confirm against the callers)
//   Out:  no return value is set up; exactly x2 bytes are written at x1
//   Saves and restores x19-x22, x29 and x30.
//   Clobbers x0, x3, x4, flags, plus whatever KeccakF1600 leaves in the
//   caller-saved registers.
//
// Register roles inside the loop:
//   x0  = read cursor into the state      x19 = state base (kept for calls)
//   x20 = write cursor into the output    x21 = output bytes still to write
//   x3  = bytes left in the current block x22 = block size (reload value)
//
// A request for zero bytes returns immediately. Without that check the
// first "cmp x21,#8 / blo" would enter the byte tail, which stores before
// it tests the count, and seven bytes would be written to a buffer for
// which none were requested.
//-----------------------------------------------------------------------
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	x19,x0			// put aside arguments
	mov	x20,x1
	mov	x21,x2
	mov	x22,x3
	cbz	x21,.Lsqueeze_done	// len == 0: write nothing at all

.Loop_squeeze:
	ldr	x4,[x0],#8		// next lane of the state
	cmp	x21,#8
	blo	.Lsqueeze_tail		// 1..7 bytes left: emit them one by one
#ifdef	__AARCH64EB__
	rev	x4,x4			// lanes are emitted in little-endian order
#endif
	str	x4,[x20],#8
	subs	x21,x21,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8		// block exhausted?
	bhi	.Loop_squeeze

	mov	x0,x19			// yes: permute and restart at lane 0
	bl	KeccakF1600
	mov	x0,x19
	mov	x3,x22
	b	.Loop_squeeze

.align	4
	// Tail: x21 is in 1..7 here. The lane in x4 has not been byte-swapped
	// (the branch is taken before the rev), so shifting right by 8 after
	// each store yields its bytes least-significant first on either
	// endianness. At most seven stores happen; the last needs no test.
.Lsqueeze_tail:
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
// KeccakF1600_ce: permutation built on the eor3 / rax1 / xar / bcax vector
// instructions. They are emitted as raw .inst words with the intended
// mnemonic in the trailing comment (presumably so that assemblers without
// these mnemonics can still build the file -- not stated here).
//
//   In/Out:   state lanes in v0-v24, one lane per register
//   Scratch:  v25-v31
//   Clobbers: x9 (round counter, starts at 24), x10 (iotas pointer), flags
//   Leaf routine: no stack use, x30 is left untouched.
//
// Callers load and store only the low 64 bits (d registers) of v0-v24; the
// .16b forms operate on both halves, and ld1r replicates the round constant
// into both halves of v26.
573.type	KeccakF1600_ce,%function
574.align	5
575KeccakF1600_ce:
576	mov	x9,#24
577	adr	x10,iotas
578	b	.Loop_ce
579.align	4
580.Loop_ce:
581	////////////////////////////////////////////////// Theta
// Column parities accumulate in v25-v29; rax1 then combines each parity with
// a neighbour rotated by one bit to form the D[] values.
582.inst	0xce0f2a99	//eor3 v25.16b,v20.16b,v15.16b,v10.16b
583.inst	0xce102eba	//eor3 v26.16b,v21.16b,v16.16b,v11.16b
584.inst	0xce1132db	//eor3 v27.16b,v22.16b,v17.16b,v12.16b
585.inst	0xce1236fc	//eor3 v28.16b,v23.16b,v18.16b,v13.16b
586.inst	0xce133b1d	//eor3 v29.16b,v24.16b,v19.16b,v14.16b
587.inst	0xce050339	//eor3 v25.16b,v25.16b,   v5.16b,v0.16b
588.inst	0xce06075a	//eor3 v26.16b,v26.16b,   v6.16b,v1.16b
589.inst	0xce070b7b	//eor3 v27.16b,v27.16b,   v7.16b,v2.16b
590.inst	0xce080f9c	//eor3 v28.16b,v28.16b,   v8.16b,v3.16b
591.inst	0xce0913bd	//eor3 v29.16b,v29.16b,   v9.16b,v4.16b
592
593.inst	0xce7b8f3e	//rax1 v30.16b,v25.16b,v27.16b			// D[1]
594.inst	0xce7c8f5f	//rax1 v31.16b,v26.16b,v28.16b			// D[2]
595.inst	0xce7d8f7b	//rax1 v27.16b,v27.16b,v29.16b			// D[3]
596.inst	0xce798f9c	//rax1 v28.16b,v28.16b,v25.16b			// D[4]
597.inst	0xce7a8fbd	//rax1 v29.16b,v29.16b,v26.16b			// D[0]
598
599	////////////////////////////////////////////////// Theta+Rho+Pi
// xar XORs in the D[] value and rotates in one step. The sequence is ordered
// so that each destination has already been read as a source; v25-v31 take
// over as lane holders once their D[] value is no longer needed.
600.inst	0xce9efc39	//xar v25.16b,   v1.16b,v30.16b,#64-1 // C[0]=A[2][0]
601
602.inst	0xce9e50c1	//xar v1.16b,v6.16b,v30.16b,#64-44
603.inst	0xce9cb126	//xar v6.16b,v9.16b,v28.16b,#64-20
604.inst	0xce9f0ec9	//xar v9.16b,v22.16b,v31.16b,#64-61
605.inst	0xce9c65d6	//xar v22.16b,v14.16b,v28.16b,#64-39
606.inst	0xce9dba8e	//xar v14.16b,v20.16b,v29.16b,#64-18
607
608.inst	0xce9f085a	//xar v26.16b,   v2.16b,v31.16b,#64-62 // C[1]=A[4][0]
609
610.inst	0xce9f5582	//xar v2.16b,v12.16b,v31.16b,#64-43
611.inst	0xce9b9dac	//xar v12.16b,v13.16b,v27.16b,#64-25
612.inst	0xce9ce26d	//xar v13.16b,v19.16b,v28.16b,#64-8
613.inst	0xce9b22f3	//xar v19.16b,v23.16b,v27.16b,#64-56
614.inst	0xce9d5df7	//xar v23.16b,v15.16b,v29.16b,#64-41
615
616.inst	0xce9c948f	//xar v15.16b,v4.16b,v28.16b,#64-27
617
618.inst	0xce9ccb1c	//xar v28.16b,   v24.16b,v28.16b,#64-14 // D[4]=A[0][4]
619.inst	0xce9efab8	//xar v24.16b,v21.16b,v30.16b,#64-2
620.inst	0xce9b2508	//xar v8.16b,v8.16b,v27.16b,#64-55 // A[1][3]=A[4][1]
621.inst	0xce9e4e04	//xar v4.16b,v16.16b,v30.16b,#64-45 // A[0][4]=A[1][3]
622.inst	0xce9d70b0	//xar v16.16b,v5.16b,v29.16b,#64-36
623
624.inst	0xce9b9065	//xar v5.16b,v3.16b,v27.16b,#64-28
625
// Lane 0 is not rotated, so a plain eor applies its D[] value.
626	eor	v0.16b,v0.16b,v29.16b
627
628.inst	0xce9bae5b	//xar v27.16b,   v18.16b,v27.16b,#64-21 // D[3]=A[0][3]
629.inst	0xce9fc623	//xar v3.16b,v17.16b,v31.16b,#64-15 // A[0][3]=A[3][3]
630.inst	0xce9ed97e	//xar v30.16b,   v11.16b,v30.16b,#64-10 // D[1]=A[3][2]
631.inst	0xce9fe8ff	//xar v31.16b,   v7.16b,v31.16b,#64-6 // D[2]=A[2][1]
632.inst	0xce9df55d	//xar v29.16b,   v10.16b,v29.16b,#64-3 // D[0]=A[1][2]
633
634	////////////////////////////////////////////////// Chi+Iota
// bcax performs the Chi step one lane at a time. The round constant is loaded
// into v26 only after the group that still reads v26 as a lane is done.
635.inst	0xce362354	//bcax v20.16b,v26.16b,   v22.16b,v8.16b	// A[1][3]=A[4][1]
636.inst	0xce375915	//bcax v21.16b,v8.16b,v23.16b,v22.16b	// A[1][3]=A[4][1]
637.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
638.inst	0xce3a62f7	//bcax v23.16b,v23.16b,v26.16b,   v24.16b
639.inst	0xce286b18	//bcax v24.16b,v24.16b,v8.16b,v26.16b	// A[1][3]=A[4][1]
640
641	ld1r	{v26.2d},[x10],#8
642
643.inst	0xce330fd1	//bcax v17.16b,v30.16b,   v19.16b,v3.16b	// A[0][3]=A[3][3]
644.inst	0xce2f4c72	//bcax v18.16b,v3.16b,v15.16b,v19.16b	// A[0][3]=A[3][3]
645.inst	0xce303e73	//bcax v19.16b,v19.16b,v16.16b,v15.16b
646.inst	0xce3e41ef	//bcax v15.16b,v15.16b,v30.16b,   v16.16b
647.inst	0xce237a10	//bcax v16.16b,v16.16b,v3.16b,v30.16b	// A[0][3]=A[3][3]
648
649.inst	0xce2c7f2a	//bcax v10.16b,v25.16b,   v12.16b,v31.16b
650.inst	0xce2d33eb	//bcax v11.16b,v31.16b,   v13.16b,v12.16b
651.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
652.inst	0xce3939ad	//bcax v13.16b,v13.16b,v25.16b,   v14.16b
653.inst	0xce3f65ce	//bcax v14.16b,v14.16b,v31.16b,   v25.16b
654
655.inst	0xce2913a7	//bcax v7.16b,v29.16b,   v9.16b,v4.16b	// A[0][4]=A[1][3]
656.inst	0xce252488	//bcax v8.16b,v4.16b,v5.16b,v9.16b	// A[0][4]=A[1][3]
657.inst	0xce261529	//bcax v9.16b,v9.16b,v6.16b,v5.16b
658.inst	0xce3d18a5	//bcax v5.16b,v5.16b,v29.16b,   v6.16b
659.inst	0xce2474c6	//bcax v6.16b,v6.16b,v4.16b,v29.16b	// A[0][4]=A[1][3]
660
661.inst	0xce207363	//bcax v3.16b,v27.16b,   v0.16b,v28.16b
662.inst	0xce210384	//bcax v4.16b,v28.16b,   v1.16b,v0.16b
663.inst	0xce220400	//bcax v0.16b,v0.16b,v2.16b,v1.16b
664.inst	0xce3b0821	//bcax v1.16b,v1.16b,v27.16b,   v2.16b
665.inst	0xce3c6c42	//bcax v2.16b,v2.16b,v28.16b,   v27.16b
666
// Iota: fold the round constant into lane 0.
667	eor	v0.16b,v0.16b,v26.16b
668
// Unlike KeccakF1600_int, this loop counts rounds explicitly in x9 and does
// not rely on the alignment of the iotas table.
669	subs	x9,x9,#1
670	bne	.Loop_ce
671
672	ret
673.size	KeccakF1600_ce,.-KeccakF1600_ce
674
// KeccakF1600_cext: memory-to-memory wrapper around KeccakF1600_ce.
//
//   In:   x0 = pointer to 25 consecutive 64-bit lanes
//   Out:  the state at x0 is updated in place; x0 itself is preserved
//   Saves and restores d8-d15 and x29; clobbers x9, x10, v0-v7, v16-v31
//   and flags (through KeccakF1600_ce).
//
// Stack frame (80 bytes): [sp,#0] x29, [sp,#8] x30, [sp,#16..#79] d8-d15.
// No .globl directive is given for this symbol in this file.
675.type	KeccakF1600_cext,%function
676.align	5
677KeccakF1600_cext:
678.inst	0xd503233f		// paciasp
679	stp	x29,x30,[sp,#-80]!
680	add	x29,sp,#0
681	stp	d8,d9,[sp,#16]		// per ABI requirement
682	stp	d10,d11,[sp,#32]
683	stp	d12,d13,[sp,#48]
684	stp	d14,d15,[sp,#64]
685	ldp	d0,d1,[x0,#8*0]
686	ldp	d2,d3,[x0,#8*2]
687	ldp	d4,d5,[x0,#8*4]
688	ldp	d6,d7,[x0,#8*6]
689	ldp	d8,d9,[x0,#8*8]
690	ldp	d10,d11,[x0,#8*10]
691	ldp	d12,d13,[x0,#8*12]
692	ldp	d14,d15,[x0,#8*14]
693	ldp	d16,d17,[x0,#8*16]
694	ldp	d18,d19,[x0,#8*18]
695	ldp	d20,d21,[x0,#8*20]
696	ldp	d22,d23,[x0,#8*22]
697	ldr	d24,[x0,#8*24]
698	bl	KeccakF1600_ce
// bl overwrote x30; reload the return address saved by the prologue right
// away. This is why the epilogue pops only x29.
699	ldr	x30,[sp,#8]
700	stp	d0,d1,[x0,#8*0]
701	stp	d2,d3,[x0,#8*2]
702	stp	d4,d5,[x0,#8*4]
703	stp	d6,d7,[x0,#8*6]
704	stp	d8,d9,[x0,#8*8]
705	stp	d10,d11,[x0,#8*10]
706	stp	d12,d13,[x0,#8*12]
707	stp	d14,d15,[x0,#8*14]
708	stp	d16,d17,[x0,#8*16]
709	stp	d18,d19,[x0,#8*18]
710	stp	d20,d21,[x0,#8*20]
711	stp	d22,d23,[x0,#8*22]
712	str	d24,[x0,#8*24]
713
714	ldp	d8,d9,[sp,#16]
715	ldp	d10,d11,[sp,#32]
716	ldp	d12,d13,[sp,#48]
717	ldp	d14,d15,[sp,#64]
718	ldr	x29,[sp],#80
719.inst	0xd50323bf		// autiasp
720	ret
721.size	KeccakF1600_cext,.-KeccakF1600_cext
// SHA3_absorb_cext: vector-register variant of SHA3_absorb. XORs whole
// blocks of input into the state and permutes after each block.
//
//   In:   x0 = pointer to the 25-lane state
//         x1 = input pointer
//         x2 = input length in bytes
//         x3 = block size in bytes
//   Out:  x0 = number of trailing input bytes NOT absorbed (less than the
//              block size); the state in memory is updated
//   Saves and restores d8-d15, x29 and x30.
//
// x0-x3 stay live across the calls to KeccakF1600_ce, which only touches
// x9, x10 and the vector registers, so nothing is spilled inside the loop.
// v31 is the staging register for each input lane.
//
// The block size is assumed to be a multiple of 8 no larger than 200 bytes;
// nothing here checks that -- TODO confirm against the callers.
722.globl	SHA3_absorb_cext
723.type	SHA3_absorb_cext,%function
724.align	5
725SHA3_absorb_cext:
726.inst	0xd503233f		// paciasp
727	stp	x29,x30,[sp,#-80]!
728	add	x29,sp,#0
729	stp	d8,d9,[sp,#16]		// per ABI requirement
730	stp	d10,d11,[sp,#32]
731	stp	d12,d13,[sp,#48]
732	stp	d14,d15,[sp,#64]
733	ldp	d0,d1,[x0,#8*0]
734	ldp	d2,d3,[x0,#8*2]
735	ldp	d4,d5,[x0,#8*4]
736	ldp	d6,d7,[x0,#8*6]
737	ldp	d8,d9,[x0,#8*8]
738	ldp	d10,d11,[x0,#8*10]
739	ldp	d12,d13,[x0,#8*12]
740	ldp	d14,d15,[x0,#8*14]
741	ldp	d16,d17,[x0,#8*16]
742	ldp	d18,d19,[x0,#8*18]
743	ldp	d20,d21,[x0,#8*20]
744	ldp	d22,d23,[x0,#8*22]
745	ldr	d24,[x0,#8*24]
746	b	.Loop_absorb_ce
747
748.align	4
// x2 is decremented by the block size BEFORE the block is consumed; when the
// subtraction borrows, less than one block remained and the loop exits with
// x2 holding (remaining - bsz), which the exit path corrects.
749.Loop_absorb_ce:
750	subs	x2,x2,x3		// len - bsz
751	blo	.Labsorbed_ce
// Unrolled lane ladder. Every "cmp x3,#8*(k+2)" serves two exits: blo leaves
// once lane k has been absorbed, and the beq that follows leaves once lane
// k+1 has been absorbed. The ldr, rev64 and vector eor in between do not
// write the flags.
752	ldr	d31,[x1],#8		// *inp++
753#ifdef	__AARCH64EB__
754	rev64	v31.16b,v31.16b
755#endif
756	eor	v0.16b,v0.16b,v31.16b
757	cmp	x3,#8*(0+2)
758	blo	.Lprocess_block_ce
759	ldr	d31,[x1],#8		// *inp++
760#ifdef	__AARCH64EB__
761	rev64	v31.16b,v31.16b
762#endif
763	eor	v1.16b,v1.16b,v31.16b
764	beq	.Lprocess_block_ce
765	ldr	d31,[x1],#8		// *inp++
766#ifdef	__AARCH64EB__
767	rev64	v31.16b,v31.16b
768#endif
769	eor	v2.16b,v2.16b,v31.16b
770	cmp	x3,#8*(2+2)
771	blo	.Lprocess_block_ce
772	ldr	d31,[x1],#8		// *inp++
773#ifdef	__AARCH64EB__
774	rev64	v31.16b,v31.16b
775#endif
776	eor	v3.16b,v3.16b,v31.16b
777	beq	.Lprocess_block_ce
778	ldr	d31,[x1],#8		// *inp++
779#ifdef	__AARCH64EB__
780	rev64	v31.16b,v31.16b
781#endif
782	eor	v4.16b,v4.16b,v31.16b
783	cmp	x3,#8*(4+2)
784	blo	.Lprocess_block_ce
785	ldr	d31,[x1],#8		// *inp++
786#ifdef	__AARCH64EB__
787	rev64	v31.16b,v31.16b
788#endif
789	eor	v5.16b,v5.16b,v31.16b
790	beq	.Lprocess_block_ce
791	ldr	d31,[x1],#8		// *inp++
792#ifdef	__AARCH64EB__
793	rev64	v31.16b,v31.16b
794#endif
795	eor	v6.16b,v6.16b,v31.16b
796	cmp	x3,#8*(6+2)
797	blo	.Lprocess_block_ce
798	ldr	d31,[x1],#8		// *inp++
799#ifdef	__AARCH64EB__
800	rev64	v31.16b,v31.16b
801#endif
802	eor	v7.16b,v7.16b,v31.16b
803	beq	.Lprocess_block_ce
804	ldr	d31,[x1],#8		// *inp++
805#ifdef	__AARCH64EB__
806	rev64	v31.16b,v31.16b
807#endif
808	eor	v8.16b,v8.16b,v31.16b
809	cmp	x3,#8*(8+2)
810	blo	.Lprocess_block_ce
811	ldr	d31,[x1],#8		// *inp++
812#ifdef	__AARCH64EB__
813	rev64	v31.16b,v31.16b
814#endif
815	eor	v9.16b,v9.16b,v31.16b
816	beq	.Lprocess_block_ce
817	ldr	d31,[x1],#8		// *inp++
818#ifdef	__AARCH64EB__
819	rev64	v31.16b,v31.16b
820#endif
821	eor	v10.16b,v10.16b,v31.16b
822	cmp	x3,#8*(10+2)
823	blo	.Lprocess_block_ce
824	ldr	d31,[x1],#8		// *inp++
825#ifdef	__AARCH64EB__
826	rev64	v31.16b,v31.16b
827#endif
828	eor	v11.16b,v11.16b,v31.16b
829	beq	.Lprocess_block_ce
830	ldr	d31,[x1],#8		// *inp++
831#ifdef	__AARCH64EB__
832	rev64	v31.16b,v31.16b
833#endif
834	eor	v12.16b,v12.16b,v31.16b
835	cmp	x3,#8*(12+2)
836	blo	.Lprocess_block_ce
837	ldr	d31,[x1],#8		// *inp++
838#ifdef	__AARCH64EB__
839	rev64	v31.16b,v31.16b
840#endif
841	eor	v13.16b,v13.16b,v31.16b
842	beq	.Lprocess_block_ce
843	ldr	d31,[x1],#8		// *inp++
844#ifdef	__AARCH64EB__
845	rev64	v31.16b,v31.16b
846#endif
847	eor	v14.16b,v14.16b,v31.16b
848	cmp	x3,#8*(14+2)
849	blo	.Lprocess_block_ce
850	ldr	d31,[x1],#8		// *inp++
851#ifdef	__AARCH64EB__
852	rev64	v31.16b,v31.16b
853#endif
854	eor	v15.16b,v15.16b,v31.16b
855	beq	.Lprocess_block_ce
856	ldr	d31,[x1],#8		// *inp++
857#ifdef	__AARCH64EB__
858	rev64	v31.16b,v31.16b
859#endif
860	eor	v16.16b,v16.16b,v31.16b
861	cmp	x3,#8*(16+2)
862	blo	.Lprocess_block_ce
863	ldr	d31,[x1],#8		// *inp++
864#ifdef	__AARCH64EB__
865	rev64	v31.16b,v31.16b
866#endif
867	eor	v17.16b,v17.16b,v31.16b
868	beq	.Lprocess_block_ce
869	ldr	d31,[x1],#8		// *inp++
870#ifdef	__AARCH64EB__
871	rev64	v31.16b,v31.16b
872#endif
873	eor	v18.16b,v18.16b,v31.16b
874	cmp	x3,#8*(18+2)
875	blo	.Lprocess_block_ce
876	ldr	d31,[x1],#8		// *inp++
877#ifdef	__AARCH64EB__
878	rev64	v31.16b,v31.16b
879#endif
880	eor	v19.16b,v19.16b,v31.16b
881	beq	.Lprocess_block_ce
882	ldr	d31,[x1],#8		// *inp++
883#ifdef	__AARCH64EB__
884	rev64	v31.16b,v31.16b
885#endif
886	eor	v20.16b,v20.16b,v31.16b
887	cmp	x3,#8*(20+2)
888	blo	.Lprocess_block_ce
889	ldr	d31,[x1],#8		// *inp++
890#ifdef	__AARCH64EB__
891	rev64	v31.16b,v31.16b
892#endif
893	eor	v21.16b,v21.16b,v31.16b
894	beq	.Lprocess_block_ce
895	ldr	d31,[x1],#8		// *inp++
896#ifdef	__AARCH64EB__
897	rev64	v31.16b,v31.16b
898#endif
899	eor	v22.16b,v22.16b,v31.16b
900	cmp	x3,#8*(22+2)
901	blo	.Lprocess_block_ce
902	ldr	d31,[x1],#8		// *inp++
903#ifdef	__AARCH64EB__
904	rev64	v31.16b,v31.16b
905#endif
906	eor	v23.16b,v23.16b,v31.16b
907	beq	.Lprocess_block_ce
908	ldr	d31,[x1],#8		// *inp++
909#ifdef	__AARCH64EB__
910	rev64	v31.16b,v31.16b
911#endif
912	eor	v24.16b,v24.16b,v31.16b
913
// The bl below clobbers x30. That is fine because the return address was
// saved by the prologue and is reloaded by the epilogue.
914.Lprocess_block_ce:
915
916	bl	KeccakF1600_ce
917
918	b	.Loop_absorb_ce
919
920.align	4
921.Labsorbed_ce:
922	stp	d0,d1,[x0,#8*0]
923	stp	d2,d3,[x0,#8*2]
924	stp	d4,d5,[x0,#8*4]
925	stp	d6,d7,[x0,#8*6]
926	stp	d8,d9,[x0,#8*8]
927	stp	d10,d11,[x0,#8*10]
928	stp	d12,d13,[x0,#8*12]
929	stp	d14,d15,[x0,#8*14]
930	stp	d16,d17,[x0,#8*16]
931	stp	d18,d19,[x0,#8*18]
932	stp	d20,d21,[x0,#8*20]
933	stp	d22,d23,[x0,#8*22]
934	str	d24,[x0,#8*24]
// Undo the final, borrowing subtraction to recover the leftover byte count.
935	add	x0,x2,x3		// return value
936
937	ldp	d8,d9,[sp,#16]
938	ldp	d10,d11,[sp,#32]
939	ldp	d12,d13,[sp,#48]
940	ldp	d14,d15,[sp,#64]
941	ldp	x29,x30,[sp],#80
942.inst	0xd50323bf		// autiasp
943	ret
944.size	SHA3_absorb_cext,.-SHA3_absorb_cext
//-----------------------------------------------------------------------
// SHA3_squeeze_cext: copy output bytes out of the Keccak state, calling
// KeccakF1600_cext every time a full block has been handed out.
//
//   In:   x0 = pointer to the 25-lane state
//         x1 = output pointer
//         x2 = number of output bytes requested
//         x3 = block size in bytes (assumed to be a non-zero multiple of 8;
//              not checked here -- TODO confirm against the callers)
//   Out:  no return value is set up; exactly the requested number of
//         bytes is written
//   Saves and restores x29 and x30.
//   Clobbers x1, x2, x4, x9, x10, flags, plus the vector registers used by
//   KeccakF1600_cext.
//
// Register roles inside the loop:
//   x9  = read cursor into the state
//   x10 = bytes left in the current block
//   x0 and x3 keep their entry values so both can be reloaded after a
//   permutation; KeccakF1600_cext leaves x0-x3 untouched.
//
// A request for zero bytes returns immediately. Without that check the
// first "cmp x2,#8 / blo" would enter the byte tail, which stores before
// it tests the count, and seven bytes would be written to a buffer for
// which none were requested.
//-----------------------------------------------------------------------
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	cbz	x2,.Lsqueeze_done_ce	// len == 0: write nothing at all
	mov	x9,x0
	mov	x10,x3

.Loop_squeeze_ce:
	ldr	x4,[x9],#8		// next lane of the state
	cmp	x2,#8
	blo	.Lsqueeze_tail_ce	// 1..7 bytes left: emit them one by one
#ifdef	__AARCH64EB__
	rev	x4,x4			// lanes are emitted in little-endian order
#endif
	str	x4,[x1],#8
	beq	.Lsqueeze_done_ce	// flags still from cmp: exactly 8 were left

	sub	x2,x2,#8
	subs	x10,x10,#8		// block exhausted?
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]		// bl clobbered the return address
	mov	x9,x0
	mov	x10,x3
	b	.Loop_squeeze_ce

.align	4
	// Tail: x2 is in 1..7 here. The lane in x4 has not been byte-swapped
	// (the branch is taken before the rev), so shifting right by 8 after
	// each store yields its bytes least-significant first on either
	// endianness. At most seven stores happen; the last needs no test.
.Lsqueeze_tail_ce:
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1

	// x30 is current on every path that reaches this label (it is either
	// untouched or was reloaded right after the bl), so only x29 is popped.
.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated ASCII identification string embedded in the object:
// "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by" followed by the
// author e-mail address in angle brackets. No code in this file references
// it. The .align 2 that follows pads to a 4-byte boundary.
1008.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1009.align	2
1010