xref: /netbsd-src/crypto/external/bsd/openssl.old/lib/libcrypto/arch/aarch64/keccak1600-armv8.S (revision 4724848cf0da353df257f730694b7882798e5daf)
// Keccak-f[1600] round constants (the Iota step), one 64-bit word per round,
// 24 entries in total.
//
// Layout matters: the table is preceded by 8 zero quads (64 bytes) placed
// right after the .align 8 directive, and the 24 constants occupy 192 bytes,
// so 64 + 192 = 256.  Assuming .align takes a power-of-two exponent here
// (256-byte alignment), the address just past the last constant has its low
// eight bits clear.  KeccakF1600_int tests the post-incremented table
// pointer with tst ...,#255 and uses that as its loop exit, so neither the
// padding nor the number of entries may change without updating that test.
1.text
2
3.align	8	// strategic alignment and padding that allows to use
4		// address value as loop termination condition...
5.quad	0,0,0,0,0,0,0,0
6.type	iotas,%object
7iotas:
8.quad	0x0000000000000001
9.quad	0x0000000000008082
10.quad	0x800000000000808a
11.quad	0x8000000080008000
12.quad	0x000000000000808b
13.quad	0x0000000080000001
14.quad	0x8000000080008081
15.quad	0x8000000000008009
16.quad	0x000000000000008a
17.quad	0x0000000000000088
18.quad	0x0000000080008009
19.quad	0x000000008000000a
20.quad	0x000000008000808b
21.quad	0x800000000000008b
22.quad	0x8000000000008089
23.quad	0x8000000000008003
24.quad	0x8000000000008002
25.quad	0x8000000000000080
26.quad	0x000000000000800a
27.quad	0x800000008000000a
28.quad	0x8000000080008081
29.quad	0x8000000000008080
30.quad	0x0000000080000001
31.quad	0x8000000080008008
32.size	iotas,.-iotas
// KeccakF1600_int -- scalar Keccak-f[1600] permutation on a register-resident
// state.  Internal helper, not a standard-ABI function.
//
// In/out: the 25 lanes live in x0-x17, x25, x19-x24, in that order (x25
//         takes the slot where x18 would otherwise be; x18 is never used).
// Scratch: x26, x27, x28, x30 and the condition flags.
// Stack:  sp is not adjusted.  The routine uses the 32 bytes at [sp,#0..#31],
//         which the caller must have reserved before the bl:
//           [sp,#0]   spill slot for the x4/x9 pair during Theta
//           [sp,#16]  current pointer into the iotas table
//           [sp,#24]  saved return address (x30 doubles as a temporary)
// Loop:   one pass of .Loop is one round.  The loop ends when the
//         post-incremented iotas pointer has its low eight bits clear,
//         which relies on the alignment and padding of the iotas table.
33.type	KeccakF1600_int,%function
34.align	5
35KeccakF1600_int:
36	adr	x28,iotas
37.inst	0xd503233f			// paciasp
38	stp	x28,x30,[sp,#16]		// 32 bytes on top are mine
39	b	.Loop
40.align	4
41.Loop:
42	////////////////////////////////////////// Theta
	// Column parities: x26, x27, x28, x30 and x4 each accumulate the XOR
	// of the five lanes of one column.  x4 and x9 are about to be reused
	// as temporaries, so their lane values are parked on the stack first.
43	eor	x26,x0,x5
44	stp	x4,x9,[sp,#0]	// offload pair...
45	eor	x27,x1,x6
46	eor	x28,x2,x7
47	eor	x30,x3,x8
48	eor	x4,x4,x9
49	eor	x26,x26,x10
50	eor	x27,x27,x11
51	eor	x28,x28,x12
52	eor	x30,x30,x13
53	eor	x4,x4,x14
54	eor	x26,x26,x15
55	eor	x27,x27,x16
56	eor	x28,x28,x17
57	eor	x30,x30,x25
58	eor	x4,x4,x19
59	eor	x26,x26,x20
60	eor	x28,x28,x22
61	eor	x27,x27,x21
62	eor	x30,x30,x23
63	eor	x4,x4,x24
64
	// ror#63 is a rotate left by one.  Each mixing word is the parity of
	// one column XORed with the rotated parity of another, and is folded
	// into all five lanes of a third column.
65	eor	x9,x26,x28,ror#63
66
67	eor	x1,x1,x9
68	eor	x6,x6,x9
69	eor	x11,x11,x9
70	eor	x16,x16,x9
71	eor	x21,x21,x9
72
73	eor	x9,x27,x30,ror#63
74	eor	x28,x28,x4,ror#63
75	eor	x30,x30,x26,ror#63
76	eor	x4,x4,x27,ror#63
77
	// Three of the updated lanes are written straight into scratch
	// registers (x27, x26, x28), which saves the separate mov that
	// Rho+Pi would otherwise need; see the commented-out movs below.
78	eor	x27,   x2,x9		// mov	x27,x2
79	eor	x7,x7,x9
80	eor	x12,x12,x9
81	eor	x17,x17,x9
82	eor	x22,x22,x9
83
84	eor	x0,x0,x4
85	eor	x5,x5,x4
86	eor	x10,x10,x4
87	eor	x15,x15,x4
88	eor	x20,x20,x4
89	ldp	x4,x9,[sp,#0]	// re-load offloaded data
90	eor	x26,   x3,x28		// mov	x26,x3
91	eor	x8,x8,x28
92	eor	x13,x13,x28
93	eor	x25,x25,x28
94	eor	x23,x23,x28
95
96	eor	x28,   x4,x30		// mov	x28,x4
97	eor	x9,x9,x30
98	eor	x14,x14,x30
99	eor	x19,x19,x30
100	eor	x24,x24,x30
101
102	////////////////////////////////////////// Rho+Pi
	// Each ror by #64-n is a rotate left by n; the destination register
	// selects the lane position the rotated value moves to.  The chains
	// are ordered so that every source is read before it is overwritten;
	// x30, x27, x26 and x28 hold the four lanes saved ahead of time.
103	mov	x30,x1
104	ror	x1,x6,#64-44
105	//mov	x27,x2
106	ror	x2,x12,#64-43
107	//mov	x26,x3
108	ror	x3,x25,#64-21
109	//mov	x28,x4
110	ror	x4,x24,#64-14
111
112	ror	x6,x9,#64-20
113	ror	x12,x13,#64-25
114	ror	x25,x17,#64-15
115	ror	x24,x21,#64-2
116
117	ror	x9,x22,#64-61
118	ror	x13,x19,#64-8
119	ror	x17,x11,#64-10
120	ror	x21,x8,#64-55
121
122	ror	x22,x14,#64-39
123	ror	x19,x23,#64-56
124	ror	x11,x7,#64-6
125	ror	x8,x16,#64-45
126
127	ror	x14,x20,#64-18
128	ror	x23,x15,#64-41
129	ror	x7,x10,#64-3
130	ror	x16,x5,#64-36
131
132	ror	x5,x26,#64-28
133	ror	x10,x30,#64-1
134	ror	x15,x28,#64-27
135	ror	x20,x27,#64-62
136
137	////////////////////////////////////////// Chi+Iota
	// bic d,a,b computes a AND NOT b.  Each group of five lanes is
	// updated as lane ^= (lane+2 AND NOT lane+1), with all five bic
	// terms formed from values not yet overwritten within the group.
138	bic	x26,x2,x1
139	bic	x27,x3,x2
140	bic	x28,x0,x4
141	bic	x30,x1,x0
142	eor	x0,x0,x26
143	bic	x26,x4,x3
144	eor	x1,x1,x27
145	ldr	x27,[sp,#16]
146	eor	x3,x3,x28
147	eor	x4,x4,x30
148	eor	x2,x2,x26
149	ldr	x30,[x27],#8		// Iota[i++]
150
	// The flags set by tst below are consumed only by the bne at the
	// bottom of the loop; every instruction in between (str, bic, eor)
	// leaves the flags alone.
151	bic	x26,x7,x6
152	tst	x27,#255			// are we done?
153	str	x27,[sp,#16]
154	bic	x27,x8,x7
155	bic	x28,x5,x9
156	eor	x0,x0,x30		// A[0][0] ^= Iota
157	bic	x30,x6,x5
158	eor	x5,x5,x26
159	bic	x26,x9,x8
160	eor	x6,x6,x27
161	eor	x8,x8,x28
162	eor	x9,x9,x30
163	eor	x7,x7,x26
164
165	bic	x26,x12,x11
166	bic	x27,x13,x12
167	bic	x28,x10,x14
168	bic	x30,x11,x10
169	eor	x10,x10,x26
170	bic	x26,x14,x13
171	eor	x11,x11,x27
172	eor	x13,x13,x28
173	eor	x14,x14,x30
174	eor	x12,x12,x26
175
176	bic	x26,x17,x16
177	bic	x27,x25,x17
178	bic	x28,x15,x19
179	bic	x30,x16,x15
180	eor	x15,x15,x26
181	bic	x26,x19,x25
182	eor	x16,x16,x27
183	eor	x25,x25,x28
184	eor	x19,x19,x30
185	eor	x17,x17,x26
186
187	bic	x26,x22,x21
188	bic	x27,x23,x22
189	bic	x28,x20,x24
190	bic	x30,x21,x20
191	eor	x20,x20,x26
192	bic	x26,x24,x23
193	eor	x21,x21,x27
194	eor	x23,x23,x28
195	eor	x24,x24,x30
196	eor	x22,x22,x26
197
198	bne	.Loop
199
	// x30 was used as a temporary throughout; fetch the return address
	// saved at entry before authenticating it and returning.
200	ldr	x30,[sp,#24]
201.inst	0xd50323bf			// autiasp
202	ret
203.size	KeccakF1600_int,.-KeccakF1600_int
204
// KeccakF1600 -- apply the permutation to a state held in memory.
// No .globl directive for this symbol is visible here; within this file it
// is called from SHA3_squeeze.
//
// In:  x0 = pointer to the state, 25 consecutive 64-bit lanes (200 bytes).
// Out: the state is permuted in place.
// Saves and restores x19-x28, x29 and x30 in a 128-byte frame.  x0-x17 are
// left holding state lanes on return, so callers must not expect x0 to
// survive.
// Stack below the frame (48 bytes):
//   [sp,#0..#31]  scratch area required by KeccakF1600_int
//   [sp,#32]      saved state pointer
205.type	KeccakF1600,%function
206.align	5
207KeccakF1600:
208.inst	0xd503233f			// paciasp
209	stp	x29,x30,[sp,#-128]!
210	add	x29,sp,#0
211	stp	x19,x20,[sp,#16]
212	stp	x21,x22,[sp,#32]
213	stp	x23,x24,[sp,#48]
214	stp	x25,x26,[sp,#64]
215	stp	x27,x28,[sp,#80]
216	sub	sp,sp,#48
217
	// Load all 25 lanes into registers.  x25 is used in place of x18, so
	// the tenth pair is x25,x19 rather than x18,x19.
218	str	x0,[sp,#32]			// offload argument
219	mov	x26,x0
220	ldp	x0,x1,[x0,#16*0]
221	ldp	x2,x3,[x26,#16*1]
222	ldp	x4,x5,[x26,#16*2]
223	ldp	x6,x7,[x26,#16*3]
224	ldp	x8,x9,[x26,#16*4]
225	ldp	x10,x11,[x26,#16*5]
226	ldp	x12,x13,[x26,#16*6]
227	ldp	x14,x15,[x26,#16*7]
228	ldp	x16,x17,[x26,#16*8]
229	ldp	x25,x19,[x26,#16*9]
230	ldp	x20,x21,[x26,#16*10]
231	ldp	x22,x23,[x26,#16*11]
232	ldr	x24,[x26,#16*12]
233
234	bl	KeccakF1600_int
235
	// x26 was clobbered by the permutation; recover the state pointer
	// from the stack and write the lanes back in the same order.
236	ldr	x26,[sp,#32]
237	stp	x0,x1,[x26,#16*0]
238	stp	x2,x3,[x26,#16*1]
239	stp	x4,x5,[x26,#16*2]
240	stp	x6,x7,[x26,#16*3]
241	stp	x8,x9,[x26,#16*4]
242	stp	x10,x11,[x26,#16*5]
243	stp	x12,x13,[x26,#16*6]
244	stp	x14,x15,[x26,#16*7]
245	stp	x16,x17,[x26,#16*8]
246	stp	x25,x19,[x26,#16*9]
247	stp	x20,x21,[x26,#16*10]
248	stp	x22,x23,[x26,#16*11]
249	str	x24,[x26,#16*12]
250
	// Callee-saved registers are reloaded relative to the frame pointer
	// x29, so the order relative to the sp adjustment does not matter.
251	ldp	x19,x20,[x29,#16]
252	add	sp,sp,#48
253	ldp	x21,x22,[x29,#32]
254	ldp	x23,x24,[x29,#48]
255	ldp	x25,x26,[x29,#64]
256	ldp	x27,x28,[x29,#80]
257	ldp	x29,x30,[sp],#128
258.inst	0xd50323bf			// autiasp
259	ret
260.size	KeccakF1600,.-KeccakF1600
261
// SHA3_absorb -- XOR whole blocks of input into the state, permuting after
// each block (scalar implementation).
//
// In:  x0 = state, uint64_t A[5][5]
//      x1 = inp, input pointer
//      x2 = len, number of input bytes available
//      x3 = bsz, block size in bytes
// Out: x0 = number of input bytes left unprocessed (the first len value for
//      which len - bsz would borrow, i.e. a value below bsz).
// Input is consumed in 8-byte lanes; on big-endian builds each lane is
// byte-reversed after loading.
// NOTE(review): the lane ladder only compares bsz against even multiples of
// 8 and tests equality on the lane in between, so bsz is presumably a
// multiple of 8 and at most 200; bsz == 0 would never terminate.  Confirm
// against callers.
//
// Stack below the 128-byte register save frame (64 bytes):
//   [sp,#0..#31]  scratch area required by KeccakF1600_int
//   [sp,#32]      state pointer
//   [sp,#40]      inp (rewritten before every permutation)
//   [sp,#48]      len (rewritten with len - bsz for every block)
//   [sp,#56]      bsz
262.globl	SHA3_absorb
263.type	SHA3_absorb,%function
264.align	5
265SHA3_absorb:
266.inst	0xd503233f			// paciasp
267	stp	x29,x30,[sp,#-128]!
268	add	x29,sp,#0
269	stp	x19,x20,[sp,#16]
270	stp	x21,x22,[sp,#32]
271	stp	x23,x24,[sp,#48]
272	stp	x25,x26,[sp,#64]
273	stp	x27,x28,[sp,#80]
274	sub	sp,sp,#64
275
276	stp	x0,x1,[sp,#32]			// offload arguments
277	stp	x2,x3,[sp,#48]
278
	// The argument registers are about to be overwritten with state
	// lanes, so the arguments move to x26-x28 and x30.  Those four are
	// also the scratch registers of KeccakF1600_int, which is why inp,
	// len and bsz are kept on the stack as well.
279	mov	x26,x0			// uint64_t A[5][5]
280	mov	x27,x1			// const void *inp
281	mov	x28,x2			// size_t len
282	mov	x30,x3			// size_t bsz
283	ldp	x0,x1,[x26,#16*0]
284	ldp	x2,x3,[x26,#16*1]
285	ldp	x4,x5,[x26,#16*2]
286	ldp	x6,x7,[x26,#16*3]
287	ldp	x8,x9,[x26,#16*4]
288	ldp	x10,x11,[x26,#16*5]
289	ldp	x12,x13,[x26,#16*6]
290	ldp	x14,x15,[x26,#16*7]
291	ldp	x16,x17,[x26,#16*8]
292	ldp	x25,x19,[x26,#16*9]
293	ldp	x20,x21,[x26,#16*10]
294	ldp	x22,x23,[x26,#16*11]
295	ldr	x24,[x26,#16*12]
296	b	.Loop_absorb
297
298.align	4
299.Loop_absorb:
	// Unsigned test: leave the loop as soon as less than one full block
	// remains.  x28 still holds the unreduced len on that path.
300	subs	x26,x28,x30		// len - bsz
301	blo	.Labsorbed
302
	// Unrolled lane ladder.  Lanes are absorbed in state order (x0-x17,
	// x25, x19-x24).  One cmp against 8*(k+2) serves two exits: blo
	// right after it stops once lane k was the last lane of the block,
	// and beq after the following lane stops once lane k+1 was the last.
	// The ldr, rev and eor in between do not modify the flags.
303	str	x26,[sp,#48]			// save len - bsz
304	ldr	x26,[x27],#8		// *inp++
305#ifdef	__AARCH64EB__
306	rev	x26,x26
307#endif
308	eor	x0,x0,x26
309	cmp	x30,#8*(0+2)
310	blo	.Lprocess_block
311	ldr	x26,[x27],#8		// *inp++
312#ifdef	__AARCH64EB__
313	rev	x26,x26
314#endif
315	eor	x1,x1,x26
316	beq	.Lprocess_block
317	ldr	x26,[x27],#8		// *inp++
318#ifdef	__AARCH64EB__
319	rev	x26,x26
320#endif
321	eor	x2,x2,x26
322	cmp	x30,#8*(2+2)
323	blo	.Lprocess_block
324	ldr	x26,[x27],#8		// *inp++
325#ifdef	__AARCH64EB__
326	rev	x26,x26
327#endif
328	eor	x3,x3,x26
329	beq	.Lprocess_block
330	ldr	x26,[x27],#8		// *inp++
331#ifdef	__AARCH64EB__
332	rev	x26,x26
333#endif
334	eor	x4,x4,x26
335	cmp	x30,#8*(4+2)
336	blo	.Lprocess_block
337	ldr	x26,[x27],#8		// *inp++
338#ifdef	__AARCH64EB__
339	rev	x26,x26
340#endif
341	eor	x5,x5,x26
342	beq	.Lprocess_block
343	ldr	x26,[x27],#8		// *inp++
344#ifdef	__AARCH64EB__
345	rev	x26,x26
346#endif
347	eor	x6,x6,x26
348	cmp	x30,#8*(6+2)
349	blo	.Lprocess_block
350	ldr	x26,[x27],#8		// *inp++
351#ifdef	__AARCH64EB__
352	rev	x26,x26
353#endif
354	eor	x7,x7,x26
355	beq	.Lprocess_block
356	ldr	x26,[x27],#8		// *inp++
357#ifdef	__AARCH64EB__
358	rev	x26,x26
359#endif
360	eor	x8,x8,x26
361	cmp	x30,#8*(8+2)
362	blo	.Lprocess_block
363	ldr	x26,[x27],#8		// *inp++
364#ifdef	__AARCH64EB__
365	rev	x26,x26
366#endif
367	eor	x9,x9,x26
368	beq	.Lprocess_block
369	ldr	x26,[x27],#8		// *inp++
370#ifdef	__AARCH64EB__
371	rev	x26,x26
372#endif
373	eor	x10,x10,x26
374	cmp	x30,#8*(10+2)
375	blo	.Lprocess_block
376	ldr	x26,[x27],#8		// *inp++
377#ifdef	__AARCH64EB__
378	rev	x26,x26
379#endif
380	eor	x11,x11,x26
381	beq	.Lprocess_block
382	ldr	x26,[x27],#8		// *inp++
383#ifdef	__AARCH64EB__
384	rev	x26,x26
385#endif
386	eor	x12,x12,x26
387	cmp	x30,#8*(12+2)
388	blo	.Lprocess_block
389	ldr	x26,[x27],#8		// *inp++
390#ifdef	__AARCH64EB__
391	rev	x26,x26
392#endif
393	eor	x13,x13,x26
394	beq	.Lprocess_block
395	ldr	x26,[x27],#8		// *inp++
396#ifdef	__AARCH64EB__
397	rev	x26,x26
398#endif
399	eor	x14,x14,x26
400	cmp	x30,#8*(14+2)
401	blo	.Lprocess_block
402	ldr	x26,[x27],#8		// *inp++
403#ifdef	__AARCH64EB__
404	rev	x26,x26
405#endif
406	eor	x15,x15,x26
407	beq	.Lprocess_block
408	ldr	x26,[x27],#8		// *inp++
409#ifdef	__AARCH64EB__
410	rev	x26,x26
411#endif
412	eor	x16,x16,x26
413	cmp	x30,#8*(16+2)
414	blo	.Lprocess_block
415	ldr	x26,[x27],#8		// *inp++
416#ifdef	__AARCH64EB__
417	rev	x26,x26
418#endif
419	eor	x17,x17,x26
420	beq	.Lprocess_block
421	ldr	x26,[x27],#8		// *inp++
422#ifdef	__AARCH64EB__
423	rev	x26,x26
424#endif
425	eor	x25,x25,x26
426	cmp	x30,#8*(18+2)
427	blo	.Lprocess_block
428	ldr	x26,[x27],#8		// *inp++
429#ifdef	__AARCH64EB__
430	rev	x26,x26
431#endif
432	eor	x19,x19,x26
433	beq	.Lprocess_block
434	ldr	x26,[x27],#8		// *inp++
435#ifdef	__AARCH64EB__
436	rev	x26,x26
437#endif
438	eor	x20,x20,x26
439	cmp	x30,#8*(20+2)
440	blo	.Lprocess_block
441	ldr	x26,[x27],#8		// *inp++
442#ifdef	__AARCH64EB__
443	rev	x26,x26
444#endif
445	eor	x21,x21,x26
446	beq	.Lprocess_block
447	ldr	x26,[x27],#8		// *inp++
448#ifdef	__AARCH64EB__
449	rev	x26,x26
450#endif
451	eor	x22,x22,x26
452	cmp	x30,#8*(22+2)
453	blo	.Lprocess_block
454	ldr	x26,[x27],#8		// *inp++
455#ifdef	__AARCH64EB__
456	rev	x26,x26
457#endif
458	eor	x23,x23,x26
459	beq	.Lprocess_block
460	ldr	x26,[x27],#8		// *inp++
461#ifdef	__AARCH64EB__
462	rev	x26,x26
463#endif
464	eor	x24,x24,x26
465
466.Lprocess_block:
	// KeccakF1600_int clobbers x26-x28 and x30, so the advanced input
	// pointer is saved first and inp, len and bsz are reloaded after.
467	str	x27,[sp,#40]			// save inp
468
469	bl	KeccakF1600_int
470
471	ldr	x27,[sp,#40]			// restore arguments
472	ldp	x28,x30,[sp,#48]
473	b	.Loop_absorb
474
475.align	4
476.Labsorbed:
	// Write the 25 lanes back to the caller-supplied state.
477	ldr	x27,[sp,#32]
478	stp	x0,x1,[x27,#16*0]
479	stp	x2,x3,[x27,#16*1]
480	stp	x4,x5,[x27,#16*2]
481	stp	x6,x7,[x27,#16*3]
482	stp	x8,x9,[x27,#16*4]
483	stp	x10,x11,[x27,#16*5]
484	stp	x12,x13,[x27,#16*6]
485	stp	x14,x15,[x27,#16*7]
486	stp	x16,x17,[x27,#16*8]
487	stp	x25,x19,[x27,#16*9]
488	stp	x20,x21,[x27,#16*10]
489	stp	x22,x23,[x27,#16*11]
490	str	x24,[x27,#16*12]
491
492	mov	x0,x28			// return value
493	ldp	x19,x20,[x29,#16]
494	add	sp,sp,#64
495	ldp	x21,x22,[x29,#32]
496	ldp	x23,x24,[x29,#48]
497	ldp	x25,x26,[x29,#64]
498	ldp	x27,x28,[x29,#80]
499	ldp	x29,x30,[sp],#128
500.inst	0xd50323bf			// autiasp
501	ret
502.size	SHA3_absorb,.-SHA3_absorb
// SHA3_squeeze -- copy output bytes out of the state, permuting the state
// whenever a full block has been emitted (scalar implementation).
//
// In:  x0 = state, 25 consecutive 64-bit lanes
//      x1 = out, destination buffer
//      x2 = len, number of bytes to produce
//      x3 = bsz, block size in bytes
// Out: nothing; len bytes are written to out.  The state is only modified
//      when more than bsz bytes are requested.
// Lanes are emitted least-significant byte first; on big-endian builds a
// whole lane is byte-reversed before it is stored.
//
// Register roles inside the loop:
//   x19 = state base        x0 = read cursor into the state
//   x20 = output cursor     x21 = bytes still to produce
//   x22 = bsz               x3 = bytes left in the current block
// x19-x22 are callee-saved and survive the call to KeccakF1600; x0 and x3
// do not carry useful values across it and are re-derived afterwards.
//
// A request for zero bytes returns immediately.  Without that guard the
// loop would drop into the byte tail with a zero count, the count would
// wrap below zero on the first decrement, and seven bytes would be stored
// to out.
503.globl	SHA3_squeeze
504.type	SHA3_squeeze,%function
505.align	5
506SHA3_squeeze:
507.inst	0xd503233f			// paciasp
508	stp	x29,x30,[sp,#-48]!
509	add	x29,sp,#0
510	stp	x19,x20,[sp,#16]
511	stp	x21,x22,[sp,#32]
512
513	mov	x19,x0			// put aside arguments
514	mov	x20,x1
515	mov	x21,x2
516	mov	x22,x3
	cbz	x21,.Lsqueeze_done	// len == 0: nothing to write
517
518.Loop_squeeze:
	// The lane is fetched before the length test so that the tail code
	// can emit its low bytes straight from x4.
519	ldr	x4,[x0],#8
520	cmp	x21,#8
521	blo	.Lsqueeze_tail
522#ifdef	__AARCH64EB__
523	rev	x4,x4
524#endif
525	str	x4,[x20],#8
526	subs	x21,x21,#8
527	beq	.Lsqueeze_done
528
	// More output is wanted.  Keep reading lanes until the current block
	// is exhausted, then permute and start again from the first lane.
529	subs	x3,x3,#8
530	bhi	.Loop_squeeze
531
532	mov	x0,x19
533	bl	KeccakF1600
534	mov	x0,x19
535	mov	x3,x22
536	b	.Loop_squeeze
537
538.align	4
539.Lsqueeze_tail:
	// Between one and seven bytes remain.  Store the low byte of x4,
	// shift the next byte down, and stop once the count reaches zero.
540	strb	w4,[x20],#1
541	lsr	x4,x4,#8
542	subs	x21,x21,#1
543	beq	.Lsqueeze_done
544	strb	w4,[x20],#1
545	lsr	x4,x4,#8
546	subs	x21,x21,#1
547	beq	.Lsqueeze_done
548	strb	w4,[x20],#1
549	lsr	x4,x4,#8
550	subs	x21,x21,#1
551	beq	.Lsqueeze_done
552	strb	w4,[x20],#1
553	lsr	x4,x4,#8
554	subs	x21,x21,#1
555	beq	.Lsqueeze_done
556	strb	w4,[x20],#1
557	lsr	x4,x4,#8
558	subs	x21,x21,#1
559	beq	.Lsqueeze_done
560	strb	w4,[x20],#1
561	lsr	x4,x4,#8
562	subs	x21,x21,#1
563	beq	.Lsqueeze_done
564	strb	w4,[x20],#1
565
566.Lsqueeze_done:
567	ldp	x19,x20,[sp,#16]
568	ldp	x21,x22,[sp,#32]
569	ldp	x29,x30,[sp],#48
570.inst	0xd50323bf			// autiasp
571	ret
572.size	SHA3_squeeze,.-SHA3_squeeze
// KeccakF1600_ce -- Keccak-f[1600] permutation on a state held in SIMD
// registers.  The three-input XOR, rotate-and-XOR and bit-clear-and-XOR
// instructions are emitted as raw .inst words; the trailing comment on each
// line gives the mnemonic (eor3, rax1, xar, bcax) the word is meant to
// encode.
//
// In/out: the 25 lanes live in v0-v24, one lane per register.
// Scratch: v25-v31, x9 (iteration counter), x10 (iotas pointer), x11
//          (current round constant) and the condition flags.
// Leaf routine: it does not touch the stack and carries no pointer
// authentication; it returns through x30 as set by the caller.
//
// The loop body contains two complete rounds and runs 12 times, consuming
// two round constants per pass, 24 in total.  The first round leaves several
// lanes in v25-v31 instead of their home registers; the second round reads
// them from there and puts every lane back in v0-v24, which is why the two
// halves use different register numbers for the same steps.
573.type	KeccakF1600_ce,%function
574.align	5
575KeccakF1600_ce:
576	mov	x9,#12
577	adr	x10,iotas
578	b	.Loop_ce
579.align	4
580.Loop_ce:
581	////////////////////////////////////////////////// Theta
582.inst	0xce052819	//eor3 v25.16b,v0.16b,v5.16b,v10.16b
583.inst	0xce062c3a	//eor3 v26.16b,v1.16b,v6.16b,v11.16b
584.inst	0xce07305b	//eor3 v27.16b,v2.16b,v7.16b,v12.16b
585.inst	0xce08347c	//eor3 v28.16b,v3.16b,v8.16b,v13.16b
586.inst	0xce09389d	//eor3 v29.16b,v4.16b,v9.16b,v14.16b
587.inst	0xce0f5339	//eor3 v25.16b,v25.16b,   v15.16b,v20.16b
588.inst	0xce10575a	//eor3 v26.16b,v26.16b,   v16.16b,v21.16b
589.inst	0xce115b7b	//eor3 v27.16b,v27.16b,   v17.16b,v22.16b
590.inst	0xce125f9c	//eor3 v28.16b,v28.16b,   v18.16b,v23.16b
591.inst	0xce1363bd	//eor3 v29.16b,v29.16b,   v19.16b,v24.16b
592
593.inst	0xce7b8f3e	//rax1 v30.16b,v25.16b,v27.16b			// D[1]
594.inst	0xce7c8f5f	//rax1 v31.16b,v26.16b,v28.16b			// D[2]
595.inst	0xce7d8f7b	//rax1 v27.16b,v27.16b,v29.16b			// D[3]
596.inst	0xce798f9c	//rax1 v28.16b,v28.16b,v25.16b			// D[4]
597.inst	0xce7a8fbd	//rax1 v29.16b,v29.16b,v26.16b			// D[0]
598
599	////////////////////////////////////////////////// Theta+Rho+Pi
	// Each xar XORs a lane with its D word and rotates the result in one
	// step, writing it to the destination lane position.
600.inst	0xce9e50d9	//xar v25.16b,   v6.16b,v30.16b,#64-44	// C[0]=A[0][1]
601.inst	0xce9cb126	//xar v6.16b,v9.16b,v28.16b,#64-20
602.inst	0xce9f0ec9	//xar v9.16b,v22.16b,v31.16b,#64-61
603.inst	0xce9c65d6	//xar v22.16b,v14.16b,v28.16b,#64-39
604.inst	0xce9dba8e	//xar v14.16b,v20.16b,v29.16b,#64-18
605
606.inst	0xce9f0854	//xar v20.16b,v2.16b,v31.16b,#64-62
607
608.inst	0xce9f5582	//xar v2.16b,v12.16b,v31.16b,#64-43
609.inst	0xce9b9dac	//xar v12.16b,v13.16b,v27.16b,#64-25
610.inst	0xce9ce26d	//xar v13.16b,v19.16b,v28.16b,#64-8
611.inst	0xce9b22f3	//xar v19.16b,v23.16b,v27.16b,#64-56
612.inst	0xce9d5df7	//xar v23.16b,v15.16b,v29.16b,#64-41
613
614.inst	0xce9c948f	//xar v15.16b,v4.16b,v28.16b,#64-27
615
	// Lane 0 is not rotated, so it takes a plain XOR with D[0].  The
	// round constant for this round is fetched here, ahead of its use.
616	eor	v0.16b,v0.16b,v29.16b
617	ldr	x11,[x10],#8
618
619.inst	0xce9bae5a	//xar v26.16b,   v18.16b,v27.16b,#64-21	// C[1]=A[0][3]
620.inst	0xce9fc632	//xar v18.16b,v17.16b,v31.16b,#64-15
621.inst	0xce9ed971	//xar v17.16b,v11.16b,v30.16b,#64-10
622.inst	0xce9fe8eb	//xar v11.16b,v7.16b,v31.16b,#64-6
623.inst	0xce9df547	//xar v7.16b,v10.16b,v29.16b,#64-3
624
625.inst	0xce9efc2a	//xar v10.16b,v1.16b,v30.16b,#64-1	// *
626
627.inst	0xce9ccb04	//xar v4.16b,v24.16b,v28.16b,#64-14
628.inst	0xce9efab8	//xar v24.16b,v21.16b,v30.16b,#64-2
629.inst	0xce9b2515	//xar v21.16b,v8.16b,v27.16b,#64-55
630.inst	0xce9e4e08	//xar v8.16b,v16.16b,v30.16b,#64-45
631.inst	0xce9d70b0	//xar v16.16b,v5.16b,v29.16b,#64-36
632
633.inst	0xce9b907b	//xar v27.16b,   v3.16b,v27.16b,#64-28	// C[2]=A[1][0]
634
635	////////////////////////////////////////////////// Chi+Iota
	// v31 is free once its D word has been consumed, so it carries the
	// broadcast round constant until the eor marked Iota below.
636	dup	v31.2d,x11				// borrow C[6]
637.inst	0xce22641c	//bcax v28.16b,   v0.16b,v2.16b,v25.16b	// *
638.inst	0xce3a0b21	//bcax v1.16b,v25.16b,   v26.16b,   v2.16b	// *
639.inst	0xce246842	//bcax v2.16b,v2.16b,v4.16b,v26.16b
640.inst	0xce201343	//bcax v3.16b,v26.16b,   v0.16b,v4.16b
641.inst	0xce390084	//bcax v4.16b,v4.16b,v25.16b,   v0.16b
642
643.inst	0xce271b65	//bcax v5.16b,v27.16b,   v7.16b,v6.16b	// *
644.inst	0xce281cd9	//bcax v25.16b,   v6.16b,v8.16b,v7.16b	// *
645.inst	0xce2920e7	//bcax v7.16b,v7.16b,v9.16b,v8.16b
646.inst	0xce3b2508	//bcax v8.16b,v8.16b,v27.16b,   v9.16b
647.inst	0xce266d29	//bcax v9.16b,v9.16b,v6.16b,v27.16b
648
649	eor	v0.16b,v28.16b,v31.16b			// Iota
650
651.inst	0xce2c2d5a	//bcax v26.16b,   v10.16b,v12.16b,v11.16b	// *
652.inst	0xce2d317b	//bcax v27.16b,   v11.16b,v13.16b,v12.16b	// *
653.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
654.inst	0xce2a39ad	//bcax v13.16b,v13.16b,v10.16b,v14.16b
655.inst	0xce2b29ce	//bcax v14.16b,v14.16b,v11.16b,v10.16b
656
657.inst	0xce3141fc	//bcax v28.16b,   v15.16b,v17.16b,v16.16b	// *
658.inst	0xce32461d	//bcax v29.16b,   v16.16b,v18.16b,v17.16b	// *
659.inst	0xce334a31	//bcax v17.16b,v17.16b,v19.16b,v18.16b
660.inst	0xce2f4e52	//bcax v18.16b,v18.16b,v15.16b,v19.16b
661.inst	0xce303e73	//bcax v19.16b,v19.16b,v16.16b,v15.16b
662
663.inst	0xce36569e	//bcax v30.16b,   v20.16b,v22.16b,v21.16b	// *
664.inst	0xce375abf	//bcax v31.16b,   v21.16b,v23.16b,v22.16b	// *
665.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
666.inst	0xce3462f7	//bcax v23.16b,v23.16b,v20.16b,v24.16b
667.inst	0xce355318	//bcax v24.16b,v24.16b,v21.16b,v20.16b
	// Second round of this pass.  Same three steps, with the register
	// numbers shifted to follow where the first round left each lane.
668	////////////////////////////////////////////////// Theta
669.inst	0xce056806	//eor3 v6.16b,v0.16b,v5.16b,v26.16b
670.inst	0xce196c2a	//eor3 v10.16b,v1.16b,v25.16b,v27.16b
671.inst	0xce07304b	//eor3 v11.16b,v2.16b,v7.16b,v12.16b
672.inst	0xce08346f	//eor3 v15.16b,v3.16b,v8.16b,v13.16b
673.inst	0xce093890	//eor3 v16.16b,v4.16b,v9.16b,v14.16b
674.inst	0xce1c78c6	//eor3 v6.16b,v6.16b,   v28.16b,v30.16b
675.inst	0xce1d7d4a	//eor3 v10.16b,v10.16b,   v29.16b,v31.16b
676.inst	0xce11596b	//eor3 v11.16b,v11.16b,   v17.16b,v22.16b
677.inst	0xce125def	//eor3 v15.16b,v15.16b,   v18.16b,v23.16b
678.inst	0xce136210	//eor3 v16.16b,v16.16b,   v19.16b,v24.16b
679
680.inst	0xce6b8cd4	//rax1 v20.16b,v6.16b,v11.16b			// D[1]
681.inst	0xce6f8d55	//rax1 v21.16b,v10.16b,v15.16b			// D[2]
682.inst	0xce708d6b	//rax1 v11.16b,v11.16b,v16.16b			// D[3]
683.inst	0xce668def	//rax1 v15.16b,v15.16b,v6.16b			// D[4]
684.inst	0xce6a8e10	//rax1 v16.16b,v16.16b,v10.16b			// D[0]
685
686	////////////////////////////////////////////////// Theta+Rho+Pi
687.inst	0xce945326	//xar v6.16b,   v25.16b,v20.16b,#64-44	// C[0]=A[0][1]
688.inst	0xce8fb139	//xar v25.16b,v9.16b,v15.16b,#64-20
689.inst	0xce950ec9	//xar v9.16b,v22.16b,v21.16b,#64-61
690.inst	0xce8f65d6	//xar v22.16b,v14.16b,v15.16b,#64-39
691.inst	0xce90bbce	//xar v14.16b,v30.16b,v16.16b,#64-18
692
693.inst	0xce95085e	//xar v30.16b,v2.16b,v21.16b,#64-62
694
695.inst	0xce955582	//xar v2.16b,v12.16b,v21.16b,#64-43
696.inst	0xce8b9dac	//xar v12.16b,v13.16b,v11.16b,#64-25
697.inst	0xce8fe26d	//xar v13.16b,v19.16b,v15.16b,#64-8
698.inst	0xce8b22f3	//xar v19.16b,v23.16b,v11.16b,#64-56
699.inst	0xce905f97	//xar v23.16b,v28.16b,v16.16b,#64-41
700
701.inst	0xce8f949c	//xar v28.16b,v4.16b,v15.16b,#64-27
702
703	eor	v0.16b,v0.16b,v16.16b
704	ldr	x11,[x10],#8
705
706.inst	0xce8bae4a	//xar v10.16b,   v18.16b,v11.16b,#64-21	// C[1]=A[0][3]
707.inst	0xce95c632	//xar v18.16b,v17.16b,v21.16b,#64-15
708.inst	0xce94db71	//xar v17.16b,v27.16b,v20.16b,#64-10
709.inst	0xce95e8fb	//xar v27.16b,v7.16b,v21.16b,#64-6
710.inst	0xce90f747	//xar v7.16b,v26.16b,v16.16b,#64-3
711
712.inst	0xce94fc3a	//xar v26.16b,v1.16b,v20.16b,#64-1	// *
713
714.inst	0xce8fcb04	//xar v4.16b,v24.16b,v15.16b,#64-14
715.inst	0xce94fbf8	//xar v24.16b,v31.16b,v20.16b,#64-2
716.inst	0xce8b251f	//xar v31.16b,v8.16b,v11.16b,#64-55
717.inst	0xce944fa8	//xar v8.16b,v29.16b,v20.16b,#64-45
718.inst	0xce9070bd	//xar v29.16b,v5.16b,v16.16b,#64-36
719
720.inst	0xce8b906b	//xar v11.16b,   v3.16b,v11.16b,#64-28	// C[2]=A[1][0]
721
722	////////////////////////////////////////////////// Chi+Iota
723	dup	v21.2d,x11				// borrow C[6]
724.inst	0xce22180f	//bcax v15.16b,   v0.16b,v2.16b,v6.16b	// *
725.inst	0xce2a08c1	//bcax v1.16b,v6.16b,   v10.16b,   v2.16b	// *
726.inst	0xce242842	//bcax v2.16b,v2.16b,v4.16b,v10.16b
727.inst	0xce201143	//bcax v3.16b,v10.16b,   v0.16b,v4.16b
728.inst	0xce260084	//bcax v4.16b,v4.16b,v6.16b,   v0.16b
729
730.inst	0xce276565	//bcax v5.16b,v11.16b,   v7.16b,v25.16b	// *
731.inst	0xce281f26	//bcax v6.16b,   v25.16b,v8.16b,v7.16b	// *
732.inst	0xce2920e7	//bcax v7.16b,v7.16b,v9.16b,v8.16b
733.inst	0xce2b2508	//bcax v8.16b,v8.16b,v11.16b,   v9.16b
734.inst	0xce392d29	//bcax v9.16b,v9.16b,v25.16b,v11.16b
735
736	eor	v0.16b,v15.16b,v21.16b			// Iota
737
738.inst	0xce2c6f4a	//bcax v10.16b,   v26.16b,v12.16b,v27.16b	// *
739.inst	0xce2d336b	//bcax v11.16b,   v27.16b,v13.16b,v12.16b	// *
740.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
741.inst	0xce3a39ad	//bcax v13.16b,v13.16b,v26.16b,v14.16b
742.inst	0xce3b69ce	//bcax v14.16b,v14.16b,v27.16b,v26.16b
743
744.inst	0xce31778f	//bcax v15.16b,   v28.16b,v17.16b,v29.16b	// *
745.inst	0xce3247b0	//bcax v16.16b,   v29.16b,v18.16b,v17.16b	// *
746.inst	0xce334a31	//bcax v17.16b,v17.16b,v19.16b,v18.16b
747.inst	0xce3c4e52	//bcax v18.16b,v18.16b,v28.16b,v19.16b
748.inst	0xce3d7273	//bcax v19.16b,v19.16b,v29.16b,v28.16b
749
750.inst	0xce367fd4	//bcax v20.16b,   v30.16b,v22.16b,v31.16b	// *
751.inst	0xce375bf5	//bcax v21.16b,   v31.16b,v23.16b,v22.16b	// *
752.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
753.inst	0xce3e62f7	//bcax v23.16b,v23.16b,v30.16b,v24.16b
754.inst	0xce3f7b18	//bcax v24.16b,v24.16b,v31.16b,v30.16b
	// Two rounds are done; count down the 12 double-round passes.
755	subs	x9,x9,#1
756	bne	.Loop_ce
757
758	ret
759.size	KeccakF1600_ce,.-KeccakF1600_ce
760
// KeccakF1600_cext -- apply the permutation to a state held in memory,
// using the SIMD implementation KeccakF1600_ce.
// No .globl directive for this symbol is visible here; within this file it
// is called from SHA3_squeeze_cext.
//
// In:  x0 = pointer to the state, 25 consecutive 64-bit lanes.
// Out: the state is permuted in place.  x0 is left unchanged: the routine
//      never writes it, and KeccakF1600_ce only uses x9-x11.
// Saves and restores x29, x30 and d8-d15 in an 80-byte frame.
// Clobbers x9-x11, v0-v7 and v16-v31.
761.type	KeccakF1600_cext,%function
762.align	5
763KeccakF1600_cext:
764.inst	0xd503233f		// paciasp
765	stp	x29,x30,[sp,#-80]!
766	add	x29,sp,#0
767	stp	d8,d9,[sp,#16]		// per ABI requirement
768	stp	d10,d11,[sp,#32]
769	stp	d12,d13,[sp,#48]
770	stp	d14,d15,[sp,#64]
771	ldp	d0,d1,[x0,#8*0]
772	ldp	d2,d3,[x0,#8*2]
773	ldp	d4,d5,[x0,#8*4]
774	ldp	d6,d7,[x0,#8*6]
775	ldp	d8,d9,[x0,#8*8]
776	ldp	d10,d11,[x0,#8*10]
777	ldp	d12,d13,[x0,#8*12]
778	ldp	d14,d15,[x0,#8*14]
779	ldp	d16,d17,[x0,#8*16]
780	ldp	d18,d19,[x0,#8*18]
781	ldp	d20,d21,[x0,#8*20]
782	ldp	d22,d23,[x0,#8*22]
783	ldr	d24,[x0,#8*24]
784	bl	KeccakF1600_ce
	// The bl overwrote x30; reload the signed return address right away.
	// That is why the epilogue pops only x29.
785	ldr	x30,[sp,#8]
786	stp	d0,d1,[x0,#8*0]
787	stp	d2,d3,[x0,#8*2]
788	stp	d4,d5,[x0,#8*4]
789	stp	d6,d7,[x0,#8*6]
790	stp	d8,d9,[x0,#8*8]
791	stp	d10,d11,[x0,#8*10]
792	stp	d12,d13,[x0,#8*12]
793	stp	d14,d15,[x0,#8*14]
794	stp	d16,d17,[x0,#8*16]
795	stp	d18,d19,[x0,#8*18]
796	stp	d20,d21,[x0,#8*20]
797	stp	d22,d23,[x0,#8*22]
798	str	d24,[x0,#8*24]
799
800	ldp	d8,d9,[sp,#16]
801	ldp	d10,d11,[sp,#32]
802	ldp	d12,d13,[sp,#48]
803	ldp	d14,d15,[sp,#64]
804	ldr	x29,[sp],#80
805.inst	0xd50323bf		// autiasp
806	ret
807.size	KeccakF1600_cext,.-KeccakF1600_cext
// SHA3_absorb_cext -- XOR whole blocks of input into the state, permuting
// after each block (SIMD implementation built on KeccakF1600_ce).
//
// In:  x0 = state, 25 consecutive 64-bit lanes
//      x1 = inp, input pointer
//      x2 = len, number of input bytes available
//      x3 = bsz, block size in bytes
// Out: x0 = number of input bytes left unprocessed.
// The arguments stay in x0-x3 for the whole routine: the state lives in
// v0-v24 and KeccakF1600_ce only clobbers x9-x11, so nothing needs to be
// spilled around the permutation.  v31 is the load temporary.
// Saves and restores x29, x30 and d8-d15 in an 80-byte frame.
// NOTE(review): as in the scalar version, the lane ladder presumably expects
// bsz to be a multiple of 8 and at most 200, and bsz == 0 would never
// terminate.  Confirm against callers.
808.globl	SHA3_absorb_cext
809.type	SHA3_absorb_cext,%function
810.align	5
811SHA3_absorb_cext:
812.inst	0xd503233f		// paciasp
813	stp	x29,x30,[sp,#-80]!
814	add	x29,sp,#0
815	stp	d8,d9,[sp,#16]		// per ABI requirement
816	stp	d10,d11,[sp,#32]
817	stp	d12,d13,[sp,#48]
818	stp	d14,d15,[sp,#64]
819	ldp	d0,d1,[x0,#8*0]
820	ldp	d2,d3,[x0,#8*2]
821	ldp	d4,d5,[x0,#8*4]
822	ldp	d6,d7,[x0,#8*6]
823	ldp	d8,d9,[x0,#8*8]
824	ldp	d10,d11,[x0,#8*10]
825	ldp	d12,d13,[x0,#8*12]
826	ldp	d14,d15,[x0,#8*14]
827	ldp	d16,d17,[x0,#8*16]
828	ldp	d18,d19,[x0,#8*18]
829	ldp	d20,d21,[x0,#8*20]
830	ldp	d22,d23,[x0,#8*22]
831	ldr	d24,[x0,#8*24]
832	b	.Loop_absorb_ce
833
834.align	4
835.Loop_absorb_ce:
	// len is reduced in place.  When the subtraction borrows, x2 holds
	// len - bsz and the exit path adds bsz back to form the return value.
836	subs	x2,x2,x3		// len - bsz
837	blo	.Labsorbed_ce
	// Unrolled lane ladder, one 8-byte lane per state register v0-v24.
	// One cmp against 8*(k+2) serves two exits: blo right after it and
	// beq after the following lane.  The ldr, rev64 and vector eor in
	// between do not modify the flags.
838	ldr	d31,[x1],#8		// *inp++
839#ifdef	__AARCH64EB__
840	rev64	v31.16b,v31.16b
841#endif
842	eor	v0.16b,v0.16b,v31.16b
843	cmp	x3,#8*(0+2)
844	blo	.Lprocess_block_ce
845	ldr	d31,[x1],#8		// *inp++
846#ifdef	__AARCH64EB__
847	rev64	v31.16b,v31.16b
848#endif
849	eor	v1.16b,v1.16b,v31.16b
850	beq	.Lprocess_block_ce
851	ldr	d31,[x1],#8		// *inp++
852#ifdef	__AARCH64EB__
853	rev64	v31.16b,v31.16b
854#endif
855	eor	v2.16b,v2.16b,v31.16b
856	cmp	x3,#8*(2+2)
857	blo	.Lprocess_block_ce
858	ldr	d31,[x1],#8		// *inp++
859#ifdef	__AARCH64EB__
860	rev64	v31.16b,v31.16b
861#endif
862	eor	v3.16b,v3.16b,v31.16b
863	beq	.Lprocess_block_ce
864	ldr	d31,[x1],#8		// *inp++
865#ifdef	__AARCH64EB__
866	rev64	v31.16b,v31.16b
867#endif
868	eor	v4.16b,v4.16b,v31.16b
869	cmp	x3,#8*(4+2)
870	blo	.Lprocess_block_ce
871	ldr	d31,[x1],#8		// *inp++
872#ifdef	__AARCH64EB__
873	rev64	v31.16b,v31.16b
874#endif
875	eor	v5.16b,v5.16b,v31.16b
876	beq	.Lprocess_block_ce
877	ldr	d31,[x1],#8		// *inp++
878#ifdef	__AARCH64EB__
879	rev64	v31.16b,v31.16b
880#endif
881	eor	v6.16b,v6.16b,v31.16b
882	cmp	x3,#8*(6+2)
883	blo	.Lprocess_block_ce
884	ldr	d31,[x1],#8		// *inp++
885#ifdef	__AARCH64EB__
886	rev64	v31.16b,v31.16b
887#endif
888	eor	v7.16b,v7.16b,v31.16b
889	beq	.Lprocess_block_ce
890	ldr	d31,[x1],#8		// *inp++
891#ifdef	__AARCH64EB__
892	rev64	v31.16b,v31.16b
893#endif
894	eor	v8.16b,v8.16b,v31.16b
895	cmp	x3,#8*(8+2)
896	blo	.Lprocess_block_ce
897	ldr	d31,[x1],#8		// *inp++
898#ifdef	__AARCH64EB__
899	rev64	v31.16b,v31.16b
900#endif
901	eor	v9.16b,v9.16b,v31.16b
902	beq	.Lprocess_block_ce
903	ldr	d31,[x1],#8		// *inp++
904#ifdef	__AARCH64EB__
905	rev64	v31.16b,v31.16b
906#endif
907	eor	v10.16b,v10.16b,v31.16b
908	cmp	x3,#8*(10+2)
909	blo	.Lprocess_block_ce
910	ldr	d31,[x1],#8		// *inp++
911#ifdef	__AARCH64EB__
912	rev64	v31.16b,v31.16b
913#endif
914	eor	v11.16b,v11.16b,v31.16b
915	beq	.Lprocess_block_ce
916	ldr	d31,[x1],#8		// *inp++
917#ifdef	__AARCH64EB__
918	rev64	v31.16b,v31.16b
919#endif
920	eor	v12.16b,v12.16b,v31.16b
921	cmp	x3,#8*(12+2)
922	blo	.Lprocess_block_ce
923	ldr	d31,[x1],#8		// *inp++
924#ifdef	__AARCH64EB__
925	rev64	v31.16b,v31.16b
926#endif
927	eor	v13.16b,v13.16b,v31.16b
928	beq	.Lprocess_block_ce
929	ldr	d31,[x1],#8		// *inp++
930#ifdef	__AARCH64EB__
931	rev64	v31.16b,v31.16b
932#endif
933	eor	v14.16b,v14.16b,v31.16b
934	cmp	x3,#8*(14+2)
935	blo	.Lprocess_block_ce
936	ldr	d31,[x1],#8		// *inp++
937#ifdef	__AARCH64EB__
938	rev64	v31.16b,v31.16b
939#endif
940	eor	v15.16b,v15.16b,v31.16b
941	beq	.Lprocess_block_ce
942	ldr	d31,[x1],#8		// *inp++
943#ifdef	__AARCH64EB__
944	rev64	v31.16b,v31.16b
945#endif
946	eor	v16.16b,v16.16b,v31.16b
947	cmp	x3,#8*(16+2)
948	blo	.Lprocess_block_ce
949	ldr	d31,[x1],#8		// *inp++
950#ifdef	__AARCH64EB__
951	rev64	v31.16b,v31.16b
952#endif
953	eor	v17.16b,v17.16b,v31.16b
954	beq	.Lprocess_block_ce
955	ldr	d31,[x1],#8		// *inp++
956#ifdef	__AARCH64EB__
957	rev64	v31.16b,v31.16b
958#endif
959	eor	v18.16b,v18.16b,v31.16b
960	cmp	x3,#8*(18+2)
961	blo	.Lprocess_block_ce
962	ldr	d31,[x1],#8		// *inp++
963#ifdef	__AARCH64EB__
964	rev64	v31.16b,v31.16b
965#endif
966	eor	v19.16b,v19.16b,v31.16b
967	beq	.Lprocess_block_ce
968	ldr	d31,[x1],#8		// *inp++
969#ifdef	__AARCH64EB__
970	rev64	v31.16b,v31.16b
971#endif
972	eor	v20.16b,v20.16b,v31.16b
973	cmp	x3,#8*(20+2)
974	blo	.Lprocess_block_ce
975	ldr	d31,[x1],#8		// *inp++
976#ifdef	__AARCH64EB__
977	rev64	v31.16b,v31.16b
978#endif
979	eor	v21.16b,v21.16b,v31.16b
980	beq	.Lprocess_block_ce
981	ldr	d31,[x1],#8		// *inp++
982#ifdef	__AARCH64EB__
983	rev64	v31.16b,v31.16b
984#endif
985	eor	v22.16b,v22.16b,v31.16b
986	cmp	x3,#8*(22+2)
987	blo	.Lprocess_block_ce
988	ldr	d31,[x1],#8		// *inp++
989#ifdef	__AARCH64EB__
990	rev64	v31.16b,v31.16b
991#endif
992	eor	v23.16b,v23.16b,v31.16b
993	beq	.Lprocess_block_ce
994	ldr	d31,[x1],#8		// *inp++
995#ifdef	__AARCH64EB__
996	rev64	v31.16b,v31.16b
997#endif
998	eor	v24.16b,v24.16b,v31.16b
999
1000.Lprocess_block_ce:
1001
	// The bl overwrites x30, but the original value is still in the
	// frame and is reloaded by the epilogue.
1002	bl	KeccakF1600_ce
1003
1004	b	.Loop_absorb_ce
1005
1006.align	4
1007.Labsorbed_ce:
1008	stp	d0,d1,[x0,#8*0]
1009	stp	d2,d3,[x0,#8*2]
1010	stp	d4,d5,[x0,#8*4]
1011	stp	d6,d7,[x0,#8*6]
1012	stp	d8,d9,[x0,#8*8]
1013	stp	d10,d11,[x0,#8*10]
1014	stp	d12,d13,[x0,#8*12]
1015	stp	d14,d15,[x0,#8*14]
1016	stp	d16,d17,[x0,#8*16]
1017	stp	d18,d19,[x0,#8*18]
1018	stp	d20,d21,[x0,#8*20]
1019	stp	d22,d23,[x0,#8*22]
1020	str	d24,[x0,#8*24]
	// Undo the final borrowing subtraction to recover the leftover len.
1021	add	x0,x2,x3		// return value
1022
1023	ldp	d8,d9,[sp,#16]
1024	ldp	d10,d11,[sp,#32]
1025	ldp	d12,d13,[sp,#48]
1026	ldp	d14,d15,[sp,#64]
1027	ldp	x29,x30,[sp],#80
1028.inst	0xd50323bf		// autiasp
1029	ret
1030.size	SHA3_absorb_cext,.-SHA3_absorb_cext
// SHA3_squeeze_cext -- copy output bytes out of the state, permuting the
// state with KeccakF1600_cext whenever a full block has been emitted.
//
// In:  x0 = state, 25 consecutive 64-bit lanes
//      x1 = out, destination buffer (advanced as bytes are written)
//      x2 = len, number of bytes to produce (counted down in place)
//      x3 = bsz, block size in bytes
// Out: nothing; len bytes are written to out.  The state is only modified
//      when more than bsz bytes are requested.
// Lanes are emitted least-significant byte first; on big-endian builds a
// whole lane is byte-reversed before it is stored.
//
// x9 is the read cursor into the state and x10 the number of bytes left in
// the current block.  Both are clobbered by the permutation and re-derived
// from x0 and x3, which KeccakF1600_cext leaves untouched.
//
// A request for zero bytes returns immediately.  Without that guard the
// loop would drop into the byte tail with a zero count, the count would
// wrap below zero on the first decrement, and seven bytes would be stored
// to out.
1031.globl	SHA3_squeeze_cext
1032.type	SHA3_squeeze_cext,%function
1033.align	5
1034SHA3_squeeze_cext:
1035.inst	0xd503233f		// paciasp
1036	stp	x29,x30,[sp,#-16]!
1037	add	x29,sp,#0
1038	mov	x9,x0
1039	mov	x10,x3
	cbz	x2,.Lsqueeze_done_ce	// len == 0: nothing to write
1040
1041.Loop_squeeze_ce:
	// The lane is fetched before the length test so that the tail code
	// can emit its low bytes straight from x4.
1042	ldr	x4,[x9],#8
1043	cmp	x2,#8
1044	blo	.Lsqueeze_tail_ce
1045#ifdef	__AARCH64EB__
1046	rev	x4,x4
1047#endif
1048	str	x4,[x1],#8
	// Flags are still those of the cmp above: equal means exactly eight
	// bytes were wanted and have just been written.
1049	beq	.Lsqueeze_done_ce
1050
1051	sub	x2,x2,#8
1052	subs	x10,x10,#8
1053	bhi	.Loop_squeeze_ce
1054
	// Block exhausted: permute, restore the return address overwritten
	// by the bl, and restart from the first lane.
1055	bl	KeccakF1600_cext
1056	ldr	x30,[sp,#8]
1057	mov	x9,x0
1058	mov	x10,x3
1059	b	.Loop_squeeze_ce
1060
1061.align	4
1062.Lsqueeze_tail_ce:
	// Between one and seven bytes remain.  Store the low byte of x4,
	// shift the next byte down, and stop once the count reaches zero.
1063	strb	w4,[x1],#1
1064	lsr	x4,x4,#8
1065	subs	x2,x2,#1
1066	beq	.Lsqueeze_done_ce
1067	strb	w4,[x1],#1
1068	lsr	x4,x4,#8
1069	subs	x2,x2,#1
1070	beq	.Lsqueeze_done_ce
1071	strb	w4,[x1],#1
1072	lsr	x4,x4,#8
1073	subs	x2,x2,#1
1074	beq	.Lsqueeze_done_ce
1075	strb	w4,[x1],#1
1076	lsr	x4,x4,#8
1077	subs	x2,x2,#1
1078	beq	.Lsqueeze_done_ce
1079	strb	w4,[x1],#1
1080	lsr	x4,x4,#8
1081	subs	x2,x2,#1
1082	beq	.Lsqueeze_done_ce
1083	strb	w4,[x1],#1
1084	lsr	x4,x4,#8
1085	subs	x2,x2,#1
1086	beq	.Lsqueeze_done_ce
1087	strb	w4,[x1],#1
1088
1089.Lsqueeze_done_ce:
	// x30 already holds the signed return address on every path here,
	// so only x29 is popped.
1090	ldr	x29,[sp],#16
1091.inst	0xd50323bf		// autiasp
1092	ret
1093.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated ASCII identification string embedded in the object:
// Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>
// It is not referenced by any code in this file.
1094.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1095.align	2
1096