xref: /netbsd-src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aesv8-armx.S (revision d90047b5d07facf36e6c01dcc0bded8997ce9cc2)
1#include "arm_asm.h"
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=7
5.text
6.arch	armv7-a	@ don't confuse not-so-latest binutils with armv8 :-)
7.fpu	neon
8.code	32
9#undef	__thumb2__
10.align	5
@ Key-expansion constants, one 16-byte row per .long line.
@ aes_v8_set_encrypt_key loads rows 0 and 1 into q1/q2 and, on the
@ 128-bit path, later reloads q1 from row 2.
11.Lrcon:
12.long	0x01,0x01,0x01,0x01	@ starting round constant; shifted left by one per round
13.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
14.long	0x1b,0x1b,0x1b,0x1b	@ round constant reloaded after the eight-iteration 128-bit loop
15
@-----------------------------------------------------------------------
@ aes_v8_set_encrypt_key -- expand a user key into an encryption schedule
@ In:    r0 = user key bytes (must be non-NULL)
@        r1 = key length in bits; only 128, 192 and 256 pass the checks
@        r2 = output key schedule (must be non-NULL)
@ Out:   r0 = 0 on success, -1 if r0 or r2 is NULL, -2 for a bad length
@        round keys are stored from [r2] upward; the round count
@        (10, 12 or 14) is stored as a word at offset 240
@ Side:  on success r2 is left at original r2 + 240 and r12 holds the
@        round count; aes_v8_set_decrypt_key enters at .Lenc_key and
@        relies on both
@ Clobb: r1, r3, r12, q0-q3, q8-q10, flags
@ RET is a macro that is not defined in this file (presumably it comes
@ from arm_asm.h -- confirm).
@-----------------------------------------------------------------------
16.globl	aes_v8_set_encrypt_key
17.type	aes_v8_set_encrypt_key,%function
18.align	5
19aes_v8_set_encrypt_key:
20.Lenc_key:
21	mov	r3,#-1			@ r3 = pending return value: NULL pointer
22	cmp	r0,#0
23	beq	.Lenc_key_abort
24	cmp	r2,#0
25	beq	.Lenc_key_abort
26	mov	r3,#-2			@ pending return value: bad key length
27	cmp	r1,#128
28	blt	.Lenc_key_abort
29	cmp	r1,#256
30	bgt	.Lenc_key_abort
31	tst	r1,#0x3f		@ length must be a multiple of 64
32	bne	.Lenc_key_abort
33
34	adr	r3,.Lrcon		@ r3 now walks the constant table
35	cmp	r1,#192			@ flags are consumed by blt/beq below; nothing in between sets them
36
37	veor	q0,q0,q0		@ q0 = 0: zero key for aese and zero filler for vext
38	vld1.8	{q3},[r0]!		@ q3 = first 16 key bytes
39	mov	r1,#8		@ reuse r1
40	vld1.32	{q1,q2},[r3]!		@ q1 = round constant, q2 = vtbl byte mask
41
42	blt	.Loop128
43	beq	.L192
44	b	.L256
45
46.align	4
47.Loop128:
@ One round key per iteration, eight iterations (r1 counts down from 8).
48	vtbl.8	d20,{q3},d4		@ q10 = bytes of q3 selected by the mask in q2
49	vtbl.8	d21,{q3},d5
50	vext.8	q9,q0,q3,#12		@ q9 = q3 moved up by one word, zero filled
51	vst1.32	{q3},[r2]!		@ emit current round key
52.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
53	subs	r1,r1,#1		@ flags consumed by bne at the loop end
54
55	veor	q3,q3,q9
56	vext.8	q9,q0,q9,#12
57	veor	q3,q3,q9
58	vext.8	q9,q0,q9,#12
59	veor	q10,q10,q1		@ mix in the round constant
60	veor	q3,q3,q9
61	vshl.u8	q1,q1,#1		@ next round constant
62	veor	q3,q3,q10
63	bne	.Loop128
64
65	vld1.32	{q1},[r3]		@ switch to the third .Lrcon row (0x1b)
66
@ Two more unrolled rounds using the reloaded round constant.
67	vtbl.8	d20,{q3},d4
68	vtbl.8	d21,{q3},d5
69	vext.8	q9,q0,q3,#12
70	vst1.32	{q3},[r2]!
71.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
72
73	veor	q3,q3,q9
74	vext.8	q9,q0,q9,#12
75	veor	q3,q3,q9
76	vext.8	q9,q0,q9,#12
77	veor	q10,q10,q1
78	veor	q3,q3,q9
79	vshl.u8	q1,q1,#1
80	veor	q3,q3,q10
81
82	vtbl.8	d20,{q3},d4
83	vtbl.8	d21,{q3},d5
84	vext.8	q9,q0,q3,#12
85	vst1.32	{q3},[r2]!
86.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
87
88	veor	q3,q3,q9
89	vext.8	q9,q0,q9,#12
90	veor	q3,q3,q9
91	vext.8	q9,q0,q9,#12
92	veor	q10,q10,q1
93	veor	q3,q3,q9
94	veor	q3,q3,q10
95	vst1.32	{q3},[r2]		@ last round key, stored at base + 160 without advancing r2
96	add	r2,r2,#0x50		@ r2 = base + 240, where .Ldone stores the round count
97
98	mov	r12,#10
99	b	.Ldone
100
101.align	4
102.L192:
103	vld1.8	{d16},[r0]!		@ d16 = key bytes 16..23
104	vmov.i8	q10,#8			@ borrow q10
105	vst1.32	{q3},[r2]!
106	vsub.i8	q2,q2,q10	@ adjust the mask
107
108.Loop192:
@ Each of the eight iterations stores 8 + 16 bytes of schedule.
109	vtbl.8	d20,{q8},d4
110	vtbl.8	d21,{q8},d5
111	vext.8	q9,q0,q3,#12
112	vst1.32	{d16},[r2]!
113.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
114	subs	r1,r1,#1		@ flags consumed by bne at the loop end
115
116	veor	q3,q3,q9
117	vext.8	q9,q0,q9,#12
118	veor	q3,q3,q9
119	vext.8	q9,q0,q9,#12
120	veor	q3,q3,q9
121
122	vdup.32	q9,d7[1]
123	veor	q9,q9,q8
124	veor	q10,q10,q1
125	vext.8	q8,q0,q8,#12
126	vshl.u8	q1,q1,#1
127	veor	q8,q8,q9
128	veor	q3,q3,q10
129	veor	q8,q8,q10
130	vst1.32	{q3},[r2]!
131	bne	.Loop192
132
133	mov	r12,#12
134	add	r2,r2,#0x20		@ r2 was at base + 208; move it to base + 240
135	b	.Ldone
136
137.align	4
138.L256:
139	vld1.8	{q8},[r0]		@ q8 = key bytes 16..31
140	mov	r1,#7			@ seven iterations; the last one leaves through beq .Ldone
141	mov	r12,#14
142	vst1.32	{q3},[r2]!
143
144.Loop256:
145	vtbl.8	d20,{q8},d4
146	vtbl.8	d21,{q8},d5
147	vext.8	q9,q0,q3,#12
148	vst1.32	{q8},[r2]!
149.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
150	subs	r1,r1,#1		@ flags consumed by beq .Ldone below
151
152	veor	q3,q3,q9
153	vext.8	q9,q0,q9,#12
154	veor	q3,q3,q9
155	vext.8	q9,q0,q9,#12
156	veor	q10,q10,q1
157	veor	q3,q3,q9
158	vshl.u8	q1,q1,#1
159	veor	q3,q3,q10
160	vst1.32	{q3},[r2]!
161	beq	.Ldone			@ r2 is at base + 240 when the counter reaches zero
162
@ Second half of the step: update q8 from the new q3, without rotation
@ (vdup instead of vtbl) and without a round constant.
163	vdup.32	q10,d7[1]
164	vext.8	q9,q0,q8,#12
165.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
166
167	veor	q8,q8,q9
168	vext.8	q9,q0,q9,#12
169	veor	q8,q8,q9
170	vext.8	q9,q0,q9,#12
171	veor	q8,q8,q9
172
173	veor	q8,q8,q10
174	b	.Loop256
175
176.Ldone:
177	str	r12,[r2]		@ round count goes to schedule offset 240
178	mov	r3,#0			@ success
179
180.Lenc_key_abort:
181	mov	r0,r3			@ return value
182
183	RET
184.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
185
@-----------------------------------------------------------------------
@ aes_v8_set_decrypt_key -- build a decryption key schedule
@ In/Out: same registers and return codes as aes_v8_set_encrypt_key
@         (r0 = user key, r1 = bits, r2 = schedule; r0 = 0, -1 or -2).
@ Method: run the encryption expansion via .Lenc_key, then reverse the
@         order of the round keys in place, applying aesimc to every
@         key except the first and the last.
@ Saves r4 and lr on the stack; uses q0 and q1 as scratch in addition
@ to everything .Lenc_key clobbers.
@-----------------------------------------------------------------------
186.globl	aes_v8_set_decrypt_key
187.type	aes_v8_set_decrypt_key,%function
188.align	5
189aes_v8_set_decrypt_key:
190	stmdb	sp!,{r4,lr}
191	bl	.Lenc_key		@ returns status in r0, round count in r12, r2 = base + 240
192
193	cmp	r0,#0
194	bne	.Ldec_key_abort		@ propagate the error code unchanged
195
196	sub	r2,r2,#240		@ restore original r2
197	mov	r4,#-16			@ post-index step for the descending pointer
198	add	r0,r2,r12,lsl#4	@ end of key schedule
199
@ Swap the first and last round keys as they are (no aesimc).
200	vld1.32	{q0},[r2]
201	vld1.32	{q1},[r0]
202	vst1.32	{q0},[r0],r4
203	vst1.32	{q1},[r2]!
204
205.Loop_imc:
@ r2 walks up and r0 walks down; each pair is transformed and swapped.
206	vld1.32	{q0},[r2]
207	vld1.32	{q1},[r0]
208.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
209.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
210	vst1.32	{q0},[r0],r4
211	vst1.32	{q1},[r2]!
212	cmp	r0,r2
213	bhi	.Loop_imc		@ continue while the pointers have not met (unsigned)
214
@ Round counts are even, so one middle key is left where r0 and r2 meet.
215	vld1.32	{q0},[r2]
216.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
217	vst1.32	{q0},[r0]
218
219	eor	r0,r0,r0		@ return value
220.Ldec_key_abort:
221	ldmia	sp!,{r4,pc}
222.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
@-----------------------------------------------------------------------
@ aes_v8_encrypt -- encrypt one 16-byte block
@ In:    r0 = input block, r1 = output block, r2 = key schedule
@        (round count is read from schedule offset 240)
@ Out:   16 bytes written to [r1]
@ Clobb: r2, r3, q0-q2, flags
@ RET is a macro that is not defined in this file (presumably it comes
@ from arm_asm.h -- confirm).
@-----------------------------------------------------------------------
223.globl	aes_v8_encrypt
224.type	aes_v8_encrypt,%function
225.align	5
226aes_v8_encrypt:
227	ldr	r3,[r2,#240]		@ r3 = number of rounds
228	vld1.32	{q0},[r2]!		@ q0 = first round key
229	vld1.8	{q2},[r0]		@ q2 = state
230	sub	r3,r3,#2		@ the last two rounds are handled after the loop
231	vld1.32	{q1},[r2]!		@ q1 = second round key
232
233.Loop_enc:
@ Two rounds per iteration, alternating between q0 and q1 as round key
@ while the next keys are fetched.
234.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
235.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
236	vld1.32	{q0},[r2]!
237	subs	r3,r3,#2		@ flags consumed by bgt below
238.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
239.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
240	vld1.32	{q1},[r2]!
241	bgt	.Loop_enc
242
@ Final two rounds: the last aese is not followed by aesmc, and the
@ last round key is applied with a plain XOR.
243.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
244.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
245	vld1.32	{q0},[r2]		@ q0 = last round key
246.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
247	veor	q2,q2,q0
248
249	vst1.8	{q2},[r1]
250	RET
251.size	aes_v8_encrypt,.-aes_v8_encrypt
@-----------------------------------------------------------------------
@ aes_v8_decrypt -- decrypt one 16-byte block
@ In:    r0 = input block, r1 = output block, r2 = key schedule
@        (round count is read from schedule offset 240; presumably a
@        schedule produced by aes_v8_set_decrypt_key -- confirm at callers)
@ Out:   16 bytes written to [r1]
@ Clobb: r2, r3, q0-q2, flags
@ RET is a macro that is not defined in this file (presumably it comes
@ from arm_asm.h -- confirm).
@-----------------------------------------------------------------------
252.globl	aes_v8_decrypt
253.type	aes_v8_decrypt,%function
254.align	5
255aes_v8_decrypt:
256	ldr	r3,[r2,#240]		@ r3 = number of rounds
257	vld1.32	{q0},[r2]!		@ q0 = first round key
258	vld1.8	{q2},[r0]		@ q2 = state
259	sub	r3,r3,#2		@ the last two rounds are handled after the loop
260	vld1.32	{q1},[r2]!		@ q1 = second round key
261
262.Loop_dec:
@ Two rounds per iteration, alternating between q0 and q1 as round key
@ while the next keys are fetched.
263.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
264.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
265	vld1.32	{q0},[r2]!
266	subs	r3,r3,#2		@ flags consumed by bgt below
267.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
268.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
269	vld1.32	{q1},[r2]!
270	bgt	.Loop_dec
271
@ Final two rounds: the last aesd is not followed by aesimc, and the
@ last round key is applied with a plain XOR.
272.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
273.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
274	vld1.32	{q0},[r2]		@ q0 = last round key
275.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
276	veor	q2,q2,q0
277
278	vst1.8	{q2},[r1]
279	RET
280.size	aes_v8_decrypt,.-aes_v8_decrypt
@-----------------------------------------------------------------------
@ aes_v8_cbc_encrypt -- AES-CBC encryption or decryption
@ In:    r0 = input, r1 = output, r2 = length in bytes, r3 = key schedule
@        [sp]    = pointer to the 16-byte IV (read, then overwritten
@                  with the last ciphertext block)
@        [sp,#4] = direction: zero selects decryption, non-zero encryption
@ Lengths below 16 return at once without touching output or IV;
@ otherwise the length is rounded down to a multiple of 16.
@ Saves and restores r4-r8, lr and d8-d15.
@ Register roles after setup:
@        q6 = chaining value, q7 = last round key,
@        q10-q15 = the six round keys before the last,
@        r5 = rounds - 8, r8 = input post-increment (16, or 0 once the
@        last block has been reached so the load does not run past it)
@-----------------------------------------------------------------------
281.globl	aes_v8_cbc_encrypt
282.type	aes_v8_cbc_encrypt,%function
283.align	5
284aes_v8_cbc_encrypt:
285	mov	ip,sp
286	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
287	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
288	ldmia	ip,{r4,r5}		@ load remaining args
289	subs	r2,r2,#16
290	mov	r8,#16
291	blo	.Lcbc_abort		@ fewer than 16 bytes: nothing to do
292	moveq	r8,#0			@ exactly one block: do not advance r0
293
294	cmp	r5,#0			@ en- or decrypting?
295	ldr	r5,[r3,#240]		@ r5 = number of rounds (flags from cmp stay live until beq .Lcbc_dec)
296	and	r2,r2,#-16
297	vld1.8	{q6},[r4]		@ q6 = IV
298	vld1.8	{q0},[r0],r8		@ q0 = first input block
299
300	vld1.32	{q8,q9},[r3]		@ load key schedule...
301	sub	r5,r5,#6
302	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
303	sub	r5,r5,#2
304	vld1.32	{q10,q11},[r7]!
305	vld1.32	{q12,q13},[r7]!
306	vld1.32	{q14,q15},[r7]!
307	vld1.32	{q7},[r7]
308
309	add	r7,r3,#32		@ r7 = address of the third round key
310	mov	r6,r5
311	beq	.Lcbc_dec		@ direction flag was zero
312
@ ---- encryption ----
313	cmp	r5,#2			@ rounds - 8 == 2 means a 10-round schedule
314	veor	q0,q0,q6		@ first block ^ IV
315	veor	q5,q8,q7		@ q5 = first round key ^ last round key
316	beq	.Lcbc_enc128
317
@ 12- and 14-round schedules: r7, r6, r12, r14 and r3 are set to the
@ addresses of round keys 1, 4, 5, 6 and 7, which are reloaded per block.
318	vld1.32	{q2,q3},[r7]
319	add	r7,r3,#16
320	add	r6,r3,#16*4
321	add	r12,r3,#16*5
322.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
323.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
324	add	r14,r3,#16*6
325	add	r3,r3,#16*7
326	b	.Lenter_cbc_enc
327
328.align	4
329.Loop_cbc_enc:
@ Here q8 holds next input ^ first key ^ last key (see veor q8,q8,q5
@ below) and q0 holds the previous state before its final key XOR, so
@ this aese both chains in the previous ciphertext and starts the block.
330.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
331.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
332	vst1.8	{q6},[r1]!		@ store the previous ciphertext block
333.Lenter_cbc_enc:
334.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
335.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
336.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
337.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
338	vld1.32	{q8},[r6]
339	cmp	r5,#4			@ rounds - 8 == 4 means a 12-round schedule
340.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
341.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
342	vld1.32	{q9},[r12]
343	beq	.Lcbc_enc192
344
@ Two extra rounds, executed only for the 14-round schedule.
345.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
346.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
347	vld1.32	{q8},[r14]
348.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
349.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
350	vld1.32	{q9},[r3]
351	nop
352
353.Lcbc_enc192:
354.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
355.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
356	subs	r2,r2,#16		@ flags consumed by moveq and by bhs at the loop end
357.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
358.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
359	moveq	r8,#0			@ last block reached: stop advancing r0
360.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
361.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
362.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
363.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
364	vld1.8	{q8},[r0],r8		@ fetch the next input block
365.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
366.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
367	veor	q8,q8,q5
368.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
369.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
370	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
371.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
372.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
373.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
374	veor	q6,q0,q7		@ q6 = ciphertext block = new chaining value
375	bhs	.Loop_cbc_enc
376
377	vst1.8	{q6},[r1]!
378	b	.Lcbc_done
379
380.align	5
381.Lcbc_enc128:
@ 10-round schedule: all round keys stay resident in registers.
382	vld1.32	{q2,q3},[r7]
383.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
384.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
385	b	.Lenter_cbc_enc128
386.Loop_cbc_enc128:
387.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
388.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
389	vst1.8	{q6},[r1]!		@ store the previous ciphertext block
390.Lenter_cbc_enc128:
391.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
392.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
393	subs	r2,r2,#16		@ flags consumed by moveq and by bhs at the loop end
394.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
395.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
396	moveq	r8,#0			@ last block reached: stop advancing r0
397.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
398.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
399.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
400.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
401.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
402.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
403	vld1.8	{q8},[r0],r8		@ fetch the next input block
404.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
405.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
406.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
407.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
408.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
409.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
410	veor	q8,q8,q5
411.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
412	veor	q6,q0,q7		@ q6 = ciphertext block = new chaining value
413	bhs	.Loop_cbc_enc128
414
415	vst1.8	{q6},[r1]!
416	b	.Lcbc_done
417.align	5
418.Lcbc_dec:
@ ---- decryption: three blocks per iteration, then a 1- or 2-block tail ----
419	vld1.8	{q10},[r0]!
420	subs	r2,r2,#32		@ bias
421	add	r6,r5,#2		@ r6 = round-pair counter for the inner loops
422	vorr	q3,q0,q0
423	vorr	q1,q0,q0
424	vorr	q11,q10,q10
425	blo	.Lcbc_dec_tail		@ fewer than three blocks in total
426
427	vorr	q1,q10,q10
428	vld1.8	{q10},[r0]!
429	vorr	q2,q0,q0
430	vorr	q3,q1,q1
431	vorr	q11,q10,q10
432
433.Loop3x_cbc_dec:
@ q0, q1 and q10 are the three states; q2, q3 and q11 keep the matching
@ ciphertext blocks for the chaining XOR.
434.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
435.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
436.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
437.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
438.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
439.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
440	vld1.32	{q8},[r7]!
441	subs	r6,r6,#2
442.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
443.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
444.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
445.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
446.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
447.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
448	vld1.32	{q9},[r7]!
449	bgt	.Loop3x_cbc_dec
450
451.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
452.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
453.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
454.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
455.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
456.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
457	veor	q4,q6,q7		@ chaining value ^ last round key, for block 0
458	subs	r2,r2,#0x30		@ flags consumed by movlo and by bhs at the loop end
459	veor	q5,q2,q7		@ same for block 1
460	movlo	r6,r2			@ r6 is zero at this point
461.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
462.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
463.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
464.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
465.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
466.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
467	veor	q9,q3,q7		@ same for block 2
468	add	r0,r0,r6		@ r0 is adjusted in such way that
469					@ at exit from the loop q1-q10
470					@ are loaded with last "words"
471	vorr	q6,q11,q11		@ last ciphertext of this group becomes the chaining value
472	mov	r7,r3
473.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
474.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
475.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
476.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
477.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
478.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
479	vld1.8	{q2},[r0]!
480.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
481.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
482.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
483.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
484.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
485.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
486	vld1.8	{q3},[r0]!
487.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
488.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
489.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
490.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
491.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
492.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
493	vld1.8	{q11},[r0]!
494.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
495.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
496.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
497	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
498	add	r6,r5,#2
499	veor	q4,q4,q0
500	veor	q5,q5,q1
501	veor	q10,q10,q9
502	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
503	vst1.8	{q4},[r1]!
504	vorr	q0,q2,q2
505	vst1.8	{q5},[r1]!
506	vorr	q1,q3,q3
507	vst1.8	{q10},[r1]!
508	vorr	q10,q11,q11
509	bhs	.Loop3x_cbc_dec
510
511	cmn	r2,#0x30		@ r2 == -0x30 means no blocks remain
512	beq	.Lcbc_done
513	nop
514
515.Lcbc_dec_tail:
@ One or two blocks remain; only q1 and q10 are processed here.
516.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
517.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
518.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
519.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
520	vld1.32	{q8},[r7]!
521	subs	r6,r6,#2
522.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
523.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
524.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
525.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
526	vld1.32	{q9},[r7]!
527	bgt	.Lcbc_dec_tail
528
529.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
530.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
531.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
532.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
533.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
534.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
535.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
536.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
537.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
538.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
539.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
540.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
541	cmn	r2,#0x20		@ r2 == -0x20 means one block remains; consumed by beq .Lcbc_dec_one
542.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
543.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
544.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
545.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
546	veor	q5,q6,q7
547.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
548.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
549.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
550.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
551	veor	q9,q3,q7
552.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
553.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
554	beq	.Lcbc_dec_one
555	veor	q5,q5,q1
556	veor	q9,q9,q10
557	vorr	q6,q11,q11
558	vst1.8	{q5},[r1]!
559	vst1.8	{q9},[r1]!
560	b	.Lcbc_done
561
562.Lcbc_dec_one:
563	veor	q5,q5,q10
564	vorr	q6,q11,q11
565	vst1.8	{q5},[r1]!
566
567.Lcbc_done:
568	vst1.8	{q6},[r4]		@ write the chaining value back to the IV buffer
569.Lcbc_abort:
570	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
571	ldmia	sp!,{r4,r5,r6,r7,r8,pc}
572.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
@-----------------------------------------------------------------------
@ aes_v8_ctr32_encrypt_blocks -- AES-CTR with a 32-bit counter
@ In:    r0 = input, r1 = output, r2 = number of 16-byte blocks,
@        r3 = key schedule (round count at offset 240),
@        [sp] = pointer to the 16-byte counter block; its last 32-bit
@               word is the counter, stored most significant byte first
@ Out:   r2 blocks of input XOR keystream written to [r1]; the counter
@        block in memory is not written back
@ Saves and restores r4-r10, lr and d8-d15.
@ Register roles after setup:
@        q6 = counter block template, r8 = next counter value (native),
@        q0/q1/q10 = counter blocks in flight, q7 = last round key,
@        q12-q15 = the four round keys before the last
@ NOTE(review): a block count of zero falls into .Lctr32_tail and still
@ loads and stores data, so callers are expected to pass r2 >= 1.
@-----------------------------------------------------------------------
573.globl	aes_v8_ctr32_encrypt_blocks
574.type	aes_v8_ctr32_encrypt_blocks,%function
575.align	5
576aes_v8_ctr32_encrypt_blocks:
577	mov	ip,sp
578	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
579	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
580	ldr	r4, [ip]		@ load remaining arg
581	ldr	r5,[r3,#240]
582
583	ldr	r8, [r4, #12]
@ The counter block must sit in q0 in memory byte order, like every
@ other data block in this file.  A 32-bit element load byte-swaps each
@ word on big-endian targets, which would also disagree with the
@ rev + vmov.32 lane updates below, so load it bytewise there.
#ifdef __ARMEB__
	vld1.8	{q0},[r4]
#else
584	vld1.32	{q0},[r4]
#endif
585
586	vld1.32	{q8,q9},[r3]		@ load key schedule...
587	sub	r5,r5,#4
588	mov	r12,#16
589	cmp	r2,#2			@ flags consumed by movlo and bls below; nothing in between sets them
590	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
591	sub	r5,r5,#2
592	vld1.32	{q12,q13},[r7]!
593	vld1.32	{q14,q15},[r7]!
594	vld1.32	{q7},[r7]
595	add	r7,r3,#32
596	mov	r6,r5
597	movlo	r12,#0			@ fewer than two blocks: tail must not advance r0
598#ifndef __ARMEB__
599	rev	r8, r8			@ counter to native byte order for arithmetic
600#endif
601	vorr	q1,q0,q0
602	add	r10, r8, #1
603	vorr	q10,q0,q0
604	add	r8, r8, #2
605	vorr	q6,q0,q0		@ q6 = template for later counter blocks
606	rev	r10, r10
607	vmov.32	d3[1],r10		@ q1 = counter + 1
608	bls	.Lctr32_tail		@ at most two blocks in total
609	rev	r12, r8
610	sub	r2,r2,#3		@ bias
611	vmov.32	d21[1],r12		@ q10 = counter + 2
612	b	.Loop3x_ctr32
613
614.align	4
615.Loop3x_ctr32:
616.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
617.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
618.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
619.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
620.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
621.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
622	vld1.32	{q8},[r7]!
623	subs	r6,r6,#2
624.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
625.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
626.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
627.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
628.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
629.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
630	vld1.32	{q9},[r7]!
631	bgt	.Loop3x_ctr32
632
@ Remaining rounds.  The three states move to q4, q5 and q9 so that
@ q0, q1 and q10 can be refilled with the next three counter blocks.
633.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
634.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
635.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
636.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
637	vld1.8	{q2},[r0]!
638	vorr	q0,q6,q6
639.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
640.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
641	vld1.8	{q3},[r0]!
642	vorr	q1,q6,q6
643.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
644.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
645.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
646.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
647	vld1.8	{q11},[r0]!
648	mov	r7,r3
649.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
650.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
651	vorr	q10,q6,q6
652	add	r9,r8,#1
653.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
654.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
655.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
656.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
657	veor	q2,q2,q7		@ fold the last round key into the input blocks
658	add	r10,r8,#2
659.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
660.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
661	veor	q3,q3,q7
662	add	r8,r8,#3
663.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
664.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
665.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
666.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
667	veor	q11,q11,q7
668	rev	r9,r9
669.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
670.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
671	vmov.32	d1[1], r9
672	rev	r10,r10
673.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
674.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
675.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
676.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
677	vmov.32	d3[1], r10
678	rev	r12,r8
679.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
680.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
681	vmov.32	d21[1], r12
682	subs	r2,r2,#3		@ flags consumed by bhs at the loop end
683.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
684.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
685.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
686
687	veor	q2,q2,q4
688	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
689	vst1.8	{q2},[r1]!
690	veor	q3,q3,q5
691	mov	r6,r5
692	vst1.8	{q3},[r1]!
693	veor	q11,q11,q9
694	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
695	vst1.8	{q11},[r1]!
696	bhs	.Loop3x_ctr32
697
698	adds	r2,r2,#3		@ undo the bias: r2 = blocks still to do (0, 1 or 2)
699	beq	.Lctr32_done
700	cmp	r2,#1
701	mov	r12,#16
702	moveq	r12,#0			@ single block: tail must not advance r0
703
704.Lctr32_tail:
@ One or two blocks, using q0 and q1 only.
705.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
706.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
707.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
708.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
709	vld1.32	{q8},[r7]!
710	subs	r6,r6,#2
711.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
712.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
713.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
714.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
715	vld1.32	{q9},[r7]!
716	bgt	.Lctr32_tail
717
718.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
719.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
720.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
721.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
722.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
723.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
724.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
725.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
726	vld1.8	{q2},[r0],r12		@ r12 is 0 for a single block, so the next load rereads it
727.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
728.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
729.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
730.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
731	vld1.8	{q3},[r0]
732.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
733.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
734.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
735.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
736	veor	q2,q2,q7
737.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
738.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
739.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
740.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
741	veor	q3,q3,q7
742.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
743.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
744
745	cmp	r2,#1
746	veor	q2,q2,q0
747	veor	q3,q3,q1
748	vst1.8	{q2},[r1]!
749	beq	.Lctr32_done		@ single block: skip the second store
750	vst1.8	{q3},[r1]
751
752.Lctr32_done:
753	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
754	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
755.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
756#endif
757