/*	$NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <arm/asm.h>

RCSID("$NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $")

	.fpu	neon

	.text
	.p2align 2
.Lconstants_addr:
	.long	.Lconstants - .

	.section .rodata
	.p2align 5
.Lconstants:

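/*
 * Annotation (added for readability; only the byte values below are
 * authoritative): these are the lookup tables of a vector-permutation
 * ("vpaes"-style) AES, consumed four bits at a time with vtbl:
 *
 *	inv, inva	nibble tables used to build the GF(2^8) inversion
 *			inside SubBytes/InvSubBytes
 *	mc		forward/backward byte rotations for MixColumns,
 *			one 32-byte pair per round index mod 4
 *	sr		ShiftRows byte permutations, one per round index mod 4
 *	ipt		input transform into the basis the cipher works in
 *	sb1, sb2, sbo	SubBytes output tables; sbo is the last-round
 *			variant that also maps back out of that basis
 *	dipt, dsb9, dsbd, dsbb, dsbe, dsbo
 *			the corresponding tables for the inverse cipher;
 *			the 9/d/b/e names appear to track the InvMixColumns
 *			coefficients 0x09, 0x0d, 0x0b, 0x0e folded into the
 *			lookups
 */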
.Linv_inva:	/* inv and inva must be consecutive */
	.type	inv,_ASM_TYPE_OBJECT
inv:
	.byte	0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
	.byte	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
END(inv)

	.type	inva,_ASM_TYPE_OBJECT
inva:
	.byte	0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
	.byte	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
END(inva)

	.type	mc,_ASM_TYPE_OBJECT
mc:
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 forward */
	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 backward */
	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E
	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 forward */
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00
	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 backward */
	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A
	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 forward */
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04
	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 backward */
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06
.Lmc_forward_3:
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 forward */
	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 backward */
	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
END(mc)

	.type	sr,_ASM_TYPE_OBJECT
sr:
	.byte	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07	/* 0 */
	.byte	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F

	.byte	0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03	/* 1 */
	.byte	0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B

	.byte	0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F	/* 2 */
	.byte	0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07

	.byte	0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B	/* 3 */
	.byte	0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
END(sr)

	.type	ipt,_ASM_TYPE_OBJECT
ipt:
	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2	/* lo */
	.byte	0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C /* hi */
	.byte	0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
END(ipt)

	.type	sb1,_ASM_TYPE_OBJECT
sb1:
	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 /* 0 */
	.byte	0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 /* 1 */
	.byte	0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
END(sb1)

	.type	sb2,_ASM_TYPE_OBJECT
sb2:
	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 /* 0 */
	.byte	0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 /* 1 */
	.byte	0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
END(sb2)

	.type	sbo,_ASM_TYPE_OBJECT
sbo:
	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 /* 0 */
	.byte	0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF /* 1 */
	.byte	0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
END(sbo)

	.type	dipt,_ASM_TYPE_OBJECT
dipt:
	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F	/* lo */
	.byte	0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86	/* hi */
	.byte	0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
END(dipt)

	.type	dsb9,_ASM_TYPE_OBJECT
dsb9:
	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85	/* 0 */
	.byte	0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0	/* 1 */
	.byte	0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
END(dsb9)

	.type	dsbd,_ASM_TYPE_OBJECT
dsbd:
	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D	/* 0 */
	.byte	0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C	/* 1 */
	.byte	0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
END(dsbd)

	.type	dsbb,_ASM_TYPE_OBJECT
dsbb:
	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0	/* 0 */
	.byte	0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
	.byte	0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1	/* 1 */
	.byte	0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
END(dsbb)

	.type	dsbe,_ASM_TYPE_OBJECT
dsbe:
	.byte	0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46	/* 0 */
	.byte	0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
	.byte	0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C	/* 1 */
	.byte	0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
END(dsbe)

	.type	dsbo,_ASM_TYPE_OBJECT
dsbo:
	.byte	0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13	/* 0 */
	.byte	0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
	.byte	0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12	/* 1 */
	.byte	0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
END(dsbo)

/*
 * aes_neon_enc1(enc, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 * uint8x16_t@q0
 * aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
 *     unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 * uint8x16_t@(r0,r1,r2,r3)
 * aes_neon_enc1(const struct aesenc *enc@r0,
 *     uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
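/*
 * Annotation, a C-like sketch of the computation (rk[] denotes the
 * expanded round keys in *enc; the code below interleaves these steps
 * in the vpaes transformed basis rather than performing them literally):
 *
 *	x ^= rk[0];
 *	for (i = 1; i < nrounds; i++)
 *		x = MixColumns(ShiftRows(SubBytes(x))) ^ rk[i];
 *	return ShiftRows(SubBytes(x)) ^ rk[nrounds];	(last round: no MixColumns)
 */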
ENTRY(aes_neon_enc1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r8, r10, lr}
	vpush	{d8-d15}

	/*
	 * r3: rmod4
	 * r4: mc
	 * r6,r8,r10,ip: temporaries
	 * q0={d0-d1}: x/ak/A
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: iptlo
	 * q5={d10-d11}: ipthi
	 * q6={d12-d13}: sb1[0]/sbo[0]
	 * q7={d14-d15}: sb1[1]/sbo[1]
	 * q8={d16-d17}: sb2[0]
	 * q9={d18-d19}: sb2[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc[rmod4].forward
	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc[rmod4].backward
	 * q14={d28-d29}: rk/A2/A2_B_D
	 * q15={d30-d31}: A2_B/sr[rmod4]
	 */
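	/*
	 * Annotation on the notation above and below (also applies to
	 * aes_neon_dec1): `+' in the comments means XOR (veor), and
	 * `f(v)' means a byte-wise table lookup (vtbl) of v through the
	 * 16-byte table f (out-of-range index bytes read as zero).
	 * rmod4 is the round number mod 4; it selects the mc and sr rows.
	 */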

	/* ip := .Lconstants - .Lconstants_addr, r10 := .Lconstants_addr */
	ldr	ip, .Lconstants_addr
	adr	r10, .Lconstants_addr

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
	movw	r3, #0			/* r3 := rmod4 = 0 */
	vmov.i8	q1, #0x0f

	/* ip := .Lconstants */
	add	ip, ip, r10

	/* (q4, q5) := (iptlo, ipthi) */
	add	r6, ip, #(ipt - .Lconstants)
	vld1.8	{q4-q5}, [r6 :256]

	/* load the rest of the constants */
	add	r4, ip, #(sb1 - .Lconstants)
	add	r6, ip, #(sb2 - .Lconstants)
	add	r8, ip, #(.Linv_inva - .Lconstants)
	vld1.8	{q6-q7}, [r4 :256]	/* q6 = sb1[0], q7 = sb1[1] */
	vld1.8	{q8-q9}, [r6 :256]	/* q8 = sb2[0], q9 = sb2[1] */
	vld1.8	{q10-q11}, [r8 :256]	/* q10 = inv, q11 = inva */

	/* r4 := mc */
	add	r4, ip, #(mc - .Lconstants)

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (iptlo(lo), ipthi(hi)) */
	vtbl.8	d4, {q4}, d4
	vtbl.8	d5, {q4}, d5
	vtbl.8	d6, {q5}, d6
	vtbl.8	d7, {q5}, d7

	/* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

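	/*
	 * Annotation on control flow: the loop is rotated.  We branch
	 * straight into the SubBytes half at 2:; each pass through 1:
	 * then finishes the previous round (sb1 outputs + round key,
	 * followed by the sb2 lookups and mc rotations that implement
	 * MixColumns) before falling through into SubBytes for the next
	 * round.  The last round is completed after the loop using sbo,
	 * rk[nrounds], and the ShiftRows permutation sr[rmod4].
	 */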
	_ALIGN_TEXT
1:	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
	vtbl.8	d24, {q6}, d4
	vtbl.8	d25, {q6}, d5
	vtbl.8	d26, {q7}, d6
	vtbl.8	d27, {q7}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := A2 = sb2_0(io) + sb2_1(jo) */
	vtbl.8	d24, {q8}, d4
	vtbl.8	d25, {q8}, d5
	vtbl.8	d26, {q9}, d6
	vtbl.8	d27, {q9}, d7
	add	r6, r4, r3, lsl #5	/* r6 := &mc[rmod4] */
	veor	q14, q12, q13

	/* (q12, q13) := (mc[rmod4].forward, mc[rmod4].backward) */
	vld1.8	{q12-q13}, [r6 :256]

	/* q15 := A2_B = A2 + A(mcf) */
	vtbl.8	d30, {q0}, d24
	vtbl.8	d31, {q0}, d25
	veor	q15, q15, q14

	/* q14 := A2_B_D = A2_B + A(mcb) */
	vtbl.8	d28, {q0}, d26
	vtbl.8	d29, {q0}, d27
	veor	q14, q14, q15

	/* q0 := x = A2_B_D + A2_B(mcf) */
	vtbl.8	d0, {q15}, d24
	vtbl.8	d1, {q15}, d25
	veor	q0, q0, q14

2:	/*
	 * SubBytes
	 */
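	/*
	 * Annotation: this is the vpaes nibble decomposition used for
	 * SubBytes (the same block appears again in aes_neon_dec1).  x is
	 * split into low nibbles (k, in q2) and high nibbles (i, in q3);
	 * the inva/inv tables are applied with vtbl, and the chain of
	 * lookups and XORs below produces io and jo, which then index the
	 * sb1/sb2 output tables at 1: (or sbo after the loop).
	 */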

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {q11}, d4
	vtbl.8	d1, {q11}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {q10}, d6
	vtbl.8	d25, {q10}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {q10}, d4
	vtbl.8	d27, {q10}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {q10}, d24
	vtbl.8	d25, {q10}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {q10}, d26
	vtbl.8	d27, {q10}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	add	r3, r3, #1
	subs	r1, r1, #1
	and	r3, r3, #3
	bne	1b

	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
	add	r8, ip, #(sr - .Lconstants)
	add	r6, ip, #(sbo - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{q6-q7}, [r6 :256]
	vld1.8	{q15}, [r8 :128]

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
	vtbl.8	d4, {q6}, d4
	vtbl.8	d5, {q6}, d5
	vtbl.8	d6, {q7}, d6
	vtbl.8	d7, {q7}, d7

	/* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[rmod4]) */
	vtbl.8	d0, {q2}, d30
	vtbl.8	d1, {q2}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r8, r10, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_enc1)

/*
 * aes_neon_dec1(dec, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 * uint8x16_t@q0
 * aes_neon_dec1(const struct aesdec *dec@r0, uint8x16_t x@q0,
 *     unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 * uint8x16_t@(r0,r1,r2,r3)
 * aes_neon_dec1(const struct aesdec *dec@r0,
 *     uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
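/*
 * Annotation, a C-like sketch of the computation (rk[] denotes the
 * decryption round-key schedule in *dec, read sequentially below; how it
 * is prepared is outside this file).  This is the usual equivalent
 * inverse cipher shape, which the code realizes in the vpaes transformed
 * basis:
 *
 *	x ^= rk[0];
 *	for (i = 1; i < nrounds; i++)
 *		x = InvMixColumns(InvSubBytes(InvShiftRows(x))) ^ rk[i];
 *	return InvSubBytes(InvShiftRows(x)) ^ rk[nrounds];
 */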
ENTRY(aes_neon_dec1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r8, r10, lr}
	vpush	{d8-d15}

	/*
	 * r3: 3 & ~(nrounds - 1)
	 * r4: dsbd
	 * r5: dsbe
	 * r6,r8,r10,ip: temporaries
	 * q0={d0-d1}: x/ak
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: diptlo/dsb9[0]
	 * q5={d10-d11}: dipthi/dsb9[1]
	 * q6={d12-d13}: dsbb[0]/dsbo[0]
	 * q7={d14-d15}: dsbb[1]/dsbo[1]
	 * q8={d16-d17}: dsbd[0]/dsbe[0]
	 * q9={d18-d19}: dsbd[1]/dsbe[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/dsbX_0(io)
	 * q13={d26-d27}: jr/jak/jakr/dsbX_1(jo)
	 * q14={d28-d29}: rk/xmc
	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
	 */

	/* ip := .Lconstants - .Lconstants_addr, r10 := .Lconstants_addr */
	ldr	ip, .Lconstants_addr
	adr	r10, .Lconstants_addr

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
	rsb	r3, r1, #0		/* r3 := -nrounds = ~(nrounds - 1) */
	vmov.i8	q1, #0x0f
	and	r3, r3, #3		/* r3 := 3 & ~(nrounds - 1) */

	/* ip := .Lconstants */
	add	ip, ip, r10

	/* (q4, q5) := (diptlo, dipthi) */
	add	r6, ip, #(dipt - .Lconstants)
	vld1.8	{q4-q5}, [r6 :256]

	/* load the rest of the constants */
	add	r4, ip, #(dsbb - .Lconstants)
	add	r6, ip, #(.Linv_inva - .Lconstants)
	add	r8, ip, #(.Lmc_forward_3 - .Lconstants)
	vld1.8	{q6-q7}, [r4 :256]	/* q6 := dsbb[0], q7 := dsbb[1] */
	vld1.8	{q10-q11}, [r6 :256]	/* q10 := inv, q11 := inva */
	vld1.8	{q15}, [r8 :128]	/* q15 := mc[3].forward */

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (diptlo(lo), dipthi(hi)) */
	vtbl.8	d4, {q4}, d4
	vtbl.8	d5, {q4}, d5
	vtbl.8	d6, {q5}, d6
	vtbl.8	d7, {q5}, d7

	/* load dsb9 */
	add	r4, ip, #(dsb9 - .Lconstants)
	vld1.8	{q4-q5}, [r4 :256]	/* q4 := dsb9[0], q5 := dsb9[1] */

	/* r4 := dsbd, r5 := dsbe */
	add	r4, ip, #(dsbd - .Lconstants)
	add	r5, ip, #(dsbe - .Lconstants)

	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	/* load dsbd */
	vld1.8	{q8-q9}, [r4 :256]	/* q8 := dsbd[0], q9 := dsbd[1] */

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
	vtbl.8	d24, {q4}, d4
	vtbl.8	d25, {q4}, d5
	vtbl.8	d26, {q5}, d6
	vtbl.8	d27, {q5}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := x(mc) */
	vtbl.8	d28, {q0}, d30
	vtbl.8	d29, {q0}, d31

	/* q0 := x(mc) + dsbd_0(io) + dsbd_1(jo) */
	vtbl.8	d24, {q8}, d4
	vtbl.8	d25, {q8}, d5
	vtbl.8	d26, {q9}, d6
	vtbl.8	d27, {q9}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* load dsbe */
	vld1.8	{q8-q9}, [r5 :256]	/* q8 := dsbe[0], q9 := dsbe[1] */

	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
	vtbl.8	d28, {q0}, d30
	vtbl.8	d29, {q0}, d31
	vtbl.8	d24, {q6}, d4
	vtbl.8	d25, {q6}, d5
	vtbl.8	d26, {q7}, d6
	vtbl.8	d27, {q7}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q0 := x(mc) + dsbe_0(io) + dsbe_1(jo) */
	vtbl.8	d28, {q0}, d30
	vtbl.8	d29, {q0}, d31
	vtbl.8	d24, {q8}, d4
	vtbl.8	d25, {q8}, d5
	vtbl.8	d26, {q9}, d6
	vtbl.8	d27, {q9}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q15 := mc := mc <<< 12*8 */
	vext.8	q15, q15, q15, #12

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {q11}, d4
	vtbl.8	d1, {q11}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {q10}, d6
	vtbl.8	d25, {q10}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {q10}, d4
	vtbl.8	d27, {q10}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {q10}, d24
	vtbl.8	d25, {q10}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {q10}, d26
	vtbl.8	d27, {q10}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	subs	r1, r1, #1
	bne	1b

	/* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[3 & ~(nrounds - 1)]) */
	add	r8, ip, #(sr - .Lconstants)
	add	r6, ip, #(dsbo - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{q6-q7}, [r6 :256]
	vld1.8	{q15}, [r8 :128]

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
	vtbl.8	d4, {q6}, d4
	vtbl.8	d5, {q6}, d5
	vtbl.8	d6, {q7}, d6
	vtbl.8	d7, {q7}, d7

	/* q2 := x = rk[nr] + dsbo_0(io) + dsbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[3 & ~(nrounds - 1)]) */
	vtbl.8	d0, {q2}, d30
	vtbl.8	d1, {q2}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r8, r10, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_dec1)
