/*	$NetBSD: chacha_neon.c,v 1.9 2023/08/07 01:07:36 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/endian.h>

#include <crypto/arch/arm/arm_neon.h>
#include <crypto/arch/arm/arm_neon_imm.h>
#include "chacha_neon.h"

/*
 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in practice
 * it hurts performance at least on Cortex-A8.
 */
#if 1
#define	vrolq_n_u32(x, n)	(vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - (n)))
#else
#define	vrolq_n_u32(x, n)	vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - (n))
#endif

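/*
 * rol16: rotate each 32-bit lane left by 16.  Swapping the two 16-bit
 * halves of every lane with VREV32.16 is exactly a 16-bit rotation and
 * avoids the shift/shift/or sequence.
 */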
static inline uint32x4_t
rol16(uint32x4_t x)
{
	uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);

	y16 = vrev32q_u16(x16);

	return vreinterpretq_u32_u16(y16);
}

static inline uint32x4_t
rol12(uint32x4_t x)
{

	return vrolq_n_u32(x, 12);
}

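/*
 * rol8: rotate each 32-bit lane left by 8.  On AArch64 this is a single
 * TBL byte shuffle; each group of table indices {3,0,1,2} moves the top
 * byte of a little-endian 32-bit lane to the bottom, which is a left
 * rotation by 8.  Elsewhere, fall back to the generic shift/or rotate.
 */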
static inline uint32x4_t
rol8(uint32x4_t x)
{
#if defined(__aarch64__)
	static const uint8x16_t rol8_tab = VQ_N_U8(
		  3, 0, 1, 2,  7, 4, 5, 6,
		 11, 8, 9,10, 15,12,13,14
	);
	uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);

	y8 = vqtbl1q_u8(x8, rol8_tab);

	return vreinterpretq_u32_u8(y8);
#elif 0
	/*
	 * GCC does a lousy job with this, spilling two 64-bit vector
	 * registers to the stack every time.  There should be plenty
	 * of vector registers free, requiring no spills at all, and
	 * GCC should be able to hoist the load of rol8_tab out of any
	 * loops, but it doesn't and so attempting to use VTBL hurts
	 * more than it helps.
	 */
	static const uint8x8_t rol8_tab = V_N_U8(
		 3, 0, 1, 2,  7, 4, 5, 6
	);

	uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);

	y64 = (uint64x2_t) {
		(uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
		(uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
	};

	return vreinterpretq_u32_u64(y64);
#else
	return vrolq_n_u32(x, 8);
#endif
}

static inline uint32x4_t
rol7(uint32x4_t x)
{

	return vrolq_n_u32(x, 7);
}

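/*
 * chacha_permute: run nr rounds (nr/2 double rounds) of the ChaCha
 * permutation on the 4x4 state held as four row vectors: *p0..*p3 are
 * rows 0..3, one 32-bit word per lane.  Each iteration does a column
 * round, then rotates rows 1..3 by 1, 2, and 3 lanes (VEXT) so the
 * diagonals line up in columns, does the diagonal round, and rotates
 * the rows back.  There is no feed-forward addition here; callers add
 * the input state themselves where the block function requires it.
 */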
static inline void
chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;
	uint32x4_t c0, c1, c2, c3;

	r0 = *p0;
	r1 = *p1;
	r2 = *p2;
	r3 = *p3;

	for (; nr > 0; nr -= 2) {
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);

		c0 = r0;
		c1 = vextq_u32(r1, r1, 1);
		c2 = vextq_u32(r2, r2, 2);
		c3 = vextq_u32(r3, r3, 3);

		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);

		r0 = c0;
		r1 = vextq_u32(c1, c1, 3);
		r2 = vextq_u32(c2, c2, 2);
		r3 = vextq_u32(c3, c3, 1);
	}

	*p0 = r0;
	*p1 = r1;
	*p2 = r2;
	*p3 = r3;
}

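/*
 * chacha_core_neon: compute one 64-byte ChaCha block.  The input state
 * is the 16-byte constant c, the 32-byte key k, and 16 bytes of
 * counter/nonce material in `in'; after nr rounds the initial state is
 * added back in (the feed-forward) and the result is stored to out.
 */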
void
chacha_core_neon(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t in0, in1, in2, in3;
	uint32x4_t r0, r1, r2, r3;

	r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c));
	r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
	r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
	r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0)));
	vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1)));
	vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2)));
	vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3)));
}

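/*
 * hchacha_neon: the HChaCha key-derivation function used by XChaCha.
 * Same permutation as the block function, but with no feed-forward
 * addition, and only the first and last rows of the output state
 * (words 0..3 and 12..15) are emitted as the 32-byte result.
 */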
void
hchacha_neon(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;

	r0 = vreinterpretq_u32_u8(vld1q_u8(c));
	r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
	r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
	r3 = vreinterpretq_u32_u8(vld1q_u8(in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u8(out + 0, vreinterpretq_u8_u32(r0));
	vst1q_u8(out + 16, vreinterpretq_u8_u32(r3));
}

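/*
 * chacha_stream_neon: generate n bytes of keystream starting at 64-byte
 * block number blkno.  Whole 256-byte chunks go through the four-way
 * parallel routine; the remainder is done one 64-byte block at a time,
 * bumping the counter lane of in3 after each full block.  A final
 * partial block is generated into an aligned stack buffer and copied
 * out, so vector stores never run past the end of s.
 */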
void
chacha_stream_neon(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	for (; n >= 256; s += 256, n -= 256, blkno += 4)
		chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);

	if (n) {
		const uint32x4_t blkno_inc = /* (1,0,0,0) */
		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
		in3 = (uint32x4_t) VQ_N_U32(
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		);

		for (; n; s += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vaddq_u32(r0, in0);
			r1 = vaddq_u32(r1, in1);
			r2 = vaddq_u32(r2, in2);
			r3 = vaddq_u32(r3, in3);

			if (n < 64) {
				uint8_t buf[64] __aligned(16);

				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));
				memcpy(s, buf, n);

				break;
			}

			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

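/*
 * chacha_stream_xor_neon: like chacha_stream_neon, but XOR the
 * keystream into the plaintext p to produce s (encryption and
 * decryption are the same operation).  The final partial block is
 * generated into a stack buffer and XORed out a word at a time, then a
 * byte at a time for any trailing bytes.
 */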
void
chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
		chacha_stream_xor256_neon(s, p, blkno, nonce, k,
		    chacha_const32, nr);

	if (n) {
		const uint32x4_t blkno_inc = /* (1,0,0,0) */
		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
		in3 = (uint32x4_t) VQ_N_U32(
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		);

		for (; n; s += 64, p += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vaddq_u32(r0, in0);
			r1 = vaddq_u32(r1, in1);
			r2 = vaddq_u32(r2, in2);
			r3 = vaddq_u32(r3, in3);

			if (n < 64) {
				uint8_t buf[64] __aligned(16);
				unsigned i;

				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));

				for (i = 0; i < n - n%4; i += 4)
					le32enc(s + i,
					    le32dec(p + i) ^ le32dec(buf + i));
				for (; i < n; i++)
					s[i] = p[i] ^ buf[i];

				break;
			}

			r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0));
			r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16));
			r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32));
			r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48));
			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

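/*
 * xchacha_stream_neon: XChaCha takes a 24-byte nonce.  HChaCha over the
 * first 16 nonce bytes derives a 32-byte subkey; the stream is then
 * ordinary ChaCha under that subkey with a 12-byte subnonce of four
 * zero bytes followed by the remaining 8 nonce bytes.
 */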
void
xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
}

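/*
 * xchacha_stream_xor_neon: XChaCha encryption/decryption, with the same
 * subkey/subnonce derivation as xchacha_stream_neon.
 */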
void
xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
}