/*	$NetBSD: arm_neon.h,v 1.2 2023/08/07 01:14:19 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_SYS_CRYPTO_ARCH_ARM_ARM_NEON_H
#define	_SYS_CRYPTO_ARCH_ARM_ARM_NEON_H

#if defined(__GNUC__) && !defined(__clang__)

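/*
 * Attributes for the intrinsic wrappers below: force them to be inlined
 * and keep them out of debug stepping (`artificial' under GCC, `nodebug'
 * under clang), so they behave like compiler-provided intrinsics.
 */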
#define	_INTRINSATTR							      \
	__extension__							      \
	__attribute__((__always_inline__, __gnu_inline__, __artificial__))

#ifdef __aarch64__
typedef __Int32x4_t int32x4_t;
typedef __Int64x2_t int64x2_t;
typedef __Int8x16_t int8x16_t;
typedef __Uint16x8_t uint16x8_t;
typedef __Uint32x4_t uint32x4_t;
typedef __Uint64x2_t uint64x2_t;
typedef __Uint8x16_t uint8x16_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#else
typedef __simd128_int32_t int32x4_t;
typedef __simd128_int64_t int64x2_t;
typedef __simd128_int8_t int8x16_t;
typedef __simd128_uint16_t uint16x8_t;
typedef __simd128_uint32_t uint32x4_t;
typedef __simd128_uint64_t uint64x2_t;
typedef __simd128_uint8_t uint8x16_t;

typedef __simd64_int8_t int8x8_t;
typedef __simd64_uint8_t uint8x8_t;
typedef __builtin_neon_udi uint64x1_t;
typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#endif

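/*
 * Map an architectural lane number onto the element index used with the
 * GCC vector extensions.  The two numberings coincide on little-endian
 * targets but differ on big-endian ones (and differ again between
 * AArch64 BE and 32-bit BE, where a q register is handled as a pair of
 * d registers).  The `laneq' variant is for 128-bit (q) vectors.
 */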
#if defined(__AARCH64EB__)
#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#elif defined(__ARM_BIG_ENDIAN)
#define	__neon_lane_index(__v, __i)	((__i) ^ (__arraycount(__v) - 1))
#define	__neon_laneq_index(__v, __i)	((__i) ^ (__arraycount(__v)/2 - 1))
#else
#define	__neon_lane_index(__v, __i)	(__i)
#define	__neon_laneq_index(__v, __i)	(__i)
#endif

#elif defined(__clang__)

#define	_INTRINSATTR							      \
	__attribute__((__always_inline__, __nodebug__))

typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;

typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;

typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;

typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;

typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;

#ifdef __LITTLE_ENDIAN__
#define	__neon_lane_index(__v, __i)	__i
#define	__neon_laneq_index(__v, __i)	__i
#else
#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - __i)
#endif
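
/*
 * The clang implementations below use the generic __builtin_neon_*_v
 * builtins, whose trailing integer argument encodes the vector type
 * (clang's NeonTypeFlags): the low bits select the element type, 0x10
 * marks it unsigned, and 0x20 marks a 128-bit `q' vector.  Hence
 * 16 = uint8x8, 34 = int32x4, 48 = uint8x16, and 50 = uint32x4.
 */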

#else

#error Teach me how to neon in your compile!

#endif

_INTRINSATTR
static __inline uint32x4_t
vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
{
	return __v0 + __v1;
}

_INTRINSATTR
static __inline uint32x4_t
vcltq_s32(int32x4_t __v0, int32x4_t __v1)
{
	return (uint32x4_t)(__v0 < __v1);
}

_INTRINSATTR
static __inline int32x4_t
vdupq_n_s32(int32_t __x)
{
	return (int32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint32x4_t
vdupq_n_u32(uint32_t __x)
{
	return (uint32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint8x16_t
vdupq_n_u8(uint8_t __x)
{
	return (uint8x16_t) {
		__x, __x, __x, __x, __x, __x, __x, __x,
		__x, __x, __x, __x, __x, __x, __x, __x,
	};
}

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
{
#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
	return __builtin_shuffle(__hi, __lo,
	    (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vextq_u32(__lo, __hi, __i)					      \
	(uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		      \
	    (int8x16_t)(__hi), (__i), 50)
#else
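/*
 * Big-endian: reverse the lanes of both inputs, do the extraction on the
 * reversed copies, and reverse the result back, so the selected lanes
 * match the little-endian semantics.  The other big-endian clang paths
 * below use the same trick.
 */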
#define	vextq_u32(__lo, __hi, __i) (					      \
{									      \
	uint32x4_t __tlo = (__lo);					      \
	uint32x4_t __thi = (__hi);					      \
	uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0);   \
	uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0);   \
	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	      \
	    (int8x16_t)__hi_r, __i, 50);				      \
	__builtin_shufflevector(__r, __r, 3,2,1,0);			      \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
{
#ifdef __aarch64__
#if defined(__AARCH64EB__)
	return __builtin_shuffle(__hi, __lo,
	    (uint8x16_t) {
		16 - __i, 17 - __i, 18 - __i, 19 - __i,
		20 - __i, 21 - __i, 22 - __i, 23 - __i,
		24 - __i, 25 - __i, 26 - __i, 27 - __i,
		28 - __i, 29 - __i, 30 - __i, 31 - __i,
	});
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint8x16_t) {
		__i +  0, __i +  1, __i +  2, __i +  3,
		__i +  4, __i +  5, __i +  6, __i +  7,
		__i +  8, __i +  9, __i + 10, __i + 11,
		__i + 12, __i + 13, __i + 14, __i + 15,
	});
#endif
#else
	return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo,
	    (int8x16_t)__hi, __i);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vextq_u8(__lo, __hi, __i)					      \
	(uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		      \
	    (int8x16_t)(__hi), (__i), 48)
#else
#define	vextq_u8(__lo, __hi, __i) (					      \
{									      \
	uint8x16_t __tlo = (__lo);					      \
	uint8x16_t __thi = (__hi);					      \
	uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo,	      \
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
	uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi,	      \
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	      \
	    (int8x16_t)__hi_r, (__i), 48);				      \
	__builtin_shufflevector(__r, __r,				      \
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32_t
vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
{
#ifdef __aarch64__
	return __v[__neon_laneq_index(__v, __i)];
#else
	return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
#endif
}
#elif defined(__clang__)
#define	vgetq_lane_u32(__v, __i)					      \
	(uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v),	      \
	    __neon_laneq_index(__v, __i))
#endif

_INTRINSATTR
static __inline uint32x4_t
vld1q_u32(const uint32_t *__p32)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_si *__p =
	    (const __builtin_aarch64_simd_si *)__p32;

	return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
#else
	const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;

	return (uint32x4_t)__builtin_neon_vld1v4si(__p);
#endif
#elif defined(__clang__)
	uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vld1q_u8(const uint8_t *__p8)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_qi *__p =
	    (const __builtin_aarch64_simd_qi *)__p8;

	return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
#else
	const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;

	return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
#endif
#elif defined(__clang__)
	uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
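	/*
	 * Single-register TBL: byte i of the result is __tab[__idx[i]],
	 * or 0 if __idx[i] is out of range.
	 */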
	uint8x16_t __res;
	__asm__("tbl %0.16b, {%1.16b}, %2.16b"
	    : "=w"(__res) : "w"(__tab), "w"(__idx));
	return __res;
#else
	/*
	 * No native ARMv7 NEON instruction for this, so do it via two
	 * half-width TBLs instead (vtbl2_u8 equivalent).
	 */
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
	union {
		uint8x8x2_t __u8x8x2;
		__builtin_neon_ti __ti;
	} __u = { __tab8x8x2 };
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
	__outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
	__out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };

	return (uint8x16_t)__out64;
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	uint8x16_t __r;
#ifdef __aarch64__
	__r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, (int8x16_t)__idx, 48);
#else
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxlo, 16);
	__outhi = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxhi, 16);
	__out64 = (uint64x2_t) { (uint64_t)__outlo, (uint64_t)__outhi };
	__r = (uint8x16_t)__out64;
#endif
#ifndef __LITTLE_ENDIAN__
	__r = __builtin_shufflevector(__r, __r,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __r;
#endif
}

_INTRINSATTR
static __inline int32x4_t
vreinterpretq_s32_u8(uint8x16_t __v)
{
	return (int32x4_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vreinterpretq_u16_u32(uint32x4_t __v)
{
	return (uint16x8_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u16(uint16x8_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u64(uint64x2_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u8(uint8x16_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u32(uint32x4_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u8(uint8x16_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_s32(int32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u32(uint32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u64(uint64x2_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vrev32q_u16(uint16x8_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v,  1,0, 3,2, 5,4, 7,6);
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vrev32q_u8(uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v,
	    (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v,
	    3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
#endif
}

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define	vsetq_lane_u32(__x, __v, __i)					      \
	(uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v),    \
	    __neon_laneq_index(__v, __i))
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint64x2_t
vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define	vsetq_lane_u64(__x, __v, __i)					      \
	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v),    \
	    __neon_laneq_index(__v, __i))
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vshlq_n_s32(int32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ashlv4si(__v, __bits);
#else
	return (int32x4_t)__builtin_neon_vshl_nv4si(__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshlq_n_s32(__v, __bits)					      \
	(int32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 34)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
#else
	return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshlq_n_u32(__v, __bits)					      \
	(uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
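	/*
	 * GCC 12 and later take the unsigned (`_uus') form of the shift
	 * builtin; older GCC is given the signed builtin with casts.
	 * vshrq_n_u8 below follows the same pattern.
	 */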
#  if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv4si_uus(__v, __bits);
#  else
	return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
#  endif
#else
	return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshrq_n_u32(__v, __bits)					      \
	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
{
#ifdef __aarch64__
#  if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv16qi_uus(__v, __bits);
#  else
	return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
#  endif
#else
	return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshrq_n_u8(__v, __bits)						      \
	(uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
#else
	return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vsliq_n_s32(__vins, __vsh, __bits)				      \
	(int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins),	      \
	    (int32x4_t)(__vsh), (__bits), 34)
#else
#define	vsliq_n_s32(__vins, __vsh, __bits) (				      \
{									      \
	int32x4_t __tvins = (__vins);					      \
	int32x4_t __tvsh = (__vsh);					      \
	uint8_t __tbits = (__bits);					      \
	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	      \
	    3,2,1,0);							      \
	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	      \
	    3,2,1,0);							      \
	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits,  \
	    34);							      \
	__builtin_shufflevector(__r, __r, 3,2,1,0);			      \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
#else
	return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
	    (int32x4_t)__vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vsriq_n_u32(__vins, __vsh, __bits)				      \
	(uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),	      \
	    (int32x4_t)(__vsh), (__bits), 34)
#else
#define	vsriq_n_u32(__vins, __vsh, __bits) (				      \
{									      \
	uint32x4_t __tvins = (__vins);					      \
	uint32x4_t __tvsh = (__vsh);					      \
	uint8_t __tbits = (__bits);					      \
	uint32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	      \
	    3,2,1,0);							      \
	uint32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	      \
	    3,2,1,0);							      \
	uint32x4_t __r = (uint32x4_t)__builtin_neon_vsriq_n_v(		      \
	    (int32x4_t)__vins_r, (int32x4_t)__vsh_r, __tbits, 34);	      \
	__builtin_shufflevector(__r, __r, 3,2,1,0);			      \
})
#endif
#endif

_INTRINSATTR
static __inline void
vst1q_u32(uint32_t *__p32, uint32x4_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;

	__builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
#else
	__builtin_neon_si *__p = (__builtin_neon_si *)__p32;

	__builtin_neon_vst1v4si(__p, (int32x4_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p32, __v, 50);
#endif
}

_INTRINSATTR
static __inline void
vst1q_u8(uint8_t *__p8, uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;

	__builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
#else
	__builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;

	__builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p8, __v, 48);
#endif
}

#ifndef __aarch64__		/* XXX */
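
/*
 * 64-bit (d-register) table lookups, provided only for 32-bit ARM: the
 * GCC path above declares the 64-bit vector types only in that case,
 * and AArch64 has the full-width vqtbl1q_u8 above.
 */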

_INTRINSATTR
static __inline uint8x8_t
vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
	    (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
	    (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

_INTRINSATTR
static __inline uint8x8_t
vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	union {
		uint8x8x2_t __u8x8x2;
		__builtin_neon_ti __ti;
	} __u = { __tab };
	return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
	    7,6,5,4,3,2,1,0);
	__tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
	    7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
	    (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

#endif	/* !defined(__aarch64__) */

#endif	/* _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H */