/*	$NetBSD: immintrin.h,v 1.1 2023/08/07 01:07:36 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
#define	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H

#include <sys/types.h>

/*
 * This kludgerous header file provides definitions for the Intel
 * intrinsics that work with GCC and Clang, because <immintrin.h> is
 * not available during the kernel build and arranging to make it
 * available is complicated.  Please fix this properly!
 */

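/*
 * A quick sketch of typical use, for orientation (illustrative
 * only; in, key, and out stand for arbitrary byte buffers, not
 * names from this file).  Both GCC and Clang allow ^ directly on
 * the vector types defined below:
 *
 *	__m128i x = _mm_loadu_si128((const __m128i_u *)in);
 *	__m128i k = _mm_loadu_si128((const __m128i_u *)key);
 *
 *	_mm_storeu_si128((__m128i_u *)out, x ^ k);
 */
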
#if defined(__GNUC__) && !defined(__clang__)

#define	_INTRINSATTR							      \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#define	_PACKALIAS

typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#elif defined(__clang__)

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#define	_INTRINSATTR							      \
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"),    \
		__min_vector_width__(128)))
#define	_PACKALIAS							      \
	__attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif

#define	_SSSE3_ATTR	__attribute__((__target__("ssse3")))

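/*
 * _mm_add_epi32(a, b): Add corresponding 32-bit lanes (PADDD).
 * The __v4su (unsigned) casts make lane overflow well-defined
 * wraparound rather than signed-overflow UB.
 */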
_INTRINSATTR
static __inline __m128i
_mm_add_epi32(__m128i __a, __m128i __b)
{
	return (__m128i)((__v4su)__a + (__v4su)__b);
}

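/*
 * _mm_alignr_epi8(hi, lo, bytes): Concatenate hi:lo as a 256-bit
 * value and extract the 16 bytes starting `bytes' up from the
 * bottom (PALIGNR).  A macro because the shift must be a
 * compile-time constant; GCC's builtin counts bits, hence the
 * 8*(int)(bytes), while Clang's counts bytes.
 */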
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					      \
	(__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi),	      \
	    (__v2di)(__m128i)(lo), 8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					      \
	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi),	      \
	    (__v16qi)(__m128i)(lo), (int)(bytes))
#endif

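/*
 * _mm_load1_ps(p): Load the float at p, broadcast to all four
 * lanes.
 */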
_INTRINSATTR
static __inline __m128
_mm_load1_ps(const float *__p)
{
	return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
}

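/*
 * _mm_loadu_si128/_mm_loadu_si32/_mm_loadu_si64: Loads with no
 * alignment requirement, done through a wrapper struct (see
 * _PACKALIAS and __m128i_u above) so the compiler does not assume
 * natural alignment.  The 32- and 64-bit variants zero the upper
 * lanes.
 */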
_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si32(const void *__p)
{
	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si64(const void *__p)
{
	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
	return __extension__ (__m128i)(__v2di){ __v, 0 };
}

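/*
 * _mm_load_si128(p): 128-bit load; p must be 16-byte aligned.
 */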
_INTRINSATTR
static __inline __m128i
_mm_load_si128(const __m128i *__p)
{
	return *__p;
}

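/*
 * _mm_movehl_ps(a, b) = {b[2], b[3], a[2], a[3]} (MOVHLPS);
 * _mm_movelh_ps(a, b) = {a[0], a[1], b[0], b[1]} (MOVLHPS).
 */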
_INTRINSATTR
static __inline __m128
_mm_movehl_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}

_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
#endif
}

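/*
 * _mm_set1_*: Broadcast one scalar to every lane.  Note that
 * _mm_set_epi32 and _mm_set_epi64x take their arguments from the
 * most significant element down, so the initializers below list
 * them in reversed (element) order.
 */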
_INTRINSATTR
static __inline __m128i
_mm_set1_epi16(int16_t __v)
{
	return __extension__ (__m128i)(__v8hi){
	    __v, __v, __v, __v, __v, __v, __v, __v
	};
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi32(int32_t __v)
{
	return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi64x(int64_t __v)
{
	return __extension__ (__m128i)(__v2di){ __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
{
	return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi64x(int64_t __v1, int64_t __v0)
{
	return __extension__ (__m128i)(__v2di){ __v0, __v1 };
}

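/*
 * _mm_setzero_ps/_mm_setzero_si128: All-zero vectors.
 */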
_INTRINSATTR
static __inline __m128
_mm_setzero_ps(void)
{
	return __extension__ (__m128){ 0, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_setzero_si128(void)
{
	return _mm_set1_epi64x(0);
}

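/*
 * _mm_shuffle_epi8(tbl, idx): Byte table lookup (PSHUFB): each
 * result byte i is tbl[idx[i] & 0xf], or zero if bit 7 of idx[i]
 * is set.  Needs _SSSE3_ATTR because PSHUFB is an SSSE3
 * instruction.  E.g. (an illustrative use, not taken from this
 * file), idx = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607,
 * 0x00010203) byte-swaps each 32-bit lane.
 */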
_INTRINSATTR _SSSE3_ATTR
static __inline __m128i
_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
{
	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
	    (__v16qi)__vidx);
}

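/*
 * _mm_shuffle_epi32(v, m): Permute the 32-bit lanes of v (PSHUFD),
 * two bits of the immediate m selecting each result lane.
 * _mm_shuffle_ps(x, y, m): Likewise, but the low two result lanes
 * are drawn from x and the high two from y (SHUFPS).  Macros, not
 * inline functions, because the builtins require a constant
 * immediate.
 */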
#define	_mm_shuffle_epi32(v,m)						      \
	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))

#define	_mm_shuffle_ps(x,y,m)						      \
	(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x),		      \
	    (__v4sf)(__m128)(y), (int)(m))

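/*
 * _mm_slli_epi32/64 and _mm_srli_epi32/64 (further below): Shift
 * each lane left/right by a bit count.  Following the SSE2 shift
 * semantics, counts of at least the lane width yield zero instead
 * of being truncated as a C shift would be.
 */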
_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
}

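/*
 * _mm_slli_si128/_mm_srli_si128 (further below): Shift the whole
 * 128-bit value left/right by a constant byte count
 * (PSLLDQ/PSRLDQ).  As with PALIGNR, GCC's builtins count bits
 * while Clang's count bytes.
 */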
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_slli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v),	      \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_slli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v),    \
	    (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128i
_mm_srli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_srli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
}

#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_srli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v),	      \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_srli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v),    \
	    (int)(bytes))
#endif

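/*
 * _mm_storeu_si128/_mm_storeu_si32/_mm_storeu_si64: Stores with no
 * alignment requirement, via the same wrapper-struct trick as the
 * unaligned loads; the 32- and 64-bit variants store the low lane
 * only.  _mm_store_si128 requires 16-byte alignment.
 */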
_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
}

_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
{
	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
}

_INTRINSATTR
static __inline void
_mm_storeu_si64(void *__p, __m128i __v)
{
	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
}

_INTRINSATTR
static __inline void
_mm_store_si128(__m128i *__p, __m128i __v)
{
	*__p = __v;
}

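/*
 * _mm_sub_epi64(x, y): Subtract 64-bit lanes (PSUBQ); as in
 * _mm_add_epi32, the unsigned casts make wraparound well-defined.
 */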
_INTRINSATTR
static __inline __m128i
_mm_sub_epi64(__m128i __x, __m128i __y)
{
	return (__m128i)((__v2du)__x - (__v2du)__y);
}

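/*
 * _mm_unpackhi_epi32(a, b) = {a[2], b[2], a[3], b[3]} (PUNPCKHDQ);
 * _mm_unpacklo_epi32(a, b) = {a[0], b[0], a[1], b[1]} (PUNPCKLDQ);
 * _mm_unpacklo_epi64(a, b) = {a[0], b[0]} (PUNPCKLQDQ).
 */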
_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    2,6,3,7);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    0,4,1,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
	    (__v2di)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
	    0,2);
#endif
}

#endif	/* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */