/*	$NetBSD: immintrin.h,v 1.1 2023/08/07 01:07:36 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
#define	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H

#include <sys/types.h>

/*
 * This kludgerous header file provides definitions for the Intel
 * intrinsics that work with GCC and Clang, because <immintrin.h> is
 * not available during the kernel build and arranging to make it
 * available is complicated.  Please fix this properly!
 */
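/*
 * Illustrative usage sketch, not part of this header: kernel code
 * includes this file in place of <immintrin.h> and then uses the
 * intrinsics below as usual.  The helper name xor_block128 is
 * hypothetical; the `x ^ y' expression relies on the GCC/Clang vector
 * extension, since _mm_xor_si128 is not defined here.
 *
 *	static void
 *	xor_block128(void *dst, const void *a, const void *b)
 *	{
 *		__m128i x = _mm_loadu_si128(a);
 *		__m128i y = _mm_loadu_si128(b);
 *
 *		_mm_storeu_si128(dst, x ^ y);
 *	}
 */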
#if defined(__GNUC__) && !defined(__clang__)

#define	_INTRINSATTR							      \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#define	_PACKALIAS

typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#elif defined(__clang__)

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#define	_INTRINSATTR							      \
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"),   \
	    __min_vector_width__(128)))
#define	_PACKALIAS							      \
	__attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif
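/*
 * Note on the definitions above: the unaligned loads and stores below
 * go through a one-member wrapper struct tagged _PACKALIAS.  On Clang,
 * _PACKALIAS supplies __packed__ (alignment 1) and __may_alias__ (no
 * strict-aliasing assumptions); on GCC it expands to nothing, and the
 * __m128i_u typedef carries __aligned__(1) and __may_alias__ instead.
 * A minimal sketch of the pattern, with a hypothetical scalar helper:
 *
 *	static inline uint32_t
 *	load32_unaligned(const void *p)
 *	{
 *		return ((const struct { uint32_t v; } _PACKALIAS *)p)->v;
 *	}
 */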
#define	_SSSE3_ATTR	__attribute__((target("ssse3")))

_INTRINSATTR
static __inline __m128i
_mm_add_epi32(__m128i __a, __m128i __b)
{
	return (__m128i)((__v4su)__a + (__v4su)__b);
}

#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					      \
	(__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi),	      \
	    (__v2di)(__m128i)(lo), 8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					      \
	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi),	      \
	    (__v16qi)(__m128i)(lo), (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128
_mm_load1_ps(const float *__p)
{
	return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si32(const void *__p)
{
	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si64(const void *__p)
{
	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
	return __extension__ (__m128i)(__v2di){ __v, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_load_si128(const __m128i *__p)
{
	return *__p;
}

_INTRINSATTR
static __inline __m128
_mm_movehl_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}

_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi16(int16_t __v)
{
	return __extension__ (__m128i)(__v8hi){
		__v, __v, __v, __v, __v, __v, __v, __v
	};
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi32(int32_t __v)
{
	return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi64x(int64_t __v)
{
	return __extension__ (__m128i)(__v2di){ __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
{
	return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi64x(int64_t __v1, int64_t __v0)
{
	return __extension__ (__m128i)(__v2di){ __v0, __v1 };
}

_INTRINSATTR
static __inline __m128
_mm_setzero_ps(void)
{
	return __extension__ (__m128){ 0, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_setzero_si128(void)
{
	return _mm_set1_epi64x(0);
}

_INTRINSATTR _SSSE3_ATTR
static __inline __m128i
_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
{
	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
	    (__v16qi)__vidx);
}
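/*
 * Illustrative sketch, not part of this header: _mm_shuffle_epi8 is a
 * 16-entry byte table lookup (SSSE3 pshufb), so reversing the byte
 * order of a vector can be written with a constant index vector.  The
 * helper name bswap128 is hypothetical.
 *
 *	static __m128i
 *	bswap128(__m128i v)
 *	{
 *		return _mm_shuffle_epi8(v,
 *		    _mm_set_epi32(0x00010203, 0x04050607,
 *			0x08090a0b, 0x0c0d0e0f));
 *	}
 */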
#define	_mm_shuffle_epi32(v,m)						      \
	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))

#define	_mm_shuffle_ps(x,y,m)						      \
	(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x),		      \
	    (__v4sf)(__m128)(y), (int)(m))

_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
}

#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_slli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v),	      \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_slli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v),   \
	    (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128i
_mm_srli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_srli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
}

#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_srli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v),	      \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_srli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v),   \
	    (int)(bytes))
#endif

_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
}

_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
{
	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
}

_INTRINSATTR
static __inline void
_mm_storeu_si64(void *__p, __m128i __v)
{
	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
}

_INTRINSATTR
static __inline void
_mm_store_si128(__m128i *__p, __m128i __v)
{
	*__p = __v;
}

_INTRINSATTR
static __inline __m128i
_mm_sub_epi64(__m128i __x, __m128i __y)
{
	return (__m128i)((__v2du)__x - (__v2du)__y);
}

_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    2,6,3,7);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    0,4,1,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
	    (__v2di)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
	    0,2);
#endif
}
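/*
 * Note on the unpack operations above: they interleave lanes rather
 * than concatenate halves.  With 32-bit lanes written low lane first,
 * __lo = {a0,a1,a2,a3} and __hi = {b0,b1,b2,b3}:
 *
 *	_mm_unpacklo_epi32(__lo, __hi) == {a0,b0,a1,b1}
 *	_mm_unpackhi_epi32(__lo, __hi) == {a2,b2,a3,b3}
 */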
#endif	/* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */