/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

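/* Usage sketch (illustrative only): the composed macros above pair a rounding
   direction with exception behaviour; _MM_FROUND_NEARBYINT sets
   _MM_FROUND_NO_EXC so inexact exceptions are suppressed.  For example, with
   _mm_round_pd (below) and the usual _mm_set_pd from <emmintrin.h>, element
   values listed in index order:

     __m128d __v = _mm_set_pd(2.5, -1.5);                // {-1.5, 2.5}
     __m128d __f = _mm_round_pd(__v, _MM_FROUND_FLOOR);  // {-2.0, 2.0}
     __m128d __t = _mm_round_pd(__v, _MM_FROUND_TRUNC);  // {-1.0, 2.0}
 */
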
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

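/* Usage sketch (illustrative only): the _sd/_ss forms round only element 0 of
   the second operand and copy the remaining elements from the first operand,
   e.g. (element values listed in index order, assuming the usual _mm_set_ps
   and _mm_set1_ps from <xmmintrin.h>):

     __m128 __a = _mm_set_ps(4.5f, 3.5f, 2.5f, 1.5f);  // {1.5, 2.5, 3.5, 4.5}
     __m128 __b = _mm_set1_ps(-0.5f);
     _mm_round_ss(__a, __b, _MM_FROUND_FLOOR);         // {-1.0, 2.5, 3.5, 4.5}
 */
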
#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

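/* Usage sketch (illustrative only): lane selectors are masked to the valid
   range, and _mm_extract_ps returns the raw bit pattern of the selected
   float, e.g. (assuming the usual _mm_set_epi32 and _mm_set1_ps):

     __m128i __v = _mm_set_epi32(3, 2, 1, 0);  // lanes {0, 1, 2, 3}
     _mm_extract_epi32(__v, 2);                // 2
     _mm_insert_epi32(__v, 42, 6);             // 6 & 3 == 2, lane 2 becomes 42
     _mm_extract_ps(_mm_set1_ps(1.0f), 0);     // 0x3f800000
 */
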
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

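/* Usage sketch (illustrative only): in the blend intrinsics above, bit i of
   __imm8 selects element i from __B when set and from __A when clear,
   e.g. (element values listed in index order):

     __m128d __a = _mm_set_pd(1.0, 2.0);  // {2.0, 1.0}
     __m128d __b = _mm_set_pd(3.0, 4.0);  // {4.0, 3.0}
     _mm_blend_pd(__a, __b, 0x2);         // {2.0, 3.0}
 */
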
#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

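/* Usage sketch (illustrative only): _mm_mullo_epi32 keeps only the low 32
   bits of each product, so overflow wraps, while _mm_mul_epi32 (below)
   widens the even-numbered lanes to full 64-bit signed products, e.g.:

     __m128i __x = _mm_set1_epi32(0x10000);
     _mm_mullo_epi32(__x, __x);  // every lane 0 (low half of 2^32)
 */
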
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

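/* Usage sketch (illustrative only): the _mm_cvtepi* conversions above
   sign-extend while the _mm_cvtepu* conversions zero-extend, e.g. for a
   vector whose bytes are all 0x80:

     __m128i __v = _mm_set1_epi8((char)0x80);
     _mm_cvtepi8_epi16(__v);  // each halfword 0xff80 (-128)
     _mm_cvtepu8_epi16(__v);  // each halfword 0x0080 (128)
 */
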
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
        */

#endif /* SMMINTRIN_H_ */