/* Copyright (C) 2003-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However, scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   important differences in data format and in the placement of double
   scalars within the vector register.

   In the PowerISA, a scalar double lives in the FPRs (the leftmost
   64 bits of the low 32 VSRs), while X86_64 SSE2 uses the rightmost
   64 bits of the XMM register.  These differences require extra steps
   on POWER to match the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend that applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.
*/ 81 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); 82 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); 83 84 /* Unaligned version of the same types. */ 85 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); 86 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); 87 88 /* Define two value permute mask */ 89 #define _MM_SHUFFLE2(x,y) (((x) << 1) | (y)) 90 91 /* Create a vector with element 0 as F and the rest zero. */ 92 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 93 _mm_set_sd (double __F) 94 { 95 return __extension__ (__m128d){ __F, 0.0 }; 96 } 97 98 /* Create a vector with both elements equal to F. */ 99 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 100 _mm_set1_pd (double __F) 101 { 102 return __extension__ (__m128d){ __F, __F }; 103 } 104 105 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 106 _mm_set_pd1 (double __F) 107 { 108 return _mm_set1_pd (__F); 109 } 110 111 /* Create a vector with the lower value X and upper value W. */ 112 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 113 _mm_set_pd (double __W, double __X) 114 { 115 return __extension__ (__m128d){ __X, __W }; 116 } 117 118 /* Create a vector with the lower value W and upper value X. */ 119 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 120 _mm_setr_pd (double __W, double __X) 121 { 122 return __extension__ (__m128d){ __W, __X }; 123 } 124 125 /* Create an undefined vector. */ 126 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 127 _mm_undefined_pd (void) 128 { 129 __m128d __Y = __Y; 130 return __Y; 131 } 132 133 /* Create a vector of zeros. */ 134 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 135 _mm_setzero_pd (void) 136 { 137 return (__m128d) vec_splats (0); 138 } 139 140 /* Sets the low DPFP value of A from the low value of B. */ 141 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 142 _mm_move_sd (__m128d __A, __m128d __B) 143 { 144 __v2df result = (__v2df) __A; 145 result [0] = ((__v2df) __B)[0]; 146 return (__m128d) result; 147 } 148 149 /* Load two DPFP values from P. The address must be 16-byte aligned. */ 150 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 151 _mm_load_pd (double const *__P) 152 { 153 assert(((unsigned long)__P & 0xfUL) == 0UL); 154 return ((__m128d)vec_ld(0, (__v16qu*)__P)); 155 } 156 157 /* Load two DPFP values from P. The address need not be 16-byte aligned. */ 158 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 159 _mm_loadu_pd (double const *__P) 160 { 161 return (vec_vsx_ld(0, __P)); 162 } 163 164 /* Create a vector with all two elements equal to *P. */ 165 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 166 _mm_load1_pd (double const *__P) 167 { 168 return (vec_splats (*__P)); 169 } 170 171 /* Create a vector with element 0 as *P and the rest zero. 
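   As an illustrative note: with double x = 3.0, _mm_load_sd (&x) below
   returns { 3.0, 0.0 }, the same value as _mm_set_sd (3.0).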
*/ 172 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 173 _mm_load_sd (double const *__P) 174 { 175 return _mm_set_sd (*__P); 176 } 177 178 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 179 _mm_load_pd1 (double const *__P) 180 { 181 return _mm_load1_pd (__P); 182 } 183 184 /* Load two DPFP values in reverse order. The address must be aligned. */ 185 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 186 _mm_loadr_pd (double const *__P) 187 { 188 __v2df __tmp = _mm_load_pd (__P); 189 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2); 190 } 191 192 /* Store two DPFP values. The address must be 16-byte aligned. */ 193 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 194 _mm_store_pd (double *__P, __m128d __A) 195 { 196 assert(((unsigned long)__P & 0xfUL) == 0UL); 197 vec_st((__v16qu)__A, 0, (__v16qu*)__P); 198 } 199 200 /* Store two DPFP values. The address need not be 16-byte aligned. */ 201 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 202 _mm_storeu_pd (double *__P, __m128d __A) 203 { 204 *(__m128d *)__P = __A; 205 } 206 207 /* Stores the lower DPFP value. */ 208 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 209 _mm_store_sd (double *__P, __m128d __A) 210 { 211 *__P = ((__v2df)__A)[0]; 212 } 213 214 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 215 _mm_cvtsd_f64 (__m128d __A) 216 { 217 return ((__v2df)__A)[0]; 218 } 219 220 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 221 _mm_storel_pd (double *__P, __m128d __A) 222 { 223 _mm_store_sd (__P, __A); 224 } 225 226 /* Stores the upper DPFP value. */ 227 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 228 _mm_storeh_pd (double *__P, __m128d __A) 229 { 230 *__P = ((__v2df)__A)[1]; 231 } 232 /* Store the lower DPFP value across two words. 233 The address must be 16-byte aligned. */ 234 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 235 _mm_store1_pd (double *__P, __m128d __A) 236 { 237 _mm_store_pd (__P, vec_splat (__A, 0)); 238 } 239 240 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 241 _mm_store_pd1 (double *__P, __m128d __A) 242 { 243 _mm_store1_pd (__P, __A); 244 } 245 246 /* Store two DPFP values in reverse order. The address must be aligned. */ 247 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 248 _mm_storer_pd (double *__P, __m128d __A) 249 { 250 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2)); 251 } 252 253 /* Intel intrinsic. */ 254 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 255 _mm_cvtsi128_si64 (__m128i __A) 256 { 257 return ((__v2di)__A)[0]; 258 } 259 260 /* Microsoft intrinsic. 
*/ 261 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 262 _mm_cvtsi128_si64x (__m128i __A) 263 { 264 return ((__v2di)__A)[0]; 265 } 266 267 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 268 _mm_add_pd (__m128d __A, __m128d __B) 269 { 270 return (__m128d) ((__v2df)__A + (__v2df)__B); 271 } 272 273 /* Add the lower double-precision (64-bit) floating-point element in 274 a and b, store the result in the lower element of dst, and copy 275 the upper element from a to the upper element of dst. */ 276 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 277 _mm_add_sd (__m128d __A, __m128d __B) 278 { 279 __A[0] = __A[0] + __B[0]; 280 return (__A); 281 } 282 283 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 284 _mm_sub_pd (__m128d __A, __m128d __B) 285 { 286 return (__m128d) ((__v2df)__A - (__v2df)__B); 287 } 288 289 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 290 _mm_sub_sd (__m128d __A, __m128d __B) 291 { 292 __A[0] = __A[0] - __B[0]; 293 return (__A); 294 } 295 296 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 297 _mm_mul_pd (__m128d __A, __m128d __B) 298 { 299 return (__m128d) ((__v2df)__A * (__v2df)__B); 300 } 301 302 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 303 _mm_mul_sd (__m128d __A, __m128d __B) 304 { 305 __A[0] = __A[0] * __B[0]; 306 return (__A); 307 } 308 309 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 310 _mm_div_pd (__m128d __A, __m128d __B) 311 { 312 return (__m128d) ((__v2df)__A / (__v2df)__B); 313 } 314 315 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 316 _mm_div_sd (__m128d __A, __m128d __B) 317 { 318 __A[0] = __A[0] / __B[0]; 319 return (__A); 320 } 321 322 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 323 _mm_sqrt_pd (__m128d __A) 324 { 325 return (vec_sqrt (__A)); 326 } 327 328 /* Return pair {sqrt (B[0]), A[1]}. 
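   For example, with __A = { 2.0, 9.0 } and __B = { 16.0, 7.0 } the
   result is { 4.0, 9.0 }.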
*/ 329 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 330 _mm_sqrt_sd (__m128d __A, __m128d __B) 331 { 332 __v2df c; 333 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); 334 return (__m128d) _mm_setr_pd (c[0], __A[1]); 335 } 336 337 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 338 _mm_min_pd (__m128d __A, __m128d __B) 339 { 340 return (vec_min (__A, __B)); 341 } 342 343 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 344 _mm_min_sd (__m128d __A, __m128d __B) 345 { 346 __v2df a, b, c; 347 a = vec_splats (__A[0]); 348 b = vec_splats (__B[0]); 349 c = vec_min (a, b); 350 return (__m128d) _mm_setr_pd (c[0], __A[1]); 351 } 352 353 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 354 _mm_max_pd (__m128d __A, __m128d __B) 355 { 356 return (vec_max (__A, __B)); 357 } 358 359 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 360 _mm_max_sd (__m128d __A, __m128d __B) 361 { 362 __v2df a, b, c; 363 a = vec_splats (__A[0]); 364 b = vec_splats (__B[0]); 365 c = vec_max (a, b); 366 return (__m128d) _mm_setr_pd (c[0], __A[1]); 367 } 368 369 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 370 _mm_cmpeq_pd (__m128d __A, __m128d __B) 371 { 372 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B)); 373 } 374 375 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 376 _mm_cmplt_pd (__m128d __A, __m128d __B) 377 { 378 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); 379 } 380 381 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 382 _mm_cmple_pd (__m128d __A, __m128d __B) 383 { 384 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 385 } 386 387 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 388 _mm_cmpgt_pd (__m128d __A, __m128d __B) 389 { 390 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 391 } 392 393 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 394 _mm_cmpge_pd (__m128d __A, __m128d __B) 395 { 396 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B)); 397 } 398 399 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 400 _mm_cmpneq_pd (__m128d __A, __m128d __B) 401 { 402 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); 403 return ((__m128d)vec_nor (temp, temp)); 404 } 405 406 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 407 _mm_cmpnlt_pd (__m128d __A, __m128d __B) 408 { 409 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B)); 410 } 411 412 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 413 _mm_cmpnle_pd (__m128d __A, __m128d __B) 414 { 415 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 416 } 417 418 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 419 _mm_cmpngt_pd (__m128d __A, __m128d __B) 420 { 421 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 422 } 423 424 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 425 _mm_cmpnge_pd (__m128d __A, __m128d __B) 426 { 427 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); 428 } 429 430 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq (__A, __A);
  d = (__v2du)vec_cmpeq (__B, __B);
#else
  __v2du a, b;
  __v2du c, d;
  const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
  a = (__v2du)vec_abs ((__v2df)__A);
  b = (__v2du)vec_abs ((__v2df)__B);
  c = (__v2du)vec_cmpgt (double_exp_mask, a);
  d = (__v2du)vec_cmpgt (double_exp_mask, b);
#endif
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(c, d));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  c = vec_nor (c, c);
  return ((__m128d)vec_orc(c, d));
#else
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) marks a NAN input.  */
  c = vec_nor (c, c);
  d = vec_nor (d, d);
  return ((__m128d)vec_or(c, d));
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd(__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
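
/* As an illustration of the splat/compare/merge pattern above: with
   __m128d x = _mm_set_pd (7.0, 1.0);                 x is { 1.0, 7.0 }
   __m128d y = _mm_set_pd (__builtin_nan (""), 1.0);  y is { 1.0, NaN }
   _mm_cmpeq_sd (x, y) returns the all-ones mask in element [0] and
   7.0 in element [1]; the NaN in y[1] never reaches the vector compare
   because only the splatted element [0] values are compared.  */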

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than is just greater than or equal.  */
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than is just less than or equal.  */
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than or equal is just less than.  */
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
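
/* Note that, as with the packed forms above, the "not" compares are
   implemented with the complementary ordered compare (NLT as GE,
   NLE as GT, NGT as LE, NGE as LT).  The results agree for ordered
   (non-NaN) operands; when an operand is a NaN the Intel predicates
   are true while the VSX compares return false.  */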

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.
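   The arguments are therefore given from the highest element down;
   for example _mm_set_epi32 (3, 2, 1, 0) places 0 in element [0]
   and 3 in element [3].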
*/ 682 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 683 _mm_set_epi64x (long long __q1, long long __q0) 684 { 685 return __extension__ (__m128i)(__v2di){ __q0, __q1 }; 686 } 687 688 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 689 _mm_set_epi64 (__m64 __q1, __m64 __q0) 690 { 691 return _mm_set_epi64x ((long long)__q1, (long long)__q0); 692 } 693 694 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 695 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) 696 { 697 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; 698 } 699 700 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 701 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, 702 short __q3, short __q2, short __q1, short __q0) 703 { 704 return __extension__ (__m128i)(__v8hi){ 705 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; 706 } 707 708 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 709 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, 710 char __q11, char __q10, char __q09, char __q08, 711 char __q07, char __q06, char __q05, char __q04, 712 char __q03, char __q02, char __q01, char __q00) 713 { 714 return __extension__ (__m128i)(__v16qi){ 715 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, 716 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 717 }; 718 } 719 720 /* Set all of the elements of the vector to A. */ 721 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 722 _mm_set1_epi64x (long long __A) 723 { 724 return _mm_set_epi64x (__A, __A); 725 } 726 727 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 728 _mm_set1_epi64 (__m64 __A) 729 { 730 return _mm_set_epi64 (__A, __A); 731 } 732 733 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 734 _mm_set1_epi32 (int __A) 735 { 736 return _mm_set_epi32 (__A, __A, __A, __A); 737 } 738 739 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 740 _mm_set1_epi16 (short __A) 741 { 742 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); 743 } 744 745 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 746 _mm_set1_epi8 (char __A) 747 { 748 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, 749 __A, __A, __A, __A, __A, __A, __A, __A); 750 } 751 752 /* Create a vector of Qi, where i is the element number. 753 The parameter order is reversed from the _mm_set_epi* functions. 
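   For example, _mm_setr_epi32 (0, 1, 2, 3) produces the same vector
   as _mm_set_epi32 (3, 2, 1, 0), with 0 in element [0].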
*/ 754 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 755 _mm_setr_epi64 (__m64 __q0, __m64 __q1) 756 { 757 return _mm_set_epi64 (__q1, __q0); 758 } 759 760 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 761 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) 762 { 763 return _mm_set_epi32 (__q3, __q2, __q1, __q0); 764 } 765 766 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 767 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, 768 short __q4, short __q5, short __q6, short __q7) 769 { 770 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); 771 } 772 773 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 774 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, 775 char __q04, char __q05, char __q06, char __q07, 776 char __q08, char __q09, char __q10, char __q11, 777 char __q12, char __q13, char __q14, char __q15) 778 { 779 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, 780 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); 781 } 782 783 /* Create a vector with element 0 as *P and the rest zero. */ 784 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 785 _mm_load_si128 (__m128i const *__P) 786 { 787 return *__P; 788 } 789 790 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 791 _mm_loadu_si128 (__m128i_u const *__P) 792 { 793 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); 794 } 795 796 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 797 _mm_loadl_epi64 (__m128i_u const *__P) 798 { 799 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P); 800 } 801 802 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 803 _mm_store_si128 (__m128i *__P, __m128i __B) 804 { 805 assert(((unsigned long )__P & 0xfUL) == 0UL); 806 vec_st ((__v16qu) __B, 0, (__v16qu*)__P); 807 } 808 809 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 810 _mm_storeu_si128 (__m128i_u *__P, __m128i __B) 811 { 812 *__P = __B; 813 } 814 815 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 816 _mm_storel_epi64 (__m128i_u *__P, __m128i __B) 817 { 818 *(long long *)__P = ((__v2di)__B)[0]; 819 } 820 821 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 822 _mm_movepi64_pi64 (__m128i_u __B) 823 { 824 return (__m64) ((__v2di)__B)[0]; 825 } 826 827 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 828 _mm_movpi64_epi64 (__m64 __A) 829 { 830 return _mm_set_epi64 ((__m64)0LL, __A); 831 } 832 833 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 834 _mm_move_epi64 (__m128i __A) 835 { 836 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]); 837 } 838 839 /* Create an undefined vector. */ 840 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 841 _mm_undefined_si128 (void) 842 { 843 __m128i __Y = __Y; 844 return __Y; 845 } 846 847 /* Create a vector of zeros. 
*/ 848 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 849 _mm_setzero_si128 (void) 850 { 851 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; 852 } 853 854 #ifdef _ARCH_PWR8 855 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 856 _mm_cvtepi32_pd (__m128i __A) 857 { 858 __v2di val; 859 /* For LE need to generate Vector Unpack Low Signed Word. 860 Which is generated from unpackh. */ 861 val = (__v2di)vec_unpackh ((__v4si)__A); 862 863 return (__m128d)vec_ctf (val, 0); 864 } 865 #endif 866 867 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 868 _mm_cvtepi32_ps (__m128i __A) 869 { 870 return ((__m128)vec_ctf((__v4si)__A, 0)); 871 } 872 873 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 874 _mm_cvtpd_epi32 (__m128d __A) 875 { 876 __v2df rounded = vec_rint (__A); 877 __v4si result, temp; 878 const __v4si vzero = 879 { 0, 0, 0, 0 }; 880 881 /* VSX Vector truncate Double-Precision to integer and Convert to 882 Signed Integer Word format with Saturate. */ 883 __asm__( 884 "xvcvdpsxws %x0,%x1" 885 : "=wa" (temp) 886 : "wa" (rounded) 887 : ); 888 889 #ifdef _ARCH_PWR8 890 temp = vec_mergeo (temp, temp); 891 result = (__v4si)vec_vpkudum ((__vector long)temp, (__vector long)vzero); 892 #else 893 { 894 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 895 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 896 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 897 } 898 #endif 899 return (__m128i) result; 900 } 901 902 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 903 _mm_cvtpd_pi32 (__m128d __A) 904 { 905 __m128i result = _mm_cvtpd_epi32(__A); 906 907 return (__m64) result[0]; 908 } 909 910 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 911 _mm_cvtpd_ps (__m128d __A) 912 { 913 __v4sf result; 914 __v4si temp; 915 const __v4si vzero = { 0, 0, 0, 0 }; 916 917 __asm__( 918 "xvcvdpsp %x0,%x1" 919 : "=wa" (temp) 920 : "wa" (__A) 921 : ); 922 923 #ifdef _ARCH_PWR8 924 temp = vec_mergeo (temp, temp); 925 result = (__v4sf)vec_vpkudum ((__vector long)temp, (__vector long)vzero); 926 #else 927 { 928 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 929 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 930 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 931 } 932 #endif 933 return ((__m128)result); 934 } 935 936 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 937 _mm_cvttpd_epi32 (__m128d __A) 938 { 939 __v4si result; 940 __v4si temp; 941 const __v4si vzero = { 0, 0, 0, 0 }; 942 943 /* VSX Vector truncate Double-Precision to integer and Convert to 944 Signed Integer Word format with Saturate. 
*/ 945 __asm__( 946 "xvcvdpsxws %x0,%x1" 947 : "=wa" (temp) 948 : "wa" (__A) 949 : ); 950 951 #ifdef _ARCH_PWR8 952 temp = vec_mergeo (temp, temp); 953 result = (__v4si)vec_vpkudum ((__vector long)temp, (__vector long)vzero); 954 #else 955 { 956 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 957 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 958 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 959 } 960 #endif 961 962 return ((__m128i) result); 963 } 964 965 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 966 _mm_cvttpd_pi32 (__m128d __A) 967 { 968 __m128i result = _mm_cvttpd_epi32 (__A); 969 970 return (__m64) result[0]; 971 } 972 973 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 974 _mm_cvtsi128_si32 (__m128i __A) 975 { 976 return ((__v4si)__A)[0]; 977 } 978 979 #ifdef _ARCH_PWR8 980 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 981 _mm_cvtpi32_pd (__m64 __A) 982 { 983 __v4si temp; 984 __v2di tmp2; 985 __v2df result; 986 987 temp = (__v4si)vec_splats (__A); 988 tmp2 = (__v2di)vec_unpackl (temp); 989 result = vec_ctf ((__vector signed long)tmp2, 0); 990 return (__m128d)result; 991 } 992 #endif 993 994 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 995 _mm_cvtps_epi32 (__m128 __A) 996 { 997 __v4sf rounded; 998 __v4si result; 999 1000 rounded = vec_rint((__v4sf) __A); 1001 result = vec_cts (rounded, 0); 1002 return (__m128i) result; 1003 } 1004 1005 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1006 _mm_cvttps_epi32 (__m128 __A) 1007 { 1008 __v4si result; 1009 1010 result = vec_cts ((__v4sf) __A, 0); 1011 return (__m128i) result; 1012 } 1013 1014 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1015 _mm_cvtps_pd (__m128 __A) 1016 { 1017 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */ 1018 #ifdef vec_doubleh 1019 return (__m128d) vec_doubleh ((__v4sf)__A); 1020 #else 1021 /* Otherwise the compiler is not current and so need to generate the 1022 equivalent code. */ 1023 __v4sf a = (__v4sf)__A; 1024 __v4sf temp; 1025 __v2df result; 1026 #ifdef __LITTLE_ENDIAN__ 1027 /* The input float values are in elements {[0], [1]} but the convert 1028 instruction needs them in elements {[1], [3]}, So we use two 1029 shift left double vector word immediates to get the elements 1030 lined up. */ 1031 temp = __builtin_vsx_xxsldwi (a, a, 3); 1032 temp = __builtin_vsx_xxsldwi (a, temp, 2); 1033 #elif __BIG_ENDIAN__ 1034 /* The input float values are in elements {[0], [1]} but the convert 1035 instruction needs them in elements {[0], [2]}, So we use two 1036 shift left double vector word immediates to get the elements 1037 lined up. */ 1038 temp = vec_vmrghw (a, a); 1039 #endif 1040 __asm__( 1041 " xvcvspdp %x0,%x1" 1042 : "=wa" (result) 1043 : "wa" (temp) 1044 : ); 1045 return (__m128d) result; 1046 #endif 1047 } 1048 1049 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1050 _mm_cvtsd_si32 (__m128d __A) 1051 { 1052 __v2df rounded = vec_rint((__v2df) __A); 1053 int result = ((__v2df)rounded)[0]; 1054 1055 return result; 1056 } 1057 /* Intel intrinsic. 
*/ 1058 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1059 _mm_cvtsd_si64 (__m128d __A) 1060 { 1061 __v2df rounded = vec_rint ((__v2df) __A ); 1062 long long result = ((__v2df) rounded)[0]; 1063 1064 return result; 1065 } 1066 1067 /* Microsoft intrinsic. */ 1068 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1069 _mm_cvtsd_si64x (__m128d __A) 1070 { 1071 return _mm_cvtsd_si64 ((__v2df)__A); 1072 } 1073 1074 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1075 _mm_cvttsd_si32 (__m128d __A) 1076 { 1077 int result = ((__v2df)__A)[0]; 1078 1079 return result; 1080 } 1081 1082 /* Intel intrinsic. */ 1083 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1084 _mm_cvttsd_si64 (__m128d __A) 1085 { 1086 long long result = ((__v2df)__A)[0]; 1087 1088 return result; 1089 } 1090 1091 /* Microsoft intrinsic. */ 1092 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1093 _mm_cvttsd_si64x (__m128d __A) 1094 { 1095 return _mm_cvttsd_si64 (__A); 1096 } 1097 1098 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1099 _mm_cvtsd_ss (__m128 __A, __m128d __B) 1100 { 1101 __v4sf result = (__v4sf)__A; 1102 1103 #ifdef __LITTLE_ENDIAN__ 1104 __v4sf temp_s; 1105 /* Copy double element[0] to element [1] for conversion. */ 1106 __v2df temp_b = vec_splat((__v2df)__B, 0); 1107 1108 /* Pre-rotate __A left 3 (logically right 1) elements. */ 1109 result = __builtin_vsx_xxsldwi (result, result, 3); 1110 /* Convert double to single float scalar in a vector. */ 1111 __asm__( 1112 "xscvdpsp %x0,%x1" 1113 : "=wa" (temp_s) 1114 : "wa" (temp_b) 1115 : ); 1116 /* Shift the resulting scalar into vector element [0]. */ 1117 result = __builtin_vsx_xxsldwi (result, temp_s, 1); 1118 #else 1119 result [0] = ((__v2df)__B)[0]; 1120 #endif 1121 return (__m128) result; 1122 } 1123 1124 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1125 _mm_cvtsi32_sd (__m128d __A, int __B) 1126 { 1127 __v2df result = (__v2df)__A; 1128 double db = __B; 1129 result [0] = db; 1130 return (__m128d)result; 1131 } 1132 1133 /* Intel intrinsic. */ 1134 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1135 _mm_cvtsi64_sd (__m128d __A, long long __B) 1136 { 1137 __v2df result = (__v2df)__A; 1138 double db = __B; 1139 result [0] = db; 1140 return (__m128d)result; 1141 } 1142 1143 /* Microsoft intrinsic. */ 1144 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1145 _mm_cvtsi64x_sd (__m128d __A, long long __B) 1146 { 1147 return _mm_cvtsi64_sd (__A, __B); 1148 } 1149 1150 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1151 _mm_cvtss_sd (__m128d __A, __m128 __B) 1152 { 1153 #ifdef __LITTLE_ENDIAN__ 1154 /* Use splat to move element [0] into position for the convert. */ 1155 __v4sf temp = vec_splat ((__v4sf)__B, 0); 1156 __v2df res; 1157 /* Convert single float scalar to double in a vector. 
*/ 1158 __asm__( 1159 "xscvspdp %x0,%x1" 1160 : "=wa" (res) 1161 : "wa" (temp) 1162 : ); 1163 return (__m128d) vec_mergel (res, (__v2df)__A); 1164 #else 1165 __v2df res = (__v2df)__A; 1166 res [0] = ((__v4sf)__B) [0]; 1167 return (__m128d) res; 1168 #endif 1169 } 1170 1171 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1172 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) 1173 { 1174 __vector double result; 1175 const int litmsk = __mask & 0x3; 1176 1177 if (litmsk == 0) 1178 result = vec_mergeh (__A, __B); 1179 #if __GNUC__ < 6 1180 else if (litmsk == 1) 1181 result = vec_xxpermdi (__B, __A, 2); 1182 else if (litmsk == 2) 1183 result = vec_xxpermdi (__B, __A, 1); 1184 #else 1185 else if (litmsk == 1) 1186 result = vec_xxpermdi (__A, __B, 2); 1187 else if (litmsk == 2) 1188 result = vec_xxpermdi (__A, __B, 1); 1189 #endif 1190 else 1191 result = vec_mergel (__A, __B); 1192 1193 return result; 1194 } 1195 1196 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1197 _mm_unpackhi_pd (__m128d __A, __m128d __B) 1198 { 1199 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B); 1200 } 1201 1202 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1203 _mm_unpacklo_pd (__m128d __A, __m128d __B) 1204 { 1205 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B); 1206 } 1207 1208 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1209 _mm_loadh_pd (__m128d __A, double const *__B) 1210 { 1211 __v2df result = (__v2df)__A; 1212 result [1] = *__B; 1213 return (__m128d)result; 1214 } 1215 1216 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1217 _mm_loadl_pd (__m128d __A, double const *__B) 1218 { 1219 __v2df result = (__v2df)__A; 1220 result [0] = *__B; 1221 return (__m128d)result; 1222 } 1223 1224 #ifdef _ARCH_PWR8 1225 /* Intrinsic functions that require PowerISA 2.07 minimum. */ 1226 1227 /* Creates a 2-bit mask from the most significant bits of the DPFP values. 
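   Bit 0 of the result is the sign bit of element [0] and bit 1 is the
   sign bit of element [1]; for example
   _mm_movemask_pd (_mm_set_pd (2.0, -1.0)) returns 1.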
*/ 1228 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1229 _mm_movemask_pd (__m128d __A) 1230 { 1231 __vector __m64 result; 1232 static const __vector unsigned int perm_mask = 1233 { 1234 #ifdef __LITTLE_ENDIAN__ 1235 0x80800040, 0x80808080, 0x80808080, 0x80808080 1236 #elif __BIG_ENDIAN__ 1237 0x80808080, 0x80808080, 0x80808080, 0x80800040 1238 #endif 1239 }; 1240 1241 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A, 1242 (__vector unsigned char) perm_mask); 1243 1244 #ifdef __LITTLE_ENDIAN__ 1245 return result[1]; 1246 #elif __BIG_ENDIAN__ 1247 return result[0]; 1248 #endif 1249 } 1250 #endif /* _ARCH_PWR8 */ 1251 1252 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1253 _mm_packs_epi16 (__m128i __A, __m128i __B) 1254 { 1255 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B); 1256 } 1257 1258 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1259 _mm_packs_epi32 (__m128i __A, __m128i __B) 1260 { 1261 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B); 1262 } 1263 1264 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1265 _mm_packus_epi16 (__m128i __A, __m128i __B) 1266 { 1267 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B); 1268 } 1269 1270 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1271 _mm_unpackhi_epi8 (__m128i __A, __m128i __B) 1272 { 1273 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B); 1274 } 1275 1276 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1277 _mm_unpackhi_epi16 (__m128i __A, __m128i __B) 1278 { 1279 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B); 1280 } 1281 1282 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1283 _mm_unpackhi_epi32 (__m128i __A, __m128i __B) 1284 { 1285 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B); 1286 } 1287 1288 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1289 _mm_unpackhi_epi64 (__m128i __A, __m128i __B) 1290 { 1291 return (__m128i) vec_mergel ((__vector long)__A, (__vector long)__B); 1292 } 1293 1294 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1295 _mm_unpacklo_epi8 (__m128i __A, __m128i __B) 1296 { 1297 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B); 1298 } 1299 1300 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1301 _mm_unpacklo_epi16 (__m128i __A, __m128i __B) 1302 { 1303 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B); 1304 } 1305 1306 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1307 _mm_unpacklo_epi32 (__m128i __A, __m128i __B) 1308 { 1309 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B); 1310 } 1311 1312 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1313 _mm_unpacklo_epi64 (__m128i __A, __m128i __B) 1314 { 1315 return (__m128i) vec_mergeh ((__vector long)__A, (__vector long)__B); 1316 } 1317 1318 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1319 _mm_add_epi8 (__m128i __A, __m128i __B) 1320 { 1321 return (__m128i) ((__v16qu)__A + (__v16qu)__B); 1322 } 1323 1324 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1325 _mm_add_epi16 (__m128i __A, __m128i 
__B) 1326 { 1327 return (__m128i) ((__v8hu)__A + (__v8hu)__B); 1328 } 1329 1330 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1331 _mm_add_epi32 (__m128i __A, __m128i __B) 1332 { 1333 return (__m128i) ((__v4su)__A + (__v4su)__B); 1334 } 1335 1336 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1337 _mm_add_epi64 (__m128i __A, __m128i __B) 1338 { 1339 return (__m128i) ((__v2du)__A + (__v2du)__B); 1340 } 1341 1342 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1343 _mm_adds_epi8 (__m128i __A, __m128i __B) 1344 { 1345 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B); 1346 } 1347 1348 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1349 _mm_adds_epi16 (__m128i __A, __m128i __B) 1350 { 1351 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B); 1352 } 1353 1354 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1355 _mm_adds_epu8 (__m128i __A, __m128i __B) 1356 { 1357 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B); 1358 } 1359 1360 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1361 _mm_adds_epu16 (__m128i __A, __m128i __B) 1362 { 1363 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B); 1364 } 1365 1366 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1367 _mm_sub_epi8 (__m128i __A, __m128i __B) 1368 { 1369 return (__m128i) ((__v16qu)__A - (__v16qu)__B); 1370 } 1371 1372 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1373 _mm_sub_epi16 (__m128i __A, __m128i __B) 1374 { 1375 return (__m128i) ((__v8hu)__A - (__v8hu)__B); 1376 } 1377 1378 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1379 _mm_sub_epi32 (__m128i __A, __m128i __B) 1380 { 1381 return (__m128i) ((__v4su)__A - (__v4su)__B); 1382 } 1383 1384 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1385 _mm_sub_epi64 (__m128i __A, __m128i __B) 1386 { 1387 return (__m128i) ((__v2du)__A - (__v2du)__B); 1388 } 1389 1390 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1391 _mm_subs_epi8 (__m128i __A, __m128i __B) 1392 { 1393 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B); 1394 } 1395 1396 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1397 _mm_subs_epi16 (__m128i __A, __m128i __B) 1398 { 1399 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B); 1400 } 1401 1402 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1403 _mm_subs_epu8 (__m128i __A, __m128i __B) 1404 { 1405 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B); 1406 } 1407 1408 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1409 _mm_subs_epu16 (__m128i __A, __m128i __B) 1410 { 1411 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B); 1412 } 1413 1414 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1415 _mm_madd_epi16 (__m128i __A, __m128i __B) 1416 { 1417 __vector signed int zero = {0, 0, 0, 0}; 1418 1419 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero); 1420 } 1421 1422 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1423 _mm_mulhi_epi16 (__m128i __A, __m128i __B) 1424 { 
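  /* Multiply the even and odd halfword pairs to get the full 32-bit
     products, then use the permute below to keep only the high
     halfword of each product.  */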
1425 __vector signed int w0, w1; 1426 1427 __vector unsigned char xform1 = { 1428 #ifdef __LITTLE_ENDIAN__ 1429 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 1430 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 1431 #elif __BIG_ENDIAN__ 1432 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 1433 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D 1434 #endif 1435 }; 1436 1437 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); 1438 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); 1439 return (__m128i) vec_perm (w0, w1, xform1); 1440 } 1441 1442 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1443 _mm_mullo_epi16 (__m128i __A, __m128i __B) 1444 { 1445 return (__m128i) ((__v8hi)__A * (__v8hi)__B); 1446 } 1447 1448 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1449 _mm_mul_su32 (__m64 __A, __m64 __B) 1450 { 1451 unsigned int a = __A; 1452 unsigned int b = __B; 1453 1454 return ((__m64)a * (__m64)b); 1455 } 1456 1457 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1458 _mm_mul_epu32 (__m128i __A, __m128i __B) 1459 { 1460 #if __GNUC__ < 8 1461 __v2du result; 1462 1463 #ifdef __LITTLE_ENDIAN__ 1464 /* VMX Vector Multiply Odd Unsigned Word. */ 1465 __asm__( 1466 "vmulouw %0,%1,%2" 1467 : "=v" (result) 1468 : "v" (__A), "v" (__B) 1469 : ); 1470 #elif __BIG_ENDIAN__ 1471 /* VMX Vector Multiply Even Unsigned Word. */ 1472 __asm__( 1473 "vmuleuw %0,%1,%2" 1474 : "=v" (result) 1475 : "v" (__A), "v" (__B) 1476 : ); 1477 #endif 1478 return (__m128i) result; 1479 #else 1480 #ifdef __LITTLE_ENDIAN__ 1481 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); 1482 #elif __BIG_ENDIAN__ 1483 return (__m128i) vec_mulo ((__v4su)__A, (__v4su)__B); 1484 #endif 1485 #endif 1486 } 1487 1488 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1489 _mm_slli_epi16 (__m128i __A, int __B) 1490 { 1491 __v8hu lshift; 1492 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; 1493 1494 if (__B >= 0 && __B < 16) 1495 { 1496 if (__builtin_constant_p(__B)) 1497 lshift = (__v8hu) vec_splat_s16(__B); 1498 else 1499 lshift = vec_splats ((unsigned short) __B); 1500 1501 result = vec_vslh ((__v8hi) __A, lshift); 1502 } 1503 1504 return (__m128i) result; 1505 } 1506 1507 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1508 _mm_slli_epi32 (__m128i __A, int __B) 1509 { 1510 __v4su lshift; 1511 __v4si result = { 0, 0, 0, 0 }; 1512 1513 if (__B >= 0 && __B < 32) 1514 { 1515 if (__builtin_constant_p(__B) && __B < 16) 1516 lshift = (__v4su) vec_splat_s32(__B); 1517 else 1518 lshift = vec_splats ((unsigned int) __B); 1519 1520 result = vec_vslw ((__v4si) __A, lshift); 1521 } 1522 1523 return (__m128i) result; 1524 } 1525 1526 #ifdef _ARCH_PWR8 1527 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1528 _mm_slli_epi64 (__m128i __A, int __B) 1529 { 1530 __v2du lshift; 1531 __v2di result = { 0, 0 }; 1532 1533 if (__B >= 0 && __B < 64) 1534 { 1535 if (__builtin_constant_p(__B) && __B < 16) 1536 lshift = (__v2du) vec_splat_s32(__B); 1537 else 1538 lshift = (__v2du) vec_splats ((unsigned int) __B); 1539 1540 result = vec_vsld ((__v2di) __A, lshift); 1541 } 1542 1543 return (__m128i) result; 1544 } 1545 #endif 1546 1547 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1548 _mm_srai_epi16 (__m128i __A, int __B) 1549 { 1550 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1551 
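  /* Arithmetic shifts by 16 or more fill with the sign bit, so the
     count defaults to (and is effectively capped at) 15.  */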
__v8hi result; 1552 1553 if (__B < 16) 1554 { 1555 if (__builtin_constant_p(__B)) 1556 rshift = (__v8hu) vec_splat_s16(__B); 1557 else 1558 rshift = vec_splats ((unsigned short) __B); 1559 } 1560 result = vec_vsrah ((__v8hi) __A, rshift); 1561 1562 return (__m128i) result; 1563 } 1564 1565 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1566 _mm_srai_epi32 (__m128i __A, int __B) 1567 { 1568 __v4su rshift = { 31, 31, 31, 31 }; 1569 __v4si result; 1570 1571 if (__B < 32) 1572 { 1573 if (__builtin_constant_p(__B)) 1574 { 1575 if (__B < 16) 1576 rshift = (__v4su) vec_splat_s32(__B); 1577 else 1578 rshift = (__v4su) vec_splats((unsigned int)__B); 1579 } 1580 else 1581 rshift = vec_splats ((unsigned int) __B); 1582 } 1583 result = vec_vsraw ((__v4si) __A, rshift); 1584 1585 return (__m128i) result; 1586 } 1587 1588 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1589 _mm_bslli_si128 (__m128i __A, const int __N) 1590 { 1591 __v16qu result; 1592 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1593 1594 if (__N < 16) 1595 result = vec_sld ((__v16qu) __A, zeros, __N); 1596 else 1597 result = zeros; 1598 1599 return (__m128i) result; 1600 } 1601 1602 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1603 _mm_bsrli_si128 (__m128i __A, const int __N) 1604 { 1605 __v16qu result; 1606 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1607 1608 if (__N < 16) 1609 if (__builtin_constant_p(__N)) 1610 /* Would like to use Vector Shift Left Double by Octet 1611 Immediate here to use the immediate form and avoid 1612 load of __N * 8 value into a separate VR. */ 1613 result = vec_sld (zeros, (__v16qu) __A, (16 - __N)); 1614 else 1615 { 1616 __v16qu shift = vec_splats((unsigned char)(__N*8)); 1617 result = vec_sro ((__v16qu)__A, shift); 1618 } 1619 else 1620 result = zeros; 1621 1622 return (__m128i) result; 1623 } 1624 1625 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1626 _mm_srli_si128 (__m128i __A, const int __N) 1627 { 1628 return _mm_bsrli_si128 (__A, __N); 1629 } 1630 1631 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1632 _mm_slli_si128 (__m128i __A, const int _imm5) 1633 { 1634 __v16qu result; 1635 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1636 1637 if (_imm5 < 16) 1638 #ifdef __LITTLE_ENDIAN__ 1639 result = vec_sld ((__v16qu) __A, zeros, _imm5); 1640 #elif __BIG_ENDIAN__ 1641 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5)); 1642 #endif 1643 else 1644 result = zeros; 1645 1646 return (__m128i) result; 1647 } 1648 1649 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1650 1651 _mm_srli_epi16 (__m128i __A, int __B) 1652 { 1653 __v8hu rshift; 1654 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; 1655 1656 if (__B < 16) 1657 { 1658 if (__builtin_constant_p(__B)) 1659 rshift = (__v8hu) vec_splat_s16(__B); 1660 else 1661 rshift = vec_splats ((unsigned short) __B); 1662 1663 result = vec_vsrh ((__v8hi) __A, rshift); 1664 } 1665 1666 return (__m128i) result; 1667 } 1668 1669 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1670 _mm_srli_epi32 (__m128i __A, int __B) 1671 { 1672 __v4su rshift; 1673 __v4si result = { 0, 0, 0, 0 }; 1674 1675 if (__B < 32) 1676 { 1677 if (__builtin_constant_p(__B)) 1678 { 1679 if (__B < 16) 1680 rshift = 
(__v4su) vec_splat_s32(__B); 1681 else 1682 rshift = (__v4su) vec_splats((unsigned int)__B); 1683 } 1684 else 1685 rshift = vec_splats ((unsigned int) __B); 1686 1687 result = vec_vsrw ((__v4si) __A, rshift); 1688 } 1689 1690 return (__m128i) result; 1691 } 1692 1693 #ifdef _ARCH_PWR8 1694 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1695 _mm_srli_epi64 (__m128i __A, int __B) 1696 { 1697 __v2du rshift; 1698 __v2di result = { 0, 0 }; 1699 1700 if (__B < 64) 1701 { 1702 if (__builtin_constant_p(__B)) 1703 { 1704 if (__B < 16) 1705 rshift = (__v2du) vec_splat_s32(__B); 1706 else 1707 rshift = (__v2du) vec_splats((unsigned long long)__B); 1708 } 1709 else 1710 rshift = (__v2du) vec_splats ((unsigned int) __B); 1711 1712 result = vec_vsrd ((__v2di) __A, rshift); 1713 } 1714 1715 return (__m128i) result; 1716 } 1717 #endif 1718 1719 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1720 _mm_sll_epi16 (__m128i __A, __m128i __B) 1721 { 1722 __v8hu lshift, shmask; 1723 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1724 __v8hu result; 1725 1726 #ifdef __LITTLE_ENDIAN__ 1727 lshift = vec_splat ((__v8hu)__B, 0); 1728 #elif __BIG_ENDIAN__ 1729 lshift = vec_splat ((__v8hu)__B, 3); 1730 #endif 1731 shmask = lshift <= shmax; 1732 result = vec_vslh ((__v8hu) __A, lshift); 1733 result = vec_sel (shmask, result, shmask); 1734 1735 return (__m128i) result; 1736 } 1737 1738 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1739 _mm_sll_epi32 (__m128i __A, __m128i __B) 1740 { 1741 __v4su lshift, shmask; 1742 const __v4su shmax = { 32, 32, 32, 32 }; 1743 __v4su result; 1744 #ifdef __LITTLE_ENDIAN__ 1745 lshift = vec_splat ((__v4su)__B, 0); 1746 #elif __BIG_ENDIAN__ 1747 lshift = vec_splat ((__v4su)__B, 1); 1748 #endif 1749 shmask = lshift < shmax; 1750 result = vec_vslw ((__v4su) __A, lshift); 1751 result = vec_sel (shmask, result, shmask); 1752 1753 return (__m128i) result; 1754 } 1755 1756 #ifdef _ARCH_PWR8 1757 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1758 _mm_sll_epi64 (__m128i __A, __m128i __B) 1759 { 1760 __v2du lshift, shmask; 1761 const __v2du shmax = { 64, 64 }; 1762 __v2du result; 1763 1764 lshift = (__v2du) vec_splat ((__v2du)__B, 0); 1765 shmask = lshift < shmax; 1766 result = vec_vsld ((__v2du) __A, lshift); 1767 result = (__v2du) vec_sel ((__v2df) shmask, (__v2df) result, 1768 (__v2df) shmask); 1769 1770 return (__m128i) result; 1771 } 1772 #endif 1773 1774 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1775 _mm_sra_epi16 (__m128i __A, __m128i __B) 1776 { 1777 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1778 __v8hu rshift; 1779 __v8hi result; 1780 1781 #ifdef __LITTLE_ENDIAN__ 1782 rshift = vec_splat ((__v8hu)__B, 0); 1783 #elif __BIG_ENDIAN__ 1784 rshift = vec_splat ((__v8hu)__B, 3); 1785 #endif 1786 rshift = vec_min (rshift, rshmax); 1787 result = vec_vsrah ((__v8hi) __A, rshift); 1788 1789 return (__m128i) result; 1790 } 1791 1792 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1793 _mm_sra_epi32 (__m128i __A, __m128i __B) 1794 { 1795 const __v4su rshmax = { 31, 31, 31, 31 }; 1796 __v4su rshift; 1797 __v4si result; 1798 1799 #ifdef __LITTLE_ENDIAN__ 1800 rshift = vec_splat ((__v4su)__B, 0); 1801 #elif __BIG_ENDIAN__ 1802 rshift = vec_splat ((__v4su)__B, 1); 1803 #endif 1804 rshift = vec_min (rshift, rshmax); 
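  /* Counts of 32 or more are clamped to 31, which still produces the
     sign-filled result the x86 shift defines.  */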

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  __v8hu lshift, shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  lshift = vec_splat ((__v8hu)__B, 0);
#elif __BIG_ENDIAN__
  lshift = vec_splat ((__v8hu)__B, 3);
#endif
  shmask = lshift <= shmax;
  result = vec_vslh ((__v8hu) __A, lshift);
  result = vec_sel (shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  __v4su lshift, shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;

#ifdef __LITTLE_ENDIAN__
  lshift = vec_splat ((__v4su)__B, 0);
#elif __BIG_ENDIAN__
  lshift = vec_splat ((__v4su)__B, 1);
#endif
  shmask = lshift < shmax;
  result = vec_vslw ((__v4su) __A, lshift);
  result = vec_sel (shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du lshift, shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  lshift = (__v2du) vec_splat ((__v2du)__B, 0);
  shmask = lshift < shmax;
  result = vec_vsld ((__v2du) __A, lshift);
  result = (__v2du) vec_sel ((__v2df) shmask, (__v2df) result,
                             (__v2df) shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu rshift;
  __v8hi result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 3);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_vsrah ((__v8hi) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su rshmax = { 31, 31, 31, 31 };
  __v4su rshift;
  __v4si result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 1);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_vsraw ((__v4si) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu rshift, shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 3);
#endif
  shmask = rshift <= shmax;
  result = vec_vsrh ((__v8hu) __A, rshift);
  result = vec_sel (shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su rshift, shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 1);
#endif
  shmask = rshift < shmax;
  result = vec_vsrw ((__v4su) __A, rshift);
  result = vec_sel (shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift, shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = (__v2du) vec_splat ((__v2du)__B, 0);
  shmask = rshift < shmax;
  result = vec_vsrd ((__v2du) __A, rshift);
  result = (__v2du)vec_sel ((__v2du)shmask, (__v2du)result, (__v2du)shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}
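
/* Example (illustrative only): as in SSE2, the andnot forms complement their
   FIRST operand, i.e. they compute (~A & B); the implementations above simply
   swap the arguments to vec_andc, which computes (first & ~second).  With
   hypothetical vectors mask and data:

     __m128i kept = _mm_andnot_si128 (mask, data);   // data with mask's bits cleared
*/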

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}
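
/* Example (illustrative only): the integer compares above return per-element
   masks of all ones (true) or all zeros (false), as on x86, so they combine
   directly with the logical operations.  A hypothetical clamp of negative
   elements to zero, where v is a hypothetical __m128i of shorts:

     __m128i gt  = _mm_cmpgt_epi16 (v, _mm_setzero_si128 ());
     __m128i pos = _mm_and_si128 (v, gt);   // keep only the positive elements
*/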

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi result = (__v8hi)__A;

  result [(__N & 7)] = __D;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask from the most significant bits of the 16 8-bit
   elements in A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector __m64 result;
  static const __vector unsigned char perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
#elif __BIG_ENDIAN__
      0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
      0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
#endif
    };

  result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
                                         (__vector unsigned char) perm_mask);

#ifdef __LITTLE_ENDIAN__
  return result[1];
#elif __BIG_ENDIAN__
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
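
/* Example (illustrative only): when _ARCH_PWR8 is available, the familiar x86
   compare/movemask byte-scanning idiom works unchanged.  A hypothetical scan
   of the 16 bytes at pointer p for a zero byte:

     __m128i chunk = _mm_loadu_si128 ((__m128i const *) p);
     int m = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, _mm_setzero_si128 ()));
     int idx = (m != 0) ? __builtin_ctz (m) : 16;   // index of first zero byte, 16 if none
*/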

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su w0, w1;
  __v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#elif __BIG_ENDIAN__
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_98 = __mask & 0x03;
  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#elif __BIG_ENDIAN__
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL, 0x1f1e1d1c1b1a1918UL};
#elif __BIG_ENDIAN__
      { 0x1011121314151617UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_98];
  t.as_short[1] = permute_selectors[element_selector_BA];
  t.as_short[2] = permute_selectors[element_selector_DC];
  t.as_short[3] = permute_selectors[element_selector_FE];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_98];
  t.as_short[2] = permute_selectors[element_selector_BA];
  t.as_short[1] = permute_selectors[element_selector_DC];
  t.as_short[0] = permute_selectors[element_selector_FE];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[1] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[0] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#elif __BIG_ENDIAN__
      0x0e0f, 0x0c0d, 0x0a0b, 0x0809
#endif
    };
  __v2du pmask = { 0x1011121314151617UL, 0x1f1e1d1c1b1a1918UL};
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[0] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[1] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#elif __BIG_ENDIAN__
      0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
#endif
    };
  __v4su t;

#ifdef __LITTLE_ENDIAN__
  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
#elif __BIG_ENDIAN__
  t[3] = permute_selectors[element_selector_10] + 0x10101010;
  t[2] = permute_selectors[element_selector_32] + 0x10101010;
  t[1] = permute_selectors[element_selector_54];
  t[0] = permute_selectors[element_selector_76];
#endif
  return (__m128i) vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
}
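
/* Example (illustrative only): as on x86, the 8-bit immediate encodes one
   2-bit source-element selector per destination element (element 0 in the
   low two bits).  With a hypothetical vector v:

     __m128i splat0 = _mm_shuffle_epi32 (v, 0x00);   // broadcast element 0
     __m128i rev    = _mm_shuffle_epi32 (v, 0x1B);   // reverse the four elements
*/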

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu mask, tmp;
  __m128i *p = (__m128i*)__C;

  tmp = (__v16qu)_mm_loadu_si128(p);
  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
  tmp = vec_sel (tmp, (__v16qu)__A, mask);
  _mm_storeu_si128 (p, (__m128i)tmp);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu a, b;
  __v16qu vmin, vmax, vabsdiff;
  __v4si vsum;
  const __v4su zero = { 0, 0, 0, 0 };
  __v4si result;

  a = (__v16qu) __A;
  b = (__v16qu) __B;
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with two integer results.  */
  result = vec_sum2s (vsum, (__vector signed int) zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  result = vec_sld (result, result, 4);
#elif __BIG_ENDIAN__
  result = vec_sld (result, result, 6);
#endif
  return (__m128i) result;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
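
/* Example (illustrative only): the casts above only reinterpret the 128-bit
   pattern; no value conversion is performed and typically no code is
   generated.  Contrast with a convert intrinsic such as _mm_cvtepi32_pd:

     __m128i bits = _mm_castpd_si128 (_mm_set1_pd (1.0));    // raw IEEE 754 bit pattern
     __m128d ones = _mm_cvtepi32_pd (_mm_set1_epi32 (1));    // numeric conversion, {1.0, 1.0}
*/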

#endif /* EMMINTRIN_H_ */