/* Copyright (C) 2003-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However, scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   important differences in the data format and placement of double
   scalars in the vector register.

   For PowerISA, a scalar double lives in an FPR (the leftmost 64 bits
   of the low 32 VSRs), while X86_64 SSE2 uses the rightmost 64 bits of
   the XMM register.  These differences require extra steps on POWER to
   match the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend that applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
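/* Illustrative sketch only (not part of the API): the advice above in
   practice.  A typical ported fragment such as

     __m128d x = _mm_set_sd (a);
     __m128d y = _mm_set_sd (b);
     __m128d z = _mm_add_sd (_mm_mul_sd (x, x), y);
     double  r = _mm_cvtsd_f64 (z);

   is usually better written as plain C, which the compiler can optimize
   for any target:

     double r = a * a + b;

   Likewise, instead of touching the x86 MXCSR directly, portable code
   can use the ISO C / POSIX <fenv.h> interfaces, for example
   fesetround (FE_TOWARDZERO) or fetestexcept (FE_ALL_EXCEPT).  */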
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Define two-value permute mask.  */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return (__m128d) vec_splats (0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  __v2df result = (__v2df) __A;
  result [0] = ((__v2df) __B)[0];
  return (__m128d) result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128d)vec_ld(0, (__v16qu*)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return (vec_splats (*__P));
}
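/* Illustrative note (a sketch, not part of the API): element order for
   the helpers above.  _mm_set_pd takes the upper element first, while
   _mm_setr_pd takes elements in memory order, so

     __m128d u = _mm_set_pd (hi, lo);      =>  u[0] == lo, u[1] == hi
     __m128d v = _mm_setr_pd (lo, hi);     =>  same layout as u

   Also note that _mm_load_pd above asserts 16-byte alignment of its
   operand; for possibly unaligned data (e.g. &arr[1] of a double
   array) _mm_loadu_pd is the appropriate form.  */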
/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __v2df __tmp = _mm_load_pd (__P);
  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_splat (__A, 0));
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.
*/ 261 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 262 _mm_cvtsi128_si64x (__m128i __A) 263 { 264 return ((__v2di)__A)[0]; 265 } 266 267 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 268 _mm_add_pd (__m128d __A, __m128d __B) 269 { 270 return (__m128d) ((__v2df)__A + (__v2df)__B); 271 } 272 273 /* Add the lower double-precision (64-bit) floating-point element in 274 a and b, store the result in the lower element of dst, and copy 275 the upper element from a to the upper element of dst. */ 276 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 277 _mm_add_sd (__m128d __A, __m128d __B) 278 { 279 __A[0] = __A[0] + __B[0]; 280 return (__A); 281 } 282 283 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 284 _mm_sub_pd (__m128d __A, __m128d __B) 285 { 286 return (__m128d) ((__v2df)__A - (__v2df)__B); 287 } 288 289 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 290 _mm_sub_sd (__m128d __A, __m128d __B) 291 { 292 __A[0] = __A[0] - __B[0]; 293 return (__A); 294 } 295 296 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 297 _mm_mul_pd (__m128d __A, __m128d __B) 298 { 299 return (__m128d) ((__v2df)__A * (__v2df)__B); 300 } 301 302 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 303 _mm_mul_sd (__m128d __A, __m128d __B) 304 { 305 __A[0] = __A[0] * __B[0]; 306 return (__A); 307 } 308 309 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 310 _mm_div_pd (__m128d __A, __m128d __B) 311 { 312 return (__m128d) ((__v2df)__A / (__v2df)__B); 313 } 314 315 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 316 _mm_div_sd (__m128d __A, __m128d __B) 317 { 318 __A[0] = __A[0] / __B[0]; 319 return (__A); 320 } 321 322 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 323 _mm_sqrt_pd (__m128d __A) 324 { 325 return (vec_sqrt (__A)); 326 } 327 328 /* Return pair {sqrt (B[0]), A[1]}. 
*/ 329 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 330 _mm_sqrt_sd (__m128d __A, __m128d __B) 331 { 332 __v2df c; 333 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); 334 return (__m128d) _mm_setr_pd (c[0], __A[1]); 335 } 336 337 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 338 _mm_min_pd (__m128d __A, __m128d __B) 339 { 340 return (vec_min (__A, __B)); 341 } 342 343 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 344 _mm_min_sd (__m128d __A, __m128d __B) 345 { 346 __v2df a, b, c; 347 a = vec_splats (__A[0]); 348 b = vec_splats (__B[0]); 349 c = vec_min (a, b); 350 return (__m128d) _mm_setr_pd (c[0], __A[1]); 351 } 352 353 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 354 _mm_max_pd (__m128d __A, __m128d __B) 355 { 356 return (vec_max (__A, __B)); 357 } 358 359 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 360 _mm_max_sd (__m128d __A, __m128d __B) 361 { 362 __v2df a, b, c; 363 a = vec_splats (__A[0]); 364 b = vec_splats (__B[0]); 365 c = vec_max (a, b); 366 return (__m128d) _mm_setr_pd (c[0], __A[1]); 367 } 368 369 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 370 _mm_cmpeq_pd (__m128d __A, __m128d __B) 371 { 372 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B)); 373 } 374 375 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 376 _mm_cmplt_pd (__m128d __A, __m128d __B) 377 { 378 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); 379 } 380 381 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 382 _mm_cmple_pd (__m128d __A, __m128d __B) 383 { 384 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 385 } 386 387 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 388 _mm_cmpgt_pd (__m128d __A, __m128d __B) 389 { 390 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 391 } 392 393 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 394 _mm_cmpge_pd (__m128d __A, __m128d __B) 395 { 396 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B)); 397 } 398 399 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 400 _mm_cmpneq_pd (__m128d __A, __m128d __B) 401 { 402 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); 403 return ((__m128d)vec_nor (temp, temp)); 404 } 405 406 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 407 _mm_cmpnlt_pd (__m128d __A, __m128d __B) 408 { 409 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B)); 410 } 411 412 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 413 _mm_cmpnle_pd (__m128d __A, __m128d __B) 414 { 415 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 416 } 417 418 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 419 _mm_cmpngt_pd (__m128d __A, __m128d __B) 420 { 421 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 422 } 423 424 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 425 _mm_cmpnge_pd (__m128d __A, __m128d __B) 426 { 427 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); 428 } 429 430 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq (__A, __A);
  d = (__v2du)vec_cmpeq (__B, __B);
#else
  __v2du a, b;
  __v2du c, d;
  const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
  a = (__v2du)vec_abs ((__v2df)__A);
  b = (__v2du)vec_abs ((__v2df)__B);
  c = (__v2du)vec_cmpgt (double_exp_mask, a);
  d = (__v2du)vec_cmpgt (double_exp_mask, b);
#endif
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(c, d));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  c = vec_nor (c, c);
  return ((__m128d)vec_orc(c, d));
#else
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Invert, so that true ('1's) marks a NAN.  */
  c = vec_nor (c, c);
  d = vec_nor (d, d);
  return ((__m128d)vec_or(c, d));
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd(__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
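/* Illustrative sketch (not part of the API): why the scalar compares
   here splat element [0] first.  A full-width compare such as
   vec_cmpeq ((__v2df) __A, (__v2df) __B) would also evaluate the upper
   doubles, and a NaN in an upper element could raise a spurious
   invalid-operation exception even though the SSE2 scalar form only
   defines the low-element compare.  Splatting element [0],

     __v2df a0 = vec_splats (__A[0]);
     __v2df b0 = vec_splats (__B[0]);
     __v2df eq = (__v2df) vec_cmpeq (a0, b0);

   makes every lane of the vector compare operate on the operands the
   intrinsic is defined over, and _mm_setr_pd (eq[0], __A[1]) then
   restores the original upper half of __A.  */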
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than is just greater than or equal.  */
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than is just less than or equal.  */
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than or equal is just less than.  */
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}
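/* Illustrative note, describing only this header's behavior (see the
   FIXME above): both the comi and ucomi forms reduce to plain C
   comparisons on element [0], which GCC for PowerPC compiles as
   unordered (non-signaling) compares.  For example:

     __m128d n = _mm_set_sd (__builtin_nan (""));
     __m128d x = _mm_set_sd (1.0);
     int r1 = _mm_comieq_sd (n, x);     =>  r1 == 0, no exception raised
     int r2 = _mm_ucomieq_sd (n, x);    =>  r2 == 0 as well

   Code that depends on the comi?? forms signaling an invalid-operation
   exception for QNaNs cannot rely on that with these implementations.  */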

/* Create a vector of Qi, where i is the element number.
*/ 682 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 683 _mm_set_epi64x (long long __q1, long long __q0) 684 { 685 return __extension__ (__m128i)(__v2di){ __q0, __q1 }; 686 } 687 688 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 689 _mm_set_epi64 (__m64 __q1, __m64 __q0) 690 { 691 return _mm_set_epi64x ((long long)__q1, (long long)__q0); 692 } 693 694 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 695 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) 696 { 697 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; 698 } 699 700 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 701 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, 702 short __q3, short __q2, short __q1, short __q0) 703 { 704 return __extension__ (__m128i)(__v8hi){ 705 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; 706 } 707 708 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 709 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, 710 char __q11, char __q10, char __q09, char __q08, 711 char __q07, char __q06, char __q05, char __q04, 712 char __q03, char __q02, char __q01, char __q00) 713 { 714 return __extension__ (__m128i)(__v16qi){ 715 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, 716 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 717 }; 718 } 719 720 /* Set all of the elements of the vector to A. */ 721 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 722 _mm_set1_epi64x (long long __A) 723 { 724 return _mm_set_epi64x (__A, __A); 725 } 726 727 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 728 _mm_set1_epi64 (__m64 __A) 729 { 730 return _mm_set_epi64 (__A, __A); 731 } 732 733 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 734 _mm_set1_epi32 (int __A) 735 { 736 return _mm_set_epi32 (__A, __A, __A, __A); 737 } 738 739 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 740 _mm_set1_epi16 (short __A) 741 { 742 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); 743 } 744 745 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 746 _mm_set1_epi8 (char __A) 747 { 748 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, 749 __A, __A, __A, __A, __A, __A, __A, __A); 750 } 751 752 /* Create a vector of Qi, where i is the element number. 753 The parameter order is reversed from the _mm_set_epi* functions. 
*/ 754 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 755 _mm_setr_epi64 (__m64 __q0, __m64 __q1) 756 { 757 return _mm_set_epi64 (__q1, __q0); 758 } 759 760 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 761 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) 762 { 763 return _mm_set_epi32 (__q3, __q2, __q1, __q0); 764 } 765 766 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 767 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, 768 short __q4, short __q5, short __q6, short __q7) 769 { 770 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); 771 } 772 773 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 774 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, 775 char __q04, char __q05, char __q06, char __q07, 776 char __q08, char __q09, char __q10, char __q11, 777 char __q12, char __q13, char __q14, char __q15) 778 { 779 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, 780 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); 781 } 782 783 /* Create a vector with element 0 as *P and the rest zero. */ 784 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 785 _mm_load_si128 (__m128i const *__P) 786 { 787 return *__P; 788 } 789 790 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 791 _mm_loadu_si128 (__m128i_u const *__P) 792 { 793 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); 794 } 795 796 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 797 _mm_loadl_epi64 (__m128i_u const *__P) 798 { 799 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P); 800 } 801 802 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 803 _mm_store_si128 (__m128i *__P, __m128i __B) 804 { 805 assert(((unsigned long )__P & 0xfUL) == 0UL); 806 vec_st ((__v16qu) __B, 0, (__v16qu*)__P); 807 } 808 809 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 810 _mm_storeu_si128 (__m128i_u *__P, __m128i __B) 811 { 812 *__P = __B; 813 } 814 815 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 816 _mm_storel_epi64 (__m128i_u *__P, __m128i __B) 817 { 818 *(long long *)__P = ((__v2di)__B)[0]; 819 } 820 821 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 822 _mm_movepi64_pi64 (__m128i_u __B) 823 { 824 return (__m64) ((__v2di)__B)[0]; 825 } 826 827 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 828 _mm_movpi64_epi64 (__m64 __A) 829 { 830 return _mm_set_epi64 ((__m64)0LL, __A); 831 } 832 833 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 834 _mm_move_epi64 (__m128i __A) 835 { 836 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]); 837 } 838 839 /* Create an undefined vector. */ 840 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 841 _mm_undefined_si128 (void) 842 { 843 __m128i __Y = __Y; 844 return __Y; 845 } 846 847 /* Create a vector of zeros. 
*/ 848 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 849 _mm_setzero_si128 (void) 850 { 851 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; 852 } 853 854 #ifdef _ARCH_PWR8 855 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 856 _mm_cvtepi32_pd (__m128i __A) 857 { 858 __v2di val; 859 /* For LE need to generate Vector Unpack Low Signed Word. 860 Which is generated from unpackh. */ 861 val = (__v2di)vec_unpackh ((__v4si)__A); 862 863 return (__m128d)vec_ctf (val, 0); 864 } 865 #endif 866 867 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 868 _mm_cvtepi32_ps (__m128i __A) 869 { 870 return ((__m128)vec_ctf((__v4si)__A, 0)); 871 } 872 873 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 874 _mm_cvtpd_epi32 (__m128d __A) 875 { 876 __v2df rounded = vec_rint (__A); 877 __v4si result, temp; 878 const __v4si vzero = 879 { 0, 0, 0, 0 }; 880 881 /* VSX Vector truncate Double-Precision to integer and Convert to 882 Signed Integer Word format with Saturate. */ 883 __asm__( 884 "xvcvdpsxws %x0,%x1" 885 : "=wa" (temp) 886 : "wa" (rounded) 887 : ); 888 889 #ifdef _ARCH_PWR8 890 #ifdef __LITTLE_ENDIAN__ 891 temp = vec_mergeo (temp, temp); 892 #else 893 temp = vec_mergee (temp, temp); 894 #endif 895 result = (__v4si) vec_vpkudum ((__vector long long) temp, 896 (__vector long long) vzero); 897 #else 898 { 899 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 900 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 901 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 902 } 903 #endif 904 return (__m128i) result; 905 } 906 907 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 908 _mm_cvtpd_pi32 (__m128d __A) 909 { 910 __m128i result = _mm_cvtpd_epi32(__A); 911 912 return (__m64) result[0]; 913 } 914 915 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 916 _mm_cvtpd_ps (__m128d __A) 917 { 918 __v4sf result; 919 __v4si temp; 920 const __v4si vzero = { 0, 0, 0, 0 }; 921 922 __asm__( 923 "xvcvdpsp %x0,%x1" 924 : "=wa" (temp) 925 : "wa" (__A) 926 : ); 927 928 #ifdef _ARCH_PWR8 929 #ifdef __LITTLE_ENDIAN__ 930 temp = vec_mergeo (temp, temp); 931 #else 932 temp = vec_mergee (temp, temp); 933 #endif 934 result = (__v4sf) vec_vpkudum ((__vector long long) temp, 935 (__vector long long) vzero); 936 #else 937 { 938 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 939 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 940 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 941 } 942 #endif 943 return ((__m128)result); 944 } 945 946 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 947 _mm_cvttpd_epi32 (__m128d __A) 948 { 949 __v4si result; 950 __v4si temp; 951 const __v4si vzero = { 0, 0, 0, 0 }; 952 953 /* VSX Vector truncate Double-Precision to integer and Convert to 954 Signed Integer Word format with Saturate. 
*/ 955 __asm__( 956 "xvcvdpsxws %x0,%x1" 957 : "=wa" (temp) 958 : "wa" (__A) 959 : ); 960 961 #ifdef _ARCH_PWR8 962 #ifdef __LITTLE_ENDIAN__ 963 temp = vec_mergeo (temp, temp); 964 #else 965 temp = vec_mergee (temp, temp); 966 #endif 967 result = (__v4si) vec_vpkudum ((__vector long long) temp, 968 (__vector long long) vzero); 969 #else 970 { 971 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 972 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 973 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 974 } 975 #endif 976 977 return ((__m128i) result); 978 } 979 980 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 981 _mm_cvttpd_pi32 (__m128d __A) 982 { 983 __m128i result = _mm_cvttpd_epi32 (__A); 984 985 return (__m64) result[0]; 986 } 987 988 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 989 _mm_cvtsi128_si32 (__m128i __A) 990 { 991 return ((__v4si)__A)[0]; 992 } 993 994 #ifdef _ARCH_PWR8 995 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 996 _mm_cvtpi32_pd (__m64 __A) 997 { 998 __v4si temp; 999 __v2di tmp2; 1000 __v2df result; 1001 1002 temp = (__v4si)vec_splats (__A); 1003 tmp2 = (__v2di)vec_unpackl (temp); 1004 result = vec_ctf ((__vector signed long long) tmp2, 0); 1005 return (__m128d)result; 1006 } 1007 #endif 1008 1009 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1010 _mm_cvtps_epi32 (__m128 __A) 1011 { 1012 __v4sf rounded; 1013 __v4si result; 1014 1015 rounded = vec_rint((__v4sf) __A); 1016 result = vec_cts (rounded, 0); 1017 return (__m128i) result; 1018 } 1019 1020 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1021 _mm_cvttps_epi32 (__m128 __A) 1022 { 1023 __v4si result; 1024 1025 result = vec_cts ((__v4sf) __A, 0); 1026 return (__m128i) result; 1027 } 1028 1029 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1030 _mm_cvtps_pd (__m128 __A) 1031 { 1032 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */ 1033 #ifdef vec_doubleh 1034 return (__m128d) vec_doubleh ((__v4sf)__A); 1035 #else 1036 /* Otherwise the compiler is not current and so need to generate the 1037 equivalent code. */ 1038 __v4sf a = (__v4sf)__A; 1039 __v4sf temp; 1040 __v2df result; 1041 #ifdef __LITTLE_ENDIAN__ 1042 /* The input float values are in elements {[0], [1]} but the convert 1043 instruction needs them in elements {[1], [3]}, So we use two 1044 shift left double vector word immediates to get the elements 1045 lined up. */ 1046 temp = __builtin_vsx_xxsldwi (a, a, 3); 1047 temp = __builtin_vsx_xxsldwi (a, temp, 2); 1048 #else 1049 /* The input float values are in elements {[0], [1]} but the convert 1050 instruction needs them in elements {[0], [2]}, So we use two 1051 shift left double vector word immediates to get the elements 1052 lined up. */ 1053 temp = vec_vmrghw (a, a); 1054 #endif 1055 __asm__( 1056 " xvcvspdp %x0,%x1" 1057 : "=wa" (result) 1058 : "wa" (temp) 1059 : ); 1060 return (__m128d) result; 1061 #endif 1062 } 1063 1064 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1065 _mm_cvtsd_si32 (__m128d __A) 1066 { 1067 __v2df rounded = vec_rint((__v2df) __A); 1068 int result = ((__v2df)rounded)[0]; 1069 1070 return result; 1071 } 1072 /* Intel intrinsic. 
*/ 1073 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1074 _mm_cvtsd_si64 (__m128d __A) 1075 { 1076 __v2df rounded = vec_rint ((__v2df) __A ); 1077 long long result = ((__v2df) rounded)[0]; 1078 1079 return result; 1080 } 1081 1082 /* Microsoft intrinsic. */ 1083 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1084 _mm_cvtsd_si64x (__m128d __A) 1085 { 1086 return _mm_cvtsd_si64 ((__v2df)__A); 1087 } 1088 1089 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1090 _mm_cvttsd_si32 (__m128d __A) 1091 { 1092 int result = ((__v2df)__A)[0]; 1093 1094 return result; 1095 } 1096 1097 /* Intel intrinsic. */ 1098 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1099 _mm_cvttsd_si64 (__m128d __A) 1100 { 1101 long long result = ((__v2df)__A)[0]; 1102 1103 return result; 1104 } 1105 1106 /* Microsoft intrinsic. */ 1107 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1108 _mm_cvttsd_si64x (__m128d __A) 1109 { 1110 return _mm_cvttsd_si64 (__A); 1111 } 1112 1113 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1114 _mm_cvtsd_ss (__m128 __A, __m128d __B) 1115 { 1116 __v4sf result = (__v4sf)__A; 1117 1118 #ifdef __LITTLE_ENDIAN__ 1119 __v4sf temp_s; 1120 /* Copy double element[0] to element [1] for conversion. */ 1121 __v2df temp_b = vec_splat((__v2df)__B, 0); 1122 1123 /* Pre-rotate __A left 3 (logically right 1) elements. */ 1124 result = __builtin_vsx_xxsldwi (result, result, 3); 1125 /* Convert double to single float scalar in a vector. */ 1126 __asm__( 1127 "xscvdpsp %x0,%x1" 1128 : "=wa" (temp_s) 1129 : "wa" (temp_b) 1130 : ); 1131 /* Shift the resulting scalar into vector element [0]. */ 1132 result = __builtin_vsx_xxsldwi (result, temp_s, 1); 1133 #else 1134 result [0] = ((__v2df)__B)[0]; 1135 #endif 1136 return (__m128) result; 1137 } 1138 1139 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1140 _mm_cvtsi32_sd (__m128d __A, int __B) 1141 { 1142 __v2df result = (__v2df)__A; 1143 double db = __B; 1144 result [0] = db; 1145 return (__m128d)result; 1146 } 1147 1148 /* Intel intrinsic. */ 1149 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1150 _mm_cvtsi64_sd (__m128d __A, long long __B) 1151 { 1152 __v2df result = (__v2df)__A; 1153 double db = __B; 1154 result [0] = db; 1155 return (__m128d)result; 1156 } 1157 1158 /* Microsoft intrinsic. */ 1159 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1160 _mm_cvtsi64x_sd (__m128d __A, long long __B) 1161 { 1162 return _mm_cvtsi64_sd (__A, __B); 1163 } 1164 1165 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1166 _mm_cvtss_sd (__m128d __A, __m128 __B) 1167 { 1168 #ifdef __LITTLE_ENDIAN__ 1169 /* Use splat to move element [0] into position for the convert. */ 1170 __v4sf temp = vec_splat ((__v4sf)__B, 0); 1171 __v2df res; 1172 /* Convert single float scalar to double in a vector. 
*/ 1173 __asm__( 1174 "xscvspdp %x0,%x1" 1175 : "=wa" (res) 1176 : "wa" (temp) 1177 : ); 1178 return (__m128d) vec_mergel (res, (__v2df)__A); 1179 #else 1180 __v2df res = (__v2df)__A; 1181 res [0] = ((__v4sf)__B) [0]; 1182 return (__m128d) res; 1183 #endif 1184 } 1185 1186 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1187 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) 1188 { 1189 __vector double result; 1190 const int litmsk = __mask & 0x3; 1191 1192 if (litmsk == 0) 1193 result = vec_mergeh (__A, __B); 1194 #if __GNUC__ < 6 1195 else if (litmsk == 1) 1196 result = vec_xxpermdi (__B, __A, 2); 1197 else if (litmsk == 2) 1198 result = vec_xxpermdi (__B, __A, 1); 1199 #else 1200 else if (litmsk == 1) 1201 result = vec_xxpermdi (__A, __B, 2); 1202 else if (litmsk == 2) 1203 result = vec_xxpermdi (__A, __B, 1); 1204 #endif 1205 else 1206 result = vec_mergel (__A, __B); 1207 1208 return result; 1209 } 1210 1211 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1212 _mm_unpackhi_pd (__m128d __A, __m128d __B) 1213 { 1214 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B); 1215 } 1216 1217 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1218 _mm_unpacklo_pd (__m128d __A, __m128d __B) 1219 { 1220 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B); 1221 } 1222 1223 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1224 _mm_loadh_pd (__m128d __A, double const *__B) 1225 { 1226 __v2df result = (__v2df)__A; 1227 result [1] = *__B; 1228 return (__m128d)result; 1229 } 1230 1231 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1232 _mm_loadl_pd (__m128d __A, double const *__B) 1233 { 1234 __v2df result = (__v2df)__A; 1235 result [0] = *__B; 1236 return (__m128d)result; 1237 } 1238 1239 #ifdef _ARCH_PWR8 1240 /* Intrinsic functions that require PowerISA 2.07 minimum. */ 1241 1242 /* Creates a 2-bit mask from the most significant bits of the DPFP values. 
*/ 1243 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1244 _mm_movemask_pd (__m128d __A) 1245 { 1246 __vector unsigned long long result; 1247 static const __vector unsigned int perm_mask = 1248 { 1249 #ifdef __LITTLE_ENDIAN__ 1250 0x80800040, 0x80808080, 0x80808080, 0x80808080 1251 #else 1252 0x80808080, 0x80808080, 0x80808080, 0x80804000 1253 #endif 1254 }; 1255 1256 result = ((__vector unsigned long long) 1257 vec_vbpermq ((__vector unsigned char) __A, 1258 (__vector unsigned char) perm_mask)); 1259 1260 #ifdef __LITTLE_ENDIAN__ 1261 return result[1]; 1262 #else 1263 return result[0]; 1264 #endif 1265 } 1266 #endif /* _ARCH_PWR8 */ 1267 1268 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1269 _mm_packs_epi16 (__m128i __A, __m128i __B) 1270 { 1271 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B); 1272 } 1273 1274 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1275 _mm_packs_epi32 (__m128i __A, __m128i __B) 1276 { 1277 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B); 1278 } 1279 1280 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1281 _mm_packus_epi16 (__m128i __A, __m128i __B) 1282 { 1283 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B); 1284 } 1285 1286 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1287 _mm_unpackhi_epi8 (__m128i __A, __m128i __B) 1288 { 1289 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B); 1290 } 1291 1292 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1293 _mm_unpackhi_epi16 (__m128i __A, __m128i __B) 1294 { 1295 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B); 1296 } 1297 1298 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1299 _mm_unpackhi_epi32 (__m128i __A, __m128i __B) 1300 { 1301 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B); 1302 } 1303 1304 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1305 _mm_unpackhi_epi64 (__m128i __A, __m128i __B) 1306 { 1307 return (__m128i) vec_mergel ((__vector long long) __A, 1308 (__vector long long) __B); 1309 } 1310 1311 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1312 _mm_unpacklo_epi8 (__m128i __A, __m128i __B) 1313 { 1314 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B); 1315 } 1316 1317 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1318 _mm_unpacklo_epi16 (__m128i __A, __m128i __B) 1319 { 1320 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B); 1321 } 1322 1323 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1324 _mm_unpacklo_epi32 (__m128i __A, __m128i __B) 1325 { 1326 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B); 1327 } 1328 1329 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1330 _mm_unpacklo_epi64 (__m128i __A, __m128i __B) 1331 { 1332 return (__m128i) vec_mergeh ((__vector long long) __A, 1333 (__vector long long) __B); 1334 } 1335 1336 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1337 _mm_add_epi8 (__m128i __A, __m128i __B) 1338 { 1339 return (__m128i) ((__v16qu)__A + (__v16qu)__B); 1340 } 1341 1342 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1343 
_mm_add_epi16 (__m128i __A, __m128i __B) 1344 { 1345 return (__m128i) ((__v8hu)__A + (__v8hu)__B); 1346 } 1347 1348 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1349 _mm_add_epi32 (__m128i __A, __m128i __B) 1350 { 1351 return (__m128i) ((__v4su)__A + (__v4su)__B); 1352 } 1353 1354 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1355 _mm_add_epi64 (__m128i __A, __m128i __B) 1356 { 1357 return (__m128i) ((__v2du)__A + (__v2du)__B); 1358 } 1359 1360 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1361 _mm_adds_epi8 (__m128i __A, __m128i __B) 1362 { 1363 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B); 1364 } 1365 1366 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1367 _mm_adds_epi16 (__m128i __A, __m128i __B) 1368 { 1369 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B); 1370 } 1371 1372 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1373 _mm_adds_epu8 (__m128i __A, __m128i __B) 1374 { 1375 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B); 1376 } 1377 1378 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1379 _mm_adds_epu16 (__m128i __A, __m128i __B) 1380 { 1381 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B); 1382 } 1383 1384 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1385 _mm_sub_epi8 (__m128i __A, __m128i __B) 1386 { 1387 return (__m128i) ((__v16qu)__A - (__v16qu)__B); 1388 } 1389 1390 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1391 _mm_sub_epi16 (__m128i __A, __m128i __B) 1392 { 1393 return (__m128i) ((__v8hu)__A - (__v8hu)__B); 1394 } 1395 1396 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1397 _mm_sub_epi32 (__m128i __A, __m128i __B) 1398 { 1399 return (__m128i) ((__v4su)__A - (__v4su)__B); 1400 } 1401 1402 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1403 _mm_sub_epi64 (__m128i __A, __m128i __B) 1404 { 1405 return (__m128i) ((__v2du)__A - (__v2du)__B); 1406 } 1407 1408 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1409 _mm_subs_epi8 (__m128i __A, __m128i __B) 1410 { 1411 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B); 1412 } 1413 1414 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1415 _mm_subs_epi16 (__m128i __A, __m128i __B) 1416 { 1417 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B); 1418 } 1419 1420 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1421 _mm_subs_epu8 (__m128i __A, __m128i __B) 1422 { 1423 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B); 1424 } 1425 1426 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1427 _mm_subs_epu16 (__m128i __A, __m128i __B) 1428 { 1429 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B); 1430 } 1431 1432 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1433 _mm_madd_epi16 (__m128i __A, __m128i __B) 1434 { 1435 __vector signed int zero = {0, 0, 0, 0}; 1436 1437 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero); 1438 } 1439 1440 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1441 _mm_mulhi_epi16 
(__m128i __A, __m128i __B) 1442 { 1443 __vector signed int w0, w1; 1444 1445 __vector unsigned char xform1 = { 1446 #ifdef __LITTLE_ENDIAN__ 1447 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 1448 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 1449 #else 1450 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 1451 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D 1452 #endif 1453 }; 1454 1455 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); 1456 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); 1457 return (__m128i) vec_perm (w0, w1, xform1); 1458 } 1459 1460 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1461 _mm_mullo_epi16 (__m128i __A, __m128i __B) 1462 { 1463 return (__m128i) ((__v8hi)__A * (__v8hi)__B); 1464 } 1465 1466 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1467 _mm_mul_su32 (__m64 __A, __m64 __B) 1468 { 1469 unsigned int a = __A; 1470 unsigned int b = __B; 1471 1472 return ((__m64)a * (__m64)b); 1473 } 1474 1475 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1476 _mm_mul_epu32 (__m128i __A, __m128i __B) 1477 { 1478 #if __GNUC__ < 8 1479 __v2du result; 1480 1481 #ifdef __LITTLE_ENDIAN__ 1482 /* VMX Vector Multiply Odd Unsigned Word. */ 1483 __asm__( 1484 "vmulouw %0,%1,%2" 1485 : "=v" (result) 1486 : "v" (__A), "v" (__B) 1487 : ); 1488 #else 1489 /* VMX Vector Multiply Even Unsigned Word. */ 1490 __asm__( 1491 "vmuleuw %0,%1,%2" 1492 : "=v" (result) 1493 : "v" (__A), "v" (__B) 1494 : ); 1495 #endif 1496 return (__m128i) result; 1497 #else 1498 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); 1499 #endif 1500 } 1501 1502 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1503 _mm_slli_epi16 (__m128i __A, int __B) 1504 { 1505 __v8hu lshift; 1506 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; 1507 1508 if (__B >= 0 && __B < 16) 1509 { 1510 if (__builtin_constant_p(__B)) 1511 lshift = (__v8hu) vec_splat_s16(__B); 1512 else 1513 lshift = vec_splats ((unsigned short) __B); 1514 1515 result = vec_sl ((__v8hi) __A, lshift); 1516 } 1517 1518 return (__m128i) result; 1519 } 1520 1521 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1522 _mm_slli_epi32 (__m128i __A, int __B) 1523 { 1524 __v4su lshift; 1525 __v4si result = { 0, 0, 0, 0 }; 1526 1527 if (__B >= 0 && __B < 32) 1528 { 1529 if (__builtin_constant_p(__B) && __B < 16) 1530 lshift = (__v4su) vec_splat_s32(__B); 1531 else 1532 lshift = vec_splats ((unsigned int) __B); 1533 1534 result = vec_sl ((__v4si) __A, lshift); 1535 } 1536 1537 return (__m128i) result; 1538 } 1539 1540 #ifdef _ARCH_PWR8 1541 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1542 _mm_slli_epi64 (__m128i __A, int __B) 1543 { 1544 __v2du lshift; 1545 __v2di result = { 0, 0 }; 1546 1547 if (__B >= 0 && __B < 64) 1548 { 1549 if (__builtin_constant_p(__B) && __B < 16) 1550 lshift = (__v2du) vec_splat_s32(__B); 1551 else 1552 lshift = (__v2du) vec_splats ((unsigned int) __B); 1553 1554 result = vec_sl ((__v2di) __A, lshift); 1555 } 1556 1557 return (__m128i) result; 1558 } 1559 #endif 1560 1561 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1562 _mm_srai_epi16 (__m128i __A, int __B) 1563 { 1564 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1565 __v8hi result; 1566 1567 if (__B < 16) 1568 { 1569 if (__builtin_constant_p(__B)) 1570 rshift = (__v8hu) vec_splat_s16(__B); 1571 else 
1572 rshift = vec_splats ((unsigned short) __B); 1573 } 1574 result = vec_sra ((__v8hi) __A, rshift); 1575 1576 return (__m128i) result; 1577 } 1578 1579 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1580 _mm_srai_epi32 (__m128i __A, int __B) 1581 { 1582 __v4su rshift = { 31, 31, 31, 31 }; 1583 __v4si result; 1584 1585 if (__B < 32) 1586 { 1587 if (__builtin_constant_p(__B)) 1588 { 1589 if (__B < 16) 1590 rshift = (__v4su) vec_splat_s32(__B); 1591 else 1592 rshift = (__v4su) vec_splats((unsigned int)__B); 1593 } 1594 else 1595 rshift = vec_splats ((unsigned int) __B); 1596 } 1597 result = vec_sra ((__v4si) __A, rshift); 1598 1599 return (__m128i) result; 1600 } 1601 1602 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1603 _mm_bslli_si128 (__m128i __A, const int __N) 1604 { 1605 __v16qu result; 1606 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1607 1608 if (__N < 16) 1609 result = vec_sld ((__v16qu) __A, zeros, __N); 1610 else 1611 result = zeros; 1612 1613 return (__m128i) result; 1614 } 1615 1616 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1617 _mm_bsrli_si128 (__m128i __A, const int __N) 1618 { 1619 __v16qu result; 1620 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1621 1622 if (__N < 16) 1623 #ifdef __LITTLE_ENDIAN__ 1624 if (__builtin_constant_p(__N)) 1625 /* Would like to use Vector Shift Left Double by Octet 1626 Immediate here to use the immediate form and avoid 1627 load of __N * 8 value into a separate VR. */ 1628 result = vec_sld (zeros, (__v16qu) __A, (16 - __N)); 1629 else 1630 #endif 1631 { 1632 __v16qu shift = vec_splats((unsigned char)(__N*8)); 1633 #ifdef __LITTLE_ENDIAN__ 1634 result = vec_sro ((__v16qu)__A, shift); 1635 #else 1636 result = vec_slo ((__v16qu)__A, shift); 1637 #endif 1638 } 1639 else 1640 result = zeros; 1641 1642 return (__m128i) result; 1643 } 1644 1645 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1646 _mm_srli_si128 (__m128i __A, const int __N) 1647 { 1648 return _mm_bsrli_si128 (__A, __N); 1649 } 1650 1651 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1652 _mm_slli_si128 (__m128i __A, const int _imm5) 1653 { 1654 __v16qu result; 1655 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1656 1657 if (_imm5 < 16) 1658 #ifdef __LITTLE_ENDIAN__ 1659 result = vec_sld ((__v16qu) __A, zeros, _imm5); 1660 #else 1661 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5)); 1662 #endif 1663 else 1664 result = zeros; 1665 1666 return (__m128i) result; 1667 } 1668 1669 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1670 1671 _mm_srli_epi16 (__m128i __A, int __B) 1672 { 1673 __v8hu rshift; 1674 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; 1675 1676 if (__B < 16) 1677 { 1678 if (__builtin_constant_p(__B)) 1679 rshift = (__v8hu) vec_splat_s16(__B); 1680 else 1681 rshift = vec_splats ((unsigned short) __B); 1682 1683 result = vec_sr ((__v8hi) __A, rshift); 1684 } 1685 1686 return (__m128i) result; 1687 } 1688 1689 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1690 _mm_srli_epi32 (__m128i __A, int __B) 1691 { 1692 __v4su rshift; 1693 __v4si result = { 0, 0, 0, 0 }; 1694 1695 if (__B < 32) 1696 { 1697 if (__builtin_constant_p(__B)) 1698 { 1699 if (__B < 16) 1700 rshift = (__v4su) 
vec_splat_s32(__B); 1701 else 1702 rshift = (__v4su) vec_splats((unsigned int)__B); 1703 } 1704 else 1705 rshift = vec_splats ((unsigned int) __B); 1706 1707 result = vec_sr ((__v4si) __A, rshift); 1708 } 1709 1710 return (__m128i) result; 1711 } 1712 1713 #ifdef _ARCH_PWR8 1714 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1715 _mm_srli_epi64 (__m128i __A, int __B) 1716 { 1717 __v2du rshift; 1718 __v2di result = { 0, 0 }; 1719 1720 if (__B < 64) 1721 { 1722 if (__builtin_constant_p(__B)) 1723 { 1724 if (__B < 16) 1725 rshift = (__v2du) vec_splat_s32(__B); 1726 else 1727 rshift = (__v2du) vec_splats((unsigned long long)__B); 1728 } 1729 else 1730 rshift = (__v2du) vec_splats ((unsigned int) __B); 1731 1732 result = vec_sr ((__v2di) __A, rshift); 1733 } 1734 1735 return (__m128i) result; 1736 } 1737 #endif 1738 1739 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1740 _mm_sll_epi16 (__m128i __A, __m128i __B) 1741 { 1742 __v8hu lshift; 1743 __vector __bool short shmask; 1744 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1745 __v8hu result; 1746 1747 #ifdef __LITTLE_ENDIAN__ 1748 lshift = vec_splat ((__v8hu) __B, 0); 1749 #else 1750 lshift = vec_splat ((__v8hu) __B, 3); 1751 #endif 1752 shmask = vec_cmple (lshift, shmax); 1753 result = vec_sl ((__v8hu) __A, lshift); 1754 result = vec_sel ((__v8hu) shmask, result, shmask); 1755 1756 return (__m128i) result; 1757 } 1758 1759 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1760 _mm_sll_epi32 (__m128i __A, __m128i __B) 1761 { 1762 __v4su lshift; 1763 __vector __bool int shmask; 1764 const __v4su shmax = { 32, 32, 32, 32 }; 1765 __v4su result; 1766 #ifdef __LITTLE_ENDIAN__ 1767 lshift = vec_splat ((__v4su) __B, 0); 1768 #else 1769 lshift = vec_splat ((__v4su) __B, 1); 1770 #endif 1771 shmask = vec_cmplt (lshift, shmax); 1772 result = vec_sl ((__v4su) __A, lshift); 1773 result = vec_sel ((__v4su) shmask, result, shmask); 1774 1775 return (__m128i) result; 1776 } 1777 1778 #ifdef _ARCH_PWR8 1779 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1780 _mm_sll_epi64 (__m128i __A, __m128i __B) 1781 { 1782 __v2du lshift; 1783 __vector __bool long long shmask; 1784 const __v2du shmax = { 64, 64 }; 1785 __v2du result; 1786 1787 lshift = vec_splat ((__v2du) __B, 0); 1788 shmask = vec_cmplt (lshift, shmax); 1789 result = vec_sl ((__v2du) __A, lshift); 1790 result = vec_sel ((__v2du) shmask, result, shmask); 1791 1792 return (__m128i) result; 1793 } 1794 #endif 1795 1796 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1797 _mm_sra_epi16 (__m128i __A, __m128i __B) 1798 { 1799 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1800 __v8hu rshift; 1801 __v8hi result; 1802 1803 #ifdef __LITTLE_ENDIAN__ 1804 rshift = vec_splat ((__v8hu)__B, 0); 1805 #else 1806 rshift = vec_splat ((__v8hu)__B, 3); 1807 #endif 1808 rshift = vec_min (rshift, rshmax); 1809 result = vec_sra ((__v8hi) __A, rshift); 1810 1811 return (__m128i) result; 1812 } 1813 1814 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1815 _mm_sra_epi32 (__m128i __A, __m128i __B) 1816 { 1817 const __v4su rshmax = { 31, 31, 31, 31 }; 1818 __v4su rshift; 1819 __v4si result; 1820 1821 #ifdef __LITTLE_ENDIAN__ 1822 rshift = vec_splat ((__v4su)__B, 0); 1823 #else 1824 rshift = vec_splat ((__v4su)__B, 1); 1825 #endif 1826 rshift = vec_min 
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu rshift;
  __vector __bool short shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu) __B, 0);
#else
  rshift = vec_splat ((__v8hu) __B, 3);
#endif
  shmask = vec_cmple (rshift, shmax);
  result = vec_sr ((__v8hu) __A, rshift);
  result = vec_sel ((__v8hu) shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su rshift;
  __vector __bool int shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su) __B, 0);
#else
  rshift = vec_splat ((__v4su) __B, 1);
#endif
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v4su) __A, rshift);
  result = vec_sel ((__v4su) shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v2du) __A, rshift);
  result = vec_sel ((__v2du) shmask, result, shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}
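
/* Usage sketch (illustrative only, kept as a comment): the andnot
   intrinsics complement their FIRST operand, i.e. _mm_andnot_si128 (A, B)
   computes (~A) & B, which is why the implementations above pass the
   operands to vec_andc in reverse order.  The values below are assumptions
   for illustration:

     __m128i mask  = _mm_set1_epi32 (0x000000FF);
     __m128i bytes = _mm_set1_epi32 (0x12345678);
     __m128i low   = _mm_and_si128 (mask, bytes);     // 0x00000078 elements
     __m128i high  = _mm_andnot_si128 (mask, bytes);  // 0x12345600 elements
*/
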
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi result = (__v8hi)__A;

  result [(__N & 7)] = __D;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}
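
/* Usage sketch (illustrative only, kept as a comment): the integer compares
   above return element-wise masks of all-ones (true) or all-zeros (false),
   so a branch-free select can be built from them with the logical
   intrinsics.  Here a and b are assumed __m128i values; the result is
   equivalent to _mm_max_epi16 (a, b):

     __m128i gt  = _mm_cmpgt_epi16 (a, b);
     __m128i max = _mm_or_si128 (_mm_and_si128 (gt, a),
                                 _mm_andnot_si128 (gt, b));
*/
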
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask whose bit I is the most significant bit of the
   I-th byte element of A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long result;
  static const __vector unsigned char perm_mask =
    {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  result = ((__vector unsigned long long)
            vec_vbpermq ((__vector unsigned char) __A,
                         (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
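
/* Usage sketch (illustrative only, kept as a comment): a common pattern
   combines _mm_cmpeq_epi8 with _mm_movemask_epi8 to locate a byte.  The
   helper name find_byte is an assumption for illustration; it returns the
   index of the first byte of block equal to c, or 16 if there is none:

     static inline int
     find_byte (__m128i block, char c)
     {
       __m128i eq = _mm_cmpeq_epi8 (block, _mm_set1_epi8 (c));
       int mask = _mm_movemask_epi8 (eq);
       return mask ? __builtin_ctz (mask) : 16;
     }
*/
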
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su w0, w1;
  __v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_98 = __mask & 0x03;
  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL, 0UL};
#else
      { 0x1011121314151617UL, 0UL};
#endif
  __m64_union t;
  __v2du a, r;

  t.as_short[0] = permute_selectors[element_selector_98];
  t.as_short[1] = permute_selectors[element_selector_BA];
  t.as_short[2] = permute_selectors[element_selector_DC];
  t.as_short[3] = permute_selectors[element_selector_FE];
  pmask[1] = t.as_m64;
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0UL, 0x1f1e1d1c1b1a1918UL};
#else
      { 0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union t;
  __v2du a, r;

  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
  pmask[0] = t.as_m64;
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __v4su t;

  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
}
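
/* Usage sketch (illustrative only, kept as a comment): the __mask argument
   of the shuffle intrinsics packs four 2-bit source-element selectors,
   least significant pair first, as produced by the _MM_SHUFFLE macro from
   xmmintrin.h.  The values below are assumptions for illustration:

     __m128i v     = _mm_set_epi32 (3, 2, 1, 0);   // elements 0..3 hold 0..3
     __m128i bcast = _mm_shuffle_epi32 (v, 0x00);  // element 0 broadcast
     __m128i rev   = _mm_shuffle_epi32 (v, 0x1B);  // element order reversed,
                                                   // 0x1B == _MM_SHUFFLE (0, 1, 2, 3)
*/
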
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu mask, tmp;
  __m128i_u *p = (__m128i_u*)__C;

  tmp = (__v16qu)_mm_loadu_si128(p);
  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
  tmp = vec_sel (tmp, (__v16qu)__A, mask);
  _mm_storeu_si128 (p, (__m128i)tmp);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu a, b;
  __v16qu vmin, vmax, vabsdiff;
  __v4si vsum;
  const __v4su zero = { 0, 0, 0, 0 };
  __v4si result;

  a = (__v16qu) __A;
  b = (__v16qu) __B;
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with two integer results.  */
  result = vec_sum2s (vsum, (__vector signed int) zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  result = vec_sld (result, result, 4);
#else
  result = vec_sld (result, result, 6);
#endif
  return (__m128i) result;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
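
/* Usage sketch (illustrative only, kept as a comment): the casts below
   reinterpret the 128-bit pattern without changing any bits, unlike the
   _mm_cvt* intrinsics, which convert values.  The values below are
   assumptions for illustration:

     __m128i ones = _mm_set1_epi32 (1);
     __m128d bits = _mm_castsi128_pd (ones);  // bit pattern kept; two subnormal doubles
     __m128d vals = _mm_cvtepi32_pd (ones);   // numeric conversion; { 1.0, 1.0 }
*/
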
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

#endif /* EMMINTRIN_H_ */