/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are also important
   differences in data format and placement of float scalars in the
   vector register.  For PowerISA, scalar floats in FPRs (the leftmost
   64 bits of the low 32 VSRs) are kept in double format, while X86_64
   SSE uses the rightmost 32 bits of the XMM register.  These
   differences require extra steps on POWER to match the SSE scalar
   float semantics.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* Define a four-element permute mask.  */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))

#include <altivec.h>

/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards.  This should eventually be done inside altivec.h itself,
   but only after testing a full distro build.  */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
				 (defined(__STDC_VERSION__) && \
				  __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif

#include <assert.h>

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>
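/* As a hedged illustration of the advice above (not part of this API):
   a scalar SSE sequence such as

     __m128 a = _mm_set_ss (x);
     __m128 b = _mm_set_ss (y);
     float  r = _mm_cvtss_f32 (_mm_add_ss (a, b));

   is usually better written as plain C, which is portable and lets the
   compiler pick the best scalar or vector instructions:

     float r = x + y;

   Likewise, code that manipulated the MXCSR rounding-mode bits directly
   can normally use the standard <fenv.h> interfaces instead, e.g.

     #include <fenv.h>
     int old = fegetround ();
     fesetround (FE_TOWARDZERO);
     ...
     fesetround (old);

   The exact replacement depends on the application; the snippets above
   are only a sketch.  */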
/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128)vec_ld(0, (__v4sf*)__P));
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp;
  __m128 result;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
	0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = vec_ld (0, (__v4sf *) __P);
  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
  return result;
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128 *)__P = __A;
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __tmp;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
	0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = (__m128) vec_perm (__A, __A, permute_vector);

  _mm_store_ps (__P, __tmp);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = vec_splat((__v4sf)__A, 0);
  _mm_store_ps (__P, __va);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a + b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}
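/* Illustrative sketch only (not part of the API): the _ss operations keep
   the upper three lanes of the FIRST operand, which is easy to overlook
   when porting.  The hypothetical example below shows that behavior.  */
#if 0
static void
__example_ss_passthrough (void)
{
  __m128 __a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
  __m128 __b = _mm_set_ps (40.0f, 30.0f, 20.0f, 10.0f);
  __m128 __c = _mm_add_ss (__a, __b);
  /* __c is { 11.0f, 2.0f, 3.0f, 4.0f }: only element 0 is the sum,
     elements 1-3 come from __a.  */
  (void) __c;
}
#endif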
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a - b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a * b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a / b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  c = vec_sqrt (a);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

/* Perform the respective operation on the four SPFP values in A and B.
*/ 345 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 346 _mm_add_ps (__m128 __A, __m128 __B) 347 { 348 return (__m128) ((__v4sf)__A + (__v4sf)__B); 349 } 350 351 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 352 _mm_sub_ps (__m128 __A, __m128 __B) 353 { 354 return (__m128) ((__v4sf)__A - (__v4sf)__B); 355 } 356 357 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 358 _mm_mul_ps (__m128 __A, __m128 __B) 359 { 360 return (__m128) ((__v4sf)__A * (__v4sf)__B); 361 } 362 363 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 364 _mm_div_ps (__m128 __A, __m128 __B) 365 { 366 return (__m128) ((__v4sf)__A / (__v4sf)__B); 367 } 368 369 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 370 _mm_sqrt_ps (__m128 __A) 371 { 372 return (vec_sqrt ((__v4sf)__A)); 373 } 374 375 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 376 _mm_rcp_ps (__m128 __A) 377 { 378 return (vec_re ((__v4sf)__A)); 379 } 380 381 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 382 _mm_rsqrt_ps (__m128 __A) 383 { 384 return (vec_rsqrte (__A)); 385 } 386 387 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 388 _mm_rcp_ss (__m128 __A) 389 { 390 __m128 a, c; 391 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; 392 /* PowerISA VSX does not allow partial (for just lower double) 393 * results. So to insure we don't generate spurious exceptions 394 * (from the upper double values) we splat the lower double 395 * before we to the operation. */ 396 a = vec_splat (__A, 0); 397 c = _mm_rcp_ps (a); 398 /* Then we merge the lower float result with the original upper 399 * float elements from __A. */ 400 return (vec_sel (__A, c, mask)); 401 } 402 403 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 404 _mm_rsqrt_ss (__m128 __A) 405 { 406 __m128 a, c; 407 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; 408 /* PowerISA VSX does not allow partial (for just lower double) 409 * results. So to insure we don't generate spurious exceptions 410 * (from the upper double values) we splat the lower double 411 * before we to the operation. */ 412 a = vec_splat (__A, 0); 413 c = vec_rsqrte (a); 414 /* Then we merge the lower float result with the original upper 415 * float elements from __A. */ 416 return (vec_sel (__A, c, mask)); 417 } 418 419 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 420 _mm_min_ss (__m128 __A, __m128 __B) 421 { 422 __v4sf a, b, c; 423 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; 424 /* PowerISA VSX does not allow partial (for just lower float) 425 * results. So to insure we don't generate spurious exceptions 426 * (from the upper float values) we splat the lower float 427 * before we to the operation. */ 428 a = vec_splat ((__v4sf)__A, 0); 429 b = vec_splat ((__v4sf)__B, 0); 430 c = vec_min (a, b); 431 /* Then we merge the lower float result with the original upper 432 * float elements from __A. 
*/ 433 return (vec_sel ((__v4sf)__A, c, mask)); 434 } 435 436 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 437 _mm_max_ss (__m128 __A, __m128 __B) 438 { 439 __v4sf a, b, c; 440 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; 441 /* PowerISA VSX does not allow partial (for just lower float) 442 * results. So to insure we don't generate spurious exceptions 443 * (from the upper float values) we splat the lower float 444 * before we to the operation. */ 445 a = vec_splat (__A, 0); 446 b = vec_splat (__B, 0); 447 c = vec_max (a, b); 448 /* Then we merge the lower float result with the original upper 449 * float elements from __A. */ 450 return (vec_sel ((__v4sf)__A, c, mask)); 451 } 452 453 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 454 _mm_min_ps (__m128 __A, __m128 __B) 455 { 456 __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __B, (__v4sf) __A); 457 return vec_sel (__B, __A, m); 458 } 459 460 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 461 _mm_max_ps (__m128 __A, __m128 __B) 462 { 463 __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __A, (__v4sf) __B); 464 return vec_sel (__B, __A, m); 465 } 466 467 /* Perform logical bit-wise operations on 128-bit values. */ 468 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 469 _mm_and_ps (__m128 __A, __m128 __B) 470 { 471 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B)); 472 // return __builtin_ia32_andps (__A, __B); 473 } 474 475 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 476 _mm_andnot_ps (__m128 __A, __m128 __B) 477 { 478 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A)); 479 } 480 481 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 482 _mm_or_ps (__m128 __A, __m128 __B) 483 { 484 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B)); 485 } 486 487 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 488 _mm_xor_ps (__m128 __A, __m128 __B) 489 { 490 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B)); 491 } 492 493 /* Perform a comparison on the four SPFP values of A and B. For each 494 element, if the comparison is true, place a mask of all ones in the 495 result, otherwise a mask of zeros. 
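
   As an illustrative sketch only, these all-ones / all-zeros masks are
   typically combined with the logical operations above to select
   elements without branches, e.g. (ignoring NaN corner cases):

     __m128 mask = _mm_cmplt_ps (a, b);
     __m128 min  = _mm_or_ps (_mm_and_ps (mask, a),
			      _mm_andnot_ps (mask, b));

   which yields the per-element minimum of a and b.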
*/ 496 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 497 _mm_cmpeq_ps (__m128 __A, __m128 __B) 498 { 499 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B)); 500 } 501 502 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 503 _mm_cmplt_ps (__m128 __A, __m128 __B) 504 { 505 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); 506 } 507 508 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 509 _mm_cmple_ps (__m128 __A, __m128 __B) 510 { 511 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); 512 } 513 514 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 515 _mm_cmpgt_ps (__m128 __A, __m128 __B) 516 { 517 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); 518 } 519 520 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 521 _mm_cmpge_ps (__m128 __A, __m128 __B) 522 { 523 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); 524 } 525 526 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 527 _mm_cmpneq_ps (__m128 __A, __m128 __B) 528 { 529 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); 530 return ((__m128)vec_nor (temp, temp)); 531 } 532 533 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 534 _mm_cmpnlt_ps (__m128 __A, __m128 __B) 535 { 536 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); 537 } 538 539 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 540 _mm_cmpnle_ps (__m128 __A, __m128 __B) 541 { 542 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); 543 } 544 545 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 546 _mm_cmpngt_ps (__m128 __A, __m128 __B) 547 { 548 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); 549 } 550 551 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 552 _mm_cmpnge_ps (__m128 __A, __m128 __B) 553 { 554 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); 555 } 556 557 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 558 _mm_cmpord_ps (__m128 __A, __m128 __B) 559 { 560 __vector unsigned int a, b; 561 __vector unsigned int c, d; 562 static const __vector unsigned int float_exp_mask = 563 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; 564 565 a = (__vector unsigned int) vec_abs ((__v4sf)__A); 566 b = (__vector unsigned int) vec_abs ((__v4sf)__B); 567 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); 568 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); 569 return ((__m128 ) vec_and (c, d)); 570 } 571 572 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 573 _mm_cmpunord_ps (__m128 __A, __m128 __B) 574 { 575 __vector unsigned int a, b; 576 __vector unsigned int c, d; 577 static const __vector unsigned int float_exp_mask = 578 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; 579 580 a = (__vector unsigned int) vec_abs ((__v4sf)__A); 581 b = (__vector unsigned int) vec_abs ((__v4sf)__B); 582 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); 583 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); 584 return ((__m128 ) vec_or (c, d)); 585 } 586 587 /* Perform a comparison on the lower SPFP values of A and B. If the 588 comparison is true, place a mask of all ones in the result, otherwise a 589 mask of zeros. 
The upper three SPFP values are passed through from A. */ 590 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 591 _mm_cmpeq_ss (__m128 __A, __m128 __B) 592 { 593 static const __vector unsigned int mask = 594 { 0xffffffff, 0, 0, 0 }; 595 __v4sf a, b, c; 596 /* PowerISA VMX does not allow partial (for just element 0) 597 * results. So to insure we don't generate spurious exceptions 598 * (from the upper elements) we splat the lower float 599 * before we to the operation. */ 600 a = vec_splat ((__v4sf) __A, 0); 601 b = vec_splat ((__v4sf) __B, 0); 602 c = (__v4sf) vec_cmpeq(a, b); 603 /* Then we merge the lower float result with the original upper 604 * float elements from __A. */ 605 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 606 } 607 608 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 609 _mm_cmplt_ss (__m128 __A, __m128 __B) 610 { 611 static const __vector unsigned int mask = 612 { 0xffffffff, 0, 0, 0 }; 613 __v4sf a, b, c; 614 /* PowerISA VMX does not allow partial (for just element 0) 615 * results. So to insure we don't generate spurious exceptions 616 * (from the upper elements) we splat the lower float 617 * before we to the operation. */ 618 a = vec_splat ((__v4sf) __A, 0); 619 b = vec_splat ((__v4sf) __B, 0); 620 c = (__v4sf) vec_cmplt(a, b); 621 /* Then we merge the lower float result with the original upper 622 * float elements from __A. */ 623 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 624 } 625 626 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 627 _mm_cmple_ss (__m128 __A, __m128 __B) 628 { 629 static const __vector unsigned int mask = 630 { 0xffffffff, 0, 0, 0 }; 631 __v4sf a, b, c; 632 /* PowerISA VMX does not allow partial (for just element 0) 633 * results. So to insure we don't generate spurious exceptions 634 * (from the upper elements) we splat the lower float 635 * before we to the operation. */ 636 a = vec_splat ((__v4sf) __A, 0); 637 b = vec_splat ((__v4sf) __B, 0); 638 c = (__v4sf) vec_cmple(a, b); 639 /* Then we merge the lower float result with the original upper 640 * float elements from __A. */ 641 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 642 } 643 644 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 645 _mm_cmpgt_ss (__m128 __A, __m128 __B) 646 { 647 static const __vector unsigned int mask = 648 { 0xffffffff, 0, 0, 0 }; 649 __v4sf a, b, c; 650 /* PowerISA VMX does not allow partial (for just element 0) 651 * results. So to insure we don't generate spurious exceptions 652 * (from the upper elements) we splat the lower float 653 * before we to the operation. */ 654 a = vec_splat ((__v4sf) __A, 0); 655 b = vec_splat ((__v4sf) __B, 0); 656 c = (__v4sf) vec_cmpgt(a, b); 657 /* Then we merge the lower float result with the original upper 658 * float elements from __A. */ 659 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 660 } 661 662 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 663 _mm_cmpge_ss (__m128 __A, __m128 __B) 664 { 665 static const __vector unsigned int mask = 666 { 0xffffffff, 0, 0, 0 }; 667 __v4sf a, b, c; 668 /* PowerISA VMX does not allow partial (for just element 0) 669 * results. So to insure we don't generate spurious exceptions 670 * (from the upper elements) we splat the lower float 671 * before we to the operation. 
*/ 672 a = vec_splat ((__v4sf) __A, 0); 673 b = vec_splat ((__v4sf) __B, 0); 674 c = (__v4sf) vec_cmpge(a, b); 675 /* Then we merge the lower float result with the original upper 676 * float elements from __A. */ 677 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 678 } 679 680 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 681 _mm_cmpneq_ss (__m128 __A, __m128 __B) 682 { 683 static const __vector unsigned int mask = 684 { 0xffffffff, 0, 0, 0 }; 685 __v4sf a, b, c; 686 /* PowerISA VMX does not allow partial (for just element 0) 687 * results. So to insure we don't generate spurious exceptions 688 * (from the upper elements) we splat the lower float 689 * before we to the operation. */ 690 a = vec_splat ((__v4sf) __A, 0); 691 b = vec_splat ((__v4sf) __B, 0); 692 c = (__v4sf) vec_cmpeq(a, b); 693 c = vec_nor (c, c); 694 /* Then we merge the lower float result with the original upper 695 * float elements from __A. */ 696 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 697 } 698 699 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 700 _mm_cmpnlt_ss (__m128 __A, __m128 __B) 701 { 702 static const __vector unsigned int mask = 703 { 0xffffffff, 0, 0, 0 }; 704 __v4sf a, b, c; 705 /* PowerISA VMX does not allow partial (for just element 0) 706 * results. So to insure we don't generate spurious exceptions 707 * (from the upper elements) we splat the lower float 708 * before we to the operation. */ 709 a = vec_splat ((__v4sf) __A, 0); 710 b = vec_splat ((__v4sf) __B, 0); 711 c = (__v4sf) vec_cmpge(a, b); 712 /* Then we merge the lower float result with the original upper 713 * float elements from __A. */ 714 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 715 } 716 717 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 718 _mm_cmpnle_ss (__m128 __A, __m128 __B) 719 { 720 static const __vector unsigned int mask = 721 { 0xffffffff, 0, 0, 0 }; 722 __v4sf a, b, c; 723 /* PowerISA VMX does not allow partial (for just element 0) 724 * results. So to insure we don't generate spurious exceptions 725 * (from the upper elements) we splat the lower float 726 * before we to the operation. */ 727 a = vec_splat ((__v4sf) __A, 0); 728 b = vec_splat ((__v4sf) __B, 0); 729 c = (__v4sf) vec_cmpgt(a, b); 730 /* Then we merge the lower float result with the original upper 731 * float elements from __A. */ 732 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 733 } 734 735 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 736 _mm_cmpngt_ss (__m128 __A, __m128 __B) 737 { 738 static const __vector unsigned int mask = 739 { 0xffffffff, 0, 0, 0 }; 740 __v4sf a, b, c; 741 /* PowerISA VMX does not allow partial (for just element 0) 742 * results. So to insure we don't generate spurious exceptions 743 * (from the upper elements) we splat the lower float 744 * before we to the operation. */ 745 a = vec_splat ((__v4sf) __A, 0); 746 b = vec_splat ((__v4sf) __B, 0); 747 c = (__v4sf) vec_cmple(a, b); 748 /* Then we merge the lower float result with the original upper 749 * float elements from __A. 
*/ 750 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 751 } 752 753 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 754 _mm_cmpnge_ss (__m128 __A, __m128 __B) 755 { 756 static const __vector unsigned int mask = 757 { 0xffffffff, 0, 0, 0 }; 758 __v4sf a, b, c; 759 /* PowerISA VMX does not allow partial (for just element 0) 760 * results. So to insure we don't generate spurious exceptions 761 * (from the upper elements) we splat the lower float 762 * before we do the operation. */ 763 a = vec_splat ((__v4sf) __A, 0); 764 b = vec_splat ((__v4sf) __B, 0); 765 c = (__v4sf) vec_cmplt(a, b); 766 /* Then we merge the lower float result with the original upper 767 * float elements from __A. */ 768 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); 769 } 770 771 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 772 _mm_cmpord_ss (__m128 __A, __m128 __B) 773 { 774 __vector unsigned int a, b; 775 __vector unsigned int c, d; 776 static const __vector unsigned int float_exp_mask = 777 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; 778 static const __vector unsigned int mask = 779 { 0xffffffff, 0, 0, 0 }; 780 781 a = (__vector unsigned int) vec_abs ((__v4sf)__A); 782 b = (__vector unsigned int) vec_abs ((__v4sf)__B); 783 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); 784 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); 785 c = vec_and (c, d); 786 /* Then we merge the lower float result with the original upper 787 * float elements from __A. */ 788 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); 789 } 790 791 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 792 _mm_cmpunord_ss (__m128 __A, __m128 __B) 793 { 794 __vector unsigned int a, b; 795 __vector unsigned int c, d; 796 static const __vector unsigned int float_exp_mask = 797 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; 798 static const __vector unsigned int mask = 799 { 0xffffffff, 0, 0, 0 }; 800 801 a = (__vector unsigned int) vec_abs ((__v4sf)__A); 802 b = (__vector unsigned int) vec_abs ((__v4sf)__B); 803 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); 804 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); 805 c = vec_or (c, d); 806 /* Then we merge the lower float result with the original upper 807 * float elements from __A. */ 808 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); 809 } 810 811 /* Compare the lower SPFP values of A and B and return 1 if true 812 and 0 if false. 
*/
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * the _mm_comi??_ss implementations because GCC for PowerPC only
 * generates unordered compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK, as is.
 */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  __m64 res = 0;
#ifdef _ARCH_PWR8
  __m128 vtmp;
  __asm__(
      "xxsldwi %x1,%x2,%x2,3;\n"
      "xscvspdp %x1,%x1;\n"
      "fctiw %1,%1;\n"
      "mfvsrd %0,%x1;\n"
      : "=r" (res),
	"=&wi" (vtmp)
      : "wa" (__A)
      : );
#else
  res = __builtin_rint(__A[0]);
#endif
  return (res);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}
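/* Illustrative sketch only (not part of the API): _mm_cvtss_si32 above
   converts using the current rounding mode, while _mm_cvttss_si32
   (defined further below) always truncates toward zero, so the two can
   disagree for non-integral values.  */
#if 0
static void
__example_cvt_vs_cvtt (void)
{
  __m128 __v = _mm_set_ss (2.7f);
  int __r = _mm_cvtss_si32 (__v);	/* 3 under the default
					   round-to-nearest-even mode.  */
  int __t = _mm_cvttss_si32 (__v);	/* 2 -- always truncates.  */
  (void) __r;
  (void) __t;
}
#endif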
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  __m64 res = 0;
#ifdef _ARCH_PWR8
  __m128 vtmp;
  __asm__(
      "xxsldwi %x1,%x2,%x2,3;\n"
      "xscvspdp %x1,%x1;\n"
      "fctid %1,%1;\n"
      "mfvsrd %0,%x1;\n"
      : "=r" (res),
	"=&wi" (vtmp)
      : "wa" (__A)
      : );
#else
  res = __builtin_llrint(__A[0]);
#endif
  return (res);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* Currently, PowerPC ignores the hint parameters.  */
  __builtin_prefetch (__P);
}

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf temp, rounded;
  __vector __m64 result;

  /* Splat two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  rounded = vec_rint(temp);
  result = (__vector __m64) vec_cts (rounded, 0);

  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to a 32-bit integer and return.  */
  return temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to a 64-bit integer and return.  */
  return temp;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to a 64-bit integer and return.  */
  return temp;
}

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.
*/ 1042 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1043 _mm_cvttps_pi32 (__m128 __A) 1044 { 1045 __v4sf temp; 1046 __vector __m64 result; 1047 1048 /* Splat two lower SPFP values to both halves. */ 1049 temp = (__v4sf) vec_splat ((__vector long long)__A, 0); 1050 result = (__vector __m64) vec_cts (temp, 0); 1051 1052 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0)); 1053 } 1054 1055 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1056 _mm_cvtt_ps2pi (__m128 __A) 1057 { 1058 return _mm_cvttps_pi32 (__A); 1059 } 1060 1061 /* Convert B to a SPFP value and insert it as element zero in A. */ 1062 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1063 _mm_cvtsi32_ss (__m128 __A, int __B) 1064 { 1065 float temp = __B; 1066 __A[0] = temp; 1067 1068 return __A; 1069 } 1070 1071 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1072 _mm_cvt_si2ss (__m128 __A, int __B) 1073 { 1074 return _mm_cvtsi32_ss (__A, __B); 1075 } 1076 1077 /* Convert B to a SPFP value and insert it as element zero in A. */ 1078 /* Intel intrinsic. */ 1079 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1080 _mm_cvtsi64_ss (__m128 __A, long long __B) 1081 { 1082 float temp = __B; 1083 __A[0] = temp; 1084 1085 return __A; 1086 } 1087 1088 /* Microsoft intrinsic. */ 1089 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1090 _mm_cvtsi64x_ss (__m128 __A, long long __B) 1091 { 1092 return _mm_cvtsi64_ss (__A, __B); 1093 } 1094 1095 /* Convert the two 32-bit values in B to SPFP form and insert them 1096 as the two lower elements in A. */ 1097 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1098 _mm_cvtpi32_ps (__m128 __A, __m64 __B) 1099 { 1100 __vector signed int vm1; 1101 __vector float vf1; 1102 1103 vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B); 1104 vf1 = (__vector float) vec_ctf (vm1, 0); 1105 1106 return ((__m128) (__vector __m64) 1107 { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]}); 1108 } 1109 1110 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1111 _mm_cvt_pi2ps (__m128 __A, __m64 __B) 1112 { 1113 return _mm_cvtpi32_ps (__A, __B); 1114 } 1115 1116 /* Convert the four signed 16-bit values in A to SPFP form. */ 1117 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1118 _mm_cvtpi16_ps (__m64 __A) 1119 { 1120 __vector signed short vs8; 1121 __vector signed int vi4; 1122 __vector float vf1; 1123 1124 vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A); 1125 vi4 = vec_vupklsh (vs8); 1126 vf1 = (__vector float) vec_ctf (vi4, 0); 1127 1128 return (__m128) vf1; 1129 } 1130 1131 /* Convert the four unsigned 16-bit values in A to SPFP form. 
*/ 1132 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1133 _mm_cvtpu16_ps (__m64 __A) 1134 { 1135 const __vector unsigned short zero = 1136 { 0, 0, 0, 0, 0, 0, 0, 0 }; 1137 __vector unsigned short vs8; 1138 __vector unsigned int vi4; 1139 __vector float vf1; 1140 1141 vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A); 1142 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero); 1143 vf1 = (__vector float) vec_ctf (vi4, 0); 1144 1145 return (__m128) vf1; 1146 } 1147 1148 /* Convert the low four signed 8-bit values in A to SPFP form. */ 1149 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1150 _mm_cvtpi8_ps (__m64 __A) 1151 { 1152 __vector signed char vc16; 1153 __vector signed short vs8; 1154 __vector signed int vi4; 1155 __vector float vf1; 1156 1157 vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A); 1158 vs8 = vec_vupkhsb (vc16); 1159 vi4 = vec_vupkhsh (vs8); 1160 vf1 = (__vector float) vec_ctf (vi4, 0); 1161 1162 return (__m128) vf1; 1163 } 1164 1165 /* Convert the low four unsigned 8-bit values in A to SPFP form. */ 1166 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1167 1168 _mm_cvtpu8_ps (__m64 __A) 1169 { 1170 const __vector unsigned char zero = 1171 { 0, 0, 0, 0, 0, 0, 0, 0 }; 1172 __vector unsigned char vc16; 1173 __vector unsigned short vs8; 1174 __vector unsigned int vi4; 1175 __vector float vf1; 1176 1177 vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A); 1178 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero); 1179 vi4 = (__vector unsigned int) vec_vmrghh (vs8, 1180 (__vector unsigned short) zero); 1181 vf1 = (__vector float) vec_ctf (vi4, 0); 1182 1183 return (__m128) vf1; 1184 } 1185 1186 /* Convert the four signed 32-bit values in A and B to SPFP form. */ 1187 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1188 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) 1189 { 1190 __vector signed int vi4; 1191 __vector float vf4; 1192 1193 vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A); 1194 vf4 = (__vector float) vec_ctf (vi4, 0); 1195 return (__m128) vf4; 1196 } 1197 1198 /* Convert the four SPFP values in A to four signed 16-bit integers. */ 1199 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1200 _mm_cvtps_pi16(__m128 __A) 1201 { 1202 __v4sf rounded; 1203 __vector signed int temp; 1204 __vector __m64 result; 1205 1206 rounded = vec_rint(__A); 1207 temp = vec_cts (rounded, 0); 1208 result = (__vector __m64) vec_pack (temp, temp); 1209 1210 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0)); 1211 } 1212 1213 /* Convert the four SPFP values in A to four signed 8-bit integers. */ 1214 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1215 _mm_cvtps_pi8(__m128 __A) 1216 { 1217 __v4sf rounded; 1218 __vector signed int tmp_i; 1219 static const __vector signed int zero = {0, 0, 0, 0}; 1220 __vector signed short tmp_s; 1221 __vector signed char res_v; 1222 __m64 result; 1223 1224 rounded = vec_rint(__A); 1225 tmp_i = vec_cts (rounded, 0); 1226 tmp_s = vec_pack (tmp_i, zero); 1227 res_v = vec_pack (tmp_s, tmp_s); 1228 result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0); 1229 1230 return (result); 1231 } 1232 1233 /* Selects four specific SPFP values from A and B based on MASK. 
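
   The MASK is normally built with the _MM_SHUFFLE macro defined near the
   top of this header.  As an illustrative sketch only:

     __m128 r = _mm_shuffle_ps (a, b, _MM_SHUFFLE (1, 0, 3, 2));

   selects r[0] = a[2], r[1] = a[3], r[2] = b[0] and r[3] = b[1].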
*/ 1234 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1235 1236 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) 1237 { 1238 unsigned long element_selector_10 = __mask & 0x03; 1239 unsigned long element_selector_32 = (__mask >> 2) & 0x03; 1240 unsigned long element_selector_54 = (__mask >> 4) & 0x03; 1241 unsigned long element_selector_76 = (__mask >> 6) & 0x03; 1242 static const unsigned int permute_selectors[4] = 1243 { 1244 #ifdef __LITTLE_ENDIAN__ 1245 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 1246 #elif __BIG_ENDIAN__ 1247 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203 1248 #endif 1249 }; 1250 __vector unsigned int t; 1251 1252 #ifdef __LITTLE_ENDIAN__ 1253 t[0] = permute_selectors[element_selector_10]; 1254 t[1] = permute_selectors[element_selector_32]; 1255 t[2] = permute_selectors[element_selector_54] + 0x10101010; 1256 t[3] = permute_selectors[element_selector_76] + 0x10101010; 1257 #elif __BIG_ENDIAN__ 1258 t[3] = permute_selectors[element_selector_10] + 0x10101010; 1259 t[2] = permute_selectors[element_selector_32] + 0x10101010; 1260 t[1] = permute_selectors[element_selector_54]; 1261 t[0] = permute_selectors[element_selector_76]; 1262 #endif 1263 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t); 1264 } 1265 1266 /* Selects and interleaves the upper two SPFP values from A and B. */ 1267 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1268 _mm_unpackhi_ps (__m128 __A, __m128 __B) 1269 { 1270 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B); 1271 } 1272 1273 /* Selects and interleaves the lower two SPFP values from A and B. */ 1274 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1275 _mm_unpacklo_ps (__m128 __A, __m128 __B) 1276 { 1277 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B); 1278 } 1279 1280 /* Sets the upper two SPFP values with 64-bits of data loaded from P; 1281 the lower two values are passed through from A. */ 1282 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1283 _mm_loadh_pi (__m128 __A, __m64 const *__P) 1284 { 1285 __vector __m64 __a = (__vector __m64)__A; 1286 __vector __m64 __p = vec_splats(*__P); 1287 __a [1] = __p [1]; 1288 1289 return (__m128)__a; 1290 } 1291 1292 /* Stores the upper two SPFP values of A into P. */ 1293 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1294 _mm_storeh_pi (__m64 *__P, __m128 __A) 1295 { 1296 __vector __m64 __a = (__vector __m64) __A; 1297 1298 *__P = __a[1]; 1299 } 1300 1301 /* Moves the upper two values of B into the lower two values of A. */ 1302 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1303 _mm_movehl_ps (__m128 __A, __m128 __B) 1304 { 1305 return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A); 1306 } 1307 1308 /* Moves the lower two values of B into the upper two values of A. */ 1309 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1310 _mm_movelh_ps (__m128 __A, __m128 __B) 1311 { 1312 return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B); 1313 } 1314 1315 /* Sets the lower two SPFP values with 64-bits of data loaded from P; 1316 the upper two values are passed through from A. 
*/ 1317 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1318 _mm_loadl_pi (__m128 __A, __m64 const *__P) 1319 { 1320 __vector __m64 __a = (__vector __m64)__A; 1321 __vector __m64 __p = vec_splats(*__P); 1322 __a [0] = __p [0]; 1323 1324 return (__m128)__a; 1325 } 1326 1327 /* Stores the lower two SPFP values of A into P. */ 1328 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1329 _mm_storel_pi (__m64 *__P, __m128 __A) 1330 { 1331 __vector __m64 __a = (__vector __m64) __A; 1332 1333 *__P = __a[0]; 1334 } 1335 1336 #ifdef _ARCH_PWR8 1337 /* Intrinsic functions that require PowerISA 2.07 minimum. */ 1338 1339 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */ 1340 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1341 _mm_movemask_ps (__m128 __A) 1342 { 1343 __vector __m64 result; 1344 static const __vector unsigned int perm_mask = 1345 { 1346 #ifdef __LITTLE_ENDIAN__ 1347 0x00204060, 0x80808080, 0x80808080, 0x80808080 1348 #elif __BIG_ENDIAN__ 1349 0x80808080, 0x80808080, 0x80808080, 0x00204060 1350 #endif 1351 }; 1352 1353 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A, 1354 (__vector unsigned char) perm_mask); 1355 1356 #ifdef __LITTLE_ENDIAN__ 1357 return result[1]; 1358 #elif __BIG_ENDIAN__ 1359 return result[0]; 1360 #endif 1361 } 1362 #endif /* _ARCH_PWR8 */ 1363 1364 /* Create a vector with all four elements equal to *P. */ 1365 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1366 _mm_load1_ps (float const *__P) 1367 { 1368 return _mm_set1_ps (*__P); 1369 } 1370 1371 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1372 _mm_load_ps1 (float const *__P) 1373 { 1374 return _mm_load1_ps (__P); 1375 } 1376 1377 /* Extracts one of the four words of A. The selector N must be immediate. */ 1378 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1379 _mm_extract_pi16 (__m64 const __A, int const __N) 1380 { 1381 unsigned int shiftr = __N & 3; 1382 #ifdef __BIG_ENDIAN__ 1383 shiftr = 3 - shiftr; 1384 #endif 1385 1386 return ((__A >> (shiftr * 16)) & 0xffff); 1387 } 1388 1389 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1390 _m_pextrw (__m64 const __A, int const __N) 1391 { 1392 return _mm_extract_pi16 (__A, __N); 1393 } 1394 1395 /* Inserts word D into one of four words of A. The selector N must be 1396 immediate. */ 1397 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1398 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N) 1399 { 1400 const int shiftl = (__N & 3) * 16; 1401 const __m64 shiftD = (const __m64) __D << shiftl; 1402 const __m64 mask = 0xffffUL << shiftl; 1403 __m64 result = (__A & (~mask)) | (shiftD & mask); 1404 1405 return (result); 1406 } 1407 1408 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1409 _m_pinsrw (__m64 const __A, int const __D, int const __N) 1410 { 1411 return _mm_insert_pi16 (__A, __D, __N); 1412 } 1413 1414 /* Compute the element-wise maximum of signed 16-bit values. 
*/ 1415 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1416 1417 _mm_max_pi16 (__m64 __A, __m64 __B) 1418 { 1419 #if _ARCH_PWR8 1420 __vector signed short a, b, r; 1421 __vector __bool short c; 1422 1423 a = (__vector signed short)vec_splats (__A); 1424 b = (__vector signed short)vec_splats (__B); 1425 c = (__vector __bool short)vec_cmpgt (a, b); 1426 r = vec_sel (b, a, c); 1427 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0)); 1428 #else 1429 __m64_union m1, m2, res; 1430 1431 m1.as_m64 = __A; 1432 m2.as_m64 = __B; 1433 1434 res.as_short[0] = 1435 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; 1436 res.as_short[1] = 1437 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; 1438 res.as_short[2] = 1439 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; 1440 res.as_short[3] = 1441 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; 1442 1443 return (__m64) res.as_m64; 1444 #endif 1445 } 1446 1447 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1448 _m_pmaxsw (__m64 __A, __m64 __B) 1449 { 1450 return _mm_max_pi16 (__A, __B); 1451 } 1452 1453 /* Compute the element-wise maximum of unsigned 8-bit values. */ 1454 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1455 _mm_max_pu8 (__m64 __A, __m64 __B) 1456 { 1457 #if _ARCH_PWR8 1458 __vector unsigned char a, b, r; 1459 __vector __bool char c; 1460 1461 a = (__vector unsigned char)vec_splats (__A); 1462 b = (__vector unsigned char)vec_splats (__B); 1463 c = (__vector __bool char)vec_cmpgt (a, b); 1464 r = vec_sel (b, a, c); 1465 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0)); 1466 #else 1467 __m64_union m1, m2, res; 1468 long i; 1469 1470 m1.as_m64 = __A; 1471 m2.as_m64 = __B; 1472 1473 1474 for (i = 0; i < 8; i++) 1475 res.as_char[i] = 1476 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ? 1477 m1.as_char[i] : m2.as_char[i]; 1478 1479 return (__m64) res.as_m64; 1480 #endif 1481 } 1482 1483 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1484 _m_pmaxub (__m64 __A, __m64 __B) 1485 { 1486 return _mm_max_pu8 (__A, __B); 1487 } 1488 1489 /* Compute the element-wise minimum of signed 16-bit values. */ 1490 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1491 _mm_min_pi16 (__m64 __A, __m64 __B) 1492 { 1493 #if _ARCH_PWR8 1494 __vector signed short a, b, r; 1495 __vector __bool short c; 1496 1497 a = (__vector signed short)vec_splats (__A); 1498 b = (__vector signed short)vec_splats (__B); 1499 c = (__vector __bool short)vec_cmplt (a, b); 1500 r = vec_sel (b, a, c); 1501 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0)); 1502 #else 1503 __m64_union m1, m2, res; 1504 1505 m1.as_m64 = __A; 1506 m2.as_m64 = __B; 1507 1508 res.as_short[0] = 1509 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; 1510 res.as_short[1] = 1511 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; 1512 res.as_short[2] = 1513 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; 1514 res.as_short[3] = 1515 (m1.as_short[3] < m2.as_short[3]) ? 
m1.as_short[3] : m2.as_short[3]; 1516 1517 return (__m64) res.as_m64; 1518 #endif 1519 } 1520 1521 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1522 _m_pminsw (__m64 __A, __m64 __B) 1523 { 1524 return _mm_min_pi16 (__A, __B); 1525 } 1526 1527 /* Compute the element-wise minimum of unsigned 8-bit values. */ 1528 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1529 _mm_min_pu8 (__m64 __A, __m64 __B) 1530 { 1531 #if _ARCH_PWR8 1532 __vector unsigned char a, b, r; 1533 __vector __bool char c; 1534 1535 a = (__vector unsigned char)vec_splats (__A); 1536 b = (__vector unsigned char)vec_splats (__B); 1537 c = (__vector __bool char)vec_cmplt (a, b); 1538 r = vec_sel (b, a, c); 1539 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0)); 1540 #else 1541 __m64_union m1, m2, res; 1542 long i; 1543 1544 m1.as_m64 = __A; 1545 m2.as_m64 = __B; 1546 1547 1548 for (i = 0; i < 8; i++) 1549 res.as_char[i] = 1550 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ? 1551 m1.as_char[i] : m2.as_char[i]; 1552 1553 return (__m64) res.as_m64; 1554 #endif 1555 } 1556 1557 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1558 _m_pminub (__m64 __A, __m64 __B) 1559 { 1560 return _mm_min_pu8 (__A, __B); 1561 } 1562 1563 /* Create an 8-bit mask of the signs of 8-bit values. */ 1564 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1565 _mm_movemask_pi8 (__m64 __A) 1566 { 1567 unsigned long p = 0x0008101820283038UL; // permute control for sign bits 1568 1569 return __builtin_bpermd (p, __A); 1570 } 1571 1572 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1573 _m_pmovmskb (__m64 __A) 1574 { 1575 return _mm_movemask_pi8 (__A); 1576 } 1577 1578 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 1579 in B and produce the high 16 bits of the 32-bit results. */ 1580 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1581 _mm_mulhi_pu16 (__m64 __A, __m64 __B) 1582 { 1583 __vector unsigned short a, b; 1584 __vector unsigned short c; 1585 __vector unsigned int w0, w1; 1586 __vector unsigned char xform1 = { 1587 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 1588 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 1589 }; 1590 1591 a = (__vector unsigned short)vec_splats (__A); 1592 b = (__vector unsigned short)vec_splats (__B); 1593 1594 w0 = vec_vmuleuh (a, b); 1595 w1 = vec_vmulouh (a, b); 1596 c = (__vector unsigned short)vec_perm (w0, w1, xform1); 1597 1598 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0)); 1599 } 1600 1601 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1602 _m_pmulhuw (__m64 __A, __m64 __B) 1603 { 1604 return _mm_mulhi_pu16 (__A, __B); 1605 } 1606 1607 /* Return a combination of the four 16-bit values in A. The selector 1608 must be an immediate. 

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  unsigned long element_selector_10 = __N & 0x03;
  unsigned long element_selector_32 = (__N >> 2) & 0x03;
  unsigned long element_selector_54 = (__N >> 4) & 0x03;
  unsigned long element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#elif __BIG_ENDIAN__
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __m64_union t;
  __vector __m64 a, p, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
  p = vec_splats (t.as_m64);
  a = vec_splats (__A);
  r = vec_perm (a, a, (__vector unsigned char)p);
  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __m64 hibit = 0x8080808080808080UL;
  __m64 mask, tmp;
  __m64 *p = (__m64*)__P;

  tmp = *p;
  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
  tmp = (tmp & (~mask)) | (__A & mask);
  *p = tmp;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = vec_avg (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}
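
/* A usage sketch for _mm_shuffle_pi16: the selector is normally written
   with the _MM_SHUFFLE macro defined at the top of this header, and
   _MM_SHUFFLE (0, 1, 2, 3) reverses the four 16-bit elements.  The
   helper name reverse_pi16 is illustrative only.

     static inline __m64
     reverse_pi16 (__m64 v)
     {
       return _mm_shuffle_pi16 (v, _MM_SHUFFLE (0, 1, 2, 3));
     }
*/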

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);
  c = vec_avg (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b;
  __vector unsigned char vmin, vmax, vabsdiff;
  __vector signed int vsum;
  const __vector unsigned int zero =
    { 0, 0, 0, 0 };
  unsigned short result;

  a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
  b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with integer result.  */
  vsum = vec_sums (vsum, (__vector signed int) zero);
  /* The sum is in the rightmost 32 bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  result = vsum[3];
  return (result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    " dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    " dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  _mm_store_ps (__P, __A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a lightweight sync.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
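
/* A usage sketch for the streaming store and fence above: a producer
   typically issues the non-temporal store, then _mm_sfence, and only
   then sets the flag that publishes the data.  The names publish_line
   and ready are illustrative; __P must be 16-byte aligned as required
   by _mm_stream_ps.

     static inline void
     publish_line (float *__P, __m128 v, volatile int *ready)
     {
       _mm_stream_ps (__P, v);
       _mm_sfence ();
       *ready = 1;
     }
*/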

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  /* There is no exact match with this construct, but the following is
     close to the desired effect.  */
#if _ARCH_PWR8
  /* On power8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PPI setting.  Since we don't
     know what PPI this thread is running at, we: 1) save the current
     PRI from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution.  */
  unsigned long __PPR;

  __asm__ volatile (
    " mfppr %0;"
    " or 31,31,31;"
    " isync;"
    " lwsync;"
    " isync;"
    " mtppr %0;"
    : "=r" (__PPR)
    :
    : "memory"
    );
#else
  /* For older processors where we may not even have Program Priority
     controls we can only depend on Heavy Weight Sync.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
  __v4sf __t0 = vec_vmrghw (__r0, __r1); \
  __v4sf __t1 = vec_vmrghw (__r2, __r3); \
  __v4sf __t2 = vec_vmrglw (__r0, __r1); \
  __v4sf __t3 = vec_vmrglw (__r2, __r3); \
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
			       (__vector long long)__t1); \
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
			       (__vector long long)__t1); \
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
			       (__vector long long)__t3); \
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
			       (__vector long long)__t3); \
} while (0)

/* For backward source compatibility.  */
//# include <emmintrin.h>

#endif /* _XMMINTRIN_H_INCLUDED */
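
/* A usage sketch for the _MM_TRANSPOSE4_PS macro above: the macro
   rewrites its four arguments in place, so they must be modifiable
   __m128 lvalues.  The helper name transpose4x4 is illustrative only.

     static inline void
     transpose4x4 (__m128 row[4])
     {
       _MM_TRANSPOSE4_PS (row[0], row[1], row[2], row[3]);
     }
*/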