/* Copyright (C) 2002-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
   <http://www.gnu.org/licenses/>. */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type. Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode. This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations. For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type. This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations. We recommend this for new
   applications. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__ ((__aligned__ (8),
			__may_alias__)) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
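
/* Porting sketch (illustrative; not part of the original header).  x86
   source that uses MMX intrinsics can typically be built for
   powerpc64le by defining NO_WARN_X86_INTRINSICS before including this
   header, e.g. with gcc -O2 -mcpu=power8:

     #define NO_WARN_X86_INTRINSICS 1
     #include <mmintrin.h>

     long long
     sum_pairs (long long a, long long b)
     {
       __m64 va = _mm_cvtsi64_m64 (a);
       __m64 vb = _mm_cvtsi64_m64 (b);
       return _mm_cvtm64_si64 (_mm_add_pi16 (va, vb));
     }

   The function name above is only an example.  Whether the generated
   code is acceptable is, as noted above, the user's responsibility. */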

/* Empty the multimedia state. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC. */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}
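
/* Worked example (illustrative): _mm_packs_pi16 narrows each signed
   16-bit lane to a signed 8-bit lane, clamping to the int8 range.  With
   __m1 lanes {300, -200, 42, 0} and __m2 lanes {-1, 1, 1000, -1000}
   (lane 0 least significant), the result holds, low half first,
   {127, -128, 42, 0, -1, 1, 127, -128}:

     __m64 __lo = _mm_set_pi16 (0, 42, -200, 300);
     __m64 __hi = _mm_set_pi16 (-1000, 1000, 1, -1);
     __m64 __r  = _mm_packs_pi16 (__lo, __hi);
*/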

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (__vm1, __zero);
  __r = vec_packs ((__vector unsigned short) __vm1,
		   (__vector unsigned short) __vm1);
  __vector __bool char __packsel = vec_pack (__select, __select);
  __r = vec_sel (__r, (const __vector unsigned char) __zero, __packsel);
  return (__m64) ((__vector long long) __r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}
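
/* Worked example (illustrative): the unpack intrinsics interleave the
   lanes of the selected half of each operand, with M1 supplying the
   even result lanes and M2 the odd ones.  With __m1 bytes
   {0,1,2,3,4,5,6,7} and __m2 bytes {10,11,12,13,14,15,16,17} (lane 0
   least significant), _mm_unpackhi_pi8 above yields {4,14,5,15,6,16,7,17};
   _mm_unpacklo_pi8 below follows the same pattern with the low halves
   and yields {0,10,1,11,2,12,3,13}. */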

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (__res)
      : "r" (__m1),
	"r" (__m2)
      : );
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = (__vector signed char)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}
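
/* Usage sketch (illustrative): each comparison lane is either all ones
   or all zeros, so the result can serve as a mask for a branchless
   per-lane select built from the logical intrinsics defined above, e.g.

     __m64 __mask = _mm_cmpgt_pi16 (__a, __b);
     __m64 __max  = _mm_or_si64 (_mm_and_si64 (__mask, __a),
                                 _mm_andnot_si64 (__mask, __b));

   which keeps the lane from __a where __a > __b and from __b otherwise. */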

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}
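
/* Worked example (illustrative): _mm_add_pi8 wraps modulo 256 while
   _mm_adds_pi8 and _mm_adds_pu8 clamp to the signed or unsigned 8-bit
   range.  For a lane holding 0x7F (127) added to a lane holding 1:

     _mm_add_pi8   yields 0x80  (wraps to -128)
     _mm_adds_pi8  yields 0x7F  (clamps at 127)
     _mm_adds_pu8  yields 0x80  (128 fits the unsigned range)
*/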

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_vmsumshm (__a, __b, __zero);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
    };

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);

  __w0 = vec_vmulesh (__a, __b);
  __w1 = vec_vmulosh (__a, __b);
  __c = (__vector signed short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = __a * __b;
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
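
/* Worked example (illustrative): _mm_madd_pi16 forms four 32-bit
   products and sums them in pairs.  With __m1 lanes {1, 2, 3, 4} and
   __m2 lanes {10, 20, 30, 40} (lane 0 least significant), the result
   is the two 32-bit values {1*10 + 2*20, 3*30 + 4*40} = {50, 250}.
   _mm_mulhi_pi16 and _mm_mullo_pi16 instead return the high and low
   16 bits, respectively, of each of the four products. */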

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sl (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16. */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32. */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sra (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16. */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}
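
/* Usage note (illustrative): for the 16-bit shifts above, a count
   greater than 15 yields an all-zero result (see the explicit range
   check) rather than undefined behaviour.  For example,
   _mm_slli_pi16 (_mm_set1_pi16 (1), 3) sets every lane to 8, while
   _mm_slli_pi16 (_mm_set1_pi16 (1), 16) returns 0. */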

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32. */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector unsigned short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sr (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16. */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32. */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) w)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}
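
/* Usage note (illustrative): the _mm_set_* intrinsics take their
   arguments most significant first, while the _mm_setr_* intrinsics
   take them least significant first, so the two calls below build the
   same value:

     __m64 __a = _mm_set_pi16 (3, 2, 1, 0);
     __m64 __b = _mm_setr_pi16 (0, 1, 2, 3);
*/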

/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) __res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */