/* Copyright (C) 2002-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}
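
/* For illustration, a minimal porting sketch; the function name add4
   and its arguments are placeholders, not part of this API:

     #define NO_WARN_X86_INTRINSICS
     #include <mmintrin.h>

     __m64
     add4 (__m64 a, __m64 b)
     {
       return _mm_add_pi16 (a, b);
     }

   _mm_add_pi16 performs four 16-bit lane adds; on power8 and later the
   __m64 operands are transferred to a vector register, added with
   vec_add, and transferred back, as described in the note above.  */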

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}
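
/* Worked example, for illustration: each 16-bit input lane is clamped
   to the signed 8-bit range [-128, 127] before being packed, so

     _mm_packs_pi16 (_mm_set_pi16 (300, -500, 64, -2), _mm_setzero_si64 ())

   produces -2, 64, -128, 127 in result bytes 0..3 and zeros in bytes
   4..7.  */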

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (vm1, __zero);
  r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1);
  __vector __bool char packsel = vec_pack (__select, __select);
  r = vec_sel (r, (const __vector unsigned char) __zero, packsel);
  return (__m64) ((__vector long long) r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}
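
/* Worked example, for illustration: the unpack operations interleave
   lanes taken from the same half of each operand, e.g.

     __m64 a = _mm_set_pi8 (7, 6, 5, 4, 3, 2, 1, 0);
     __m64 b = _mm_set_pi8 (17, 16, 15, 14, 13, 12, 11, 10);
     _mm_unpacklo_pi8 (a, b) == _mm_set_pi8 (13, 3, 12, 2, 11, 1, 10, 0)

   while _mm_unpackhi_pi8 interleaves lanes 4-7 in the same way.  */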

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}
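
/* Usage sketch, for illustration (a, b, x, y are placeholders): the
   comparison intrinsics return an element-wise mask of all-ones (true)
   or all-zeros (false), which can be combined with the logical
   operations above to select lanes without branching:

     __m64 mask = _mm_cmpeq_pi8 (a, b);
     __m64 sel  = _mm_or_si64 (_mm_and_si64 (mask, x),
                               _mm_andnot_si64 (mask, y));

   sel receives the byte from x where a and b match, and the byte from
   y elsewhere.  */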

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}
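
/* Worked example, for illustration: the saturating forms clamp to the
   lane's range instead of wrapping, e.g.

     _mm_adds_pi16 (_mm_set1_pi16 (32000), _mm_set1_pi16 (1000))
       yields 32767 in every lane (clamped), whereas
     _mm_add_pi16  (_mm_set1_pi16 (32000), _mm_set1_pi16 (1000))
       yields -32536 in every lane (wraps modulo 2^16).

   The _pu8 and _pu16 variants clamp to 0..255 and 0..65535 instead.  */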

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}
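
/* Usage note, for illustration: the count applies to every lane, and in
   this implementation counts greater than 15 produce an all-zero
   result, e.g.

     _mm_slli_pi16 (_mm_set1_pi16 (3), 2)   yields 12 in every lane
     _mm_slli_pi16 (_mm_set1_pi16 (3), 16)  yields zero in every lane

   which for the left shift matches x86, where all bits are shifted
   out.  */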

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}
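
/* Worked example, for illustration: the _mm_set_* forms take their
   arguments most-significant element first, while the _mm_setr_* forms
   below take them in element (least-significant-first) order, so

     _mm_set_pi32 (1, 2) == _mm_setr_pi32 (2, 1)

   and in both cases the value 2 lands in element 0, the least
   significant 32-bit lane.  */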

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
             char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
              char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */