1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __EMMINTRIN_H 11 #define __EMMINTRIN_H 12 13 #if !defined(__i386__) && !defined(__x86_64__) 14 #error "This header is only meant to be used on x86 and x64 architecture" 15 #endif 16 17 #include <xmmintrin.h> 18 19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); 20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); 21 22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); 23 typedef long long __m128i_u 24 __attribute__((__vector_size__(16), __aligned__(1))); 25 26 /* Type defines. */ 27 typedef double __v2df __attribute__((__vector_size__(16))); 28 typedef long long __v2di __attribute__((__vector_size__(16))); 29 typedef short __v8hi __attribute__((__vector_size__(16))); 30 typedef char __v16qi __attribute__((__vector_size__(16))); 31 32 /* Unsigned types */ 33 typedef unsigned long long __v2du __attribute__((__vector_size__(16))); 34 typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 35 typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 36 37 /* We need an explicitly signed variant for char. Note that this shouldn't 38 * appear in the interface though. */ 39 typedef signed char __v16qs __attribute__((__vector_size__(16))); 40 41 #ifdef __SSE2__ 42 /* Both _Float16 and __bf16 require SSE2 being enabled. 
*/ 43 typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); 44 typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16))); 45 typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1))); 46 47 typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16))); 48 typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); 49 #endif 50 51 /* Define the default attributes for the functions in this file. */ 52 #if defined(__EVEX512__) && !defined(__AVX10_1_512__) 53 #define __DEFAULT_FN_ATTRS \ 54 __attribute__((__always_inline__, __nodebug__, \ 55 __target__("sse2,no-evex512"), __min_vector_width__(128))) 56 #else 57 #define __DEFAULT_FN_ATTRS \ 58 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ 59 __min_vector_width__(128))) 60 #endif 61 62 #if defined(__cplusplus) && (__cplusplus >= 201103L) 63 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr 64 #else 65 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS 66 #endif 67 68 #define __trunc64(x) \ 69 (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) 70 #define __anyext128(x) \ 71 (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 72 1, -1, -1) 73 74 /// Adds lower double-precision values in both operands and returns the 75 /// sum in the lower 64 bits of the result. The upper 64 bits of the result 76 /// are copied from the upper double-precision value of the first operand. 77 /// 78 /// \headerfile <x86intrin.h> 79 /// 80 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction. 81 /// 82 /// \param __a 83 /// A 128-bit vector of [2 x double] containing one of the source operands. 84 /// \param __b 85 /// A 128-bit vector of [2 x double] containing one of the source operands. 86 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 87 /// sum of the lower 64 bits of both operands. 
The upper 64 bits are copied 88 /// from the upper 64 bits of the first source operand. 89 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_sd(__m128d __a, 90 __m128d __b) { 91 __a[0] += __b[0]; 92 return __a; 93 } 94 95 /// Adds two 128-bit vectors of [2 x double]. 96 /// 97 /// \headerfile <x86intrin.h> 98 /// 99 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction. 100 /// 101 /// \param __a 102 /// A 128-bit vector of [2 x double] containing one of the source operands. 103 /// \param __b 104 /// A 128-bit vector of [2 x double] containing one of the source operands. 105 /// \returns A 128-bit vector of [2 x double] containing the sums of both 106 /// operands. 107 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_pd(__m128d __a, 108 __m128d __b) { 109 return (__m128d)((__v2df)__a + (__v2df)__b); 110 } 111 112 /// Subtracts the lower double-precision value of the second operand 113 /// from the lower double-precision value of the first operand and returns 114 /// the difference in the lower 64 bits of the result. The upper 64 bits of 115 /// the result are copied from the upper double-precision value of the first 116 /// operand. 117 /// 118 /// \headerfile <x86intrin.h> 119 /// 120 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. 121 /// 122 /// \param __a 123 /// A 128-bit vector of [2 x double] containing the minuend. 124 /// \param __b 125 /// A 128-bit vector of [2 x double] containing the subtrahend. 126 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 127 /// difference of the lower 64 bits of both operands. The upper 64 bits are 128 /// copied from the upper 64 bits of the first source operand. 129 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_sd(__m128d __a, 130 __m128d __b) { 131 __a[0] -= __b[0]; 132 return __a; 133 } 134 135 /// Subtracts two 128-bit vectors of [2 x double]. 
136 /// 137 /// \headerfile <x86intrin.h> 138 /// 139 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. 140 /// 141 /// \param __a 142 /// A 128-bit vector of [2 x double] containing the minuend. 143 /// \param __b 144 /// A 128-bit vector of [2 x double] containing the subtrahend. 145 /// \returns A 128-bit vector of [2 x double] containing the differences between 146 /// both operands. 147 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_pd(__m128d __a, 148 __m128d __b) { 149 return (__m128d)((__v2df)__a - (__v2df)__b); 150 } 151 152 /// Multiplies lower double-precision values in both operands and returns 153 /// the product in the lower 64 bits of the result. The upper 64 bits of the 154 /// result are copied from the upper double-precision value of the first 155 /// operand. 156 /// 157 /// \headerfile <x86intrin.h> 158 /// 159 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. 160 /// 161 /// \param __a 162 /// A 128-bit vector of [2 x double] containing one of the source operands. 163 /// \param __b 164 /// A 128-bit vector of [2 x double] containing one of the source operands. 165 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 166 /// product of the lower 64 bits of both operands. The upper 64 bits are 167 /// copied from the upper 64 bits of the first source operand. 168 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_sd(__m128d __a, 169 __m128d __b) { 170 __a[0] *= __b[0]; 171 return __a; 172 } 173 174 /// Multiplies two 128-bit vectors of [2 x double]. 175 /// 176 /// \headerfile <x86intrin.h> 177 /// 178 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. 179 /// 180 /// \param __a 181 /// A 128-bit vector of [2 x double] containing one of the operands. 182 /// \param __b 183 /// A 128-bit vector of [2 x double] containing one of the operands. 
184 /// \returns A 128-bit vector of [2 x double] containing the products of both 185 /// operands. 186 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_pd(__m128d __a, 187 __m128d __b) { 188 return (__m128d)((__v2df)__a * (__v2df)__b); 189 } 190 191 /// Divides the lower double-precision value of the first operand by the 192 /// lower double-precision value of the second operand and returns the 193 /// quotient in the lower 64 bits of the result. The upper 64 bits of the 194 /// result are copied from the upper double-precision value of the first 195 /// operand. 196 /// 197 /// \headerfile <x86intrin.h> 198 /// 199 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. 200 /// 201 /// \param __a 202 /// A 128-bit vector of [2 x double] containing the dividend. 203 /// \param __b 204 /// A 128-bit vector of [2 x double] containing divisor. 205 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 206 /// quotient of the lower 64 bits of both operands. The upper 64 bits are 207 /// copied from the upper 64 bits of the first source operand. 208 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_sd(__m128d __a, 209 __m128d __b) { 210 __a[0] /= __b[0]; 211 return __a; 212 } 213 214 /// Performs an element-by-element division of two 128-bit vectors of 215 /// [2 x double]. 216 /// 217 /// \headerfile <x86intrin.h> 218 /// 219 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. 220 /// 221 /// \param __a 222 /// A 128-bit vector of [2 x double] containing the dividend. 223 /// \param __b 224 /// A 128-bit vector of [2 x double] containing the divisor. 225 /// \returns A 128-bit vector of [2 x double] containing the quotients of both 226 /// operands. 
227 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a, 228 __m128d __b) { 229 return (__m128d)((__v2df)__a / (__v2df)__b); 230 } 231 232 /// Calculates the square root of the lower double-precision value of 233 /// the second operand and returns it in the lower 64 bits of the result. 234 /// The upper 64 bits of the result are copied from the upper 235 /// double-precision value of the first operand. 236 /// 237 /// \headerfile <x86intrin.h> 238 /// 239 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. 240 /// 241 /// \param __a 242 /// A 128-bit vector of [2 x double] containing one of the operands. The 243 /// upper 64 bits of this operand are copied to the upper 64 bits of the 244 /// result. 245 /// \param __b 246 /// A 128-bit vector of [2 x double] containing one of the operands. The 247 /// square root is calculated using the lower 64 bits of this operand. 248 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 249 /// square root of the lower 64 bits of operand \a __b, and whose upper 64 250 /// bits are copied from the upper 64 bits of operand \a __a. 251 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, 252 __m128d __b) { 253 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 254 return __extension__(__m128d){__c[0], __a[1]}; 255 } 256 257 /// Calculates the square root of the each of two values stored in a 258 /// 128-bit vector of [2 x double]. 259 /// 260 /// \headerfile <x86intrin.h> 261 /// 262 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. 263 /// 264 /// \param __a 265 /// A 128-bit vector of [2 x double]. 266 /// \returns A 128-bit vector of [2 x double] containing the square roots of the 267 /// values in the operand. 
268 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) { 269 return __builtin_ia32_sqrtpd((__v2df)__a); 270 } 271 272 /// Compares lower 64-bit double-precision values of both operands, and 273 /// returns the lesser of the pair of values in the lower 64-bits of the 274 /// result. The upper 64 bits of the result are copied from the upper 275 /// double-precision value of the first operand. 276 /// 277 /// If either value in a comparison is NaN, returns the value from \a __b. 278 /// 279 /// \headerfile <x86intrin.h> 280 /// 281 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. 282 /// 283 /// \param __a 284 /// A 128-bit vector of [2 x double] containing one of the operands. The 285 /// lower 64 bits of this operand are used in the comparison. 286 /// \param __b 287 /// A 128-bit vector of [2 x double] containing one of the operands. The 288 /// lower 64 bits of this operand are used in the comparison. 289 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 290 /// minimum value between both operands. The upper 64 bits are copied from 291 /// the upper 64 bits of the first source operand. 292 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, 293 __m128d __b) { 294 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 295 } 296 297 /// Performs element-by-element comparison of the two 128-bit vectors of 298 /// [2 x double] and returns a vector containing the lesser of each pair of 299 /// values. 300 /// 301 /// If either value in a comparison is NaN, returns the value from \a __b. 302 /// 303 /// \headerfile <x86intrin.h> 304 /// 305 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. 306 /// 307 /// \param __a 308 /// A 128-bit vector of [2 x double] containing one of the operands. 309 /// \param __b 310 /// A 128-bit vector of [2 x double] containing one of the operands. 
311 /// \returns A 128-bit vector of [2 x double] containing the minimum values 312 /// between both operands. 313 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, 314 __m128d __b) { 315 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 316 } 317 318 /// Compares lower 64-bit double-precision values of both operands, and 319 /// returns the greater of the pair of values in the lower 64-bits of the 320 /// result. The upper 64 bits of the result are copied from the upper 321 /// double-precision value of the first operand. 322 /// 323 /// If either value in a comparison is NaN, returns the value from \a __b. 324 /// 325 /// \headerfile <x86intrin.h> 326 /// 327 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. 328 /// 329 /// \param __a 330 /// A 128-bit vector of [2 x double] containing one of the operands. The 331 /// lower 64 bits of this operand are used in the comparison. 332 /// \param __b 333 /// A 128-bit vector of [2 x double] containing one of the operands. The 334 /// lower 64 bits of this operand are used in the comparison. 335 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 336 /// maximum value between both operands. The upper 64 bits are copied from 337 /// the upper 64 bits of the first source operand. 338 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, 339 __m128d __b) { 340 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 341 } 342 343 /// Performs element-by-element comparison of the two 128-bit vectors of 344 /// [2 x double] and returns a vector containing the greater of each pair 345 /// of values. 346 /// 347 /// If either value in a comparison is NaN, returns the value from \a __b. 348 /// 349 /// \headerfile <x86intrin.h> 350 /// 351 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. 352 /// 353 /// \param __a 354 /// A 128-bit vector of [2 x double] containing one of the operands. 
355 /// \param __b 356 /// A 128-bit vector of [2 x double] containing one of the operands. 357 /// \returns A 128-bit vector of [2 x double] containing the maximum values 358 /// between both operands. 359 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, 360 __m128d __b) { 361 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 362 } 363 364 /// Performs a bitwise AND of two 128-bit vectors of [2 x double]. 365 /// 366 /// \headerfile <x86intrin.h> 367 /// 368 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 369 /// 370 /// \param __a 371 /// A 128-bit vector of [2 x double] containing one of the source operands. 372 /// \param __b 373 /// A 128-bit vector of [2 x double] containing one of the source operands. 374 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 375 /// values between both operands. 376 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_pd(__m128d __a, 377 __m128d __b) { 378 return (__m128d)((__v2du)__a & (__v2du)__b); 379 } 380 381 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using 382 /// the one's complement of the values contained in the first source operand. 383 /// 384 /// \headerfile <x86intrin.h> 385 /// 386 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 387 /// 388 /// \param __a 389 /// A 128-bit vector of [2 x double] containing the left source operand. The 390 /// one's complement of this value is used in the bitwise AND. 391 /// \param __b 392 /// A 128-bit vector of [2 x double] containing the right source operand. 393 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 394 /// values in the second operand and the one's complement of the first 395 /// operand. 
396 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 397 _mm_andnot_pd(__m128d __a, __m128d __b) { 398 return (__m128d)(~(__v2du)__a & (__v2du)__b); 399 } 400 401 /// Performs a bitwise OR of two 128-bit vectors of [2 x double]. 402 /// 403 /// \headerfile <x86intrin.h> 404 /// 405 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 406 /// 407 /// \param __a 408 /// A 128-bit vector of [2 x double] containing one of the source operands. 409 /// \param __b 410 /// A 128-bit vector of [2 x double] containing one of the source operands. 411 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 412 /// values between both operands. 413 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_pd(__m128d __a, 414 __m128d __b) { 415 return (__m128d)((__v2du)__a | (__v2du)__b); 416 } 417 418 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 419 /// 420 /// \headerfile <x86intrin.h> 421 /// 422 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 423 /// 424 /// \param __a 425 /// A 128-bit vector of [2 x double] containing one of the source operands. 426 /// \param __b 427 /// A 128-bit vector of [2 x double] containing one of the source operands. 428 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 429 /// values between both operands. 430 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_pd(__m128d __a, 431 __m128d __b) { 432 return (__m128d)((__v2du)__a ^ (__v2du)__b); 433 } 434 435 /// Compares each of the corresponding double-precision values of the 436 /// 128-bit vectors of [2 x double] for equality. 437 /// 438 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 439 /// If either value in a comparison is NaN, returns false. 440 /// 441 /// \headerfile <x86intrin.h> 442 /// 443 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. 
444 /// 445 /// \param __a 446 /// A 128-bit vector of [2 x double]. 447 /// \param __b 448 /// A 128-bit vector of [2 x double]. 449 /// \returns A 128-bit vector containing the comparison results. 450 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, 451 __m128d __b) { 452 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 453 } 454 455 /// Compares each of the corresponding double-precision values of the 456 /// 128-bit vectors of [2 x double] to determine if the values in the first 457 /// operand are less than those in the second operand. 458 /// 459 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 460 /// If either value in a comparison is NaN, returns false. 461 /// 462 /// \headerfile <x86intrin.h> 463 /// 464 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 465 /// 466 /// \param __a 467 /// A 128-bit vector of [2 x double]. 468 /// \param __b 469 /// A 128-bit vector of [2 x double]. 470 /// \returns A 128-bit vector containing the comparison results. 471 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, 472 __m128d __b) { 473 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 474 } 475 476 /// Compares each of the corresponding double-precision values of the 477 /// 128-bit vectors of [2 x double] to determine if the values in the first 478 /// operand are less than or equal to those in the second operand. 479 /// 480 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 481 /// If either value in a comparison is NaN, returns false. 482 /// 483 /// \headerfile <x86intrin.h> 484 /// 485 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 486 /// 487 /// \param __a 488 /// A 128-bit vector of [2 x double]. 489 /// \param __b 490 /// A 128-bit vector of [2 x double]. 491 /// \returns A 128-bit vector containing the comparison results. 
492 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, 493 __m128d __b) { 494 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 495 } 496 497 /// Compares each of the corresponding double-precision values of the 498 /// 128-bit vectors of [2 x double] to determine if the values in the first 499 /// operand are greater than those in the second operand. 500 /// 501 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 502 /// If either value in a comparison is NaN, returns false. 503 /// 504 /// \headerfile <x86intrin.h> 505 /// 506 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 507 /// 508 /// \param __a 509 /// A 128-bit vector of [2 x double]. 510 /// \param __b 511 /// A 128-bit vector of [2 x double]. 512 /// \returns A 128-bit vector containing the comparison results. 513 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, 514 __m128d __b) { 515 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 516 } 517 518 /// Compares each of the corresponding double-precision values of the 519 /// 128-bit vectors of [2 x double] to determine if the values in the first 520 /// operand are greater than or equal to those in the second operand. 521 /// 522 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 523 /// If either value in a comparison is NaN, returns false. 524 /// 525 /// \headerfile <x86intrin.h> 526 /// 527 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 528 /// 529 /// \param __a 530 /// A 128-bit vector of [2 x double]. 531 /// \param __b 532 /// A 128-bit vector of [2 x double]. 533 /// \returns A 128-bit vector containing the comparison results. 
534 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, 535 __m128d __b) { 536 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 537 } 538 539 /// Compares each of the corresponding double-precision values of the 540 /// 128-bit vectors of [2 x double] to determine if the values in the first 541 /// operand are ordered with respect to those in the second operand. 542 /// 543 /// A pair of double-precision values are ordered with respect to each 544 /// other if neither value is a NaN. Each comparison returns 0x0 for false, 545 /// 0xFFFFFFFFFFFFFFFF for true. 546 /// 547 /// \headerfile <x86intrin.h> 548 /// 549 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. 550 /// 551 /// \param __a 552 /// A 128-bit vector of [2 x double]. 553 /// \param __b 554 /// A 128-bit vector of [2 x double]. 555 /// \returns A 128-bit vector containing the comparison results. 556 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, 557 __m128d __b) { 558 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 559 } 560 561 /// Compares each of the corresponding double-precision values of the 562 /// 128-bit vectors of [2 x double] to determine if the values in the first 563 /// operand are unordered with respect to those in the second operand. 564 /// 565 /// A pair of double-precision values are unordered with respect to each 566 /// other if one or both values are NaN. Each comparison returns 0x0 for 567 /// false, 0xFFFFFFFFFFFFFFFF for true. 568 /// 569 /// \headerfile <x86intrin.h> 570 /// 571 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> 572 /// instruction. 573 /// 574 /// \param __a 575 /// A 128-bit vector of [2 x double]. 576 /// \param __b 577 /// A 128-bit vector of [2 x double]. 578 /// \returns A 128-bit vector containing the comparison results. 
579 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, 580 __m128d __b) { 581 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 582 } 583 584 /// Compares each of the corresponding double-precision values of the 585 /// 128-bit vectors of [2 x double] to determine if the values in the first 586 /// operand are unequal to those in the second operand. 587 /// 588 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 589 /// If either value in a comparison is NaN, returns true. 590 /// 591 /// \headerfile <x86intrin.h> 592 /// 593 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. 594 /// 595 /// \param __a 596 /// A 128-bit vector of [2 x double]. 597 /// \param __b 598 /// A 128-bit vector of [2 x double]. 599 /// \returns A 128-bit vector containing the comparison results. 600 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, 601 __m128d __b) { 602 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 603 } 604 605 /// Compares each of the corresponding double-precision values of the 606 /// 128-bit vectors of [2 x double] to determine if the values in the first 607 /// operand are not less than those in the second operand. 608 /// 609 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 610 /// If either value in a comparison is NaN, returns true. 611 /// 612 /// \headerfile <x86intrin.h> 613 /// 614 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 615 /// 616 /// \param __a 617 /// A 128-bit vector of [2 x double]. 618 /// \param __b 619 /// A 128-bit vector of [2 x double]. 620 /// \returns A 128-bit vector containing the comparison results. 
621 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, 622 __m128d __b) { 623 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 624 } 625 626 /// Compares each of the corresponding double-precision values of the 627 /// 128-bit vectors of [2 x double] to determine if the values in the first 628 /// operand are not less than or equal to those in the second operand. 629 /// 630 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 631 /// If either value in a comparison is NaN, returns true. 632 /// 633 /// \headerfile <x86intrin.h> 634 /// 635 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 636 /// 637 /// \param __a 638 /// A 128-bit vector of [2 x double]. 639 /// \param __b 640 /// A 128-bit vector of [2 x double]. 641 /// \returns A 128-bit vector containing the comparison results. 642 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, 643 __m128d __b) { 644 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 645 } 646 647 /// Compares each of the corresponding double-precision values of the 648 /// 128-bit vectors of [2 x double] to determine if the values in the first 649 /// operand are not greater than those in the second operand. 650 /// 651 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 652 /// If either value in a comparison is NaN, returns true. 653 /// 654 /// \headerfile <x86intrin.h> 655 /// 656 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 657 /// 658 /// \param __a 659 /// A 128-bit vector of [2 x double]. 660 /// \param __b 661 /// A 128-bit vector of [2 x double]. 662 /// \returns A 128-bit vector containing the comparison results. 
663 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, 664 __m128d __b) { 665 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 666 } 667 668 /// Compares each of the corresponding double-precision values of the 669 /// 128-bit vectors of [2 x double] to determine if the values in the first 670 /// operand are not greater than or equal to those in the second operand. 671 /// 672 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 673 /// If either value in a comparison is NaN, returns true. 674 /// 675 /// \headerfile <x86intrin.h> 676 /// 677 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 678 /// 679 /// \param __a 680 /// A 128-bit vector of [2 x double]. 681 /// \param __b 682 /// A 128-bit vector of [2 x double]. 683 /// \returns A 128-bit vector containing the comparison results. 684 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, 685 __m128d __b) { 686 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 687 } 688 689 /// Compares the lower double-precision floating-point values in each of 690 /// the two 128-bit floating-point vectors of [2 x double] for equality. 691 /// 692 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 693 /// If either value in a comparison is NaN, returns false. 694 /// 695 /// \headerfile <x86intrin.h> 696 /// 697 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. 698 /// 699 /// \param __a 700 /// A 128-bit vector of [2 x double]. The lower double-precision value is 701 /// compared to the lower double-precision value of \a __b. 702 /// \param __b 703 /// A 128-bit vector of [2 x double]. The lower double-precision value is 704 /// compared to the lower double-precision value of \a __a. 705 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 706 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
707 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, 708 __m128d __b) { 709 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 710 } 711 712 /// Compares the lower double-precision floating-point values in each of 713 /// the two 128-bit floating-point vectors of [2 x double] to determine if 714 /// the value in the first parameter is less than the corresponding value in 715 /// the second parameter. 716 /// 717 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 718 /// If either value in a comparison is NaN, returns false. 719 /// 720 /// \headerfile <x86intrin.h> 721 /// 722 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 723 /// 724 /// \param __a 725 /// A 128-bit vector of [2 x double]. The lower double-precision value is 726 /// compared to the lower double-precision value of \a __b. 727 /// \param __b 728 /// A 128-bit vector of [2 x double]. The lower double-precision value is 729 /// compared to the lower double-precision value of \a __a. 730 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 731 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 732 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, 733 __m128d __b) { 734 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 735 } 736 737 /// Compares the lower double-precision floating-point values in each of 738 /// the two 128-bit floating-point vectors of [2 x double] to determine if 739 /// the value in the first parameter is less than or equal to the 740 /// corresponding value in the second parameter. 741 /// 742 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 743 /// If either value in a comparison is NaN, returns false. 744 /// 745 /// \headerfile <x86intrin.h> 746 /// 747 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 
748 /// 749 /// \param __a 750 /// A 128-bit vector of [2 x double]. The lower double-precision value is 751 /// compared to the lower double-precision value of \a __b. 752 /// \param __b 753 /// A 128-bit vector of [2 x double]. The lower double-precision value is 754 /// compared to the lower double-precision value of \a __a. 755 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 756 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 757 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, 758 __m128d __b) { 759 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 760 } 761 762 /// Compares the lower double-precision floating-point values in each of 763 /// the two 128-bit floating-point vectors of [2 x double] to determine if 764 /// the value in the first parameter is greater than the corresponding value 765 /// in the second parameter. 766 /// 767 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 768 /// If either value in a comparison is NaN, returns false. 769 /// 770 /// \headerfile <x86intrin.h> 771 /// 772 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 773 /// 774 /// \param __a 775 /// A 128-bit vector of [2 x double]. The lower double-precision value is 776 /// compared to the lower double-precision value of \a __b. 777 /// \param __b 778 /// A 128-bit vector of [2 x double]. The lower double-precision value is 779 /// compared to the lower double-precision value of \a __a. 780 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 781 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
782 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, 783 __m128d __b) { 784 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 785 return __extension__(__m128d){__c[0], __a[1]}; 786 } 787 788 /// Compares the lower double-precision floating-point values in each of 789 /// the two 128-bit floating-point vectors of [2 x double] to determine if 790 /// the value in the first parameter is greater than or equal to the 791 /// corresponding value in the second parameter. 792 /// 793 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 794 /// If either value in a comparison is NaN, returns false. 795 /// 796 /// \headerfile <x86intrin.h> 797 /// 798 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 799 /// 800 /// \param __a 801 /// A 128-bit vector of [2 x double]. The lower double-precision value is 802 /// compared to the lower double-precision value of \a __b. 803 /// \param __b 804 /// A 128-bit vector of [2 x double]. The lower double-precision value is 805 /// compared to the lower double-precision value of \a __a. 806 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 807 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 808 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, 809 __m128d __b) { 810 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 811 return __extension__(__m128d){__c[0], __a[1]}; 812 } 813 814 /// Compares the lower double-precision floating-point values in each of 815 /// the two 128-bit floating-point vectors of [2 x double] to determine if 816 /// the value in the first parameter is ordered with respect to the 817 /// corresponding value in the second parameter. 818 /// 819 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 820 /// of double-precision values are ordered with respect to each other if 821 /// neither value is a NaN. 
822 /// 823 /// \headerfile <x86intrin.h> 824 /// 825 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. 826 /// 827 /// \param __a 828 /// A 128-bit vector of [2 x double]. The lower double-precision value is 829 /// compared to the lower double-precision value of \a __b. 830 /// \param __b 831 /// A 128-bit vector of [2 x double]. The lower double-precision value is 832 /// compared to the lower double-precision value of \a __a. 833 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 834 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 835 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, 836 __m128d __b) { 837 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 838 } 839 840 /// Compares the lower double-precision floating-point values in each of 841 /// the two 128-bit floating-point vectors of [2 x double] to determine if 842 /// the value in the first parameter is unordered with respect to the 843 /// corresponding value in the second parameter. 844 /// 845 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 846 /// of double-precision values are unordered with respect to each other if 847 /// one or both values are NaN. 848 /// 849 /// \headerfile <x86intrin.h> 850 /// 851 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> 852 /// instruction. 853 /// 854 /// \param __a 855 /// A 128-bit vector of [2 x double]. The lower double-precision value is 856 /// compared to the lower double-precision value of \a __b. 857 /// \param __b 858 /// A 128-bit vector of [2 x double]. The lower double-precision value is 859 /// compared to the lower double-precision value of \a __a. 860 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 861 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
862 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, 863 __m128d __b) { 864 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 865 } 866 867 /// Compares the lower double-precision floating-point values in each of 868 /// the two 128-bit floating-point vectors of [2 x double] to determine if 869 /// the value in the first parameter is unequal to the corresponding value in 870 /// the second parameter. 871 /// 872 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 873 /// If either value in a comparison is NaN, returns true. 874 /// 875 /// \headerfile <x86intrin.h> 876 /// 877 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. 878 /// 879 /// \param __a 880 /// A 128-bit vector of [2 x double]. The lower double-precision value is 881 /// compared to the lower double-precision value of \a __b. 882 /// \param __b 883 /// A 128-bit vector of [2 x double]. The lower double-precision value is 884 /// compared to the lower double-precision value of \a __a. 885 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 886 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 887 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, 888 __m128d __b) { 889 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 890 } 891 892 /// Compares the lower double-precision floating-point values in each of 893 /// the two 128-bit floating-point vectors of [2 x double] to determine if 894 /// the value in the first parameter is not less than the corresponding 895 /// value in the second parameter. 896 /// 897 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 898 /// If either value in a comparison is NaN, returns true. 899 /// 900 /// \headerfile <x86intrin.h> 901 /// 902 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 
903 /// 904 /// \param __a 905 /// A 128-bit vector of [2 x double]. The lower double-precision value is 906 /// compared to the lower double-precision value of \a __b. 907 /// \param __b 908 /// A 128-bit vector of [2 x double]. The lower double-precision value is 909 /// compared to the lower double-precision value of \a __a. 910 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 911 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 912 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, 913 __m128d __b) { 914 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 915 } 916 917 /// Compares the lower double-precision floating-point values in each of 918 /// the two 128-bit floating-point vectors of [2 x double] to determine if 919 /// the value in the first parameter is not less than or equal to the 920 /// corresponding value in the second parameter. 921 /// 922 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 923 /// If either value in a comparison is NaN, returns true. 924 /// 925 /// \headerfile <x86intrin.h> 926 /// 927 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 928 /// 929 /// \param __a 930 /// A 128-bit vector of [2 x double]. The lower double-precision value is 931 /// compared to the lower double-precision value of \a __b. 932 /// \param __b 933 /// A 128-bit vector of [2 x double]. The lower double-precision value is 934 /// compared to the lower double-precision value of \a __a. 935 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 936 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
937 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, 938 __m128d __b) { 939 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 940 } 941 942 /// Compares the lower double-precision floating-point values in each of 943 /// the two 128-bit floating-point vectors of [2 x double] to determine if 944 /// the value in the first parameter is not greater than the corresponding 945 /// value in the second parameter. 946 /// 947 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 948 /// If either value in a comparison is NaN, returns true. 949 /// 950 /// \headerfile <x86intrin.h> 951 /// 952 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 953 /// 954 /// \param __a 955 /// A 128-bit vector of [2 x double]. The lower double-precision value is 956 /// compared to the lower double-precision value of \a __b. 957 /// \param __b 958 /// A 128-bit vector of [2 x double]. The lower double-precision value is 959 /// compared to the lower double-precision value of \a __a. 960 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 961 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 962 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, 963 __m128d __b) { 964 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 965 return __extension__(__m128d){__c[0], __a[1]}; 966 } 967 968 /// Compares the lower double-precision floating-point values in each of 969 /// the two 128-bit floating-point vectors of [2 x double] to determine if 970 /// the value in the first parameter is not greater than or equal to the 971 /// corresponding value in the second parameter. 972 /// 973 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 974 /// If either value in a comparison is NaN, returns true. 
975 /// 976 /// \headerfile <x86intrin.h> 977 /// 978 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 979 /// 980 /// \param __a 981 /// A 128-bit vector of [2 x double]. The lower double-precision value is 982 /// compared to the lower double-precision value of \a __b. 983 /// \param __b 984 /// A 128-bit vector of [2 x double]. The lower double-precision value is 985 /// compared to the lower double-precision value of \a __a. 986 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 987 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 988 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, 989 __m128d __b) { 990 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 991 return __extension__(__m128d){__c[0], __a[1]}; 992 } 993 994 /// Compares the lower double-precision floating-point values in each of 995 /// the two 128-bit floating-point vectors of [2 x double] for equality. 996 /// 997 /// The comparison returns 0 for false, 1 for true. If either value in a 998 /// comparison is NaN, returns 0. 999 /// 1000 /// \headerfile <x86intrin.h> 1001 /// 1002 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1003 /// 1004 /// \param __a 1005 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1006 /// compared to the lower double-precision value of \a __b. 1007 /// \param __b 1008 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1009 /// compared to the lower double-precision value of \a __a. 1010 /// \returns An integer containing the comparison results. 
1011 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, 1012 __m128d __b) { 1013 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 1014 } 1015 1016 /// Compares the lower double-precision floating-point values in each of 1017 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1018 /// the value in the first parameter is less than the corresponding value in 1019 /// the second parameter. 1020 /// 1021 /// The comparison returns 0 for false, 1 for true. If either value in a 1022 /// comparison is NaN, returns 0. 1023 /// 1024 /// \headerfile <x86intrin.h> 1025 /// 1026 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1027 /// 1028 /// \param __a 1029 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1030 /// compared to the lower double-precision value of \a __b. 1031 /// \param __b 1032 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1033 /// compared to the lower double-precision value of \a __a. 1034 /// \returns An integer containing the comparison results. 1035 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, 1036 __m128d __b) { 1037 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 1038 } 1039 1040 /// Compares the lower double-precision floating-point values in each of 1041 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1042 /// the value in the first parameter is less than or equal to the 1043 /// corresponding value in the second parameter. 1044 /// 1045 /// The comparison returns 0 for false, 1 for true. If either value in a 1046 /// comparison is NaN, returns 0. 1047 /// 1048 /// \headerfile <x86intrin.h> 1049 /// 1050 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1051 /// 1052 /// \param __a 1053 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1054 /// compared to the lower double-precision value of \a __b. 
1055 /// \param __b 1056 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1057 /// compared to the lower double-precision value of \a __a. 1058 /// \returns An integer containing the comparison results. 1059 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, 1060 __m128d __b) { 1061 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1062 } 1063 1064 /// Compares the lower double-precision floating-point values in each of 1065 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1066 /// the value in the first parameter is greater than the corresponding value 1067 /// in the second parameter. 1068 /// 1069 /// The comparison returns 0 for false, 1 for true. If either value in a 1070 /// comparison is NaN, returns 0. 1071 /// 1072 /// \headerfile <x86intrin.h> 1073 /// 1074 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1075 /// 1076 /// \param __a 1077 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1078 /// compared to the lower double-precision value of \a __b. 1079 /// \param __b 1080 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1081 /// compared to the lower double-precision value of \a __a. 1082 /// \returns An integer containing the comparison results. 1083 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, 1084 __m128d __b) { 1085 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1086 } 1087 1088 /// Compares the lower double-precision floating-point values in each of 1089 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1090 /// the value in the first parameter is greater than or equal to the 1091 /// corresponding value in the second parameter. 1092 /// 1093 /// The comparison returns 0 for false, 1 for true. If either value in a 1094 /// comparison is NaN, returns 0. 
1095 /// 1096 /// \headerfile <x86intrin.h> 1097 /// 1098 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1099 /// 1100 /// \param __a 1101 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1102 /// compared to the lower double-precision value of \a __b. 1103 /// \param __b 1104 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1105 /// compared to the lower double-precision value of \a __a. 1106 /// \returns An integer containing the comparison results. 1107 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, 1108 __m128d __b) { 1109 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1110 } 1111 1112 /// Compares the lower double-precision floating-point values in each of 1113 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1114 /// the value in the first parameter is unequal to the corresponding value in 1115 /// the second parameter. 1116 /// 1117 /// The comparison returns 0 for false, 1 for true. If either value in a 1118 /// comparison is NaN, returns 1. 1119 /// 1120 /// \headerfile <x86intrin.h> 1121 /// 1122 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1123 /// 1124 /// \param __a 1125 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1126 /// compared to the lower double-precision value of \a __b. 1127 /// \param __b 1128 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1129 /// compared to the lower double-precision value of \a __a. 1130 /// \returns An integer containing the comparison results. 1131 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, 1132 __m128d __b) { 1133 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1134 } 1135 1136 /// Compares the lower double-precision floating-point values in each of 1137 /// the two 128-bit floating-point vectors of [2 x double] for equality. 
1138 /// 1139 /// The comparison returns 0 for false, 1 for true. If either value in a 1140 /// comparison is NaN, returns 0. 1141 /// 1142 /// \headerfile <x86intrin.h> 1143 /// 1144 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1145 /// 1146 /// \param __a 1147 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1148 /// compared to the lower double-precision value of \a __b. 1149 /// \param __b 1150 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1151 /// compared to the lower double-precision value of \a __a. 1152 /// \returns An integer containing the comparison results. 1153 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, 1154 __m128d __b) { 1155 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1156 } 1157 1158 /// Compares the lower double-precision floating-point values in each of 1159 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1160 /// the value in the first parameter is less than the corresponding value in 1161 /// the second parameter. 1162 /// 1163 /// The comparison returns 0 for false, 1 for true. If either value in a 1164 /// comparison is NaN, returns 0. 1165 /// 1166 /// \headerfile <x86intrin.h> 1167 /// 1168 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1169 /// 1170 /// \param __a 1171 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1172 /// compared to the lower double-precision value of \a __b. 1173 /// \param __b 1174 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1175 /// compared to the lower double-precision value of \a __a. 1176 /// \returns An integer containing the comparison results. 
1177 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, 1178 __m128d __b) { 1179 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1180 } 1181 1182 /// Compares the lower double-precision floating-point values in each of 1183 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1184 /// the value in the first parameter is less than or equal to the 1185 /// corresponding value in the second parameter. 1186 /// 1187 /// The comparison returns 0 for false, 1 for true. If either value in a 1188 /// comparison is NaN, returns 0. 1189 /// 1190 /// \headerfile <x86intrin.h> 1191 /// 1192 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1193 /// 1194 /// \param __a 1195 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1196 /// compared to the lower double-precision value of \a __b. 1197 /// \param __b 1198 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1199 /// compared to the lower double-precision value of \a __a. 1200 /// \returns An integer containing the comparison results. 1201 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, 1202 __m128d __b) { 1203 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1204 } 1205 1206 /// Compares the lower double-precision floating-point values in each of 1207 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1208 /// the value in the first parameter is greater than the corresponding value 1209 /// in the second parameter. 1210 /// 1211 /// The comparison returns 0 for false, 1 for true. If either value in a 1212 /// comparison is NaN, returns 0. 1213 /// 1214 /// \headerfile <x86intrin.h> 1215 /// 1216 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1217 /// 1218 /// \param __a 1219 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1220 /// compared to the lower double-precision value of \a __b. 
1221 /// \param __b 1222 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1223 /// compared to the lower double-precision value of \a __a. 1224 /// \returns An integer containing the comparison results. 1225 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, 1226 __m128d __b) { 1227 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1228 } 1229 1230 /// Compares the lower double-precision floating-point values in each of 1231 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1232 /// the value in the first parameter is greater than or equal to the 1233 /// corresponding value in the second parameter. 1234 /// 1235 /// The comparison returns 0 for false, 1 for true. If either value in a 1236 /// comparison is NaN, returns 0. 1237 /// 1238 /// \headerfile <x86intrin.h> 1239 /// 1240 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1241 /// 1242 /// \param __a 1243 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1244 /// compared to the lower double-precision value of \a __b. 1245 /// \param __b 1246 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1247 /// compared to the lower double-precision value of \a __a. 1248 /// \returns An integer containing the comparison results. 1249 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, 1250 __m128d __b) { 1251 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1252 } 1253 1254 /// Compares the lower double-precision floating-point values in each of 1255 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1256 /// the value in the first parameter is unequal to the corresponding value in 1257 /// the second parameter. 1258 /// 1259 /// The comparison returns 0 for false, 1 for true. If either value in a 1260 /// comparison is NaN, returns 1. 
1261 /// 1262 /// \headerfile <x86intrin.h> 1263 /// 1264 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1265 /// 1266 /// \param __a 1267 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1268 /// compared to the lower double-precision value of \a __b. 1269 /// \param __b 1270 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1271 /// compared to the lower double-precision value of \a __a. 1272 /// \returns An integer containing the comparison result. 1273 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, 1274 __m128d __b) { 1275 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1276 } 1277 1278 /// Converts the two double-precision floating-point elements of a 1279 /// 128-bit vector of [2 x double] into two single-precision floating-point 1280 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 1281 /// The upper 64 bits of the result vector are set to zero. 1282 /// 1283 /// \headerfile <x86intrin.h> 1284 /// 1285 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. 1286 /// 1287 /// \param __a 1288 /// A 128-bit vector of [2 x double]. 1289 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1290 /// converted values. The upper 64 bits are set to zero. 1291 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) { 1292 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1293 } 1294 1295 /// Converts the lower two single-precision floating-point elements of a 1296 /// 128-bit vector of [4 x float] into two double-precision floating-point 1297 /// values, returned in a 128-bit vector of [2 x double]. The upper two 1298 /// elements of the input vector are unused. 1299 /// 1300 /// \headerfile <x86intrin.h> 1301 /// 1302 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. 1303 /// 1304 /// \param __a 1305 /// A 128-bit vector of [4 x float]. 
The lower two single-precision 1306 /// floating-point elements are converted to double-precision values. The 1307 /// upper two elements are unused. 1308 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1309 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 1310 _mm_cvtps_pd(__m128 __a) { 1311 return (__m128d) __builtin_convertvector( 1312 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1313 } 1314 1315 /// Converts the lower two integer elements of a 128-bit vector of 1316 /// [4 x i32] into two double-precision floating-point values, returned in a 1317 /// 128-bit vector of [2 x double]. 1318 /// 1319 /// The upper two elements of the input vector are unused. 1320 /// 1321 /// \headerfile <x86intrin.h> 1322 /// 1323 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. 1324 /// 1325 /// \param __a 1326 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are 1327 /// converted to double-precision values. 1328 /// 1329 /// The upper two elements are unused. 1330 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1331 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 1332 _mm_cvtepi32_pd(__m128i __a) { 1333 return (__m128d) __builtin_convertvector( 1334 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1335 } 1336 1337 /// Converts the two double-precision floating-point elements of a 1338 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1339 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1340 /// 64 bits of the result vector are set to zero. 1341 /// 1342 /// If a converted value does not fit in a 32-bit integer, raises a 1343 /// floating-point invalid exception. If the exception is masked, returns 1344 /// the most negative integer. 
1345 /// 1346 /// \headerfile <x86intrin.h> 1347 /// 1348 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. 1349 /// 1350 /// \param __a 1351 /// A 128-bit vector of [2 x double]. 1352 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1353 /// converted values. The upper 64 bits are set to zero. 1354 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) { 1355 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1356 } 1357 1358 /// Converts the low-order element of a 128-bit vector of [2 x double] 1359 /// into a 32-bit signed integer value. 1360 /// 1361 /// If the converted value does not fit in a 32-bit integer, raises a 1362 /// floating-point invalid exception. If the exception is masked, returns 1363 /// the most negative integer. 1364 /// 1365 /// \headerfile <x86intrin.h> 1366 /// 1367 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 1368 /// 1369 /// \param __a 1370 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1371 /// conversion. 1372 /// \returns A 32-bit signed integer containing the converted value. 1373 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) { 1374 return __builtin_ia32_cvtsd2si((__v2df)__a); 1375 } 1376 1377 /// Converts the lower double-precision floating-point element of a 1378 /// 128-bit vector of [2 x double], in the second parameter, into a 1379 /// single-precision floating-point value, returned in the lower 32 bits of a 1380 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are 1381 /// copied from the upper 96 bits of the first parameter. 1382 /// 1383 /// \headerfile <x86intrin.h> 1384 /// 1385 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. 1386 /// 1387 /// \param __a 1388 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1389 /// copied to the upper 96 bits of the result. 
1390 /// \param __b 1391 /// A 128-bit vector of [2 x double]. The lower double-precision 1392 /// floating-point element is used in the conversion. 1393 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1394 /// converted value from the second parameter. The upper 96 bits are copied 1395 /// from the upper 96 bits of the first parameter. 1396 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, 1397 __m128d __b) { 1398 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1399 } 1400 1401 /// Converts a 32-bit signed integer value, in the second parameter, into 1402 /// a double-precision floating-point value, returned in the lower 64 bits of 1403 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1404 /// are copied from the upper 64 bits of the first parameter. 1405 /// 1406 /// \headerfile <x86intrin.h> 1407 /// 1408 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 1409 /// 1410 /// \param __a 1411 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1412 /// copied to the upper 64 bits of the result. 1413 /// \param __b 1414 /// A 32-bit signed integer containing the value to be converted. 1415 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1416 /// converted value from the second parameter. The upper 64 bits are copied 1417 /// from the upper 64 bits of the first parameter. 1418 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 1419 _mm_cvtsi32_sd(__m128d __a, int __b) { 1420 __a[0] = __b; 1421 return __a; 1422 } 1423 1424 /// Converts the lower single-precision floating-point element of a 1425 /// 128-bit vector of [4 x float], in the second parameter, into a 1426 /// double-precision floating-point value, returned in the lower 64 bits of 1427 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1428 /// are copied from the upper 64 bits of the first parameter. 
1429 /// 1430 /// \headerfile <x86intrin.h> 1431 /// 1432 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. 1433 /// 1434 /// \param __a 1435 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1436 /// copied to the upper 64 bits of the result. 1437 /// \param __b 1438 /// A 128-bit vector of [4 x float]. The lower single-precision 1439 /// floating-point element is used in the conversion. 1440 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1441 /// converted value from the second parameter. The upper 64 bits are copied 1442 /// from the upper 64 bits of the first parameter. 1443 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 1444 _mm_cvtss_sd(__m128d __a, __m128 __b) { 1445 __a[0] = __b[0]; 1446 return __a; 1447 } 1448 1449 /// Converts the two double-precision floating-point elements of a 1450 /// 128-bit vector of [2 x double] into two signed truncated (rounded 1451 /// toward zero) 32-bit integer values, returned in the lower 64 bits 1452 /// of a 128-bit vector of [4 x i32]. 1453 /// 1454 /// If a converted value does not fit in a 32-bit integer, raises a 1455 /// floating-point invalid exception. If the exception is masked, returns 1456 /// the most negative integer. 1457 /// 1458 /// \headerfile <x86intrin.h> 1459 /// 1460 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> 1461 /// instruction. 1462 /// 1463 /// \param __a 1464 /// A 128-bit vector of [2 x double]. 1465 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1466 /// converted values. The upper 64 bits are set to zero. 1467 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) { 1468 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1469 } 1470 1471 /// Converts the low-order element of a [2 x double] vector into a 32-bit 1472 /// signed truncated (rounded toward zero) integer value. 
1473 /// 1474 /// If the converted value does not fit in a 32-bit integer, raises a 1475 /// floating-point invalid exception. If the exception is masked, returns 1476 /// the most negative integer. 1477 /// 1478 /// \headerfile <x86intrin.h> 1479 /// 1480 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 1481 /// instruction. 1482 /// 1483 /// \param __a 1484 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1485 /// conversion. 1486 /// \returns A 32-bit signed integer containing the converted value. 1487 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) { 1488 return __builtin_ia32_cvttsd2si((__v2df)__a); 1489 } 1490 1491 /// Converts the two double-precision floating-point elements of a 1492 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1493 /// returned in a 64-bit vector of [2 x i32]. 1494 /// 1495 /// If a converted value does not fit in a 32-bit integer, raises a 1496 /// floating-point invalid exception. If the exception is masked, returns 1497 /// the most negative integer. 1498 /// 1499 /// \headerfile <x86intrin.h> 1500 /// 1501 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. 1502 /// 1503 /// \param __a 1504 /// A 128-bit vector of [2 x double]. 1505 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1506 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) { 1507 return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a)); 1508 } 1509 1510 /// Converts the two double-precision floating-point elements of a 1511 /// 128-bit vector of [2 x double] into two signed truncated (rounded toward 1512 /// zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32]. 1513 /// 1514 /// If a converted value does not fit in a 32-bit integer, raises a 1515 /// floating-point invalid exception. If the exception is masked, returns 1516 /// the most negative integer. 
1517 /// 1518 /// \headerfile <x86intrin.h> 1519 /// 1520 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. 1521 /// 1522 /// \param __a 1523 /// A 128-bit vector of [2 x double]. 1524 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1525 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) { 1526 return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a)); 1527 } 1528 1529 /// Converts the two signed 32-bit integer elements of a 64-bit vector of 1530 /// [2 x i32] into two double-precision floating-point values, returned in a 1531 /// 128-bit vector of [2 x double]. 1532 /// 1533 /// \headerfile <x86intrin.h> 1534 /// 1535 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. 1536 /// 1537 /// \param __a 1538 /// A 64-bit vector of [2 x i32]. 1539 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1540 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 1541 _mm_cvtpi32_pd(__m64 __a) { 1542 return (__m128d) __builtin_convertvector((__v2si)__a, __v2df); 1543 } 1544 1545 /// Returns the low-order element of a 128-bit vector of [2 x double] as 1546 /// a double-precision floating-point value. 1547 /// 1548 /// \headerfile <x86intrin.h> 1549 /// 1550 /// This intrinsic has no corresponding instruction. 1551 /// 1552 /// \param __a 1553 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned. 1554 /// \returns A double-precision floating-point value copied from the lower 64 1555 /// bits of \a __a. 1556 static __inline__ double __DEFAULT_FN_ATTRS_CONSTEXPR 1557 _mm_cvtsd_f64(__m128d __a) { 1558 return __a[0]; 1559 } 1560 1561 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned 1562 /// memory location. 1563 /// 1564 /// \headerfile <x86intrin.h> 1565 /// 1566 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1567 /// 1568 /// \param __dp 1569 /// A pointer to a 128-bit memory location. 
The address of the memory 1570 /// location has to be 16-byte aligned. 1571 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1572 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) { 1573 return *(const __m128d *)__dp; 1574 } 1575 1576 /// Loads a double-precision floating-point value from a specified memory 1577 /// location and duplicates it to both vector elements of a 128-bit vector of 1578 /// [2 x double]. 1579 /// 1580 /// \headerfile <x86intrin.h> 1581 /// 1582 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction. 1583 /// 1584 /// \param __dp 1585 /// A pointer to a memory location containing a double-precision value. 1586 /// \returns A 128-bit vector of [2 x double] containing the loaded and 1587 /// duplicated values. 1588 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) { 1589 struct __mm_load1_pd_struct { 1590 double __u; 1591 } __attribute__((__packed__, __may_alias__)); 1592 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u; 1593 return __extension__(__m128d){__u, __u}; 1594 } 1595 1596 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 1597 1598 /// Loads two double-precision values, in reverse order, from an aligned 1599 /// memory location into a 128-bit vector of [2 x double]. 1600 /// 1601 /// \headerfile <x86intrin.h> 1602 /// 1603 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction + 1604 /// needed shuffling instructions. In AVX mode, the shuffling may be combined 1605 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. 1606 /// 1607 /// \param __dp 1608 /// A 16-byte aligned pointer to an array of double-precision values to be 1609 /// loaded in reverse order. 1610 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded 1611 /// values. 
1612 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) { 1613 __m128d __u = *(const __m128d *)__dp; 1614 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 1615 } 1616 1617 /// Loads a 128-bit floating-point vector of [2 x double] from an 1618 /// unaligned memory location. 1619 /// 1620 /// \headerfile <x86intrin.h> 1621 /// 1622 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1623 /// 1624 /// \param __dp 1625 /// A pointer to a 128-bit memory location. The address of the memory 1626 /// location does not have to be aligned. 1627 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1628 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) { 1629 struct __loadu_pd { 1630 __m128d_u __v; 1631 } __attribute__((__packed__, __may_alias__)); 1632 return ((const struct __loadu_pd *)__dp)->__v; 1633 } 1634 1635 /// Loads a 64-bit integer value to the low element of a 128-bit integer 1636 /// vector and clears the upper element. 1637 /// 1638 /// \headerfile <x86intrin.h> 1639 /// 1640 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1641 /// 1642 /// \param __a 1643 /// A pointer to a 64-bit memory location. The address of the memory 1644 /// location does not have to be aligned. 1645 /// \returns A 128-bit vector of [2 x i64] containing the loaded value. 1646 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) { 1647 struct __loadu_si64 { 1648 long long __v; 1649 } __attribute__((__packed__, __may_alias__)); 1650 long long __u = ((const struct __loadu_si64 *)__a)->__v; 1651 return __extension__(__m128i)(__v2di){__u, 0LL}; 1652 } 1653 1654 /// Loads a 32-bit integer value to the low element of a 128-bit integer 1655 /// vector and clears the upper element. 1656 /// 1657 /// \headerfile <x86intrin.h> 1658 /// 1659 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 
1660 /// 1661 /// \param __a 1662 /// A pointer to a 32-bit memory location. The address of the memory 1663 /// location does not have to be aligned. 1664 /// \returns A 128-bit vector of [4 x i32] containing the loaded value. 1665 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) { 1666 struct __loadu_si32 { 1667 int __v; 1668 } __attribute__((__packed__, __may_alias__)); 1669 int __u = ((const struct __loadu_si32 *)__a)->__v; 1670 return __extension__(__m128i)(__v4si){__u, 0, 0, 0}; 1671 } 1672 1673 /// Loads a 16-bit integer value to the low element of a 128-bit integer 1674 /// vector and clears the upper element. 1675 /// 1676 /// \headerfile <x86intrin.h> 1677 /// 1678 /// This intrinsic does not correspond to a specific instruction. 1679 /// 1680 /// \param __a 1681 /// A pointer to a 16-bit memory location. The address of the memory 1682 /// location does not have to be aligned. 1683 /// \returns A 128-bit vector of [8 x i16] containing the loaded value. 1684 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) { 1685 struct __loadu_si16 { 1686 short __v; 1687 } __attribute__((__packed__, __may_alias__)); 1688 short __u = ((const struct __loadu_si16 *)__a)->__v; 1689 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0}; 1690 } 1691 1692 /// Loads a 64-bit double-precision value to the low element of a 1693 /// 128-bit integer vector and clears the upper element. 1694 /// 1695 /// \headerfile <x86intrin.h> 1696 /// 1697 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1698 /// 1699 /// \param __dp 1700 /// A pointer to a memory location containing a double-precision value. 1701 /// The address of the memory location does not have to be aligned. 1702 /// \returns A 128-bit vector of [2 x double] containing the loaded value. 
1703 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) { 1704 struct __mm_load_sd_struct { 1705 double __u; 1706 } __attribute__((__packed__, __may_alias__)); 1707 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u; 1708 return __extension__(__m128d){__u, 0}; 1709 } 1710 1711 /// Loads a double-precision value into the high-order bits of a 128-bit 1712 /// vector of [2 x double]. The low-order bits are copied from the low-order 1713 /// bits of the first operand. 1714 /// 1715 /// \headerfile <x86intrin.h> 1716 /// 1717 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1718 /// 1719 /// \param __a 1720 /// A 128-bit vector of [2 x double]. \n 1721 /// Bits [63:0] are written to bits [63:0] of the result. 1722 /// \param __dp 1723 /// A pointer to a 64-bit memory location containing a double-precision 1724 /// floating-point value that is loaded. The loaded value is written to bits 1725 /// [127:64] of the result. The address of the memory location does not have 1726 /// to be aligned. 1727 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1728 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, 1729 double const *__dp) { 1730 struct __mm_loadh_pd_struct { 1731 double __u; 1732 } __attribute__((__packed__, __may_alias__)); 1733 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u; 1734 return __extension__(__m128d){__a[0], __u}; 1735 } 1736 1737 /// Loads a double-precision value into the low-order bits of a 128-bit 1738 /// vector of [2 x double]. The high-order bits are copied from the 1739 /// high-order bits of the first operand. 1740 /// 1741 /// \headerfile <x86intrin.h> 1742 /// 1743 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1744 /// 1745 /// \param __a 1746 /// A 128-bit vector of [2 x double]. \n 1747 /// Bits [127:64] are written to bits [127:64] of the result. 
1748 /// \param __dp 1749 /// A pointer to a 64-bit memory location containing a double-precision 1750 /// floating-point value that is loaded. The loaded value is written to bits 1751 /// [63:0] of the result. The address of the memory location does not have to 1752 /// be aligned. 1753 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1754 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, 1755 double const *__dp) { 1756 struct __mm_loadl_pd_struct { 1757 double __u; 1758 } __attribute__((__packed__, __may_alias__)); 1759 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u; 1760 return __extension__(__m128d){__u, __a[1]}; 1761 } 1762 1763 /// Constructs a 128-bit floating-point vector of [2 x double] with 1764 /// unspecified content. This could be used as an argument to another 1765 /// intrinsic function where the argument is required but the value is not 1766 /// actually used. 1767 /// 1768 /// \headerfile <x86intrin.h> 1769 /// 1770 /// This intrinsic has no corresponding instruction. 1771 /// 1772 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified 1773 /// content. 1774 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) { 1775 return (__m128d)__builtin_ia32_undef128(); 1776 } 1777 1778 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1779 /// 64 bits of the vector are initialized with the specified double-precision 1780 /// floating-point value. The upper 64 bits are set to zero. 1781 /// 1782 /// \headerfile <x86intrin.h> 1783 /// 1784 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1785 /// 1786 /// \param __w 1787 /// A double-precision floating-point value used to initialize the lower 64 1788 /// bits of the result. 1789 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The 1790 /// lower 64 bits contain the value of the parameter. The upper 64 bits are 1791 /// set to zero. 
1792 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_sd(double __w) { 1793 return __extension__(__m128d){__w, 0.0}; 1794 } 1795 1796 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1797 /// of the two double-precision floating-point vector elements set to the 1798 /// specified double-precision floating-point value. 1799 /// 1800 /// \headerfile <x86intrin.h> 1801 /// 1802 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1803 /// 1804 /// \param __w 1805 /// A double-precision floating-point value used to initialize each vector 1806 /// element of the result. 1807 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1808 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_pd(double __w) { 1809 return __extension__(__m128d){__w, __w}; 1810 } 1811 1812 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1813 /// of the two double-precision floating-point vector elements set to the 1814 /// specified double-precision floating-point value. 1815 /// 1816 /// \headerfile <x86intrin.h> 1817 /// 1818 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1819 /// 1820 /// \param __w 1821 /// A double-precision floating-point value used to initialize each vector 1822 /// element of the result. 1823 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1824 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd1(double __w) { 1825 return _mm_set1_pd(__w); 1826 } 1827 1828 /// Constructs a 128-bit floating-point vector of [2 x double] 1829 /// initialized with the specified double-precision floating-point values. 1830 /// 1831 /// \headerfile <x86intrin.h> 1832 /// 1833 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1834 /// 1835 /// \param __w 1836 /// A double-precision floating-point value used to initialize the upper 64 1837 /// bits of the result. 
1838 /// \param __x 1839 /// A double-precision floating-point value used to initialize the lower 64 1840 /// bits of the result. 1841 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1842 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd(double __w, 1843 double __x) { 1844 return __extension__(__m128d){__x, __w}; 1845 } 1846 1847 /// Constructs a 128-bit floating-point vector of [2 x double], 1848 /// initialized in reverse order with the specified double-precision 1849 /// floating-point values. 1850 /// 1851 /// \headerfile <x86intrin.h> 1852 /// 1853 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1854 /// 1855 /// \param __w 1856 /// A double-precision floating-point value used to initialize the lower 64 1857 /// bits of the result. 1858 /// \param __x 1859 /// A double-precision floating-point value used to initialize the upper 64 1860 /// bits of the result. 1861 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1862 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_pd(double __w, 1863 double __x) { 1864 return __extension__(__m128d){__w, __x}; 1865 } 1866 1867 /// Constructs a 128-bit floating-point vector of [2 x double] 1868 /// initialized to zero. 1869 /// 1870 /// \headerfile <x86intrin.h> 1871 /// 1872 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1873 /// 1874 /// \returns An initialized 128-bit floating-point vector of [2 x double] with 1875 /// all elements set to zero. 1876 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void) { 1877 return __extension__(__m128d){0.0, 0.0}; 1878 } 1879 1880 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1881 /// 64 bits are set to the lower 64 bits of the second parameter. The upper 1882 /// 64 bits are set to the upper 64 bits of the first parameter. 
1883 /// 1884 /// \headerfile <x86intrin.h> 1885 /// 1886 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. 1887 /// 1888 /// \param __a 1889 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the 1890 /// upper 64 bits of the result. 1891 /// \param __b 1892 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the 1893 /// lower 64 bits of the result. 1894 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1895 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 1896 _mm_move_sd(__m128d __a, __m128d __b) { 1897 __a[0] = __b[0]; 1898 return __a; 1899 } 1900 1901 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1902 /// memory location. 1903 /// 1904 /// \headerfile <x86intrin.h> 1905 /// 1906 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1907 /// 1908 /// \param __dp 1909 /// A pointer to a 64-bit memory location. 1910 /// \param __a 1911 /// A 128-bit vector of [2 x double] containing the value to be stored. 1912 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, 1913 __m128d __a) { 1914 struct __mm_store_sd_struct { 1915 double __u; 1916 } __attribute__((__packed__, __may_alias__)); 1917 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0]; 1918 } 1919 1920 /// Moves packed double-precision values from a 128-bit vector of 1921 /// [2 x double] to a memory location. 1922 /// 1923 /// \headerfile <x86intrin.h> 1924 /// 1925 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction. 1926 /// 1927 /// \param __dp 1928 /// A pointer to an aligned memory location that can store two 1929 /// double-precision values. 1930 /// \param __a 1931 /// A packed 128-bit vector of [2 x double] containing the values to be 1932 /// moved. 
1933 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, 1934 __m128d __a) { 1935 *(__m128d *)__dp = __a; 1936 } 1937 1938 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1939 /// the upper and lower 64 bits of a memory location. 1940 /// 1941 /// \headerfile <x86intrin.h> 1942 /// 1943 /// This intrinsic corresponds to the 1944 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1945 /// 1946 /// \param __dp 1947 /// A pointer to a memory location that can store two double-precision 1948 /// values. 1949 /// \param __a 1950 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1951 /// of the values in \a __dp. 1952 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, 1953 __m128d __a) { 1954 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 1955 _mm_store_pd(__dp, __a); 1956 } 1957 1958 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1959 /// the upper and lower 64 bits of a memory location. 1960 /// 1961 /// \headerfile <x86intrin.h> 1962 /// 1963 /// This intrinsic corresponds to the 1964 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1965 /// 1966 /// \param __dp 1967 /// A pointer to a memory location that can store two double-precision 1968 /// values. 1969 /// \param __a 1970 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1971 /// of the values in \a __dp. 1972 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, 1973 __m128d __a) { 1974 _mm_store1_pd(__dp, __a); 1975 } 1976 1977 /// Stores a 128-bit vector of [2 x double] into an unaligned memory 1978 /// location. 1979 /// 1980 /// \headerfile <x86intrin.h> 1981 /// 1982 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1983 /// 1984 /// \param __dp 1985 /// A pointer to a 128-bit memory location. The address of the memory 1986 /// location does not have to be aligned. 
1987 /// \param __a 1988 /// A 128-bit vector of [2 x double] containing the values to be stored. 1989 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, 1990 __m128d __a) { 1991 struct __storeu_pd { 1992 __m128d_u __v; 1993 } __attribute__((__packed__, __may_alias__)); 1994 ((struct __storeu_pd *)__dp)->__v = __a; 1995 } 1996 1997 /// Stores two double-precision values, in reverse order, from a 128-bit 1998 /// vector of [2 x double] to a 16-byte aligned memory location. 1999 /// 2000 /// \headerfile <x86intrin.h> 2001 /// 2002 /// This intrinsic corresponds to a shuffling instruction followed by a 2003 /// <c> VMOVAPD / MOVAPD </c> instruction. 2004 /// 2005 /// \param __dp 2006 /// A pointer to a 16-byte aligned memory location that can store two 2007 /// double-precision values. 2008 /// \param __a 2009 /// A 128-bit vector of [2 x double] containing the values to be reversed and 2010 /// stored. 2011 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, 2012 __m128d __a) { 2013 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 2014 *(__m128d *)__dp = __a; 2015 } 2016 2017 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 2018 /// memory location. 2019 /// 2020 /// \headerfile <x86intrin.h> 2021 /// 2022 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 2023 /// 2024 /// \param __dp 2025 /// A pointer to a 64-bit memory location. 2026 /// \param __a 2027 /// A 128-bit vector of [2 x double] containing the value to be stored. 2028 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, 2029 __m128d __a) { 2030 struct __mm_storeh_pd_struct { 2031 double __u; 2032 } __attribute__((__packed__, __may_alias__)); 2033 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1]; 2034 } 2035 2036 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 2037 /// memory location. 
2038 /// 2039 /// \headerfile <x86intrin.h> 2040 /// 2041 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 2042 /// 2043 /// \param __dp 2044 /// A pointer to a 64-bit memory location. 2045 /// \param __a 2046 /// A 128-bit vector of [2 x double] containing the value to be stored. 2047 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, 2048 __m128d __a) { 2049 struct __mm_storeh_pd_struct { 2050 double __u; 2051 } __attribute__((__packed__, __may_alias__)); 2052 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0]; 2053 } 2054 2055 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8], 2056 /// saving the lower 8 bits of each sum in the corresponding element of a 2057 /// 128-bit result vector of [16 x i8]. 2058 /// 2059 /// The integer elements of both parameters can be either signed or unsigned. 2060 /// 2061 /// \headerfile <x86intrin.h> 2062 /// 2063 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction. 2064 /// 2065 /// \param __a 2066 /// A 128-bit vector of [16 x i8]. 2067 /// \param __b 2068 /// A 128-bit vector of [16 x i8]. 2069 /// \returns A 128-bit vector of [16 x i8] containing the sums of both 2070 /// parameters. 2071 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, 2072 __m128i __b) { 2073 return (__m128i)((__v16qu)__a + (__v16qu)__b); 2074 } 2075 2076 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16], 2077 /// saving the lower 16 bits of each sum in the corresponding element of a 2078 /// 128-bit result vector of [8 x i16]. 2079 /// 2080 /// The integer elements of both parameters can be either signed or unsigned. 2081 /// 2082 /// \headerfile <x86intrin.h> 2083 /// 2084 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction. 2085 /// 2086 /// \param __a 2087 /// A 128-bit vector of [8 x i16]. 2088 /// \param __b 2089 /// A 128-bit vector of [8 x i16]. 
2090 /// \returns A 128-bit vector of [8 x i16] containing the sums of both 2091 /// parameters. 2092 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, 2093 __m128i __b) { 2094 return (__m128i)((__v8hu)__a + (__v8hu)__b); 2095 } 2096 2097 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32], 2098 /// saving the lower 32 bits of each sum in the corresponding element of a 2099 /// 128-bit result vector of [4 x i32]. 2100 /// 2101 /// The integer elements of both parameters can be either signed or unsigned. 2102 /// 2103 /// \headerfile <x86intrin.h> 2104 /// 2105 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction. 2106 /// 2107 /// \param __a 2108 /// A 128-bit vector of [4 x i32]. 2109 /// \param __b 2110 /// A 128-bit vector of [4 x i32]. 2111 /// \returns A 128-bit vector of [4 x i32] containing the sums of both 2112 /// parameters. 2113 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 2114 _mm_add_epi32(__m128i __a, __m128i __b) { 2115 return (__m128i)((__v4su)__a + (__v4su)__b); 2116 } 2117 2118 /// Adds two signed or unsigned 64-bit integer values, returning the 2119 /// lower 64 bits of the sum. 2120 /// 2121 /// \headerfile <x86intrin.h> 2122 /// 2123 /// This intrinsic corresponds to the <c> PADDQ </c> instruction. 2124 /// 2125 /// \param __a 2126 /// A 64-bit integer. 2127 /// \param __b 2128 /// A 64-bit integer. 2129 /// \returns A 64-bit integer containing the sum of both parameters. 2130 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) { 2131 return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b)); 2132 } 2133 2134 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], 2135 /// saving the lower 64 bits of each sum in the corresponding element of a 2136 /// 128-bit result vector of [2 x i64]. 2137 /// 2138 /// The integer elements of both parameters can be either signed or unsigned. 
2139 /// 2140 /// \headerfile <x86intrin.h> 2141 /// 2142 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction. 2143 /// 2144 /// \param __a 2145 /// A 128-bit vector of [2 x i64]. 2146 /// \param __b 2147 /// A 128-bit vector of [2 x i64]. 2148 /// \returns A 128-bit vector of [2 x i64] containing the sums of both 2149 /// parameters. 2150 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 2151 _mm_add_epi64(__m128i __a, __m128i __b) { 2152 return (__m128i)((__v2du)__a + (__v2du)__b); 2153 } 2154 2155 /// Adds, with saturation, the corresponding elements of two 128-bit 2156 /// signed [16 x i8] vectors, saving each sum in the corresponding element 2157 /// of a 128-bit result vector of [16 x i8]. 2158 /// 2159 /// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums 2160 /// less than 0x80 are saturated to 0x80. 2161 /// 2162 /// \headerfile <x86intrin.h> 2163 /// 2164 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction. 2165 /// 2166 /// \param __a 2167 /// A 128-bit signed [16 x i8] vector. 2168 /// \param __b 2169 /// A 128-bit signed [16 x i8] vector. 2170 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of 2171 /// both parameters. 2172 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, 2173 __m128i __b) { 2174 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b); 2175 } 2176 2177 /// Adds, with saturation, the corresponding elements of two 128-bit 2178 /// signed [8 x i16] vectors, saving each sum in the corresponding element 2179 /// of a 128-bit result vector of [8 x i16]. 2180 /// 2181 /// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums 2182 /// less than 0x8000 are saturated to 0x8000. 2183 /// 2184 /// \headerfile <x86intrin.h> 2185 /// 2186 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction. 2187 /// 2188 /// \param __a 2189 /// A 128-bit signed [8 x i16] vector. 
2190 /// \param __b 2191 /// A 128-bit signed [8 x i16] vector. 2192 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of 2193 /// both parameters. 2194 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, 2195 __m128i __b) { 2196 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b); 2197 } 2198 2199 /// Adds, with saturation, the corresponding elements of two 128-bit 2200 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element 2201 /// of a 128-bit result vector of [16 x i8]. 2202 /// 2203 /// Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are 2204 /// saturated to 0x00. 2205 /// 2206 /// \headerfile <x86intrin.h> 2207 /// 2208 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2209 /// 2210 /// \param __a 2211 /// A 128-bit unsigned [16 x i8] vector. 2212 /// \param __b 2213 /// A 128-bit unsigned [16 x i8] vector. 2214 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums 2215 /// of both parameters. 2216 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, 2217 __m128i __b) { 2218 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b); 2219 } 2220 2221 /// Adds, with saturation, the corresponding elements of two 128-bit 2222 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element 2223 /// of a 128-bit result vector of [8 x i16]. 2224 /// 2225 /// Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums 2226 /// are saturated to 0x0000. 2227 /// 2228 /// \headerfile <x86intrin.h> 2229 /// 2230 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2231 /// 2232 /// \param __a 2233 /// A 128-bit unsigned [8 x i16] vector. 2234 /// \param __b 2235 /// A 128-bit unsigned [8 x i16] vector. 2236 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums 2237 /// of both parameters. 
2238 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, 2239 __m128i __b) { 2240 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b); 2241 } 2242 2243 /// Computes the rounded averages of corresponding elements of two 2244 /// 128-bit unsigned [16 x i8] vectors, saving each result in the 2245 /// corresponding element of a 128-bit result vector of [16 x i8]. 2246 /// 2247 /// \headerfile <x86intrin.h> 2248 /// 2249 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction. 2250 /// 2251 /// \param __a 2252 /// A 128-bit unsigned [16 x i8] vector. 2253 /// \param __b 2254 /// A 128-bit unsigned [16 x i8] vector. 2255 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded 2256 /// averages of both parameters. 2257 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, 2258 __m128i __b) { 2259 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 2260 } 2261 2262 /// Computes the rounded averages of corresponding elements of two 2263 /// 128-bit unsigned [8 x i16] vectors, saving each result in the 2264 /// corresponding element of a 128-bit result vector of [8 x i16]. 2265 /// 2266 /// \headerfile <x86intrin.h> 2267 /// 2268 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction. 2269 /// 2270 /// \param __a 2271 /// A 128-bit unsigned [8 x i16] vector. 2272 /// \param __b 2273 /// A 128-bit unsigned [8 x i16] vector. 2274 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded 2275 /// averages of both parameters. 
2276 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, 2277 __m128i __b) { 2278 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 2279 } 2280 2281 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16] 2282 /// vectors, producing eight intermediate 32-bit signed integer products, and 2283 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed 2284 /// [4 x i32] vector. 2285 /// 2286 /// For example, bits [15:0] of both parameters are multiplied producing a 2287 /// 32-bit product, bits [31:16] of both parameters are multiplied producing 2288 /// a 32-bit product, and the sum of those two products becomes bits [31:0] 2289 /// of the result. 2290 /// 2291 /// \headerfile <x86intrin.h> 2292 /// 2293 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction. 2294 /// 2295 /// \param __a 2296 /// A 128-bit signed [8 x i16] vector. 2297 /// \param __b 2298 /// A 128-bit signed [8 x i16] vector. 2299 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products 2300 /// of both parameters. 2301 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, 2302 __m128i __b) { 2303 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 2304 } 2305 2306 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2307 /// vectors, saving the greater value from each comparison in the 2308 /// corresponding element of a 128-bit result vector of [8 x i16]. 2309 /// 2310 /// \headerfile <x86intrin.h> 2311 /// 2312 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction. 2313 /// 2314 /// \param __a 2315 /// A 128-bit signed [8 x i16] vector. 2316 /// \param __b 2317 /// A 128-bit signed [8 x i16] vector. 2318 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of 2319 /// each comparison. 
2320 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, 2321 __m128i __b) { 2322 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b); 2323 } 2324 2325 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2326 /// vectors, saving the greater value from each comparison in the 2327 /// corresponding element of a 128-bit result vector of [16 x i8]. 2328 /// 2329 /// \headerfile <x86intrin.h> 2330 /// 2331 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction. 2332 /// 2333 /// \param __a 2334 /// A 128-bit unsigned [16 x i8] vector. 2335 /// \param __b 2336 /// A 128-bit unsigned [16 x i8] vector. 2337 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of 2338 /// each comparison. 2339 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, 2340 __m128i __b) { 2341 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b); 2342 } 2343 2344 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2345 /// vectors, saving the smaller value from each comparison in the 2346 /// corresponding element of a 128-bit result vector of [8 x i16]. 2347 /// 2348 /// \headerfile <x86intrin.h> 2349 /// 2350 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. 2351 /// 2352 /// \param __a 2353 /// A 128-bit signed [8 x i16] vector. 2354 /// \param __b 2355 /// A 128-bit signed [8 x i16] vector. 2356 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2357 /// each comparison. 2358 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, 2359 __m128i __b) { 2360 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b); 2361 } 2362 2363 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2364 /// vectors, saving the smaller value from each comparison in the 2365 /// corresponding element of a 128-bit result vector of [16 x i8]. 
2366 /// 2367 /// \headerfile <x86intrin.h> 2368 /// 2369 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. 2370 /// 2371 /// \param __a 2372 /// A 128-bit unsigned [16 x i8] vector. 2373 /// \param __b 2374 /// A 128-bit unsigned [16 x i8] vector. 2375 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2376 /// each comparison. 2377 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, 2378 __m128i __b) { 2379 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b); 2380 } 2381 2382 /// Multiplies the corresponding elements of two signed [8 x i16] 2383 /// vectors, saving the upper 16 bits of each 32-bit product in the 2384 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2385 /// 2386 /// \headerfile <x86intrin.h> 2387 /// 2388 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. 2389 /// 2390 /// \param __a 2391 /// A 128-bit signed [8 x i16] vector. 2392 /// \param __b 2393 /// A 128-bit signed [8 x i16] vector. 2394 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2395 /// each of the eight 32-bit products. 2396 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, 2397 __m128i __b) { 2398 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2399 } 2400 2401 /// Multiplies the corresponding elements of two unsigned [8 x i16] 2402 /// vectors, saving the upper 16 bits of each 32-bit product in the 2403 /// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2404 /// 2405 /// \headerfile <x86intrin.h> 2406 /// 2407 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. 2408 /// 2409 /// \param __a 2410 /// A 128-bit unsigned [8 x i16] vector. 2411 /// \param __b 2412 /// A 128-bit unsigned [8 x i16] vector. 
2413 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2414 /// of each of the eight 32-bit products. 2415 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, 2416 __m128i __b) { 2417 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2418 } 2419 2420 /// Multiplies the corresponding elements of two signed [8 x i16] 2421 /// vectors, saving the lower 16 bits of each 32-bit product in the 2422 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2423 /// 2424 /// \headerfile <x86intrin.h> 2425 /// 2426 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. 2427 /// 2428 /// \param __a 2429 /// A 128-bit signed [8 x i16] vector. 2430 /// \param __b 2431 /// A 128-bit signed [8 x i16] vector. 2432 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2433 /// each of the eight 32-bit products. 2434 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, 2435 __m128i __b) { 2436 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2437 } 2438 2439 /// Multiplies 32-bit unsigned integer values contained in the lower bits 2440 /// of the two 64-bit integer vectors and returns the 64-bit unsigned 2441 /// product. 2442 /// 2443 /// \headerfile <x86intrin.h> 2444 /// 2445 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. 2446 /// 2447 /// \param __a 2448 /// A 64-bit integer containing one of the source operands. 2449 /// \param __b 2450 /// A 64-bit integer containing one of the source operands. 2451 /// \returns A 64-bit integer vector containing the product of both operands. 
2452 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { 2453 return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a), 2454 (__v4si)__anyext128(__b))); 2455 } 2456 2457 /// Multiplies 32-bit unsigned integer values contained in the lower 2458 /// bits of the corresponding elements of two [2 x i64] vectors, and returns 2459 /// the 64-bit products in the corresponding elements of a [2 x i64] vector. 2460 /// 2461 /// \headerfile <x86intrin.h> 2462 /// 2463 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. 2464 /// 2465 /// \param __a 2466 /// A [2 x i64] vector containing one of the source operands. 2467 /// \param __b 2468 /// A [2 x i64] vector containing one of the source operands. 2469 /// \returns A [2 x i64] vector containing the product of both operands. 2470 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, 2471 __m128i __b) { 2472 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2473 } 2474 2475 /// Computes the absolute differences of corresponding 8-bit integer 2476 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2477 /// separately sums the second 8 absolute differences. Packs these two 2478 /// unsigned 16-bit integer sums into the upper and lower elements of a 2479 /// [2 x i64] vector. 2480 /// 2481 /// \headerfile <x86intrin.h> 2482 /// 2483 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. 2484 /// 2485 /// \param __a 2486 /// A 128-bit integer vector containing one of the source operands. 2487 /// \param __b 2488 /// A 128-bit integer vector containing one of the source operands. 2489 /// \returns A [2 x i64] vector containing the sums of the sets of absolute 2490 /// differences between both operands. 
2491 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, 2492 __m128i __b) { 2493 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2494 } 2495 2496 /// Subtracts the corresponding 8-bit integer values in the operands. 2497 /// 2498 /// \headerfile <x86intrin.h> 2499 /// 2500 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. 2501 /// 2502 /// \param __a 2503 /// A 128-bit integer vector containing the minuends. 2504 /// \param __b 2505 /// A 128-bit integer vector containing the subtrahends. 2506 /// \returns A 128-bit integer vector containing the differences of the values 2507 /// in the operands. 2508 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, 2509 __m128i __b) { 2510 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2511 } 2512 2513 /// Subtracts the corresponding 16-bit integer values in the operands. 2514 /// 2515 /// \headerfile <x86intrin.h> 2516 /// 2517 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. 2518 /// 2519 /// \param __a 2520 /// A 128-bit integer vector containing the minuends. 2521 /// \param __b 2522 /// A 128-bit integer vector containing the subtrahends. 2523 /// \returns A 128-bit integer vector containing the differences of the values 2524 /// in the operands. 2525 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, 2526 __m128i __b) { 2527 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2528 } 2529 2530 /// Subtracts the corresponding 32-bit integer values in the operands. 2531 /// 2532 /// \headerfile <x86intrin.h> 2533 /// 2534 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. 2535 /// 2536 /// \param __a 2537 /// A 128-bit integer vector containing the minuends. 2538 /// \param __b 2539 /// A 128-bit integer vector containing the subtrahends. 2540 /// \returns A 128-bit integer vector containing the differences of the values 2541 /// in the operands. 
2542 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 2543 _mm_sub_epi32(__m128i __a, __m128i __b) { 2544 return (__m128i)((__v4su)__a - (__v4su)__b); 2545 } 2546 2547 /// Subtracts signed or unsigned 64-bit integer values and writes the 2548 /// difference to the corresponding bits in the destination. 2549 /// 2550 /// \headerfile <x86intrin.h> 2551 /// 2552 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction. 2553 /// 2554 /// \param __a 2555 /// A 64-bit integer vector containing the minuend. 2556 /// \param __b 2557 /// A 64-bit integer vector containing the subtrahend. 2558 /// \returns A 64-bit integer vector containing the difference of the values in 2559 /// the operands. 2560 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) { 2561 return (__m64)((unsigned long long)__a - (unsigned long long)__b); 2562 } 2563 2564 /// Subtracts the corresponding elements of two [2 x i64] vectors. 2565 /// 2566 /// \headerfile <x86intrin.h> 2567 /// 2568 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. 2569 /// 2570 /// \param __a 2571 /// A 128-bit integer vector containing the minuends. 2572 /// \param __b 2573 /// A 128-bit integer vector containing the subtrahends. 2574 /// \returns A 128-bit integer vector containing the differences of the values 2575 /// in the operands. 2576 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 2577 _mm_sub_epi64(__m128i __a, __m128i __b) { 2578 return (__m128i)((__v2du)__a - (__v2du)__b); 2579 } 2580 2581 /// Subtracts, with saturation, corresponding 8-bit signed integer values in 2582 /// the input and returns the differences in the corresponding bytes in the 2583 /// destination. 2584 /// 2585 /// Differences greater than 0x7F are saturated to 0x7F, and differences 2586 /// less than 0x80 are saturated to 0x80. 2587 /// 2588 /// \headerfile <x86intrin.h> 2589 /// 2590 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. 
2591 /// 2592 /// \param __a 2593 /// A 128-bit integer vector containing the minuends. 2594 /// \param __b 2595 /// A 128-bit integer vector containing the subtrahends. 2596 /// \returns A 128-bit integer vector containing the differences of the values 2597 /// in the operands. 2598 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, 2599 __m128i __b) { 2600 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b); 2601 } 2602 2603 /// Subtracts, with saturation, corresponding 16-bit signed integer values in 2604 /// the input and returns the differences in the corresponding bytes in the 2605 /// destination. 2606 /// 2607 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less 2608 /// than 0x8000 are saturated to 0x8000. 2609 /// 2610 /// \headerfile <x86intrin.h> 2611 /// 2612 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. 2613 /// 2614 /// \param __a 2615 /// A 128-bit integer vector containing the minuends. 2616 /// \param __b 2617 /// A 128-bit integer vector containing the subtrahends. 2618 /// \returns A 128-bit integer vector containing the differences of the values 2619 /// in the operands. 2620 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, 2621 __m128i __b) { 2622 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b); 2623 } 2624 2625 /// Subtracts, with saturation, corresponding 8-bit unsigned integer values in 2626 /// the input and returns the differences in the corresponding bytes in the 2627 /// destination. 2628 /// 2629 /// Differences less than 0x00 are saturated to 0x00. 2630 /// 2631 /// \headerfile <x86intrin.h> 2632 /// 2633 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. 2634 /// 2635 /// \param __a 2636 /// A 128-bit integer vector containing the minuends. 2637 /// \param __b 2638 /// A 128-bit integer vector containing the subtrahends. 
2639 /// \returns A 128-bit integer vector containing the unsigned integer 2640 /// differences of the values in the operands. 2641 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, 2642 __m128i __b) { 2643 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b); 2644 } 2645 2646 /// Subtracts, with saturation, corresponding 16-bit unsigned integer values in 2647 /// the input and returns the differences in the corresponding bytes in the 2648 /// destination. 2649 /// 2650 /// Differences less than 0x0000 are saturated to 0x0000. 2651 /// 2652 /// \headerfile <x86intrin.h> 2653 /// 2654 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. 2655 /// 2656 /// \param __a 2657 /// A 128-bit integer vector containing the minuends. 2658 /// \param __b 2659 /// A 128-bit integer vector containing the subtrahends. 2660 /// \returns A 128-bit integer vector containing the unsigned integer 2661 /// differences of the values in the operands. 2662 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, 2663 __m128i __b) { 2664 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b); 2665 } 2666 2667 /// Performs a bitwise AND of two 128-bit integer vectors. 2668 /// 2669 /// \headerfile <x86intrin.h> 2670 /// 2671 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 2672 /// 2673 /// \param __a 2674 /// A 128-bit integer vector containing one of the source operands. 2675 /// \param __b 2676 /// A 128-bit integer vector containing one of the source operands. 2677 /// \returns A 128-bit integer vector containing the bitwise AND of the values 2678 /// in both operands. 
2679 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, 2680 __m128i __b) { 2681 return (__m128i)((__v2du)__a & (__v2du)__b); 2682 } 2683 2684 /// Performs a bitwise AND of two 128-bit integer vectors, using the 2685 /// one's complement of the values contained in the first source operand. 2686 /// 2687 /// \headerfile <x86intrin.h> 2688 /// 2689 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 2690 /// 2691 /// \param __a 2692 /// A 128-bit vector containing the left source operand. The one's complement 2693 /// of this value is used in the bitwise AND. 2694 /// \param __b 2695 /// A 128-bit vector containing the right source operand. 2696 /// \returns A 128-bit integer vector containing the bitwise AND of the one's 2697 /// complement of the first operand and the values in the second operand. 2698 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, 2699 __m128i __b) { 2700 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2701 } 2702 /// Performs a bitwise OR of two 128-bit integer vectors. 2703 /// 2704 /// \headerfile <x86intrin.h> 2705 /// 2706 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 2707 /// 2708 /// \param __a 2709 /// A 128-bit integer vector containing one of the source operands. 2710 /// \param __b 2711 /// A 128-bit integer vector containing one of the source operands. 2712 /// \returns A 128-bit integer vector containing the bitwise OR of the values 2713 /// in both operands. 2714 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, 2715 __m128i __b) { 2716 return (__m128i)((__v2du)__a | (__v2du)__b); 2717 } 2718 2719 /// Performs a bitwise exclusive OR of two 128-bit integer vectors. 2720 /// 2721 /// \headerfile <x86intrin.h> 2722 /// 2723 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 2724 /// 2725 /// \param __a 2726 /// A 128-bit integer vector containing one of the source operands. 
2727 /// \param __b 2728 /// A 128-bit integer vector containing one of the source operands. 2729 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2730 /// values in both operands. 2731 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, 2732 __m128i __b) { 2733 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2734 } 2735 2736 /// Left-shifts the 128-bit integer vector operand by the specified 2737 /// number of bytes. Low-order bits are cleared. 2738 /// 2739 /// \headerfile <x86intrin.h> 2740 /// 2741 /// \code 2742 /// __m128i _mm_slli_si128(__m128i a, const int imm); 2743 /// \endcode 2744 /// 2745 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. 2746 /// 2747 /// \param a 2748 /// A 128-bit integer vector containing the source operand. 2749 /// \param imm 2750 /// An immediate value specifying the number of bytes to left-shift operand 2751 /// \a a. 2752 /// \returns A 128-bit integer vector containing the left-shifted value. 2753 #define _mm_slli_si128(a, imm) \ 2754 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ 2755 (int)(imm))) 2756 2757 #define _mm_bslli_si128(a, imm) \ 2758 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ 2759 (int)(imm))) 2760 2761 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2762 /// by the specified number of bits. Low-order bits are cleared. 2763 /// 2764 /// \headerfile <x86intrin.h> 2765 /// 2766 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2767 /// 2768 /// \param __a 2769 /// A 128-bit integer vector containing the source operand. 2770 /// \param __count 2771 /// An integer value specifying the number of bits to left-shift each value 2772 /// in operand \a __a. 2773 /// \returns A 128-bit integer vector containing the left-shifted values. 
2774 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, 2775 int __count) { 2776 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2777 } 2778 2779 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2780 /// by the specified number of bits. Low-order bits are cleared. 2781 /// 2782 /// \headerfile <x86intrin.h> 2783 /// 2784 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2785 /// 2786 /// \param __a 2787 /// A 128-bit integer vector containing the source operand. 2788 /// \param __count 2789 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2790 /// to left-shift each value in operand \a __a. 2791 /// \returns A 128-bit integer vector containing the left-shifted values. 2792 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, 2793 __m128i __count) { 2794 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2795 } 2796 2797 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2798 /// by the specified number of bits. Low-order bits are cleared. 2799 /// 2800 /// \headerfile <x86intrin.h> 2801 /// 2802 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2803 /// 2804 /// \param __a 2805 /// A 128-bit integer vector containing the source operand. 2806 /// \param __count 2807 /// An integer value specifying the number of bits to left-shift each value 2808 /// in operand \a __a. 2809 /// \returns A 128-bit integer vector containing the left-shifted values. 2810 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, 2811 int __count) { 2812 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2813 } 2814 2815 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2816 /// by the specified number of bits. Low-order bits are cleared. 
2817 /// 2818 /// \headerfile <x86intrin.h> 2819 /// 2820 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2821 /// 2822 /// \param __a 2823 /// A 128-bit integer vector containing the source operand. 2824 /// \param __count 2825 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2826 /// to left-shift each value in operand \a __a. 2827 /// \returns A 128-bit integer vector containing the left-shifted values. 2828 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, 2829 __m128i __count) { 2830 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2831 } 2832 2833 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2834 /// by the specified number of bits. Low-order bits are cleared. 2835 /// 2836 /// \headerfile <x86intrin.h> 2837 /// 2838 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2839 /// 2840 /// \param __a 2841 /// A 128-bit integer vector containing the source operand. 2842 /// \param __count 2843 /// An integer value specifying the number of bits to left-shift each value 2844 /// in operand \a __a. 2845 /// \returns A 128-bit integer vector containing the left-shifted values. 2846 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, 2847 int __count) { 2848 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2849 } 2850 2851 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2852 /// by the specified number of bits. Low-order bits are cleared. 2853 /// 2854 /// \headerfile <x86intrin.h> 2855 /// 2856 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2857 /// 2858 /// \param __a 2859 /// A 128-bit integer vector containing the source operand. 2860 /// \param __count 2861 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2862 /// to left-shift each value in operand \a __a. 
2863 /// \returns A 128-bit integer vector containing the left-shifted values. 2864 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, 2865 __m128i __count) { 2866 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2867 } 2868 2869 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2870 /// by the specified number of bits. High-order bits are filled with the sign 2871 /// bit of the initial value. 2872 /// 2873 /// \headerfile <x86intrin.h> 2874 /// 2875 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2876 /// 2877 /// \param __a 2878 /// A 128-bit integer vector containing the source operand. 2879 /// \param __count 2880 /// An integer value specifying the number of bits to right-shift each value 2881 /// in operand \a __a. 2882 /// \returns A 128-bit integer vector containing the right-shifted values. 2883 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, 2884 int __count) { 2885 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2886 } 2887 2888 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2889 /// by the specified number of bits. High-order bits are filled with the sign 2890 /// bit of the initial value. 2891 /// 2892 /// \headerfile <x86intrin.h> 2893 /// 2894 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2895 /// 2896 /// \param __a 2897 /// A 128-bit integer vector containing the source operand. 2898 /// \param __count 2899 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2900 /// to right-shift each value in operand \a __a. 2901 /// \returns A 128-bit integer vector containing the right-shifted values. 
2902 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, 2903 __m128i __count) { 2904 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2905 } 2906 2907 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2908 /// by the specified number of bits. High-order bits are filled with the sign 2909 /// bit of the initial value. 2910 /// 2911 /// \headerfile <x86intrin.h> 2912 /// 2913 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2914 /// 2915 /// \param __a 2916 /// A 128-bit integer vector containing the source operand. 2917 /// \param __count 2918 /// An integer value specifying the number of bits to right-shift each value 2919 /// in operand \a __a. 2920 /// \returns A 128-bit integer vector containing the right-shifted values. 2921 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, 2922 int __count) { 2923 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 2924 } 2925 2926 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2927 /// by the specified number of bits. High-order bits are filled with the sign 2928 /// bit of the initial value. 2929 /// 2930 /// \headerfile <x86intrin.h> 2931 /// 2932 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2933 /// 2934 /// \param __a 2935 /// A 128-bit integer vector containing the source operand. 2936 /// \param __count 2937 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2938 /// to right-shift each value in operand \a __a. 2939 /// \returns A 128-bit integer vector containing the right-shifted values. 2940 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, 2941 __m128i __count) { 2942 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 2943 } 2944 2945 /// Right-shifts the 128-bit integer vector operand by the specified 2946 /// number of bytes. High-order bits are cleared. 
2947 /// 2948 /// \headerfile <x86intrin.h> 2949 /// 2950 /// \code 2951 /// __m128i _mm_srli_si128(__m128i a, const int imm); 2952 /// \endcode 2953 /// 2954 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. 2955 /// 2956 /// \param a 2957 /// A 128-bit integer vector containing the source operand. 2958 /// \param imm 2959 /// An immediate value specifying the number of bytes to right-shift operand 2960 /// \a a. 2961 /// \returns A 128-bit integer vector containing the right-shifted value. 2962 #define _mm_srli_si128(a, imm) \ 2963 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ 2964 (int)(imm))) 2965 2966 #define _mm_bsrli_si128(a, imm) \ 2967 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ 2968 (int)(imm))) 2969 2970 /// Right-shifts each of 16-bit values in the 128-bit integer vector 2971 /// operand by the specified number of bits. High-order bits are cleared. 2972 /// 2973 /// \headerfile <x86intrin.h> 2974 /// 2975 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2976 /// 2977 /// \param __a 2978 /// A 128-bit integer vector containing the source operand. 2979 /// \param __count 2980 /// An integer value specifying the number of bits to right-shift each value 2981 /// in operand \a __a. 2982 /// \returns A 128-bit integer vector containing the right-shifted values. 2983 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, 2984 int __count) { 2985 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 2986 } 2987 2988 /// Right-shifts each of 16-bit values in the 128-bit integer vector 2989 /// operand by the specified number of bits. High-order bits are cleared. 2990 /// 2991 /// \headerfile <x86intrin.h> 2992 /// 2993 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2994 /// 2995 /// \param __a 2996 /// A 128-bit integer vector containing the source operand. 
2997 /// \param __count 2998 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2999 /// to right-shift each value in operand \a __a. 3000 /// \returns A 128-bit integer vector containing the right-shifted values. 3001 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, 3002 __m128i __count) { 3003 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 3004 } 3005 3006 /// Right-shifts each of 32-bit values in the 128-bit integer vector 3007 /// operand by the specified number of bits. High-order bits are cleared. 3008 /// 3009 /// \headerfile <x86intrin.h> 3010 /// 3011 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3012 /// 3013 /// \param __a 3014 /// A 128-bit integer vector containing the source operand. 3015 /// \param __count 3016 /// An integer value specifying the number of bits to right-shift each value 3017 /// in operand \a __a. 3018 /// \returns A 128-bit integer vector containing the right-shifted values. 3019 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, 3020 int __count) { 3021 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 3022 } 3023 3024 /// Right-shifts each of 32-bit values in the 128-bit integer vector 3025 /// operand by the specified number of bits. High-order bits are cleared. 3026 /// 3027 /// \headerfile <x86intrin.h> 3028 /// 3029 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3030 /// 3031 /// \param __a 3032 /// A 128-bit integer vector containing the source operand. 3033 /// \param __count 3034 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3035 /// to right-shift each value in operand \a __a. 3036 /// \returns A 128-bit integer vector containing the right-shifted values. 
3037 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, 3038 __m128i __count) { 3039 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 3040 } 3041 3042 /// Right-shifts each of 64-bit values in the 128-bit integer vector 3043 /// operand by the specified number of bits. High-order bits are cleared. 3044 /// 3045 /// \headerfile <x86intrin.h> 3046 /// 3047 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3048 /// 3049 /// \param __a 3050 /// A 128-bit integer vector containing the source operand. 3051 /// \param __count 3052 /// An integer value specifying the number of bits to right-shift each value 3053 /// in operand \a __a. 3054 /// \returns A 128-bit integer vector containing the right-shifted values. 3055 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, 3056 int __count) { 3057 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 3058 } 3059 3060 /// Right-shifts each of 64-bit values in the 128-bit integer vector 3061 /// operand by the specified number of bits. High-order bits are cleared. 3062 /// 3063 /// \headerfile <x86intrin.h> 3064 /// 3065 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3066 /// 3067 /// \param __a 3068 /// A 128-bit integer vector containing the source operand. 3069 /// \param __count 3070 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3071 /// to right-shift each value in operand \a __a. 3072 /// \returns A 128-bit integer vector containing the right-shifted values. 3073 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, 3074 __m128i __count) { 3075 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 3076 } 3077 3078 /// Compares each of the corresponding 8-bit values of the 128-bit 3079 /// integer vectors for equality. 3080 /// 3081 /// Each comparison returns 0x0 for false, 0xFF for true. 
3082 /// 3083 /// \headerfile <x86intrin.h> 3084 /// 3085 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. 3086 /// 3087 /// \param __a 3088 /// A 128-bit integer vector. 3089 /// \param __b 3090 /// A 128-bit integer vector. 3091 /// \returns A 128-bit integer vector containing the comparison results. 3092 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, 3093 __m128i __b) { 3094 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3095 } 3096 3097 /// Compares each of the corresponding 16-bit values of the 128-bit 3098 /// integer vectors for equality. 3099 /// 3100 /// Each comparison returns 0x0 for false, 0xFFFF for true. 3101 /// 3102 /// \headerfile <x86intrin.h> 3103 /// 3104 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. 3105 /// 3106 /// \param __a 3107 /// A 128-bit integer vector. 3108 /// \param __b 3109 /// A 128-bit integer vector. 3110 /// \returns A 128-bit integer vector containing the comparison results. 3111 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, 3112 __m128i __b) { 3113 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3114 } 3115 3116 /// Compares each of the corresponding 32-bit values of the 128-bit 3117 /// integer vectors for equality. 3118 /// 3119 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true. 3120 /// 3121 /// \headerfile <x86intrin.h> 3122 /// 3123 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. 3124 /// 3125 /// \param __a 3126 /// A 128-bit integer vector. 3127 /// \param __b 3128 /// A 128-bit integer vector. 3129 /// \returns A 128-bit integer vector containing the comparison results. 
3130 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, 3131 __m128i __b) { 3132 return (__m128i)((__v4si)__a == (__v4si)__b); 3133 } 3134 3135 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3136 /// integer vectors to determine if the values in the first operand are 3137 /// greater than those in the second operand. 3138 /// 3139 /// Each comparison returns 0x0 for false, 0xFF for true. 3140 /// 3141 /// \headerfile <x86intrin.h> 3142 /// 3143 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3144 /// 3145 /// \param __a 3146 /// A 128-bit integer vector. 3147 /// \param __b 3148 /// A 128-bit integer vector. 3149 /// \returns A 128-bit integer vector containing the comparison results. 3150 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, 3151 __m128i __b) { 3152 /* This function always performs a signed comparison, but __v16qi is a char 3153 which may be signed or unsigned, so use __v16qs. */ 3154 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3155 } 3156 3157 /// Compares each of the corresponding signed 16-bit values of the 3158 /// 128-bit integer vectors to determine if the values in the first operand 3159 /// are greater than those in the second operand. 3160 /// 3161 /// Each comparison returns 0x0 for false, 0xFFFF for true. 3162 /// 3163 /// \headerfile <x86intrin.h> 3164 /// 3165 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3166 /// 3167 /// \param __a 3168 /// A 128-bit integer vector. 3169 /// \param __b 3170 /// A 128-bit integer vector. 3171 /// \returns A 128-bit integer vector containing the comparison results. 
3172 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, 3173 __m128i __b) { 3174 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3175 } 3176 3177 /// Compares each of the corresponding signed 32-bit values of the 3178 /// 128-bit integer vectors to determine if the values in the first operand 3179 /// are greater than those in the second operand. 3180 /// 3181 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true. 3182 /// 3183 /// \headerfile <x86intrin.h> 3184 /// 3185 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3186 /// 3187 /// \param __a 3188 /// A 128-bit integer vector. 3189 /// \param __b 3190 /// A 128-bit integer vector. 3191 /// \returns A 128-bit integer vector containing the comparison results. 3192 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, 3193 __m128i __b) { 3194 return (__m128i)((__v4si)__a > (__v4si)__b); 3195 } 3196 3197 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3198 /// integer vectors to determine if the values in the first operand are less 3199 /// than those in the second operand. 3200 /// 3201 /// Each comparison returns 0x0 for false, 0xFF for true. 3202 /// 3203 /// \headerfile <x86intrin.h> 3204 /// 3205 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3206 /// 3207 /// \param __a 3208 /// A 128-bit integer vector. 3209 /// \param __b 3210 /// A 128-bit integer vector. 3211 /// \returns A 128-bit integer vector containing the comparison results. 3212 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, 3213 __m128i __b) { 3214 return _mm_cmpgt_epi8(__b, __a); 3215 } 3216 3217 /// Compares each of the corresponding signed 16-bit values of the 3218 /// 128-bit integer vectors to determine if the values in the first operand 3219 /// are less than those in the second operand. 3220 /// 3221 /// Each comparison returns 0x0 for false, 0xFFFF for true. 
3222 /// 3223 /// \headerfile <x86intrin.h> 3224 /// 3225 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3226 /// 3227 /// \param __a 3228 /// A 128-bit integer vector. 3229 /// \param __b 3230 /// A 128-bit integer vector. 3231 /// \returns A 128-bit integer vector containing the comparison results. 3232 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, 3233 __m128i __b) { 3234 return _mm_cmpgt_epi16(__b, __a); 3235 } 3236 3237 /// Compares each of the corresponding signed 32-bit values of the 3238 /// 128-bit integer vectors to determine if the values in the first operand 3239 /// are less than those in the second operand. 3240 /// 3241 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true. 3242 /// 3243 /// \headerfile <x86intrin.h> 3244 /// 3245 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3246 /// 3247 /// \param __a 3248 /// A 128-bit integer vector. 3249 /// \param __b 3250 /// A 128-bit integer vector. 3251 /// \returns A 128-bit integer vector containing the comparison results. 3252 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, 3253 __m128i __b) { 3254 return _mm_cmpgt_epi32(__b, __a); 3255 } 3256 3257 #ifdef __x86_64__ 3258 /// Converts a 64-bit signed integer value from the second operand into a 3259 /// double-precision value and returns it in the lower element of a [2 x 3260 /// double] vector; the upper element of the returned vector is copied from 3261 /// the upper element of the first operand. 3262 /// 3263 /// \headerfile <x86intrin.h> 3264 /// 3265 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 3266 /// 3267 /// \param __a 3268 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3269 /// copied to the upper 64 bits of the destination. 3270 /// \param __b 3271 /// A 64-bit signed integer operand containing the value to be converted. 
3272 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3273 /// converted value of the second operand. The upper 64 bits are copied from 3274 /// the upper 64 bits of the first operand. 3275 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 3276 _mm_cvtsi64_sd(__m128d __a, long long __b) { 3277 __a[0] = __b; 3278 return __a; 3279 } 3280 3281 /// Converts the first (lower) element of a vector of [2 x double] into a 3282 /// 64-bit signed integer value. 3283 /// 3284 /// If the converted value does not fit in a 64-bit integer, raises a 3285 /// floating-point invalid exception. If the exception is masked, returns 3286 /// the most negative integer. 3287 /// 3288 /// \headerfile <x86intrin.h> 3289 /// 3290 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 3291 /// 3292 /// \param __a 3293 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3294 /// conversion. 3295 /// \returns A 64-bit signed integer containing the converted value. 3296 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) { 3297 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3298 } 3299 3300 /// Converts the first (lower) element of a vector of [2 x double] into a 3301 /// 64-bit signed truncated (rounded toward zero) integer value. 3302 /// 3303 /// If a converted value does not fit in a 64-bit integer, raises a 3304 /// floating-point invalid exception. If the exception is masked, returns 3305 /// the most negative integer. 3306 /// 3307 /// \headerfile <x86intrin.h> 3308 /// 3309 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 3310 /// instruction. 3311 /// 3312 /// \param __a 3313 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3314 /// conversion. 3315 /// \returns A 64-bit signed integer containing the converted value. 
3316 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) { 3317 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3318 } 3319 #endif 3320 3321 /// Converts a vector of [4 x i32] into a vector of [4 x float]. 3322 /// 3323 /// \headerfile <x86intrin.h> 3324 /// 3325 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. 3326 /// 3327 /// \param __a 3328 /// A 128-bit integer vector. 3329 /// \returns A 128-bit vector of [4 x float] containing the converted values. 3330 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR 3331 _mm_cvtepi32_ps(__m128i __a) { 3332 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf); 3333 } 3334 3335 /// Converts a vector of [4 x float] into a vector of [4 x i32]. 3336 /// 3337 /// If a converted value does not fit in a 32-bit integer, raises a 3338 /// floating-point invalid exception. If the exception is masked, returns 3339 /// the most negative integer. 3340 /// 3341 /// \headerfile <x86intrin.h> 3342 /// 3343 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. 3344 /// 3345 /// \param __a 3346 /// A 128-bit vector of [4 x float]. 3347 /// \returns A 128-bit integer vector of [4 x i32] containing the converted 3348 /// values. 3349 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) { 3350 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3351 } 3352 3353 /// Converts a vector of [4 x float] into four signed truncated (rounded toward 3354 /// zero) 32-bit integers, returned in a vector of [4 x i32]. 3355 /// 3356 /// If a converted value does not fit in a 32-bit integer, raises a 3357 /// floating-point invalid exception. If the exception is masked, returns 3358 /// the most negative integer. 3359 /// 3360 /// \headerfile <x86intrin.h> 3361 /// 3362 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> 3363 /// instruction. 3364 /// 3365 /// \param __a 3366 /// A 128-bit vector of [4 x float]. 
3367 /// \returns A 128-bit vector of [4 x i32] containing the converted values. 3368 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) { 3369 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3370 } 3371 3372 /// Returns a vector of [4 x i32] where the lowest element is the input 3373 /// operand and the remaining elements are zero. 3374 /// 3375 /// \headerfile <x86intrin.h> 3376 /// 3377 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3378 /// 3379 /// \param __a 3380 /// A 32-bit signed integer operand. 3381 /// \returns A 128-bit vector of [4 x i32]. 3382 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) { 3383 return __extension__(__m128i)(__v4si){__a, 0, 0, 0}; 3384 } 3385 3386 /// Returns a vector of [2 x i64] where the lower element is the input 3387 /// operand and the upper element is zero. 3388 /// 3389 /// \headerfile <x86intrin.h> 3390 /// 3391 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction 3392 /// in 64-bit mode. 3393 /// 3394 /// \param __a 3395 /// A 64-bit signed integer operand containing the value to be converted. 3396 /// \returns A 128-bit vector of [2 x i64] containing the converted value. 3397 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) { 3398 return __extension__(__m128i)(__v2di){__a, 0}; 3399 } 3400 3401 /// Moves the least significant 32 bits of a vector of [4 x i32] to a 3402 /// 32-bit signed integer value. 3403 /// 3404 /// \headerfile <x86intrin.h> 3405 /// 3406 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3407 /// 3408 /// \param __a 3409 /// A vector of [4 x i32]. The least significant 32 bits are moved to the 3410 /// destination. 3411 /// \returns A 32-bit signed integer containing the moved value. 
3412 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) { 3413 __v4si __b = (__v4si)__a; 3414 return __b[0]; 3415 } 3416 3417 /// Moves the least significant 64 bits of a vector of [2 x i64] to a 3418 /// 64-bit signed integer value. 3419 /// 3420 /// \headerfile <x86intrin.h> 3421 /// 3422 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3423 /// 3424 /// \param __a 3425 /// A vector of [2 x i64]. The least significant 64 bits are moved to the 3426 /// destination. 3427 /// \returns A 64-bit signed integer containing the moved value. 3428 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) { 3429 return __a[0]; 3430 } 3431 3432 /// Moves packed integer values from an aligned 128-bit memory location 3433 /// to elements in a 128-bit integer vector. 3434 /// 3435 /// \headerfile <x86intrin.h> 3436 /// 3437 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. 3438 /// 3439 /// \param __p 3440 /// An aligned pointer to a memory location containing integer values. 3441 /// \returns A 128-bit integer vector containing the moved values. 3442 static __inline__ __m128i __DEFAULT_FN_ATTRS 3443 _mm_load_si128(__m128i const *__p) { 3444 return *__p; 3445 } 3446 3447 /// Moves packed integer values from an unaligned 128-bit memory location 3448 /// to elements in a 128-bit integer vector. 3449 /// 3450 /// \headerfile <x86intrin.h> 3451 /// 3452 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. 3453 /// 3454 /// \param __p 3455 /// A pointer to a memory location containing integer values. 3456 /// \returns A 128-bit integer vector containing the moved values. 
3457 static __inline__ __m128i __DEFAULT_FN_ATTRS 3458 _mm_loadu_si128(__m128i_u const *__p) { 3459 struct __loadu_si128 { 3460 __m128i_u __v; 3461 } __attribute__((__packed__, __may_alias__)); 3462 return ((const struct __loadu_si128 *)__p)->__v; 3463 } 3464 3465 /// Returns a vector of [2 x i64] where the lower element is taken from 3466 /// the lower element of the operand, and the upper element is zero. 3467 /// 3468 /// \headerfile <x86intrin.h> 3469 /// 3470 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3471 /// 3472 /// \param __p 3473 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3474 /// the destination. 3475 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3476 /// moved value. The higher order bits are cleared. 3477 static __inline__ __m128i __DEFAULT_FN_ATTRS 3478 _mm_loadl_epi64(__m128i_u const *__p) { 3479 struct __mm_loadl_epi64_struct { 3480 long long __u; 3481 } __attribute__((__packed__, __may_alias__)); 3482 return __extension__(__m128i){ 3483 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0}; 3484 } 3485 3486 /// Generates a 128-bit vector of [4 x i32] with unspecified content. 3487 /// This could be used as an argument to another intrinsic function where the 3488 /// argument is required but the value is not actually used. 3489 /// 3490 /// \headerfile <x86intrin.h> 3491 /// 3492 /// This intrinsic has no corresponding instruction. 3493 /// 3494 /// \returns A 128-bit vector of [4 x i32] with unspecified content. 3495 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) { 3496 return (__m128i)__builtin_ia32_undef128(); 3497 } 3498 3499 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3500 /// the specified 64-bit integer values. 3501 /// 3502 /// \headerfile <x86intrin.h> 3503 /// 3504 /// This intrinsic is a utility function and does not correspond to a specific 3505 /// instruction. 
3506 /// 3507 /// \param __q1 3508 /// A 64-bit integer value used to initialize the upper 64 bits of the 3509 /// destination vector of [2 x i64]. 3510 /// \param __q0 3511 /// A 64-bit integer value used to initialize the lower 64 bits of the 3512 /// destination vector of [2 x i64]. 3513 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3514 /// provided in the operands. 3515 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3516 _mm_set_epi64x(long long __q1, long long __q0) { 3517 return __extension__(__m128i)(__v2di){__q0, __q1}; 3518 } 3519 3520 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3521 /// the specified 64-bit integer values. 3522 /// 3523 /// \headerfile <x86intrin.h> 3524 /// 3525 /// This intrinsic is a utility function and does not correspond to a specific 3526 /// instruction. 3527 /// 3528 /// \param __q1 3529 /// A 64-bit integer value used to initialize the upper 64 bits of the 3530 /// destination vector of [2 x i64]. 3531 /// \param __q0 3532 /// A 64-bit integer value used to initialize the lower 64 bits of the 3533 /// destination vector of [2 x i64]. 3534 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3535 /// provided in the operands. 3536 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3537 _mm_set_epi64(__m64 __q1, __m64 __q0) { 3538 return _mm_set_epi64x((long long)__q1[0], (long long)__q0[0]); 3539 } 3540 3541 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3542 /// the specified 32-bit integer values. 3543 /// 3544 /// \headerfile <x86intrin.h> 3545 /// 3546 /// This intrinsic is a utility function and does not correspond to a specific 3547 /// instruction. 3548 /// 3549 /// \param __i3 3550 /// A 32-bit integer value used to initialize bits [127:96] of the 3551 /// destination vector. 3552 /// \param __i2 3553 /// A 32-bit integer value used to initialize bits [95:64] of the destination 3554 /// vector. 
/// \param __i1
///    A 32-bit integer value used to initialize bits [63:32] of the destination
///    vector.
/// \param __i0
///    A 32-bit integer value used to initialize bits [31:0] of the destination
///    vector.
/// \returns An initialized 128-bit vector of [4 x i32] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi32(int __i3,
                                                                     int __i2,
                                                                     int __i1,
                                                                     int __i0) {
  /* The parameters arrive highest element first, while the initializer is
     written starting from __i0, the value documented as bits [31:0]. */
  return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
}

/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
///    the specified 16-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __w7
///    A 16-bit integer value used to initialize bits [127:112] of the
///    destination vector.
/// \param __w6
///    A 16-bit integer value used to initialize bits [111:96] of the
///    destination vector.
/// \param __w5
///    A 16-bit integer value used to initialize bits [95:80] of the destination
///    vector.
/// \param __w4
///    A 16-bit integer value used to initialize bits [79:64] of the destination
///    vector.
/// \param __w3
///    A 16-bit integer value used to initialize bits [63:48] of the destination
///    vector.
/// \param __w2
///    A 16-bit integer value used to initialize bits [47:32] of the destination
///    vector.
/// \param __w1
///    A 16-bit integer value used to initialize bits [31:16] of the destination
///    vector.
/// \param __w0
///    A 16-bit integer value used to initialize bits [15:0] of the destination
///    vector.
/// \returns An initialized 128-bit vector of [8 x i16] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
              short __w2, short __w1, short __w0) {
  /* The parameters arrive highest element first, while the initializer is
     written starting from __w0, the value documented as bits [15:0]. */
  return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
                                        __w4, __w5, __w6, __w7};
}

/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
///    the specified 8-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __b15
///    Initializes bits [127:120] of the destination vector.
/// \param __b14
///    Initializes bits [119:112] of the destination vector.
/// \param __b13
///    Initializes bits [111:104] of the destination vector.
/// \param __b12
///    Initializes bits [103:96] of the destination vector.
/// \param __b11
///    Initializes bits [95:88] of the destination vector.
/// \param __b10
///    Initializes bits [87:80] of the destination vector.
/// \param __b9
///    Initializes bits [79:72] of the destination vector.
/// \param __b8
///    Initializes bits [71:64] of the destination vector.
/// \param __b7
///    Initializes bits [63:56] of the destination vector.
/// \param __b6
///    Initializes bits [55:48] of the destination vector.
/// \param __b5
///    Initializes bits [47:40] of the destination vector.
/// \param __b4
///    Initializes bits [39:32] of the destination vector.
/// \param __b3
///    Initializes bits [31:24] of the destination vector.
/// \param __b2
///    Initializes bits [23:16] of the destination vector.
/// \param __b1
///    Initializes bits [15:8] of the destination vector.
/// \param __b0
///    Initializes bits [7:0] of the destination vector.
3651 /// \returns An initialized 128-bit vector of [16 x i8] containing the values 3652 /// provided in the operands. 3653 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3654 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, 3655 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, 3656 char __b4, char __b3, char __b2, char __b1, char __b0) { 3657 return __extension__(__m128i)(__v16qi){ 3658 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, 3659 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15}; 3660 } 3661 3662 /// Initializes both values in a 128-bit integer vector with the 3663 /// specified 64-bit integer value. 3664 /// 3665 /// \headerfile <x86intrin.h> 3666 /// 3667 /// This intrinsic is a utility function and does not correspond to a specific 3668 /// instruction. 3669 /// 3670 /// \param __q 3671 /// Integer value used to initialize the elements of the destination integer 3672 /// vector. 3673 /// \returns An initialized 128-bit integer vector of [2 x i64] with both 3674 /// elements containing the value provided in the operand. 3675 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3676 _mm_set1_epi64x(long long __q) { 3677 return _mm_set_epi64x(__q, __q); 3678 } 3679 3680 /// Initializes both values in a 128-bit vector of [2 x i64] with the 3681 /// specified 64-bit value. 3682 /// 3683 /// \headerfile <x86intrin.h> 3684 /// 3685 /// This intrinsic is a utility function and does not correspond to a specific 3686 /// instruction. 3687 /// 3688 /// \param __q 3689 /// A 64-bit value used to initialize the elements of the destination integer 3690 /// vector. 3691 /// \returns An initialized 128-bit vector of [2 x i64] with all elements 3692 /// containing the value provided in the operand. 
3693 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3694 _mm_set1_epi64(__m64 __q) { 3695 return _mm_set_epi64(__q, __q); 3696 } 3697 3698 /// Initializes all values in a 128-bit vector of [4 x i32] with the 3699 /// specified 32-bit value. 3700 /// 3701 /// \headerfile <x86intrin.h> 3702 /// 3703 /// This intrinsic is a utility function and does not correspond to a specific 3704 /// instruction. 3705 /// 3706 /// \param __i 3707 /// A 32-bit value used to initialize the elements of the destination integer 3708 /// vector. 3709 /// \returns An initialized 128-bit vector of [4 x i32] with all elements 3710 /// containing the value provided in the operand. 3711 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi32(int __i) { 3712 return _mm_set_epi32(__i, __i, __i, __i); 3713 } 3714 3715 /// Initializes all values in a 128-bit vector of [8 x i16] with the 3716 /// specified 16-bit value. 3717 /// 3718 /// \headerfile <x86intrin.h> 3719 /// 3720 /// This intrinsic is a utility function and does not correspond to a specific 3721 /// instruction. 3722 /// 3723 /// \param __w 3724 /// A 16-bit value used to initialize the elements of the destination integer 3725 /// vector. 3726 /// \returns An initialized 128-bit vector of [8 x i16] with all elements 3727 /// containing the value provided in the operand. 3728 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3729 _mm_set1_epi16(short __w) { 3730 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); 3731 } 3732 3733 /// Initializes all values in a 128-bit vector of [16 x i8] with the 3734 /// specified 8-bit value. 3735 /// 3736 /// \headerfile <x86intrin.h> 3737 /// 3738 /// This intrinsic is a utility function and does not correspond to a specific 3739 /// instruction. 3740 /// 3741 /// \param __b 3742 /// An 8-bit value used to initialize the elements of the destination integer 3743 /// vector. 
3744 /// \returns An initialized 128-bit vector of [16 x i8] with all elements 3745 /// containing the value provided in the operand. 3746 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b) { 3747 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 3748 __b, __b, __b, __b, __b); 3749 } 3750 3751 /// Constructs a 128-bit integer vector, initialized in reverse order 3752 /// with the specified 64-bit integral values. 3753 /// 3754 /// \headerfile <x86intrin.h> 3755 /// 3756 /// This intrinsic does not correspond to a specific instruction. 3757 /// 3758 /// \param __q0 3759 /// A 64-bit integral value used to initialize the lower 64 bits of the 3760 /// result. 3761 /// \param __q1 3762 /// A 64-bit integral value used to initialize the upper 64 bits of the 3763 /// result. 3764 /// \returns An initialized 128-bit integer vector. 3765 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3766 _mm_setr_epi64(__m64 __q0, __m64 __q1) { 3767 return _mm_set_epi64(__q1, __q0); 3768 } 3769 3770 /// Constructs a 128-bit integer vector, initialized in reverse order 3771 /// with the specified 32-bit integral values. 3772 /// 3773 /// \headerfile <x86intrin.h> 3774 /// 3775 /// This intrinsic is a utility function and does not correspond to a specific 3776 /// instruction. 3777 /// 3778 /// \param __i0 3779 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3780 /// \param __i1 3781 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3782 /// \param __i2 3783 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3784 /// \param __i3 3785 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3786 /// \returns An initialized 128-bit integer vector. 
3787 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3788 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) { 3789 return _mm_set_epi32(__i3, __i2, __i1, __i0); 3790 } 3791 3792 /// Constructs a 128-bit integer vector, initialized in reverse order 3793 /// with the specified 16-bit integral values. 3794 /// 3795 /// \headerfile <x86intrin.h> 3796 /// 3797 /// This intrinsic is a utility function and does not correspond to a specific 3798 /// instruction. 3799 /// 3800 /// \param __w0 3801 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3802 /// \param __w1 3803 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3804 /// \param __w2 3805 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3806 /// \param __w3 3807 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3808 /// \param __w4 3809 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3810 /// \param __w5 3811 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3812 /// \param __w6 3813 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3814 /// \param __w7 3815 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3816 /// \returns An initialized 128-bit integer vector. 3817 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3818 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, 3819 short __w5, short __w6, short __w7) { 3820 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); 3821 } 3822 3823 /// Constructs a 128-bit integer vector, initialized in reverse order 3824 /// with the specified 8-bit integral values. 3825 /// 3826 /// \headerfile <x86intrin.h> 3827 /// 3828 /// This intrinsic is a utility function and does not correspond to a specific 3829 /// instruction. 
3830 /// 3831 /// \param __b0 3832 /// An 8-bit integral value used to initialize bits [7:0] of the result. 3833 /// \param __b1 3834 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3835 /// \param __b2 3836 /// An 8-bit integral value used to initialize bits [23:16] of the result. 3837 /// \param __b3 3838 /// An 8-bit integral value used to initialize bits [31:24] of the result. 3839 /// \param __b4 3840 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3841 /// \param __b5 3842 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3843 /// \param __b6 3844 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3845 /// \param __b7 3846 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3847 /// \param __b8 3848 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3849 /// \param __b9 3850 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3851 /// \param __b10 3852 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3853 /// \param __b11 3854 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3855 /// \param __b12 3856 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3857 /// \param __b13 3858 /// An 8-bit integral value used to initialize bits [111:104] of the result. 3859 /// \param __b14 3860 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3861 /// \param __b15 3862 /// An 8-bit integral value used to initialize bits [127:120] of the result. 3863 /// \returns An initialized 128-bit integer vector. 
3864 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 3865 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, 3866 char __b6, char __b7, char __b8, char __b9, char __b10, 3867 char __b11, char __b12, char __b13, char __b14, char __b15) { 3868 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, 3869 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 3870 } 3871 3872 /// Creates a 128-bit integer vector initialized to zero. 3873 /// 3874 /// \headerfile <x86intrin.h> 3875 /// 3876 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 3877 /// 3878 /// \returns An initialized 128-bit integer vector with all elements set to 3879 /// zero. 3880 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void) { 3881 return __extension__(__m128i)(__v2di){0LL, 0LL}; 3882 } 3883 3884 /// Stores a 128-bit integer vector to a memory location aligned on a 3885 /// 128-bit boundary. 3886 /// 3887 /// \headerfile <x86intrin.h> 3888 /// 3889 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 3890 /// 3891 /// \param __p 3892 /// A pointer to an aligned memory location that will receive the integer 3893 /// values. 3894 /// \param __b 3895 /// A 128-bit integer vector containing the values to be moved. 3896 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, 3897 __m128i __b) { 3898 *__p = __b; 3899 } 3900 3901 /// Stores a 128-bit integer vector to an unaligned memory location. 3902 /// 3903 /// \headerfile <x86intrin.h> 3904 /// 3905 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 3906 /// 3907 /// \param __p 3908 /// A pointer to a memory location that will receive the integer values. 3909 /// \param __b 3910 /// A 128-bit integer vector containing the values to be moved. 
3911 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, 3912 __m128i __b) { 3913 struct __storeu_si128 { 3914 __m128i_u __v; 3915 } __attribute__((__packed__, __may_alias__)); 3916 ((struct __storeu_si128 *)__p)->__v = __b; 3917 } 3918 3919 /// Stores a 64-bit integer value from the low element of a 128-bit integer 3920 /// vector. 3921 /// 3922 /// \headerfile <x86intrin.h> 3923 /// 3924 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3925 /// 3926 /// \param __p 3927 /// A pointer to a 64-bit memory location. The address of the memory 3928 /// location does not have to be aligned. 3929 /// \param __b 3930 /// A 128-bit integer vector containing the value to be stored. 3931 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, 3932 __m128i __b) { 3933 struct __storeu_si64 { 3934 long long __v; 3935 } __attribute__((__packed__, __may_alias__)); 3936 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0]; 3937 } 3938 3939 /// Stores a 32-bit integer value from the low element of a 128-bit integer 3940 /// vector. 3941 /// 3942 /// \headerfile <x86intrin.h> 3943 /// 3944 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3945 /// 3946 /// \param __p 3947 /// A pointer to a 32-bit memory location. The address of the memory 3948 /// location does not have to be aligned. 3949 /// \param __b 3950 /// A 128-bit integer vector containing the value to be stored. 3951 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, 3952 __m128i __b) { 3953 struct __storeu_si32 { 3954 int __v; 3955 } __attribute__((__packed__, __may_alias__)); 3956 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0]; 3957 } 3958 3959 /// Stores a 16-bit integer value from the low element of a 128-bit integer 3960 /// vector. 3961 /// 3962 /// \headerfile <x86intrin.h> 3963 /// 3964 /// This intrinsic does not correspond to a specific instruction. 
3965 /// 3966 /// \param __p 3967 /// A pointer to a 16-bit memory location. The address of the memory 3968 /// location does not have to be aligned. 3969 /// \param __b 3970 /// A 128-bit integer vector containing the value to be stored. 3971 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, 3972 __m128i __b) { 3973 struct __storeu_si16 { 3974 short __v; 3975 } __attribute__((__packed__, __may_alias__)); 3976 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0]; 3977 } 3978 3979 /// Moves bytes selected by the mask from the first operand to the 3980 /// specified unaligned memory location. When a mask bit is 1, the 3981 /// corresponding byte is written, otherwise it is not written. 3982 /// 3983 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3984 /// used again soon). Exception and trap behavior for elements not selected 3985 /// for storage to memory are implementation dependent. 3986 /// 3987 /// \headerfile <x86intrin.h> 3988 /// 3989 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> 3990 /// instruction. 3991 /// 3992 /// \param __d 3993 /// A 128-bit integer vector containing the values to be moved. 3994 /// \param __n 3995 /// A 128-bit integer vector containing the mask. The most significant bit of 3996 /// each byte represents the mask bits. 3997 /// \param __p 3998 /// A pointer to an unaligned 128-bit memory location where the specified 3999 /// values are moved. 4000 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, 4001 __m128i __n, 4002 char *__p) { 4003 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 4004 } 4005 4006 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 4007 /// a memory location. 4008 /// 4009 /// \headerfile <x86intrin.h> 4010 /// 4011 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 
4012 /// 4013 /// \param __p 4014 /// A pointer to a 64-bit memory location that will receive the lower 64 bits 4015 /// of the integer vector parameter. 4016 /// \param __a 4017 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 4018 /// value to be stored. 4019 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, 4020 __m128i __a) { 4021 struct __mm_storel_epi64_struct { 4022 long long __u; 4023 } __attribute__((__packed__, __may_alias__)); 4024 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0]; 4025 } 4026 4027 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit 4028 /// aligned memory location. 4029 /// 4030 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4031 /// used again soon). 4032 /// 4033 /// \headerfile <x86intrin.h> 4034 /// 4035 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4036 /// 4037 /// \param __p 4038 /// A pointer to the 128-bit aligned memory location used to store the value. 4039 /// \param __a 4040 /// A vector of [2 x double] containing the 64-bit values to be stored. 4041 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, 4042 __m128d __a) { 4043 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p); 4044 } 4045 4046 /// Stores a 128-bit integer vector to a 128-bit aligned memory location. 4047 /// 4048 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4049 /// used again soon). 4050 /// 4051 /// \headerfile <x86intrin.h> 4052 /// 4053 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4054 /// 4055 /// \param __p 4056 /// A pointer to the 128-bit aligned memory location used to store the value. 4057 /// \param __a 4058 /// A 128-bit integer vector containing the values to be stored. 
4059 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, 4060 __m128i __a) { 4061 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p); 4062 } 4063 4064 /// Stores a 32-bit integer value in the specified memory location. 4065 /// 4066 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4067 /// used again soon). 4068 /// 4069 /// \headerfile <x86intrin.h> 4070 /// 4071 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction. 4072 /// 4073 /// \param __p 4074 /// A pointer to the 32-bit memory location used to store the value. 4075 /// \param __a 4076 /// A 32-bit integer containing the value to be stored. 4077 static __inline__ void 4078 __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 4079 _mm_stream_si32(void *__p, int __a) { 4080 __builtin_ia32_movnti((int *)__p, __a); 4081 } 4082 4083 #ifdef __x86_64__ 4084 /// Stores a 64-bit integer value in the specified memory location. 4085 /// 4086 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4087 /// used again soon). 4088 /// 4089 /// \headerfile <x86intrin.h> 4090 /// 4091 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction. 4092 /// 4093 /// \param __p 4094 /// A pointer to the 64-bit memory location used to store the value. 4095 /// \param __a 4096 /// A 64-bit integer containing the value to be stored. 4097 static __inline__ void 4098 __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 4099 _mm_stream_si64(void *__p, long long __a) { 4100 __builtin_ia32_movnti64((long long *)__p, __a); 4101 } 4102 #endif 4103 4104 #if defined(__cplusplus) 4105 extern "C" { 4106 #endif 4107 4108 /// The cache line containing \a __p is flushed and invalidated from all 4109 /// caches in the coherency domain. 4110 /// 4111 /// \headerfile <x86intrin.h> 4112 /// 4113 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction. 
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    flushed.
void _mm_clflush(void const *__p);

/// Forces strong memory ordering (serialization) between load
///    instructions preceding this instruction and load instructions following
///    this instruction, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between load and store
///    instructions preceding this instruction and load and store instructions
///    following this instruction, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif

/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
///    vector operands into 8-bit signed integers, and packs the results into
///    the destination.
///
///    Values greater than 0x7F are saturated to 0x7F. Values less than -128
///    (0x80 as an 8-bit pattern) are saturated to 0x80.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
///
/// \param __a
///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
///    written to the lower 64 bits of the result.
/// \param __b
///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
///    written to the higher 64 bits of the result.
4163 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4164 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, 4165 __m128i __b) { 4166 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4167 } 4168 4169 /// Converts, with saturation, 32-bit signed integers from both 128-bit integer 4170 /// vector operands into 16-bit signed integers, and packs the results into 4171 /// the destination. 4172 /// 4173 /// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative 4174 /// values less than 0x8000 are saturated to 0x8000. 4175 /// 4176 /// \headerfile <x86intrin.h> 4177 /// 4178 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. 4179 /// 4180 /// \param __a 4181 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values 4182 /// are written to the lower 64 bits of the result. 4183 /// \param __b 4184 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values 4185 /// are written to the higher 64 bits of the result. 4186 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 4187 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, 4188 __m128i __b) { 4189 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4190 } 4191 4192 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer 4193 /// vector operands into 8-bit unsigned integers, and packs the results into 4194 /// the destination. 4195 /// 4196 /// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00 4197 /// are saturated to 0x00. 4198 /// 4199 /// \headerfile <x86intrin.h> 4200 /// 4201 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. 4202 /// 4203 /// \param __a 4204 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are 4205 /// written to the lower 64 bits of the result. 
4206 /// \param __b 4207 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are 4208 /// written to the higher 64 bits of the result. 4209 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4210 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, 4211 __m128i __b) { 4212 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4213 } 4214 4215 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4216 /// the immediate-value parameter as a selector. 4217 /// 4218 /// \headerfile <x86intrin.h> 4219 /// 4220 /// \code 4221 /// __m128i _mm_extract_epi16(__m128i a, const int imm); 4222 /// \endcode 4223 /// 4224 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 4225 /// 4226 /// \param a 4227 /// A 128-bit integer vector. 4228 /// \param imm 4229 /// An immediate value. Bits [2:0] selects values from \a a to be assigned 4230 /// to bits[15:0] of the result. \n 4231 /// 000: assign values from bits [15:0] of \a a. \n 4232 /// 001: assign values from bits [31:16] of \a a. \n 4233 /// 010: assign values from bits [47:32] of \a a. \n 4234 /// 011: assign values from bits [63:48] of \a a. \n 4235 /// 100: assign values from bits [79:64] of \a a. \n 4236 /// 101: assign values from bits [95:80] of \a a. \n 4237 /// 110: assign values from bits [111:96] of \a a. \n 4238 /// 111: assign values from bits [127:112] of \a a. 4239 /// \returns An integer, whose lower 16 bits are selected from the 128-bit 4240 /// integer vector parameter and the remaining bits are assigned zeros. 
#define _mm_extract_epi16(a, imm) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                    (int)(imm)))

/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into an offset specified by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value selecting which of the eight 16-bit elements of the
///    result is replaced by the lower 16 bits of \a b.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm) \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                        (int)(imm)))

/// Copies the values of the most significant bits from each 8-bit
///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
///    value, zero-extends the value, and writes it to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
///
/// \param __a
///    A 128-bit integer vector containing the values with bits to be extracted.
/// \returns The most significant bits from each 8-bit element in \a __a,
///    written to bits [15:0].
The other bits are assigned zeros. 4285 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) { 4286 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4287 } 4288 4289 /// Constructs a 128-bit integer vector by shuffling four 32-bit 4290 /// elements of a 128-bit integer vector parameter, using the immediate-value 4291 /// parameter as a specifier. 4292 /// 4293 /// \headerfile <x86intrin.h> 4294 /// 4295 /// \code 4296 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4297 /// \endcode 4298 /// 4299 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. 4300 /// 4301 /// \param a 4302 /// A 128-bit integer vector containing the values to be copied. 4303 /// \param imm 4304 /// An immediate value containing an 8-bit value specifying which elements to 4305 /// copy from a. The destinations within the 128-bit destination are assigned 4306 /// values as follows: \n 4307 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n 4308 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n 4309 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n 4310 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n 4311 /// Bit value assignments: \n 4312 /// 00: assign values from bits [31:0] of \a a. \n 4313 /// 01: assign values from bits [63:32] of \a a. \n 4314 /// 10: assign values from bits [95:64] of \a a. \n 4315 /// 11: assign values from bits [127:96] of \a a. \n 4316 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro. 4317 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form 4318 /// <c>[b6, b4, b2, b0]</c>. 4319 /// \returns A 128-bit integer vector containing the shuffled values. 
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
///   instruction.
///
/// \param __a
///    A 128-bit vector of [16 x i8].
///    Bits [71:64] are written to bits [7:0] of the result. \n
///    Bits [79:72] are written to bits [23:16] of the result.
\n 4401 /// Bits [87:80] are written to bits [39:32] of the result. \n 4402 /// Bits [95:88] are written to bits [55:48] of the result. \n 4403 /// Bits [103:96] are written to bits [71:64] of the result. \n 4404 /// Bits [111:104] are written to bits [87:80] of the result. \n 4405 /// Bits [119:112] are written to bits [103:96] of the result. \n 4406 /// Bits [127:120] are written to bits [119:112] of the result. 4407 /// \param __b 4408 /// A 128-bit vector of [16 x i8]. \n 4409 /// Bits [71:64] are written to bits [15:8] of the result. \n 4410 /// Bits [79:72] are written to bits [31:24] of the result. \n 4411 /// Bits [87:80] are written to bits [47:40] of the result. \n 4412 /// Bits [95:88] are written to bits [63:56] of the result. \n 4413 /// Bits [103:96] are written to bits [79:72] of the result. \n 4414 /// Bits [111:104] are written to bits [95:88] of the result. \n 4415 /// Bits [119:112] are written to bits [111:104] of the result. \n 4416 /// Bits [127:120] are written to bits [127:120] of the result. 4417 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4418 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, 4419 __m128i __b) { 4420 return (__m128i)__builtin_shufflevector( 4421 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11, 4422 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15); 4423 } 4424 4425 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4426 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4427 /// 4428 /// \headerfile <x86intrin.h> 4429 /// 4430 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> 4431 /// instruction. 4432 /// 4433 /// \param __a 4434 /// A 128-bit vector of [8 x i16]. 4435 /// Bits [79:64] are written to bits [15:0] of the result. \n 4436 /// Bits [95:80] are written to bits [47:32] of the result. \n 4437 /// Bits [111:96] are written to bits [79:64] of the result. 
\n 4438 /// Bits [127:112] are written to bits [111:96] of the result. 4439 /// \param __b 4440 /// A 128-bit vector of [8 x i16]. 4441 /// Bits [79:64] are written to bits [31:16] of the result. \n 4442 /// Bits [95:80] are written to bits [63:48] of the result. \n 4443 /// Bits [111:96] are written to bits [95:80] of the result. \n 4444 /// Bits [127:112] are written to bits [127:112] of the result. 4445 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4446 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, 4447 __m128i __b) { 4448 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5, 4449 8 + 5, 6, 8 + 6, 7, 8 + 7); 4450 } 4451 4452 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4453 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4454 /// 4455 /// \headerfile <x86intrin.h> 4456 /// 4457 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> 4458 /// instruction. 4459 /// 4460 /// \param __a 4461 /// A 128-bit vector of [4 x i32]. \n 4462 /// Bits [95:64] are written to bits [31:0] of the destination. \n 4463 /// Bits [127:96] are written to bits [95:64] of the destination. 4464 /// \param __b 4465 /// A 128-bit vector of [4 x i32]. \n 4466 /// Bits [95:64] are written to bits [64:32] of the destination. \n 4467 /// Bits [127:96] are written to bits [127:96] of the destination. 4468 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4469 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, 4470 __m128i __b) { 4471 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3, 4472 4 + 3); 4473 } 4474 4475 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4476 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 
4477 /// 4478 /// \headerfile <x86intrin.h> 4479 /// 4480 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> 4481 /// instruction. 4482 /// 4483 /// \param __a 4484 /// A 128-bit vector of [2 x i64]. \n 4485 /// Bits [127:64] are written to bits [63:0] of the destination. 4486 /// \param __b 4487 /// A 128-bit vector of [2 x i64]. \n 4488 /// Bits [127:64] are written to bits [127:64] of the destination. 4489 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4490 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, 4491 __m128i __b) { 4492 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1); 4493 } 4494 4495 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4496 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4497 /// 4498 /// \headerfile <x86intrin.h> 4499 /// 4500 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> 4501 /// instruction. 4502 /// 4503 /// \param __a 4504 /// A 128-bit vector of [16 x i8]. \n 4505 /// Bits [7:0] are written to bits [7:0] of the result. \n 4506 /// Bits [15:8] are written to bits [23:16] of the result. \n 4507 /// Bits [23:16] are written to bits [39:32] of the result. \n 4508 /// Bits [31:24] are written to bits [55:48] of the result. \n 4509 /// Bits [39:32] are written to bits [71:64] of the result. \n 4510 /// Bits [47:40] are written to bits [87:80] of the result. \n 4511 /// Bits [55:48] are written to bits [103:96] of the result. \n 4512 /// Bits [63:56] are written to bits [119:112] of the result. 4513 /// \param __b 4514 /// A 128-bit vector of [16 x i8]. 4515 /// Bits [7:0] are written to bits [15:8] of the result. \n 4516 /// Bits [15:8] are written to bits [31:24] of the result. \n 4517 /// Bits [23:16] are written to bits [47:40] of the result. \n 4518 /// Bits [31:24] are written to bits [63:56] of the result. 
\n 4519 /// Bits [39:32] are written to bits [79:72] of the result. \n 4520 /// Bits [47:40] are written to bits [95:88] of the result. \n 4521 /// Bits [55:48] are written to bits [111:104] of the result. \n 4522 /// Bits [63:56] are written to bits [127:120] of the result. 4523 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4524 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, 4525 __m128i __b) { 4526 return (__m128i)__builtin_shufflevector( 4527 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4, 4528 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7); 4529 } 4530 4531 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit 4532 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4533 /// [8 x i16]. 4534 /// 4535 /// \headerfile <x86intrin.h> 4536 /// 4537 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> 4538 /// instruction. 4539 /// 4540 /// \param __a 4541 /// A 128-bit vector of [8 x i16]. 4542 /// Bits [15:0] are written to bits [15:0] of the result. \n 4543 /// Bits [31:16] are written to bits [47:32] of the result. \n 4544 /// Bits [47:32] are written to bits [79:64] of the result. \n 4545 /// Bits [63:48] are written to bits [111:96] of the result. 4546 /// \param __b 4547 /// A 128-bit vector of [8 x i16]. 4548 /// Bits [15:0] are written to bits [31:16] of the result. \n 4549 /// Bits [31:16] are written to bits [63:48] of the result. \n 4550 /// Bits [47:32] are written to bits [95:80] of the result. \n 4551 /// Bits [63:48] are written to bits [127:112] of the result. 4552 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 
4553 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, 4554 __m128i __b) { 4555 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1, 4556 8 + 1, 2, 8 + 2, 3, 8 + 3); 4557 } 4558 4559 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4560 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4561 /// 4562 /// \headerfile <x86intrin.h> 4563 /// 4564 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> 4565 /// instruction. 4566 /// 4567 /// \param __a 4568 /// A 128-bit vector of [4 x i32]. \n 4569 /// Bits [31:0] are written to bits [31:0] of the destination. \n 4570 /// Bits [63:32] are written to bits [95:64] of the destination. 4571 /// \param __b 4572 /// A 128-bit vector of [4 x i32]. \n 4573 /// Bits [31:0] are written to bits [64:32] of the destination. \n 4574 /// Bits [63:32] are written to bits [127:96] of the destination. 4575 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4576 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, 4577 __m128i __b) { 4578 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1, 4579 4 + 1); 4580 } 4581 4582 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of 4583 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4584 /// 4585 /// \headerfile <x86intrin.h> 4586 /// 4587 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 4588 /// instruction. 4589 /// 4590 /// \param __a 4591 /// A 128-bit vector of [2 x i64]. \n 4592 /// Bits [63:0] are written to bits [63:0] of the destination. \n 4593 /// \param __b 4594 /// A 128-bit vector of [2 x i64]. \n 4595 /// Bits [63:0] are written to bits [127:64] of the destination. \n 4596 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 
4597 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, 4598 __m128i __b) { 4599 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0); 4600 } 4601 4602 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit 4603 /// integer. 4604 /// 4605 /// \headerfile <x86intrin.h> 4606 /// 4607 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction. 4608 /// 4609 /// \param __a 4610 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4611 /// destination. 4612 /// \returns A 64-bit integer containing the lower 64 bits of the parameter. 4613 static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR 4614 _mm_movepi64_pi64(__m128i __a) { 4615 return (__m64)__a[0]; 4616 } 4617 4618 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the 4619 /// upper bits. 4620 /// 4621 /// \headerfile <x86intrin.h> 4622 /// 4623 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction. 4624 /// 4625 /// \param __a 4626 /// A 64-bit value. 4627 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4628 /// the operand. The upper 64 bits are assigned zeros. 4629 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 4630 _mm_movpi64_epi64(__m64 __a) { 4631 return __builtin_shufflevector((__v1di)__a, _mm_setzero_si64(), 0, 1); 4632 } 4633 4634 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit 4635 /// integer vector, zeroing the upper bits. 4636 /// 4637 /// \headerfile <x86intrin.h> 4638 /// 4639 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4640 /// 4641 /// \param __a 4642 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4643 /// destination. 4644 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4645 /// the operand. The upper 64 bits are assigned zeros. 
4646 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 4647 _mm_move_epi64(__m128i __a) { 4648 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); 4649 } 4650 4651 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4652 /// [2 x double] and interleaves them into a 128-bit vector of [2 x 4653 /// double]. 4654 /// 4655 /// \headerfile <x86intrin.h> 4656 /// 4657 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 4658 /// 4659 /// \param __a 4660 /// A 128-bit vector of [2 x double]. \n 4661 /// Bits [127:64] are written to bits [63:0] of the destination. 4662 /// \param __b 4663 /// A 128-bit vector of [2 x double]. \n 4664 /// Bits [127:64] are written to bits [127:64] of the destination. 4665 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4666 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 4667 _mm_unpackhi_pd(__m128d __a, __m128d __b) { 4668 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1); 4669 } 4670 4671 /// Unpacks the low-order 64-bit elements from two 128-bit vectors 4672 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4673 /// double]. 4674 /// 4675 /// \headerfile <x86intrin.h> 4676 /// 4677 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 4678 /// 4679 /// \param __a 4680 /// A 128-bit vector of [2 x double]. \n 4681 /// Bits [63:0] are written to bits [63:0] of the destination. 4682 /// \param __b 4683 /// A 128-bit vector of [2 x double]. \n 4684 /// Bits [63:0] are written to bits [127:64] of the destination. 4685 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 
4686 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 4687 _mm_unpacklo_pd(__m128d __a, __m128d __b) { 4688 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0); 4689 } 4690 4691 /// Extracts the sign bits of the double-precision values in the 128-bit 4692 /// vector of [2 x double], zero-extends the value, and writes it to the 4693 /// low-order bits of the destination. 4694 /// 4695 /// \headerfile <x86intrin.h> 4696 /// 4697 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction. 4698 /// 4699 /// \param __a 4700 /// A 128-bit vector of [2 x double] containing the values with sign bits to 4701 /// be extracted. 4702 /// \returns The sign bits from each of the double-precision elements in \a __a, 4703 /// written to bits [1:0]. The remaining bits are assigned values of zero. 4704 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) { 4705 return __builtin_ia32_movmskpd((__v2df)__a); 4706 } 4707 4708 /// Constructs a 128-bit floating-point vector of [2 x double] from two 4709 /// 128-bit vector parameters of [2 x double], using the immediate-value 4710 /// parameter as a specifier. 4711 /// 4712 /// \headerfile <x86intrin.h> 4713 /// 4714 /// \code 4715 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); 4716 /// \endcode 4717 /// 4718 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction. 4719 /// 4720 /// \param a 4721 /// A 128-bit vector of [2 x double]. 4722 /// \param b 4723 /// A 128-bit vector of [2 x double]. 4724 /// \param i 4725 /// An 8-bit immediate value. The least significant two bits specify which 4726 /// elements to copy from \a a and \a b: \n 4727 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n 4728 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n 4729 /// Bit[1] = 0: lower element of \a b copied to upper element of result. 
\n 4730 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n 4731 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro. 4732 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form 4733 /// <c>[b1, b0]</c>. 4734 /// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4735 #define _mm_shuffle_pd(a, b, i) \ 4736 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4737 (int)(i))) 4738 4739 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4740 /// floating-point vector of [4 x float]. 4741 /// 4742 /// \headerfile <x86intrin.h> 4743 /// 4744 /// This intrinsic has no corresponding instruction. 4745 /// 4746 /// \param __a 4747 /// A 128-bit floating-point vector of [2 x double]. 4748 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4749 /// bitwise pattern as the parameter. 4750 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR 4751 _mm_castpd_ps(__m128d __a) { 4752 return (__m128)__a; 4753 } 4754 4755 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4756 /// integer vector. 4757 /// 4758 /// \headerfile <x86intrin.h> 4759 /// 4760 /// This intrinsic has no corresponding instruction. 4761 /// 4762 /// \param __a 4763 /// A 128-bit floating-point vector of [2 x double]. 4764 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4765 /// parameter. 4766 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 4767 _mm_castpd_si128(__m128d __a) { 4768 return (__m128i)__a; 4769 } 4770 4771 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4772 /// floating-point vector of [2 x double]. 4773 /// 4774 /// \headerfile <x86intrin.h> 4775 /// 4776 /// This intrinsic has no corresponding instruction. 4777 /// 4778 /// \param __a 4779 /// A 128-bit floating-point vector of [4 x float]. 
4780 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4781 /// bitwise pattern as the parameter. 4782 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 4783 _mm_castps_pd(__m128 __a) { 4784 return (__m128d)__a; 4785 } 4786 4787 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4788 /// integer vector. 4789 /// 4790 /// \headerfile <x86intrin.h> 4791 /// 4792 /// This intrinsic has no corresponding instruction. 4793 /// 4794 /// \param __a 4795 /// A 128-bit floating-point vector of [4 x float]. 4796 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4797 /// parameter. 4798 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR 4799 _mm_castps_si128(__m128 __a) { 4800 return (__m128i)__a; 4801 } 4802 4803 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4804 /// of [4 x float]. 4805 /// 4806 /// \headerfile <x86intrin.h> 4807 /// 4808 /// This intrinsic has no corresponding instruction. 4809 /// 4810 /// \param __a 4811 /// A 128-bit integer vector. 4812 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4813 /// bitwise pattern as the parameter. 4814 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR 4815 _mm_castsi128_ps(__m128i __a) { 4816 return (__m128)__a; 4817 } 4818 4819 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4820 /// of [2 x double]. 4821 /// 4822 /// \headerfile <x86intrin.h> 4823 /// 4824 /// This intrinsic has no corresponding instruction. 4825 /// 4826 /// \param __a 4827 /// A 128-bit integer vector. 4828 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4829 /// bitwise pattern as the parameter. 
4830 static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR 4831 _mm_castsi128_pd(__m128i __a) { 4832 return (__m128d)__a; 4833 } 4834 4835 /// Compares each of the corresponding double-precision values of two 4836 /// 128-bit vectors of [2 x double], using the operation specified by the 4837 /// immediate integer operand. 4838 /// 4839 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 4840 /// If either value in a comparison is NaN, comparisons that are ordered 4841 /// return false, and comparisons that are unordered return true. 4842 /// 4843 /// \headerfile <x86intrin.h> 4844 /// 4845 /// \code 4846 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c); 4847 /// \endcode 4848 /// 4849 /// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction. 4850 /// 4851 /// \param a 4852 /// A 128-bit vector of [2 x double]. 4853 /// \param b 4854 /// A 128-bit vector of [2 x double]. 4855 /// \param c 4856 /// An immediate integer operand, with bits [4:0] specifying which comparison 4857 /// operation to use: \n 4858 /// 0x00: Equal (ordered, non-signaling) \n 4859 /// 0x01: Less-than (ordered, signaling) \n 4860 /// 0x02: Less-than-or-equal (ordered, signaling) \n 4861 /// 0x03: Unordered (non-signaling) \n 4862 /// 0x04: Not-equal (unordered, non-signaling) \n 4863 /// 0x05: Not-less-than (unordered, signaling) \n 4864 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n 4865 /// 0x07: Ordered (non-signaling) \n 4866 /// \returns A 128-bit vector of [2 x double] containing the comparison results. 4867 #define _mm_cmp_pd(a, b, c) \ 4868 ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4869 (c))) 4870 4871 /// Compares each of the corresponding scalar double-precision values of 4872 /// two 128-bit vectors of [2 x double], using the operation specified by the 4873 /// immediate integer operand. 4874 /// 4875 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
4876 /// If either value in a comparison is NaN, comparisons that are ordered 4877 /// return false, and comparisons that are unordered return true. 4878 /// 4879 /// \headerfile <x86intrin.h> 4880 /// 4881 /// \code 4882 /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c); 4883 /// \endcode 4884 /// 4885 /// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction. 4886 /// 4887 /// \param a 4888 /// A 128-bit vector of [2 x double]. 4889 /// \param b 4890 /// A 128-bit vector of [2 x double]. 4891 /// \param c 4892 /// An immediate integer operand, with bits [4:0] specifying which comparison 4893 /// operation to use: \n 4894 /// 0x00: Equal (ordered, non-signaling) \n 4895 /// 0x01: Less-than (ordered, signaling) \n 4896 /// 0x02: Less-than-or-equal (ordered, signaling) \n 4897 /// 0x03: Unordered (non-signaling) \n 4898 /// 0x04: Not-equal (unordered, non-signaling) \n 4899 /// 0x05: Not-less-than (unordered, signaling) \n 4900 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n 4901 /// 0x07: Ordered (non-signaling) \n 4902 /// \returns A 128-bit vector of [2 x double] containing the comparison results. 4903 #define _mm_cmp_sd(a, b, c) \ 4904 ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4905 (c))) 4906 4907 #if defined(__cplusplus) 4908 extern "C" { 4909 #endif 4910 4911 /// Indicates that a spin loop is being executed for the purposes of 4912 /// optimizing power consumption during the loop. 4913 /// 4914 /// \headerfile <x86intrin.h> 4915 /// 4916 /// This intrinsic corresponds to the <c> PAUSE </c> instruction. 
4917 /// 4918 void _mm_pause(void); 4919 4920 #if defined(__cplusplus) 4921 } // extern "C" 4922 #endif 4923 4924 #undef __anyext128 4925 #undef __trunc64 4926 #undef __DEFAULT_FN_ATTRS 4927 #undef __DEFAULT_FN_ATTRS_CONSTEXPR 4928 4929 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4930 4931 #define _MM_DENORMALS_ZERO_ON (0x0040U) 4932 #define _MM_DENORMALS_ZERO_OFF (0x0000U) 4933 4934 #define _MM_DENORMALS_ZERO_MASK (0x0040U) 4935 4936 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 4937 #define _MM_SET_DENORMALS_ZERO_MODE(x) \ 4938 (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 4939 4940 #endif /* __EMMINTRIN_H */ 4941