1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifndef __AVXINTRIN_H 15 #define __AVXINTRIN_H 16 17 typedef double __v4df __attribute__ ((__vector_size__ (32))); 18 typedef float __v8sf __attribute__ ((__vector_size__ (32))); 19 typedef long long __v4di __attribute__ ((__vector_size__ (32))); 20 typedef int __v8si __attribute__ ((__vector_size__ (32))); 21 typedef short __v16hi __attribute__ ((__vector_size__ (32))); 22 typedef char __v32qi __attribute__ ((__vector_size__ (32))); 23 24 /* Unsigned types */ 25 typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); 26 typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); 27 typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); 28 typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); 29 30 /* We need an explicitly signed variant for char. Note that this shouldn't 31 * appear in the interface though. 
*/ 32 typedef signed char __v32qs __attribute__((__vector_size__(32))); 33 34 typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32))); 35 typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32))); 36 typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32))); 37 38 typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1))); 39 typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1))); 40 typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1))); 41 42 #ifdef __SSE2__ 43 /* Both _Float16 and __bf16 require SSE2 being enabled. */ 44 typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32))); 45 typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32))); 46 typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1))); 47 48 typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32))); 49 typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32))); 50 #endif 51 52 /* Define the default attributes for the functions in this file. 
*/ 53 #if defined(__EVEX512__) && !defined(__AVX10_1_512__) 54 #define __DEFAULT_FN_ATTRS \ 55 __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \ 56 __min_vector_width__(256))) 57 #define __DEFAULT_FN_ATTRS128 \ 58 __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \ 59 __min_vector_width__(128))) 60 #else 61 #define __DEFAULT_FN_ATTRS \ 62 __attribute__((__always_inline__, __nodebug__, __target__("avx"), \ 63 __min_vector_width__(256))) 64 #define __DEFAULT_FN_ATTRS128 \ 65 __attribute__((__always_inline__, __nodebug__, __target__("avx"), \ 66 __min_vector_width__(128))) 67 #endif 68 69 #if defined(__cplusplus) && (__cplusplus >= 201103L) 70 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr 71 #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr 72 #else 73 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS128 74 #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS 75 #endif 76 77 /* Arithmetic */ 78 /// Adds two 256-bit vectors of [4 x double]. 79 /// 80 /// \headerfile <x86intrin.h> 81 /// 82 /// This intrinsic corresponds to the <c> VADDPD </c> instruction. 83 /// 84 /// \param __a 85 /// A 256-bit vector of [4 x double] containing one of the source operands. 86 /// \param __b 87 /// A 256-bit vector of [4 x double] containing one of the source operands. 88 /// \returns A 256-bit vector of [4 x double] containing the sums of both 89 /// operands. 90 static __inline __m256d __DEFAULT_FN_ATTRS 91 _mm256_add_pd(__m256d __a, __m256d __b) 92 { 93 return (__m256d)((__v4df)__a+(__v4df)__b); 94 } 95 96 /// Adds two 256-bit vectors of [8 x float]. 97 /// 98 /// \headerfile <x86intrin.h> 99 /// 100 /// This intrinsic corresponds to the <c> VADDPS </c> instruction. 101 /// 102 /// \param __a 103 /// A 256-bit vector of [8 x float] containing one of the source operands. 104 /// \param __b 105 /// A 256-bit vector of [8 x float] containing one of the source operands. 
106 /// \returns A 256-bit vector of [8 x float] containing the sums of both 107 /// operands. 108 static __inline __m256 __DEFAULT_FN_ATTRS 109 _mm256_add_ps(__m256 __a, __m256 __b) 110 { 111 return (__m256)((__v8sf)__a+(__v8sf)__b); 112 } 113 114 /// Subtracts two 256-bit vectors of [4 x double]. 115 /// 116 /// \headerfile <x86intrin.h> 117 /// 118 /// This intrinsic corresponds to the <c> VSUBPD </c> instruction. 119 /// 120 /// \param __a 121 /// A 256-bit vector of [4 x double] containing the minuend. 122 /// \param __b 123 /// A 256-bit vector of [4 x double] containing the subtrahend. 124 /// \returns A 256-bit vector of [4 x double] containing the differences between 125 /// both operands. 126 static __inline __m256d __DEFAULT_FN_ATTRS 127 _mm256_sub_pd(__m256d __a, __m256d __b) 128 { 129 return (__m256d)((__v4df)__a-(__v4df)__b); 130 } 131 132 /// Subtracts two 256-bit vectors of [8 x float]. 133 /// 134 /// \headerfile <x86intrin.h> 135 /// 136 /// This intrinsic corresponds to the <c> VSUBPS </c> instruction. 137 /// 138 /// \param __a 139 /// A 256-bit vector of [8 x float] containing the minuend. 140 /// \param __b 141 /// A 256-bit vector of [8 x float] containing the subtrahend. 142 /// \returns A 256-bit vector of [8 x float] containing the differences between 143 /// both operands. 144 static __inline __m256 __DEFAULT_FN_ATTRS 145 _mm256_sub_ps(__m256 __a, __m256 __b) 146 { 147 return (__m256)((__v8sf)__a-(__v8sf)__b); 148 } 149 150 /// Adds the even-indexed values and subtracts the odd-indexed values of 151 /// two 256-bit vectors of [4 x double]. 152 /// 153 /// \headerfile <x86intrin.h> 154 /// 155 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 156 /// 157 /// \param __a 158 /// A 256-bit vector of [4 x double] containing the left source operand. 159 /// \param __b 160 /// A 256-bit vector of [4 x double] containing the right source operand. 
161 /// \returns A 256-bit vector of [4 x double] containing the alternating sums 162 /// and differences between both operands. 163 static __inline __m256d __DEFAULT_FN_ATTRS 164 _mm256_addsub_pd(__m256d __a, __m256d __b) 165 { 166 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 167 } 168 169 /// Adds the even-indexed values and subtracts the odd-indexed values of 170 /// two 256-bit vectors of [8 x float]. 171 /// 172 /// \headerfile <x86intrin.h> 173 /// 174 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 175 /// 176 /// \param __a 177 /// A 256-bit vector of [8 x float] containing the left source operand. 178 /// \param __b 179 /// A 256-bit vector of [8 x float] containing the right source operand. 180 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and 181 /// differences between both operands. 182 static __inline __m256 __DEFAULT_FN_ATTRS 183 _mm256_addsub_ps(__m256 __a, __m256 __b) 184 { 185 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 186 } 187 188 /// Divides two 256-bit vectors of [4 x double]. 189 /// 190 /// \headerfile <x86intrin.h> 191 /// 192 /// This intrinsic corresponds to the <c> VDIVPD </c> instruction. 193 /// 194 /// \param __a 195 /// A 256-bit vector of [4 x double] containing the dividend. 196 /// \param __b 197 /// A 256-bit vector of [4 x double] containing the divisor. 198 /// \returns A 256-bit vector of [4 x double] containing the quotients of both 199 /// operands. 200 static __inline __m256d __DEFAULT_FN_ATTRS 201 _mm256_div_pd(__m256d __a, __m256d __b) 202 { 203 return (__m256d)((__v4df)__a/(__v4df)__b); 204 } 205 206 /// Divides two 256-bit vectors of [8 x float]. 207 /// 208 /// \headerfile <x86intrin.h> 209 /// 210 /// This intrinsic corresponds to the <c> VDIVPS </c> instruction. 211 /// 212 /// \param __a 213 /// A 256-bit vector of [8 x float] containing the dividend. 
214 /// \param __b 215 /// A 256-bit vector of [8 x float] containing the divisor. 216 /// \returns A 256-bit vector of [8 x float] containing the quotients of both 217 /// operands. 218 static __inline __m256 __DEFAULT_FN_ATTRS 219 _mm256_div_ps(__m256 __a, __m256 __b) 220 { 221 return (__m256)((__v8sf)__a/(__v8sf)__b); 222 } 223 224 /// Compares two 256-bit vectors of [4 x double] and returns the greater 225 /// of each pair of values. 226 /// 227 /// If either value in a comparison is NaN, returns the value from \a __b. 228 /// 229 /// \headerfile <x86intrin.h> 230 /// 231 /// This intrinsic corresponds to the <c> VMAXPD </c> instruction. 232 /// 233 /// \param __a 234 /// A 256-bit vector of [4 x double] containing one of the operands. 235 /// \param __b 236 /// A 256-bit vector of [4 x double] containing one of the operands. 237 /// \returns A 256-bit vector of [4 x double] containing the maximum values 238 /// between both operands. 239 static __inline __m256d __DEFAULT_FN_ATTRS 240 _mm256_max_pd(__m256d __a, __m256d __b) 241 { 242 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 243 } 244 245 /// Compares two 256-bit vectors of [8 x float] and returns the greater 246 /// of each pair of values. 247 /// 248 /// If either value in a comparison is NaN, returns the value from \a __b. 249 /// 250 /// \headerfile <x86intrin.h> 251 /// 252 /// This intrinsic corresponds to the <c> VMAXPS </c> instruction. 253 /// 254 /// \param __a 255 /// A 256-bit vector of [8 x float] containing one of the operands. 256 /// \param __b 257 /// A 256-bit vector of [8 x float] containing one of the operands. 258 /// \returns A 256-bit vector of [8 x float] containing the maximum values 259 /// between both operands. 
260 static __inline __m256 __DEFAULT_FN_ATTRS 261 _mm256_max_ps(__m256 __a, __m256 __b) 262 { 263 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 264 } 265 266 /// Compares two 256-bit vectors of [4 x double] and returns the lesser 267 /// of each pair of values. 268 /// 269 /// If either value in a comparison is NaN, returns the value from \a __b. 270 /// 271 /// \headerfile <x86intrin.h> 272 /// 273 /// This intrinsic corresponds to the <c> VMINPD </c> instruction. 274 /// 275 /// \param __a 276 /// A 256-bit vector of [4 x double] containing one of the operands. 277 /// \param __b 278 /// A 256-bit vector of [4 x double] containing one of the operands. 279 /// \returns A 256-bit vector of [4 x double] containing the minimum values 280 /// between both operands. 281 static __inline __m256d __DEFAULT_FN_ATTRS 282 _mm256_min_pd(__m256d __a, __m256d __b) 283 { 284 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 285 } 286 287 /// Compares two 256-bit vectors of [8 x float] and returns the lesser 288 /// of each pair of values. 289 /// 290 /// If either value in a comparison is NaN, returns the value from \a __b. 291 /// 292 /// \headerfile <x86intrin.h> 293 /// 294 /// This intrinsic corresponds to the <c> VMINPS </c> instruction. 295 /// 296 /// \param __a 297 /// A 256-bit vector of [8 x float] containing one of the operands. 298 /// \param __b 299 /// A 256-bit vector of [8 x float] containing one of the operands. 300 /// \returns A 256-bit vector of [8 x float] containing the minimum values 301 /// between both operands. 302 static __inline __m256 __DEFAULT_FN_ATTRS 303 _mm256_min_ps(__m256 __a, __m256 __b) 304 { 305 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 306 } 307 308 /// Multiplies two 256-bit vectors of [4 x double]. 309 /// 310 /// \headerfile <x86intrin.h> 311 /// 312 /// This intrinsic corresponds to the <c> VMULPD </c> instruction. 
313 /// 314 /// \param __a 315 /// A 256-bit vector of [4 x double] containing one of the operands. 316 /// \param __b 317 /// A 256-bit vector of [4 x double] containing one of the operands. 318 /// \returns A 256-bit vector of [4 x double] containing the products of both 319 /// operands. 320 static __inline __m256d __DEFAULT_FN_ATTRS 321 _mm256_mul_pd(__m256d __a, __m256d __b) 322 { 323 return (__m256d)((__v4df)__a * (__v4df)__b); 324 } 325 326 /// Multiplies two 256-bit vectors of [8 x float]. 327 /// 328 /// \headerfile <x86intrin.h> 329 /// 330 /// This intrinsic corresponds to the <c> VMULPS </c> instruction. 331 /// 332 /// \param __a 333 /// A 256-bit vector of [8 x float] containing one of the operands. 334 /// \param __b 335 /// A 256-bit vector of [8 x float] containing one of the operands. 336 /// \returns A 256-bit vector of [8 x float] containing the products of both 337 /// operands. 338 static __inline __m256 __DEFAULT_FN_ATTRS 339 _mm256_mul_ps(__m256 __a, __m256 __b) 340 { 341 return (__m256)((__v8sf)__a * (__v8sf)__b); 342 } 343 344 /// Calculates the square roots of the values in a 256-bit vector of 345 /// [4 x double]. 346 /// 347 /// \headerfile <x86intrin.h> 348 /// 349 /// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. 350 /// 351 /// \param __a 352 /// A 256-bit vector of [4 x double]. 353 /// \returns A 256-bit vector of [4 x double] containing the square roots of the 354 /// values in the operand. 355 static __inline __m256d __DEFAULT_FN_ATTRS 356 _mm256_sqrt_pd(__m256d __a) 357 { 358 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 359 } 360 361 /// Calculates the square roots of the values in a 256-bit vector of 362 /// [8 x float]. 363 /// 364 /// \headerfile <x86intrin.h> 365 /// 366 /// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. 367 /// 368 /// \param __a 369 /// A 256-bit vector of [8 x float]. 
370 /// \returns A 256-bit vector of [8 x float] containing the square roots of the 371 /// values in the operand. 372 static __inline __m256 __DEFAULT_FN_ATTRS 373 _mm256_sqrt_ps(__m256 __a) 374 { 375 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 376 } 377 378 /// Calculates the reciprocal square roots of the values in a 256-bit 379 /// vector of [8 x float]. 380 /// 381 /// \headerfile <x86intrin.h> 382 /// 383 /// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. 384 /// 385 /// \param __a 386 /// A 256-bit vector of [8 x float]. 387 /// \returns A 256-bit vector of [8 x float] containing the reciprocal square 388 /// roots of the values in the operand. 389 static __inline __m256 __DEFAULT_FN_ATTRS 390 _mm256_rsqrt_ps(__m256 __a) 391 { 392 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 393 } 394 395 /// Calculates the reciprocals of the values in a 256-bit vector of 396 /// [8 x float]. 397 /// 398 /// \headerfile <x86intrin.h> 399 /// 400 /// This intrinsic corresponds to the <c> VRCPPS </c> instruction. 401 /// 402 /// \param __a 403 /// A 256-bit vector of [8 x float]. 404 /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 405 /// values in the operand. 406 static __inline __m256 __DEFAULT_FN_ATTRS 407 _mm256_rcp_ps(__m256 __a) 408 { 409 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 410 } 411 412 /// Rounds the values in a 256-bit vector of [4 x double] as specified 413 /// by the byte operand. The source values are rounded to integer values and 414 /// returned as 64-bit double-precision floating-point values. 415 /// 416 /// \headerfile <x86intrin.h> 417 /// 418 /// \code 419 /// __m256d _mm256_round_pd(__m256d V, const int M); 420 /// \endcode 421 /// 422 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 423 /// 424 /// \param V 425 /// A 256-bit vector of [4 x double]. 426 /// \param M 427 /// An integer value that specifies the rounding operation. 
\n 428 /// Bits [7:4] are reserved. \n 429 /// Bit [3] is a precision exception value: \n 430 /// 0: A normal PE exception is used. \n 431 /// 1: The PE field is not updated. \n 432 /// Bit [2] is the rounding control source: \n 433 /// 0: Use bits [1:0] of \a M. \n 434 /// 1: Use the current MXCSR setting. \n 435 /// Bits [1:0] contain the rounding control definition: \n 436 /// 00: Nearest. \n 437 /// 01: Downward (toward negative infinity). \n 438 /// 10: Upward (toward positive infinity). \n 439 /// 11: Truncated. 440 /// \returns A 256-bit vector of [4 x double] containing the rounded values. 441 #define _mm256_round_pd(V, M) \ 442 ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))) 443 444 /// Rounds the values stored in a 256-bit vector of [8 x float] as 445 /// specified by the byte operand. The source values are rounded to integer 446 /// values and returned as floating-point values. 447 /// 448 /// \headerfile <x86intrin.h> 449 /// 450 /// \code 451 /// __m256 _mm256_round_ps(__m256 V, const int M); 452 /// \endcode 453 /// 454 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 455 /// 456 /// \param V 457 /// A 256-bit vector of [8 x float]. 458 /// \param M 459 /// An integer value that specifies the rounding operation. \n 460 /// Bits [7:4] are reserved. \n 461 /// Bit [3] is a precision exception value: \n 462 /// 0: A normal PE exception is used. \n 463 /// 1: The PE field is not updated. \n 464 /// Bit [2] is the rounding control source: \n 465 /// 0: Use bits [1:0] of \a M. \n 466 /// 1: Use the current MXCSR setting. \n 467 /// Bits [1:0] contain the rounding control definition: \n 468 /// 00: Nearest. \n 469 /// 01: Downward (toward negative infinity). \n 470 /// 10: Upward (toward positive infinity). \n 471 /// 11: Truncated. 472 /// \returns A 256-bit vector of [8 x float] containing the rounded values. 
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
///    Expands to _mm256_round_pd with _MM_FROUND_CEIL as the rounding operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
///    Expands to _mm256_round_pd with _MM_FROUND_FLOOR as the rounding operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
526 #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 527 528 /// Rounds down the values stored in a 256-bit vector of [8 x float]. The 529 /// source values are rounded down to integer values and returned as 530 /// floating-point values. 531 /// 532 /// \headerfile <x86intrin.h> 533 /// 534 /// \code 535 /// __m256 _mm256_floor_ps(__m256 V); 536 /// \endcode 537 /// 538 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 539 /// 540 /// \param V 541 /// A 256-bit vector of [8 x float]. 542 /// \returns A 256-bit vector of [8 x float] containing the rounded down values. 543 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 544 545 /* Logical */ 546 /// Performs a bitwise AND of two 256-bit vectors of [4 x double]. 547 /// 548 /// \headerfile <x86intrin.h> 549 /// 550 /// This intrinsic corresponds to the <c> VANDPD </c> instruction. 551 /// 552 /// \param __a 553 /// A 256-bit vector of [4 x double] containing one of the source operands. 554 /// \param __b 555 /// A 256-bit vector of [4 x double] containing one of the source operands. 556 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 557 /// values between both operands. 558 static __inline __m256d __DEFAULT_FN_ATTRS 559 _mm256_and_pd(__m256d __a, __m256d __b) 560 { 561 return (__m256d)((__v4du)__a & (__v4du)__b); 562 } 563 564 /// Performs a bitwise AND of two 256-bit vectors of [8 x float]. 565 /// 566 /// \headerfile <x86intrin.h> 567 /// 568 /// This intrinsic corresponds to the <c> VANDPS </c> instruction. 569 /// 570 /// \param __a 571 /// A 256-bit vector of [8 x float] containing one of the source operands. 572 /// \param __b 573 /// A 256-bit vector of [8 x float] containing one of the source operands. 574 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 575 /// values between both operands. 
576 static __inline __m256 __DEFAULT_FN_ATTRS 577 _mm256_and_ps(__m256 __a, __m256 __b) 578 { 579 return (__m256)((__v8su)__a & (__v8su)__b); 580 } 581 582 /// Performs a bitwise AND of two 256-bit vectors of [4 x double], using 583 /// the one's complement of the values contained in the first source operand. 584 /// 585 /// \headerfile <x86intrin.h> 586 /// 587 /// This intrinsic corresponds to the <c> VANDNPD </c> instruction. 588 /// 589 /// \param __a 590 /// A 256-bit vector of [4 x double] containing the left source operand. The 591 /// one's complement of this value is used in the bitwise AND. 592 /// \param __b 593 /// A 256-bit vector of [4 x double] containing the right source operand. 594 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 595 /// values of the second operand and the one's complement of the first 596 /// operand. 597 static __inline __m256d __DEFAULT_FN_ATTRS 598 _mm256_andnot_pd(__m256d __a, __m256d __b) 599 { 600 return (__m256d)(~(__v4du)__a & (__v4du)__b); 601 } 602 603 /// Performs a bitwise AND of two 256-bit vectors of [8 x float], using 604 /// the one's complement of the values contained in the first source operand. 605 /// 606 /// \headerfile <x86intrin.h> 607 /// 608 /// This intrinsic corresponds to the <c> VANDNPS </c> instruction. 609 /// 610 /// \param __a 611 /// A 256-bit vector of [8 x float] containing the left source operand. The 612 /// one's complement of this value is used in the bitwise AND. 613 /// \param __b 614 /// A 256-bit vector of [8 x float] containing the right source operand. 615 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 616 /// values of the second operand and the one's complement of the first 617 /// operand. 618 static __inline __m256 __DEFAULT_FN_ATTRS 619 _mm256_andnot_ps(__m256 __a, __m256 __b) 620 { 621 return (__m256)(~(__v8su)__a & (__v8su)__b); 622 } 623 624 /// Performs a bitwise OR of two 256-bit vectors of [4 x double]. 
625 /// 626 /// \headerfile <x86intrin.h> 627 /// 628 /// This intrinsic corresponds to the <c> VORPD </c> instruction. 629 /// 630 /// \param __a 631 /// A 256-bit vector of [4 x double] containing one of the source operands. 632 /// \param __b 633 /// A 256-bit vector of [4 x double] containing one of the source operands. 634 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 635 /// values between both operands. 636 static __inline __m256d __DEFAULT_FN_ATTRS 637 _mm256_or_pd(__m256d __a, __m256d __b) 638 { 639 return (__m256d)((__v4du)__a | (__v4du)__b); 640 } 641 642 /// Performs a bitwise OR of two 256-bit vectors of [8 x float]. 643 /// 644 /// \headerfile <x86intrin.h> 645 /// 646 /// This intrinsic corresponds to the <c> VORPS </c> instruction. 647 /// 648 /// \param __a 649 /// A 256-bit vector of [8 x float] containing one of the source operands. 650 /// \param __b 651 /// A 256-bit vector of [8 x float] containing one of the source operands. 652 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 653 /// values between both operands. 654 static __inline __m256 __DEFAULT_FN_ATTRS 655 _mm256_or_ps(__m256 __a, __m256 __b) 656 { 657 return (__m256)((__v8su)__a | (__v8su)__b); 658 } 659 660 /// Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 661 /// 662 /// \headerfile <x86intrin.h> 663 /// 664 /// This intrinsic corresponds to the <c> VXORPD </c> instruction. 665 /// 666 /// \param __a 667 /// A 256-bit vector of [4 x double] containing one of the source operands. 668 /// \param __b 669 /// A 256-bit vector of [4 x double] containing one of the source operands. 670 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 671 /// values between both operands. 
672 static __inline __m256d __DEFAULT_FN_ATTRS 673 _mm256_xor_pd(__m256d __a, __m256d __b) 674 { 675 return (__m256d)((__v4du)__a ^ (__v4du)__b); 676 } 677 678 /// Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 679 /// 680 /// \headerfile <x86intrin.h> 681 /// 682 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 683 /// 684 /// \param __a 685 /// A 256-bit vector of [8 x float] containing one of the source operands. 686 /// \param __b 687 /// A 256-bit vector of [8 x float] containing one of the source operands. 688 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 689 /// values between both operands. 690 static __inline __m256 __DEFAULT_FN_ATTRS 691 _mm256_xor_ps(__m256 __a, __m256 __b) 692 { 693 return (__m256)((__v8su)__a ^ (__v8su)__b); 694 } 695 696 /* Horizontal arithmetic */ 697 /// Horizontally adds the adjacent pairs of values contained in two 698 /// 256-bit vectors of [4 x double]. 699 /// 700 /// \headerfile <x86intrin.h> 701 /// 702 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 703 /// 704 /// \param __a 705 /// A 256-bit vector of [4 x double] containing one of the source operands. 706 /// The horizontal sums of the values are returned in the even-indexed 707 /// elements of a vector of [4 x double]. 708 /// \param __b 709 /// A 256-bit vector of [4 x double] containing one of the source operands. 710 /// The horizontal sums of the values are returned in the odd-indexed 711 /// elements of a vector of [4 x double]. 712 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 713 /// both operands. 714 static __inline __m256d __DEFAULT_FN_ATTRS 715 _mm256_hadd_pd(__m256d __a, __m256d __b) 716 { 717 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 718 } 719 720 /// Horizontally adds the adjacent pairs of values contained in two 721 /// 256-bit vectors of [8 x float]. 
722 /// 723 /// \headerfile <x86intrin.h> 724 /// 725 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 726 /// 727 /// \param __a 728 /// A 256-bit vector of [8 x float] containing one of the source operands. 729 /// The horizontal sums of the values are returned in the elements with 730 /// index 0, 1, 4, 5 of a vector of [8 x float]. 731 /// \param __b 732 /// A 256-bit vector of [8 x float] containing one of the source operands. 733 /// The horizontal sums of the values are returned in the elements with 734 /// index 2, 3, 6, 7 of a vector of [8 x float]. 735 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 736 /// both operands. 737 static __inline __m256 __DEFAULT_FN_ATTRS 738 _mm256_hadd_ps(__m256 __a, __m256 __b) 739 { 740 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 741 } 742 743 /// Horizontally subtracts the adjacent pairs of values contained in two 744 /// 256-bit vectors of [4 x double]. 745 /// 746 /// \headerfile <x86intrin.h> 747 /// 748 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 749 /// 750 /// \param __a 751 /// A 256-bit vector of [4 x double] containing one of the source operands. 752 /// The horizontal differences between the values are returned in the 753 /// even-indexed elements of a vector of [4 x double]. 754 /// \param __b 755 /// A 256-bit vector of [4 x double] containing one of the source operands. 756 /// The horizontal differences between the values are returned in the 757 /// odd-indexed elements of a vector of [4 x double]. 758 /// \returns A 256-bit vector of [4 x double] containing the horizontal 759 /// differences of both operands. 760 static __inline __m256d __DEFAULT_FN_ATTRS 761 _mm256_hsub_pd(__m256d __a, __m256d __b) 762 { 763 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 764 } 765 766 /// Horizontally subtracts the adjacent pairs of values contained in two 767 /// 256-bit vectors of [8 x float]. 
768 /// 769 /// \headerfile <x86intrin.h> 770 /// 771 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 772 /// 773 /// \param __a 774 /// A 256-bit vector of [8 x float] containing one of the source operands. 775 /// The horizontal differences between the values are returned in the 776 /// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 777 /// \param __b 778 /// A 256-bit vector of [8 x float] containing one of the source operands. 779 /// The horizontal differences between the values are returned in the 780 /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 781 /// \returns A 256-bit vector of [8 x float] containing the horizontal 782 /// differences of both operands. 783 static __inline __m256 __DEFAULT_FN_ATTRS 784 _mm256_hsub_ps(__m256 __a, __m256 __b) 785 { 786 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 787 } 788 789 /* Vector permutations */ 790 /// Copies the values in a 128-bit vector of [2 x double] as specified 791 /// by the 128-bit integer vector operand. 792 /// 793 /// \headerfile <x86intrin.h> 794 /// 795 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 796 /// 797 /// \param __a 798 /// A 128-bit vector of [2 x double]. 799 /// \param __c 800 /// A 128-bit integer vector operand specifying how the values are to be 801 /// copied. \n 802 /// Bit [1]: \n 803 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 804 /// vector. \n 805 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 806 /// returned vector. \n 807 /// Bit [65]: \n 808 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 809 /// returned vector. \n 810 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 811 /// returned vector. 812 /// \returns A 128-bit vector of [2 x double] containing the copied values. 
813 static __inline __m128d __DEFAULT_FN_ATTRS128 814 _mm_permutevar_pd(__m128d __a, __m128i __c) 815 { 816 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 817 } 818 819 /// Copies the values in a 256-bit vector of [4 x double] as specified 820 /// by the 256-bit integer vector operand. 821 /// 822 /// \headerfile <x86intrin.h> 823 /// 824 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 825 /// 826 /// \param __a 827 /// A 256-bit vector of [4 x double]. 828 /// \param __c 829 /// A 256-bit integer vector operand specifying how the values are to be 830 /// copied. \n 831 /// Bit [1]: \n 832 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 833 /// vector. \n 834 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 835 /// returned vector. \n 836 /// Bit [65]: \n 837 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 838 /// returned vector. \n 839 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 840 /// returned vector. \n 841 /// Bit [129]: \n 842 /// 0: Bits [191:128] of the source are copied to bits [191:128] of the 843 /// returned vector. \n 844 /// 1: Bits [255:192] of the source are copied to bits [191:128] of the 845 /// returned vector. \n 846 /// Bit [193]: \n 847 /// 0: Bits [191:128] of the source are copied to bits [255:192] of the 848 /// returned vector. \n 849 /// 1: Bits [255:192] of the source are copied to bits [255:192] of the 850 /// returned vector. 851 /// \returns A 256-bit vector of [4 x double] containing the copied values. 852 static __inline __m256d __DEFAULT_FN_ATTRS 853 _mm256_permutevar_pd(__m256d __a, __m256i __c) 854 { 855 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); 856 } 857 858 /// Copies the values stored in a 128-bit vector of [4 x float] as 859 /// specified by the 128-bit integer vector operand. 
860 /// 861 /// \headerfile <x86intrin.h> 862 /// 863 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 864 /// 865 /// \param __a 866 /// A 128-bit vector of [4 x float]. 867 /// \param __c 868 /// A 128-bit integer vector operand specifying how the values are to be 869 /// copied. \n 870 /// Bits [1:0]: \n 871 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 872 /// returned vector. \n 873 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 874 /// returned vector. \n 875 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 876 /// returned vector. \n 877 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 878 /// returned vector. \n 879 /// Bits [33:32]: \n 880 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 881 /// returned vector. \n 882 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 883 /// returned vector. \n 884 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 885 /// returned vector. \n 886 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 887 /// returned vector. \n 888 /// Bits [65:64]: \n 889 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the 890 /// returned vector. \n 891 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 892 /// returned vector. \n 893 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 894 /// returned vector. \n 895 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 896 /// returned vector. \n 897 /// Bits [97:96]: \n 898 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 899 /// returned vector. \n 900 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the 901 /// returned vector. \n 902 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 903 /// returned vector. 
\n 904 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 905 /// returned vector. 906 /// \returns A 128-bit vector of [4 x float] containing the copied values. 907 static __inline __m128 __DEFAULT_FN_ATTRS128 908 _mm_permutevar_ps(__m128 __a, __m128i __c) 909 { 910 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); 911 } 912 913 /// Copies the values stored in a 256-bit vector of [8 x float] as 914 /// specified by the 256-bit integer vector operand. 915 /// 916 /// \headerfile <x86intrin.h> 917 /// 918 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 919 /// 920 /// \param __a 921 /// A 256-bit vector of [8 x float]. 922 /// \param __c 923 /// A 256-bit integer vector operand specifying how the values are to be 924 /// copied. \n 925 /// Bits [1:0]: \n 926 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 927 /// returned vector. \n 928 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 929 /// returned vector. \n 930 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 931 /// returned vector. \n 932 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 933 /// returned vector. \n 934 /// Bits [33:32]: \n 935 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 936 /// returned vector. \n 937 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 938 /// returned vector. \n 939 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 940 /// returned vector. \n 941 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 942 /// returned vector. \n 943 /// Bits [65:64]: \n 944 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the 945 /// returned vector. \n 946 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 947 /// returned vector. \n 948 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 949 /// returned vector. 
\n 950 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 951 /// returned vector. \n 952 /// Bits [97:96]: \n 953 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 954 /// returned vector. \n 955 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the 956 /// returned vector. \n 957 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 958 /// returned vector. \n 959 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 960 /// returned vector. \n 961 /// Bits [129:128]: \n 962 /// 00: Bits [159:128] of the source are copied to bits [159:128] of the 963 /// returned vector. \n 964 /// 01: Bits [191:160] of the source are copied to bits [159:128] of the 965 /// returned vector. \n 966 /// 10: Bits [223:192] of the source are copied to bits [159:128] of the 967 /// returned vector. \n 968 /// 11: Bits [255:224] of the source are copied to bits [159:128] of the 969 /// returned vector. \n 970 /// Bits [161:160]: \n 971 /// 00: Bits [159:128] of the source are copied to bits [191:160] of the 972 /// returned vector. \n 973 /// 01: Bits [191:160] of the source are copied to bits [191:160] of the 974 /// returned vector. \n 975 /// 10: Bits [223:192] of the source are copied to bits [191:160] of the 976 /// returned vector. \n 977 /// 11: Bits [255:224] of the source are copied to bits [191:160] of the 978 /// returned vector. \n 979 /// Bits [193:192]: \n 980 /// 00: Bits [159:128] of the source are copied to bits [223:192] of the 981 /// returned vector. \n 982 /// 01: Bits [191:160] of the source are copied to bits [223:192] of the 983 /// returned vector. \n 984 /// 10: Bits [223:192] of the source are copied to bits [223:192] of the 985 /// returned vector. \n 986 /// 11: Bits [255:224] of the source are copied to bits [223:192] of the 987 /// returned vector. 
\n 988 /// Bits [225:224]: \n 989 /// 00: Bits [159:128] of the source are copied to bits [255:224] of the 990 /// returned vector. \n 991 /// 01: Bits [191:160] of the source are copied to bits [255:224] of the 992 /// returned vector. \n 993 /// 10: Bits [223:192] of the source are copied to bits [255:224] of the 994 /// returned vector. \n 995 /// 11: Bits [255:224] of the source are copied to bits [255:224] of the 996 /// returned vector. 997 /// \returns A 256-bit vector of [8 x float] containing the copied values. 998 static __inline __m256 __DEFAULT_FN_ATTRS 999 _mm256_permutevar_ps(__m256 __a, __m256i __c) 1000 { 1001 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 1002 } 1003 1004 /// Copies the values in a 128-bit vector of [2 x double] as specified 1005 /// by the immediate integer operand. 1006 /// 1007 /// \headerfile <x86intrin.h> 1008 /// 1009 /// \code 1010 /// __m128d _mm_permute_pd(__m128d A, const int C); 1011 /// \endcode 1012 /// 1013 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 1014 /// 1015 /// \param A 1016 /// A 128-bit vector of [2 x double]. 1017 /// \param C 1018 /// An immediate integer operand specifying how the values are to be 1019 /// copied. \n 1020 /// Bit [0]: \n 1021 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 1022 /// vector. \n 1023 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 1024 /// returned vector. \n 1025 /// Bit [1]: \n 1026 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 1027 /// returned vector. \n 1028 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 1029 /// returned vector. 1030 /// \returns A 128-bit vector of [2 x double] containing the copied values. 
/* Macro form: A is converted to __m128d and reinterpreted as [2 x double];
 * C is forwarded to the builtin as an int. */
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))

/// Copies the values in a 256-bit vector of [4 x double] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Macro form: A is converted to __m256d and reinterpreted as [4 x double];
 * C is forwarded to the builtin as an int. */
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))

/// Copies the values in a 128-bit vector of [4 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))

/// Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    The same immediate bit fields also select the values copied within the
///    upper 128 bits: \n
///    Bits [1:0]: \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit integer vector containing the copied values.
/* Macro form: V1 and V2 are converted to __m256i and reinterpreted as
 * [8 x int] for the builtin; M is forwarded as an int. */
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (int)(M)))

/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Macro form: V1 and V2 are converted to __m256d and reinterpreted as
 * [4 x double]; M is forwarded to the builtin as an int. */
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
                                      (__v4df)(__m256d)(V2), (int)(M)))

/// Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Macro form: V1 and V2 are converted to __m256 and reinterpreted as
 * [8 x float]; M is forwarded to the builtin as an int. */
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
                                     (__v8sf)(__m256)(V2), (int)(M)))

/// Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \param __b
///    A 256-bit vector of [4 x double].
/// \param __c
///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
///    how the values are to be copied. The position of the mask bit corresponds
///    to the most significant bit of a copied value. When a mask bit is 0, the
///    corresponding 64-bit element in operand \a __a is copied to the same
///    position in the destination. When a mask bit is 1, the corresponding
///    64-bit element in operand \a __b is copied to the same position in the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
1422 static __inline __m256d __DEFAULT_FN_ATTRS 1423 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) 1424 { 1425 return (__m256d)__builtin_ia32_blendvpd256( 1426 (__v4df)__a, (__v4df)__b, (__v4df)__c); 1427 } 1428 1429 /// Merges 32-bit single-precision data values stored in either of the 1430 /// two 256-bit vectors of [8 x float], as specified by the 256-bit vector 1431 /// operand. 1432 /// 1433 /// \headerfile <x86intrin.h> 1434 /// 1435 /// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction. 1436 /// 1437 /// \param __a 1438 /// A 256-bit vector of [8 x float]. 1439 /// \param __b 1440 /// A 256-bit vector of [8 x float]. 1441 /// \param __c 1442 /// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, 1443 /// and 31 specifying how the values are to be copied. The position of the 1444 /// mask bit corresponds to the most significant bit of a copied value. When 1445 /// a mask bit is 0, the corresponding 32-bit element in operand \a __a is 1446 /// copied to the same position in the destination. When a mask bit is 1, the 1447 /// corresponding 32-bit element in operand \a __b is copied to the same 1448 /// position in the destination. 1449 /// \returns A 256-bit vector of [8 x float] containing the copied values. 1450 static __inline __m256 __DEFAULT_FN_ATTRS 1451 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 1452 { 1453 return (__m256)__builtin_ia32_blendvps256( 1454 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); 1455 } 1456 1457 /* Vector Dot Product */ 1458 /// Computes two dot products in parallel, using the lower and upper 1459 /// halves of two [8 x float] vectors as input to the two computations, and 1460 /// returning the two dot products in the lower and upper halves of the 1461 /// [8 x float] result. 1462 /// 1463 /// The immediate integer operand controls which input elements will 1464 /// contribute to the dot product, and where the final results are returned. 
///    In general, for each dot product, the four corresponding elements of the
///    input vectors are multiplied; the first two and second two products are
///    summed, then the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* Macro form: V1 and V2 are converted to __m256 and reinterpreted as
 * [8 x float]. Note that M is forwarded to the builtin as written, without
 * the (int) cast that the neighbouring immediate-operand macros apply. */
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))

/* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand.
///
///    The four selected elements in each operand are copied to the destination
///    according to the bits specified in the immediate operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] of the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
///    the destination. For example, if bits [7:0] of the immediate operand
///    contain a value of 0xFF, the 256-bit destination vector would contain the
///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///      destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///      destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///      destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///      the destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
                                    (__v8sf)(__m256)(b), (int)(mask)))

/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
1573 /// 1574 /// \param a 1575 /// A 256-bit vector of [4 x double]. 1576 /// \param b 1577 /// A 256-bit vector of [4 x double]. 1578 /// \param mask 1579 /// An immediate value containing 8-bit values specifying which elements to 1580 /// copy from \a a and \a b: \n 1581 /// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the 1582 /// destination. \n 1583 /// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the 1584 /// destination. \n 1585 /// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the 1586 /// destination. \n 1587 /// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the 1588 /// destination. \n 1589 /// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the 1590 /// destination. \n 1591 /// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the 1592 /// destination. \n 1593 /// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the 1594 /// destination. \n 1595 /// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the 1596 /// destination. 1597 /// \returns A 256-bit vector of [4 x double] containing the shuffled values. 
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256( \
      (__v4df)(__m256d)(a), \
      (__v4df)(__m256d)(b), \
      (int)(mask)))

/* Compare */
/* Predicate encodings 0x08-0x1f for the immediate operand of the *_cmp_*
 * intrinsics documented below. In the names, O/U stands for ordered/unordered
 * and Q/S for non-signaling (quiet)/signaling, as spelled out in the comment
 * next to each value.
 * NOTE(review): predicates 0x00-0x07 are listed in the documentation below
 * but are not defined in this part of the file; presumably they come from
 * another header - confirm.
 */

/* 0x08 - 0x0f */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */

/* 0x10 - 0x1f */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

/* Below intrinsic defined in emmintrin.h can be used for AVX */
/// Compares each
of the corresponding double-precision values of two 1630 /// 128-bit vectors of [2 x double], using the operation specified by the 1631 /// immediate integer operand. 1632 /// 1633 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 1634 /// If either value in a comparison is NaN, comparisons that are ordered 1635 /// return false, and comparisons that are unordered return true. 1636 /// 1637 /// \headerfile <x86intrin.h> 1638 /// 1639 /// \code 1640 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c); 1641 /// \endcode 1642 /// 1643 /// This intrinsic corresponds to the <c> VCMPPD </c> instruction. 1644 /// 1645 /// \param a 1646 /// A 128-bit vector of [2 x double]. 1647 /// \param b 1648 /// A 128-bit vector of [2 x double]. 1649 /// \param c 1650 /// An immediate integer operand, with bits [4:0] specifying which comparison 1651 /// operation to use: \n 1652 /// 0x00: Equal (ordered, non-signaling) \n 1653 /// 0x01: Less-than (ordered, signaling) \n 1654 /// 0x02: Less-than-or-equal (ordered, signaling) \n 1655 /// 0x03: Unordered (non-signaling) \n 1656 /// 0x04: Not-equal (unordered, non-signaling) \n 1657 /// 0x05: Not-less-than (unordered, signaling) \n 1658 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n 1659 /// 0x07: Ordered (non-signaling) \n 1660 /// 0x08: Equal (unordered, non-signaling) \n 1661 /// 0x09: Not-greater-than-or-equal (unordered, signaling) \n 1662 /// 0x0A: Not-greater-than (unordered, signaling) \n 1663 /// 0x0B: False (ordered, non-signaling) \n 1664 /// 0x0C: Not-equal (ordered, non-signaling) \n 1665 /// 0x0D: Greater-than-or-equal (ordered, signaling) \n 1666 /// 0x0E: Greater-than (ordered, signaling) \n 1667 /// 0x0F: True (unordered, non-signaling) \n 1668 /// 0x10: Equal (ordered, signaling) \n 1669 /// 0x11: Less-than (ordered, non-signaling) \n 1670 /// 0x12: Less-than-or-equal (ordered, non-signaling) \n 1671 /// 0x13: Unordered (signaling) \n 1672 /// 0x14: Not-equal (unordered, signaling) 
\n 1673 /// 0x15: Not-less-than (unordered, non-signaling) \n 1674 /// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n 1675 /// 0x17: Ordered (signaling) \n 1676 /// 0x18: Equal (unordered, signaling) \n 1677 /// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n 1678 /// 0x1A: Not-greater-than (unordered, non-signaling) \n 1679 /// 0x1B: False (ordered, signaling) \n 1680 /// 0x1C: Not-equal (ordered, signaling) \n 1681 /// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n 1682 /// 0x1E: Greater-than (ordered, non-signaling) \n 1683 /// 0x1F: True (unordered, signaling) 1684 /// \returns A 128-bit vector of [2 x double] containing the comparison results. 1685 /// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c) 1686 1687 /* Below intrinsic defined in xmmintrin.h can be used for AVX */ 1688 /// Compares each of the corresponding values of two 128-bit vectors of 1689 /// [4 x float], using the operation specified by the immediate integer 1690 /// operand. 1691 /// 1692 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true. 1693 /// If either value in a comparison is NaN, comparisons that are ordered 1694 /// return false, and comparisons that are unordered return true. 1695 /// 1696 /// \headerfile <x86intrin.h> 1697 /// 1698 /// \code 1699 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c); 1700 /// \endcode 1701 /// 1702 /// This intrinsic corresponds to the <c> VCMPPS </c> instruction. 1703 /// 1704 /// \param a 1705 /// A 128-bit vector of [4 x float]. 1706 /// \param b 1707 /// A 128-bit vector of [4 x float]. 
1708 /// \param c 1709 /// An immediate integer operand, with bits [4:0] specifying which comparison 1710 /// operation to use: \n 1711 /// 0x00: Equal (ordered, non-signaling) \n 1712 /// 0x01: Less-than (ordered, signaling) \n 1713 /// 0x02: Less-than-or-equal (ordered, signaling) \n 1714 /// 0x03: Unordered (non-signaling) \n 1715 /// 0x04: Not-equal (unordered, non-signaling) \n 1716 /// 0x05: Not-less-than (unordered, signaling) \n 1717 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n 1718 /// 0x07: Ordered (non-signaling) \n 1719 /// 0x08: Equal (unordered, non-signaling) \n 1720 /// 0x09: Not-greater-than-or-equal (unordered, signaling) \n 1721 /// 0x0A: Not-greater-than (unordered, signaling) \n 1722 /// 0x0B: False (ordered, non-signaling) \n 1723 /// 0x0C: Not-equal (ordered, non-signaling) \n 1724 /// 0x0D: Greater-than-or-equal (ordered, signaling) \n 1725 /// 0x0E: Greater-than (ordered, signaling) \n 1726 /// 0x0F: True (unordered, non-signaling) \n 1727 /// 0x10: Equal (ordered, signaling) \n 1728 /// 0x11: Less-than (ordered, non-signaling) \n 1729 /// 0x12: Less-than-or-equal (ordered, non-signaling) \n 1730 /// 0x13: Unordered (signaling) \n 1731 /// 0x14: Not-equal (unordered, signaling) \n 1732 /// 0x15: Not-less-than (unordered, non-signaling) \n 1733 /// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n 1734 /// 0x17: Ordered (signaling) \n 1735 /// 0x18: Equal (unordered, signaling) \n 1736 /// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n 1737 /// 0x1A: Not-greater-than (unordered, non-signaling) \n 1738 /// 0x1B: False (ordered, signaling) \n 1739 /// 0x1C: Not-equal (ordered, signaling) \n 1740 /// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n 1741 /// 0x1E: Greater-than (ordered, non-signaling) \n 1742 /// 0x1F: True (unordered, signaling) 1743 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
1744 /// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c) 1745 1746 /// Compares each of the corresponding double-precision values of two 1747 /// 256-bit vectors of [4 x double], using the operation specified by the 1748 /// immediate integer operand. 1749 /// 1750 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 1751 /// If either value in a comparison is NaN, comparisons that are ordered 1752 /// return false, and comparisons that are unordered return true. 1753 /// 1754 /// \headerfile <x86intrin.h> 1755 /// 1756 /// \code 1757 /// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c); 1758 /// \endcode 1759 /// 1760 /// This intrinsic corresponds to the <c> VCMPPD </c> instruction. 1761 /// 1762 /// \param a 1763 /// A 256-bit vector of [4 x double]. 1764 /// \param b 1765 /// A 256-bit vector of [4 x double]. 1766 /// \param c 1767 /// An immediate integer operand, with bits [4:0] specifying which comparison 1768 /// operation to use: \n 1769 /// 0x00: Equal (ordered, non-signaling) \n 1770 /// 0x01: Less-than (ordered, signaling) \n 1771 /// 0x02: Less-than-or-equal (ordered, signaling) \n 1772 /// 0x03: Unordered (non-signaling) \n 1773 /// 0x04: Not-equal (unordered, non-signaling) \n 1774 /// 0x05: Not-less-than (unordered, signaling) \n 1775 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n 1776 /// 0x07: Ordered (non-signaling) \n 1777 /// 0x08: Equal (unordered, non-signaling) \n 1778 /// 0x09: Not-greater-than-or-equal (unordered, signaling) \n 1779 /// 0x0A: Not-greater-than (unordered, signaling) \n 1780 /// 0x0B: False (ordered, non-signaling) \n 1781 /// 0x0C: Not-equal (ordered, non-signaling) \n 1782 /// 0x0D: Greater-than-or-equal (ordered, signaling) \n 1783 /// 0x0E: Greater-than (ordered, signaling) \n 1784 /// 0x0F: True (unordered, non-signaling) \n 1785 /// 0x10: Equal (ordered, signaling) \n 1786 /// 0x11: Less-than (ordered, non-signaling) \n 1787 /// 0x12: Less-than-or-equal (ordered, 
///    non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
/* Kept as an expression macro: the predicate c is forwarded to the builtin
 * unchanged, since it is documented above as an immediate operand. */
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256( \
      (__v4df)(__m256d)(a), \
      (__v4df)(__m256d)(b), \
      (c)))

/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
1826 /// \param c 1827 /// An immediate integer operand, with bits [4:0] specifying which comparison 1828 /// operation to use: \n 1829 /// 0x00: Equal (ordered, non-signaling) \n 1830 /// 0x01: Less-than (ordered, signaling) \n 1831 /// 0x02: Less-than-or-equal (ordered, signaling) \n 1832 /// 0x03: Unordered (non-signaling) \n 1833 /// 0x04: Not-equal (unordered, non-signaling) \n 1834 /// 0x05: Not-less-than (unordered, signaling) \n 1835 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n 1836 /// 0x07: Ordered (non-signaling) \n 1837 /// 0x08: Equal (unordered, non-signaling) \n 1838 /// 0x09: Not-greater-than-or-equal (unordered, signaling) \n 1839 /// 0x0A: Not-greater-than (unordered, signaling) \n 1840 /// 0x0B: False (ordered, non-signaling) \n 1841 /// 0x0C: Not-equal (ordered, non-signaling) \n 1842 /// 0x0D: Greater-than-or-equal (ordered, signaling) \n 1843 /// 0x0E: Greater-than (ordered, signaling) \n 1844 /// 0x0F: True (unordered, non-signaling) \n 1845 /// 0x10: Equal (ordered, signaling) \n 1846 /// 0x11: Less-than (ordered, non-signaling) \n 1847 /// 0x12: Less-than-or-equal (ordered, non-signaling) \n 1848 /// 0x13: Unordered (signaling) \n 1849 /// 0x14: Not-equal (unordered, signaling) \n 1850 /// 0x15: Not-less-than (unordered, non-signaling) \n 1851 /// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n 1852 /// 0x17: Ordered (signaling) \n 1853 /// 0x18: Equal (unordered, signaling) \n 1854 /// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n 1855 /// 0x1A: Not-greater-than (unordered, non-signaling) \n 1856 /// 0x1B: False (ordered, signaling) \n 1857 /// 0x1C: Not-equal (ordered, signaling) \n 1858 /// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n 1859 /// 0x1E: Greater-than (ordered, non-signaling) \n 1860 /// 0x1F: True (unordered, signaling) 1861 /// \returns A 256-bit vector of [8 x float] containing the comparison results. 
/* Kept as an expression macro: the predicate c is forwarded to the builtin
 * unchanged, since it is documented as an immediate operand. */
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256( \
      (__v8sf)(__m256)(a), \
      (__v8sf)(__m256)(b), \
      (c)))

/* Below intrinsic defined in emmintrin.h can be used for AVX */
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///
0x10: Equal (ordered, signaling) \n 1907 /// 0x11: Less-than (ordered, non-signaling) \n 1908 /// 0x12: Less-than-or-equal (ordered, non-signaling) \n 1909 /// 0x13: Unordered (signaling) \n 1910 /// 0x14: Not-equal (unordered, signaling) \n 1911 /// 0x15: Not-less-than (unordered, non-signaling) \n 1912 /// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n 1913 /// 0x17: Ordered (signaling) \n 1914 /// 0x18: Equal (unordered, signaling) \n 1915 /// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n 1916 /// 0x1A: Not-greater-than (unordered, non-signaling) \n 1917 /// 0x1B: False (ordered, signaling) \n 1918 /// 0x1C: Not-equal (ordered, signaling) \n 1919 /// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n 1920 /// 0x1E: Greater-than (ordered, non-signaling) \n 1921 /// 0x1F: True (unordered, signaling) 1922 /// \returns A 128-bit vector of [2 x double] containing the comparison results. 1923 /// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c) 1924 1925 /* Below intrinsic defined in xmmintrin.h can be used for AVX */ 1926 /// Compares each of the corresponding scalar values of two 128-bit 1927 /// vectors of [4 x float], using the operation specified by the immediate 1928 /// integer operand. 1929 /// 1930 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true. 1931 /// If either value in a comparison is NaN, comparisons that are ordered 1932 /// return false, and comparisons that are unordered return true. 1933 /// 1934 /// \headerfile <x86intrin.h> 1935 /// 1936 /// \code 1937 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c); 1938 /// \endcode 1939 /// 1940 /// This intrinsic corresponds to the <c> VCMPSS </c> instruction. 1941 /// 1942 /// \param a 1943 /// A 128-bit vector of [4 x float]. 1944 /// \param b 1945 /// A 128-bit vector of [4 x float]. 
1946 /// \param c 1947 /// An immediate integer operand, with bits [4:0] specifying which comparison 1948 /// operation to use: \n 1949 /// 0x00: Equal (ordered, non-signaling) \n 1950 /// 0x01: Less-than (ordered, signaling) \n 1951 /// 0x02: Less-than-or-equal (ordered, signaling) \n 1952 /// 0x03: Unordered (non-signaling) \n 1953 /// 0x04: Not-equal (unordered, non-signaling) \n 1954 /// 0x05: Not-less-than (unordered, signaling) \n 1955 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n 1956 /// 0x07: Ordered (non-signaling) \n 1957 /// 0x08: Equal (unordered, non-signaling) \n 1958 /// 0x09: Not-greater-than-or-equal (unordered, signaling) \n 1959 /// 0x0A: Not-greater-than (unordered, signaling) \n 1960 /// 0x0B: False (ordered, non-signaling) \n 1961 /// 0x0C: Not-equal (ordered, non-signaling) \n 1962 /// 0x0D: Greater-than-or-equal (ordered, signaling) \n 1963 /// 0x0E: Greater-than (ordered, signaling) \n 1964 /// 0x0F: True (unordered, non-signaling) \n 1965 /// 0x10: Equal (ordered, signaling) \n 1966 /// 0x11: Less-than (ordered, non-signaling) \n 1967 /// 0x12: Less-than-or-equal (ordered, non-signaling) \n 1968 /// 0x13: Unordered (signaling) \n 1969 /// 0x14: Not-equal (unordered, signaling) \n 1970 /// 0x15: Not-less-than (unordered, non-signaling) \n 1971 /// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n 1972 /// 0x17: Ordered (signaling) \n 1973 /// 0x18: Equal (unordered, signaling) \n 1974 /// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n 1975 /// 0x1A: Not-greater-than (unordered, non-signaling) \n 1976 /// 0x1B: False (ordered, signaling) \n 1977 /// 0x1C: Not-equal (ordered, signaling) \n 1978 /// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n 1979 /// 0x1E: Greater-than (ordered, non-signaling) \n 1980 /// 0x1F: True (unordered, signaling) 1981 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)

/// Extracts one 32-bit element from a 256-bit vector of [8 x i32]; the
///    element is selected by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32] to read from.
/// \param N
///    An immediate integer operand; bits [2:0] give the index of the vector
///    element that is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32-bit element.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si( \
      (__v8si)(__m256i)(X), \
      (int)(N)))

/// Extracts one 16-bit element from a 256-bit vector of [16 x i16]; the
///    element is selected by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16] to read from.
/// \param N
///    An immediate integer operand; bits [3:0] give the index of the vector
///    element that is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16-bit element, zero
///    extended (the element is converted through unsigned short).
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi( \
      (__v16hi)(__m256i)(X), \
      (int)(N)))

/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi( \
      (__v32qi)(__m256i)(X), \
      (int)(N)))

#ifdef __x86_64__
/// Extracts one 64-bit element from a 256-bit vector of [4 x i64]; the
///    element is selected by the immediate constant operand. Only available
///    when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64] to read from.
/// \param N
///    An immediate integer operand; bits [1:0] give the index of the vector
///    element that is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64-bit element.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di( \
      (__v4di)(__m256i)(X), \
      (int)(N)))
#endif

/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si( \
      (__v8si)(__m256i)(X), \
      (int)(I), \
      (int)(N)))


/// Produces a copy of a [16 x i16] vector in which the element selected by
///    the immediate constant operand has been replaced with a new value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    The source vector of [16 x i16] for the insert operation.
/// \param I
///    An i16 integer value; the replacement value for the selected element.
/// \param N
///    An immediate integer giving the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X in which the element indexed by \a N has
///    been replaced with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi( \
      (__v16hi)(__m256i)(X), \
      (int)(I), \
      (int)(N)))

/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))

#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
///    The macro converts this argument to \c long long before inserting it.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif

/* Conversion */
/// Converts a vector of [4 x i32] into a vector of [4 x double].
2185 /// 2186 /// \headerfile <x86intrin.h> 2187 /// 2188 /// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. 2189 /// 2190 /// \param __a 2191 /// A 128-bit integer vector of [4 x i32]. 2192 /// \returns A 256-bit vector of [4 x double] containing the converted values. 2193 static __inline __m256d __DEFAULT_FN_ATTRS 2194 _mm256_cvtepi32_pd(__m128i __a) 2195 { 2196 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); 2197 } 2198 2199 /// Converts a vector of [8 x i32] into a vector of [8 x float]. 2200 /// 2201 /// \headerfile <x86intrin.h> 2202 /// 2203 /// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. 2204 /// 2205 /// \param __a 2206 /// A 256-bit integer vector. 2207 /// \returns A 256-bit vector of [8 x float] containing the converted values. 2208 static __inline __m256 __DEFAULT_FN_ATTRS 2209 _mm256_cvtepi32_ps(__m256i __a) 2210 { 2211 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf); 2212 } 2213 2214 /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2215 /// [4 x float]. 2216 /// 2217 /// \headerfile <x86intrin.h> 2218 /// 2219 /// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. 2220 /// 2221 /// \param __a 2222 /// A 256-bit vector of [4 x double]. 2223 /// \returns A 128-bit vector of [4 x float] containing the converted values. 2224 static __inline __m128 __DEFAULT_FN_ATTRS 2225 _mm256_cvtpd_ps(__m256d __a) 2226 { 2227 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 2228 } 2229 2230 /// Converts a vector of [8 x float] into a vector of [8 x i32]. 2231 /// 2232 /// If a converted value does not fit in a 32-bit integer, raises a 2233 /// floating-point invalid exception. If the exception is masked, returns 2234 /// the most negative integer. 2235 /// 2236 /// \headerfile <x86intrin.h> 2237 /// 2238 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. 2239 /// 2240 /// \param __a 2241 /// A 256-bit vector of [8 x float]. 
2242 /// \returns A 256-bit integer vector containing the converted values. 2243 static __inline __m256i __DEFAULT_FN_ATTRS 2244 _mm256_cvtps_epi32(__m256 __a) 2245 { 2246 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 2247 } 2248 2249 /// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 2250 /// x double]. 2251 /// 2252 /// \headerfile <x86intrin.h> 2253 /// 2254 /// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. 2255 /// 2256 /// \param __a 2257 /// A 128-bit vector of [4 x float]. 2258 /// \returns A 256-bit vector of [4 x double] containing the converted values. 2259 static __inline __m256d __DEFAULT_FN_ATTRS 2260 _mm256_cvtps_pd(__m128 __a) 2261 { 2262 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); 2263 } 2264 2265 /// Converts a 256-bit vector of [4 x double] into four signed truncated 2266 /// (rounded toward zero) 32-bit integers returned in a 128-bit vector of 2267 /// [4 x i32]. 2268 /// 2269 /// If a converted value does not fit in a 32-bit integer, raises a 2270 /// floating-point invalid exception. If the exception is masked, returns 2271 /// the most negative integer. 2272 /// 2273 /// \headerfile <x86intrin.h> 2274 /// 2275 /// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. 2276 /// 2277 /// \param __a 2278 /// A 256-bit vector of [4 x double]. 2279 /// \returns A 128-bit integer vector containing the converted values. 2280 static __inline __m128i __DEFAULT_FN_ATTRS 2281 _mm256_cvttpd_epi32(__m256d __a) 2282 { 2283 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 2284 } 2285 2286 /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2287 /// [4 x i32]. 2288 /// 2289 /// If a converted value does not fit in a 32-bit integer, raises a 2290 /// floating-point invalid exception. If the exception is masked, returns 2291 /// the most negative integer. 
2292 /// 2293 /// \headerfile <x86intrin.h> 2294 /// 2295 /// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. 2296 /// 2297 /// \param __a 2298 /// A 256-bit vector of [4 x double]. 2299 /// \returns A 128-bit integer vector containing the converted values. 2300 static __inline __m128i __DEFAULT_FN_ATTRS 2301 _mm256_cvtpd_epi32(__m256d __a) 2302 { 2303 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 2304 } 2305 2306 /// Converts a vector of [8 x float] into eight signed truncated (rounded 2307 /// toward zero) 32-bit integers returned in a vector of [8 x i32]. 2308 /// 2309 /// If a converted value does not fit in a 32-bit integer, raises a 2310 /// floating-point invalid exception. If the exception is masked, returns 2311 /// the most negative integer. 2312 /// 2313 /// \headerfile <x86intrin.h> 2314 /// 2315 /// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. 2316 /// 2317 /// \param __a 2318 /// A 256-bit vector of [8 x float]. 2319 /// \returns A 256-bit integer vector containing the converted values. 2320 static __inline __m256i __DEFAULT_FN_ATTRS 2321 _mm256_cvttps_epi32(__m256 __a) 2322 { 2323 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 2324 } 2325 2326 /// Returns the first element of the input vector of [4 x double]. 2327 /// 2328 /// \headerfile <x86intrin.h> 2329 /// 2330 /// This intrinsic is a utility function and does not correspond to a specific 2331 /// instruction. 2332 /// 2333 /// \param __a 2334 /// A 256-bit vector of [4 x double]. 2335 /// \returns A 64 bit double containing the first element of the input vector. 2336 static __inline double __DEFAULT_FN_ATTRS 2337 _mm256_cvtsd_f64(__m256d __a) 2338 { 2339 return __a[0]; 2340 } 2341 2342 /// Returns the first element of the input vector of [8 x i32]. 2343 /// 2344 /// \headerfile <x86intrin.h> 2345 /// 2346 /// This intrinsic is a utility function and does not correspond to a specific 2347 /// instruction. 
2348 /// 2349 /// \param __a 2350 /// A 256-bit vector of [8 x i32]. 2351 /// \returns A 32 bit integer containing the first element of the input vector. 2352 static __inline int __DEFAULT_FN_ATTRS 2353 _mm256_cvtsi256_si32(__m256i __a) 2354 { 2355 __v8si __b = (__v8si)__a; 2356 return __b[0]; 2357 } 2358 2359 /// Returns the first element of the input vector of [8 x float]. 2360 /// 2361 /// \headerfile <x86intrin.h> 2362 /// 2363 /// This intrinsic is a utility function and does not correspond to a specific 2364 /// instruction. 2365 /// 2366 /// \param __a 2367 /// A 256-bit vector of [8 x float]. 2368 /// \returns A 32 bit float containing the first element of the input vector. 2369 static __inline float __DEFAULT_FN_ATTRS 2370 _mm256_cvtss_f32(__m256 __a) 2371 { 2372 return __a[0]; 2373 } 2374 2375 /* Vector replicate */ 2376 /// Moves and duplicates odd-indexed values from a 256-bit vector of 2377 /// [8 x float] to float values in a 256-bit vector of [8 x float]. 2378 /// 2379 /// \headerfile <x86intrin.h> 2380 /// 2381 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 2382 /// 2383 /// \param __a 2384 /// A 256-bit vector of [8 x float]. \n 2385 /// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of 2386 /// the return value. \n 2387 /// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of 2388 /// the return value. \n 2389 /// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the 2390 /// return value. \n 2391 /// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the 2392 /// return value. 2393 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2394 /// values. 
2395 static __inline __m256 __DEFAULT_FN_ATTRS 2396 _mm256_movehdup_ps(__m256 __a) 2397 { 2398 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); 2399 } 2400 2401 /// Moves and duplicates even-indexed values from a 256-bit vector of 2402 /// [8 x float] to float values in a 256-bit vector of [8 x float]. 2403 /// 2404 /// \headerfile <x86intrin.h> 2405 /// 2406 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 2407 /// 2408 /// \param __a 2409 /// A 256-bit vector of [8 x float]. \n 2410 /// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of 2411 /// the return value. \n 2412 /// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of 2413 /// the return value. \n 2414 /// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the 2415 /// return value. \n 2416 /// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the 2417 /// return value. 2418 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2419 /// values. 2420 static __inline __m256 __DEFAULT_FN_ATTRS 2421 _mm256_moveldup_ps(__m256 __a) 2422 { 2423 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); 2424 } 2425 2426 /// Moves and duplicates double-precision floating point values from a 2427 /// 256-bit vector of [4 x double] to double-precision values in a 256-bit 2428 /// vector of [4 x double]. 2429 /// 2430 /// \headerfile <x86intrin.h> 2431 /// 2432 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 2433 /// 2434 /// \param __a 2435 /// A 256-bit vector of [4 x double]. \n 2436 /// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the 2437 /// return value. \n 2438 /// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of 2439 /// the return value. 2440 /// \returns A 256-bit vector of [4 x double] containing the moved and 2441 /// duplicated values. 
2442 static __inline __m256d __DEFAULT_FN_ATTRS 2443 _mm256_movedup_pd(__m256d __a) 2444 { 2445 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); 2446 } 2447 2448 /* Unpack and Interleave */ 2449 /// Unpacks the odd-indexed vector elements from two 256-bit vectors of 2450 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2451 /// 2452 /// \headerfile <x86intrin.h> 2453 /// 2454 /// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. 2455 /// 2456 /// \param __a 2457 /// A 256-bit floating-point vector of [4 x double]. \n 2458 /// Bits [127:64] are written to bits [63:0] of the return value. \n 2459 /// Bits [255:192] are written to bits [191:128] of the return value. \n 2460 /// \param __b 2461 /// A 256-bit floating-point vector of [4 x double]. \n 2462 /// Bits [127:64] are written to bits [127:64] of the return value. \n 2463 /// Bits [255:192] are written to bits [255:192] of the return value. \n 2464 /// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2465 static __inline __m256d __DEFAULT_FN_ATTRS 2466 _mm256_unpackhi_pd(__m256d __a, __m256d __b) 2467 { 2468 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); 2469 } 2470 2471 /// Unpacks the even-indexed vector elements from two 256-bit vectors of 2472 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2473 /// 2474 /// \headerfile <x86intrin.h> 2475 /// 2476 /// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. 2477 /// 2478 /// \param __a 2479 /// A 256-bit floating-point vector of [4 x double]. \n 2480 /// Bits [63:0] are written to bits [63:0] of the return value. \n 2481 /// Bits [191:128] are written to bits [191:128] of the return value. 2482 /// \param __b 2483 /// A 256-bit floating-point vector of [4 x double]. \n 2484 /// Bits [63:0] are written to bits [127:64] of the return value. 
\n 2485 /// Bits [191:128] are written to bits [255:192] of the return value. \n 2486 /// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2487 static __inline __m256d __DEFAULT_FN_ATTRS 2488 _mm256_unpacklo_pd(__m256d __a, __m256d __b) 2489 { 2490 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); 2491 } 2492 2493 /// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the 2494 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2495 /// vector of [8 x float]. 2496 /// 2497 /// \headerfile <x86intrin.h> 2498 /// 2499 /// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. 2500 /// 2501 /// \param __a 2502 /// A 256-bit vector of [8 x float]. \n 2503 /// Bits [95:64] are written to bits [31:0] of the return value. \n 2504 /// Bits [127:96] are written to bits [95:64] of the return value. \n 2505 /// Bits [223:192] are written to bits [159:128] of the return value. \n 2506 /// Bits [255:224] are written to bits [223:192] of the return value. 2507 /// \param __b 2508 /// A 256-bit vector of [8 x float]. \n 2509 /// Bits [95:64] are written to bits [63:32] of the return value. \n 2510 /// Bits [127:96] are written to bits [127:96] of the return value. \n 2511 /// Bits [223:192] are written to bits [191:160] of the return value. \n 2512 /// Bits [255:224] are written to bits [255:224] of the return value. 2513 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2514 static __inline __m256 __DEFAULT_FN_ATTRS 2515 _mm256_unpackhi_ps(__m256 __a, __m256 __b) 2516 { 2517 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 2518 } 2519 2520 /// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the 2521 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2522 /// vector of [8 x float]. 
2523 /// 2524 /// \headerfile <x86intrin.h> 2525 /// 2526 /// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. 2527 /// 2528 /// \param __a 2529 /// A 256-bit vector of [8 x float]. \n 2530 /// Bits [31:0] are written to bits [31:0] of the return value. \n 2531 /// Bits [63:32] are written to bits [95:64] of the return value. \n 2532 /// Bits [159:128] are written to bits [159:128] of the return value. \n 2533 /// Bits [191:160] are written to bits [223:192] of the return value. 2534 /// \param __b 2535 /// A 256-bit vector of [8 x float]. \n 2536 /// Bits [31:0] are written to bits [63:32] of the return value. \n 2537 /// Bits [63:32] are written to bits [127:96] of the return value. \n 2538 /// Bits [159:128] are written to bits [191:160] of the return value. \n 2539 /// Bits [191:160] are written to bits [255:224] of the return value. 2540 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2541 static __inline __m256 __DEFAULT_FN_ATTRS 2542 _mm256_unpacklo_ps(__m256 __a, __m256 __b) 2543 { 2544 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 2545 } 2546 2547 /* Bit Test */ 2548 /// Given two 128-bit floating-point vectors of [2 x double], perform an 2549 /// element-by-element comparison of the double-precision element in the 2550 /// first source vector and the corresponding element in the second source 2551 /// vector. 2552 /// 2553 /// The EFLAGS register is updated as follows: \n 2554 /// If there is at least one pair of double-precision elements where the 2555 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2556 /// ZF flag is set to 1. \n 2557 /// If there is at least one pair of double-precision elements where the 2558 /// sign-bit of the first element is 0 and the sign-bit of the second element 2559 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2560 /// This intrinsic returns the value of the ZF flag. 
2561 /// 2562 /// \headerfile <x86intrin.h> 2563 /// 2564 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2565 /// 2566 /// \param __a 2567 /// A 128-bit vector of [2 x double]. 2568 /// \param __b 2569 /// A 128-bit vector of [2 x double]. 2570 /// \returns the ZF flag in the EFLAGS register. 2571 static __inline int __DEFAULT_FN_ATTRS128 2572 _mm_testz_pd(__m128d __a, __m128d __b) 2573 { 2574 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 2575 } 2576 2577 /// Given two 128-bit floating-point vectors of [2 x double], perform an 2578 /// element-by-element comparison of the double-precision element in the 2579 /// first source vector and the corresponding element in the second source 2580 /// vector. 2581 /// 2582 /// The EFLAGS register is updated as follows: \n 2583 /// If there is at least one pair of double-precision elements where the 2584 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2585 /// ZF flag is set to 1. \n 2586 /// If there is at least one pair of double-precision elements where the 2587 /// sign-bit of the first element is 0 and the sign-bit of the second element 2588 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2589 /// This intrinsic returns the value of the CF flag. 2590 /// 2591 /// \headerfile <x86intrin.h> 2592 /// 2593 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2594 /// 2595 /// \param __a 2596 /// A 128-bit vector of [2 x double]. 2597 /// \param __b 2598 /// A 128-bit vector of [2 x double]. 2599 /// \returns the CF flag in the EFLAGS register. 
2600 static __inline int __DEFAULT_FN_ATTRS128 2601 _mm_testc_pd(__m128d __a, __m128d __b) 2602 { 2603 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 2604 } 2605 2606 /// Given two 128-bit floating-point vectors of [2 x double], perform an 2607 /// element-by-element comparison of the double-precision element in the 2608 /// first source vector and the corresponding element in the second source 2609 /// vector. 2610 /// 2611 /// The EFLAGS register is updated as follows: \n 2612 /// If there is at least one pair of double-precision elements where the 2613 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2614 /// ZF flag is set to 1. \n 2615 /// If there is at least one pair of double-precision elements where the 2616 /// sign-bit of the first element is 0 and the sign-bit of the second element 2617 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2618 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2619 /// otherwise it returns 0. 2620 /// 2621 /// \headerfile <x86intrin.h> 2622 /// 2623 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2624 /// 2625 /// \param __a 2626 /// A 128-bit vector of [2 x double]. 2627 /// \param __b 2628 /// A 128-bit vector of [2 x double]. 2629 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2630 static __inline int __DEFAULT_FN_ATTRS128 2631 _mm_testnzc_pd(__m128d __a, __m128d __b) 2632 { 2633 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 2634 } 2635 2636 /// Given two 128-bit floating-point vectors of [4 x float], perform an 2637 /// element-by-element comparison of the single-precision element in the 2638 /// first source vector and the corresponding element in the second source 2639 /// vector. 
2640 /// 2641 /// The EFLAGS register is updated as follows: \n 2642 /// If there is at least one pair of single-precision elements where the 2643 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2644 /// ZF flag is set to 1. \n 2645 /// If there is at least one pair of single-precision elements where the 2646 /// sign-bit of the first element is 0 and the sign-bit of the second element 2647 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2648 /// This intrinsic returns the value of the ZF flag. 2649 /// 2650 /// \headerfile <x86intrin.h> 2651 /// 2652 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2653 /// 2654 /// \param __a 2655 /// A 128-bit vector of [4 x float]. 2656 /// \param __b 2657 /// A 128-bit vector of [4 x float]. 2658 /// \returns the ZF flag. 2659 static __inline int __DEFAULT_FN_ATTRS128 2660 _mm_testz_ps(__m128 __a, __m128 __b) 2661 { 2662 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 2663 } 2664 2665 /// Given two 128-bit floating-point vectors of [4 x float], perform an 2666 /// element-by-element comparison of the single-precision element in the 2667 /// first source vector and the corresponding element in the second source 2668 /// vector. 2669 /// 2670 /// The EFLAGS register is updated as follows: \n 2671 /// If there is at least one pair of single-precision elements where the 2672 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2673 /// ZF flag is set to 1. \n 2674 /// If there is at least one pair of single-precision elements where the 2675 /// sign-bit of the first element is 0 and the sign-bit of the second element 2676 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2677 /// This intrinsic returns the value of the CF flag. 2678 /// 2679 /// \headerfile <x86intrin.h> 2680 /// 2681 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2682 /// 2683 /// \param __a 2684 /// A 128-bit vector of [4 x float]. 2685 /// \param __b 2686 /// A 128-bit vector of [4 x float]. 2687 /// \returns the CF flag. 2688 static __inline int __DEFAULT_FN_ATTRS128 2689 _mm_testc_ps(__m128 __a, __m128 __b) 2690 { 2691 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 2692 } 2693 2694 /// Given two 128-bit floating-point vectors of [4 x float], perform an 2695 /// element-by-element comparison of the single-precision element in the 2696 /// first source vector and the corresponding element in the second source 2697 /// vector. 2698 /// 2699 /// The EFLAGS register is updated as follows: \n 2700 /// If there is at least one pair of single-precision elements where the 2701 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2702 /// ZF flag is set to 1. \n 2703 /// If there is at least one pair of single-precision elements where the 2704 /// sign-bit of the first element is 0 and the sign-bit of the second element 2705 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2706 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2707 /// otherwise it returns 0. 2708 /// 2709 /// \headerfile <x86intrin.h> 2710 /// 2711 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2712 /// 2713 /// \param __a 2714 /// A 128-bit vector of [4 x float]. 2715 /// \param __b 2716 /// A 128-bit vector of [4 x float]. 2717 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2718 static __inline int __DEFAULT_FN_ATTRS128 2719 _mm_testnzc_ps(__m128 __a, __m128 __b) 2720 { 2721 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 2722 } 2723 2724 /// Given two 256-bit floating-point vectors of [4 x double], perform an 2725 /// element-by-element comparison of the double-precision elements in the 2726 /// first source vector and the corresponding elements in the second source 2727 /// vector. 
2728 /// 2729 /// The EFLAGS register is updated as follows: \n 2730 /// If there is at least one pair of double-precision elements where the 2731 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2732 /// ZF flag is set to 1. \n 2733 /// If there is at least one pair of double-precision elements where the 2734 /// sign-bit of the first element is 0 and the sign-bit of the second element 2735 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2736 /// This intrinsic returns the value of the ZF flag. 2737 /// 2738 /// \headerfile <x86intrin.h> 2739 /// 2740 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2741 /// 2742 /// \param __a 2743 /// A 256-bit vector of [4 x double]. 2744 /// \param __b 2745 /// A 256-bit vector of [4 x double]. 2746 /// \returns the ZF flag. 2747 static __inline int __DEFAULT_FN_ATTRS 2748 _mm256_testz_pd(__m256d __a, __m256d __b) 2749 { 2750 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 2751 } 2752 2753 /// Given two 256-bit floating-point vectors of [4 x double], perform an 2754 /// element-by-element comparison of the double-precision elements in the 2755 /// first source vector and the corresponding elements in the second source 2756 /// vector. 2757 /// 2758 /// The EFLAGS register is updated as follows: \n 2759 /// If there is at least one pair of double-precision elements where the 2760 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2761 /// ZF flag is set to 1. \n 2762 /// If there is at least one pair of double-precision elements where the 2763 /// sign-bit of the first element is 0 and the sign-bit of the second element 2764 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2765 /// This intrinsic returns the value of the CF flag. 2766 /// 2767 /// \headerfile <x86intrin.h> 2768 /// 2769 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 
2770 /// 2771 /// \param __a 2772 /// A 256-bit vector of [4 x double]. 2773 /// \param __b 2774 /// A 256-bit vector of [4 x double]. 2775 /// \returns the CF flag. 2776 static __inline int __DEFAULT_FN_ATTRS 2777 _mm256_testc_pd(__m256d __a, __m256d __b) 2778 { 2779 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 2780 } 2781 2782 /// Given two 256-bit floating-point vectors of [4 x double], perform an 2783 /// element-by-element comparison of the double-precision elements in the 2784 /// first source vector and the corresponding elements in the second source 2785 /// vector. 2786 /// 2787 /// The EFLAGS register is updated as follows: \n 2788 /// If there is at least one pair of double-precision elements where the 2789 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2790 /// ZF flag is set to 1. \n 2791 /// If there is at least one pair of double-precision elements where the 2792 /// sign-bit of the first element is 0 and the sign-bit of the second element 2793 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2794 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2795 /// otherwise it returns 0. 2796 /// 2797 /// \headerfile <x86intrin.h> 2798 /// 2799 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2800 /// 2801 /// \param __a 2802 /// A 256-bit vector of [4 x double]. 2803 /// \param __b 2804 /// A 256-bit vector of [4 x double]. 2805 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2806 static __inline int __DEFAULT_FN_ATTRS 2807 _mm256_testnzc_pd(__m256d __a, __m256d __b) 2808 { 2809 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 2810 } 2811 2812 /// Given two 256-bit floating-point vectors of [8 x float], perform an 2813 /// element-by-element comparison of the single-precision element in the 2814 /// first source vector and the corresponding element in the second source 2815 /// vector. 
2816 /// 2817 /// The EFLAGS register is updated as follows: \n 2818 /// If there is at least one pair of single-precision elements where the 2819 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2820 /// ZF flag is set to 1. \n 2821 /// If there is at least one pair of single-precision elements where the 2822 /// sign-bit of the first element is 0 and the sign-bit of the second element 2823 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2824 /// This intrinsic returns the value of the ZF flag. 2825 /// 2826 /// \headerfile <x86intrin.h> 2827 /// 2828 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2829 /// 2830 /// \param __a 2831 /// A 256-bit vector of [8 x float]. 2832 /// \param __b 2833 /// A 256-bit vector of [8 x float]. 2834 /// \returns the ZF flag. 2835 static __inline int __DEFAULT_FN_ATTRS 2836 _mm256_testz_ps(__m256 __a, __m256 __b) 2837 { 2838 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 2839 } 2840 2841 /// Given two 256-bit floating-point vectors of [8 x float], perform an 2842 /// element-by-element comparison of the single-precision element in the 2843 /// first source vector and the corresponding element in the second source 2844 /// vector. 2845 /// 2846 /// The EFLAGS register is updated as follows: \n 2847 /// If there is at least one pair of single-precision elements where the 2848 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2849 /// ZF flag is set to 1. \n 2850 /// If there is at least one pair of single-precision elements where the 2851 /// sign-bit of the first element is 0 and the sign-bit of the second element 2852 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2853 /// This intrinsic returns the value of the CF flag. 2854 /// 2855 /// \headerfile <x86intrin.h> 2856 /// 2857 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2858 /// 2859 /// \param __a 2860 /// A 256-bit vector of [8 x float]. 2861 /// \param __b 2862 /// A 256-bit vector of [8 x float]. 2863 /// \returns the CF flag. 2864 static __inline int __DEFAULT_FN_ATTRS 2865 _mm256_testc_ps(__m256 __a, __m256 __b) 2866 { 2867 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 2868 } 2869 2870 /// Given two 256-bit floating-point vectors of [8 x float], perform an 2871 /// element-by-element comparison of the single-precision elements in the 2872 /// first source vector and the corresponding elements in the second source 2873 /// vector. 2874 /// 2875 /// The EFLAGS register is updated as follows: \n 2876 /// If there is at least one pair of single-precision elements where the 2877 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2878 /// ZF flag is set to 1. \n 2879 /// If there is at least one pair of single-precision elements where the 2880 /// sign-bit of the first element is 0 and the sign-bit of the second element 2881 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2882 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2883 /// otherwise it returns 0. 2884 /// 2885 /// \headerfile <x86intrin.h> 2886 /// 2887 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2888 /// 2889 /// \param __a 2890 /// A 256-bit vector of [8 x float]. 2891 /// \param __b 2892 /// A 256-bit vector of [8 x float]. 2893 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2894 static __inline int __DEFAULT_FN_ATTRS 2895 _mm256_testnzc_ps(__m256 __a, __m256 __b) 2896 { 2897 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 2898 } 2899 2900 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison 2901 /// of the two source vectors. 2902 /// 2903 /// The EFLAGS register is updated as follows: \n 2904 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2905 /// is set to 0. 
Otherwise the ZF flag is set to 1. \n 2906 /// If there is at least one pair of bits where the bit from the first source 2907 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2908 /// is set to 0. Otherwise the CF flag is set to 1. \n 2909 /// This intrinsic returns the value of the ZF flag. 2910 /// 2911 /// \headerfile <x86intrin.h> 2912 /// 2913 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2914 /// 2915 /// \param __a 2916 /// A 256-bit integer vector. 2917 /// \param __b 2918 /// A 256-bit integer vector. 2919 /// \returns the ZF flag. 2920 static __inline int __DEFAULT_FN_ATTRS 2921 _mm256_testz_si256(__m256i __a, __m256i __b) 2922 { 2923 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 2924 } 2925 2926 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison 2927 /// of the two source vectors. 2928 /// 2929 /// The EFLAGS register is updated as follows: \n 2930 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2931 /// is set to 0. Otherwise the ZF flag is set to 1. \n 2932 /// If there is at least one pair of bits where the bit from the first source 2933 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2934 /// is set to 0. Otherwise the CF flag is set to 1. \n 2935 /// This intrinsic returns the value of the CF flag. 2936 /// 2937 /// \headerfile <x86intrin.h> 2938 /// 2939 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2940 /// 2941 /// \param __a 2942 /// A 256-bit integer vector. 2943 /// \param __b 2944 /// A 256-bit integer vector. 2945 /// \returns the CF flag. 2946 static __inline int __DEFAULT_FN_ATTRS 2947 _mm256_testc_si256(__m256i __a, __m256i __b) 2948 { 2949 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 2950 } 2951 2952 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison 2953 /// of the two source vectors. 
2954 /// 2955 /// The EFLAGS register is updated as follows: \n 2956 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2957 /// is set to 0. Otherwise the ZF flag is set to 1. \n 2958 /// If there is at least one pair of bits where the bit from the first source 2959 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2960 /// is set to 0. Otherwise the CF flag is set to 1. \n 2961 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2962 /// otherwise it returns 0. 2963 /// 2964 /// \headerfile <x86intrin.h> 2965 /// 2966 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2967 /// 2968 /// \param __a 2969 /// A 256-bit integer vector. 2970 /// \param __b 2971 /// A 256-bit integer vector. 2972 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2973 static __inline int __DEFAULT_FN_ATTRS 2974 _mm256_testnzc_si256(__m256i __a, __m256i __b) 2975 { 2976 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 2977 } 2978 2979 /* Vector extract sign mask */ 2980 /// Extracts the sign bits of double-precision floating point elements 2981 /// in a 256-bit vector of [4 x double] and writes them to the lower order 2982 /// bits of the return value. 2983 /// 2984 /// \headerfile <x86intrin.h> 2985 /// 2986 /// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. 2987 /// 2988 /// \param __a 2989 /// A 256-bit vector of [4 x double] containing the double-precision 2990 /// floating point values with sign bits to be extracted. 2991 /// \returns The sign bits from the operand, written to bits [3:0]. 2992 static __inline int __DEFAULT_FN_ATTRS 2993 _mm256_movemask_pd(__m256d __a) 2994 { 2995 return __builtin_ia32_movmskpd256((__v4df)__a); 2996 } 2997 2998 /// Extracts the sign bits of single-precision floating point elements 2999 /// in a 256-bit vector of [8 x float] and writes them to the lower order 3000 /// bits of the return value. 
3001 /// 3002 /// \headerfile <x86intrin.h> 3003 /// 3004 /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. 3005 /// 3006 /// \param __a 3007 /// A 256-bit vector of [8 x float] containing the single-precision floating 3008 /// point values with sign bits to be extracted. 3009 /// \returns The sign bits from the operand, written to bits [7:0]. 3010 static __inline int __DEFAULT_FN_ATTRS 3011 _mm256_movemask_ps(__m256 __a) 3012 { 3013 return __builtin_ia32_movmskps256((__v8sf)__a); 3014 } 3015 3016 /* Vector __zero */ 3017 /// Zeroes the contents of all XMM or YMM registers. 3018 /// 3019 /// \headerfile <x86intrin.h> 3020 /// 3021 /// This intrinsic corresponds to the <c> VZEROALL </c> instruction. 3022 static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 3023 _mm256_zeroall(void) 3024 { 3025 __builtin_ia32_vzeroall(); 3026 } 3027 3028 /// Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 3029 /// 3030 /// \headerfile <x86intrin.h> 3031 /// 3032 /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. 3033 static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 3034 _mm256_zeroupper(void) 3035 { 3036 __builtin_ia32_vzeroupper(); 3037 } 3038 3039 /* Vector load with broadcast */ 3040 /// Loads a scalar single-precision floating point value from the 3041 /// specified address pointed to by \a __a and broadcasts it to the elements 3042 /// of a [4 x float] vector. 3043 /// 3044 /// \headerfile <x86intrin.h> 3045 /// 3046 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3047 /// 3048 /// \param __a 3049 /// The single-precision floating point value to be broadcast. 3050 /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set 3051 /// equal to the broadcast value. 
3052 static __inline __m128 __DEFAULT_FN_ATTRS128 3053 _mm_broadcast_ss(float const *__a) 3054 { 3055 struct __mm_broadcast_ss_struct { 3056 float __f; 3057 } __attribute__((__packed__, __may_alias__)); 3058 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f; 3059 return __extension__ (__m128){ __f, __f, __f, __f }; 3060 } 3061 3062 /// Loads a scalar double-precision floating point value from the 3063 /// specified address pointed to by \a __a and broadcasts it to the elements 3064 /// of a [4 x double] vector. 3065 /// 3066 /// \headerfile <x86intrin.h> 3067 /// 3068 /// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. 3069 /// 3070 /// \param __a 3071 /// The double-precision floating point value to be broadcast. 3072 /// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set 3073 /// equal to the broadcast value. 3074 static __inline __m256d __DEFAULT_FN_ATTRS 3075 _mm256_broadcast_sd(double const *__a) 3076 { 3077 struct __mm256_broadcast_sd_struct { 3078 double __d; 3079 } __attribute__((__packed__, __may_alias__)); 3080 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d; 3081 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d }; 3082 } 3083 3084 /// Loads a scalar single-precision floating point value from the 3085 /// specified address pointed to by \a __a and broadcasts it to the elements 3086 /// of a [8 x float] vector. 3087 /// 3088 /// \headerfile <x86intrin.h> 3089 /// 3090 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3091 /// 3092 /// \param __a 3093 /// The single-precision floating point value to be broadcast. 3094 /// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set 3095 /// equal to the broadcast value. 
3096 static __inline __m256 __DEFAULT_FN_ATTRS 3097 _mm256_broadcast_ss(float const *__a) 3098 { 3099 struct __mm256_broadcast_ss_struct { 3100 float __f; 3101 } __attribute__((__packed__, __may_alias__)); 3102 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f; 3103 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 3104 } 3105 3106 /// Loads the data from a 128-bit vector of [2 x double] from the 3107 /// specified address pointed to by \a __a and broadcasts it to 128-bit 3108 /// elements in a 256-bit vector of [4 x double]. 3109 /// 3110 /// \headerfile <x86intrin.h> 3111 /// 3112 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3113 /// 3114 /// \param __a 3115 /// The 128-bit vector of [2 x double] to be broadcast. 3116 /// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set 3117 /// equal to the broadcast value. 3118 static __inline __m256d __DEFAULT_FN_ATTRS 3119 _mm256_broadcast_pd(__m128d const *__a) 3120 { 3121 __m128d __b = _mm_loadu_pd((const double *)__a); 3122 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b, 3123 0, 1, 0, 1); 3124 } 3125 3126 /// Loads the data from a 128-bit vector of [4 x float] from the 3127 /// specified address pointed to by \a __a and broadcasts it to 128-bit 3128 /// elements in a 256-bit vector of [8 x float]. 3129 /// 3130 /// \headerfile <x86intrin.h> 3131 /// 3132 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3133 /// 3134 /// \param __a 3135 /// The 128-bit vector of [4 x float] to be broadcast. 3136 /// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set 3137 /// equal to the broadcast value. 
3138 static __inline __m256 __DEFAULT_FN_ATTRS 3139 _mm256_broadcast_ps(__m128 const *__a) 3140 { 3141 __m128 __b = _mm_loadu_ps((const float *)__a); 3142 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b, 3143 0, 1, 2, 3, 0, 1, 2, 3); 3144 } 3145 3146 /* SIMD load ops */ 3147 /// Loads 4 double-precision floating point values from a 32-byte aligned 3148 /// memory location pointed to by \a __p into a vector of [4 x double]. 3149 /// 3150 /// \headerfile <x86intrin.h> 3151 /// 3152 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3153 /// 3154 /// \param __p 3155 /// A 32-byte aligned pointer to a memory location containing 3156 /// double-precision floating point values. 3157 /// \returns A 256-bit vector of [4 x double] containing the moved values. 3158 static __inline __m256d __DEFAULT_FN_ATTRS 3159 _mm256_load_pd(double const *__p) 3160 { 3161 return *(const __m256d *)__p; 3162 } 3163 3164 /// Loads 8 single-precision floating point values from a 32-byte aligned 3165 /// memory location pointed to by \a __p into a vector of [8 x float]. 3166 /// 3167 /// \headerfile <x86intrin.h> 3168 /// 3169 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3170 /// 3171 /// \param __p 3172 /// A 32-byte aligned pointer to a memory location containing float values. 3173 /// \returns A 256-bit vector of [8 x float] containing the moved values. 3174 static __inline __m256 __DEFAULT_FN_ATTRS 3175 _mm256_load_ps(float const *__p) 3176 { 3177 return *(const __m256 *)__p; 3178 } 3179 3180 /// Loads 4 double-precision floating point values from an unaligned 3181 /// memory location pointed to by \a __p into a vector of [4 x double]. 3182 /// 3183 /// \headerfile <x86intrin.h> 3184 /// 3185 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3186 /// 3187 /// \param __p 3188 /// A pointer to a memory location containing double-precision floating 3189 /// point values. 
3190 /// \returns A 256-bit vector of [4 x double] containing the moved values. 3191 static __inline __m256d __DEFAULT_FN_ATTRS 3192 _mm256_loadu_pd(double const *__p) 3193 { 3194 struct __loadu_pd { 3195 __m256d_u __v; 3196 } __attribute__((__packed__, __may_alias__)); 3197 return ((const struct __loadu_pd*)__p)->__v; 3198 } 3199 3200 /// Loads 8 single-precision floating point values from an unaligned 3201 /// memory location pointed to by \a __p into a vector of [8 x float]. 3202 /// 3203 /// \headerfile <x86intrin.h> 3204 /// 3205 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3206 /// 3207 /// \param __p 3208 /// A pointer to a memory location containing single-precision floating 3209 /// point values. 3210 /// \returns A 256-bit vector of [8 x float] containing the moved values. 3211 static __inline __m256 __DEFAULT_FN_ATTRS 3212 _mm256_loadu_ps(float const *__p) 3213 { 3214 struct __loadu_ps { 3215 __m256_u __v; 3216 } __attribute__((__packed__, __may_alias__)); 3217 return ((const struct __loadu_ps*)__p)->__v; 3218 } 3219 3220 /// Loads 256 bits of integer data from a 32-byte aligned memory 3221 /// location pointed to by \a __p into elements of a 256-bit integer vector. 3222 /// 3223 /// \headerfile <x86intrin.h> 3224 /// 3225 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3226 /// 3227 /// \param __p 3228 /// A 32-byte aligned pointer to a 256-bit integer vector containing integer 3229 /// values. 3230 /// \returns A 256-bit integer vector containing the moved values. 3231 static __inline __m256i __DEFAULT_FN_ATTRS 3232 _mm256_load_si256(__m256i const *__p) 3233 { 3234 return *__p; 3235 } 3236 3237 /// Loads 256 bits of integer data from an unaligned memory location 3238 /// pointed to by \a __p into a 256-bit integer vector. 3239 /// 3240 /// \headerfile <x86intrin.h> 3241 /// 3242 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 
3243 /// 3244 /// \param __p 3245 /// A pointer to a 256-bit integer vector containing integer values. 3246 /// \returns A 256-bit integer vector containing the moved values. 3247 static __inline __m256i __DEFAULT_FN_ATTRS 3248 _mm256_loadu_si256(__m256i_u const *__p) 3249 { 3250 struct __loadu_si256 { 3251 __m256i_u __v; 3252 } __attribute__((__packed__, __may_alias__)); 3253 return ((const struct __loadu_si256*)__p)->__v; 3254 } 3255 3256 /// Loads 256 bits of integer data from an unaligned memory location 3257 /// pointed to by \a __p into a 256-bit integer vector. This intrinsic may 3258 /// perform better than \c _mm256_loadu_si256 when the data crosses a cache 3259 /// line boundary. 3260 /// 3261 /// \headerfile <x86intrin.h> 3262 /// 3263 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 3264 /// 3265 /// \param __p 3266 /// A pointer to a 256-bit integer vector containing integer values. 3267 /// \returns A 256-bit integer vector containing the moved values. 3268 static __inline __m256i __DEFAULT_FN_ATTRS 3269 _mm256_lddqu_si256(__m256i_u const *__p) 3270 { 3271 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 3272 } 3273 3274 /* SIMD store ops */ 3275 /// Stores double-precision floating point values from a 256-bit vector 3276 /// of [4 x double] to a 32-byte aligned memory location pointed to by 3277 /// \a __p. 3278 /// 3279 /// \headerfile <x86intrin.h> 3280 /// 3281 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3282 /// 3283 /// \param __p 3284 /// A 32-byte aligned pointer to a memory location that will receive the 3285 /// double-precision floaing point values. 3286 /// \param __a 3287 /// A 256-bit vector of [4 x double] containing the values to be moved. 
3288 static __inline void __DEFAULT_FN_ATTRS 3289 _mm256_store_pd(double *__p, __m256d __a) 3290 { 3291 *(__m256d *)__p = __a; 3292 } 3293 3294 /// Stores single-precision floating point values from a 256-bit vector 3295 /// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. 3296 /// 3297 /// \headerfile <x86intrin.h> 3298 /// 3299 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3300 /// 3301 /// \param __p 3302 /// A 32-byte aligned pointer to a memory location that will receive the 3303 /// float values. 3304 /// \param __a 3305 /// A 256-bit vector of [8 x float] containing the values to be moved. 3306 static __inline void __DEFAULT_FN_ATTRS 3307 _mm256_store_ps(float *__p, __m256 __a) 3308 { 3309 *(__m256 *)__p = __a; 3310 } 3311 3312 /// Stores double-precision floating point values from a 256-bit vector 3313 /// of [4 x double] to an unaligned memory location pointed to by \a __p. 3314 /// 3315 /// \headerfile <x86intrin.h> 3316 /// 3317 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3318 /// 3319 /// \param __p 3320 /// A pointer to a memory location that will receive the double-precision 3321 /// floating point values. 3322 /// \param __a 3323 /// A 256-bit vector of [4 x double] containing the values to be moved. 3324 static __inline void __DEFAULT_FN_ATTRS 3325 _mm256_storeu_pd(double *__p, __m256d __a) 3326 { 3327 struct __storeu_pd { 3328 __m256d_u __v; 3329 } __attribute__((__packed__, __may_alias__)); 3330 ((struct __storeu_pd*)__p)->__v = __a; 3331 } 3332 3333 /// Stores single-precision floating point values from a 256-bit vector 3334 /// of [8 x float] to an unaligned memory location pointed to by \a __p. 3335 /// 3336 /// \headerfile <x86intrin.h> 3337 /// 3338 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3339 /// 3340 /// \param __p 3341 /// A pointer to a memory location that will receive the float values. 
3342 /// \param __a 3343 /// A 256-bit vector of [8 x float] containing the values to be moved. 3344 static __inline void __DEFAULT_FN_ATTRS 3345 _mm256_storeu_ps(float *__p, __m256 __a) 3346 { 3347 struct __storeu_ps { 3348 __m256_u __v; 3349 } __attribute__((__packed__, __may_alias__)); 3350 ((struct __storeu_ps*)__p)->__v = __a; 3351 } 3352 3353 /// Stores integer values from a 256-bit integer vector to a 32-byte 3354 /// aligned memory location pointed to by \a __p. 3355 /// 3356 /// \headerfile <x86intrin.h> 3357 /// 3358 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3359 /// 3360 /// \param __p 3361 /// A 32-byte aligned pointer to a memory location that will receive the 3362 /// integer values. 3363 /// \param __a 3364 /// A 256-bit integer vector containing the values to be moved. 3365 static __inline void __DEFAULT_FN_ATTRS 3366 _mm256_store_si256(__m256i *__p, __m256i __a) 3367 { 3368 *__p = __a; 3369 } 3370 3371 /// Stores integer values from a 256-bit integer vector to an unaligned 3372 /// memory location pointed to by \a __p. 3373 /// 3374 /// \headerfile <x86intrin.h> 3375 /// 3376 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3377 /// 3378 /// \param __p 3379 /// A pointer to a memory location that will receive the integer values. 3380 /// \param __a 3381 /// A 256-bit integer vector containing the values to be moved. 3382 static __inline void __DEFAULT_FN_ATTRS 3383 _mm256_storeu_si256(__m256i_u *__p, __m256i __a) 3384 { 3385 struct __storeu_si256 { 3386 __m256i_u __v; 3387 } __attribute__((__packed__, __may_alias__)); 3388 ((struct __storeu_si256*)__p)->__v = __a; 3389 } 3390 3391 /* Conditional load ops */ 3392 /// Conditionally loads double-precision floating point elements from a 3393 /// memory location pointed to by \a __p into a 128-bit vector of 3394 /// [2 x double], depending on the mask bits associated with each data 3395 /// element. 
3396 /// 3397 /// \headerfile <x86intrin.h> 3398 /// 3399 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3400 /// 3401 /// \param __p 3402 /// A pointer to a memory location that contains the double-precision 3403 /// floating point values. 3404 /// \param __m 3405 /// A 128-bit integer vector containing the mask. The most significant bit of 3406 /// each data element represents the mask bits. If a mask bit is zero, the 3407 /// corresponding value in the memory location is not loaded and the 3408 /// corresponding field in the return value is set to zero. 3409 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 3410 static __inline __m128d __DEFAULT_FN_ATTRS128 3411 _mm_maskload_pd(double const *__p, __m128i __m) 3412 { 3413 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 3414 } 3415 3416 /// Conditionally loads double-precision floating point elements from a 3417 /// memory location pointed to by \a __p into a 256-bit vector of 3418 /// [4 x double], depending on the mask bits associated with each data 3419 /// element. 3420 /// 3421 /// \headerfile <x86intrin.h> 3422 /// 3423 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3424 /// 3425 /// \param __p 3426 /// A pointer to a memory location that contains the double-precision 3427 /// floating point values. 3428 /// \param __m 3429 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3430 /// significant bit of each quadword element represents the mask bits. If a 3431 /// mask bit is zero, the corresponding value in the memory location is not 3432 /// loaded and the corresponding field in the return value is set to zero. 3433 /// \returns A 256-bit vector of [4 x double] containing the loaded values. 
3434 static __inline __m256d __DEFAULT_FN_ATTRS 3435 _mm256_maskload_pd(double const *__p, __m256i __m) 3436 { 3437 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 3438 (__v4di)__m); 3439 } 3440 3441 /// Conditionally loads single-precision floating point elements from a 3442 /// memory location pointed to by \a __p into a 128-bit vector of 3443 /// [4 x float], depending on the mask bits associated with each data 3444 /// element. 3445 /// 3446 /// \headerfile <x86intrin.h> 3447 /// 3448 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3449 /// 3450 /// \param __p 3451 /// A pointer to a memory location that contains the single-precision 3452 /// floating point values. 3453 /// \param __m 3454 /// A 128-bit integer vector containing the mask. The most significant bit of 3455 /// each data element represents the mask bits. If a mask bit is zero, the 3456 /// corresponding value in the memory location is not loaded and the 3457 /// corresponding field in the return value is set to zero. 3458 /// \returns A 128-bit vector of [4 x float] containing the loaded values. 3459 static __inline __m128 __DEFAULT_FN_ATTRS128 3460 _mm_maskload_ps(float const *__p, __m128i __m) 3461 { 3462 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 3463 } 3464 3465 /// Conditionally loads single-precision floating point elements from a 3466 /// memory location pointed to by \a __p into a 256-bit vector of 3467 /// [8 x float], depending on the mask bits associated with each data 3468 /// element. 3469 /// 3470 /// \headerfile <x86intrin.h> 3471 /// 3472 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3473 /// 3474 /// \param __p 3475 /// A pointer to a memory location that contains the single-precision 3476 /// floating point values. 3477 /// \param __m 3478 /// A 256-bit integer vector of [8 x dword] containing the mask. 
The most 3479 /// significant bit of each dword element represents the mask bits. If a mask 3480 /// bit is zero, the corresponding value in the memory location is not loaded 3481 /// and the corresponding field in the return value is set to zero. 3482 /// \returns A 256-bit vector of [8 x float] containing the loaded values. 3483 static __inline __m256 __DEFAULT_FN_ATTRS 3484 _mm256_maskload_ps(float const *__p, __m256i __m) 3485 { 3486 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 3487 } 3488 3489 /* Conditional store ops */ 3490 /// Moves single-precision floating point values from a 256-bit vector 3491 /// of [8 x float] to a memory location pointed to by \a __p, according to 3492 /// the specified mask. 3493 /// 3494 /// \headerfile <x86intrin.h> 3495 /// 3496 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3497 /// 3498 /// \param __p 3499 /// A pointer to a memory location that will receive the float values. 3500 /// \param __m 3501 /// A 256-bit integer vector of [8 x dword] containing the mask. The most 3502 /// significant bit of each dword element in the mask vector represents the 3503 /// mask bits. If a mask bit is zero, the corresponding value from vector 3504 /// \a __a is not stored and the corresponding field in the memory location 3505 /// pointed to by \a __p is not changed. 3506 /// \param __a 3507 /// A 256-bit vector of [8 x float] containing the values to be stored. 3508 static __inline void __DEFAULT_FN_ATTRS 3509 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 3510 { 3511 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 3512 } 3513 3514 /// Moves double-precision values from a 128-bit vector of [2 x double] 3515 /// to a memory location pointed to by \a __p, according to the specified 3516 /// mask. 3517 /// 3518 /// \headerfile <x86intrin.h> 3519 /// 3520 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 
3521 /// 3522 /// \param __p 3523 /// A pointer to a memory location that will receive the float values. 3524 /// \param __m 3525 /// A 128-bit integer vector containing the mask. The most significant bit of 3526 /// each field in the mask vector represents the mask bits. If a mask bit is 3527 /// zero, the corresponding value from vector \a __a is not stored and the 3528 /// corresponding field in the memory location pointed to by \a __p is not 3529 /// changed. 3530 /// \param __a 3531 /// A 128-bit vector of [2 x double] containing the values to be stored. 3532 static __inline void __DEFAULT_FN_ATTRS128 3533 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 3534 { 3535 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 3536 } 3537 3538 /// Moves double-precision values from a 256-bit vector of [4 x double] 3539 /// to a memory location pointed to by \a __p, according to the specified 3540 /// mask. 3541 /// 3542 /// \headerfile <x86intrin.h> 3543 /// 3544 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3545 /// 3546 /// \param __p 3547 /// A pointer to a memory location that will receive the float values. 3548 /// \param __m 3549 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3550 /// significant bit of each quadword element in the mask vector represents 3551 /// the mask bits. If a mask bit is zero, the corresponding value from vector 3552 /// __a is not stored and the corresponding field in the memory location 3553 /// pointed to by \a __p is not changed. 3554 /// \param __a 3555 /// A 256-bit vector of [4 x double] containing the values to be stored. 
3556 static __inline void __DEFAULT_FN_ATTRS 3557 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 3558 { 3559 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 3560 } 3561 3562 /// Moves single-precision floating point values from a 128-bit vector 3563 /// of [4 x float] to a memory location pointed to by \a __p, according to 3564 /// the specified mask. 3565 /// 3566 /// \headerfile <x86intrin.h> 3567 /// 3568 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3569 /// 3570 /// \param __p 3571 /// A pointer to a memory location that will receive the float values. 3572 /// \param __m 3573 /// A 128-bit integer vector containing the mask. The most significant bit of 3574 /// each field in the mask vector represents the mask bits. If a mask bit is 3575 /// zero, the corresponding value from vector __a is not stored and the 3576 /// corresponding field in the memory location pointed to by \a __p is not 3577 /// changed. 3578 /// \param __a 3579 /// A 128-bit vector of [4 x float] containing the values to be stored. 3580 static __inline void __DEFAULT_FN_ATTRS128 3581 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 3582 { 3583 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 3584 } 3585 3586 /* Cacheability support ops */ 3587 /// Moves integer data from a 256-bit integer vector to a 32-byte 3588 /// aligned memory location. To minimize caching, the data is flagged as 3589 /// non-temporal (unlikely to be used again soon). 3590 /// 3591 /// \headerfile <x86intrin.h> 3592 /// 3593 /// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. 3594 /// 3595 /// \param __a 3596 /// A pointer to a 32-byte aligned memory location that will receive the 3597 /// integer values. 3598 /// \param __b 3599 /// A 256-bit integer vector containing the values to be moved. 
3600 static __inline void __DEFAULT_FN_ATTRS 3601 _mm256_stream_si256(void *__a, __m256i __b) 3602 { 3603 typedef __v4di __v4di_aligned __attribute__((aligned(32))); 3604 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); 3605 } 3606 3607 /// Moves double-precision values from a 256-bit vector of [4 x double] 3608 /// to a 32-byte aligned memory location. To minimize caching, the data is 3609 /// flagged as non-temporal (unlikely to be used again soon). 3610 /// 3611 /// \headerfile <x86intrin.h> 3612 /// 3613 /// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. 3614 /// 3615 /// \param __a 3616 /// A pointer to a 32-byte aligned memory location that will receive the 3617 /// double-precision floating-point values. 3618 /// \param __b 3619 /// A 256-bit vector of [4 x double] containing the values to be moved. 3620 static __inline void __DEFAULT_FN_ATTRS 3621 _mm256_stream_pd(void *__a, __m256d __b) 3622 { 3623 typedef __v4df __v4df_aligned __attribute__((aligned(32))); 3624 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); 3625 } 3626 3627 /// Moves single-precision floating point values from a 256-bit vector 3628 /// of [8 x float] to a 32-byte aligned memory location. To minimize 3629 /// caching, the data is flagged as non-temporal (unlikely to be used again 3630 /// soon). 3631 /// 3632 /// \headerfile <x86intrin.h> 3633 /// 3634 /// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction. 3635 /// 3636 /// \param __p 3637 /// A pointer to a 32-byte aligned memory location that will receive the 3638 /// single-precision floating point values. 3639 /// \param __a 3640 /// A 256-bit vector of [8 x float] containing the values to be moved. 
3641 static __inline void __DEFAULT_FN_ATTRS 3642 _mm256_stream_ps(void *__p, __m256 __a) 3643 { 3644 typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); 3645 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); 3646 } 3647 3648 /* Create vectors */ 3649 /// Create a 256-bit vector of [4 x double] with undefined values. 3650 /// 3651 /// \headerfile <x86intrin.h> 3652 /// 3653 /// This intrinsic has no corresponding instruction. 3654 /// 3655 /// \returns A 256-bit vector of [4 x double] containing undefined values. 3656 static __inline__ __m256d __DEFAULT_FN_ATTRS 3657 _mm256_undefined_pd(void) 3658 { 3659 return (__m256d)__builtin_ia32_undef256(); 3660 } 3661 3662 /// Create a 256-bit vector of [8 x float] with undefined values. 3663 /// 3664 /// \headerfile <x86intrin.h> 3665 /// 3666 /// This intrinsic has no corresponding instruction. 3667 /// 3668 /// \returns A 256-bit vector of [8 x float] containing undefined values. 3669 static __inline__ __m256 __DEFAULT_FN_ATTRS 3670 _mm256_undefined_ps(void) 3671 { 3672 return (__m256)__builtin_ia32_undef256(); 3673 } 3674 3675 /// Create a 256-bit integer vector with undefined values. 3676 /// 3677 /// \headerfile <x86intrin.h> 3678 /// 3679 /// This intrinsic has no corresponding instruction. 3680 /// 3681 /// \returns A 256-bit integer vector containing undefined values. 3682 static __inline__ __m256i __DEFAULT_FN_ATTRS 3683 _mm256_undefined_si256(void) 3684 { 3685 return (__m256i)__builtin_ia32_undef256(); 3686 } 3687 3688 /// Constructs a 256-bit floating-point vector of [4 x double] 3689 /// initialized with the specified double-precision floating-point values. 3690 /// 3691 /// \headerfile <x86intrin.h> 3692 /// 3693 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3694 /// instruction. 3695 /// 3696 /// \param __a 3697 /// A double-precision floating-point value used to initialize bits [255:192] 3698 /// of the result. 
/// \param __b
///    A double-precision floating-point value used to initialize bits [191:128]
///    of the result.
/// \param __c
///    A double-precision floating-point value used to initialize bits [127:64]
///    of the result.
/// \param __d
///    A double-precision floating-point value used to initialize bits [63:0]
///    of the result.
/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set_pd(double __a, double __b, double __c, double __d)
{
  /* The initializer lists elements from the lowest bits upward, so the
     arguments appear reversed: __d fills bits [63:0], __a bits [255:192].
     NOTE(review): the _CONSTEXPR attribute macro presumably makes this
     constexpr in C++; keep the body a single return statement. */
  return __extension__ (__m256d){ __d, __c, __b, __a };
}

/// Constructs a 256-bit floating-point vector of [8 x float] initialized
///    with the specified single-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///   instruction.
///
/// \param __a
///    A single-precision floating-point value used to initialize bits [255:224]
///    of the result.
/// \param __b
///    A single-precision floating-point value used to initialize bits [223:192]
///    of the result.
/// \param __c
///    A single-precision floating-point value used to initialize bits [191:160]
///    of the result.
/// \param __d
///    A single-precision floating-point value used to initialize bits [159:128]
///    of the result.
/// \param __e
///    A single-precision floating-point value used to initialize bits [127:96]
///    of the result.
/// \param __f
///    A single-precision floating-point value used to initialize bits [95:64]
///    of the result.
/// \param __g
///    A single-precision floating-point value used to initialize bits [63:32]
///    of the result.
/// \param __h
///    A single-precision floating-point value used to initialize bits [31:0]
///    of the result.
3747 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 3748 static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR 3749 _mm256_set_ps(float __a, float __b, float __c, float __d, 3750 float __e, float __f, float __g, float __h) 3751 { 3752 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 3753 } 3754 3755 /// Constructs a 256-bit integer vector initialized with the specified 3756 /// 32-bit integral values. 3757 /// 3758 /// \headerfile <x86intrin.h> 3759 /// 3760 /// This intrinsic is a utility function and does not correspond to a specific 3761 /// instruction. 3762 /// 3763 /// \param __i0 3764 /// A 32-bit integral value used to initialize bits [255:224] of the result. 3765 /// \param __i1 3766 /// A 32-bit integral value used to initialize bits [223:192] of the result. 3767 /// \param __i2 3768 /// A 32-bit integral value used to initialize bits [191:160] of the result. 3769 /// \param __i3 3770 /// A 32-bit integral value used to initialize bits [159:128] of the result. 3771 /// \param __i4 3772 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3773 /// \param __i5 3774 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3775 /// \param __i6 3776 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3777 /// \param __i7 3778 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3779 /// \returns An initialized 256-bit integer vector. 3780 static __inline __m256i __DEFAULT_FN_ATTRS 3781 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 3782 int __i4, int __i5, int __i6, int __i7) 3783 { 3784 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 3785 } 3786 3787 /// Constructs a 256-bit integer vector initialized with the specified 3788 /// 16-bit integral values. 
3789 /// 3790 /// \headerfile <x86intrin.h> 3791 /// 3792 /// This intrinsic is a utility function and does not correspond to a specific 3793 /// instruction. 3794 /// 3795 /// \param __w15 3796 /// A 16-bit integral value used to initialize bits [255:240] of the result. 3797 /// \param __w14 3798 /// A 16-bit integral value used to initialize bits [239:224] of the result. 3799 /// \param __w13 3800 /// A 16-bit integral value used to initialize bits [223:208] of the result. 3801 /// \param __w12 3802 /// A 16-bit integral value used to initialize bits [207:192] of the result. 3803 /// \param __w11 3804 /// A 16-bit integral value used to initialize bits [191:176] of the result. 3805 /// \param __w10 3806 /// A 16-bit integral value used to initialize bits [175:160] of the result. 3807 /// \param __w09 3808 /// A 16-bit integral value used to initialize bits [159:144] of the result. 3809 /// \param __w08 3810 /// A 16-bit integral value used to initialize bits [143:128] of the result. 3811 /// \param __w07 3812 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3813 /// \param __w06 3814 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3815 /// \param __w05 3816 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3817 /// \param __w04 3818 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3819 /// \param __w03 3820 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3821 /// \param __w02 3822 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3823 /// \param __w01 3824 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3825 /// \param __w00 3826 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3827 /// \returns An initialized 256-bit integer vector. 
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
                 short __w11, short __w10, short __w09, short __w08,
                 short __w07, short __w06, short __w05, short __w04,
                 short __w03, short __w02, short __w01, short __w00)
{
  /* The initializer lists elements from lowest to highest, so the parameters
     appear in reverse order: __w00 is element 0 and __w15 is element 15. */
  return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
}

/// Constructs a 256-bit integer vector initialized with the specified
/// 8-bit integral values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __b31
///    An 8-bit integral value used to initialize bits [255:248] of the result.
/// \param __b30
///    An 8-bit integral value used to initialize bits [247:240] of the result.
/// \param __b29
///    An 8-bit integral value used to initialize bits [239:232] of the result.
/// \param __b28
///    An 8-bit integral value used to initialize bits [231:224] of the result.
/// \param __b27
///    An 8-bit integral value used to initialize bits [223:216] of the result.
/// \param __b26
///    An 8-bit integral value used to initialize bits [215:208] of the result.
/// \param __b25
///    An 8-bit integral value used to initialize bits [207:200] of the result.
/// \param __b24
///    An 8-bit integral value used to initialize bits [199:192] of the result.
/// \param __b23
///    An 8-bit integral value used to initialize bits [191:184] of the result.
/// \param __b22
///    An 8-bit integral value used to initialize bits [183:176] of the result.
/// \param __b21
///    An 8-bit integral value used to initialize bits [175:168] of the result.
/// \param __b20
///    An 8-bit integral value used to initialize bits [167:160] of the result.
/// \param __b19
///    An 8-bit integral value used to initialize bits [159:152] of the result.
/// \param __b18
///    An 8-bit integral value used to initialize bits [151:144] of the result.
/// \param __b17
///    An 8-bit integral value used to initialize bits [143:136] of the result.
/// \param __b16
///    An 8-bit integral value used to initialize bits [135:128] of the result.
/// \param __b15
///    An 8-bit integral value used to initialize bits [127:120] of the result.
/// \param __b14
///    An 8-bit integral value used to initialize bits [119:112] of the result.
/// \param __b13
///    An 8-bit integral value used to initialize bits [111:104] of the result.
/// \param __b12
///    An 8-bit integral value used to initialize bits [103:96] of the result.
/// \param __b11
///    An 8-bit integral value used to initialize bits [95:88] of the result.
/// \param __b10
///    An 8-bit integral value used to initialize bits [87:80] of the result.
/// \param __b09
///    An 8-bit integral value used to initialize bits [79:72] of the result.
/// \param __b08
///    An 8-bit integral value used to initialize bits [71:64] of the result.
/// \param __b07
///    An 8-bit integral value used to initialize bits [63:56] of the result.
/// \param __b06
///    An 8-bit integral value used to initialize bits [55:48] of the result.
/// \param __b05
///    An 8-bit integral value used to initialize bits [47:40] of the result.
/// \param __b04
///    An 8-bit integral value used to initialize bits [39:32] of the result.
/// \param __b03
///    An 8-bit integral value used to initialize bits [31:24] of the result.
/// \param __b02
///    An 8-bit integral value used to initialize bits [23:16] of the result.
/// \param __b01
///    An 8-bit integral value used to initialize bits [15:8] of the result.
/// \param __b00
///    An 8-bit integral value used to initialize bits [7:0] of the result.
/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
                char __b27, char __b26, char __b25, char __b24,
                char __b23, char __b22, char __b21, char __b20,
                char __b19, char __b18, char __b17, char __b16,
                char __b15, char __b14, char __b13, char __b12,
                char __b11, char __b10, char __b09, char __b08,
                char __b07, char __b06, char __b05, char __b04,
                char __b03, char __b02, char __b01, char __b00)
{
  /* Element 0 (bits [7:0]) comes first in the initializer, hence the
     parameters are listed from __b00 up to __b31. */
  return __extension__ (__m256i)(__v32qi){
    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
  };
}

/// Constructs a 256-bit integer vector initialized with the specified
/// 64-bit integral values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
/// instruction.
///
/// \param __a
///    A 64-bit integral value used to initialize bits [255:192] of the result.
/// \param __b
///    A 64-bit integral value used to initialize bits [191:128] of the result.
/// \param __c
///    A 64-bit integral value used to initialize bits [127:64] of the result.
/// \param __d
///    A 64-bit integral value used to initialize bits [63:0] of the result.
/// \returns An initialized 256-bit integer vector.
3946 static __inline __m256i __DEFAULT_FN_ATTRS 3947 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 3948 { 3949 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a }; 3950 } 3951 3952 /* Create vectors with elements in reverse order */ 3953 /// Constructs a 256-bit floating-point vector of [4 x double], 3954 /// initialized in reverse order with the specified double-precision 3955 /// floating-point values. 3956 /// 3957 /// \headerfile <x86intrin.h> 3958 /// 3959 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3960 /// instruction. 3961 /// 3962 /// \param __a 3963 /// A double-precision floating-point value used to initialize bits [63:0] 3964 /// of the result. 3965 /// \param __b 3966 /// A double-precision floating-point value used to initialize bits [127:64] 3967 /// of the result. 3968 /// \param __c 3969 /// A double-precision floating-point value used to initialize bits [191:128] 3970 /// of the result. 3971 /// \param __d 3972 /// A double-precision floating-point value used to initialize bits [255:192] 3973 /// of the result. 3974 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 3975 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR 3976 _mm256_setr_pd(double __a, double __b, double __c, double __d) 3977 { 3978 return _mm256_set_pd(__d, __c, __b, __a); 3979 } 3980 3981 /// Constructs a 256-bit floating-point vector of [8 x float], 3982 /// initialized in reverse order with the specified single-precision 3983 /// float-point values. 3984 /// 3985 /// \headerfile <x86intrin.h> 3986 /// 3987 /// This intrinsic is a utility function and does not correspond to a specific 3988 /// instruction. 3989 /// 3990 /// \param __a 3991 /// A single-precision floating-point value used to initialize bits [31:0] 3992 /// of the result. 3993 /// \param __b 3994 /// A single-precision floating-point value used to initialize bits [63:32] 3995 /// of the result. 
3996 /// \param __c 3997 /// A single-precision floating-point value used to initialize bits [95:64] 3998 /// of the result. 3999 /// \param __d 4000 /// A single-precision floating-point value used to initialize bits [127:96] 4001 /// of the result. 4002 /// \param __e 4003 /// A single-precision floating-point value used to initialize bits [159:128] 4004 /// of the result. 4005 /// \param __f 4006 /// A single-precision floating-point value used to initialize bits [191:160] 4007 /// of the result. 4008 /// \param __g 4009 /// A single-precision floating-point value used to initialize bits [223:192] 4010 /// of the result. 4011 /// \param __h 4012 /// A single-precision floating-point value used to initialize bits [255:224] 4013 /// of the result. 4014 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 4015 static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR 4016 _mm256_setr_ps(float __a, float __b, float __c, float __d, 4017 float __e, float __f, float __g, float __h) 4018 { 4019 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a); 4020 } 4021 4022 /// Constructs a 256-bit integer vector, initialized in reverse order 4023 /// with the specified 32-bit integral values. 4024 /// 4025 /// \headerfile <x86intrin.h> 4026 /// 4027 /// This intrinsic is a utility function and does not correspond to a specific 4028 /// instruction. 4029 /// 4030 /// \param __i0 4031 /// A 32-bit integral value used to initialize bits [31:0] of the result. 4032 /// \param __i1 4033 /// A 32-bit integral value used to initialize bits [63:32] of the result. 4034 /// \param __i2 4035 /// A 32-bit integral value used to initialize bits [95:64] of the result. 4036 /// \param __i3 4037 /// A 32-bit integral value used to initialize bits [127:96] of the result. 4038 /// \param __i4 4039 /// A 32-bit integral value used to initialize bits [159:128] of the result. 4040 /// \param __i5 4041 /// A 32-bit integral value used to initialize bits [191:160] of the result. 
4042 /// \param __i6 4043 /// A 32-bit integral value used to initialize bits [223:192] of the result. 4044 /// \param __i7 4045 /// A 32-bit integral value used to initialize bits [255:224] of the result. 4046 /// \returns An initialized 256-bit integer vector. 4047 static __inline __m256i __DEFAULT_FN_ATTRS 4048 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 4049 int __i4, int __i5, int __i6, int __i7) 4050 { 4051 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); 4052 } 4053 4054 /// Constructs a 256-bit integer vector, initialized in reverse order 4055 /// with the specified 16-bit integral values. 4056 /// 4057 /// \headerfile <x86intrin.h> 4058 /// 4059 /// This intrinsic is a utility function and does not correspond to a specific 4060 /// instruction. 4061 /// 4062 /// \param __w15 4063 /// A 16-bit integral value used to initialize bits [15:0] of the result. 4064 /// \param __w14 4065 /// A 16-bit integral value used to initialize bits [31:16] of the result. 4066 /// \param __w13 4067 /// A 16-bit integral value used to initialize bits [47:32] of the result. 4068 /// \param __w12 4069 /// A 16-bit integral value used to initialize bits [63:48] of the result. 4070 /// \param __w11 4071 /// A 16-bit integral value used to initialize bits [79:64] of the result. 4072 /// \param __w10 4073 /// A 16-bit integral value used to initialize bits [95:80] of the result. 4074 /// \param __w09 4075 /// A 16-bit integral value used to initialize bits [111:96] of the result. 4076 /// \param __w08 4077 /// A 16-bit integral value used to initialize bits [127:112] of the result. 4078 /// \param __w07 4079 /// A 16-bit integral value used to initialize bits [143:128] of the result. 4080 /// \param __w06 4081 /// A 16-bit integral value used to initialize bits [159:144] of the result. 4082 /// \param __w05 4083 /// A 16-bit integral value used to initialize bits [175:160] of the result. 
4084 /// \param __w04 4085 /// A 16-bit integral value used to initialize bits [191:176] of the result. 4086 /// \param __w03 4087 /// A 16-bit integral value used to initialize bits [207:192] of the result. 4088 /// \param __w02 4089 /// A 16-bit integral value used to initialize bits [223:208] of the result. 4090 /// \param __w01 4091 /// A 16-bit integral value used to initialize bits [239:224] of the result. 4092 /// \param __w00 4093 /// A 16-bit integral value used to initialize bits [255:240] of the result. 4094 /// \returns An initialized 256-bit integer vector. 4095 static __inline __m256i __DEFAULT_FN_ATTRS 4096 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 4097 short __w11, short __w10, short __w09, short __w08, 4098 short __w07, short __w06, short __w05, short __w04, 4099 short __w03, short __w02, short __w01, short __w00) 4100 { 4101 return _mm256_set_epi16(__w00, __w01, __w02, __w03, 4102 __w04, __w05, __w06, __w07, 4103 __w08, __w09, __w10, __w11, 4104 __w12, __w13, __w14, __w15); 4105 } 4106 4107 /// Constructs a 256-bit integer vector, initialized in reverse order 4108 /// with the specified 8-bit integral values. 4109 /// 4110 /// \headerfile <x86intrin.h> 4111 /// 4112 /// This intrinsic is a utility function and does not correspond to a specific 4113 /// instruction. 4114 /// 4115 /// \param __b31 4116 /// An 8-bit integral value used to initialize bits [7:0] of the result. 4117 /// \param __b30 4118 /// An 8-bit integral value used to initialize bits [15:8] of the result. 4119 /// \param __b29 4120 /// An 8-bit integral value used to initialize bits [23:16] of the result. 4121 /// \param __b28 4122 /// An 8-bit integral value used to initialize bits [31:24] of the result. 4123 /// \param __b27 4124 /// An 8-bit integral value used to initialize bits [39:32] of the result. 4125 /// \param __b26 4126 /// An 8-bit integral value used to initialize bits [47:40] of the result. 
4127 /// \param __b25 4128 /// An 8-bit integral value used to initialize bits [55:48] of the result. 4129 /// \param __b24 4130 /// An 8-bit integral value used to initialize bits [63:56] of the result. 4131 /// \param __b23 4132 /// An 8-bit integral value used to initialize bits [71:64] of the result. 4133 /// \param __b22 4134 /// An 8-bit integral value used to initialize bits [79:72] of the result. 4135 /// \param __b21 4136 /// An 8-bit integral value used to initialize bits [87:80] of the result. 4137 /// \param __b20 4138 /// An 8-bit integral value used to initialize bits [95:88] of the result. 4139 /// \param __b19 4140 /// An 8-bit integral value used to initialize bits [103:96] of the result. 4141 /// \param __b18 4142 /// An 8-bit integral value used to initialize bits [111:104] of the result. 4143 /// \param __b17 4144 /// An 8-bit integral value used to initialize bits [119:112] of the result. 4145 /// \param __b16 4146 /// An 8-bit integral value used to initialize bits [127:120] of the result. 4147 /// \param __b15 4148 /// An 8-bit integral value used to initialize bits [135:128] of the result. 4149 /// \param __b14 4150 /// An 8-bit integral value used to initialize bits [143:136] of the result. 4151 /// \param __b13 4152 /// An 8-bit integral value used to initialize bits [151:144] of the result. 4153 /// \param __b12 4154 /// An 8-bit integral value used to initialize bits [159:152] of the result. 4155 /// \param __b11 4156 /// An 8-bit integral value used to initialize bits [167:160] of the result. 4157 /// \param __b10 4158 /// An 8-bit integral value used to initialize bits [175:168] of the result. 4159 /// \param __b09 4160 /// An 8-bit integral value used to initialize bits [183:176] of the result. 4161 /// \param __b08 4162 /// An 8-bit integral value used to initialize bits [191:184] of the result. 4163 /// \param __b07 4164 /// An 8-bit integral value used to initialize bits [199:192] of the result. 
4165 /// \param __b06 4166 /// An 8-bit integral value used to initialize bits [207:200] of the result. 4167 /// \param __b05 4168 /// An 8-bit integral value used to initialize bits [215:208] of the result. 4169 /// \param __b04 4170 /// An 8-bit integral value used to initialize bits [223:216] of the result. 4171 /// \param __b03 4172 /// An 8-bit integral value used to initialize bits [231:224] of the result. 4173 /// \param __b02 4174 /// An 8-bit integral value used to initialize bits [239:232] of the result. 4175 /// \param __b01 4176 /// An 8-bit integral value used to initialize bits [247:240] of the result. 4177 /// \param __b00 4178 /// An 8-bit integral value used to initialize bits [255:248] of the result. 4179 /// \returns An initialized 256-bit integer vector. 4180 static __inline __m256i __DEFAULT_FN_ATTRS 4181 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 4182 char __b27, char __b26, char __b25, char __b24, 4183 char __b23, char __b22, char __b21, char __b20, 4184 char __b19, char __b18, char __b17, char __b16, 4185 char __b15, char __b14, char __b13, char __b12, 4186 char __b11, char __b10, char __b09, char __b08, 4187 char __b07, char __b06, char __b05, char __b04, 4188 char __b03, char __b02, char __b01, char __b00) 4189 { 4190 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 4191 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 4192 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 4193 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31); 4194 } 4195 4196 /// Constructs a 256-bit integer vector, initialized in reverse order 4197 /// with the specified 64-bit integral values. 4198 /// 4199 /// \headerfile <x86intrin.h> 4200 /// 4201 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 4202 /// instruction. 4203 /// 4204 /// \param __a 4205 /// A 64-bit integral value used to initialize bits [63:0] of the result. 
4206 /// \param __b 4207 /// A 64-bit integral value used to initialize bits [127:64] of the result. 4208 /// \param __c 4209 /// A 64-bit integral value used to initialize bits [191:128] of the result. 4210 /// \param __d 4211 /// A 64-bit integral value used to initialize bits [255:192] of the result. 4212 /// \returns An initialized 256-bit integer vector. 4213 static __inline __m256i __DEFAULT_FN_ATTRS 4214 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 4215 { 4216 return _mm256_set_epi64x(__d, __c, __b, __a); 4217 } 4218 4219 /* Create vectors with repeated elements */ 4220 /// Constructs a 256-bit floating-point vector of [4 x double], with each 4221 /// of the four double-precision floating-point vector elements set to the 4222 /// specified double-precision floating-point value. 4223 /// 4224 /// \headerfile <x86intrin.h> 4225 /// 4226 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4227 /// 4228 /// \param __w 4229 /// A double-precision floating-point value used to initialize each vector 4230 /// element of the result. 4231 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 4232 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR 4233 _mm256_set1_pd(double __w) 4234 { 4235 return _mm256_set_pd(__w, __w, __w, __w); 4236 } 4237 4238 /// Constructs a 256-bit floating-point vector of [8 x float], with each 4239 /// of the eight single-precision floating-point vector elements set to the 4240 /// specified single-precision floating-point value. 4241 /// 4242 /// \headerfile <x86intrin.h> 4243 /// 4244 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4245 /// instruction. 4246 /// 4247 /// \param __w 4248 /// A single-precision floating-point value used to initialize each vector 4249 /// element of the result. 4250 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 
4251 static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR 4252 _mm256_set1_ps(float __w) 4253 { 4254 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w); 4255 } 4256 4257 /// Constructs a 256-bit integer vector of [8 x i32], with each of the 4258 /// 32-bit integral vector elements set to the specified 32-bit integral 4259 /// value. 4260 /// 4261 /// \headerfile <x86intrin.h> 4262 /// 4263 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4264 /// instruction. 4265 /// 4266 /// \param __i 4267 /// A 32-bit integral value used to initialize each vector element of the 4268 /// result. 4269 /// \returns An initialized 256-bit integer vector of [8 x i32]. 4270 static __inline __m256i __DEFAULT_FN_ATTRS 4271 _mm256_set1_epi32(int __i) 4272 { 4273 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i); 4274 } 4275 4276 /// Constructs a 256-bit integer vector of [16 x i16], with each of the 4277 /// 16-bit integral vector elements set to the specified 16-bit integral 4278 /// value. 4279 /// 4280 /// \headerfile <x86intrin.h> 4281 /// 4282 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4283 /// 4284 /// \param __w 4285 /// A 16-bit integral value used to initialize each vector element of the 4286 /// result. 4287 /// \returns An initialized 256-bit integer vector of [16 x i16]. 4288 static __inline __m256i __DEFAULT_FN_ATTRS 4289 _mm256_set1_epi16(short __w) 4290 { 4291 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w, 4292 __w, __w, __w, __w, __w, __w, __w, __w); 4293 } 4294 4295 /// Constructs a 256-bit integer vector of [32 x i8], with each of the 4296 /// 8-bit integral vector elements set to the specified 8-bit integral value. 4297 /// 4298 /// \headerfile <x86intrin.h> 4299 /// 4300 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4301 /// 4302 /// \param __b 4303 /// An 8-bit integral value used to initialize each vector element of the 4304 /// result. 
4305 /// \returns An initialized 256-bit integer vector of [32 x i8]. 4306 static __inline __m256i __DEFAULT_FN_ATTRS 4307 _mm256_set1_epi8(char __b) 4308 { 4309 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, 4310 __b, __b, __b, __b, __b, __b, __b, __b, 4311 __b, __b, __b, __b, __b, __b, __b, __b, 4312 __b, __b, __b, __b, __b, __b, __b, __b); 4313 } 4314 4315 /// Constructs a 256-bit integer vector of [4 x i64], with each of the 4316 /// 64-bit integral vector elements set to the specified 64-bit integral 4317 /// value. 4318 /// 4319 /// \headerfile <x86intrin.h> 4320 /// 4321 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4322 /// 4323 /// \param __q 4324 /// A 64-bit integral value used to initialize each vector element of the 4325 /// result. 4326 /// \returns An initialized 256-bit integer vector of [4 x i64]. 4327 static __inline __m256i __DEFAULT_FN_ATTRS 4328 _mm256_set1_epi64x(long long __q) 4329 { 4330 return _mm256_set_epi64x(__q, __q, __q, __q); 4331 } 4332 4333 /* Create __zeroed vectors */ 4334 /// Constructs a 256-bit floating-point vector of [4 x double] with all 4335 /// vector elements initialized to zero. 4336 /// 4337 /// \headerfile <x86intrin.h> 4338 /// 4339 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4340 /// 4341 /// \returns A 256-bit vector of [4 x double] with all elements set to zero. 4342 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void) { 4343 return __extension__(__m256d){0.0, 0.0, 0.0, 0.0}; 4344 } 4345 4346 /// Constructs a 256-bit floating-point vector of [8 x float] with all 4347 /// vector elements initialized to zero. 4348 /// 4349 /// \headerfile <x86intrin.h> 4350 /// 4351 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4352 /// 4353 /// \returns A 256-bit vector of [8 x float] with all elements set to zero. 
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void) {
  return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

/// Constructs a 256-bit integer vector initialized to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
///
/// \returns A 256-bit integer vector initialized to zero.
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_setzero_si256(void) {
  /* Four zero 64-bit elements cover all 256 bits of the result. */
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Cast between vector types */
/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
/// floating-point vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit floating-point vector of [4 x double].
/// \returns A 256-bit floating-point vector of [8 x float] containing the same
///    bitwise pattern as the parameter.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castpd_ps(__m256d __a)
{
  /* Cast between two 32-byte vector types: the bits are reinterpreted,
     no element values are converted. */
  return (__m256)__a;
}

/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
/// integer vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit floating-point vector of [4 x double].
/// \returns A 256-bit integer vector containing the same bitwise pattern as the
///    parameter.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castpd_si256(__m256d __a)
{
  /* Bit reinterpretation only; the doubles are not converted to integers. */
  return (__m256i)__a;
}

/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
/// floating-point vector of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit floating-point vector of [8 x float].
/// \returns A 256-bit floating-point vector of [4 x double] containing the same
///    bitwise pattern as the parameter.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castps_pd(__m256 __a)
{
  /* Bit reinterpretation only; the floats are not widened to doubles. */
  return (__m256d)__a;
}

/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
/// integer vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit floating-point vector of [8 x float].
/// \returns A 256-bit integer vector containing the same bitwise pattern as the
///    parameter.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castps_si256(__m256 __a)
{
  /* Bit reinterpretation only; the floats are not converted to integers. */
  return (__m256i)__a;
}

/// Casts a 256-bit integer vector into a 256-bit floating-point vector
/// of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 256-bit floating-point vector of [8 x float] containing the same
///    bitwise pattern as the parameter.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castsi256_ps(__m256i __a)
{
  /* Bit reinterpretation only; the integers are not converted to floats. */
  return (__m256)__a;
}

/// Casts a 256-bit integer vector into a 256-bit floating-point vector
/// of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 256-bit floating-point vector of [4 x double] containing the same
///    bitwise pattern as the parameter.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castsi256_pd(__m256i __a)
{
  /* Bit reinterpretation only; the integers are not converted to doubles. */
  return (__m256d)__a;
}

/// Returns the lower 128 bits of a 256-bit floating-point vector of
/// [4 x double] as a 128-bit floating-point vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit floating-point vector of [4 x double].
/// \returns A 128-bit floating-point vector of [2 x double] containing the
///    lower 128 bits of the parameter.
static __inline __m128d __DEFAULT_FN_ATTRS
_mm256_castpd256_pd128(__m256d __a)
{
  /* Indices 0 and 1 pick the two lowest doubles of __a; the second shuffle
     operand is never referenced. */
  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
}

/// Returns the lower 128 bits of a 256-bit floating-point vector of
/// [8 x float] as a 128-bit floating-point vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit floating-point vector of [8 x float].
/// \returns A 128-bit floating-point vector of [4 x float] containing the
///    lower 128 bits of the parameter.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm256_castps256_ps128(__m256 __a)
{
  /* Indices 0..3 pick the four lowest floats of __a; the second shuffle
     operand is never referenced. */
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
}

/// Truncates a 256-bit integer vector into a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 128-bit integer vector containing the lower 128 bits of the
///    parameter.
4517 static __inline __m128i __DEFAULT_FN_ATTRS 4518 _mm256_castsi256_si128(__m256i __a) 4519 { 4520 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 4521 } 4522 4523 /// Constructs a 256-bit floating-point vector of [4 x double] from a 4524 /// 128-bit floating-point vector of [2 x double]. 4525 /// 4526 /// The lower 128 bits contain the value of the source vector. The contents 4527 /// of the upper 128 bits are undefined. 4528 /// 4529 /// \headerfile <x86intrin.h> 4530 /// 4531 /// This intrinsic has no corresponding instruction. 4532 /// 4533 /// \param __a 4534 /// A 128-bit vector of [2 x double]. 4535 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4536 /// contain the value of the parameter. The contents of the upper 128 bits 4537 /// are undefined. 4538 static __inline __m256d __DEFAULT_FN_ATTRS 4539 _mm256_castpd128_pd256(__m128d __a) 4540 { 4541 return __builtin_shufflevector( 4542 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3); 4543 } 4544 4545 /// Constructs a 256-bit floating-point vector of [8 x float] from a 4546 /// 128-bit floating-point vector of [4 x float]. 4547 /// 4548 /// The lower 128 bits contain the value of the source vector. The contents 4549 /// of the upper 128 bits are undefined. 4550 /// 4551 /// \headerfile <x86intrin.h> 4552 /// 4553 /// This intrinsic has no corresponding instruction. 4554 /// 4555 /// \param __a 4556 /// A 128-bit vector of [4 x float]. 4557 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4558 /// contain the value of the parameter. The contents of the upper 128 bits 4559 /// are undefined. 4560 static __inline __m256 __DEFAULT_FN_ATTRS 4561 _mm256_castps128_ps256(__m128 __a) 4562 { 4563 return __builtin_shufflevector((__v4sf)__a, 4564 (__v4sf)__builtin_nondeterministic_value(__a), 4565 0, 1, 2, 3, 4, 5, 6, 7); 4566 } 4567 4568 /// Constructs a 256-bit integer vector from a 128-bit integer vector. 
4569 /// 4570 /// The lower 128 bits contain the value of the source vector. The contents 4571 /// of the upper 128 bits are undefined. 4572 /// 4573 /// \headerfile <x86intrin.h> 4574 /// 4575 /// This intrinsic has no corresponding instruction. 4576 /// 4577 /// \param __a 4578 /// A 128-bit integer vector. 4579 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4580 /// the parameter. The contents of the upper 128 bits are undefined. 4581 static __inline __m256i __DEFAULT_FN_ATTRS 4582 _mm256_castsi128_si256(__m128i __a) 4583 { 4584 return __builtin_shufflevector( 4585 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3); 4586 } 4587 4588 /// Constructs a 256-bit floating-point vector of [4 x double] from a 4589 /// 128-bit floating-point vector of [2 x double]. The lower 128 bits 4590 /// contain the value of the source vector. The upper 128 bits are set 4591 /// to zero. 4592 /// 4593 /// \headerfile <x86intrin.h> 4594 /// 4595 /// This intrinsic has no corresponding instruction. 4596 /// 4597 /// \param __a 4598 /// A 128-bit vector of [2 x double]. 4599 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4600 /// contain the value of the parameter. The upper 128 bits are set to zero. 4601 static __inline __m256d __DEFAULT_FN_ATTRS 4602 _mm256_zextpd128_pd256(__m128d __a) 4603 { 4604 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); 4605 } 4606 4607 /// Constructs a 256-bit floating-point vector of [8 x float] from a 4608 /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 4609 /// the value of the source vector. The upper 128 bits are set to zero. 4610 /// 4611 /// \headerfile <x86intrin.h> 4612 /// 4613 /// This intrinsic has no corresponding instruction. 4614 /// 4615 /// \param __a 4616 /// A 128-bit vector of [4 x float]. 4617 /// \returns A 256-bit floating-point vector of [8 x float]. 
The lower 128 bits 4618 /// contain the value of the parameter. The upper 128 bits are set to zero. 4619 static __inline __m256 __DEFAULT_FN_ATTRS 4620 _mm256_zextps128_ps256(__m128 __a) 4621 { 4622 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); 4623 } 4624 4625 /// Constructs a 256-bit integer vector from a 128-bit integer vector. 4626 /// The lower 128 bits contain the value of the source vector. The upper 4627 /// 128 bits are set to zero. 4628 /// 4629 /// \headerfile <x86intrin.h> 4630 /// 4631 /// This intrinsic has no corresponding instruction. 4632 /// 4633 /// \param __a 4634 /// A 128-bit integer vector. 4635 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4636 /// the parameter. The upper 128 bits are set to zero. 4637 static __inline __m256i __DEFAULT_FN_ATTRS 4638 _mm256_zextsi128_si256(__m128i __a) 4639 { 4640 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); 4641 } 4642 4643 /* 4644 Vector insert. 4645 We use macros rather than inlines because we only want to accept 4646 invocations where the immediate M is a constant expression. 4647 */ 4648 /// Constructs a new 256-bit vector of [8 x float] by first duplicating 4649 /// a 256-bit vector of [8 x float] given in the first parameter, and then 4650 /// replacing either the upper or the lower 128 bits with the contents of a 4651 /// 128-bit vector of [4 x float] in the second parameter. 4652 /// 4653 /// The immediate integer parameter determines between the upper or the lower 4654 /// 128 bits. 4655 /// 4656 /// \headerfile <x86intrin.h> 4657 /// 4658 /// \code 4659 /// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); 4660 /// \endcode 4661 /// 4662 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4663 /// 4664 /// \param V1 4665 /// A 256-bit vector of [8 x float]. 
This vector is copied to the result 4666 /// first, and then either the upper or the lower 128 bits of the result will 4667 /// be replaced by the contents of \a V2. 4668 /// \param V2 4669 /// A 128-bit vector of [4 x float]. The contents of this parameter are 4670 /// written to either the upper or the lower 128 bits of the result depending 4671 /// on the value of parameter \a M. 4672 /// \param M 4673 /// An immediate integer. The least significant bit determines how the values 4674 /// from the two parameters are interleaved: \n 4675 /// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4676 /// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4677 /// result. \n 4678 /// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4679 /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4680 /// result. 4681 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 4682 #define _mm256_insertf128_ps(V1, V2, M) \ 4683 ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \ 4684 (__v4sf)(__m128)(V2), (int)(M))) 4685 4686 /// Constructs a new 256-bit vector of [4 x double] by first duplicating 4687 /// a 256-bit vector of [4 x double] given in the first parameter, and then 4688 /// replacing either the upper or the lower 128 bits with the contents of a 4689 /// 128-bit vector of [2 x double] in the second parameter. 4690 /// 4691 /// The immediate integer parameter determines between the upper or the lower 4692 /// 128 bits. 4693 /// 4694 /// \headerfile <x86intrin.h> 4695 /// 4696 /// \code 4697 /// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M); 4698 /// \endcode 4699 /// 4700 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4701 /// 4702 /// \param V1 4703 /// A 256-bit vector of [4 x double]. 
///    This vector is copied to the result first, and then either the upper
///    or the lower 128 bits of the result will be replaced by the contents of
///    \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result,
///    depending on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256( \
      (__v4df)(__m256d)(V1), \
      (__v2df)(__m128d)(V2), \
      (int)(M)))

/// Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
///    This vector is copied to the result first, and then either the upper
///    or the lower 128 bits of the result will be replaced by the contents of
///    \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result, depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256( \
      (__v8si)(__m256i)(V1), \
      (__v4si)(__m128i)(V2), \
      (int)(M)))

/*
   Vector extract.
   These are macros rather than inline functions because only invocations
   whose immediate M is a constant expression should be accepted.
*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer.
///    The least significant bit determines which bits are extracted from the
///    first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256( \
      (__v8sf)(__m256)(V), \
      (int)(M)))

/// Extracts one 128-bit half of a 256-bit vector of [4 x double]. The
///    immediate integer parameter selects the upper or the lower half, which
///    is returned as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256( \
      (__v4df)(__m256d)(V), \
      (int)(M)))

/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
4818 /// 4819 /// \headerfile <x86intrin.h> 4820 /// 4821 /// \code 4822 /// __m128i _mm256_extractf128_si256(__m256i V, const int M); 4823 /// \endcode 4824 /// 4825 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. 4826 /// 4827 /// \param V 4828 /// A 256-bit integer vector. 4829 /// \param M 4830 /// An immediate integer. The least significant bit determines which bits are 4831 /// extracted from the first parameter: \n 4832 /// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the 4833 /// result. \n 4834 /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. 4835 /// \returns A 128-bit integer vector containing the extracted bits. 4836 #define _mm256_extractf128_si256(V, M) \ 4837 ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))) 4838 4839 /// Constructs a 256-bit floating-point vector of [8 x float] by 4840 /// concatenating two 128-bit floating-point vectors of [4 x float]. 4841 /// 4842 /// \headerfile <x86intrin.h> 4843 /// 4844 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4845 /// 4846 /// \param __hi 4847 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4848 /// 128 bits of the result. 4849 /// \param __lo 4850 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4851 /// 128 bits of the result. 4852 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4853 /// concatenated result. 4854 static __inline __m256 __DEFAULT_FN_ATTRS 4855 _mm256_set_m128 (__m128 __hi, __m128 __lo) 4856 { 4857 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 4858 } 4859 4860 /// Constructs a 256-bit floating-point vector of [4 x double] by 4861 /// concatenating two 128-bit floating-point vectors of [2 x double]. 4862 /// 4863 /// \headerfile <x86intrin.h> 4864 /// 4865 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 
4866 /// 4867 /// \param __hi 4868 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper 4869 /// 128 bits of the result. 4870 /// \param __lo 4871 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower 4872 /// 128 bits of the result. 4873 /// \returns A 256-bit floating-point vector of [4 x double] containing the 4874 /// concatenated result. 4875 static __inline __m256d __DEFAULT_FN_ATTRS 4876 _mm256_set_m128d (__m128d __hi, __m128d __lo) 4877 { 4878 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3); 4879 } 4880 4881 /// Constructs a 256-bit integer vector by concatenating two 128-bit 4882 /// integer vectors. 4883 /// 4884 /// \headerfile <x86intrin.h> 4885 /// 4886 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4887 /// 4888 /// \param __hi 4889 /// A 128-bit integer vector to be copied to the upper 128 bits of the 4890 /// result. 4891 /// \param __lo 4892 /// A 128-bit integer vector to be copied to the lower 128 bits of the 4893 /// result. 4894 /// \returns A 256-bit integer vector containing the concatenated result. 4895 static __inline __m256i __DEFAULT_FN_ATTRS 4896 _mm256_set_m128i (__m128i __hi, __m128i __lo) 4897 { 4898 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3); 4899 } 4900 4901 /// Constructs a 256-bit floating-point vector of [8 x float] by 4902 /// concatenating two 128-bit floating-point vectors of [4 x float]. This is 4903 /// similar to _mm256_set_m128, but the order of the input parameters is 4904 /// swapped. 4905 /// 4906 /// \headerfile <x86intrin.h> 4907 /// 4908 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4909 /// 4910 /// \param __lo 4911 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4912 /// 128 bits of the result. 
4913 /// \param __hi 4914 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4915 /// 128 bits of the result. 4916 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4917 /// concatenated result. 4918 static __inline __m256 __DEFAULT_FN_ATTRS 4919 _mm256_setr_m128 (__m128 __lo, __m128 __hi) 4920 { 4921 return _mm256_set_m128(__hi, __lo); 4922 } 4923 4924 /// Constructs a 256-bit floating-point vector of [4 x double] by 4925 /// concatenating two 128-bit floating-point vectors of [2 x double]. This is 4926 /// similar to _mm256_set_m128d, but the order of the input parameters is 4927 /// swapped. 4928 /// 4929 /// \headerfile <x86intrin.h> 4930 /// 4931 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4932 /// 4933 /// \param __lo 4934 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower 4935 /// 128 bits of the result. 4936 /// \param __hi 4937 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper 4938 /// 128 bits of the result. 4939 /// \returns A 256-bit floating-point vector of [4 x double] containing the 4940 /// concatenated result. 4941 static __inline __m256d __DEFAULT_FN_ATTRS 4942 _mm256_setr_m128d (__m128d __lo, __m128d __hi) 4943 { 4944 return (__m256d)_mm256_set_m128d(__hi, __lo); 4945 } 4946 4947 /// Constructs a 256-bit integer vector by concatenating two 128-bit 4948 /// integer vectors. This is similar to _mm256_set_m128i, but the order of 4949 /// the input parameters is swapped. 4950 /// 4951 /// \headerfile <x86intrin.h> 4952 /// 4953 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4954 /// 4955 /// \param __lo 4956 /// A 128-bit integer vector to be copied to the lower 128 bits of the 4957 /// result. 4958 /// \param __hi 4959 /// A 128-bit integer vector to be copied to the upper 128 bits of the 4960 /// result. 4961 /// \returns A 256-bit integer vector containing the concatenated result. 
4962 static __inline __m256i __DEFAULT_FN_ATTRS 4963 _mm256_setr_m128i (__m128i __lo, __m128i __hi) 4964 { 4965 return (__m256i)_mm256_set_m128i(__hi, __lo); 4966 } 4967 4968 /* SIMD load ops (unaligned) */ 4969 /// Loads two 128-bit floating-point vectors of [4 x float] from 4970 /// unaligned memory locations and constructs a 256-bit floating-point vector 4971 /// of [8 x float] by concatenating the two 128-bit vectors. 4972 /// 4973 /// \headerfile <x86intrin.h> 4974 /// 4975 /// This intrinsic corresponds to load instructions followed by the 4976 /// <c> VINSERTF128 </c> instruction. 4977 /// 4978 /// \param __addr_hi 4979 /// A pointer to a 128-bit memory location containing 4 consecutive 4980 /// single-precision floating-point values. These values are to be copied to 4981 /// bits[255:128] of the result. The address of the memory location does not 4982 /// have to be aligned. 4983 /// \param __addr_lo 4984 /// A pointer to a 128-bit memory location containing 4 consecutive 4985 /// single-precision floating-point values. These values are to be copied to 4986 /// bits[127:0] of the result. The address of the memory location does not 4987 /// have to be aligned. 4988 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4989 /// concatenated result. 4990 static __inline __m256 __DEFAULT_FN_ATTRS 4991 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 4992 { 4993 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo)); 4994 } 4995 4996 /// Loads two 128-bit floating-point vectors of [2 x double] from 4997 /// unaligned memory locations and constructs a 256-bit floating-point vector 4998 /// of [4 x double] by concatenating the two 128-bit vectors. 4999 /// 5000 /// \headerfile <x86intrin.h> 5001 /// 5002 /// This intrinsic corresponds to load instructions followed by the 5003 /// <c> VINSERTF128 </c> instruction. 
5004 /// 5005 /// \param __addr_hi 5006 /// A pointer to a 128-bit memory location containing two consecutive 5007 /// double-precision floating-point values. These values are to be copied to 5008 /// bits[255:128] of the result. The address of the memory location does not 5009 /// have to be aligned. 5010 /// \param __addr_lo 5011 /// A pointer to a 128-bit memory location containing two consecutive 5012 /// double-precision floating-point values. These values are to be copied to 5013 /// bits[127:0] of the result. The address of the memory location does not 5014 /// have to be aligned. 5015 /// \returns A 256-bit floating-point vector of [4 x double] containing the 5016 /// concatenated result. 5017 static __inline __m256d __DEFAULT_FN_ATTRS 5018 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 5019 { 5020 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo)); 5021 } 5022 5023 /// Loads two 128-bit integer vectors from unaligned memory locations and 5024 /// constructs a 256-bit integer vector by concatenating the two 128-bit 5025 /// vectors. 5026 /// 5027 /// \headerfile <x86intrin.h> 5028 /// 5029 /// This intrinsic corresponds to load instructions followed by the 5030 /// <c> VINSERTF128 </c> instruction. 5031 /// 5032 /// \param __addr_hi 5033 /// A pointer to a 128-bit memory location containing a 128-bit integer 5034 /// vector. This vector is to be copied to bits[255:128] of the result. The 5035 /// address of the memory location does not have to be aligned. 5036 /// \param __addr_lo 5037 /// A pointer to a 128-bit memory location containing a 128-bit integer 5038 /// vector. This vector is to be copied to bits[127:0] of the result. The 5039 /// address of the memory location does not have to be aligned. 5040 /// \returns A 256-bit integer vector containing the concatenated result. 
5041 static __inline __m256i __DEFAULT_FN_ATTRS 5042 _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo) 5043 { 5044 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo)); 5045 } 5046 5047 /* SIMD store ops (unaligned) */ 5048 /// Stores the upper and lower 128 bits of a 256-bit floating-point 5049 /// vector of [8 x float] into two different unaligned memory locations. 5050 /// 5051 /// \headerfile <x86intrin.h> 5052 /// 5053 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 5054 /// store instructions. 5055 /// 5056 /// \param __addr_hi 5057 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 5058 /// copied to this memory location. The address of this memory location does 5059 /// not have to be aligned. 5060 /// \param __addr_lo 5061 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 5062 /// copied to this memory location. The address of this memory location does 5063 /// not have to be aligned. 5064 /// \param __a 5065 /// A 256-bit floating-point vector of [8 x float]. 5066 static __inline void __DEFAULT_FN_ATTRS 5067 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 5068 { 5069 __m128 __v128; 5070 5071 __v128 = _mm256_castps256_ps128(__a); 5072 _mm_storeu_ps(__addr_lo, __v128); 5073 __v128 = _mm256_extractf128_ps(__a, 1); 5074 _mm_storeu_ps(__addr_hi, __v128); 5075 } 5076 5077 /// Stores the upper and lower 128 bits of a 256-bit floating-point 5078 /// vector of [4 x double] into two different unaligned memory locations. 5079 /// 5080 /// \headerfile <x86intrin.h> 5081 /// 5082 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 5083 /// store instructions. 5084 /// 5085 /// \param __addr_hi 5086 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 5087 /// copied to this memory location. 
The address of this memory location does 5088 /// not have to be aligned. 5089 /// \param __addr_lo 5090 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 5091 /// copied to this memory location. The address of this memory location does 5092 /// not have to be aligned. 5093 /// \param __a 5094 /// A 256-bit floating-point vector of [4 x double]. 5095 static __inline void __DEFAULT_FN_ATTRS 5096 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 5097 { 5098 __m128d __v128; 5099 5100 __v128 = _mm256_castpd256_pd128(__a); 5101 _mm_storeu_pd(__addr_lo, __v128); 5102 __v128 = _mm256_extractf128_pd(__a, 1); 5103 _mm_storeu_pd(__addr_hi, __v128); 5104 } 5105 5106 /// Stores the upper and lower 128 bits of a 256-bit integer vector into 5107 /// two different unaligned memory locations. 5108 /// 5109 /// \headerfile <x86intrin.h> 5110 /// 5111 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 5112 /// store instructions. 5113 /// 5114 /// \param __addr_hi 5115 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 5116 /// copied to this memory location. The address of this memory location does 5117 /// not have to be aligned. 5118 /// \param __addr_lo 5119 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 5120 /// copied to this memory location. The address of this memory location does 5121 /// not have to be aligned. 5122 /// \param __a 5123 /// A 256-bit integer vector. 
5124 static __inline void __DEFAULT_FN_ATTRS 5125 _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a) 5126 { 5127 __m128i __v128; 5128 5129 __v128 = _mm256_castsi256_si128(__a); 5130 _mm_storeu_si128(__addr_lo, __v128); 5131 __v128 = _mm256_extractf128_si256(__a, 1); 5132 _mm_storeu_si128(__addr_hi, __v128); 5133 } 5134 5135 #undef __DEFAULT_FN_ATTRS 5136 #undef __DEFAULT_FN_ATTRS_CONSTEXPR 5137 #undef __DEFAULT_FN_ATTRS128 5138 #undef __DEFAULT_FN_ATTRS128_CONSTEXPR 5139 5140 #endif /* __AVXINTRIN_H */ 5141