/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* This header is only meant to be reached through <immintrin.h>. */
#ifndef __IMMINTRIN_H
#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX2INTRIN_H
#define __AVX2INTRIN_H

/* Define the default attributes for the functions in this file.
 *
 * Functions tagged with these macros are always inlined, carry no debug
 * info, and are compiled for the "avx2" target. The 256/128 suffix is the
 * value passed to __min_vector_width__. When __EVEX512__ is defined without
 * __AVX10_1_512__, "no-evex512" is additionally appended to the target
 * string.
 */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
                 __min_vector_width__(128)))
#endif

/* SSE4 Multiple Packed Sums of Absolute Difference.  */
/// Computes sixteen sum-of-absolute-differences (SAD) operations on sets of
///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
///    \a Y.
///
///    Eight SAD results are computed using the lower half of the input
///    vectors, and another eight using the upper half. These 16-bit values
///    are returned in the lower and upper halves of the 256-bit result,
///    respectively.
///
///    A single SAD operation selects four bytes from \a X and four bytes from
///    \a Y as input.
///    It computes the differences between each \a X byte and
///    the corresponding \a Y byte, takes the absolute value of each
///    difference, and sums these four values to form one 16-bit result. The
///    intrinsic computes 16 of these results with different sets of input
///    bytes.
///
///    For each set of eight results, the SAD operations use the same four
///    bytes from \a Y; the starting bit position for these four bytes is
///    specified by \a M[1:0] times 32 for the lower half and by \a M[4:3]
///    times 32 for the upper half. The eight operations use successive
///    sets of four bytes from \a X, each set starting one byte after the
///    previous one; the starting bit position for the first set of four
///    bytes is specified by \a M[2] times 32 for the lower half and by
///    \a M[5] times 32 for the upper half. These bit positions are all
///    relative to the 128-bit lane for each set of eight operations.
///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VMPSADBW instruction.
///
/// \param X
///    A 256-bit integer vector containing one of the inputs.
/// \param Y
///    A 256-bit integer vector containing one of the inputs.
/// \param M
///    An unsigned immediate value specifying the starting positions of the
///    bytes to operate on.
/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Forwards to the 256-bit MPSADBW builtin: both vector operands are viewed as
   __v32qi and the control value M is passed as an int. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                      (__v32qi)(__m256i)(Y), (int)(M)))

/// Computes the absolute value of each signed byte in the 256-bit integer
///    vector \a __a and returns each value in the corresponding byte of
///    the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPABSB instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi8(__m256i __a)
{
  /* View the operand as __v32qs so that every byte is treated as signed. */
  return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
}

/// Computes the absolute value of each signed 16-bit element in the 256-bit
///    vector of [16 x i16] in \a __a and returns each value in the
///    corresponding element of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPABSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi16(__m256i __a)
{
  return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
}

/// Computes the absolute value of each signed 32-bit element in the 256-bit
///    vector of [8 x i32] in \a __a and returns each value in the
///    corresponding element of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPABSD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32].
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi32(__m256i __a)
{
  /* Element-wise absolute value over the [8 x i32] view of the operand. */
  return (__m256i)__builtin_elementwise_abs((__v8si)__a);
}

/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
///    integers using signed saturation, and returns the 256-bit result.
///
///    In the pseudo-code below, SATURATE8 denotes signed saturation of a
///    16-bit value to 8 bits.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   k := i*8
///   result[7+k:k] := SATURATE8(__a[15+j:j])
///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPACKSSWB instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
///    result[191:128].
/// \param __b
///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
///    result[255:192].
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
}

/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
///    integers using signed saturation, and returns the resulting 256-bit
///    vector of [16 x i16].
///
///    In the pseudo-code below, SATURATE16 denotes signed saturation of a
///    32-bit value to 16 bits.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := i*16
///   result[15+k:k] := SATURATE16(__a[31+j:j])
///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPACKSSDW instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
///    result[191:128].
/// \param __b
///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
///    result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packs_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
}

/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
///    using unsigned saturation, and returns the 256-bit result.
///
///    In the pseudo-code below, SATURATE8U denotes unsigned saturation to
///    8 bits.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   k := i*8
///   result[7+k:k] := SATURATE8U(__a[15+j:j])
///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPACKUSWB instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
///    result[191:128].
/// \param __b
///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
///    result[255:192].
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packus_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
}

/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
///    using unsigned saturation, and returns the resulting 256-bit vector of
///    [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := i*16
///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPACKUSDW instruction.
///
/// \param __V1
///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
///    result[191:128].
/// \param __V2
///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
///    result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packus_epi32(__m256i __V1, __m256i __V2)
{
  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
}

/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
///    vectors and returns the lower 8 bits of each sum in the corresponding
///    byte of the 256-bit integer vector result (overflow is ignored).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the source operands.
/// \param __b
///    A 256-bit integer vector containing one of the source operands.
/// \returns A 256-bit integer vector containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi8(__m256i __a, __m256i __b)
{
  /* Add through the unsigned view (__v32qu): unsigned arithmetic wraps, so
     the "overflow is ignored" contract holds without signed-overflow UB. */
  return (__m256i)((__v32qu)__a + (__v32qu)__b);
}

/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
///    [16 x i16] and returns the lower 16 bits of each sum in the
///    corresponding element of the [16 x i16] result (overflow is ignored).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi16(__m256i __a, __m256i __b)
{
  /* Unsigned element view (__v16hu) so that the addition wraps. */
  return (__m256i)((__v16hu)__a + (__v16hu)__b);
}

/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
///    element of the [8 x i32] result (overflow is ignored).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi32(__m256i __a, __m256i __b)
{
  /* Unsigned element view (__v8su) so that the addition wraps. */
  return (__m256i)((__v8su)__a + (__v8su)__b);
}

/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
///    element of the [4 x i64] result (overflow is ignored).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDQ instruction.
///
/// \param __a
///    A 256-bit vector of [4 x i64] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [4 x i64] containing one of the source operands.
/// \returns A 256-bit vector of [4 x i64] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi64(__m256i __a, __m256i __b)
{
  /* Unsigned element view (__v4du) so that the addition wraps. */
  return (__m256i)((__v4du)__a + (__v4du)__b);
}

/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
///    vectors using signed saturation, and returns each sum in the
///    corresponding byte of the 256-bit integer vector result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDSB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the source operands.
/// \param __b
///    A 256-bit integer vector containing one of the source operands.
/// \returns A 256-bit integer vector containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi8(__m256i __a, __m256i __b)
{
  /* The signed element type (__v32qs) is what makes the saturation signed. */
  return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
}

/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
}

/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
///    vectors using unsigned saturation, and returns each sum in the
///    corresponding byte of the 256-bit integer vector result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDUSB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the source operands.
/// \param __b
///    A 256-bit integer vector containing one of the source operands.
/// \returns A 256-bit integer vector containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu8(__m256i __a, __m256i __b)
{
  /* The unsigned element type (__v32qu) is what makes the saturation
     unsigned. */
  return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
}

/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDUSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu16(__m256i __a, __m256i __b)
{
  /* Unsigned element view (__v16hu) selects unsigned saturation. */
  return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
}

/// Uses the lower half of the 256-bit vector \a a as the upper half of a
///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
///    as the lower half of the temporary value. Right-shifts the temporary
///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
///    \a b to make another temporary value, right shifts by \a n, and uses
///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
///    result.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR instruction.
///
/// \param a
///    A 256-bit integer vector containing source values.
/// \param b
///    A 256-bit integer vector containing source values.
/// \param n
///    An immediate value specifying the number of bytes to shift.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                      (__v32qi)(__m256i)(b), (n)))

/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
///    \a __b.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPAND instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_and_si256(__m256i __a, __m256i __b)
{
  /* A bitwise operation gives the same bits for any element width; __v4du
     is simply the view used to spell the operator. */
  return (__m256i)((__v4du)__a & (__v4du)__b);
}

/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
///    the bitwise NOT of the 256-bit integer vector in \a __a.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPANDN instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_andnot_si256(__m256i __a, __m256i __b)
{
  /* Operand order matters: the first argument (__a) is the one that is
     complemented, i.e. the result is (~__a) & __b. */
  return (__m256i)(~(__v4du)__a & (__v4du)__b);
}

/// Computes the averages of the corresponding unsigned bytes in the two
///    256-bit integer vectors in \a __a and \a __b and returns each
///    average in the corresponding byte of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPAVGB instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
}

/// Computes the averages of the corresponding unsigned 16-bit integers in
///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
///    each average in the corresponding element of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPAVGW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16].
/// \param __b
///    A 256-bit vector of [16 x i16].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
}

/// Merges 8-bit integer values from either of the two 256-bit vectors
///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
///    the resulting 256-bit integer vector.
///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   IF __M[7+j] == 0
///     result[7+j:j] := __V1[7+j:j]
///   ELSE
///     result[7+j:j] := __V2[7+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPBLENDVB instruction.
///
/// \param __V1
///    A 256-bit integer vector containing source values.
/// \param __V2
///    A 256-bit integer vector containing source values.
/// \param __M
///    A 256-bit integer vector, with bit [7] of each byte specifying the
///    source for each corresponding byte of the result. When the mask bit
///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
///    \a __V2.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
{
  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
                                              (__v32qi)__M);
}

/// Merges 16-bit integer values from either of the two 256-bit vectors
///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
///    and returns the resulting 256-bit vector of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDW instruction.
///
/// \param V1
///    A 256-bit vector of [16 x i16] containing source values.
/// \param V2
///    A 256-bit vector of [16 x i16] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
///    elements 1 and 9, and so forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                      (__v16hi)(__m256i)(V2), (int)(M)))

/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
///    \a __b for equality and returns the outcomes in the corresponding
///    bytes of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPEQB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the inputs.
/// \param __b
///    A 256-bit integer vector containing one of the inputs.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
{
  /* Equality does not depend on element signedness, so the plain-char view
     (__v32qi) is sufficient here. */
  return (__m256i)((__v32qi)__a == (__v32qi)__b);
}

/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
///    \a __a and \a __b for equality and returns the outcomes in the
///    corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPEQW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the inputs.
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hi)__a == (__v16hi)__b);
}

/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
///    \a __a and \a __b for equality and returns the outcomes in the
///    corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPEQD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [8 x i32] containing one of the inputs.
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
{
  /* Element-wise == over the [8 x i32] views of both operands. */
  return (__m256i)((__v8si)__a == (__v8si)__b);
}

/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
///    \a __a and \a __b for equality and returns the outcomes in the
///    corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
///
/// \param __a
///    A 256-bit vector of [4 x i64] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [4 x i64] containing one of the inputs.
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4di)__a == (__v4di)__b);
}

/// Compares corresponding signed bytes in the 256-bit integer vectors in
///    \a __a and \a __b for greater-than and returns the outcomes in the
///    corresponding bytes of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the inputs.
/// \param __b
///    A 256-bit integer vector containing one of the inputs.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
{
  /* This function always performs a signed comparison, but __v32qi is a char
     which may be signed or unsigned, so use __v32qs. */
  return (__m256i)((__v32qs)__a > (__v32qs)__b);
}

/// Compares corresponding signed elements in the 256-bit vectors of
///    [16 x i16] in \a __a and \a __b for greater-than and returns the
///    outcomes in the corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the inputs.
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
{
  /* Element-wise > over the [16 x i16] views of both operands. */
  return (__m256i)((__v16hi)__a > (__v16hi)__b);
}

/// Compares corresponding signed elements in the 256-bit vectors of
///    [8 x i32] in \a __a and \a __b for greater-than and returns the
///    outcomes in the corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [8 x i32] containing one of the inputs.
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
{
  /* Element-wise > over the [8 x i32] views of both operands. */
  return (__m256i)((__v8si)__a > (__v8si)__b);
}

/// Compares corresponding signed elements in the 256-bit vectors of
///    [4 x i64] in \a __a and \a __b for greater-than and returns the
///    outcomes in the corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
///
/// \param __a
///    A 256-bit vector of [4 x i64] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [4 x i64] containing one of the inputs.
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4di)__a > (__v4di)__b);
}

/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
///    element of the [16 x i16] result (overflow is ignored). Sums from
///    \a __a are returned in the lower 64 bits of each 128-bit half of the
///    result; sums from \a __b are returned in the upper 64 bits of each
///    128-bit half of the result.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPHADDW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadd_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
}

/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
///    element of the [8 x i32] result (overflow is ignored). Sums from \a __a
///    are returned in the lower 64 bits of each 128-bit half of the result;
///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
///    of the result.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPHADDD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadd_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
}

/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
///    vectors of [16 x i16] using signed saturation and returns each sum in
///    an element of the [16 x i16] result. Sums from \a __a are returned in
///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
///    are returned in the upper 64 bits of each 128-bit half of the result.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPHADDSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadds_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
}

/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
///    vectors of [16 x i16] and returns the lower 16 bits of each difference
///    in an element of the [16 x i16] result (overflow is ignored).
///    Differences from \a __a are returned in the lower 64 bits of each
///    128-bit half of the result; differences from \a __b are returned in the
///    upper 64 bits of each 128-bit half of the result.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPHSUBW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the differences.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsub_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
}

/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
///    vectors of [8 x i32] and returns the lower 32 bits of each difference in
///    an element of the [8 x i32] result (overflow is ignored). Differences
///    from \a __a are returned in the lower 64 bits of each 128-bit half of
///    the result; differences from \a __b are returned in the upper 64 bits
///    of each 128-bit half of the result.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPHSUBD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the differences.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsub_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
}

/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
///    vectors of [16 x i16] using signed saturation and returns each
///    difference in an element of the [16 x i16] result. Differences from
///    \a __a are returned in the lower 64 bits of each 128-bit half of the
///    result; differences from \a __b are returned in the upper 64 bits of
///    each 128-bit half of the result.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPHSUBSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the differences.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsubs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
}

/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
///    with the corresponding signed byte from the 256-bit integer vector in
///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
///    pairs of those products using signed saturation to form 16-bit sums
///    returned as elements of the [16 x i16] result.
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   temp1 := __a[j+7:j] * __b[j+7:j]
///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
///   result[j+15:j] := SATURATE16(temp1 + temp2)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
///
/// \param __a
///    A 256-bit vector containing one of the source operands.
/// \param __b
///    A 256-bit vector containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maddubs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
}

/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
///    those products to form 32-bit sums returned as elements of the
///    [8 x i32] result.
///
///    There is only one wraparound case: when all four of the 16-bit sources
///    are \c 0x8000, the result will be \c 0x80000000.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   temp1 := __a[j+15:j] * __b[j+15:j]
///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
///   result[j+31:j] := temp1 + temp2
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMADDWD instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
}

/// Compares the corresponding signed bytes in the two 256-bit integer vectors
///     in \a __a and \a __b and returns the larger of each pair in the
///     corresponding byte of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMAXSB instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
}

/// Compares the corresponding signed 16-bit integers in the two 256-bit
///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
///    each pair in the corresponding element of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMAXSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16].
/// \param __b
///    A 256-bit vector of [16 x i16].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
}

/// Compares the corresponding signed 32-bit integers in the two 256-bit
///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
///    each pair in the corresponding element of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMAXSD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32].
/// \param __b
///    A 256-bit vector of [8 x i32].
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
}

/// Compares the corresponding unsigned bytes in the two 256-bit integer
///     vectors in \a __a and \a __b and returns the larger of each pair in
///     the corresponding byte of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMAXUB instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
}

/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
///    each pair in the corresponding element of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMAXUW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16].
/// \param __b
///    A 256-bit vector of [16 x i16].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
}

/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
///    each pair in the corresponding element of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMAXUD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32].
/// \param __b
///    A 256-bit vector of [8 x i32].
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
}

/// Compares the corresponding signed bytes in the two 256-bit integer vectors
///     in \a __a and \a __b and returns the smaller of each pair in the
///     corresponding byte of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMINSB instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
}

/// Compares the corresponding signed 16-bit integers in the two 256-bit
///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
///    each pair in the corresponding element of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMINSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16].
/// \param __b
///    A 256-bit vector of [16 x i16].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
}

/// Compares the corresponding signed 32-bit integers in the two 256-bit
///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
///    each pair in the corresponding element of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMINSD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32].
/// \param __b
///    A 256-bit vector of [8 x i32].
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
}

/// Compares the corresponding unsigned bytes in the two 256-bit integer
///     vectors in \a __a and \a __b and returns the smaller of each pair in
///     the corresponding byte of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMINUB instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
}

/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
///    each pair in the corresponding element of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMINUW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16].
/// \param __b
///    A 256-bit vector of [16 x i16].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
}

/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
///    each pair in the corresponding element of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMINUD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32].
/// \param __b
///    A 256-bit vector of [8 x i32].
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
}

/// Creates a 32-bit integer mask from the most significant bit of each byte
///    in the 256-bit integer vector in \a __a and returns the result.
///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   result[i] := __a[j+7]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
///
/// \param __a
///    A 256-bit integer vector containing the source bytes.
/// \returns The 32-bit integer mask.
static __inline__ int __DEFAULT_FN_ATTRS256
_mm256_movemask_epi8(__m256i __a)
{
  return __builtin_ia32_pmovmskb256((__v32qi)__a);
}

/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
///    the 16-bit values in the corresponding elements of a 256-bit vector
///    of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*8
///   k := i*16
///   result[k+15:k] := SignExtend(__V[j+7:j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
///
/// \param __V
///    A 128-bit integer vector containing the source bytes.
/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi8_epi16(__m128i __V)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
}

/// Sign-extends bytes from the lower half of the 128-bit integer vector in
///    \a __V and returns the 32-bit values in the corresponding elements of a
///    256-bit vector of [8 x i32].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*8
///   k := i*32
///   result[k+31:k] := SignExtend(__V[j+7:j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
///
/// \param __V
///    A 128-bit integer vector containing the source bytes.
/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi8_epi32(__m128i __V)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
}

/// Sign-extends the first four bytes from the 128-bit integer vector in
///    \a __V and returns the 64-bit values in the corresponding elements of a
///    256-bit vector of [4 x i64].
///
/// \code{.operation}
/// result[63:0] := SignExtend(__V[7:0])
/// result[127:64] := SignExtend(__V[15:8])
/// result[191:128] := SignExtend(__V[23:16])
/// result[255:192] := SignExtend(__V[31:24])
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
///
/// \param __V
///    A 128-bit integer vector containing the source bytes.
/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi8_epi64(__m128i __V)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
}

/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
///    \a __V and returns the 32-bit values in the corresponding elements of a
///    256-bit vector of [8 x i32].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   k := i*32
///   result[k+31:k] := SignExtend(__V[j+15:j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
///
/// \param __V
///    A 128-bit vector of [8 x i16] containing the source values.
/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_epi32(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
}

/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
///    elements of a 256-bit vector of [4 x i64].
///
/// \code{.operation}
/// result[63:0] := SignExtend(__V[15:0])
/// result[127:64] := SignExtend(__V[31:16])
/// result[191:128] := SignExtend(__V[47:32])
/// result[255:192] := SignExtend(__V[63:48])
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
///
/// \param __V
///    A 128-bit vector of [8 x i16] containing the source values.
/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
}

/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
///    \a __V and returns the 64-bit values in the corresponding elements of a
///    256-bit vector of [4 x i64].
///
/// \code{.operation}
/// result[63:0] := SignExtend(__V[31:0])
/// result[127:64] := SignExtend(__V[63:32])
/// result[191:128] := SignExtend(__V[95:64])
/// result[255:192] := SignExtend(__V[127:96])
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
///
/// \param __V
///    A 128-bit vector of [4 x i32] containing the source values.
/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi32_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
}

/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
///    the 16-bit values in the corresponding elements of a 256-bit vector
///    of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*8
///   k := i*16
///   result[k+15:k] := ZeroExtend(__V[j+7:j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
///
/// \param __V
///    A 128-bit integer vector containing the source bytes.
/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu8_epi16(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
}

/// Zero-extends bytes from the lower half of the 128-bit integer vector in
///    \a __V and returns the 32-bit values in the corresponding elements of a
///    256-bit vector of [8 x i32].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*8
///   k := i*32
///   result[k+31:k] := ZeroExtend(__V[j+7:j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
///
/// \param __V
///    A 128-bit integer vector containing the source bytes.
/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu8_epi32(__m128i __V)
{
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
}

/// Zero-extends the first four bytes from the 128-bit integer vector in
///    \a __V and returns the 64-bit values in the corresponding elements of a
///    256-bit vector of [4 x i64].
///
/// \code{.operation}
/// result[63:0] := ZeroExtend(__V[7:0])
/// result[127:64] := ZeroExtend(__V[15:8])
/// result[191:128] := ZeroExtend(__V[23:16])
/// result[255:192] := ZeroExtend(__V[31:24])
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
///
/// \param __V
///    A 128-bit integer vector containing the source bytes.
/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu8_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
}

/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
///    \a __V and returns the 32-bit values in the corresponding elements of a
///    256-bit vector of [8 x i32].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   k := i*32
///   result[k+31:k] := ZeroExtend(__V[j+15:j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
///
/// \param __V
///    A 128-bit vector of [8 x i16] containing the source values.
/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_epi32(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
}

/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
///    elements of a 256-bit vector of [4 x i64].
///
/// \code{.operation}
/// result[63:0] := ZeroExtend(__V[15:0])
/// result[127:64] := ZeroExtend(__V[31:16])
/// result[191:128] := ZeroExtend(__V[47:32])
/// result[255:192] := ZeroExtend(__V[63:48])
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
///
/// \param __V
///    A 128-bit vector of [8 x i16] containing the source values.
/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
}

/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
///    \a __V and returns the 64-bit values in the corresponding elements of a
///    256-bit vector of [4 x i64].
///
/// \code{.operation}
/// result[63:0] := ZeroExtend(__V[31:0])
/// result[127:64] := ZeroExtend(__V[63:32])
/// result[191:128] := ZeroExtend(__V[95:64])
/// result[255:192] := ZeroExtend(__V[127:96])
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
///
/// \param __V
///    A 128-bit vector of [4 x i32] containing the source values.
/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
///    values.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu32_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
}

/// Multiplies signed 32-bit integers from even-numbered elements of two
///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
///    [4 x i64] result.
///
/// \code{.operation}
/// result[63:0] := __a[31:0] * __b[31:0]
/// result[127:64] := __a[95:64] * __b[95:64]
/// result[191:128] := __a[159:128] * __b[159:128]
/// result[255:192] := __a[223:192] * __b[223:192]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMULDQ instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [4 x i64] containing the products.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mul_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
}

/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
///    [16 x i16], truncates the 32-bit results to the most significant 18
///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
///    product in the [16 x i16] result.
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
///   result[j+15:j] := temp[16:1]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMULHRSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
}

/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
///    [16 x i16] result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPMULHUW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the products.
1724 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1725 _mm256_mulhi_epu16(__m256i __a, __m256i __b) 1726 { 1727 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); 1728 } 1729 1730 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1731 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the 1732 /// [16 x i16] result. 1733 /// 1734 /// \headerfile <immintrin.h> 1735 /// 1736 /// This intrinsic corresponds to the \c VPMULHW instruction. 1737 /// 1738 /// \param __a 1739 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1740 /// \param __b 1741 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1742 /// \returns A 256-bit vector of [16 x i16] containing the products. 1743 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1744 _mm256_mulhi_epi16(__m256i __a, __m256i __b) 1745 { 1746 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); 1747 } 1748 1749 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1750 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the 1751 /// [16 x i16] result. 1752 /// 1753 /// \headerfile <immintrin.h> 1754 /// 1755 /// This intrinsic corresponds to the \c VPMULLW instruction. 1756 /// 1757 /// \param __a 1758 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1759 /// \param __b 1760 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1761 /// \returns A 256-bit vector of [16 x i16] containing the products. 1762 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1763 _mm256_mullo_epi16(__m256i __a, __m256i __b) 1764 { 1765 return (__m256i)((__v16hu)__a * (__v16hu)__b); 1766 } 1767 1768 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of 1769 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the 1770 /// [8 x i32] result. 
1771 /// 1772 /// \headerfile <immintrin.h> 1773 /// 1774 /// This intrinsic corresponds to the \c VPMULLD instruction. 1775 /// 1776 /// \param __a 1777 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1778 /// \param __b 1779 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1780 /// \returns A 256-bit vector of [8 x i32] containing the products. 1781 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1782 _mm256_mullo_epi32 (__m256i __a, __m256i __b) 1783 { 1784 return (__m256i)((__v8su)__a * (__v8su)__b); 1785 } 1786 1787 /// Multiplies unsigned 32-bit integers from even-numered elements of two 1788 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the 1789 /// [4 x i64] result. 1790 /// 1791 /// \code{.operation} 1792 /// result[63:0] := __a[31:0] * __b[31:0] 1793 /// result[127:64] := __a[95:64] * __b[95:64] 1794 /// result[191:128] := __a[159:128] * __b[159:128] 1795 /// result[255:192] := __a[223:192] * __b[223:192] 1796 /// \endcode 1797 /// 1798 /// \headerfile <immintrin.h> 1799 /// 1800 /// This intrinsic corresponds to the \c VPMULUDQ instruction. 1801 /// 1802 /// \param __a 1803 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1804 /// \param __b 1805 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1806 /// \returns A 256-bit vector of [4 x i64] containing the products. 1807 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1808 _mm256_mul_epu32(__m256i __a, __m256i __b) 1809 { 1810 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); 1811 } 1812 1813 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and 1814 /// \a __b. 1815 /// 1816 /// \headerfile <immintrin.h> 1817 /// 1818 /// This intrinsic corresponds to the \c VPOR instruction. 1819 /// 1820 /// \param __a 1821 /// A 256-bit integer vector. 1822 /// \param __b 1823 /// A 256-bit integer vector. 1824 /// \returns A 256-bit integer vector containing the result. 
1825 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1826 _mm256_or_si256(__m256i __a, __m256i __b) 1827 { 1828 return (__m256i)((__v4du)__a | (__v4du)__b); 1829 } 1830 1831 /// Computes four sum of absolute difference (SAD) operations on sets of eight 1832 /// unsigned 8-bit integers from the 256-bit integer vectors \a __a and 1833 /// \a __b. 1834 /// 1835 /// One SAD result is computed for each set of eight bytes from \a __a and 1836 /// eight bytes from \a __b. The zero-extended SAD value is returned in the 1837 /// corresponding 64-bit element of the result. 1838 /// 1839 /// A single SAD operation takes the differences between the corresponding 1840 /// bytes of \a __a and \a __b, takes the absolute value of each difference, 1841 /// and sums these eight values to form one 16-bit result. This operation 1842 /// is repeated four times with successive sets of eight bytes. 1843 /// 1844 /// \code{.operation} 1845 /// FOR i := 0 TO 3 1846 /// j := i*64 1847 /// temp0 := ABS(__a[j+7:j] - __b[j+7:j]) 1848 /// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8]) 1849 /// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16]) 1850 /// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24]) 1851 /// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32]) 1852 /// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40]) 1853 /// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48]) 1854 /// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56]) 1855 /// result[j+15:j] := temp0 + temp1 + temp2 + temp3 + 1856 /// temp4 + temp5 + temp6 + temp7 1857 /// result[j+63:j+16] := 0 1858 /// ENDFOR 1859 /// \endcode 1860 /// 1861 /// \headerfile <immintrin.h> 1862 /// 1863 /// This intrinsic corresponds to the \c VPSADBW instruction. 1864 /// 1865 /// \param __a 1866 /// A 256-bit integer vector. 1867 /// \param __b 1868 /// A 256-bit integer vector. 1869 /// \returns A 256-bit integer vector containing the result. 
1870 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1871 _mm256_sad_epu8(__m256i __a, __m256i __b) 1872 { 1873 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b); 1874 } 1875 1876 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according 1877 /// to control information in the 256-bit integer vector \a __b, and 1878 /// returns the 256-bit result. In effect there are two separate 128-bit 1879 /// shuffles in the lower and upper halves. 1880 /// 1881 /// \code{.operation} 1882 /// FOR i := 0 TO 31 1883 /// j := i*8 1884 /// IF __b[j+7] == 1 1885 /// result[j+7:j] := 0 1886 /// ELSE 1887 /// k := __b[j+3:j] * 8 1888 /// IF i > 15 1889 /// k := k + 128 1890 /// FI 1891 /// result[j+7:j] := __a[k+7:k] 1892 /// FI 1893 /// ENDFOR 1894 /// \endcode 1895 /// 1896 /// \headerfile <immintrin.h> 1897 /// 1898 /// This intrinsic corresponds to the \c VPSHUFB instruction. 1899 /// 1900 /// \param __a 1901 /// A 256-bit integer vector containing source values. 1902 /// \param __b 1903 /// A 256-bit integer vector containing control information to determine 1904 /// what goes into the corresponding byte of the result. If bit 7 of the 1905 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the 1906 /// control byte specify the index (within the same 128-bit half) of \a __a 1907 /// to copy to the result byte. 1908 /// \returns A 256-bit integer vector containing the result. 1909 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1910 _mm256_shuffle_epi8(__m256i __a, __m256i __b) 1911 { 1912 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); 1913 } 1914 1915 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a 1916 /// according to control information in the integer literal \a imm, and 1917 /// returns the 256-bit result. In effect there are two parallel 128-bit 1918 /// shuffles in the lower and upper halves. 
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))

/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))

/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[127:64] := a[127:64]
/// result[255:192] := a[255:192]
/// FOR i := 0 TO 3
///   j := i * 16
///   k := (imm >> i*2)[1:0] * 16
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] to use as a source of data for the
///    result.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
///    forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
2018 #define _mm256_shufflelo_epi16(a, imm) \ 2019 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) 2020 2021 /// Sets each byte of the result to the corresponding byte of the 256-bit 2022 /// integer vector in \a __a, the negative of that byte, or zero, depending 2023 /// on whether the corresponding byte of the 256-bit integer vector in 2024 /// \a __b is greater than zero, less than zero, or equal to zero, 2025 /// respectively. 2026 /// 2027 /// \headerfile <immintrin.h> 2028 /// 2029 /// This intrinsic corresponds to the \c VPSIGNB instruction. 2030 /// 2031 /// \param __a 2032 /// A 256-bit integer vector. 2033 /// \param __b 2034 /// A 256-bit integer vector]. 2035 /// \returns A 256-bit integer vector containing the result. 2036 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2037 _mm256_sign_epi8(__m256i __a, __m256i __b) 2038 { 2039 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); 2040 } 2041 2042 /// Sets each element of the result to the corresponding element of the 2043 /// 256-bit vector of [16 x i16] in \a __a, the negative of that element, 2044 /// or zero, depending on whether the corresponding element of the 256-bit 2045 /// vector of [16 x i16] in \a __b is greater than zero, less than zero, or 2046 /// equal to zero, respectively. 2047 /// 2048 /// \headerfile <immintrin.h> 2049 /// 2050 /// This intrinsic corresponds to the \c VPSIGNW instruction. 2051 /// 2052 /// \param __a 2053 /// A 256-bit vector of [16 x i16]. 2054 /// \param __b 2055 /// A 256-bit vector of [16 x i16]. 2056 /// \returns A 256-bit vector of [16 x i16] containing the result. 
2057 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2058 _mm256_sign_epi16(__m256i __a, __m256i __b) 2059 { 2060 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); 2061 } 2062 2063 /// Sets each element of the result to the corresponding element of the 2064 /// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or 2065 /// zero, depending on whether the corresponding element of the 256-bit 2066 /// vector of [8 x i32] in \a __b is greater than zero, less than zero, or 2067 /// equal to zero, respectively. 2068 /// 2069 /// \headerfile <immintrin.h> 2070 /// 2071 /// This intrinsic corresponds to the \c VPSIGND instruction. 2072 /// 2073 /// \param __a 2074 /// A 256-bit vector of [8 x i32]. 2075 /// \param __b 2076 /// A 256-bit vector of [8 x i32]. 2077 /// \returns A 256-bit vector of [8 x i32] containing the result. 2078 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2079 _mm256_sign_epi32(__m256i __a, __m256i __b) 2080 { 2081 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); 2082 } 2083 2084 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by 2085 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm 2086 /// is greater than 15, the returned result is all zeroes. 2087 /// 2088 /// \headerfile <immintrin.h> 2089 /// 2090 /// \code 2091 /// __m256i _mm256_slli_si256(__m256i a, const int imm); 2092 /// \endcode 2093 /// 2094 /// This intrinsic corresponds to the \c VPSLLDQ instruction. 2095 /// 2096 /// \param a 2097 /// A 256-bit integer vector to be shifted. 2098 /// \param imm 2099 /// An unsigned immediate value specifying the shift count (in bytes). 2100 /// \returns A 256-bit integer vector containing the result. 
2101 #define _mm256_slli_si256(a, imm) \ 2102 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 2103 2104 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by 2105 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm 2106 /// is greater than 15, the returned result is all zeroes. 2107 /// 2108 /// \headerfile <immintrin.h> 2109 /// 2110 /// \code 2111 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm); 2112 /// \endcode 2113 /// 2114 /// This intrinsic corresponds to the \c VPSLLDQ instruction. 2115 /// 2116 /// \param a 2117 /// A 256-bit integer vector to be shifted. 2118 /// \param imm 2119 /// An unsigned immediate value specifying the shift count (in bytes). 2120 /// \returns A 256-bit integer vector containing the result. 2121 #define _mm256_bslli_epi128(a, imm) \ 2122 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 2123 2124 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2125 /// left by \a __count bits, shifting in zero bits, and returns the result. 2126 /// If \a __count is greater than 15, the returned result is all zeroes. 2127 /// 2128 /// \headerfile <immintrin.h> 2129 /// 2130 /// This intrinsic corresponds to the \c VPSLLW instruction. 2131 /// 2132 /// \param __a 2133 /// A 256-bit vector of [16 x i16] to be shifted. 2134 /// \param __count 2135 /// An unsigned integer value specifying the shift count (in bits). 2136 /// \returns A 256-bit vector of [16 x i16] containing the result. 2137 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2138 _mm256_slli_epi16(__m256i __a, int __count) 2139 { 2140 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); 2141 } 2142 2143 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2144 /// left by the number of bits specified by the lower 64 bits of \a __count, 2145 /// shifting in zero bits, and returns the result. 
If \a __count is greater 2146 /// than 15, the returned result is all zeroes. 2147 /// 2148 /// \headerfile <immintrin.h> 2149 /// 2150 /// This intrinsic corresponds to the \c VPSLLW instruction. 2151 /// 2152 /// \param __a 2153 /// A 256-bit vector of [16 x i16] to be shifted. 2154 /// \param __count 2155 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2156 /// shift count (in bits). The upper element is ignored. 2157 /// \returns A 256-bit vector of [16 x i16] containing the result. 2158 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2159 _mm256_sll_epi16(__m256i __a, __m128i __count) 2160 { 2161 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); 2162 } 2163 2164 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2165 /// left by \a __count bits, shifting in zero bits, and returns the result. 2166 /// If \a __count is greater than 31, the returned result is all zeroes. 2167 /// 2168 /// \headerfile <immintrin.h> 2169 /// 2170 /// This intrinsic corresponds to the \c VPSLLD instruction. 2171 /// 2172 /// \param __a 2173 /// A 256-bit vector of [8 x i32] to be shifted. 2174 /// \param __count 2175 /// An unsigned integer value specifying the shift count (in bits). 2176 /// \returns A 256-bit vector of [8 x i32] containing the result. 2177 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2178 _mm256_slli_epi32(__m256i __a, int __count) 2179 { 2180 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); 2181 } 2182 2183 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2184 /// left by the number of bits given in the lower 64 bits of \a __count, 2185 /// shifting in zero bits, and returns the result. If \a __count is greater 2186 /// than 31, the returned result is all zeroes. 2187 /// 2188 /// \headerfile <immintrin.h> 2189 /// 2190 /// This intrinsic corresponds to the \c VPSLLD instruction. 
2191 /// 2192 /// \param __a 2193 /// A 256-bit vector of [8 x i32] to be shifted. 2194 /// \param __count 2195 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2196 /// shift count (in bits). The upper element is ignored. 2197 /// \returns A 256-bit vector of [8 x i32] containing the result. 2198 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2199 _mm256_sll_epi32(__m256i __a, __m128i __count) 2200 { 2201 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); 2202 } 2203 2204 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2205 /// left by \a __count bits, shifting in zero bits, and returns the result. 2206 /// If \a __count is greater than 63, the returned result is all zeroes. 2207 /// 2208 /// \headerfile <immintrin.h> 2209 /// 2210 /// This intrinsic corresponds to the \c VPSLLQ instruction. 2211 /// 2212 /// \param __a 2213 /// A 256-bit vector of [4 x i64] to be shifted. 2214 /// \param __count 2215 /// An unsigned integer value specifying the shift count (in bits). 2216 /// \returns A 256-bit vector of [4 x i64] containing the result. 2217 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2218 _mm256_slli_epi64(__m256i __a, int __count) 2219 { 2220 return __builtin_ia32_psllqi256((__v4di)__a, __count); 2221 } 2222 2223 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2224 /// left by the number of bits given in the lower 64 bits of \a __count, 2225 /// shifting in zero bits, and returns the result. If \a __count is greater 2226 /// than 63, the returned result is all zeroes. 2227 /// 2228 /// \headerfile <immintrin.h> 2229 /// 2230 /// This intrinsic corresponds to the \c VPSLLQ instruction. 2231 /// 2232 /// \param __a 2233 /// A 256-bit vector of [4 x i64] to be shifted. 2234 /// \param __count 2235 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2236 /// shift count (in bits). The upper element is ignored. 
2237 /// \returns A 256-bit vector of [4 x i64] containing the result. 2238 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2239 _mm256_sll_epi64(__m256i __a, __m128i __count) 2240 { 2241 return __builtin_ia32_psllq256((__v4di)__a, __count); 2242 } 2243 2244 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2245 /// right by \a __count bits, shifting in sign bits, and returns the result. 2246 /// If \a __count is greater than 15, each element of the result is either 2247 /// 0 or -1 according to the corresponding input sign bit. 2248 /// 2249 /// \headerfile <immintrin.h> 2250 /// 2251 /// This intrinsic corresponds to the \c VPSRAW instruction. 2252 /// 2253 /// \param __a 2254 /// A 256-bit vector of [16 x i16] to be shifted. 2255 /// \param __count 2256 /// An unsigned integer value specifying the shift count (in bits). 2257 /// \returns A 256-bit vector of [16 x i16] containing the result. 2258 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2259 _mm256_srai_epi16(__m256i __a, int __count) 2260 { 2261 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); 2262 } 2263 2264 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2265 /// right by the number of bits given in the lower 64 bits of \a __count, 2266 /// shifting in sign bits, and returns the result. If \a __count is greater 2267 /// than 15, each element of the result is either 0 or -1 according to the 2268 /// corresponding input sign bit. 2269 /// 2270 /// \headerfile <immintrin.h> 2271 /// 2272 /// This intrinsic corresponds to the \c VPSRAW instruction. 2273 /// 2274 /// \param __a 2275 /// A 256-bit vector of [16 x i16] to be shifted. 2276 /// \param __count 2277 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2278 /// shift count (in bits). The upper element is ignored. 2279 /// \returns A 256-bit vector of [16 x i16] containing the result. 
2280 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2281 _mm256_sra_epi16(__m256i __a, __m128i __count) 2282 { 2283 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); 2284 } 2285 2286 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2287 /// right by \a __count bits, shifting in sign bits, and returns the result. 2288 /// If \a __count is greater than 31, each element of the result is either 2289 /// 0 or -1 according to the corresponding input sign bit. 2290 /// 2291 /// \headerfile <immintrin.h> 2292 /// 2293 /// This intrinsic corresponds to the \c VPSRAD instruction. 2294 /// 2295 /// \param __a 2296 /// A 256-bit vector of [8 x i32] to be shifted. 2297 /// \param __count 2298 /// An unsigned integer value specifying the shift count (in bits). 2299 /// \returns A 256-bit vector of [8 x i32] containing the result. 2300 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2301 _mm256_srai_epi32(__m256i __a, int __count) 2302 { 2303 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); 2304 } 2305 2306 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2307 /// right by the number of bits given in the lower 64 bits of \a __count, 2308 /// shifting in sign bits, and returns the result. If \a __count is greater 2309 /// than 31, each element of the result is either 0 or -1 according to the 2310 /// corresponding input sign bit. 2311 /// 2312 /// \headerfile <immintrin.h> 2313 /// 2314 /// This intrinsic corresponds to the \c VPSRAD instruction. 2315 /// 2316 /// \param __a 2317 /// A 256-bit vector of [8 x i32] to be shifted. 2318 /// \param __count 2319 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2320 /// shift count (in bits). The upper element is ignored. 2321 /// \returns A 256-bit vector of [8 x i32] containing the result. 
2322 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2323 _mm256_sra_epi32(__m256i __a, __m128i __count) 2324 { 2325 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); 2326 } 2327 2328 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 2329 /// \a imm bytes, shifting in zero bytes, and returns the result. If 2330 /// \a imm is greater than 15, the returned result is all zeroes. 2331 /// 2332 /// \headerfile <immintrin.h> 2333 /// 2334 /// \code 2335 /// __m256i _mm256_srli_si256(__m256i a, const int imm); 2336 /// \endcode 2337 /// 2338 /// This intrinsic corresponds to the \c VPSRLDQ instruction. 2339 /// 2340 /// \param a 2341 /// A 256-bit integer vector to be shifted. 2342 /// \param imm 2343 /// An unsigned immediate value specifying the shift count (in bytes). 2344 /// \returns A 256-bit integer vector containing the result. 2345 #define _mm256_srli_si256(a, imm) \ 2346 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 2347 2348 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 2349 /// \a imm bytes, shifting in zero bytes, and returns the result. If 2350 /// \a imm is greater than 15, the returned result is all zeroes. 2351 /// 2352 /// \headerfile <immintrin.h> 2353 /// 2354 /// \code 2355 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm); 2356 /// \endcode 2357 /// 2358 /// This intrinsic corresponds to the \c VPSRLDQ instruction. 2359 /// 2360 /// \param a 2361 /// A 256-bit integer vector to be shifted. 2362 /// \param imm 2363 /// An unsigned immediate value specifying the shift count (in bytes). 2364 /// \returns A 256-bit integer vector containing the result. 2365 #define _mm256_bsrli_epi128(a, imm) \ 2366 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 2367 2368 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2369 /// right by \a __count bits, shifting in zero bits, and returns the result. 
2370 /// If \a __count is greater than 15, the returned result is all zeroes. 2371 /// 2372 /// \headerfile <immintrin.h> 2373 /// 2374 /// This intrinsic corresponds to the \c VPSRLW instruction. 2375 /// 2376 /// \param __a 2377 /// A 256-bit vector of [16 x i16] to be shifted. 2378 /// \param __count 2379 /// An unsigned integer value specifying the shift count (in bits). 2380 /// \returns A 256-bit vector of [16 x i16] containing the result. 2381 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2382 _mm256_srli_epi16(__m256i __a, int __count) 2383 { 2384 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); 2385 } 2386 2387 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2388 /// right by the number of bits given in the lower 64 bits of \a __count, 2389 /// shifting in zero bits, and returns the result. If \a __count is greater 2390 /// than 15, the returned result is all zeroes. 2391 /// 2392 /// \headerfile <immintrin.h> 2393 /// 2394 /// This intrinsic corresponds to the \c VPSRLW instruction. 2395 /// 2396 /// \param __a 2397 /// A 256-bit vector of [16 x i16] to be shifted. 2398 /// \param __count 2399 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2400 /// shift count (in bits). The upper element is ignored. 2401 /// \returns A 256-bit vector of [16 x i16] containing the result. 2402 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2403 _mm256_srl_epi16(__m256i __a, __m128i __count) 2404 { 2405 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); 2406 } 2407 2408 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2409 /// right by \a __count bits, shifting in zero bits, and returns the result. 2410 /// If \a __count is greater than 31, the returned result is all zeroes. 2411 /// 2412 /// \headerfile <immintrin.h> 2413 /// 2414 /// This intrinsic corresponds to the \c VPSRLD instruction. 
2415 /// 2416 /// \param __a 2417 /// A 256-bit vector of [8 x i32] to be shifted. 2418 /// \param __count 2419 /// An unsigned integer value specifying the shift count (in bits). 2420 /// \returns A 256-bit vector of [8 x i32] containing the result. 2421 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2422 _mm256_srli_epi32(__m256i __a, int __count) 2423 { 2424 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); 2425 } 2426 2427 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2428 /// right by the number of bits given in the lower 64 bits of \a __count, 2429 /// shifting in zero bits, and returns the result. If \a __count is greater 2430 /// than 31, the returned result is all zeroes. 2431 /// 2432 /// \headerfile <immintrin.h> 2433 /// 2434 /// This intrinsic corresponds to the \c VPSRLD instruction. 2435 /// 2436 /// \param __a 2437 /// A 256-bit vector of [8 x i32] to be shifted. 2438 /// \param __count 2439 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2440 /// shift count (in bits). The upper element is ignored. 2441 /// \returns A 256-bit vector of [8 x i32] containing the result. 2442 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2443 _mm256_srl_epi32(__m256i __a, __m128i __count) 2444 { 2445 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); 2446 } 2447 2448 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2449 /// right by \a __count bits, shifting in zero bits, and returns the result. 2450 /// If \a __count is greater than 63, the returned result is all zeroes. 2451 /// 2452 /// \headerfile <immintrin.h> 2453 /// 2454 /// This intrinsic corresponds to the \c VPSRLQ instruction. 2455 /// 2456 /// \param __a 2457 /// A 256-bit vector of [4 x i64] to be shifted. 2458 /// \param __count 2459 /// An unsigned integer value specifying the shift count (in bits). 2460 /// \returns A 256-bit vector of [4 x i64] containing the result. 
2461 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2462 _mm256_srli_epi64(__m256i __a, int __count) 2463 { 2464 return __builtin_ia32_psrlqi256((__v4di)__a, __count); 2465 } 2466 2467 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2468 /// right by the number of bits given in the lower 64 bits of \a __count, 2469 /// shifting in zero bits, and returns the result. If \a __count is greater 2470 /// than 63, the returned result is all zeroes. 2471 /// 2472 /// \headerfile <immintrin.h> 2473 /// 2474 /// This intrinsic corresponds to the \c VPSRLQ instruction. 2475 /// 2476 /// \param __a 2477 /// A 256-bit vector of [4 x i64] to be shifted. 2478 /// \param __count 2479 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2480 /// shift count (in bits). The upper element is ignored. 2481 /// \returns A 256-bit vector of [4 x i64] containing the result. 2482 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2483 _mm256_srl_epi64(__m256i __a, __m128i __count) 2484 { 2485 return __builtin_ia32_psrlq256((__v4di)__a, __count); 2486 } 2487 2488 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2489 /// vectors. Returns the lower 8 bits of each difference in the 2490 /// corresponding byte of the 256-bit integer vector result (overflow is 2491 /// ignored). 2492 /// 2493 /// \code{.operation} 2494 /// FOR i := 0 TO 31 2495 /// j := i*8 2496 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j] 2497 /// ENDFOR 2498 /// \endcode 2499 /// 2500 /// \headerfile <immintrin.h> 2501 /// 2502 /// This intrinsic corresponds to the \c VPSUBB instruction. 2503 /// 2504 /// \param __a 2505 /// A 256-bit integer vector containing the minuends. 2506 /// \param __b 2507 /// A 256-bit integer vector containing the subtrahends. 2508 /// \returns A 256-bit integer vector containing the differences. 
2509 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2510 _mm256_sub_epi8(__m256i __a, __m256i __b) 2511 { 2512 return (__m256i)((__v32qu)__a - (__v32qu)__b); 2513 } 2514 2515 /// Subtracts 16-bit integers from corresponding elements of two 256-bit 2516 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in 2517 /// the corresponding element of the [16 x i16] result (overflow is 2518 /// ignored). 2519 /// 2520 /// \code{.operation} 2521 /// FOR i := 0 TO 15 2522 /// j := i*16 2523 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j] 2524 /// ENDFOR 2525 /// \endcode 2526 /// 2527 /// \headerfile <immintrin.h> 2528 /// 2529 /// This intrinsic corresponds to the \c VPSUBW instruction. 2530 /// 2531 /// \param __a 2532 /// A 256-bit vector of [16 x i16] containing the minuends. 2533 /// \param __b 2534 /// A 256-bit vector of [16 x i16] containing the subtrahends. 2535 /// \returns A 256-bit vector of [16 x i16] containing the differences. 2536 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2537 _mm256_sub_epi16(__m256i __a, __m256i __b) 2538 { 2539 return (__m256i)((__v16hu)__a - (__v16hu)__b); 2540 } 2541 2542 /// Subtracts 32-bit integers from corresponding elements of two 256-bit 2543 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in 2544 /// the corresponding element of the [8 x i32] result (overflow is ignored). 2545 /// 2546 /// \code{.operation} 2547 /// FOR i := 0 TO 7 2548 /// j := i*32 2549 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j] 2550 /// ENDFOR 2551 /// \endcode 2552 /// 2553 /// \headerfile <immintrin.h> 2554 /// 2555 /// This intrinsic corresponds to the \c VPSUBD instruction. 2556 /// 2557 /// \param __a 2558 /// A 256-bit vector of [8 x i32] containing the minuends. 2559 /// \param __b 2560 /// A 256-bit vector of [8 x i32] containing the subtrahends. 2561 /// \returns A 256-bit vector of [8 x i32] containing the differences. 
2562 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2563 _mm256_sub_epi32(__m256i __a, __m256i __b) 2564 { 2565 return (__m256i)((__v8su)__a - (__v8su)__b); 2566 } 2567 2568 /// Subtracts 64-bit integers from corresponding elements of two 256-bit 2569 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in 2570 /// the corresponding element of the [4 x i64] result (overflow is ignored). 2571 /// 2572 /// \code{.operation} 2573 /// FOR i := 0 TO 3 2574 /// j := i*64 2575 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j] 2576 /// ENDFOR 2577 /// \endcode 2578 /// 2579 /// \headerfile <immintrin.h> 2580 /// 2581 /// This intrinsic corresponds to the \c VPSUBQ instruction. 2582 /// 2583 /// \param __a 2584 /// A 256-bit vector of [4 x i64] containing the minuends. 2585 /// \param __b 2586 /// A 256-bit vector of [4 x i64] containing the subtrahends. 2587 /// \returns A 256-bit vector of [4 x i64] containing the differences. 2588 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2589 _mm256_sub_epi64(__m256i __a, __m256i __b) 2590 { 2591 return (__m256i)((__v4du)__a - (__v4du)__b); 2592 } 2593 2594 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2595 /// vectors using signed saturation, and returns each differences in the 2596 /// corresponding byte of the 256-bit integer vector result. 2597 /// 2598 /// \code{.operation} 2599 /// FOR i := 0 TO 31 2600 /// j := i*8 2601 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j]) 2602 /// ENDFOR 2603 /// \endcode 2604 /// 2605 /// \headerfile <immintrin.h> 2606 /// 2607 /// This intrinsic corresponds to the \c VPSUBSB instruction. 2608 /// 2609 /// \param __a 2610 /// A 256-bit integer vector containing the minuends. 2611 /// \param __b 2612 /// A 256-bit integer vector containing the subtrahends. 2613 /// \returns A 256-bit integer vector containing the differences. 
2614 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2615 _mm256_subs_epi8(__m256i __a, __m256i __b) 2616 { 2617 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); 2618 } 2619 2620 /// Subtracts 16-bit integers from corresponding elements of two 256-bit 2621 /// vectors of [16 x i16] using signed saturation, and returns each 2622 /// difference in the corresponding element of the [16 x i16] result. 2623 /// 2624 /// \code{.operation} 2625 /// FOR i := 0 TO 15 2626 /// j := i*16 2627 /// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j]) 2628 /// ENDFOR 2629 /// \endcode 2630 /// 2631 /// \headerfile <immintrin.h> 2632 /// 2633 /// This intrinsic corresponds to the \c VPSUBSW instruction. 2634 /// 2635 /// \param __a 2636 /// A 256-bit vector of [16 x i16] containing the minuends. 2637 /// \param __b 2638 /// A 256-bit vector of [16 x i16] containing the subtrahends. 2639 /// \returns A 256-bit vector of [16 x i16] containing the differences. 2640 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2641 _mm256_subs_epi16(__m256i __a, __m256i __b) 2642 { 2643 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); 2644 } 2645 2646 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2647 /// vectors using unsigned saturation, and returns each difference in the 2648 /// corresponding byte of the 256-bit integer vector result. For each byte, 2649 /// computes <c> result = __a - __b </c>. 2650 /// 2651 /// \code{.operation} 2652 /// FOR i := 0 TO 31 2653 /// j := i*8 2654 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j]) 2655 /// ENDFOR 2656 /// \endcode 2657 /// 2658 /// \headerfile <immintrin.h> 2659 /// 2660 /// This intrinsic corresponds to the \c VPSUBUSB instruction. 2661 /// 2662 /// \param __a 2663 /// A 256-bit integer vector containing the minuends. 2664 /// \param __b 2665 /// A 256-bit integer vector containing the subtrahends. 
2666 /// \returns A 256-bit integer vector containing the differences. 2667 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2668 _mm256_subs_epu8(__m256i __a, __m256i __b) 2669 { 2670 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); 2671 } 2672 2673 /// Subtracts 16-bit integers from corresponding elements of two 256-bit 2674 /// vectors of [16 x i16] using unsigned saturation, and returns each 2675 /// difference in the corresponding element of the [16 x i16] result. 2676 /// 2677 /// \code{.operation} 2678 /// FOR i := 0 TO 15 2679 /// j := i*16 2680 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j]) 2681 /// ENDFOR 2682 /// \endcode 2683 /// 2684 /// \headerfile <immintrin.h> 2685 /// 2686 /// This intrinsic corresponds to the \c VPSUBUSW instruction. 2687 /// 2688 /// \param __a 2689 /// A 256-bit vector of [16 x i16] containing the minuends. 2690 /// \param __b 2691 /// A 256-bit vector of [16 x i16] containing the subtrahends. 2692 /// \returns A 256-bit vector of [16 x i16] containing the differences. 2693 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2694 _mm256_subs_epu16(__m256i __a, __m256i __b) 2695 { 2696 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b); 2697 } 2698 2699 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 2700 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically, 2701 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as 2702 /// input; other bits in these parameters are ignored. 2703 /// 2704 /// \code{.operation} 2705 /// result[7:0] := __a[71:64] 2706 /// result[15:8] := __b[71:64] 2707 /// result[23:16] := __a[79:72] 2708 /// result[31:24] := __b[79:72] 2709 /// . . . 2710 /// result[127:120] := __b[127:120] 2711 /// result[135:128] := __a[199:192] 2712 /// . . . 
2713 /// result[255:248] := __b[255:248] 2714 /// \endcode 2715 /// 2716 /// \headerfile <immintrin.h> 2717 /// 2718 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction. 2719 /// 2720 /// \param __a 2721 /// A 256-bit integer vector used as the source for the even-numbered bytes 2722 /// of the result. 2723 /// \param __b 2724 /// A 256-bit integer vector used as the source for the odd-numbered bytes 2725 /// of the result. 2726 /// \returns A 256-bit integer vector containing the result. 2727 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2728 _mm256_unpackhi_epi8(__m256i __a, __m256i __b) 2729 { 2730 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); 2731 } 2732 2733 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 2734 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 2735 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each 2736 /// 128-bit half of \a __a and \a __b as input; other bits in these 2737 /// parameters are ignored. 2738 /// 2739 /// \code{.operation} 2740 /// result[15:0] := __a[79:64] 2741 /// result[31:16] := __b[79:64] 2742 /// result[47:32] := __a[95:80] 2743 /// result[63:48] := __b[95:80] 2744 /// . . . 2745 /// result[127:112] := __b[127:112] 2746 /// result[143:128] := __a[211:196] 2747 /// . . . 2748 /// result[255:240] := __b[255:240] 2749 /// \endcode 2750 /// 2751 /// \headerfile <immintrin.h> 2752 /// 2753 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction. 2754 /// 2755 /// \param __a 2756 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered 2757 /// elements of the result. 2758 /// \param __b 2759 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 2760 /// elements of the result. 
2761 /// \returns A 256-bit vector of [16 x i16] containing the result. 2762 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2763 _mm256_unpackhi_epi16(__m256i __a, __m256i __b) 2764 { 2765 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 2766 } 2767 2768 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 2769 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 2770 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half 2771 /// of \a __a and \a __b as input; other bits in these parameters are 2772 /// ignored. 2773 /// 2774 /// \code{.operation} 2775 /// result[31:0] := __a[95:64] 2776 /// result[63:32] := __b[95:64] 2777 /// result[95:64] := __a[127:96] 2778 /// result[127:96] := __b[127:96] 2779 /// result[159:128] := __a[223:192] 2780 /// result[191:160] := __b[223:192] 2781 /// result[223:192] := __a[255:224] 2782 /// result[255:224] := __b[255:224] 2783 /// \endcode 2784 /// 2785 /// \headerfile <immintrin.h> 2786 /// 2787 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction. 2788 /// 2789 /// \param __a 2790 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered 2791 /// elements of the result. 2792 /// \param __b 2793 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 2794 /// elements of the result. 2795 /// \returns A 256-bit vector of [8 x i32] containing the result. 2796 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2797 _mm256_unpackhi_epi32(__m256i __a, __m256i __b) 2798 { 2799 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); 2800 } 2801 2802 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 2803 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 2804 /// of [4 x i64]. 
Specifically, uses the upper 64 bits of each 128-bit half 2805 /// of \a __a and \a __b as input; other bits in these parameters are 2806 /// ignored. 2807 /// 2808 /// \code{.operation} 2809 /// result[63:0] := __a[127:64] 2810 /// result[127:64] := __b[127:64] 2811 /// result[191:128] := __a[255:192] 2812 /// result[255:192] := __b[255:192] 2813 /// \endcode 2814 /// 2815 /// \headerfile <immintrin.h> 2816 /// 2817 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction. 2818 /// 2819 /// \param __a 2820 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered 2821 /// elements of the result. 2822 /// \param __b 2823 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 2824 /// elements of the result. 2825 /// \returns A 256-bit vector of [4 x i64] containing the result. 2826 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2827 _mm256_unpackhi_epi64(__m256i __a, __m256i __b) 2828 { 2829 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); 2830 } 2831 2832 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 2833 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically, 2834 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as 2835 /// input; other bits in these parameters are ignored. 2836 /// 2837 /// \code{.operation} 2838 /// result[7:0] := __a[7:0] 2839 /// result[15:8] := __b[7:0] 2840 /// result[23:16] := __a[15:8] 2841 /// result[31:24] := __b[15:8] 2842 /// . . . 2843 /// result[127:120] := __b[63:56] 2844 /// result[135:128] := __a[135:128] 2845 /// . . . 2846 /// result[255:248] := __b[191:184] 2847 /// \endcode 2848 /// 2849 /// \headerfile <immintrin.h> 2850 /// 2851 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction. 2852 /// 2853 /// \param __a 2854 /// A 256-bit integer vector used as the source for the even-numbered bytes 2855 /// of the result. 
2856 /// \param __b 2857 /// A 256-bit integer vector used as the source for the odd-numbered bytes 2858 /// of the result. 2859 /// \returns A 256-bit integer vector containing the result. 2860 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2861 _mm256_unpacklo_epi8(__m256i __a, __m256i __b) 2862 { 2863 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); 2864 } 2865 2866 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 2867 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 2868 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each 2869 /// 128-bit half of \a __a and \a __b as input; other bits in these 2870 /// parameters are ignored. 2871 /// 2872 /// \code{.operation} 2873 /// result[15:0] := __a[15:0] 2874 /// result[31:16] := __b[15:0] 2875 /// result[47:32] := __a[31:16] 2876 /// result[63:48] := __b[31:16] 2877 /// . . . 2878 /// result[127:112] := __b[63:48] 2879 /// result[143:128] := __a[143:128] 2880 /// . . . 2881 /// result[255:239] := __b[191:176] 2882 /// \endcode 2883 /// 2884 /// \headerfile <immintrin.h> 2885 /// 2886 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction. 2887 /// 2888 /// \param __a 2889 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered 2890 /// elements of the result. 2891 /// \param __b 2892 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 2893 /// elements of the result. 2894 /// \returns A 256-bit vector of [16 x i16] containing the result. 
2895 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2896 _mm256_unpacklo_epi16(__m256i __a, __m256i __b) 2897 { 2898 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); 2899 } 2900 2901 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 2902 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 2903 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half 2904 /// of \a __a and \a __b as input; other bits in these parameters are 2905 /// ignored. 2906 /// 2907 /// \code{.operation} 2908 /// result[31:0] := __a[31:0] 2909 /// result[63:32] := __b[31:0] 2910 /// result[95:64] := __a[63:32] 2911 /// result[127:96] := __b[63:32] 2912 /// result[159:128] := __a[159:128] 2913 /// result[191:160] := __b[159:128] 2914 /// result[223:192] := __a[191:160] 2915 /// result[255:224] := __b[191:190] 2916 /// \endcode 2917 /// 2918 /// \headerfile <immintrin.h> 2919 /// 2920 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction. 2921 /// 2922 /// \param __a 2923 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered 2924 /// elements of the result. 2925 /// \param __b 2926 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 2927 /// elements of the result. 2928 /// \returns A 256-bit vector of [8 x i32] containing the result. 2929 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2930 _mm256_unpacklo_epi32(__m256i __a, __m256i __b) 2931 { 2932 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); 2933 } 2934 2935 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 2936 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 2937 /// of [4 x i64]. 
Specifically, uses the lower 64 bits of each 128-bit half 2938 /// of \a __a and \a __b as input; other bits in these parameters are 2939 /// ignored. 2940 /// 2941 /// \code{.operation} 2942 /// result[63:0] := __a[63:0] 2943 /// result[127:64] := __b[63:0] 2944 /// result[191:128] := __a[191:128] 2945 /// result[255:192] := __b[191:128] 2946 /// \endcode 2947 /// 2948 /// \headerfile <immintrin.h> 2949 /// 2950 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction. 2951 /// 2952 /// \param __a 2953 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered 2954 /// elements of the result. 2955 /// \param __b 2956 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 2957 /// elements of the result. 2958 /// \returns A 256-bit vector of [4 x i64] containing the result. 2959 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2960 _mm256_unpacklo_epi64(__m256i __a, __m256i __b) 2961 { 2962 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); 2963 } 2964 2965 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and 2966 /// \a __b. 2967 /// 2968 /// \headerfile <immintrin.h> 2969 /// 2970 /// This intrinsic corresponds to the \c VPXOR instruction. 2971 /// 2972 /// \param __a 2973 /// A 256-bit integer vector. 2974 /// \param __b 2975 /// A 256-bit integer vector. 2976 /// \returns A 256-bit integer vector containing the result. 2977 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2978 _mm256_xor_si256(__m256i __a, __m256i __b) 2979 { 2980 return (__m256i)((__v4du)__a ^ (__v4du)__b); 2981 } 2982 2983 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal 2984 /// memory hint and returns the vector. \a __V must be aligned on a 32-byte 2985 /// boundary. 2986 /// 2987 /// \headerfile <immintrin.h> 2988 /// 2989 /// This intrinsic corresponds to the \c VMOVNTDQA instruction. 
2990 /// 2991 /// \param __V 2992 /// A pointer to the 32-byte aligned memory containing the vector to load. 2993 /// \returns A 256-bit integer vector loaded from memory. 2994 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2995 _mm256_stream_load_si256(const void *__V) 2996 { 2997 typedef __v4di __v4di_aligned __attribute__((aligned(32))); 2998 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); 2999 } 3000 3001 /// Broadcasts the 32-bit floating-point value from the low element of the 3002 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's 3003 /// 128-bit vector of [4 x float]. 3004 /// 3005 /// \headerfile <immintrin.h> 3006 /// 3007 /// This intrinsic corresponds to the \c VBROADCASTSS instruction. 3008 /// 3009 /// \param __X 3010 /// A 128-bit vector of [4 x float] whose low element will be broadcast. 3011 /// \returns A 128-bit vector of [4 x float] containing the result. 3012 static __inline__ __m128 __DEFAULT_FN_ATTRS128 3013 _mm_broadcastss_ps(__m128 __X) 3014 { 3015 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); 3016 } 3017 3018 /// Broadcasts the 64-bit floating-point value from the low element of the 3019 /// 128-bit vector of [2 x double] in \a __a to both elements of the 3020 /// result's 128-bit vector of [2 x double]. 3021 /// 3022 /// \headerfile <immintrin.h> 3023 /// 3024 /// This intrinsic corresponds to the \c MOVDDUP instruction. 3025 /// 3026 /// \param __a 3027 /// A 128-bit vector of [2 x double] whose low element will be broadcast. 3028 /// \returns A 128-bit vector of [2 x double] containing the result. 
3029 static __inline__ __m128d __DEFAULT_FN_ATTRS128 3030 _mm_broadcastsd_pd(__m128d __a) 3031 { 3032 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 3033 } 3034 3035 /// Broadcasts the 32-bit floating-point value from the low element of the 3036 /// 128-bit vector of [4 x float] in \a __X to all elements of the 3037 /// result's 256-bit vector of [8 x float]. 3038 /// 3039 /// \headerfile <immintrin.h> 3040 /// 3041 /// This intrinsic corresponds to the \c VBROADCASTSS instruction. 3042 /// 3043 /// \param __X 3044 /// A 128-bit vector of [4 x float] whose low element will be broadcast. 3045 /// \returns A 256-bit vector of [8 x float] containing the result. 3046 static __inline__ __m256 __DEFAULT_FN_ATTRS256 3047 _mm256_broadcastss_ps(__m128 __X) 3048 { 3049 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3050 } 3051 3052 /// Broadcasts the 64-bit floating-point value from the low element of the 3053 /// 128-bit vector of [2 x double] in \a __X to all elements of the 3054 /// result's 256-bit vector of [4 x double]. 3055 /// 3056 /// \headerfile <immintrin.h> 3057 /// 3058 /// This intrinsic corresponds to the \c VBROADCASTSD instruction. 3059 /// 3060 /// \param __X 3061 /// A 128-bit vector of [2 x double] whose low element will be broadcast. 3062 /// \returns A 256-bit vector of [4 x double] containing the result. 3063 static __inline__ __m256d __DEFAULT_FN_ATTRS256 3064 _mm256_broadcastsd_pd(__m128d __X) 3065 { 3066 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); 3067 } 3068 3069 /// Broadcasts the 128-bit integer data from \a __X to both the lower and 3070 /// upper halves of the 256-bit result. 3071 /// 3072 /// \headerfile <immintrin.h> 3073 /// 3074 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction. 3075 /// 3076 /// \param __X 3077 /// A 128-bit integer vector to be broadcast. 3078 /// \returns A 256-bit integer vector containing the result. 
3079 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3080 _mm256_broadcastsi128_si256(__m128i __X) 3081 { 3082 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); 3083 } 3084 3085 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) 3086 3087 /// Merges 32-bit integer elements from either of the two 128-bit vectors of 3088 /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32], 3089 /// as specified by the immediate integer operand \a M. 3090 /// 3091 /// \code{.operation} 3092 /// FOR i := 0 TO 3 3093 /// j := i*32 3094 /// IF M[i] == 0 3095 /// result[31+j:j] := V1[31+j:j] 3096 /// ELSE 3097 /// result[31+j:j] := V2[32+j:j] 3098 /// FI 3099 /// ENDFOR 3100 /// \endcode 3101 /// 3102 /// \headerfile <immintrin.h> 3103 /// 3104 /// \code 3105 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M); 3106 /// \endcode 3107 /// 3108 /// This intrinsic corresponds to the \c VPBLENDDD instruction. 3109 /// 3110 /// \param V1 3111 /// A 128-bit vector of [4 x i32] containing source values. 3112 /// \param V2 3113 /// A 128-bit vector of [4 x i32] containing source values. 3114 /// \param M 3115 /// An immediate 8-bit integer operand, with bits [3:0] specifying the 3116 /// source for each element of the result. The position of the mask bit 3117 /// corresponds to the index of a copied value. When a mask bit is 0, the 3118 /// element is copied from \a V1; otherwise, it is copied from \a V2. 3119 /// \returns A 128-bit vector of [4 x i32] containing the result. 3120 #define _mm_blend_epi32(V1, V2, M) \ 3121 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ 3122 (__v4si)(__m128i)(V2), (int)(M))) 3123 3124 /// Merges 32-bit integer elements from either of the two 256-bit vectors of 3125 /// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32], 3126 /// as specified by the immediate integer operand \a M. 
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                      (__v8si)(__m256i)(V2), (int)(M)))

/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
///    bytes of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
///
/// \param __X
///    A 128-bit integer vector whose low byte will be broadcast.
/// \returns A 256-bit integer vector containing the result.
3171 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3172 _mm256_broadcastb_epi8(__m128i __X) 3173 { 3174 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3175 } 3176 3177 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X 3178 /// to all elements of the result's 256-bit vector of [16 x i16]. 3179 /// 3180 /// \headerfile <immintrin.h> 3181 /// 3182 /// This intrinsic corresponds to the \c VPBROADCASTW instruction. 3183 /// 3184 /// \param __X 3185 /// A 128-bit vector of [8 x i16] whose low element will be broadcast. 3186 /// \returns A 256-bit vector of [16 x i16] containing the result. 3187 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3188 _mm256_broadcastw_epi16(__m128i __X) 3189 { 3190 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3191 } 3192 3193 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 3194 /// to all elements of the result's 256-bit vector of [8 x i32]. 3195 /// 3196 /// \headerfile <immintrin.h> 3197 /// 3198 /// This intrinsic corresponds to the \c VPBROADCASTD instruction. 3199 /// 3200 /// \param __X 3201 /// A 128-bit vector of [4 x i32] whose low element will be broadcast. 3202 /// \returns A 256-bit vector of [8 x i32] containing the result. 3203 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3204 _mm256_broadcastd_epi32(__m128i __X) 3205 { 3206 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3207 } 3208 3209 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 3210 /// to all elements of the result's 256-bit vector of [4 x i64]. 3211 /// 3212 /// \headerfile <immintrin.h> 3213 /// 3214 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 
3215 /// 3216 /// \param __X 3217 /// A 128-bit vector of [2 x i64] whose low element will be broadcast. 3218 /// \returns A 256-bit vector of [4 x i64] containing the result. 3219 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3220 _mm256_broadcastq_epi64(__m128i __X) 3221 { 3222 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); 3223 } 3224 3225 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all 3226 /// bytes of the 128-bit result. 3227 /// 3228 /// \headerfile <immintrin.h> 3229 /// 3230 /// This intrinsic corresponds to the \c VPBROADCASTB instruction. 3231 /// 3232 /// \param __X 3233 /// A 128-bit integer vector whose low byte will be broadcast. 3234 /// \returns A 128-bit integer vector containing the result. 3235 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3236 _mm_broadcastb_epi8(__m128i __X) 3237 { 3238 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3239 } 3240 3241 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in 3242 /// \a __X to all elements of the result's 128-bit vector of [8 x i16]. 3243 /// 3244 /// \headerfile <immintrin.h> 3245 /// 3246 /// This intrinsic corresponds to the \c VPBROADCASTW instruction. 3247 /// 3248 /// \param __X 3249 /// A 128-bit vector of [8 x i16] whose low element will be broadcast. 3250 /// \returns A 128-bit vector of [8 x i16] containing the result. 3251 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3252 _mm_broadcastw_epi16(__m128i __X) 3253 { 3254 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3255 } 3256 3257 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 3258 /// to all elements of the result's vector of [4 x i32]. 3259 /// 3260 /// \headerfile <immintrin.h> 3261 /// 3262 /// This intrinsic corresponds to the \c VPBROADCASTD instruction. 
3263 /// 3264 /// \param __X 3265 /// A 128-bit vector of [4 x i32] whose low element will be broadcast. 3266 /// \returns A 128-bit vector of [4 x i32] containing the result. 3267 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3268 _mm_broadcastd_epi32(__m128i __X) 3269 { 3270 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); 3271 } 3272 3273 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 3274 /// to both elements of the result's 128-bit vector of [2 x i64]. 3275 /// 3276 /// \headerfile <immintrin.h> 3277 /// 3278 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 3279 /// 3280 /// \param __X 3281 /// A 128-bit vector of [2 x i64] whose low element will be broadcast. 3282 /// \returns A 128-bit vector of [2 x i64] containing the result. 3283 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3284 _mm_broadcastq_epi64(__m128i __X) 3285 { 3286 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); 3287 } 3288 3289 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 3290 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the 3291 /// elements of the 256-bit vector of [8 x i32] in \a __b. 3292 /// 3293 /// \code{.operation} 3294 /// FOR i := 0 TO 7 3295 /// j := i*32 3296 /// k := __b[j+2:j] * 32 3297 /// result[j+31:j] := __a[k+31:k] 3298 /// ENDFOR 3299 /// \endcode 3300 /// 3301 /// \headerfile <immintrin.h> 3302 /// 3303 /// This intrinsic corresponds to the \c VPERMD instruction. 3304 /// 3305 /// \param __a 3306 /// A 256-bit vector of [8 x i32] containing the source values. 3307 /// \param __b 3308 /// A 256-bit vector of [8 x i32] containing indexes of values to use from 3309 /// \a __a. 3310 /// \returns A 256-bit vector of [8 x i32] containing the result. 
3311 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3312 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) 3313 { 3314 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); 3315 } 3316 3317 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of 3318 /// the 256-bit vector of [4 x double] in \a V as specified by the 3319 /// immediate value \a M. 3320 /// 3321 /// \code{.operation} 3322 /// FOR i := 0 TO 3 3323 /// j := i*64 3324 /// k := (M >> i*2)[1:0] * 64 3325 /// result[j+63:j] := V[k+63:k] 3326 /// ENDFOR 3327 /// \endcode 3328 /// 3329 /// \headerfile <immintrin.h> 3330 /// 3331 /// \code 3332 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M); 3333 /// \endcode 3334 /// 3335 /// This intrinsic corresponds to the \c VPERMPD instruction. 3336 /// 3337 /// \param V 3338 /// A 256-bit vector of [4 x double] containing the source values. 3339 /// \param M 3340 /// An immediate 8-bit value specifying which elements to copy from \a V. 3341 /// \a M[1:0] specifies the index in \a a for element 0 of the result, 3342 /// \a M[3:2] specifies the index for element 1, and so forth. 3343 /// \returns A 256-bit vector of [4 x double] containing the result. 3344 #define _mm256_permute4x64_pd(V, M) \ 3345 ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) 3346 3347 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of 3348 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in 3349 /// the elements of the 256-bit vector of [8 x i32] in \a __b. 3350 /// 3351 /// \code{.operation} 3352 /// FOR i := 0 TO 7 3353 /// j := i*32 3354 /// k := __b[j+2:j] * 32 3355 /// result[j+31:j] := __a[k+31:k] 3356 /// ENDFOR 3357 /// \endcode 3358 /// 3359 /// \headerfile <immintrin.h> 3360 /// 3361 /// This intrinsic corresponds to the \c VPERMPS instruction. 3362 /// 3363 /// \param __a 3364 /// A 256-bit vector of [8 x float] containing the source values. 
3365 /// \param __b 3366 /// A 256-bit vector of [8 x i32] containing indexes of values to use from 3367 /// \a __a. 3368 /// \returns A 256-bit vector of [8 x float] containing the result. 3369 static __inline__ __m256 __DEFAULT_FN_ATTRS256 3370 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) 3371 { 3372 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); 3373 } 3374 3375 /// Sets the result's 256-bit vector of [4 x i64] result to copies of elements 3376 /// of the 256-bit vector of [4 x i64] in \a V as specified by the 3377 /// immediate value \a M. 3378 /// 3379 /// \code{.operation} 3380 /// FOR i := 0 TO 3 3381 /// j := i*64 3382 /// k := (M >> i*2)[1:0] * 64 3383 /// result[j+63:j] := V[k+63:k] 3384 /// ENDFOR 3385 /// \endcode 3386 /// 3387 /// \headerfile <immintrin.h> 3388 /// 3389 /// \code 3390 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M); 3391 /// \endcode 3392 /// 3393 /// This intrinsic corresponds to the \c VPERMQ instruction. 3394 /// 3395 /// \param V 3396 /// A 256-bit vector of [4 x i64] containing the source values. 3397 /// \param M 3398 /// An immediate 8-bit value specifying which elements to copy from \a V. 3399 /// \a M[1:0] specifies the index in \a a for element 0 of the result, 3400 /// \a M[3:2] specifies the index for element 1, and so forth. 3401 /// \returns A 256-bit vector of [4 x i64] containing the result. 3402 #define _mm256_permute4x64_epi64(V, M) \ 3403 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) 3404 3405 /// Sets each half of the 256-bit result either to zero or to one of the 3406 /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2, 3407 /// as specified by the immediate value \a M. 
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing source values.
/// \param V2
///    A 256-bit integer vector containing source values.
/// \param M
///    An immediate value specifying how to form the result. Bits [3:0]
///    control the lower half of the result, bits [7:4] control the upper half.
///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
///    otherwise bits [1:0] determine the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256( \
      (__m256i)(V1), \
      (__m256i)(V2), \
      (int)(M)))

/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///    of the immediate \a M is zero, extracts the lower half of \a V;
///    otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256( \
      (__v4di)(__m256i)(V), \
      (int)(M)))

/// Produces a copy of the 256-bit vector \a V1 in which one 128-bit half has
///    been replaced by the 128-bit vector \a V2. If bit 0 of the immediate
///    \a M is zero, the lower half of the result is replaced; otherwise, the
///    upper half is replaced.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      (__v4di)(__m256i)(V1), \
      (__v2di)(__m128i)(V2), \
      (int)(M)))

/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
///    the most significant bit of the corresponding element in the mask
///    \a __M is set; otherwise, sets that element of the result to zero.
///    Returns the 256-bit [8 x i32] result.
3499 /// 3500 /// \code{.operation} 3501 /// FOR i := 0 TO 7 3502 /// j := i*32 3503 /// IF __M[j+31] == 1 3504 /// result[j+31:j] := Load32(__X+(i*4)) 3505 /// ELSE 3506 /// result[j+31:j] := 0 3507 /// FI 3508 /// ENDFOR 3509 /// \endcode 3510 /// 3511 /// \headerfile <immintrin.h> 3512 /// 3513 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3514 /// 3515 /// \param __X 3516 /// A pointer to the memory used for loading values. 3517 /// \param __M 3518 /// A 256-bit vector of [8 x i32] containing the mask bits. 3519 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed 3520 /// elements. 3521 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3522 _mm256_maskload_epi32(int const *__X, __m256i __M) 3523 { 3524 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); 3525 } 3526 3527 /// Conditionally loads four 64-bit integer elements from memory \a __X, if 3528 /// the most significant bit of the corresponding element in the mask 3529 /// \a __M is set; otherwise, sets that element of the result to zero. 3530 /// Returns the 256-bit [4 x i64] result. 3531 /// 3532 /// \code{.operation} 3533 /// FOR i := 0 TO 3 3534 /// j := i*64 3535 /// IF __M[j+63] == 1 3536 /// result[j+63:j] := Load64(__X+(i*8)) 3537 /// ELSE 3538 /// result[j+63:j] := 0 3539 /// FI 3540 /// ENDFOR 3541 /// \endcode 3542 /// 3543 /// \headerfile <immintrin.h> 3544 /// 3545 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3546 /// 3547 /// \param __X 3548 /// A pointer to the memory used for loading values. 3549 /// \param __M 3550 /// A 256-bit vector of [4 x i64] containing the mask bits. 3551 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed 3552 /// elements. 
3553 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3554 _mm256_maskload_epi64(long long const *__X, __m256i __M) 3555 { 3556 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); 3557 } 3558 3559 /// Conditionally loads four 32-bit integer elements from memory \a __X, if 3560 /// the most significant bit of the corresponding element in the mask 3561 /// \a __M is set; otherwise, sets that element of the result to zero. 3562 /// Returns the 128-bit [4 x i32] result. 3563 /// 3564 /// \code{.operation} 3565 /// FOR i := 0 TO 3 3566 /// j := i*32 3567 /// IF __M[j+31] == 1 3568 /// result[j+31:j] := Load32(__X+(i*4)) 3569 /// ELSE 3570 /// result[j+31:j] := 0 3571 /// FI 3572 /// ENDFOR 3573 /// \endcode 3574 /// 3575 /// \headerfile <immintrin.h> 3576 /// 3577 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3578 /// 3579 /// \param __X 3580 /// A pointer to the memory used for loading values. 3581 /// \param __M 3582 /// A 128-bit vector of [4 x i32] containing the mask bits. 3583 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed 3584 /// elements. 3585 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3586 _mm_maskload_epi32(int const *__X, __m128i __M) 3587 { 3588 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); 3589 } 3590 3591 /// Conditionally loads two 64-bit integer elements from memory \a __X, if 3592 /// the most significant bit of the corresponding element in the mask 3593 /// \a __M is set; otherwise, sets that element of the result to zero. 3594 /// Returns the 128-bit [2 x i64] result. 3595 /// 3596 /// \code{.operation} 3597 /// FOR i := 0 TO 1 3598 /// j := i*64 3599 /// IF __M[j+63] == 1 3600 /// result[j+63:j] := Load64(__X+(i*8)) 3601 /// ELSE 3602 /// result[j+63:j] := 0 3603 /// FI 3604 /// ENDFOR 3605 /// \endcode 3606 /// 3607 /// \headerfile <immintrin.h> 3608 /// 3609 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 
3610 /// 3611 /// \param __X 3612 /// A pointer to the memory used for loading values. 3613 /// \param __M 3614 /// A 128-bit vector of [2 x i64] containing the mask bits. 3615 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed 3616 /// elements. 3617 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3618 _mm_maskload_epi64(long long const *__X, __m128i __M) 3619 { 3620 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); 3621 } 3622 3623 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector 3624 /// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of 3625 /// the corresponding element in the mask \a __M is set; otherwise, the 3626 /// memory element is unchanged. 3627 /// 3628 /// \code{.operation} 3629 /// FOR i := 0 TO 7 3630 /// j := i*32 3631 /// IF __M[j+31] == 1 3632 /// Store32(__X+(i*4), __Y[j+31:j]) 3633 /// FI 3634 /// ENDFOR 3635 /// \endcode 3636 /// 3637 /// \headerfile <immintrin.h> 3638 /// 3639 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3640 /// 3641 /// \param __X 3642 /// A pointer to the memory used for storing values. 3643 /// \param __M 3644 /// A 256-bit vector of [8 x i32] containing the mask bits. 3645 /// \param __Y 3646 /// A 256-bit vector of [8 x i32] containing the values to store. 3647 static __inline__ void __DEFAULT_FN_ATTRS256 3648 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) 3649 { 3650 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 3651 } 3652 3653 /// Conditionally stores four 64-bit integer elements from the 256-bit vector 3654 /// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of 3655 /// the corresponding element in the mask \a __M is set; otherwise, the 3656 /// memory element is unchanged. 
3657 /// 3658 /// \code{.operation} 3659 /// FOR i := 0 TO 3 3660 /// j := i*64 3661 /// IF __M[j+63] == 1 3662 /// Store64(__X+(i*8), __Y[j+63:j]) 3663 /// FI 3664 /// ENDFOR 3665 /// \endcode 3666 /// 3667 /// \headerfile <immintrin.h> 3668 /// 3669 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3670 /// 3671 /// \param __X 3672 /// A pointer to the memory used for storing values. 3673 /// \param __M 3674 /// A 256-bit vector of [4 x i64] containing the mask bits. 3675 /// \param __Y 3676 /// A 256-bit vector of [4 x i64] containing the values to store. 3677 static __inline__ void __DEFAULT_FN_ATTRS256 3678 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) 3679 { 3680 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 3681 } 3682 3683 /// Conditionally stores four 32-bit integer elements from the 128-bit vector 3684 /// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of 3685 /// the corresponding element in the mask \a __M is set; otherwise, the 3686 /// memory element is unchanged. 3687 /// 3688 /// \code{.operation} 3689 /// FOR i := 0 TO 3 3690 /// j := i*32 3691 /// IF __M[j+31] == 1 3692 /// Store32(__X+(i*4), __Y[j+31:j]) 3693 /// FI 3694 /// ENDFOR 3695 /// \endcode 3696 /// 3697 /// \headerfile <immintrin.h> 3698 /// 3699 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3700 /// 3701 /// \param __X 3702 /// A pointer to the memory used for storing values. 3703 /// \param __M 3704 /// A 128-bit vector of [4 x i32] containing the mask bits. 3705 /// \param __Y 3706 /// A 128-bit vector of [4 x i32] containing the values to store. 
3707 static __inline__ void __DEFAULT_FN_ATTRS128 3708 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) 3709 { 3710 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 3711 } 3712 3713 /// Conditionally stores two 64-bit integer elements from the 128-bit vector 3714 /// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of 3715 /// the corresponding element in the mask \a __M is set; otherwise, the 3716 /// memory element is unchanged. 3717 /// 3718 /// \code{.operation} 3719 /// FOR i := 0 TO 1 3720 /// j := i*64 3721 /// IF __M[j+63] == 1 3722 /// Store64(__X+(i*8), __Y[j+63:j]) 3723 /// FI 3724 /// ENDFOR 3725 /// \endcode 3726 /// 3727 /// \headerfile <immintrin.h> 3728 /// 3729 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3730 /// 3731 /// \param __X 3732 /// A pointer to the memory used for storing values. 3733 /// \param __M 3734 /// A 128-bit vector of [2 x i64] containing the mask bits. 3735 /// \param __Y 3736 /// A 128-bit vector of [2 x i64] containing the values to store. 3737 static __inline__ void __DEFAULT_FN_ATTRS128 3738 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) 3739 { 3740 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 3741 } 3742 3743 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3744 /// left by the number of bits given in the corresponding element of the 3745 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 3746 /// returns the result. If the shift count for any element is greater than 3747 /// 31, the result for that element is zero. 3748 /// 3749 /// \headerfile <immintrin.h> 3750 /// 3751 /// This intrinsic corresponds to the \c VPSLLVD instruction. 3752 /// 3753 /// \param __X 3754 /// A 256-bit vector of [8 x i32] to be shifted. 3755 /// \param __Y 3756 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3757 /// bits). 
3758 /// \returns A 256-bit vector of [8 x i32] containing the result. 3759 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3760 _mm256_sllv_epi32(__m256i __X, __m256i __Y) 3761 { 3762 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); 3763 } 3764 3765 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3766 /// left by the number of bits given in the corresponding element of the 3767 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 3768 /// returns the result. If the shift count for any element is greater than 3769 /// 31, the result for that element is zero. 3770 /// 3771 /// \headerfile <immintrin.h> 3772 /// 3773 /// This intrinsic corresponds to the \c VPSLLVD instruction. 3774 /// 3775 /// \param __X 3776 /// A 128-bit vector of [4 x i32] to be shifted. 3777 /// \param __Y 3778 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3779 /// bits). 3780 /// \returns A 128-bit vector of [4 x i32] containing the result. 3781 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3782 _mm_sllv_epi32(__m128i __X, __m128i __Y) 3783 { 3784 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); 3785 } 3786 3787 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 3788 /// left by the number of bits given in the corresponding element of the 3789 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 3790 /// returns the result. If the shift count for any element is greater than 3791 /// 63, the result for that element is zero. 3792 /// 3793 /// \headerfile <immintrin.h> 3794 /// 3795 /// This intrinsic corresponds to the \c VPSLLVQ instruction. 3796 /// 3797 /// \param __X 3798 /// A 256-bit vector of [4 x i64] to be shifted. 3799 /// \param __Y 3800 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 3801 /// bits). 3802 /// \returns A 256-bit vector of [4 x i64] containing the result. 
3803 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3804 _mm256_sllv_epi64(__m256i __X, __m256i __Y) 3805 { 3806 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); 3807 } 3808 3809 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 3810 /// left by the number of bits given in the corresponding element of the 3811 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 3812 /// returns the result. If the shift count for any element is greater than 3813 /// 63, the result for that element is zero. 3814 /// 3815 /// \headerfile <immintrin.h> 3816 /// 3817 /// This intrinsic corresponds to the \c VPSLLVQ instruction. 3818 /// 3819 /// \param __X 3820 /// A 128-bit vector of [2 x i64] to be shifted. 3821 /// \param __Y 3822 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 3823 /// bits). 3824 /// \returns A 128-bit vector of [2 x i64] containing the result. 3825 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3826 _mm_sllv_epi64(__m128i __X, __m128i __Y) 3827 { 3828 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); 3829 } 3830 3831 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3832 /// right by the number of bits given in the corresponding element of the 3833 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and 3834 /// returns the result. If the shift count for any element is greater than 3835 /// 31, the result for that element is 0 or -1 according to the sign bit 3836 /// for that element. 3837 /// 3838 /// \headerfile <immintrin.h> 3839 /// 3840 /// This intrinsic corresponds to the \c VPSRAVD instruction. 3841 /// 3842 /// \param __X 3843 /// A 256-bit vector of [8 x i32] to be shifted. 3844 /// \param __Y 3845 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3846 /// bits). 3847 /// \returns A 256-bit vector of [8 x i32] containing the result. 
3848 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3849 _mm256_srav_epi32(__m256i __X, __m256i __Y) 3850 { 3851 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); 3852 } 3853 3854 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3855 /// right by the number of bits given in the corresponding element of the 3856 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and 3857 /// returns the result. If the shift count for any element is greater than 3858 /// 31, the result for that element is 0 or -1 according to the sign bit 3859 /// for that element. 3860 /// 3861 /// \headerfile <immintrin.h> 3862 /// 3863 /// This intrinsic corresponds to the \c VPSRAVD instruction. 3864 /// 3865 /// \param __X 3866 /// A 128-bit vector of [4 x i32] to be shifted. 3867 /// \param __Y 3868 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3869 /// bits). 3870 /// \returns A 128-bit vector of [4 x i32] containing the result. 3871 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3872 _mm_srav_epi32(__m128i __X, __m128i __Y) 3873 { 3874 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); 3875 } 3876 3877 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3878 /// right by the number of bits given in the corresponding element of the 3879 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 3880 /// returns the result. If the shift count for any element is greater than 3881 /// 31, the result for that element is zero. 3882 /// 3883 /// \headerfile <immintrin.h> 3884 /// 3885 /// This intrinsic corresponds to the \c VPSRLVD instruction. 3886 /// 3887 /// \param __X 3888 /// A 256-bit vector of [8 x i32] to be shifted. 3889 /// \param __Y 3890 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3891 /// bits). 3892 /// \returns A 256-bit vector of [8 x i32] containing the result. 
3893 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3894 _mm256_srlv_epi32(__m256i __X, __m256i __Y) 3895 { 3896 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); 3897 } 3898 3899 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3900 /// right by the number of bits given in the corresponding element of the 3901 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 3902 /// returns the result. If the shift count for any element is greater than 3903 /// 31, the result for that element is zero. 3904 /// 3905 /// \headerfile <immintrin.h> 3906 /// 3907 /// This intrinsic corresponds to the \c VPSRLVD instruction. 3908 /// 3909 /// \param __X 3910 /// A 128-bit vector of [4 x i32] to be shifted. 3911 /// \param __Y 3912 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3913 /// bits). 3914 /// \returns A 128-bit vector of [4 x i32] containing the result. 3915 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3916 _mm_srlv_epi32(__m128i __X, __m128i __Y) 3917 { 3918 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); 3919 } 3920 3921 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 3922 /// right by the number of bits given in the corresponding element of the 3923 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 3924 /// returns the result. If the shift count for any element is greater than 3925 /// 63, the result for that element is zero. 3926 /// 3927 /// \headerfile <immintrin.h> 3928 /// 3929 /// This intrinsic corresponds to the \c VPSRLVQ instruction. 3930 /// 3931 /// \param __X 3932 /// A 256-bit vector of [4 x i64] to be shifted. 3933 /// \param __Y 3934 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 3935 /// bits). 3936 /// \returns A 256-bit vector of [4 x i64] containing the result. 
3937 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3938 _mm256_srlv_epi64(__m256i __X, __m256i __Y) 3939 { 3940 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); 3941 } 3942 3943 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 3944 /// right by the number of bits given in the corresponding element of the 3945 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 3946 /// returns the result. If the shift count for any element is greater than 3947 /// 63, the result for that element is zero. 3948 /// 3949 /// \headerfile <immintrin.h> 3950 /// 3951 /// This intrinsic corresponds to the \c VPSRLVQ instruction. 3952 /// 3953 /// \param __X 3954 /// A 128-bit vector of [2 x i64] to be shifted. 3955 /// \param __Y 3956 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 3957 /// bits). 3958 /// \returns A 128-bit vector of [2 x i64] containing the result. 3959 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3960 _mm_srlv_epi64(__m128i __X, __m128i __Y) 3961 { 3962 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); 3963 } 3964 3965 /// Conditionally gathers two 64-bit floating-point values, either from the 3966 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled 3967 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector 3968 /// of [2 x double] in \a mask determines the source for each element. 
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* \a a is a [2 x double] vector, so it is cast through __m128d (not __m128i),
   matching the documented prototype and the other gather macros. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element is that element's mask bit. If a mask
///    bit is zero, the corresponding element of \a a is used; otherwise the
///    value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))

/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element is that element's mask bit. If a mask
///    bit is zero, the corresponding element of \a a is used; otherwise the
///    value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)(__m128d)(a), \
      (double const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2df)(__m128d)(mask), \
      (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element is that element's mask bit. If a mask
///    bit is zero, the corresponding element of \a a is used; otherwise the
///    value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element is that element's mask bit. If a mask
///    bit is zero, the corresponding element of \a a is used; otherwise the
///    value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a), \
      (float const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4sf)(__m128)(mask), \
      (s)))

/// Conditionally gathers eight 32-bit floating-point values, either from the
///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x float] in \a mask determines the source for each element.
4210 /// 4211 /// \code{.operation} 4212 /// FOR element := 0 to 7 4213 /// j := element*32 4214 /// k := element*32 4215 /// IF mask[j+31] == 0 4216 /// result[j+31:j] := a[j+31:j] 4217 /// ELSE 4218 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 4219 /// FI 4220 /// ENDFOR 4221 /// \endcode 4222 /// 4223 /// \headerfile <immintrin.h> 4224 /// 4225 /// \code 4226 /// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i, 4227 /// __m256 mask, const int s); 4228 /// \endcode 4229 /// 4230 /// This intrinsic corresponds to the \c VGATHERDPS instruction. 4231 /// 4232 /// \param a 4233 /// A 256-bit vector of [8 x float] used as the source when a mask bit is 4234 /// zero. 4235 /// \param m 4236 /// A pointer to the memory used for loading values. 4237 /// \param i 4238 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m. 4239 /// \param mask 4240 /// A 256-bit vector of [8 x float] containing the mask. The most 4241 /// significant bit of each element in the mask vector represents the mask 4242 /// bits. If a mask bit is zero, the corresponding value from vector \a a 4243 /// is gathered; otherwise the value is loaded from memory. 4244 /// \param s 4245 /// A literal constant scale factor for the indexes in \a i. Must be 4246 /// 1, 2, 4, or 8. 4247 /// \returns A 256-bit vector of [8 x float] containing the gathered values. 4248 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \ 4249 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \ 4250 (float const *)(m), \ 4251 (__v8si)(__m256i)(i), \ 4252 (__v8sf)(__m256)(mask), (s))) 4253 4254 /// Conditionally gathers two 32-bit floating-point values, either from the 4255 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled 4256 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector 4257 /// of [4 x float] in \a mask determines the source for the lower two 4258 /// elements. The upper two elements of the result are zeroed. 
4259 /// 4260 /// \code{.operation} 4261 /// FOR element := 0 to 1 4262 /// j := element*32 4263 /// k := element*64 4264 /// IF mask[j+31] == 0 4265 /// result[j+31:j] := a[j+31:j] 4266 /// ELSE 4267 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 4268 /// FI 4269 /// ENDFOR 4270 /// result[127:64] := 0 4271 /// \endcode 4272 /// 4273 /// \headerfile <immintrin.h> 4274 /// 4275 /// \code 4276 /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i, 4277 /// __m128 mask, const int s); 4278 /// \endcode 4279 /// 4280 /// This intrinsic corresponds to the \c VGATHERQPS instruction. 4281 /// 4282 /// \param a 4283 /// A 128-bit vector of [4 x float] used as the source when a mask bit is 4284 /// zero. Only the first two elements are used. 4285 /// \param m 4286 /// A pointer to the memory used for loading values. 4287 /// \param i 4288 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m. 4289 /// \param mask 4290 /// A 128-bit vector of [4 x float] containing the mask. The most 4291 /// significant bit of each element in the mask vector represents the mask 4292 /// bits. If a mask bit is zero, the corresponding value from vector \a a 4293 /// is gathered; otherwise the value is loaded from memory. Only the first 4294 /// two elements are used. 4295 /// \param s 4296 /// A literal constant scale factor for the indexes in \a i. Must be 4297 /// 1, 2, 4, or 8. 4298 /// \returns A 128-bit vector of [4 x float] containing the gathered values. 4299 #define _mm_mask_i64gather_ps(a, m, i, mask, s) \ 4300 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \ 4301 (float const *)(m), \ 4302 (__v2di)(__m128i)(i), \ 4303 (__v4sf)(__m128)(mask), (s))) 4304 4305 /// Conditionally gathers four 32-bit floating-point values, either from the 4306 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled 4307 /// indexes from the 256-bit vector of [4 x i64] in \a i. 
The 128-bit vector 4308 /// of [4 x float] in \a mask determines the source for each element. 4309 /// 4310 /// \code{.operation} 4311 /// FOR element := 0 to 3 4312 /// j := element*32 4313 /// k := element*64 4314 /// IF mask[j+31] == 0 4315 /// result[j+31:j] := a[j+31:j] 4316 /// ELSE 4317 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 4318 /// FI 4319 /// ENDFOR 4320 /// \endcode 4321 /// 4322 /// \headerfile <immintrin.h> 4323 /// 4324 /// \code 4325 /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i, 4326 /// __m128 mask, const int s); 4327 /// \endcode 4328 /// 4329 /// This intrinsic corresponds to the \c VGATHERQPS instruction. 4330 /// 4331 /// \param a 4332 /// A 128-bit vector of [4 x float] used as the source when a mask bit is 4333 /// zero. 4334 /// \param m 4335 /// A pointer to the memory used for loading values. 4336 /// \param i 4337 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 4338 /// \param mask 4339 /// A 128-bit vector of [4 x float] containing the mask. The most 4340 /// significant bit of each element in the mask vector represents the mask 4341 /// bits. If a mask bit is zero, the corresponding value from vector \a a 4342 /// is gathered; otherwise the value is loaded from memory. 4343 /// \param s 4344 /// A literal constant scale factor for the indexes in \a i. Must be 4345 /// 1, 2, 4, or 8. 4346 /// \returns A 128-bit vector of [4 x float] containing the gathered values. 4347 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \ 4348 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \ 4349 (float const *)(m), \ 4350 (__v4di)(__m256i)(i), \ 4351 (__v4sf)(__m128)(mask), (s))) 4352 4353 /// Conditionally gathers four 32-bit integer values, either from the 4354 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled 4355 /// indexes from the 128-bit vector of [4 x i32] in \a i. 
The 128-bit vector 4356 /// of [4 x i32] in \a mask determines the source for each element. 4357 /// 4358 /// \code{.operation} 4359 /// FOR element := 0 to 3 4360 /// j := element*32 4361 /// k := element*32 4362 /// IF mask[j+31] == 0 4363 /// result[j+31:j] := a[j+31:j] 4364 /// ELSE 4365 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 4366 /// FI 4367 /// ENDFOR 4368 /// \endcode 4369 /// 4370 /// \headerfile <immintrin.h> 4371 /// 4372 /// \code 4373 /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i, 4374 /// __m128i mask, const int s); 4375 /// \endcode 4376 /// 4377 /// This intrinsic corresponds to the \c VPGATHERDD instruction. 4378 /// 4379 /// \param a 4380 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is 4381 /// zero. 4382 /// \param m 4383 /// A pointer to the memory used for loading values. 4384 /// \param i 4385 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. 4386 /// \param mask 4387 /// A 128-bit vector of [4 x i32] containing the mask. The most significant 4388 /// bit of each element in the mask vector represents the mask bits. If a 4389 /// mask bit is zero, the corresponding value from vector \a a is gathered; 4390 /// otherwise the value is loaded from memory. 4391 /// \param s 4392 /// A literal constant scale factor for the indexes in \a i. Must be 4393 /// 1, 2, 4, or 8. 4394 /// \returns A 128-bit vector of [4 x i32] containing the gathered values. 4395 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \ 4396 ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \ 4397 (int const *)(m), \ 4398 (__v4si)(__m128i)(i), \ 4399 (__v4si)(__m128i)(mask), (s))) 4400 4401 /// Conditionally gathers eight 32-bit integer values, either from the 4402 /// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled 4403 /// indexes from the 256-bit vector of [8 x i32] in \a i. 
The 256-bit vector 4404 /// of [8 x i32] in \a mask determines the source for each element. 4405 /// 4406 /// \code{.operation} 4407 /// FOR element := 0 to 7 4408 /// j := element*32 4409 /// k := element*32 4410 /// IF mask[j+31] == 0 4411 /// result[j+31:j] := a[j+31:j] 4412 /// ELSE 4413 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 4414 /// FI 4415 /// ENDFOR 4416 /// \endcode 4417 /// 4418 /// \headerfile <immintrin.h> 4419 /// 4420 /// \code 4421 /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i, 4422 /// __m256i mask, const int s); 4423 /// \endcode 4424 /// 4425 /// This intrinsic corresponds to the \c VPGATHERDD instruction. 4426 /// 4427 /// \param a 4428 /// A 256-bit vector of [8 x i32] used as the source when a mask bit is 4429 /// zero. 4430 /// \param m 4431 /// A pointer to the memory used for loading values. 4432 /// \param i 4433 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m. 4434 /// \param mask 4435 /// A 256-bit vector of [8 x i32] containing the mask. The most significant 4436 /// bit of each element in the mask vector represents the mask bits. If a 4437 /// mask bit is zero, the corresponding value from vector \a a is gathered; 4438 /// otherwise the value is loaded from memory. 4439 /// \param s 4440 /// A literal constant scale factor for the indexes in \a i. Must be 4441 /// 1, 2, 4, or 8. 4442 /// \returns A 256-bit vector of [8 x i32] containing the gathered values. 4443 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \ 4444 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \ 4445 (int const *)(m), \ 4446 (__v8si)(__m256i)(i), \ 4447 (__v8si)(__m256i)(mask), (s))) 4448 4449 /// Conditionally gathers two 32-bit integer values, either from the 4450 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled 4451 /// indexes from the 128-bit vector of [2 x i64] in \a i. 
The 128-bit vector 4452 /// of [4 x i32] in \a mask determines the source for the lower two 4453 /// elements. The upper two elements of the result are zeroed. 4454 /// 4455 /// \code{.operation} 4456 /// FOR element := 0 to 1 4457 /// j := element*32 4458 /// k := element*64 4459 /// IF mask[j+31] == 0 4460 /// result[j+31:j] := a[j+31:j] 4461 /// ELSE 4462 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 4463 /// FI 4464 /// ENDFOR 4465 /// result[127:64] := 0 4466 /// \endcode 4467 /// 4468 /// \headerfile <immintrin.h> 4469 /// 4470 /// \code 4471 /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i, 4472 /// __m128i mask, const int s); 4473 /// \endcode 4474 /// 4475 /// This intrinsic corresponds to the \c VPGATHERQD instruction. 4476 /// 4477 /// \param a 4478 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is 4479 /// zero. Only the first two elements are used. 4480 /// \param m 4481 /// A pointer to the memory used for loading values. 4482 /// \param i 4483 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m. 4484 /// \param mask 4485 /// A 128-bit vector of [4 x i32] containing the mask. The most significant 4486 /// bit of each element in the mask vector represents the mask bits. If a 4487 /// mask bit is zero, the corresponding value from vector \a a is gathered; 4488 /// otherwise the value is loaded from memory. Only the first two elements 4489 /// are used. 4490 /// \param s 4491 /// A literal constant scale factor for the indexes in \a i. Must be 4492 /// 1, 2, 4, or 8. 4493 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4494 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \ 4495 ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \ 4496 (int const *)(m), \ 4497 (__v2di)(__m128i)(i), \ 4498 (__v4si)(__m128i)(mask), (s))) 4499 4500 /// Conditionally gathers four 32-bit integer values, either from the 4501 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled 4502 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector 4503 /// of [4 x i32] in \a mask determines the source for each element. 4504 /// 4505 /// \code{.operation} 4506 /// FOR element := 0 to 3 4507 /// j := element*32 4508 /// k := element*64 4509 /// IF mask[j+31] == 0 4510 /// result[j+31:j] := a[j+31:j] 4511 /// ELSE 4512 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 4513 /// FI 4514 /// ENDFOR 4515 /// \endcode 4516 /// 4517 /// \headerfile <immintrin.h> 4518 /// 4519 /// \code 4520 /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i, 4521 /// __m128i mask, const int s); 4522 /// \endcode 4523 /// 4524 /// This intrinsic corresponds to the \c VPGATHERQD instruction. 4525 /// 4526 /// \param a 4527 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is 4528 /// zero. 4529 /// \param m 4530 /// A pointer to the memory used for loading values. 4531 /// \param i 4532 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 4533 /// \param mask 4534 /// A 128-bit vector of [4 x i32] containing the mask. The most significant 4535 /// bit of each element in the mask vector represents the mask bits. If a 4536 /// mask bit is zero, the corresponding value from vector \a a is gathered; 4537 /// otherwise the value is loaded from memory. 4538 /// \param s 4539 /// A literal constant scale factor for the indexes in \a i. Must be 4540 /// 1, 2, 4, or 8. 4541 /// \returns A 128-bit vector of [4 x i32] containing the gathered values. 
4542 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \ 4543 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \ 4544 (int const *)(m), \ 4545 (__v4di)(__m256i)(i), \ 4546 (__v4si)(__m128i)(mask), (s))) 4547 4548 /// Conditionally gathers two 64-bit integer values, either from the 4549 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled 4550 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector 4551 /// of [2 x i64] in \a mask determines the source for each element. 4552 /// 4553 /// \code{.operation} 4554 /// FOR element := 0 to 1 4555 /// j := element*64 4556 /// k := element*32 4557 /// IF mask[j+63] == 0 4558 /// result[j+63:j] := a[j+63:j] 4559 /// ELSE 4560 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s) 4561 /// FI 4562 /// ENDFOR 4563 /// \endcode 4564 /// 4565 /// \headerfile <immintrin.h> 4566 /// 4567 /// \code 4568 /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i, 4569 /// __m128i mask, const int s); 4570 /// \endcode 4571 /// 4572 /// This intrinsic corresponds to the \c VPGATHERDQ instruction. 4573 /// 4574 /// \param a 4575 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is 4576 /// zero. 4577 /// \param m 4578 /// A pointer to the memory used for loading values. 4579 /// \param i 4580 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only 4581 /// the first two elements are used. 4582 /// \param mask 4583 /// A 128-bit vector of [2 x i64] containing the mask. The most significant 4584 /// bit of each element in the mask vector represents the mask bits. If a 4585 /// mask bit is zero, the corresponding value from vector \a a is gathered; 4586 /// otherwise the value is loaded from memory. 4587 /// \param s 4588 /// A literal constant scale factor for the indexes in \a i. Must be 4589 /// 1, 2, 4, or 8. 4590 /// \returns A 128-bit vector of [2 x i64] containing the gathered values. 
4591 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \ 4592 ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \ 4593 (long long const *)(m), \ 4594 (__v4si)(__m128i)(i), \ 4595 (__v2di)(__m128i)(mask), (s))) 4596 4597 /// Conditionally gathers four 64-bit integer values, either from the 4598 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled 4599 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector 4600 /// of [4 x i64] in \a mask determines the source for each element. 4601 /// 4602 /// \code{.operation} 4603 /// FOR element := 0 to 3 4604 /// j := element*64 4605 /// k := element*32 4606 /// IF mask[j+63] == 0 4607 /// result[j+63:j] := a[j+63:j] 4608 /// ELSE 4609 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s) 4610 /// FI 4611 /// ENDFOR 4612 /// \endcode 4613 /// 4614 /// \headerfile <immintrin.h> 4615 /// 4616 /// \code 4617 /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m, 4618 /// __m128i i, __m256i mask, const int s); 4619 /// \endcode 4620 /// 4621 /// This intrinsic corresponds to the \c VPGATHERDQ instruction. 4622 /// 4623 /// \param a 4624 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is 4625 /// zero. 4626 /// \param m 4627 /// A pointer to the memory used for loading values. 4628 /// \param i 4629 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. 4630 /// \param mask 4631 /// A 256-bit vector of [4 x i64] containing the mask. The most significant 4632 /// bit of each element in the mask vector represents the mask bits. If a 4633 /// mask bit is zero, the corresponding value from vector \a a is gathered; 4634 /// otherwise the value is loaded from memory. 4635 /// \param s 4636 /// A literal constant scale factor for the indexes in \a i. Must be 4637 /// 1, 2, 4, or 8. 4638 /// \returns A 256-bit vector of [4 x i64] containing the gathered values. 
4639 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \ 4640 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \ 4641 (long long const *)(m), \ 4642 (__v4si)(__m128i)(i), \ 4643 (__v4di)(__m256i)(mask), (s))) 4644 4645 /// Conditionally gathers two 64-bit integer values, either from the 4646 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled 4647 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector 4648 /// of [2 x i64] in \a mask determines the source for each element. 4649 /// 4650 /// \code{.operation} 4651 /// FOR element := 0 to 1 4652 /// j := element*64 4653 /// k := element*64 4654 /// IF mask[j+63] == 0 4655 /// result[j+63:j] := a[j+63:j] 4656 /// ELSE 4657 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 4658 /// FI 4659 /// ENDFOR 4660 /// \endcode 4661 /// 4662 /// \headerfile <immintrin.h> 4663 /// 4664 /// \code 4665 /// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i, 4666 /// __m128i mask, const int s); 4667 /// \endcode 4668 /// 4669 /// This intrinsic corresponds to the \c VPGATHERQQ instruction. 4670 /// 4671 /// \param a 4672 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is 4673 /// zero. 4674 /// \param m 4675 /// A pointer to the memory used for loading values. 4676 /// \param i 4677 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m. 4678 /// \param mask 4679 /// A 128-bit vector of [2 x i64] containing the mask. The most significant 4680 /// bit of each element in the mask vector represents the mask bits. If a 4681 /// mask bit is zero, the corresponding value from vector \a a is gathered; 4682 /// otherwise the value is loaded from memory. 4683 /// \param s 4684 /// A literal constant scale factor for the indexes in \a i. Must be 4685 /// 1, 2, 4, or 8. 4686 /// \returns A 128-bit vector of [2 x i64] containing the gathered values. 
4687 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \ 4688 ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \ 4689 (long long const *)(m), \ 4690 (__v2di)(__m128i)(i), \ 4691 (__v2di)(__m128i)(mask), (s))) 4692 4693 /// Conditionally gathers four 64-bit integer values, either from the 4694 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled 4695 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector 4696 /// of [4 x i64] in \a mask determines the source for each element. 4697 /// 4698 /// \code{.operation} 4699 /// FOR element := 0 to 3 4700 /// j := element*64 4701 /// k := element*64 4702 /// IF mask[j+63] == 0 4703 /// result[j+63:j] := a[j+63:j] 4704 /// ELSE 4705 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 4706 /// FI 4707 /// ENDFOR 4708 /// \endcode 4709 /// 4710 /// \headerfile <immintrin.h> 4711 /// 4712 /// \code 4713 /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m, 4714 /// __m256i i, __m256i mask, const int s); 4715 /// \endcode 4716 /// 4717 /// This intrinsic corresponds to the \c VPGATHERQQ instruction. 4718 /// 4719 /// \param a 4720 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is 4721 /// zero. 4722 /// \param m 4723 /// A pointer to the memory used for loading values. 4724 /// \param i 4725 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 4726 /// \param mask 4727 /// A 256-bit vector of [4 x i64] containing the mask. The most significant 4728 /// bit of each element in the mask vector represents the mask bits. If a 4729 /// mask bit is zero, the corresponding value from vector \a a is gathered; 4730 /// otherwise the value is loaded from memory. 4731 /// \param s 4732 /// A literal constant scale factor for the indexes in \a i. Must be 4733 /// 1, 2, 4, or 8. 4734 /// \returns A 256-bit vector of [4 x i64] containing the gathered values. 
4735 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \ 4736 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \ 4737 (long long const *)(m), \ 4738 (__v4di)(__m256i)(i), \ 4739 (__v4di)(__m256i)(mask), (s))) 4740 4741 /// Gathers two 64-bit floating-point values from memory \a m using scaled 4742 /// indexes from the 128-bit vector of [4 x i32] in \a i. 4743 /// 4744 /// \code{.operation} 4745 /// FOR element := 0 to 1 4746 /// j := element*64 4747 /// k := element*32 4748 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s) 4749 /// ENDFOR 4750 /// \endcode 4751 /// 4752 /// \headerfile <immintrin.h> 4753 /// 4754 /// \code 4755 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s); 4756 /// \endcode 4757 /// 4758 /// This intrinsic corresponds to the \c VGATHERDPD instruction. 4759 /// 4760 /// \param m 4761 /// A pointer to the memory used for loading values. 4762 /// \param i 4763 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only 4764 /// the first two elements are used. 4765 /// \param s 4766 /// A literal constant scale factor for the indexes in \a i. Must be 4767 /// 1, 2, 4, or 8. 4768 /// \returns A 128-bit vector of [2 x double] containing the gathered values. 4769 #define _mm_i32gather_pd(m, i, s) \ 4770 ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \ 4771 (double const *)(m), \ 4772 (__v4si)(__m128i)(i), \ 4773 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ 4774 _mm_setzero_pd()), \ 4775 (s))) 4776 4777 /// Gathers four 64-bit floating-point values from memory \a m using scaled 4778 /// indexes from the 128-bit vector of [4 x i32] in \a i. 
4779 /// 4780 /// \code{.operation} 4781 /// FOR element := 0 to 3 4782 /// j := element*64 4783 /// k := element*32 4784 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s) 4785 /// ENDFOR 4786 /// \endcode 4787 /// 4788 /// \headerfile <immintrin.h> 4789 /// 4790 /// \code 4791 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s); 4792 /// \endcode 4793 /// 4794 /// This intrinsic corresponds to the \c VGATHERDPD instruction. 4795 /// 4796 /// \param m 4797 /// A pointer to the memory used for loading values. 4798 /// \param i 4799 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. 4800 /// \param s 4801 /// A literal constant scale factor for the indexes in \a i. Must be 4802 /// 1, 2, 4, or 8. 4803 /// \returns A 256-bit vector of [4 x double] containing the gathered values. 4804 #define _mm256_i32gather_pd(m, i, s) \ 4805 ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \ 4806 (double const *)(m), \ 4807 (__v4si)(__m128i)(i), \ 4808 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ 4809 _mm256_setzero_pd(), \ 4810 _CMP_EQ_OQ), \ 4811 (s))) 4812 4813 /// Gathers two 64-bit floating-point values from memory \a m using scaled 4814 /// indexes from the 128-bit vector of [2 x i64] in \a i. 4815 /// 4816 /// \code{.operation} 4817 /// FOR element := 0 to 1 4818 /// j := element*64 4819 /// k := element*64 4820 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 4821 /// ENDFOR 4822 /// \endcode 4823 /// 4824 /// \headerfile <immintrin.h> 4825 /// 4826 /// \code 4827 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s); 4828 /// \endcode 4829 /// 4830 /// This intrinsic corresponds to the \c VGATHERQPD instruction. 4831 /// 4832 /// \param m 4833 /// A pointer to the memory used for loading values. 4834 /// \param i 4835 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m. 4836 /// \param s 4837 /// A literal constant scale factor for the indexes in \a i. 
Must be 4838 /// 1, 2, 4, or 8. 4839 /// \returns A 128-bit vector of [2 x double] containing the gathered values. 4840 #define _mm_i64gather_pd(m, i, s) \ 4841 ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \ 4842 (double const *)(m), \ 4843 (__v2di)(__m128i)(i), \ 4844 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ 4845 _mm_setzero_pd()), \ 4846 (s))) 4847 4848 /// Gathers four 64-bit floating-point values from memory \a m using scaled 4849 /// indexes from the 256-bit vector of [4 x i64] in \a i. 4850 /// 4851 /// \code{.operation} 4852 /// FOR element := 0 to 3 4853 /// j := element*64 4854 /// k := element*64 4855 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 4856 /// ENDFOR 4857 /// \endcode 4858 /// 4859 /// \headerfile <immintrin.h> 4860 /// 4861 /// \code 4862 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s); 4863 /// \endcode 4864 /// 4865 /// This intrinsic corresponds to the \c VGATHERQPD instruction. 4866 /// 4867 /// \param m 4868 /// A pointer to the memory used for loading values. 4869 /// \param i 4870 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 4871 /// \param s 4872 /// A literal constant scale factor for the indexes in \a i. Must be 4873 /// 1, 2, 4, or 8. 4874 /// \returns A 256-bit vector of [4 x double] containing the gathered values. 4875 #define _mm256_i64gather_pd(m, i, s) \ 4876 ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \ 4877 (double const *)(m), \ 4878 (__v4di)(__m256i)(i), \ 4879 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ 4880 _mm256_setzero_pd(), \ 4881 _CMP_EQ_OQ), \ 4882 (s))) 4883 4884 /// Gathers four 32-bit floating-point values from memory \a m using scaled 4885 /// indexes from the 128-bit vector of [4 x i32] in \a i. 
4886 /// 4887 /// \code{.operation} 4888 /// FOR element := 0 to 3 4889 /// j := element*32 4890 /// k := element*32 4891 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 4892 /// ENDFOR 4893 /// \endcode 4894 /// 4895 /// \headerfile <immintrin.h> 4896 /// 4897 /// \code 4898 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s); 4899 /// \endcode 4900 /// 4901 /// This intrinsic corresponds to the \c VGATHERDPS instruction. 4902 /// 4903 /// \param m 4904 /// A pointer to the memory used for loading values. 4905 /// \param i 4906 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. 4907 /// \param s 4908 /// A literal constant scale factor for the indexes in \a i. Must be 4909 /// 1, 2, 4, or 8. 4910 /// \returns A 128-bit vector of [4 x float] containing the gathered values. 4911 #define _mm_i32gather_ps(m, i, s) \ 4912 ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \ 4913 (float const *)(m), \ 4914 (__v4si)(__m128i)(i), \ 4915 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ 4916 _mm_setzero_ps()), \ 4917 (s))) 4918 4919 /// Gathers eight 32-bit floating-point values from memory \a m using scaled 4920 /// indexes from the 256-bit vector of [8 x i32] in \a i. 4921 /// 4922 /// \code{.operation} 4923 /// FOR element := 0 to 7 4924 /// j := element*32 4925 /// k := element*32 4926 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 4927 /// ENDFOR 4928 /// \endcode 4929 /// 4930 /// \headerfile <immintrin.h> 4931 /// 4932 /// \code 4933 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s); 4934 /// \endcode 4935 /// 4936 /// This intrinsic corresponds to the \c VGATHERDPS instruction. 4937 /// 4938 /// \param m 4939 /// A pointer to the memory used for loading values. 4940 /// \param i 4941 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m. 4942 /// \param s 4943 /// A literal constant scale factor for the indexes in \a i. Must be 4944 /// 1, 2, 4, or 8. 
4945 /// \returns A 256-bit vector of [8 x float] containing the gathered values. 4946 #define _mm256_i32gather_ps(m, i, s) \ 4947 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \ 4948 (float const *)(m), \ 4949 (__v8si)(__m256i)(i), \ 4950 (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \ 4951 _mm256_setzero_ps(), \ 4952 _CMP_EQ_OQ), \ 4953 (s))) 4954 4955 /// Gathers two 32-bit floating-point values from memory \a m using scaled 4956 /// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two 4957 /// elements of the result are zeroed. 4958 /// 4959 /// \code{.operation} 4960 /// FOR element := 0 to 1 4961 /// j := element*32 4962 /// k := element*64 4963 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 4964 /// ENDFOR 4965 /// result[127:64] := 0 4966 /// \endcode 4967 /// 4968 /// \headerfile <immintrin.h> 4969 /// 4970 /// \code 4971 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s); 4972 /// \endcode 4973 /// 4974 /// This intrinsic corresponds to the \c VGATHERQPS instruction. 4975 /// 4976 /// \param m 4977 /// A pointer to the memory used for loading values. 4978 /// \param i 4979 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m. 4980 /// \param s 4981 /// A literal constant scale factor for the indexes in \a i. Must be 4982 /// 1, 2, 4, or 8. 4983 /// \returns A 128-bit vector of [4 x float] containing the gathered values. 4984 #define _mm_i64gather_ps(m, i, s) \ 4985 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \ 4986 (float const *)(m), \ 4987 (__v2di)(__m128i)(i), \ 4988 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ 4989 _mm_setzero_ps()), \ 4990 (s))) 4991 4992 /// Gathers four 32-bit floating-point values from memory \a m using scaled 4993 /// indexes from the 256-bit vector of [4 x i64] in \a i. 
4994 /// 4995 /// \code{.operation} 4996 /// FOR element := 0 to 3 4997 /// j := element*32 4998 /// k := element*64 4999 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 5000 /// ENDFOR 5001 /// \endcode 5002 /// 5003 /// \headerfile <immintrin.h> 5004 /// 5005 /// \code 5006 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s); 5007 /// \endcode 5008 /// 5009 /// This intrinsic corresponds to the \c VGATHERQPS instruction. 5010 /// 5011 /// \param m 5012 /// A pointer to the memory used for loading values. 5013 /// \param i 5014 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 5015 /// \param s 5016 /// A literal constant scale factor for the indexes in \a i. Must be 5017 /// 1, 2, 4, or 8. 5018 /// \returns A 128-bit vector of [4 x float] containing the gathered values. 5019 #define _mm256_i64gather_ps(m, i, s) \ 5020 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \ 5021 (float const *)(m), \ 5022 (__v4di)(__m256i)(i), \ 5023 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ 5024 _mm_setzero_ps()), \ 5025 (s))) 5026 5027 /// Gathers four 32-bit integer values from memory \a m using scaled 5028 /// indexes from the 128-bit vector of [4 x i32] in \a i. 5029 /// 5030 /// \code{.operation} 5031 /// FOR element := 0 to 3 5032 /// j := element*32 5033 /// k := element*32 5034 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 5035 /// ENDFOR 5036 /// \endcode 5037 /// 5038 /// \headerfile <immintrin.h> 5039 /// 5040 /// \code 5041 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s); 5042 /// \endcode 5043 /// 5044 /// This intrinsic corresponds to the \c VPGATHERDD instruction. 5045 /// 5046 /// \param m 5047 /// A pointer to the memory used for loading values. 5048 /// \param i 5049 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. 5050 /// \param s 5051 /// A literal constant scale factor for the indexes in \a i. Must be 5052 /// 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Operand 1 is _mm_undefined_si128() and operand 4 is _mm_set1_epi32(-1).
   NOTE(review): presumably these are the pass-through source and an all-ones
   mask, so every element is gathered; confirm against the masked gather
   macros. */
#define _mm_i32gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)_mm_undefined_si128(), \
      (int const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))

/// Gathers eight 32-bit integer values from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Operand 1 is _mm256_undefined_si256() and operand 4 is
   _mm256_set1_epi32(-1). NOTE(review): presumably these are the pass-through
   source and an all-ones mask, so every element is gathered; confirm against
   the masked gather macros. */
#define _mm256_i32gather_epi32(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)_mm256_undefined_si256(), \
      (int const *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8si)_mm256_set1_epi32(-1), \
      (s)))

/// Gathers two 32-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
///    of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    Base pointer of the memory from which the values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Operand 1 is _mm_undefined_si128() and operand 4 is _mm_set1_epi32(-1).
   NOTE(review): presumably these are the pass-through source and an all-ones
   mask, so every element is gathered; confirm against the masked gather
   macros. */
#define _mm_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)_mm_undefined_si128(), \
      (int const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))

/// Loads four 32-bit integer values from memory \a m. The address of each
///    value is formed from a scaled 64-bit index taken from the 256-bit
///    vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    Base pointer of the memory from which the values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding the signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Operand 1 is _mm_undefined_si128() and operand 4 is _mm_set1_epi32(-1).
   NOTE(review): presumably these are the pass-through source and an all-ones
   mask, so every element is gathered; confirm against the masked gather
   macros. */
#define _mm256_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)_mm_undefined_si128(), \
      (int const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))

/// Loads two 64-bit integer values from memory \a m. The address of each
///    value is formed from a scaled 32-bit index taken from the 128-bit
///    vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    Base pointer of the memory from which the values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes into \a m.
///    Only the first two elements are used.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Operand 1 is _mm_undefined_si128() and operand 4 is _mm_set1_epi64x(-1).
   NOTE(review): presumably these are the pass-through source and an all-ones
   mask, so every element is gathered; confirm against the masked gather
   macros. */
#define _mm_i32gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)_mm_undefined_si128(), \
      (long long const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v2di)_mm_set1_epi64x(-1), \
      (s)))

/// Gathers four 64-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    Base pointer of the memory from which the values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Operand 1 is _mm256_undefined_si256() and operand 4 is
   _mm256_set1_epi64x(-1). NOTE(review): presumably these are the pass-through
   source and an all-ones mask, so every element is gathered; confirm against
   the masked gather macros. */
#define _mm256_i32gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)_mm256_undefined_si256(), \
      (long long const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4di)_mm256_set1_epi64x(-1), \
      (s)))

/// Loads two 64-bit integer values from memory \a m. The address of each
///    value is formed from a scaled 64-bit index taken from the 128-bit
///    vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    Base pointer of the memory from which the values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
5250 /// \returns A 128-bit vector of [2 x i64] containing the gathered values. 5251 #define _mm_i64gather_epi64(m, i, s) \ 5252 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ 5253 (long long const *)(m), \ 5254 (__v2di)(__m128i)(i), \ 5255 (__v2di)_mm_set1_epi64x(-1), (s))) 5256 5257 /// Gathers four 64-bit integer values from memory \a m using scaled indexes 5258 /// from the 256-bit vector of [4 x i64] in \a i. 5259 /// 5260 /// \code{.operation} 5261 /// FOR element := 0 to 3 5262 /// j := element*64 5263 /// k := element*64 5264 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 5265 /// ENDFOR 5266 /// \endcode 5267 /// 5268 /// \headerfile <immintrin.h> 5269 /// 5270 /// \code 5271 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s); 5272 /// \endcode 5273 /// 5274 /// This intrinsic corresponds to the \c VPGATHERQQ instruction. 5275 /// 5276 /// \param m 5277 /// A pointer to the memory used for loading values. 5278 /// \param i 5279 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 5280 /// \param s 5281 /// A literal constant scale factor for the indexes in \a i. Must be 5282 /// 1, 2, 4, or 8. 5283 /// \returns A 256-bit vector of [4 x i64] containing the gathered values. 5284 #define _mm256_i64gather_epi64(m, i, s) \ 5285 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ 5286 (long long const *)(m), \ 5287 (__v4di)(__m256i)(i), \ 5288 (__v4di)_mm256_set1_epi64x(-1), (s))) 5289 5290 #undef __DEFAULT_FN_ATTRS256 5291 #undef __DEFAULT_FN_ATTRS128 5292 5293 #endif /* __AVX2INTRIN_H */ 5294