avx2intrin.h - OpenGrok cross reference for /freebsd-src/contrib/llvm-project/clang/lib/Headers/avx2intrin.h

Lines Matching +full:16 +full:- +full:bit
1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7  *===-----------------------------------------------------------------------===
20                  __target__("avx2,no-evex512"), __min_vector_width__(256)))
23                  __target__("avx2,no-evex512"), __min_vector_width__(128)))
27 ///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
31 ///    vectors, and another eight using the upper half. These 16-bit values
32 ///    are returned in the lower and upper halves of the 256-bit result,
38 ///    difference, and sums these four values to form one 16-bit result. The
39 ///    intrinsic computes 16 of these results with different sets of input
43 ///    bytes from \a Y; the starting bit position for these four bytes is
45 ///    sets of four bytes from \a X; the starting bit position for the first
46 ///    set of four bytes is specified by \a M[2] times 32. These bit positions
47 ///    are all relative to the 128-bit lane for each set of eight operations.
56 ///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
57 ///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
58 ///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
59 ///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
62 ///     r := r + 16
76 ///    A 256-bit integer vector containing one of the inputs.
78 ///    A 256-bit integer vector containing one of the inputs.
82 /// \returns A 256-bit vector of [16 x i16] containing the result.
87 /// Computes the absolute value of each signed byte in the 256-bit integer
96 ///    A 256-bit integer vector.
97 /// \returns A 256-bit integer vector containing the result.
104 /// Computes the absolute value of each signed 16-bit element in the 256-bit
105 ///    vector of [16 x i16] in \a __a and returns each value in the
113 ///    A 256-bit vector of [16 x i16].
114 /// \returns A 256-bit vector of [16 x i16] containing the result.
121 /// Computes the absolute value of each signed 32-bit element in the 256-bit
130 ///    A 256-bit vector of [8 x i32].
131 /// \returns A 256-bit vector of [8 x i32] containing the result.
138 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
139 ///    integers using signed saturation, and returns the 256-bit result.
143 ///   j := i*16
157 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
160 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
162 /// \returns A 256-bit integer vector containing the result.
169 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
170 ///    integers using signed saturation, and returns the resulting 256-bit
171 ///    vector of [16 x i16].
176 ///   k := i*16
189 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
192 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
194 /// \returns A 256-bit vector of [16 x i16] containing the result.
201 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
202 ///    using unsigned saturation, and returns the 256-bit result.
206 ///   j := i*16
220 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
223 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
225 /// \returns A 256-bit integer vector containing the result.
232 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
233 ///    using unsigned saturation, and returns the resulting 256-bit vector of
234 ///    [16 x i16].
239 ///   k := i*16
252 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
255 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
257 /// \returns A 256-bit vector of [16 x i16] containing the result.
264 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
266 ///    byte of the 256-bit integer vector result (overflow is ignored).
273 ///    A 256-bit integer vector containing one of the source operands.
275 ///    A 256-bit integer vector containing one of the source operands.
276 /// \returns A 256-bit integer vector containing the sums.
283 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284 ///    [16 x i16] and returns the lower 16 bits of each sum in the
285 ///    corresponding element of the [16 x i16] result (overflow is ignored).
292 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
294 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
295 /// \returns A 256-bit vector of [16 x i16] containing the sums.
302 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
311 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
313 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
314 /// \returns A 256-bit vector of [8 x i32] containing the sums.
321 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
330 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
332 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
333 /// \returns A 256-bit vector of [4 x i64] containing the sums.
340 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
342 ///    corresponding byte of the 256-bit integer vector result.
349 ///    A 256-bit integer vector containing one of the source operands.
351 ///    A 256-bit integer vector containing one of the source operands.
352 /// \returns A 256-bit integer vector containing the sums.
359 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
360 ///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
367 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
369 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
370 /// \returns A 256-bit vector of [16 x i16] containing the sums.
377 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
379 ///    corresponding byte of the 256-bit integer vector result.
386 ///    A 256-bit integer vector containing one of the source operands.
388 ///    A 256-bit integer vector containing one of the source operands.
389 /// \returns A 256-bit integer vector containing the sums.
396 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
397 ///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
404 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
406 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
407 /// \returns A 256-bit vector of [16 x i16] containing the sums.
414 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
415 ///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
416 ///    as the lower half of the temporary value. Right-shifts the temporary
417 ///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
418 ///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
420 ///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
432 ///    A 256-bit integer vector containing source values.
434 ///    A 256-bit integer vector containing source values.
437 /// \returns A 256-bit integer vector containing the result.
442 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
450 ///    A 256-bit integer vector.
452 ///    A 256-bit integer vector.
453 /// \returns A 256-bit integer vector containing the result.
460 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
461 ///    the bitwise NOT of the 256-bit integer vector in \a __a.
468 ///    A 256-bit integer vector.
470 ///    A 256-bit integer vector.
471 /// \returns A 256-bit integer vector containing the result.
479 ///    256-bit integer vectors in \a __a and \a __b and returns each
480 ///    average in the corresponding byte of the 256-bit result.
494 ///    A 256-bit integer vector.
496 ///    A 256-bit integer vector.
497 /// \returns A 256-bit integer vector containing the result.
504 /// Computes the averages of the corresponding unsigned 16-bit integers in
505 ///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
506 ///    each average in the corresponding element of the 256-bit result.
510 ///   j := i*16
520 ///    A 256-bit vector of [16 x i16].
522 ///    A 256-bit vector of [16 x i16].
523 /// \returns A 256-bit vector of [16 x i16] containing the result.
530 /// Merges 8-bit integer values from either of the two 256-bit vectors
531 ///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
532 ///    the resulting 256-bit integer vector.
550 ///    A 256-bit integer vector containing source values.
552 ///    A 256-bit integer vector containing source values.
554 ///    A 256-bit integer vector, with bit [7] of each byte specifying the
555 ///    source for each corresponding byte of the result. When the mask bit
558 /// \returns A 256-bit integer vector containing the result.
566 /// Merges 16-bit integer values from either of the two 256-bit vectors
568 ///    and returns the resulting 256-bit vector of [16 x i16].
572 ///   j := i*16
592 ///    A 256-bit vector of [16 x i16] containing source values.
594 ///    A 256-bit vector of [16 x i16] containing source values.
596 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
597 ///    source for each element of the result. The position of the mask bit
598 ///    corresponds to the index of a copied value. When a mask bit is 0, the
602 /// \returns A 256-bit vector of [16 x i16] containing the result.
607 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
609 ///    bytes of the 256-bit result.
623 ///    A 256-bit integer vector containing one of the inputs.
625 ///    A 256-bit integer vector containing one of the inputs.
626 /// \returns A 256-bit integer vector containing the result.
633 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
635 ///    corresponding elements of the 256-bit result.
639 ///   j := i*16
649 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
651 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
652 /// \returns A 256-bit vector of [16 x i16] containing the result.
659 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
661 ///    corresponding elements of the 256-bit result.
675 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
677 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
678 /// \returns A 256-bit vector of [8 x i32] containing the result.
685 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
687 ///    corresponding elements of the 256-bit result.
701 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
703 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
704 /// \returns A 256-bit vector of [4 x i64] containing the result.
711 /// Compares corresponding signed bytes in the 256-bit integer vectors in
712 ///    \a __a and \a __b for greater-than and returns the outcomes in the
713 ///    corresponding bytes of the 256-bit result.
727 ///    A 256-bit integer vector containing one of the inputs.
729 ///    A 256-bit integer vector containing one of the inputs.
730 /// \returns A 256-bit integer vector containing the result.
739 /// Compares corresponding signed elements in the 256-bit vectors of
740 ///    [16 x i16] in \a __a and \a __b for greater-than and returns the
741 ///    outcomes in the corresponding elements of the 256-bit result.
745 ///   j := i*16
755 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
757 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
758 /// \returns A 256-bit vector of [16 x i16] containing the result.
765 /// Compares corresponding signed elements in the 256-bit vectors of
766 ///    [8 x i32] in \a __a and \a __b for greater-than and returns the
767 ///    outcomes in the corresponding elements of the 256-bit result.
781 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
783 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
784 /// \returns A 256-bit vector of [8 x i32] containing the result.
791 /// Compares corresponding signed elements in the 256-bit vectors of
792 ///    [4 x i64] in \a __a and \a __b for greater-than and returns the
793 ///    outcomes in the corresponding elements of the 256-bit result.
807 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
809 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
810 /// \returns A 256-bit vector of [4 x i64] containing the result.
817 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
818 ///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
819 ///    element of the [16 x i16] result (overflow is ignored). Sums from
820 ///    \a __a are returned in the lower 64 bits of each 128-bit half of the
822 ///    128-bit half of the result.
827 ///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
828 ///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
831 ///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
843 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
845 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
846 /// \returns A 256-bit vector of [16 x i16] containing the sums.
853 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
856 ///    are returned in the lower 64 bits of each 128-bit half of the result;
857 ///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
875 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
877 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
878 /// \returns A 256-bit vector of [8 x i32] containing the sums.
885 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
886 ///    vectors of [16 x i16] using signed saturation and returns each sum in
887 ///    an element of the [16 x i16] result. Sums from \a __a are returned in
888 ///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
889 ///    are returned in the upper 64 bits of each 128-bit half of the result.
894 ///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
895 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
898 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
910 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
912 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
913 /// \returns A 256-bit vector of [16 x i16] containing the sums.
920 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
921 ///    vectors of [16 x i16] and returns the lower 16 bits of each difference
922 ///    in an element of the [16 x i16] result (overflow is ignored).
924 ///    128-bit half of the result; differences from \a __b are returned in the
925 ///    upper 64 bits of each 128-bit half of the result.
930 ///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
931 ///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
932 ///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
933 ///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
934 ///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
935 ///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
936 ///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
937 ///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
946 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
948 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
949 /// \returns A 256-bit vector of [16 x i16] containing the differences.
956 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
959 ///    from \a __a are returned in the lower 64 bits of each 128-bit half of
961 ///    of each 128-bit half of the result.
966 ///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
967 ///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
968 ///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
969 ///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
978 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
980 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
981 /// \returns A 256-bit vector of [8 x i32] containing the differences.
988 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
989 ///    vectors of [16 x i16] using signed saturation and returns each difference in
990 ///    an element of the [16 x i16] result. Differences from \a __a are
991 ///    returned in the lower 64 bits of each 128-bit half of the result;
993 ///    128-bit half of the result.
998 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
999 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1000 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1001 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1002 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1003 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1004 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1005 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1014 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1016 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1017 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1024 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1025 ///    with the corresponding signed byte from the 256-bit integer vector in
1026 ///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
1027 ///    pairs of those products using signed saturation to form 16-bit sums
1028 ///    returned as elements of the [16 x i16] result.
1032 ///   j := i*16
1044 ///    A 256-bit vector containing one of the source operands.
1046 ///    A 256-bit vector containing one of the source operands.
1047 /// \returns A 256-bit vector of [16 x i16] containing the result.
1054 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1055 ///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
1056 ///    those products to form 32-bit sums returned as elements of the
1059 ///    There is only one wraparound case: when all four of the 16-bit sources
1066 ///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1076 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1078 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1079 /// \returns A 256-bit vector of [8 x i32] containing the result.
1086 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1088 ///     corresponding byte of the 256-bit result.
1095 ///    A 256-bit integer vector.
1097 ///    A 256-bit integer vector.
1098 /// \returns A 256-bit integer vector containing the result.
1105 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1106 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1107 ///    each pair in the corresponding element of the 256-bit result.
1114 ///    A 256-bit vector of [16 x i16].
1116 ///    A 256-bit vector of [16 x i16].
1117 /// \returns A 256-bit vector of [16 x i16] containing the result.
1124 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1126 ///    each pair in the corresponding element of the 256-bit result.
1133 ///    A 256-bit vector of [8 x i32].
1135 ///    A 256-bit vector of [8 x i32].
1136 /// \returns A 256-bit vector of [8 x i32] containing the result.
1143 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1145 ///     the corresponding byte of the 256-bit result.
1152 ///    A 256-bit integer vector.
1154 ///    A 256-bit integer vector.
1155 /// \returns A 256-bit integer vector containing the result.
1162 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1163 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1164 ///    each pair in the corresponding element of the 256-bit result.
1171 ///    A 256-bit vector of [16 x i16].
1173 ///    A 256-bit vector of [16 x i16].
1174 /// \returns A 256-bit vector of [16 x i16] containing the result.
1181 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1183 ///    each pair in the corresponding element of the 256-bit result.
1190 ///    A 256-bit vector of [8 x i32].
1192 ///    A 256-bit vector of [8 x i32].
1193 /// \returns A 256-bit vector of [8 x i32] containing the result.
1200 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1202 ///     corresponding byte of the 256-bit result.
1209 ///    A 256-bit integer vector.
1211 ///    A 256-bit integer vector.
1212 /// \returns A 256-bit integer vector containing the result.
1219 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1220 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1221 ///    each pair in the corresponding element of the 256-bit result.
1228 ///    A 256-bit vector of [16 x i16].
1230 ///    A 256-bit vector of [16 x i16].
1231 /// \returns A 256-bit vector of [16 x i16] containing the result.
1238 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1240 ///    each pair in the corresponding element of the 256-bit result.
1247 ///    A 256-bit vector of [8 x i32].
1249 ///    A 256-bit vector of [8 x i32].
1250 /// \returns A 256-bit vector of [8 x i32] containing the result.
1257 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1259 ///     the corresponding byte of the 256-bit result.
1266 ///    A 256-bit integer vector.
1268 ///    A 256-bit integer vector.
1269 /// \returns A 256-bit integer vector containing the result.
1276 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1277 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1278 ///    each pair in the corresponding element of the 256-bit result.
1285 ///    A 256-bit vector of [16 x i16].
1287 ///    A 256-bit vector of [16 x i16].
1288 /// \returns A 256-bit vector of [16 x i16] containing the result.
1295 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1297 ///    each pair in the corresponding element of the 256-bit result.
1304 ///    A 256-bit vector of [8 x i32].
1306 ///    A 256-bit vector of [8 x i32].
1307 /// \returns A 256-bit vector of [8 x i32] containing the result.
1314 /// Creates a 32-bit integer mask from the most significant bit of each byte
1315 ///    in the 256-bit integer vector in \a __a and returns the result.
1329 ///    A 256-bit integer vector containing the source bytes.
1330 /// \returns The 32-bit integer mask.
1337 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1338 ///    the 16-bit values in the corresponding elements of a 256-bit vector
1339 ///    of [16 x i16].
1344 ///   k := i*16
1354 ///    A 128-bit integer vector containing the source bytes.
1355 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1365 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
1366 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1367 ///    256-bit vector of [8 x i32].
1382 ///    A 128-bit integer vector containing the source bytes.
1383 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1393 /// Sign-extends the first four bytes from the 128-bit integer vector in
1394 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1395 ///    256-bit vector of [4 x i64].
1400 /// result[191:128] := SignExtend(__V[23:16])
1409 ///    A 128-bit integer vector containing the source bytes.
1410 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1420 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1421 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1422 ///    256-bit vector of [8 x i32].
1426 ///   j := i*16
1437 ///    A 128-bit vector of [8 x i16] containing the source values.
1438 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1446 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1447 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1448 ///    elements of a 256-bit vector of [4 x i64].
1452 /// result[127:64] := SignExtend(__V[31:16])
1462 ///    A 128-bit vector of [8 x i16] containing the source values.
1463 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1471 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1472 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1473 ///    256-bit vector of [4 x i64].
1487 ///    A 128-bit vector of [4 x i32] containing the source values.
1488 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1496 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1497 ///    the 16-bit values in the corresponding elements of a 256-bit vector
1498 ///    of [16 x i16].
1503 ///   k := i*16
1513 ///    A 128-bit integer vector containing the source bytes.
1514 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1522 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
1523 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1524 ///    256-bit vector of [8 x i32].
1539 ///    A 128-bit integer vector containing the source bytes.
1540 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1548 /// Zero-extends the first four bytes from the 128-bit integer vector in
1549 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1550 ///    256-bit vector of [4 x i64].
1555 /// result[191:128] := ZeroExtend(__V[23:16])
1564 ///    A 128-bit integer vector containing the source bytes.
1565 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1573 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1574 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1575 ///    256-bit vector of [8 x i32].
1579 ///   j := i*16
1590 ///    A 128-bit vector of [8 x i16] containing the source values.
1591 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1599 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1600 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1601 ///    elements of a 256-bit vector of [4 x i64].
1605 /// result[127:64] := ZeroExtend(__V[31:16])
1615 ///    A 128-bit vector of [8 x i16] containing the source values.
1616 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1624 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1625 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1626 ///    256-bit vector of [4 x i64].
1640 ///    A 128-bit vector of [4 x i32] containing the source values.
1641 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1649 /// Multiplies signed 32-bit integers from even-numbered elements of two
1650 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1665 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1667 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1668 /// \returns A 256-bit vector of [4 x i64] containing the products.
1675 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676 ///    [16 x i16], truncates the 32-bit results to the most significant 18
1677 ///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
1678 ///    product in the [16 x i16] result.
1682 ///   j := i*16
1684 ///   result[j+15:j] := temp[16:1]
1692 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1694 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1695 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1702 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1703 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1704 ///    [16 x i16] result.
1711 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1713 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1714 /// \returns A 256-bit vector of [16 x i16] containing the products.
1721 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1722 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1723 ///    [16 x i16] result.
1730 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1732 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1733 /// \returns A 256-bit vector of [16 x i16] containing the products.
1740 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1741 ///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1742 ///    [16 x i16] result.
1749 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1751 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1752 /// \returns A 256-bit vector of [16 x i16] containing the products.
1759 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1760 ///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1768 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1770 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1771 /// \returns A 256-bit vector of [8 x i32] containing the products.
1778 /// Multiplies unsigned 32-bit integers from even-numbered elements of two
1779 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1794 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1796 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1797 /// \returns A 256-bit vector of [4 x i64] containing the products.
1804 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1812 ///    A 256-bit integer vector.
1814 ///    A 256-bit integer vector.
1815 /// \returns A 256-bit integer vector containing the result.
1823 ///    unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1827 ///    eight bytes from \a __b. The zero-extended SAD value is returned in the
1828 ///    corresponding 64-bit element of the result.
1832 ///    and sums these eight values to form one 16-bit result. This operation
1838 ///   temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1839 ///   temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1840 ///   temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1841 ///   temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1842 ///   temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1843 ///   temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1844 ///   temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1845 ///   temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1848 ///   result[j+63:j+16] := 0
1857 ///    A 256-bit integer vector.
1859 ///    A 256-bit integer vector.
1860 /// \returns A 256-bit integer vector containing the result.
1867 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1868 ///    to control information in the 256-bit integer vector \a __b, and
1869 ///    returns the 256-bit result. In effect there are two separate 128-bit
1892 ///    A 256-bit integer vector containing source values.
1894 ///    A 256-bit integer vector containing control information to determine
1895 ///    what goes into the corresponding byte of the result. If bit 7 of the
1897 ///    control byte specify the index (within the same 128-bit half) of \a __a
1899 /// \returns A 256-bit integer vector containing the result.
1906 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1908 ///    returns the 256-bit result. In effect there are two parallel 128-bit
1929 ///    A 256-bit vector of [8 x i32] containing source values.
1931 ///    An immediate 8-bit value specifying which elements to copy from \a a.
1935 /// \returns A 256-bit vector of [8 x i32] containing the result.
1939 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1941 ///    returns the 256-bit result. The upper 64 bits of each 128-bit half
1942 ///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
1949 ///   j := i * 16 + 64
1950 ///   k := (imm >> i*2)[1:0] * 16 + 64
1965 ///    A 256-bit vector of [16 x i16] containing source values.
1967 ///    An immediate 8-bit value specifying which elements to copy from \a a.
1971 /// \returns A 256-bit vector of [16 x i16] containing the result.
1975 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1977 ///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
1978 ///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1985 ///   j := i * 16
1986 ///   k := (imm >> i*2)[1:0] * 16
2001 ///    A 256-bit vector of [16 x i16] to use as a source of data for the
2004 ///    An immediate 8-bit value specifying which elements to copy from \a a.
2008 /// \returns A 256-bit vector of [16 x i16] containing the result.
2012 /// Sets each byte of the result to the corresponding byte of the 256-bit
2014 ///    on whether the corresponding byte of the 256-bit integer vector in
2023 ///    A 256-bit integer vector.
2025 ///    A 256-bit integer vector.
2026 /// \returns A 256-bit integer vector containing the result.
2034 ///    256-bit vector of [16 x i16] in \a __a, the negative of that element,
2035 ///    or zero, depending on whether the corresponding element of the 256-bit
2036 ///    vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2044 ///    A 256-bit vector of [16 x i16].
2046 ///    A 256-bit vector of [16 x i16].
2047 /// \returns A 256-bit vector of [16 x i16] containing the result.
2055 ///    256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2056 ///    zero, depending on whether the corresponding element of the 256-bit
2065 ///    A 256-bit vector of [8 x i32].
2067 ///    A 256-bit vector of [8 x i32].
2068 /// \returns A 256-bit vector of [8 x i32] containing the result.
2075 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2088 ///    A 256-bit integer vector to be shifted.
2091 /// \returns A 256-bit integer vector containing the result.
2095 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2108 ///    A 256-bit integer vector to be shifted.
2111 /// \returns A 256-bit integer vector containing the result.
2115 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2124 ///    A 256-bit vector of [16 x i16] to be shifted.
2127 /// \returns A 256-bit vector of [16 x i16] containing the result.
2134 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2144 ///    A 256-bit vector of [16 x i16] to be shifted.
2146 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2148 /// \returns A 256-bit vector of [16 x i16] containing the result.
2155 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2164 ///    A 256-bit vector of [8 x i32] to be shifted.
2167 /// \returns A 256-bit vector of [8 x i32] containing the result.
2174 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2184 ///    A 256-bit vector of [8 x i32] to be shifted.
2186 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2188 /// \returns A 256-bit vector of [8 x i32] containing the result.
2195 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2204 ///    A 256-bit vector of [4 x i64] to be shifted.
2207 /// \returns A 256-bit vector of [4 x i64] containing the result.
2214 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2224 ///    A 256-bit vector of [4 x i64] to be shifted.
2226 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2228 /// \returns A 256-bit vector of [4 x i64] containing the result.
2235 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2238 ///    0 or -1 according to the corresponding input sign bit.
2245 ///    A 256-bit vector of [16 x i16] to be shifted.
2248 /// \returns A 256-bit vector of [16 x i16] containing the result.
2255 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2258 ///    than 15, each element of the result is either 0 or -1 according to the
2259 ///    corresponding input sign bit.
2266 ///    A 256-bit vector of [16 x i16] to be shifted.
2268 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2270 /// \returns A 256-bit vector of [16 x i16] containing the result.
2277 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2280 ///    0 or -1 according to the corresponding input sign bit.
2287 ///    A 256-bit vector of [8 x i32] to be shifted.
2290 /// \returns A 256-bit vector of [8 x i32] containing the result.
2297 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2300 ///    than 31, each element of the result is either 0 or -1 according to the
2301 ///    corresponding input sign bit.
2308 ///    A 256-bit vector of [8 x i32] to be shifted.
2310 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2312 /// \returns A 256-bit vector of [8 x i32] containing the result.
2319 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2332 ///    A 256-bit integer vector to be shifted.
2335 /// \returns A 256-bit integer vector containing the result.
2339 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2352 ///    A 256-bit integer vector to be shifted.
2355 /// \returns A 256-bit integer vector containing the result.
2359 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2368 ///    A 256-bit vector of [16 x i16] to be shifted.
2371 /// \returns A 256-bit vector of [16 x i16] containing the result.
2378 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2388 ///    A 256-bit vector of [16 x i16] to be shifted.
2390 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2392 /// \returns A 256-bit vector of [16 x i16] containing the result.
2399 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2408 ///    A 256-bit vector of [8 x i32] to be shifted.
2411 /// \returns A 256-bit vector of [8 x i32] containing the result.
2418 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2428 ///    A 256-bit vector of [8 x i32] to be shifted.
2430 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2432 /// \returns A 256-bit vector of [8 x i32] containing the result.
2439 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2448 ///    A 256-bit vector of [4 x i64] to be shifted.
2451 /// \returns A 256-bit vector of [4 x i64] containing the result.
2458 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2468 ///    A 256-bit vector of [4 x i64] to be shifted.
2470 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2472 /// \returns A 256-bit vector of [4 x i64] containing the result.
2479 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2481 ///    corresponding byte of the 256-bit integer vector result (overflow is
2487 ///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2496 ///    A 256-bit integer vector containing the minuends.
2498 ///    A 256-bit integer vector containing the subtrahends.
2499 /// \returns A 256-bit integer vector containing the differences.
2503   return (__m256i)((__v32qu)__a - (__v32qu)__b);  in _mm256_sub_epi8()
2506 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2507 ///    vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2508 ///    the corresponding element of the [16 x i16] result (overflow is
2513 ///   j := i*16
2514 ///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2523 ///    A 256-bit vector of [16 x i16] containing the minuends.
2525 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2526 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2530   return (__m256i)((__v16hu)__a - (__v16hu)__b);  in _mm256_sub_epi16()
2533 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
2540 ///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2549 ///    A 256-bit vector of [8 x i32] containing the minuends.
2551 ///    A 256-bit vector of [8 x i32] containing the subtrahends.
2552 /// \returns A 256-bit vector of [8 x i32] containing the differences.
2556   return (__m256i)((__v8su)__a - (__v8su)__b);  in _mm256_sub_epi32()
2559 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
2566 ///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2575 ///    A 256-bit vector of [4 x i64] containing the minuends.
2577 ///    A 256-bit vector of [4 x i64] containing the subtrahends.
2578 /// \returns A 256-bit vector of [4 x i64] containing the differences.
2582   return (__m256i)((__v4du)__a - (__v4du)__b);  in _mm256_sub_epi64()
2585 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2587 ///    corresponding byte of the 256-bit integer vector result.
2592 ///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2601 ///    A 256-bit integer vector containing the minuends.
2603 ///    A 256-bit integer vector containing the subtrahends.
2604 /// \returns A 256-bit integer vector containing the differences.
2611 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2612 ///    vectors of [16 x i16] using signed saturation, and returns each
2613 ///    difference in the corresponding element of the [16 x i16] result.
2617 ///   j := i*16
2618 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2627 ///    A 256-bit vector of [16 x i16] containing the minuends.
2629 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2630 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2637 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2639 ///    corresponding byte of the 256-bit integer vector result. For each byte,
2640 ///    computes <c> result = __a - __b </c>.
2645 ///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2654 ///    A 256-bit integer vector containing the minuends.
2656 ///    A 256-bit integer vector containing the subtrahends.
2657 /// \returns A 256-bit integer vector containing the differences.
2664 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2665 ///    vectors of [16 x i16] using unsigned saturation, and returns each
2666 ///    difference in the corresponding element of the [16 x i16] result.
2670 ///   j := i*16
2671 ///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2680 ///    A 256-bit vector of [16 x i16] containing the minuends.
2682 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2683 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2690 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2691 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2692 ///    uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2698 /// result[23:16] := __a[79:72]
2712 ///    A 256-bit integer vector used as the source for the even-numbered bytes
2715 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
2717 /// \returns A 256-bit integer vector containing the result.
2724 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2725 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2726 ///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2727 ///    128-bit half of \a __a and \a __b as input; other bits in these
2732 /// result[31:16] := __b[79:64]
2747 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2750 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2752 /// \returns A 256-bit vector of [16 x i16] containing the result.
2756 …levector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14,…  in _mm256_unpackhi_epi16()
2759 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2760 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2761 ///    of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2781 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2784 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2786 /// \returns A 256-bit vector of [8 x i32] containing the result.
2793 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2794 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2795 ///    of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2811 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2814 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2816 /// \returns A 256-bit vector of [4 x i64] containing the result.
2823 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2824 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2825 ///    uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2831 /// result[23:16] := __a[15:8]
2845 ///    A 256-bit integer vector used as the source for the even-numbered bytes
2848 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
2850 /// \returns A 256-bit integer vector containing the result.
2854 …0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, …  in _mm256_unpacklo_epi8()
2857 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2858 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2859 ///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2860 ///    128-bit half of \a __a and \a __b as input; other bits in these
2865 /// result[31:16] := __b[15:0]
2866 /// result[47:32] := __a[31:16]
2867 /// result[63:48] := __b[31:16]
2880 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2883 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2885 /// \returns A 256-bit vector of [16 x i16] containing the result.
2889 …fflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 1…  in _mm256_unpacklo_epi16()
2892 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2893 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2894 ///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2914 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2917 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2919 /// \returns A 256-bit vector of [8 x i32] containing the result.
2926 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2927 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2928 ///    of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2944 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2947 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2949 /// \returns A 256-bit vector of [4 x i64] containing the result.
2956 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2964 ///    A 256-bit integer vector.
2966 ///    A 256-bit integer vector.
2967 /// \returns A 256-bit integer vector containing the result.
2974 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2975 ///   memory hint and returns the vector. \a __V must be aligned on a 32-byte
2983 ///    A pointer to the 32-byte aligned memory containing the vector to load.
2984 /// \returns A 256-bit integer vector loaded from memory.
2992 /// Broadcasts the 32-bit floating-point value from the low element of the
2993 ///    128-bit vector of [4 x float] in \a __X to all elements of the result's
2994 ///    128-bit vector of [4 x float].
3001 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3002 /// \returns A 128-bit vector of [4 x float] containing the result.
3009 /// Broadcasts the 64-bit floating-point value from the low element of the
3010 ///    128-bit vector of [2 x double] in \a __a to both elements of the
3011 ///    result's 128-bit vector of [2 x double].
3018 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3019 /// \returns A 128-bit vector of [2 x double] containing the result.
3026 /// Broadcasts the 32-bit floating-point value from the low element of the
3027 ///    128-bit vector of [4 x float] in \a __X to all elements of the
3028 ///    result's 256-bit vector of [8 x float].
3035 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3036 /// \returns A 256-bit vector of [8 x float] containing the result.
3043 /// Broadcasts the 64-bit floating-point value from the low element of the
3044 ///    128-bit vector of [2 x double] in \a __X to all elements of the
3045 ///    result's 256-bit vector of [4 x double].
3052 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3053 /// \returns A 256-bit vector of [4 x double] containing the result.
3060 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
3061 ///    upper halves of the 256-bit result.
3068 ///    A 128-bit integer vector to be broadcast.
3069 /// \returns A 256-bit integer vector containing the result.
3078 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
3079 ///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3102 ///    A 128-bit vector of [4 x i32] containing source values.
3104 ///    A 128-bit vector of [4 x i32] containing source values.
3106 ///    An immediate 8-bit integer operand, with bits [3:0] specifying the
3107 ///    source for each element of the result. The position of the mask bit
3108 ///    corresponds to the index of a copied value. When a mask bit is 0, the
3110 /// \returns A 128-bit vector of [4 x i32] containing the result.
3115 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
3116 ///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3139 ///    A 256-bit vector of [8 x i32] containing source values.
3141 ///    A 256-bit vector of [8 x i32] containing source values.
3143 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
3144 ///    source for each element of the result. The position of the mask bit
3145 ///    corresponds to the index of a copied value. When a mask bit is 0, the
3147 /// \returns A 256-bit vector of [8 x i32] containing the result.
3152 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3153 ///    bytes of the 256-bit result.
3160 ///    A 128-bit integer vector whose low byte will be broadcast.
3161 /// \returns A 256-bit integer vector containing the result.
3168 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3169 ///    to all elements of the result's 256-bit vector of [16 x i16].
3176 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3177 /// \returns A 256-bit vector of [16 x i16] containing the result.
3184 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3185 ///    to all elements of the result's 256-bit vector of [8 x i32].
3192 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3193 /// \returns A 256-bit vector of [8 x i32] containing the result.
3200 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3201 ///    to all elements of the result's 256-bit vector of [4 x i64].
3208 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3209 /// \returns A 256-bit vector of [4 x i64] containing the result.
3216 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3217 ///    bytes of the 128-bit result.
3224 ///    A 128-bit integer vector whose low byte will be broadcast.
3225 /// \returns A 128-bit integer vector containing the result.
3232 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3233 ///    \a __X to all elements of the result's 128-bit vector of [8 x i16].
3240 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3241 /// \returns A 128-bit vector of [8 x i16] containing the result.
3248 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3256 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3257 /// \returns A 128-bit vector of [4 x i32] containing the result.
3264 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3265 ///    to both elements of the result's 128-bit vector of [2 x i64].
3272 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3273 /// \returns A 128-bit vector of [2 x i64] containing the result.
3280 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3281 ///    256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3282 ///    elements of the 256-bit vector of [8 x i32] in \a __b.
3297 ///    A 256-bit vector of [8 x i32] containing the source values.
3299 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3301 /// \returns A 256-bit vector of [8 x i32] containing the result.
3308 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3309 ///    the 256-bit vector of [4 x double] in \a V as specified by the
3329 ///    A 256-bit vector of [4 x double] containing the source values.
3331 ///    An immediate 8-bit value specifying which elements to copy from \a V.
3334 /// \returns A 256-bit vector of [4 x double] containing the result.
3338 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3339 ///    the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3340 ///    the elements of the 256-bit vector of [8 x i32] in \a __b.
3355 ///    A 256-bit vector of [8 x float] containing the source values.
3357 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3359 /// \returns A 256-bit vector of [8 x float] containing the result.
3366 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3367 ///    of the 256-bit vector of [4 x i64] in \a V as specified by the
3387 ///    A 256-bit vector of [4 x i64] containing the source values.
3389 ///    An immediate 8-bit value specifying which elements to copy from \a V.
3392 /// \returns A 256-bit vector of [4 x i64] containing the result.
3396 /// Sets each half of the 256-bit result either to zero or to one of the
3397 ///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3426 ///    A 256-bit integer vector containing source values.
3428 ///    A 256-bit integer vector containing source values.
3432 ///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
3438 /// \returns A 256-bit integer vector containing the result.
3442 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3455 ///    A 256-bit integer vector containing the source values.
3458 /// \returns A 128-bit integer vector containing the result.
3462 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3463 ///     result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3476 ///    A 256-bit integer vector containing a source value.
3478 ///    A 128-bit integer vector containing a source value.
3481 /// \returns A 256-bit integer vector containing the result.
3486 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3487 ///    the most significant bit of the corresponding element in the mask
3489 ///    Returns the 256-bit [8 x i32] result.
3509 ///    A 256-bit vector of [8 x i32] containing the mask bits.
3510 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3518 /// Conditionally loads four 64-bit integer elements from memory \a __X, if
3519 ///    the most significant bit of the corresponding element in the mask
3521 ///    Returns the 256-bit [4 x i64] result.
3541 ///    A 256-bit vector of [4 x i64] containing the mask bits.
3542 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3550 /// Conditionally loads four 32-bit integer elements from memory \a __X, if
3551 ///    the most significant bit of the corresponding element in the mask
3553 ///    Returns the 128-bit [4 x i32] result.
3573 ///    A 128-bit vector of [4 x i32] containing the mask bits.
3574 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3582 /// Conditionally loads two 64-bit integer elements from memory \a __X, if
3583 ///    the most significant bit of the corresponding element in the mask
3585 ///    Returns the 128-bit [2 x i64] result.
3605 ///    A 128-bit vector of [2 x i64] containing the mask bits.
3606 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3614 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3615 ///    of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3635 ///    A 256-bit vector of [8 x i32] containing the mask bits.
3637 ///    A 256-bit vector of [8 x i32] containing the values to store.
3644 /// Conditionally stores four 64-bit integer elements from the 256-bit vector
3645 ///    of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3665 ///    A 256-bit vector of [4 x i64] containing the mask bits.
3667 ///    A 256-bit vector of [4 x i64] containing the values to store.
3674 /// Conditionally stores four 32-bit integer elements from the 128-bit vector
3675 ///    of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3695 ///    A 128-bit vector of [4 x i32] containing the mask bits.
3697 ///    A 128-bit vector of [4 x i32] containing the values to store.
3704 /// Conditionally stores two 64-bit integer elements from the 128-bit vector
3705 ///    of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3725 ///    A 128-bit vector of [2 x i64] containing the mask bits.
3727 ///    A 128-bit vector of [2 x i64] containing the values to store.
3734 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3736 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3745 ///    A 256-bit vector of [8 x i32] to be shifted.
3747 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3749 /// \returns A 256-bit vector of [8 x i32] containing the result.
3756 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3758 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3767 ///    A 128-bit vector of [4 x i32] to be shifted.
3769 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3771 /// \returns A 128-bit vector of [4 x i32] containing the result.
3778 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3780 ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3789 ///    A 256-bit vector of [4 x i64] to be shifted.
3791 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3793 /// \returns A 256-bit vector of [4 x i64] containing the result.
3800 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3802 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3811 ///    A 128-bit vector of [2 x i64] to be shifted.
3813 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3815 /// \returns A 128-bit vector of [2 x i64] containing the result.
3822 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3824 ///    256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3826 ///    31, the result for that element is 0 or -1 according to the sign bit
3834 ///    A 256-bit vector of [8 x i32] to be shifted.
3836 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3838 /// \returns A 256-bit vector of [8 x i32] containing the result.
3845 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3847 ///    128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3849 ///    31, the result for that element is 0 or -1 according to the sign bit
3857 ///    A 128-bit vector of [4 x i32] to be shifted.
3859 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3861 /// \returns A 128-bit vector of [4 x i32] containing the result.
3868 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3870 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3879 ///    A 256-bit vector of [8 x i32] to be shifted.
3881 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3883 /// \returns A 256-bit vector of [8 x i32] containing the result.
3890 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3892 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3901 ///    A 128-bit vector of [4 x i32] to be shifted.
3903 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3905 /// \returns A 128-bit vector of [4 x i32] containing the result.
3912 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3914 ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3923 ///    A 256-bit vector of [4 x i64] to be shifted.
3925 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3927 /// \returns A 256-bit vector of [4 x i64] containing the result.
3934 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3936 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3945 ///    A 128-bit vector of [2 x i64] to be shifted.
3947 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3949 /// \returns A 128-bit vector of [2 x i64] containing the result.
3956 /// Conditionally gathers two 64-bit floating-point values, either from the
3957 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3958 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3983 ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
3988 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3991 ///    A 128-bit vector of [2 x double] containing the mask. The most
3992 ///    significant bit of each element in the mask vector represents the mask
3993 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
3998 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4005 /// Conditionally gathers four 64-bit floating-point values, either from the
4006 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4007 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4032 ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
4037 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4039 ///    A 256-bit vector of [4 x double] containing the mask. The most
4040 ///    significant bit of each element in the mask vector represents the mask
4041 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
4046 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4053 /// Conditionally gathers two 64-bit floating-point values, either from the
4054 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4055 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4080 ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
4085 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4087 ///    A 128-bit vector of [2 x double] containing the mask. The most
4088 ///    significant bit of each element in the mask vector represents the mask
4089 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
4094 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4101 /// Conditionally gathers four 64-bit floating-point values, either from the
4102 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4103 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4128 ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
4133 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4135 ///    A 256-bit vector of [4 x double] containing the mask. The most
4136 ///    significant bit of each element in the mask vector represents the mask
4137 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
4142 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4149 /// Conditionally gathers four 32-bit floating-point values, either from the
4150 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4151 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4176 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4181 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4183 ///    A 128-bit vector of [4 x float] containing the mask. The most
4184 ///    significant bit of each element in the mask vector represents the mask
4185 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
4190 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4197 /// Conditionally gathers eight 32-bit floating-point values, either from the
4198 ///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4199 ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4224 ///    A 256-bit vector of [8 x float] used as the source when a mask bit is
4229 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4231 ///    A 256-bit vector of [8 x float] containing the mask. The most
4232 ///    significant bit of each element in the mask vector represents the mask
4233 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
4238 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
4245 /// Conditionally gathers two 32-bit floating-point values, either from the
4246 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4247 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4274 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4279 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4281 ///    A 128-bit vector of [4 x float] containing the mask. The most
4282 ///    significant bit of each element in the mask vector represents the mask
4283 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
4289 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4296 /// Conditionally gathers four 32-bit floating-point values, either from the
4297 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4298 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4323 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4328 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4330 ///    A 128-bit vector of [4 x float] containing the mask. The most
4331 ///    significant bit of each element in the mask vector represents the mask
4332 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
4337 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4344 /// Conditionally gathers four 32-bit integer values, either from the
4345 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4346 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4371 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4376 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4378 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4379 ///    bit of each element in the mask vector represents the mask bits. If a
4380 ///    mask bit is zero, the corresponding value from vector \a a is gathered;
4385 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4392 /// Conditionally gathers eight 32-bit integer values, either from the
4393 ///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4394 ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4419 ///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
4424 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4426 ///    A 256-bit vector of [8 x i32] containing the mask. The most significant
4427 ///    bit of each element in the mask vector represents the mask bits. If a
4428 ///    mask bit is zero, the corresponding value from vector \a a is gathered;
4433 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
4440 /// Conditionally gathers two 32-bit integer values, either from the
4441 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4442 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4469 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4474 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4476 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4477 ///    bit of each element in the mask vector represents the mask bits. If a
4478 ///    mask bit is zero, the corresponding value from vector \a a is gathered;
4484 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4491 /// Conditionally gathers four 32-bit integer values, either from the
4492 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4493 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4518 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4523 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4525 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4526 ///    bit of each element in the mask vector represents the mask bits. If a
4527 ///    mask bit is zero, the corresponding value from vector \a a is gathered;
4532 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4539 /// Conditionally gathers two 64-bit integer values, either from the
4540 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4541 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4566 ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
4571 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4574 ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
4575 ///    bit of each element in the mask vector represents the mask bits. If a
4576 ///    mask bit is zero, the corresponding value from vector \a a is gathered;
4581 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4588 /// Conditionally gathers four 64-bit integer values, either from the
4589 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4590 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4615 ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
4620 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4622 ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
4623 ///    bit of each element in the mask vector represents the mask bits. If a
4624 ///    mask bit is zero, the corresponding value from vector \a a is gathered;
4629 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4636 /// Conditionally gathers two 64-bit integer values, either from the
4637 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4638 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4663 ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
4668 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4670 ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
4671 ///    bit of each element in the mask vector represents the mask bits. If a
4672 ///    mask bit is zero, the corresponding value from vector \a a is gathered;
4677 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4684 /// Conditionally gathers four 64-bit integer values, either from the
4685 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4686 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4711 ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
4716 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4718 ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
4719 ///    bit of each element in the mask vector represents the mask bits. If a
4720 ///    mask bit is zero, the corresponding value from vector \a a is gathered;
4725 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4732 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4733 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4754 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4759 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4768 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4769 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4790 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4794 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4804 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4805 ///    indexes from the 128-bit vector of [2 x i64] in \a i.
4826 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4830 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4839 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4840 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
4861 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4865 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4875 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4876 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4897 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4901 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4910 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
4911 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
4932 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4936 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
4946 /// Gathers two 32-bit floating-point values from memory \a m using scaled
4947 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4970 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4974 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4983 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4984 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
5005 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5009 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
5018 /// Gathers four 32-bit integer values from memory \a m using scaled
5019 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
5040 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5044 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5048                                      (__v4si)_mm_set1_epi32(-1), (s)))
5050 /// Gathers eight 32-bit integer values from memory \a m using scaled
5051 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
5072 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5076 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5080                                         (__v8si)_mm256_set1_epi32(-1), (s)))
5082 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
5083 ///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5106 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5110 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5114                                      (__v4si)_mm_set1_epi32(-1), (s)))
5116 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
5117 ///    from the 256-bit vector of [4 x i64] in \a i.
5138 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5142 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5146                                         (__v4si)_mm_set1_epi32(-1), (s)))
5148 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5149 ///    from the 128-bit vector of [4 x i32] in \a i.
5170 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5175 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5180                                      (__v2di)_mm_set1_epi64x(-1), (s)))
5182 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5183 ///    from the 128-bit vector of [4 x i32] in \a i.
5204 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5208 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5213                                         (__v4di)_mm256_set1_epi64x(-1), (s)))
5215 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5216 ///    from the 128-bit vector of [2 x i64] in \a i.
5237 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5241 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5246                                      (__v2di)_mm_set1_epi64x(-1), (s)))
5248 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5249 ///    from the 256-bit vector of [4 x i64] in \a i.
5270 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5274 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5279                                         (__v4di)_mm256_set1_epi64x(-1), (s)))