/* xref: /freebsd-src/contrib/llvm-project/clang/lib/Headers/smmintrin.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583) */
/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

100b57cec5SDimitry Andric #ifndef __SMMINTRIN_H
110b57cec5SDimitry Andric #define __SMMINTRIN_H
120b57cec5SDimitry Andric 
13349cc55cSDimitry Andric #if !defined(__i386__) && !defined(__x86_64__)
14349cc55cSDimitry Andric #error "This header is only meant to be used on x86 and x64 architecture"
15349cc55cSDimitry Andric #endif
16349cc55cSDimitry Andric 
170b57cec5SDimitry Andric #include <tmmintrin.h>
180b57cec5SDimitry Andric 
/* Define the default attributes for the functions in this file:
 *   __always_inline__          - always inline the intrinsic wrapper.
 *   __nodebug__                - emit no debug info for the wrapper itself.
 *   __target__("sse4.1,...")   - enable SSE4.1 (and disable evex512) for the
 *                                wrapper body.
 *   __min_vector_width__(128)  - the wrappers operate on 128-bit vectors.
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse4.1,no-evex512"), __min_vector_width__(128)))

/* SSE4 Rounding macros. */

/* Rounding-control selectors. Values 0x00-0x03 occupy bits [1:0] of the
 * rounding immediate; _MM_FROUND_CUR_DIRECTION sets bit [2], which selects the
 * current MXCSR rounding mode instead of bits [1:0]. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Precision-exception control, bit [3] of the rounding immediate. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Convenience combinations of a rounding selector and an exception control. */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

/// Rounds up each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)

/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)

/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M)                                                     \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to an integer value using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M)                                                  \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
                                  (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M)                                                     \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to an integer value using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M)                                                  \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                   (M)))

/* SSE4 Packed Blending Intrinsics.  */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M)                                                \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
                                   (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M)                                                \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
                                  (int)(M)))

4160b57cec5SDimitry Andric /// Returns a 128-bit vector of [2 x double] where the values are
4170b57cec5SDimitry Andric ///    selected from either the first or second operand as specified by the
4180b57cec5SDimitry Andric ///    third operand, the control mask.
4190b57cec5SDimitry Andric ///
4200b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
4210b57cec5SDimitry Andric ///
4220b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
4230b57cec5SDimitry Andric ///
4240b57cec5SDimitry Andric /// \param __V1
4250b57cec5SDimitry Andric ///    A 128-bit vector of [2 x double].
4260b57cec5SDimitry Andric /// \param __V2
4270b57cec5SDimitry Andric ///    A 128-bit vector of [2 x double].
4280b57cec5SDimitry Andric /// \param __M
4290b57cec5SDimitry Andric ///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
4300b57cec5SDimitry Andric ///    values are to be copied. The position of the mask bit corresponds to the
4310b57cec5SDimitry Andric ///    most significant bit of a copied value. When a mask bit is 0, the
4320b57cec5SDimitry Andric ///    corresponding 64-bit element in operand \a __V1 is copied to the same
4330b57cec5SDimitry Andric ///    position in the result. When a mask bit is 1, the corresponding 64-bit
4340b57cec5SDimitry Andric ///    element in operand \a __V2 is copied to the same position in the result.
4350b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the copied values.
43681ad6265SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
43781ad6265SDimitry Andric                                                            __m128d __V2,
43881ad6265SDimitry Andric                                                            __m128d __M) {
4390b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
4400b57cec5SDimitry Andric                                           (__v2df)__M);
4410b57cec5SDimitry Andric }
4420b57cec5SDimitry Andric 
4430b57cec5SDimitry Andric /// Returns a 128-bit vector of [4 x float] where the values are
4440b57cec5SDimitry Andric ///    selected from either the first or second operand as specified by the
4450b57cec5SDimitry Andric ///    third operand, the control mask.
4460b57cec5SDimitry Andric ///
4470b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
4480b57cec5SDimitry Andric ///
4490b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
4500b57cec5SDimitry Andric ///
4510b57cec5SDimitry Andric /// \param __V1
4520b57cec5SDimitry Andric ///    A 128-bit vector of [4 x float].
4530b57cec5SDimitry Andric /// \param __V2
4540b57cec5SDimitry Andric ///    A 128-bit vector of [4 x float].
4550b57cec5SDimitry Andric /// \param __M
4560b57cec5SDimitry Andric ///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
4570b57cec5SDimitry Andric ///    how the values are to be copied. The position of the mask bit corresponds
4580b57cec5SDimitry Andric ///    to the most significant bit of a copied value. When a mask bit is 0, the
4590b57cec5SDimitry Andric ///    corresponding 32-bit element in operand \a __V1 is copied to the same
4600b57cec5SDimitry Andric ///    position in the result. When a mask bit is 1, the corresponding 32-bit
4610b57cec5SDimitry Andric ///    element in operand \a __V2 is copied to the same position in the result.
4620b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the copied values.
46381ad6265SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
46481ad6265SDimitry Andric                                                           __m128 __V2,
46581ad6265SDimitry Andric                                                           __m128 __M) {
4660b57cec5SDimitry Andric   return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
4670b57cec5SDimitry Andric                                          (__v4sf)__M);
4680b57cec5SDimitry Andric }
4690b57cec5SDimitry Andric 
4700b57cec5SDimitry Andric /// Returns a 128-bit vector of [16 x i8] where the values are selected
4710b57cec5SDimitry Andric ///    from either of the first or second operand as specified by the third
4720b57cec5SDimitry Andric ///    operand, the control mask.
4730b57cec5SDimitry Andric ///
4740b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
4750b57cec5SDimitry Andric ///
4760b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
4770b57cec5SDimitry Andric ///
4780b57cec5SDimitry Andric /// \param __V1
4790b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8].
4800b57cec5SDimitry Andric /// \param __V2
4810b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8].
4820b57cec5SDimitry Andric /// \param __M
4830b57cec5SDimitry Andric ///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
4840b57cec5SDimitry Andric ///    how the values are to be copied. The position of the mask bit corresponds
4850b57cec5SDimitry Andric ///    to the most significant bit of a copied value. When a mask bit is 0, the
4860b57cec5SDimitry Andric ///    corresponding 8-bit element in operand \a __V1 is copied to the same
4870b57cec5SDimitry Andric ///    position in the result. When a mask bit is 1, the corresponding 8-bit
4880b57cec5SDimitry Andric ///    element in operand \a __V2 is copied to the same position in the result.
4890b57cec5SDimitry Andric /// \returns A 128-bit vector of [16 x i8] containing the copied values.
49081ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
49181ad6265SDimitry Andric                                                              __m128i __V2,
49281ad6265SDimitry Andric                                                              __m128i __M) {
4930b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
4940b57cec5SDimitry Andric                                              (__v16qi)__M);
4950b57cec5SDimitry Andric }
4960b57cec5SDimitry Andric 
4970b57cec5SDimitry Andric /// Returns a 128-bit vector of [8 x i16] where the values are selected
4980b57cec5SDimitry Andric ///    from either of the first or second operand as specified by the third
4990b57cec5SDimitry Andric ///    operand, the control mask.
5000b57cec5SDimitry Andric ///
5010b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
5020b57cec5SDimitry Andric ///
5030b57cec5SDimitry Andric /// \code
5040b57cec5SDimitry Andric /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
5050b57cec5SDimitry Andric /// \endcode
5060b57cec5SDimitry Andric ///
5070b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
5080b57cec5SDimitry Andric ///
5090b57cec5SDimitry Andric /// \param V1
5100b57cec5SDimitry Andric ///    A 128-bit vector of [8 x i16].
5110b57cec5SDimitry Andric /// \param V2
5120b57cec5SDimitry Andric ///    A 128-bit vector of [8 x i16].
5130b57cec5SDimitry Andric /// \param M
5140b57cec5SDimitry Andric ///    An immediate integer operand, with mask bits [7:0] specifying how the
5150b57cec5SDimitry Andric ///    values are to be copied. The position of the mask bit corresponds to the
5160b57cec5SDimitry Andric ///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
5170b57cec5SDimitry Andric ///    element in operand \a V1 is copied to the same position in the result.
5180b57cec5SDimitry Andric ///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
5190b57cec5SDimitry Andric ///    is copied to the same position in the result.
5200b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the copied values.
/* Macro form: V1 and V2 are cast through __m128i to __v8hi, M is cast to int,
   and all three are forwarded to __builtin_ia32_pblendw128 in one expression.
   The builtin result is cast back to __m128i.  */
5210b57cec5SDimitry Andric #define _mm_blend_epi16(V1, V2, M)                                             \
522349cc55cSDimitry Andric   ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
523349cc55cSDimitry Andric                                       (__v8hi)(__m128i)(V2), (int)(M)))
5240b57cec5SDimitry Andric 
5250b57cec5SDimitry Andric /* SSE4 Dword Multiply Instructions.  */
5260b57cec5SDimitry Andric /// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
5270b57cec5SDimitry Andric ///    and returns the lower 32 bits of each product in a 128-bit vector of
5280b57cec5SDimitry Andric ///    [4 x i32].
5290b57cec5SDimitry Andric ///
5300b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
5310b57cec5SDimitry Andric ///
5320b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
5330b57cec5SDimitry Andric ///
5340b57cec5SDimitry Andric /// \param __V1
5350b57cec5SDimitry Andric ///    A 128-bit integer vector.
5360b57cec5SDimitry Andric /// \param __V2
5370b57cec5SDimitry Andric ///    A 128-bit integer vector.
5380b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the products of both operands.
53981ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
54081ad6265SDimitry Andric                                                              __m128i __V2) {
5410b57cec5SDimitry Andric   return (__m128i)((__v4su)__V1 * (__v4su)__V2);
5420b57cec5SDimitry Andric }
5430b57cec5SDimitry Andric 
5440b57cec5SDimitry Andric /// Multiplies corresponding even-indexed elements of two 128-bit
5450b57cec5SDimitry Andric ///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
5460b57cec5SDimitry Andric ///    containing the products.
5470b57cec5SDimitry Andric ///
5480b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
5490b57cec5SDimitry Andric ///
5500b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
5510b57cec5SDimitry Andric ///
5520b57cec5SDimitry Andric /// \param __V1
5530b57cec5SDimitry Andric ///    A 128-bit vector of [4 x i32].
5540b57cec5SDimitry Andric /// \param __V2
5550b57cec5SDimitry Andric ///    A 128-bit vector of [4 x i32].
5560b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the products of both
5570b57cec5SDimitry Andric ///    operands.
55881ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
55981ad6265SDimitry Andric                                                            __m128i __V2) {
5600b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
5610b57cec5SDimitry Andric }
5620b57cec5SDimitry Andric 
5630b57cec5SDimitry Andric /* SSE4 Floating Point Dot Product Instructions.  */
5640b57cec5SDimitry Andric /// Computes the dot product of the two 128-bit vectors of [4 x float]
5650b57cec5SDimitry Andric ///    and returns it in the elements of the 128-bit result vector of
5660b57cec5SDimitry Andric ///    [4 x float].
5670b57cec5SDimitry Andric ///
5680b57cec5SDimitry Andric ///    The immediate integer operand controls which input elements
5690b57cec5SDimitry Andric ///    will contribute to the dot product, and where the final results are
5700b57cec5SDimitry Andric ///    returned.
5710b57cec5SDimitry Andric ///
5720b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
5730b57cec5SDimitry Andric ///
5740b57cec5SDimitry Andric /// \code
5750b57cec5SDimitry Andric /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
5760b57cec5SDimitry Andric /// \endcode
5770b57cec5SDimitry Andric ///
5780b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
5790b57cec5SDimitry Andric ///
5800b57cec5SDimitry Andric /// \param X
5810b57cec5SDimitry Andric ///    A 128-bit vector of [4 x float].
5820b57cec5SDimitry Andric /// \param Y
5830b57cec5SDimitry Andric ///    A 128-bit vector of [4 x float].
5840b57cec5SDimitry Andric /// \param M
5850b57cec5SDimitry Andric ///    An immediate integer operand. Mask bits [7:4] determine which elements
5860b57cec5SDimitry Andric ///    of the input vectors are used, with bit [4] corresponding to the lowest
5870b57cec5SDimitry Andric ///    element and bit [7] corresponding to the highest element of each [4 x
5880b57cec5SDimitry Andric ///    float] vector. If a bit is set, the corresponding elements from the two
5890b57cec5SDimitry Andric ///    input vectors are used as an input for dot product; otherwise that input
5900b57cec5SDimitry Andric ///    is treated as zero. Bits [3:0] determine which elements of the result
5910b57cec5SDimitry Andric ///    will receive a copy of the final dot product, with bit [0] corresponding
5920b57cec5SDimitry Andric ///    to the lowest element and bit [3] corresponding to the highest element of
5930b57cec5SDimitry Andric ///    the [4 x float] result. If a bit is set, the dot product is returned
5940b57cec5SDimitry Andric ///    in the corresponding element; otherwise that element is set to zero.
5950b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the dot product.
/* Macro form: X and Y are cast through __m128 to __v4sf. M is forwarded to
   __builtin_ia32_dpps as written, with no integer cast applied here.  */
5960b57cec5SDimitry Andric #define _mm_dp_ps(X, Y, M)                                                     \
59781ad6265SDimitry Andric   ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
5980b57cec5SDimitry Andric 
5990b57cec5SDimitry Andric /// Computes the dot product of the two 128-bit vectors of [2 x double]
6000b57cec5SDimitry Andric ///    and returns it in the elements of the 128-bit result vector of
6010b57cec5SDimitry Andric ///    [2 x double].
6020b57cec5SDimitry Andric ///
6030b57cec5SDimitry Andric ///    The immediate integer operand controls which input
6040b57cec5SDimitry Andric ///    elements will contribute to the dot product, and where the final results
6050b57cec5SDimitry Andric ///    are returned.
6060b57cec5SDimitry Andric ///
6070b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
6080b57cec5SDimitry Andric ///
6090b57cec5SDimitry Andric /// \code
6100b57cec5SDimitry Andric /// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
6110b57cec5SDimitry Andric /// \endcode
6120b57cec5SDimitry Andric ///
6130b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
6140b57cec5SDimitry Andric ///
6150b57cec5SDimitry Andric /// \param X
6160b57cec5SDimitry Andric ///    A 128-bit vector of [2 x double].
6170b57cec5SDimitry Andric /// \param Y
6180b57cec5SDimitry Andric ///    A 128-bit vector of [2 x double].
6190b57cec5SDimitry Andric /// \param M
6200b57cec5SDimitry Andric ///    An immediate integer operand. Mask bits [5:4] determine which elements
6210b57cec5SDimitry Andric ///    of the input vectors are used, with bit [4] corresponding to the lowest
6220b57cec5SDimitry Andric ///    element and bit [5] corresponding to the highest element of each [2 x
6230b57cec5SDimitry Andric ///    double] vector. If a bit is set, the corresponding elements from the two
6240b57cec5SDimitry Andric ///    input vectors are used as an input for dot product; otherwise that input
6250b57cec5SDimitry Andric ///    is treated as zero. Bits [1:0] determine which elements of the result
6260b57cec5SDimitry Andric ///    will receive a copy of the final dot product, with bit [0] corresponding
6270b57cec5SDimitry Andric ///    to the lowest element and bit [1] corresponding to the highest element of
6280b57cec5SDimitry Andric ///    each [2 x double] vector. If a bit is set, the dot product is returned in
6290b57cec5SDimitry Andric ///    the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
/* Macro form: X and Y are cast through __m128d to __v2df. M is forwarded to
   __builtin_ia32_dppd as written, with no integer cast applied here.  */
6300b57cec5SDimitry Andric #define _mm_dp_pd(X, Y, M)                                                     \
63181ad6265SDimitry Andric   ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
63281ad6265SDimitry Andric                                 (M)))
6330b57cec5SDimitry Andric 
6340b57cec5SDimitry Andric /* SSE4 Streaming Load Hint Instruction.  */
6350b57cec5SDimitry Andric /// Loads integer values from a 128-bit aligned memory location to a
6360b57cec5SDimitry Andric ///    128-bit integer vector.
6370b57cec5SDimitry Andric ///
6380b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
6390b57cec5SDimitry Andric ///
6400b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
6410b57cec5SDimitry Andric ///
6420b57cec5SDimitry Andric /// \param __V
6430b57cec5SDimitry Andric ///    A pointer to a 128-bit aligned memory location that contains the integer
6440b57cec5SDimitry Andric ///    values.
6450b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the data stored at the
6460b57cec5SDimitry Andric ///    specified memory location.
6470b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS
6485f757f3fSDimitry Andric _mm_stream_load_si128(const void *__V) {
6490b57cec5SDimitry Andric   return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
6500b57cec5SDimitry Andric }
6510b57cec5SDimitry Andric 
6520b57cec5SDimitry Andric /* SSE4 Packed Integer Min/Max Instructions.  */
6530b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of
6540b57cec5SDimitry Andric ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
6550b57cec5SDimitry Andric ///    of the two values.
6560b57cec5SDimitry Andric ///
6570b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
6580b57cec5SDimitry Andric ///
6590b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
6600b57cec5SDimitry Andric ///
6610b57cec5SDimitry Andric /// \param __V1
6620b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8].
6630b57cec5SDimitry Andric /// \param __V2
6640b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8].
6650b57cec5SDimitry Andric /// \returns A 128-bit vector of [16 x i8] containing the lesser values.
66681ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
66781ad6265SDimitry Andric                                                           __m128i __V2) {
66804eeddc0SDimitry Andric   return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
6690b57cec5SDimitry Andric }
6700b57cec5SDimitry Andric 
6710b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of
6720b57cec5SDimitry Andric ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
6730b57cec5SDimitry Andric ///    greater value of the two.
6740b57cec5SDimitry Andric ///
6750b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
6760b57cec5SDimitry Andric ///
6770b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
6780b57cec5SDimitry Andric ///
6790b57cec5SDimitry Andric /// \param __V1
6800b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8].
6810b57cec5SDimitry Andric /// \param __V2
6820b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8].
6830b57cec5SDimitry Andric /// \returns A 128-bit vector of [16 x i8] containing the greater values.
68481ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
68581ad6265SDimitry Andric                                                           __m128i __V2) {
68604eeddc0SDimitry Andric   return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
6870b57cec5SDimitry Andric }
6880b57cec5SDimitry Andric 
6890b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of
6900b57cec5SDimitry Andric ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
6910b57cec5SDimitry Andric ///    value of the two.
6920b57cec5SDimitry Andric ///
6930b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
6940b57cec5SDimitry Andric ///
6950b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
6960b57cec5SDimitry Andric ///
6970b57cec5SDimitry Andric /// \param __V1
6980b57cec5SDimitry Andric ///    A 128-bit vector of [8 x u16].
6990b57cec5SDimitry Andric /// \param __V2
7000b57cec5SDimitry Andric ///    A 128-bit vector of [8 x u16].
7010b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x u16] containing the lesser values.
70281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
70381ad6265SDimitry Andric                                                            __m128i __V2) {
70404eeddc0SDimitry Andric   return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
7050b57cec5SDimitry Andric }
7060b57cec5SDimitry Andric 
7070b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of
7080b57cec5SDimitry Andric ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
7090b57cec5SDimitry Andric ///    greater value of the two.
7100b57cec5SDimitry Andric ///
7110b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
7120b57cec5SDimitry Andric ///
7130b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
7140b57cec5SDimitry Andric ///
7150b57cec5SDimitry Andric /// \param __V1
7160b57cec5SDimitry Andric ///    A 128-bit vector of [8 x u16].
7170b57cec5SDimitry Andric /// \param __V2
7180b57cec5SDimitry Andric ///    A 128-bit vector of [8 x u16].
7190b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x u16] containing the greater values.
72081ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
72181ad6265SDimitry Andric                                                            __m128i __V2) {
72204eeddc0SDimitry Andric   return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
7230b57cec5SDimitry Andric }
7240b57cec5SDimitry Andric 
7250b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of
7260b57cec5SDimitry Andric ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
7270b57cec5SDimitry Andric ///    value of the two.
7280b57cec5SDimitry Andric ///
7290b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
7300b57cec5SDimitry Andric ///
7310b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
7320b57cec5SDimitry Andric ///
7330b57cec5SDimitry Andric /// \param __V1
7340b57cec5SDimitry Andric ///    A 128-bit vector of [4 x i32].
7350b57cec5SDimitry Andric /// \param __V2
7360b57cec5SDimitry Andric ///    A 128-bit vector of [4 x i32].
7370b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the lesser values.
73881ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
73981ad6265SDimitry Andric                                                            __m128i __V2) {
74004eeddc0SDimitry Andric   return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
7410b57cec5SDimitry Andric }
7420b57cec5SDimitry Andric 
7430b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of
7440b57cec5SDimitry Andric ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
7450b57cec5SDimitry Andric ///    greater value of the two.
7460b57cec5SDimitry Andric ///
7470b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
7480b57cec5SDimitry Andric ///
7490b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
7500b57cec5SDimitry Andric ///
7510b57cec5SDimitry Andric /// \param __V1
7520b57cec5SDimitry Andric ///    A 128-bit vector of [4 x i32].
7530b57cec5SDimitry Andric /// \param __V2
7540b57cec5SDimitry Andric ///    A 128-bit vector of [4 x i32].
7550b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the greater values.
75681ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
75781ad6265SDimitry Andric                                                            __m128i __V2) {
75804eeddc0SDimitry Andric   return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
7590b57cec5SDimitry Andric }
7600b57cec5SDimitry Andric 
7610b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of
7620b57cec5SDimitry Andric ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
7630b57cec5SDimitry Andric ///    value of the two.
7640b57cec5SDimitry Andric ///
7650b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
7660b57cec5SDimitry Andric ///
7670b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
7680b57cec5SDimitry Andric ///
7690b57cec5SDimitry Andric /// \param __V1
7700b57cec5SDimitry Andric ///    A 128-bit vector of [4 x u32].
7710b57cec5SDimitry Andric /// \param __V2
7720b57cec5SDimitry Andric ///    A 128-bit vector of [4 x u32].
7730b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x u32] containing the lesser values.
77481ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
77581ad6265SDimitry Andric                                                            __m128i __V2) {
77604eeddc0SDimitry Andric   return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
7770b57cec5SDimitry Andric }
7780b57cec5SDimitry Andric 
7790b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of
7800b57cec5SDimitry Andric ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
7810b57cec5SDimitry Andric ///    greater value of the two.
7820b57cec5SDimitry Andric ///
7830b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
7840b57cec5SDimitry Andric ///
7850b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
7860b57cec5SDimitry Andric ///
7870b57cec5SDimitry Andric /// \param __V1
7880b57cec5SDimitry Andric ///    A 128-bit vector of [4 x u32].
7890b57cec5SDimitry Andric /// \param __V2
7900b57cec5SDimitry Andric ///    A 128-bit vector of [4 x u32].
7910b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x u32] containing the greater values.
79281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
79381ad6265SDimitry Andric                                                            __m128i __V2) {
79404eeddc0SDimitry Andric   return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
7950b57cec5SDimitry Andric }
7960b57cec5SDimitry Andric 
7970b57cec5SDimitry Andric /* SSE4 Insertion and Extraction from XMM Register Instructions.  */
7980b57cec5SDimitry Andric /// Takes the first argument \a X and inserts an element from the second
7990b57cec5SDimitry Andric ///    argument \a Y as selected by the third argument \a N. That result then
8000b57cec5SDimitry Andric ///    has elements zeroed out also as selected by the third argument \a N. The
8010b57cec5SDimitry Andric ///    resulting 128-bit vector of [4 x float] is then returned.
8020b57cec5SDimitry Andric ///
8030b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
8040b57cec5SDimitry Andric ///
8050b57cec5SDimitry Andric /// \code
8060b57cec5SDimitry Andric /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
8070b57cec5SDimitry Andric /// \endcode
8080b57cec5SDimitry Andric ///
8090b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VINSERTPS / INSERTPS </c> instruction.
8100b57cec5SDimitry Andric ///
8110b57cec5SDimitry Andric /// \param X
8120b57cec5SDimitry Andric ///    A 128-bit vector source operand of [4 x float]. With the exception of
8130b57cec5SDimitry Andric ///    those bits in the result copied from parameter \a Y and zeroed by bits
8140b57cec5SDimitry Andric ///    [3:0] of \a N, all bits from this parameter are copied to the result.
8150b57cec5SDimitry Andric /// \param Y
8160b57cec5SDimitry Andric ///    A 128-bit vector source operand of [4 x float]. One single-precision
8170b57cec5SDimitry Andric ///    floating-point element from this source, as determined by the immediate
8180b57cec5SDimitry Andric ///    parameter, is copied to the result.
8190b57cec5SDimitry Andric /// \param N
8200b57cec5SDimitry Andric ///    Specifies which bits from operand \a Y will be copied, which bits in the
821bdd1243dSDimitry Andric ///    result they will be copied to, and which bits in the result will be
8220b57cec5SDimitry Andric ///    cleared. The following assignments are made: \n
8230b57cec5SDimitry Andric ///    Bits [7:6] specify the bits to copy from operand \a Y: \n
8240b57cec5SDimitry Andric ///      00: Selects bits [31:0] from operand \a Y. \n
8250b57cec5SDimitry Andric ///      01: Selects bits [63:32] from operand \a Y. \n
8260b57cec5SDimitry Andric ///      10: Selects bits [95:64] from operand \a Y. \n
8270b57cec5SDimitry Andric ///      11: Selects bits [127:96] from operand \a Y. \n
8280b57cec5SDimitry Andric ///    Bits [5:4] specify the bits in the result to which the selected bits
8290b57cec5SDimitry Andric ///    from operand \a Y are copied: \n
8300b57cec5SDimitry Andric ///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
8310b57cec5SDimitry Andric ///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
8320b57cec5SDimitry Andric ///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
8330b57cec5SDimitry Andric ///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
8340b57cec5SDimitry Andric ///    Bits [3:0]: If any of these bits are set, the corresponding result
8350b57cec5SDimitry Andric ///    element is cleared.
8360b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the copied
8370b57cec5SDimitry Andric ///    single-precision floating point elements from the operands.
/* Macro form: X, Y and N are forwarded to __builtin_ia32_insertps128 as
   written. Unlike the neighbouring macros, no vector or integer casts are
   applied to the arguments or to the result.  */
8380b57cec5SDimitry Andric #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
8390b57cec5SDimitry Andric 
8400b57cec5SDimitry Andric /// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
8410b57cec5SDimitry Andric ///    returns it, using the immediate value parameter \a N as a selector.
8420b57cec5SDimitry Andric ///
8430b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
8440b57cec5SDimitry Andric ///
8450b57cec5SDimitry Andric /// \code
8460b57cec5SDimitry Andric /// int _mm_extract_ps(__m128 X, const int N);
8470b57cec5SDimitry Andric /// \endcode
8480b57cec5SDimitry Andric ///
8490b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
8500b57cec5SDimitry Andric /// instruction.
8510b57cec5SDimitry Andric ///
8520b57cec5SDimitry Andric /// \param X
8530b57cec5SDimitry Andric ///    A 128-bit vector of [4 x float].
8540b57cec5SDimitry Andric /// \param N
8550b57cec5SDimitry Andric ///    An immediate value. Bits [1:0] determine which bits from the argument
8560b57cec5SDimitry Andric ///    \a X are extracted and returned: \n
8570b57cec5SDimitry Andric ///    00: Bits [31:0] of parameter \a X are returned. \n
8580b57cec5SDimitry Andric ///    01: Bits [63:32] of parameter \a X are returned. \n
8590b57cec5SDimitry Andric ///    10: Bits [95:64] of parameter \a X are returned. \n
8600b57cec5SDimitry Andric ///    11: Bits [127:96] of parameter \a X are returned.
8610b57cec5SDimitry Andric /// \returns A 32-bit integer containing the extracted 32 bits of float data.
/* Macro form: the element chosen by N is read from the __v4sf view of X and
   then reinterpreted as int with __builtin_bit_cast, so the result carries
   the bits of the float rather than a numeric conversion of its value.  */
862349cc55cSDimitry Andric #define _mm_extract_ps(X, N)                                                   \
86381ad6265SDimitry Andric   __builtin_bit_cast(                                                          \
86481ad6265SDimitry Andric       int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
8650b57cec5SDimitry Andric 
8660b57cec5SDimitry Andric /* Miscellaneous insert and extract macros.  */
8670b57cec5SDimitry Andric /* Extract the single-precision float at index N of X and assign it to D.
   X is cast through __m128 to __v4sf and N is cast to int. The expansion is a
   do { } while (0) statement, so the macro yields no value and D must be
   something that can appear on the left of an assignment.  */
8680b57cec5SDimitry Andric #define _MM_EXTRACT_FLOAT(D, X, N)                                             \
86981ad6265SDimitry Andric   do {                                                                         \
87081ad6265SDimitry Andric     (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));          \
87181ad6265SDimitry Andric   } while (0)
8720b57cec5SDimitry Andric 
/* Build the immediate operand for _mm_insert_ps from its three fields: X is
   shifted into bits [7:6], Y into bits [5:4], and the zeroing bits Z are
   OR-ed in unshifted.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) ((Z) | ((Y) << 4) | ((X) << 6))
8760b57cec5SDimitry Andric 
8770b57cec5SDimitry Andric /* Extract a float from X at index N into the first index of the return.
   Built on _mm_insert_ps with _mm_setzero_ps() as the base vector: N is the
   source index, the destination index is 0, and the zeroing bits 0x0e select
   result elements 1 to 3 for clearing.  */
87881ad6265SDimitry Andric #define _MM_PICK_OUT_PS(X, N)                                                  \
87981ad6265SDimitry Andric   _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
8800b57cec5SDimitry Andric 
8810b57cec5SDimitry Andric /* Insert int into packed integer array at index.  */
8820b57cec5SDimitry Andric /// Constructs a 128-bit vector of [16 x i8] by first making a copy of
8830b57cec5SDimitry Andric ///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
8840b57cec5SDimitry Andric ///    of an integer parameter \a I into an offset specified by the immediate
8850b57cec5SDimitry Andric ///    value parameter \a N.
8860b57cec5SDimitry Andric ///
8870b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
8880b57cec5SDimitry Andric ///
8890b57cec5SDimitry Andric /// \code
8900b57cec5SDimitry Andric /// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
8910b57cec5SDimitry Andric /// \endcode
8920b57cec5SDimitry Andric ///
8930b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
8940b57cec5SDimitry Andric ///
8950b57cec5SDimitry Andric /// \param X
8960b57cec5SDimitry Andric ///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
8970b57cec5SDimitry Andric ///    result and then one of the sixteen elements in the result vector is
8980b57cec5SDimitry Andric ///    replaced by the lower 8 bits of \a I.
8990b57cec5SDimitry Andric /// \param I
9000b57cec5SDimitry Andric ///    An integer. The lower 8 bits of this operand are written to the result
9010b57cec5SDimitry Andric ///    beginning at the offset specified by \a N.
9020b57cec5SDimitry Andric /// \param N
9030b57cec5SDimitry Andric ///    An immediate value. Bits [3:0] specify the bit offset in the result at
9040b57cec5SDimitry Andric ///    which the lower 8 bits of \a I are written. \n
9050b57cec5SDimitry Andric ///    0000: Bits [7:0] of the result are used for insertion. \n
9060b57cec5SDimitry Andric ///    0001: Bits [15:8] of the result are used for insertion. \n
9070b57cec5SDimitry Andric ///    0010: Bits [23:16] of the result are used for insertion. \n
9080b57cec5SDimitry Andric ///    0011: Bits [31:24] of the result are used for insertion. \n
9090b57cec5SDimitry Andric ///    0100: Bits [39:32] of the result are used for insertion. \n
9100b57cec5SDimitry Andric ///    0101: Bits [47:40] of the result are used for insertion. \n
9110b57cec5SDimitry Andric ///    0110: Bits [55:48] of the result are used for insertion. \n
9120b57cec5SDimitry Andric ///    0111: Bits [63:56] of the result are used for insertion. \n
9130b57cec5SDimitry Andric ///    1000: Bits [71:64] of the result are used for insertion. \n
9140b57cec5SDimitry Andric ///    1001: Bits [79:72] of the result are used for insertion. \n
9150b57cec5SDimitry Andric ///    1010: Bits [87:80] of the result are used for insertion. \n
9160b57cec5SDimitry Andric ///    1011: Bits [95:88] of the result are used for insertion. \n
9170b57cec5SDimitry Andric ///    1100: Bits [103:96] of the result are used for insertion. \n
9180b57cec5SDimitry Andric ///    1101: Bits [111:104] of the result are used for insertion. \n
9190b57cec5SDimitry Andric ///    1110: Bits [119:112] of the result are used for insertion. \n
9200b57cec5SDimitry Andric ///    1111: Bits [127:120] of the result are used for insertion.
9210b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the constructed values.
9220b57cec5SDimitry Andric #define _mm_insert_epi8(X, I, N)                                               \
92381ad6265SDimitry Andric   ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I),      \
92481ad6265SDimitry Andric                                          (int)(N)))
9250b57cec5SDimitry Andric 
9260b57cec5SDimitry Andric /// Constructs a 128-bit vector of [4 x i32] by first making a copy of
9270b57cec5SDimitry Andric ///    the 128-bit integer vector parameter, and then inserting the 32-bit
9280b57cec5SDimitry Andric ///    integer parameter \a I at the offset specified by the immediate value
9290b57cec5SDimitry Andric ///    parameter \a N.
9300b57cec5SDimitry Andric ///
9310b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
9320b57cec5SDimitry Andric ///
9330b57cec5SDimitry Andric /// \code
9340b57cec5SDimitry Andric /// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
9350b57cec5SDimitry Andric /// \endcode
9360b57cec5SDimitry Andric ///
9370b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
9380b57cec5SDimitry Andric ///
9390b57cec5SDimitry Andric /// \param X
9400b57cec5SDimitry Andric ///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
9410b57cec5SDimitry Andric ///    result and then one of the four elements in the result vector is
9420b57cec5SDimitry Andric ///    replaced by \a I.
9430b57cec5SDimitry Andric /// \param I
9440b57cec5SDimitry Andric ///    A 32-bit integer that is written into the element of the result
9450b57cec5SDimitry Andric ///    selected by \a N.
9460b57cec5SDimitry Andric /// \param N
9470b57cec5SDimitry Andric ///    An immediate value. Bits [1:0] select which 32-bit element of the
9480b57cec5SDimitry Andric ///    result is replaced by \a I. \n
9490b57cec5SDimitry Andric ///    00: Bits [31:0] of the result are used for insertion. \n
9500b57cec5SDimitry Andric ///    01: Bits [63:32] of the result are used for insertion. \n
9510b57cec5SDimitry Andric ///    10: Bits [95:64] of the result are used for insertion. \n
9520b57cec5SDimitry Andric ///    11: Bits [127:96] of the result are used for insertion.
9530b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the constructed values.
9540b57cec5SDimitry Andric #define _mm_insert_epi32(X, I, N)                                              \
95581ad6265SDimitry Andric   ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I),        \
95681ad6265SDimitry Andric                                         (int)(N)))
9570b57cec5SDimitry Andric 
9580b57cec5SDimitry Andric #ifdef __x86_64__
9590b57cec5SDimitry Andric /// Constructs a 128-bit vector of [2 x i64] by first making a copy of
9600b57cec5SDimitry Andric ///    the 128-bit integer vector parameter, and then inserting the 64-bit
9610b57cec5SDimitry Andric ///    integer parameter \a I, using the immediate value parameter \a N as an
9620b57cec5SDimitry Andric ///    insertion location selector. Only defined when __x86_64__ is defined.
9630b57cec5SDimitry Andric ///
9640b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
9650b57cec5SDimitry Andric ///
9660b57cec5SDimitry Andric /// \code
9670b57cec5SDimitry Andric /// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
9680b57cec5SDimitry Andric /// \endcode
9690b57cec5SDimitry Andric ///
9700b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
9710b57cec5SDimitry Andric ///
9720b57cec5SDimitry Andric /// \param X
9730b57cec5SDimitry Andric ///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
9740b57cec5SDimitry Andric ///    result and then one of the two elements in the result vector is replaced
9750b57cec5SDimitry Andric ///    by \a I.
9760b57cec5SDimitry Andric /// \param I
9770b57cec5SDimitry Andric ///    A 64-bit integer that is written into the element of the result
9780b57cec5SDimitry Andric ///    selected by \a N.
9790b57cec5SDimitry Andric /// \param N
9800b57cec5SDimitry Andric ///    An immediate value. Bit [0] selects which 64-bit element of the
9810b57cec5SDimitry Andric ///    result is replaced by \a I. \n
9820b57cec5SDimitry Andric ///    0: Bits [63:0] of the result are used for insertion. \n
9830b57cec5SDimitry Andric ///    1: Bits [127:64] of the result are used for insertion.
9840b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the constructed values.
9850b57cec5SDimitry Andric #define _mm_insert_epi64(X, I, N)                                              \
98681ad6265SDimitry Andric   ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I),  \
98781ad6265SDimitry Andric                                         (int)(N)))
9880b57cec5SDimitry Andric #endif /* __x86_64__ */
9890b57cec5SDimitry Andric 
9900b57cec5SDimitry Andric /* Extract int from packed integer array at index.  This returns the element
9910b57cec5SDimitry Andric  * as a zero extended value, so it is unsigned.
9920b57cec5SDimitry Andric  */
9930b57cec5SDimitry Andric /// Extracts an 8-bit element from the 128-bit integer vector of
9940b57cec5SDimitry Andric ///    [16 x i8], using the immediate value parameter \a N as a selector.
9950b57cec5SDimitry Andric ///
9960b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
9970b57cec5SDimitry Andric ///
9980b57cec5SDimitry Andric /// \code
9990b57cec5SDimitry Andric /// int _mm_extract_epi8(__m128i X, const int N);
10000b57cec5SDimitry Andric /// \endcode
10010b57cec5SDimitry Andric ///
10020b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
10030b57cec5SDimitry Andric ///
10040b57cec5SDimitry Andric /// \param X
10050b57cec5SDimitry Andric ///    A 128-bit integer vector.
10060b57cec5SDimitry Andric /// \param N
10070b57cec5SDimitry Andric ///    An immediate value. Bits [3:0] specify which 8-bit vector element from
10080b57cec5SDimitry Andric ///    the argument \a X to extract and copy to the result. \n
10090b57cec5SDimitry Andric ///    0000: Bits [7:0] of the parameter \a X are extracted. \n
10100b57cec5SDimitry Andric ///    0001: Bits [15:8] of the parameter \a X are extracted. \n
10110b57cec5SDimitry Andric ///    0010: Bits [23:16] of the parameter \a X are extracted. \n
10120b57cec5SDimitry Andric ///    0011: Bits [31:24] of the parameter \a X are extracted. \n
10130b57cec5SDimitry Andric ///    0100: Bits [39:32] of the parameter \a X are extracted. \n
10140b57cec5SDimitry Andric ///    0101: Bits [47:40] of the parameter \a X are extracted. \n
10150b57cec5SDimitry Andric ///    0110: Bits [55:48] of the parameter \a X are extracted. \n
10160b57cec5SDimitry Andric ///    0111: Bits [63:56] of the parameter \a X are extracted. \n
10170b57cec5SDimitry Andric ///    1000: Bits [71:64] of the parameter \a X are extracted. \n
10180b57cec5SDimitry Andric ///    1001: Bits [79:72] of the parameter \a X are extracted. \n
10190b57cec5SDimitry Andric ///    1010: Bits [87:80] of the parameter \a X are extracted. \n
10200b57cec5SDimitry Andric ///    1011: Bits [95:88] of the parameter \a X are extracted. \n
10210b57cec5SDimitry Andric ///    1100: Bits [103:96] of the parameter \a X are extracted. \n
10220b57cec5SDimitry Andric ///    1101: Bits [111:104] of the parameter \a X are extracted. \n
10230b57cec5SDimitry Andric ///    1110: Bits [119:112] of the parameter \a X are extracted. \n
10240b57cec5SDimitry Andric ///    1111: Bits [127:120] of the parameter \a X are extracted.
10250b57cec5SDimitry Andric /// \returns  An \c int whose lower 8 bits are the element selected from the
10260b57cec5SDimitry Andric ///    128-bit integer vector parameter and whose remaining bits are zero
10270b57cec5SDimitry Andric ///    (the element is zero-extended, not sign-extended).
10280b57cec5SDimitry Andric #define _mm_extract_epi8(X, N)                                                 \
1029349cc55cSDimitry Andric   ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X),     \
1030349cc55cSDimitry Andric                                                     (int)(N)))
10310b57cec5SDimitry Andric 
10320b57cec5SDimitry Andric /// Extracts a 32-bit element from the 128-bit integer vector of
10330b57cec5SDimitry Andric ///    [4 x i32], using the immediate value parameter \a N as a selector.
10340b57cec5SDimitry Andric ///
10350b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
10360b57cec5SDimitry Andric ///
10370b57cec5SDimitry Andric /// \code
10380b57cec5SDimitry Andric /// int _mm_extract_epi32(__m128i X, const int N);
10390b57cec5SDimitry Andric /// \endcode
10400b57cec5SDimitry Andric ///
10410b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
10420b57cec5SDimitry Andric ///
10430b57cec5SDimitry Andric /// \param X
10440b57cec5SDimitry Andric ///    A 128-bit integer vector.
10450b57cec5SDimitry Andric /// \param N
10460b57cec5SDimitry Andric ///    An immediate value. Bits [1:0] specify which 32-bit vector element from
10470b57cec5SDimitry Andric ///    the argument \a X to extract and copy to the result. \n
10480b57cec5SDimitry Andric ///    00: Bits [31:0] of the parameter \a X are extracted. \n
10490b57cec5SDimitry Andric ///    01: Bits [63:32] of the parameter \a X are extracted. \n
10500b57cec5SDimitry Andric ///    10: Bits [95:64] of the parameter \a X are extracted. \n
10510b57cec5SDimitry Andric ///    11: Bits [127:96] of the parameter \a X are extracted.
10520b57cec5SDimitry Andric /// \returns  An \c int holding the 32-bit element selected from the 128-bit
10530b57cec5SDimitry Andric ///    integer vector parameter.
10540b57cec5SDimitry Andric #define _mm_extract_epi32(X, N)                                                \
1055349cc55cSDimitry Andric   ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
10560b57cec5SDimitry Andric 
10570b57cec5SDimitry Andric /// Extracts a 64-bit element from the 128-bit integer vector of
10580b57cec5SDimitry Andric ///    [2 x i64], using the immediate value parameter \a N as a selector.
10590b57cec5SDimitry Andric ///
10600b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
10610b57cec5SDimitry Andric ///
10620b57cec5SDimitry Andric /// \code
10630b57cec5SDimitry Andric /// long long _mm_extract_epi64(__m128i X, const int N);
10640b57cec5SDimitry Andric /// \endcode
10650b57cec5SDimitry Andric ///
106681ad6265SDimitry Andric /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
106781ad6265SDimitry Andric /// in 64-bit mode.
10680b57cec5SDimitry Andric ///
10690b57cec5SDimitry Andric /// \param X
10700b57cec5SDimitry Andric ///    A 128-bit integer vector.
10710b57cec5SDimitry Andric /// \param N
10720b57cec5SDimitry Andric ///    An immediate value. Bit [0] specifies which 64-bit vector element from
10730b57cec5SDimitry Andric ///    the argument \a X to return. \n
10740b57cec5SDimitry Andric ///    0: Bits [63:0] are returned. \n
10750b57cec5SDimitry Andric ///    1: Bits [127:64] are returned.
10760b57cec5SDimitry Andric /// \returns  The selected 64-bit element of \a X, as a \c long long.
10770b57cec5SDimitry Andric #define _mm_extract_epi64(X, N)                                                \
1078349cc55cSDimitry Andric   ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
10790b57cec5SDimitry Andric 
10800b57cec5SDimitry Andric /* SSE4 128-bit Packed Integer Comparisons.  */
10810b57cec5SDimitry Andric /// Tests whether the bits of a 128-bit integer vector that are selected by
10820b57cec5SDimitry Andric ///    a second 128-bit integer vector are all zeros.
10830b57cec5SDimitry Andric ///
10840b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
10850b57cec5SDimitry Andric ///
10860b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
10870b57cec5SDimitry Andric ///
10880b57cec5SDimitry Andric /// \param __M
10890b57cec5SDimitry Andric ///    A 128-bit integer vector containing the bits to be tested.
10900b57cec5SDimitry Andric /// \param __V
10910b57cec5SDimitry Andric ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
10920b57cec5SDimitry Andric /// \returns TRUE if the selected bits of \a __M are all zeros; FALSE otherwise.
109381ad6265SDimitry Andric static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
109481ad6265SDimitry Andric                                                          __m128i __V) {
10950b57cec5SDimitry Andric   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
10960b57cec5SDimitry Andric }
10970b57cec5SDimitry Andric 
10980b57cec5SDimitry Andric /// Tests whether the bits of a 128-bit integer vector that are selected by
10990b57cec5SDimitry Andric ///    a second 128-bit integer vector are all ones.
11000b57cec5SDimitry Andric ///
11010b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
11020b57cec5SDimitry Andric ///
11030b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
11040b57cec5SDimitry Andric ///
11050b57cec5SDimitry Andric /// \param __M
11060b57cec5SDimitry Andric ///    A 128-bit integer vector containing the bits to be tested.
11070b57cec5SDimitry Andric /// \param __V
11080b57cec5SDimitry Andric ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
11090b57cec5SDimitry Andric /// \returns TRUE if the selected bits of \a __M are all ones; FALSE otherwise.
111081ad6265SDimitry Andric static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
111181ad6265SDimitry Andric                                                          __m128i __V) {
11120b57cec5SDimitry Andric   return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
11130b57cec5SDimitry Andric }
11140b57cec5SDimitry Andric 
11150b57cec5SDimitry Andric /// Tests whether the bits of a 128-bit integer vector that are selected by
11160b57cec5SDimitry Andric ///    a second 128-bit integer vector are neither all zeros nor all ones.
11170b57cec5SDimitry Andric ///
11180b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
11190b57cec5SDimitry Andric ///
11200b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
11210b57cec5SDimitry Andric ///
11220b57cec5SDimitry Andric /// \param __M
11230b57cec5SDimitry Andric ///    A 128-bit integer vector containing the bits to be tested.
11240b57cec5SDimitry Andric /// \param __V
11250b57cec5SDimitry Andric ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
11260b57cec5SDimitry Andric /// \returns TRUE if the selected bits of \a __M are neither all zeros nor all
11270b57cec5SDimitry Andric ///    ones; FALSE otherwise.
112881ad6265SDimitry Andric static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
112981ad6265SDimitry Andric                                                            __m128i __V) {
11300b57cec5SDimitry Andric   return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
11310b57cec5SDimitry Andric }
11320b57cec5SDimitry Andric 
11330b57cec5SDimitry Andric /// Tests whether all bits in a 128-bit integer vector are ones. Expands to
11340b57cec5SDimitry Andric ///    _mm_testc_si128() with _mm_set1_epi32(-1) as the bit selector.
11350b57cec5SDimitry Andric ///
11360b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
11370b57cec5SDimitry Andric ///
11380b57cec5SDimitry Andric /// \code
11390b57cec5SDimitry Andric /// int _mm_test_all_ones(__m128i V);
11400b57cec5SDimitry Andric /// \endcode
11410b57cec5SDimitry Andric ///
11420b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
11430b57cec5SDimitry Andric ///
11440b57cec5SDimitry Andric /// \param V
11450b57cec5SDimitry Andric ///    A 128-bit integer vector containing the bits to be tested.
11460b57cec5SDimitry Andric /// \returns TRUE if every bit of \a V is set to 1; FALSE
11470b57cec5SDimitry Andric ///    otherwise.
11481ac55f4cSDimitry Andric #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
11490b57cec5SDimitry Andric 
11500b57cec5SDimitry Andric /// Tests whether the specified bits in a 128-bit integer vector are
11510b57cec5SDimitry Andric ///    neither all zeros nor all ones. Alias for _mm_testnzc_si128().
11520b57cec5SDimitry Andric ///
11530b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
11540b57cec5SDimitry Andric ///
11550b57cec5SDimitry Andric /// \code
11560b57cec5SDimitry Andric /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
11570b57cec5SDimitry Andric /// \endcode
11580b57cec5SDimitry Andric ///
11590b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
11600b57cec5SDimitry Andric ///
11610b57cec5SDimitry Andric /// \param M
11620b57cec5SDimitry Andric ///    A 128-bit integer vector containing the bits to be tested.
11630b57cec5SDimitry Andric /// \param V
11640b57cec5SDimitry Andric ///    A 128-bit integer vector selecting which bits to test in operand \a M.
11650b57cec5SDimitry Andric /// \returns TRUE if the specified bits are neither all zeros nor all ones;
11660b57cec5SDimitry Andric ///    FALSE otherwise.
11670b57cec5SDimitry Andric #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
11680b57cec5SDimitry Andric 
11690b57cec5SDimitry Andric /// Tests whether the specified bits in a 128-bit integer vector are all
11700b57cec5SDimitry Andric ///    zeros. Alias for _mm_testz_si128().
11710b57cec5SDimitry Andric ///
11720b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
11730b57cec5SDimitry Andric ///
11740b57cec5SDimitry Andric /// \code
11750b57cec5SDimitry Andric /// int _mm_test_all_zeros(__m128i M, __m128i V);
11760b57cec5SDimitry Andric /// \endcode
11770b57cec5SDimitry Andric ///
11780b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
11790b57cec5SDimitry Andric ///
11800b57cec5SDimitry Andric /// \param M
11810b57cec5SDimitry Andric ///    A 128-bit integer vector containing the bits to be tested.
11820b57cec5SDimitry Andric /// \param V
11830b57cec5SDimitry Andric ///    A 128-bit integer vector selecting which bits to test in operand \a M.
11840b57cec5SDimitry Andric /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
11850b57cec5SDimitry Andric #define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
11860b57cec5SDimitry Andric 
11870b57cec5SDimitry Andric /* SSE4 64-bit Packed Integer Comparisons.  */
11880b57cec5SDimitry Andric /// Compares each of the corresponding 64-bit values of the 128-bit
11890b57cec5SDimitry Andric ///    integer vectors for equality.
11900b57cec5SDimitry Andric ///
1191*0fca6ea1SDimitry Andric ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1192*0fca6ea1SDimitry Andric ///
11930b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
11940b57cec5SDimitry Andric ///
11950b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
11960b57cec5SDimitry Andric ///
11970b57cec5SDimitry Andric /// \param __V1
11980b57cec5SDimitry Andric ///    A 128-bit integer vector of [2 x i64].
11990b57cec5SDimitry Andric /// \param __V2
12000b57cec5SDimitry Andric ///    A 128-bit integer vector of [2 x i64].
12010b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the comparison results.
120281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
120381ad6265SDimitry Andric                                                              __m128i __V2) {
12040b57cec5SDimitry Andric   return (__m128i)((__v2di)__V1 == (__v2di)__V2);
12050b57cec5SDimitry Andric }
12060b57cec5SDimitry Andric 
12070b57cec5SDimitry Andric /* SSE4 Packed Integer Sign-Extension.  */
12080b57cec5SDimitry Andric /// Sign-extends each of the lower eight 8-bit integer elements of a
12090b57cec5SDimitry Andric ///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
12100b57cec5SDimitry Andric ///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
12110b57cec5SDimitry Andric ///    are unused.
12120b57cec5SDimitry Andric ///
12130b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
12140b57cec5SDimitry Andric ///
12150b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
12160b57cec5SDimitry Andric ///
12170b57cec5SDimitry Andric /// \param __V
121881ad6265SDimitry Andric ///    A 128-bit vector of [16 x i8]. Elements 0 through 7 (the lower eight)
121981ad6265SDimitry Andric ///    are sign-extended to 16-bit values, preserving element order.
12200b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
122181ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
12220b57cec5SDimitry Andric   /* __v16qi has plain char elements, and char may be signed or unsigned, so
12230b57cec5SDimitry Andric      __V is viewed as __v16qs to guarantee sign extension of elements 0-7. */
122481ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
122581ad6265SDimitry Andric       __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
122681ad6265SDimitry Andric                               7),
122781ad6265SDimitry Andric       __v8hi);
12280b57cec5SDimitry Andric }
12290b57cec5SDimitry Andric 
12300b57cec5SDimitry Andric /// Sign-extends each of the lower four 8-bit integer elements of a
12310b57cec5SDimitry Andric ///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
12320b57cec5SDimitry Andric ///    128-bit vector of [4 x i32]. The upper twelve elements of the input
12330b57cec5SDimitry Andric ///    vector are unused.
12340b57cec5SDimitry Andric ///
12350b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
12360b57cec5SDimitry Andric ///
12370b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
12380b57cec5SDimitry Andric ///
12390b57cec5SDimitry Andric /// \param __V
12400b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8]. Elements 0 through 3 (the lower four)
12410b57cec5SDimitry Andric ///    are sign-extended to 32-bit values, preserving element order.
12420b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
124381ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
12440b57cec5SDimitry Andric   /* __v16qi has plain char elements, and char may be signed or unsigned, so
12450b57cec5SDimitry Andric      __V is viewed as __v16qs to guarantee sign extension of elements 0-3. */
124681ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
124781ad6265SDimitry Andric       __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
12480b57cec5SDimitry Andric }
12490b57cec5SDimitry Andric 
12500b57cec5SDimitry Andric /// Sign-extends each of the lower two 8-bit integer elements of a
12510b57cec5SDimitry Andric ///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
12520b57cec5SDimitry Andric ///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
12530b57cec5SDimitry Andric ///    vector are unused.
12540b57cec5SDimitry Andric ///
12550b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
12560b57cec5SDimitry Andric ///
12570b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
12580b57cec5SDimitry Andric ///
12590b57cec5SDimitry Andric /// \param __V
12600b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8]. Elements 0 and 1 (the lower two) are
12610b57cec5SDimitry Andric ///    sign-extended to 64-bit values, preserving element order.
12620b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
126381ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
12640b57cec5SDimitry Andric   /* __v16qi has plain char elements, and char may be signed or unsigned, so
12650b57cec5SDimitry Andric      __V is viewed as __v16qs to guarantee sign extension of elements 0-1. */
126681ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
126781ad6265SDimitry Andric       __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
12680b57cec5SDimitry Andric }
12690b57cec5SDimitry Andric 
12700b57cec5SDimitry Andric /// Sign-extends each of the lower four 16-bit integer elements of a
12710b57cec5SDimitry Andric ///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
12720b57cec5SDimitry Andric ///    a 128-bit vector of [4 x i32]. The upper four elements of the input
12730b57cec5SDimitry Andric ///    vector are unused.
12740b57cec5SDimitry Andric ///
12750b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
12760b57cec5SDimitry Andric ///
12770b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
12780b57cec5SDimitry Andric ///
12790b57cec5SDimitry Andric /// \param __V
12800b57cec5SDimitry Andric ///    A 128-bit vector of [8 x i16]. Elements 0 through 3 (the lower four)
12810b57cec5SDimitry Andric ///    are sign-extended to 32-bit values, preserving element order.
12820b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
128381ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
128481ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
128581ad6265SDimitry Andric       __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
12860b57cec5SDimitry Andric }
12870b57cec5SDimitry Andric 
12880b57cec5SDimitry Andric /// Sign-extends each of the lower two 16-bit integer elements of a
12890b57cec5SDimitry Andric ///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
12900b57cec5SDimitry Andric ///    a 128-bit vector of [2 x i64]. The upper six elements of the input
12910b57cec5SDimitry Andric ///    vector are unused.
12920b57cec5SDimitry Andric ///
12930b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
12940b57cec5SDimitry Andric ///
12950b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
12960b57cec5SDimitry Andric ///
12970b57cec5SDimitry Andric /// \param __V
12980b57cec5SDimitry Andric ///    A 128-bit vector of [8 x i16]. Elements 0 and 1 (the lower two) are
12990b57cec5SDimitry Andric ///    sign-extended to 64-bit values, preserving element order.
13000b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
130181ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
130281ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
130381ad6265SDimitry Andric       __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
13040b57cec5SDimitry Andric }
13050b57cec5SDimitry Andric 
13060b57cec5SDimitry Andric /// Sign-extends each of the lower two 32-bit integer elements of a
13070b57cec5SDimitry Andric ///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
13080b57cec5SDimitry Andric ///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
13090b57cec5SDimitry Andric ///    are unused.
13100b57cec5SDimitry Andric ///
13110b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
13120b57cec5SDimitry Andric ///
13130b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
13140b57cec5SDimitry Andric ///
13150b57cec5SDimitry Andric /// \param __V
13160b57cec5SDimitry Andric ///    A 128-bit vector of [4 x i32]. Elements 0 and 1 (the lower two) are
13170b57cec5SDimitry Andric ///    sign-extended to 64-bit values, preserving element order.
13180b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
131981ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
132081ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
132181ad6265SDimitry Andric       __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
13220b57cec5SDimitry Andric }
13230b57cec5SDimitry Andric 
13240b57cec5SDimitry Andric /* SSE4 Packed Integer Zero-Extension.  */
13250b57cec5SDimitry Andric /// Zero-extends each of the lower eight 8-bit integer elements of a
13260b57cec5SDimitry Andric ///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
13270b57cec5SDimitry Andric ///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
13280b57cec5SDimitry Andric ///    are unused.
13290b57cec5SDimitry Andric ///
13300b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
13310b57cec5SDimitry Andric ///
13320b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
13330b57cec5SDimitry Andric ///
13340b57cec5SDimitry Andric /// \param __V
13350b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8]. Elements 0 through 7 (the lower eight)
13360b57cec5SDimitry Andric ///    are zero-extended to 16-bit values, preserving element order.
13370b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
133881ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
133981ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
134081ad6265SDimitry Andric       __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
134181ad6265SDimitry Andric                               7),
134281ad6265SDimitry Andric       __v8hi);
13430b57cec5SDimitry Andric }
13440b57cec5SDimitry Andric 
13450b57cec5SDimitry Andric /// Zero-extends each of the lower four 8-bit integer elements of a
13460b57cec5SDimitry Andric ///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
13470b57cec5SDimitry Andric ///    128-bit vector of [4 x i32]. The upper twelve elements of the input
13480b57cec5SDimitry Andric ///    vector are unused.
13490b57cec5SDimitry Andric ///
13500b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
13510b57cec5SDimitry Andric ///
13520b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
13530b57cec5SDimitry Andric ///
13540b57cec5SDimitry Andric /// \param __V
13550b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8]. Elements 0 through 3 (the lower four)
13560b57cec5SDimitry Andric ///    are zero-extended to 32-bit values, preserving element order.
13570b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
135881ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
135981ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
136081ad6265SDimitry Andric       __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
13610b57cec5SDimitry Andric }
13620b57cec5SDimitry Andric 
13630b57cec5SDimitry Andric /// Zero-extends each of the lower two 8-bit integer elements of a
13640b57cec5SDimitry Andric ///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
13650b57cec5SDimitry Andric ///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
13660b57cec5SDimitry Andric ///    vector are unused.
13670b57cec5SDimitry Andric ///
13680b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
13690b57cec5SDimitry Andric ///
13700b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
13710b57cec5SDimitry Andric ///
13720b57cec5SDimitry Andric /// \param __V
13730b57cec5SDimitry Andric ///    A 128-bit vector of [16 x i8]. Elements 0 and 1 (the lower two) are
13740b57cec5SDimitry Andric ///    zero-extended to 64-bit values, preserving element order.
13750b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
137681ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
137781ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
137881ad6265SDimitry Andric       __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
13790b57cec5SDimitry Andric }
13800b57cec5SDimitry Andric 
13810b57cec5SDimitry Andric /// Zero-extends each of the lower four 16-bit integer elements of a
13820b57cec5SDimitry Andric ///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
13830b57cec5SDimitry Andric ///    a 128-bit vector of [4 x i32]. The upper four elements of the input
13840b57cec5SDimitry Andric ///    vector are unused.
13850b57cec5SDimitry Andric ///
13860b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
13870b57cec5SDimitry Andric ///
13880b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
13890b57cec5SDimitry Andric ///
13900b57cec5SDimitry Andric /// \param __V
13910b57cec5SDimitry Andric ///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
13920b57cec5SDimitry Andric ///    zero-extended to 32-bit values.
13930b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
139481ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
139581ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
139681ad6265SDimitry Andric       __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
13970b57cec5SDimitry Andric }
13980b57cec5SDimitry Andric 
13990b57cec5SDimitry Andric /// Zero-extends each of the lower two 16-bit integer elements of a
14000b57cec5SDimitry Andric ///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
14010b57cec5SDimitry Andric ///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
14020b57cec5SDimitry Andric ///    are unused.
14030b57cec5SDimitry Andric ///
14040b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
14050b57cec5SDimitry Andric ///
14060b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
14070b57cec5SDimitry Andric ///
14080b57cec5SDimitry Andric /// \param __V
14090b57cec5SDimitry Andric ///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
14100b57cec5SDimitry Andric ///    zero-extended to 64-bit values.
14110b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
141281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
141381ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
141481ad6265SDimitry Andric       __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
14150b57cec5SDimitry Andric }
14160b57cec5SDimitry Andric 
14170b57cec5SDimitry Andric /// Zero-extends each of the lower two 32-bit integer elements of a
14180b57cec5SDimitry Andric ///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
14190b57cec5SDimitry Andric ///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
14200b57cec5SDimitry Andric ///    are unused.
14210b57cec5SDimitry Andric ///
14220b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
14230b57cec5SDimitry Andric ///
14240b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
14250b57cec5SDimitry Andric ///
14260b57cec5SDimitry Andric /// \param __V
14270b57cec5SDimitry Andric ///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
14280b57cec5SDimitry Andric ///    zero-extended to 64-bit values.
14290b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
143081ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
143181ad6265SDimitry Andric   return (__m128i) __builtin_convertvector(
143281ad6265SDimitry Andric       __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
14330b57cec5SDimitry Andric }
14340b57cec5SDimitry Andric 
14350b57cec5SDimitry Andric /* SSE4 Pack with Unsigned Saturation.  */
1436*0fca6ea1SDimitry Andric /// Converts, with saturation, 32-bit signed integers from both 128-bit integer
1437*0fca6ea1SDimitry Andric ///    vector operands into 16-bit unsigned integers, and returns the packed
1438*0fca6ea1SDimitry Andric ///    result.
1439*0fca6ea1SDimitry Andric ///
14400b57cec5SDimitry Andric ///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
14410b57cec5SDimitry Andric ///    0x0000 are saturated to 0x0000.
14420b57cec5SDimitry Andric ///
14430b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
14440b57cec5SDimitry Andric ///
14450b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
14460b57cec5SDimitry Andric ///
14470b57cec5SDimitry Andric /// \param __V1
1448*0fca6ea1SDimitry Andric ///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1449*0fca6ea1SDimitry Andric ///    written to the lower 64 bits of the result.
14500b57cec5SDimitry Andric /// \param __V2
1451*0fca6ea1SDimitry Andric ///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1452*0fca6ea1SDimitry Andric ///    written to the higher 64 bits of the result.
14530b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the converted values.
145481ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
145581ad6265SDimitry Andric                                                               __m128i __V2) {
14560b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
14570b57cec5SDimitry Andric }
14580b57cec5SDimitry Andric 
/* SSE4 Multiple Packed Sums of Absolute Difference.  */
/// Computes eight sums of absolute differences between groups of four
///    unsigned 8-bit integers taken from the two operands. The bit fields of
///    the immediate operand select the byte offset at which the groups start
///    in each operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand specifying how the absolute differences are to
///    be calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M)                                              \
  ((__m128i)__builtin_ia32_mpsadbw128(                                         \
      (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (M)))
14980b57cec5SDimitry Andric 
14990b57cec5SDimitry Andric /// Finds the minimum unsigned 16-bit element in the input 128-bit
15000b57cec5SDimitry Andric ///    vector of [8 x u16] and returns it and along with its index.
15010b57cec5SDimitry Andric ///
15020b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
15030b57cec5SDimitry Andric ///
15040b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
15050b57cec5SDimitry Andric /// instruction.
15060b57cec5SDimitry Andric ///
15070b57cec5SDimitry Andric /// \param __V
15080b57cec5SDimitry Andric ///    A 128-bit vector of [8 x u16].
15090b57cec5SDimitry Andric /// \returns A 128-bit value where bits [15:0] contain the minimum value found
15100b57cec5SDimitry Andric ///    in parameter \a __V, bits [18:16] contain the index of the minimum value
15110b57cec5SDimitry Andric ///    and the remaining bits are set to 0.
151281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
15130b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
15140b57cec5SDimitry Andric }
15150b57cec5SDimitry Andric 
/* Handle the sse4.2 definitions here. */

/* These definitions are normally in nmmintrin.h, but gcc puts them in here
   so we'll do the same.  */

/* Switch the default function attributes to the SSE4.2 target for the
   definitions that follow.  */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))

/* The _SIDD_* values below are OR-ed together to build the immediate operand
   of the _mm_cmpXstrY() string comparison intrinsics.  */

/* Immediate bits [1:0]: the type of data that we're comparing.  */
#define _SIDD_UBYTE_OPS 0x00
#define _SIDD_UWORD_OPS 0x01
#define _SIDD_SBYTE_OPS 0x02
#define _SIDD_SWORD_OPS 0x03

/* Immediate bits [3:2]: the type of comparison operation.  */
#define _SIDD_CMP_EQUAL_ANY 0x00
#define _SIDD_CMP_RANGES 0x04
#define _SIDD_CMP_EQUAL_EACH 0x08
#define _SIDD_CMP_EQUAL_ORDERED 0x0c

/* Immediate bits [5:4]: the polarity of the operation.  */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_NEGATIVE_POLARITY 0x10
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30

/* Immediate bit [6] as used in _mm_cmpXstri(): selects whether the index of
   the least or the most significant set bit is returned.  */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40

/* Immediate bit [6] as used in _mm_cmpXstrm(): selects whether the returned
   mask is zero-extended (bit mask) or expanded to 16 bytes (unit mask).  */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
15500b57cec5SDimitry Andric 
/* SSE4.2 Packed Comparison Intrinsics.  */
/// Compares the string data with implicitly defined lengths contained in
///    source operands \a A and \a B, in the manner selected by the immediate
///    operand \a M. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand that selects whether the characters are bytes
///    or words, the type of comparison to perform, and the format of the
///    return value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns A 128-bit integer vector representing the result mask of the
///    comparison.
#define _mm_cmpistrm(A, B, M)                                                  \
  ((__m128i)__builtin_ia32_pcmpistrm128(                                       \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
16070b57cec5SDimitry Andric 
/// Compares the string data with implicitly defined lengths contained in
///    source operands \a A and \a B, in the manner selected by the immediate
///    operand \a M. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand that selects whether the characters are bytes
///    or words, the type of comparison to perform, and the format of the
///    return value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit.
/// \returns An integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistri128(                                           \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
16610b57cec5SDimitry Andric 
/// Compares the string data with explicitly defined lengths contained in
///    source operands \a A and \a B, in the manner selected by the immediate
///    operand \a M. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand that selects whether the characters are bytes
///    or words, the type of comparison to perform, and the format of the
///    return value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns A 128-bit integer vector representing the result mask of the
///    comparison.
#define _mm_cmpestrm(A, LA, B, LB, M)                                          \
  ((__m128i)__builtin_ia32_pcmpestrm128(                                       \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
17220b57cec5SDimitry Andric 
/// Compares the string data with explicitly defined lengths contained in
///    source operands \a A and \a B, in the manner selected by the immediate
///    operand \a M. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand that selects whether the characters are bytes
///    or words, the type of comparison to perform, and the format of the
///    return value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit.
/// \returns An integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestri128(                                           \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
17810b57cec5SDimitry Andric 
17820b57cec5SDimitry Andric /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
17830b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string
17840b57cec5SDimitry Andric ///    data with implicitly defined lengths that is contained in source operands
17850b57cec5SDimitry Andric ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
17860b57cec5SDimitry Andric ///    string in \a B is the maximum, otherwise, returns 0.
17870b57cec5SDimitry Andric ///
17880b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
17890b57cec5SDimitry Andric ///
17900b57cec5SDimitry Andric /// \code
17910b57cec5SDimitry Andric /// int _mm_cmpistra(__m128i A, __m128i B, const int M);
17920b57cec5SDimitry Andric /// \endcode
17930b57cec5SDimitry Andric ///
17940b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
17950b57cec5SDimitry Andric /// instruction.
17960b57cec5SDimitry Andric ///
17970b57cec5SDimitry Andric /// \param A
17980b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
17990b57cec5SDimitry Andric ///    compared.
18000b57cec5SDimitry Andric /// \param B
18010b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
18020b57cec5SDimitry Andric ///    compared.
18030b57cec5SDimitry Andric /// \param M
18040b57cec5SDimitry Andric ///    An 8-bit immediate operand specifying whether the characters are bytes or
18050b57cec5SDimitry Andric ///    words and the type of comparison to perform. \n
18060b57cec5SDimitry Andric ///    Bits [1:0]: Determine source data format. \n
18070b57cec5SDimitry Andric ///      00: 16 unsigned bytes \n
18080b57cec5SDimitry Andric ///      01: 8 unsigned words \n
18090b57cec5SDimitry Andric ///      10: 16 signed bytes \n
18100b57cec5SDimitry Andric ///      11: 8 signed words \n
18110b57cec5SDimitry Andric ///    Bits [3:2]: Determine comparison type and aggregation method. \n
18120b57cec5SDimitry Andric ///      00: Subset: Each character in \a B is compared for equality with all
18130b57cec5SDimitry Andric ///          the characters in \a A. \n
18140b57cec5SDimitry Andric ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
18150b57cec5SDimitry Andric ///          basis is greater than or equal for even-indexed elements in \a A,
18160b57cec5SDimitry Andric ///          and less than or equal for odd-indexed elements in \a A. \n
18170b57cec5SDimitry Andric ///      10: Match: Compare each pair of corresponding characters in \a A and
18180b57cec5SDimitry Andric ///          \a B for equality. \n
18190b57cec5SDimitry Andric ///      11: Substring: Search \a B for substring matches of \a A. \n
18200b57cec5SDimitry Andric ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
18210b57cec5SDimitry Andric ///                mask of the comparison results. \n
18220b57cec5SDimitry Andric ///      00: No effect. \n
18230b57cec5SDimitry Andric ///      01: Negate the bit mask. \n
18240b57cec5SDimitry Andric ///      10: No effect. \n
18250b57cec5SDimitry Andric ///      11: Negate the bit mask only for bits with an index less than or equal
18260b57cec5SDimitry Andric ///          to the size of \a A or \a B. \n
18270b57cec5SDimitry Andric /// \returns Returns 1 if the bit mask is zero and the length of the string in
18280b57cec5SDimitry Andric ///    \a B is the maximum; otherwise, returns 0.
/* Wraps __builtin_ia32_pcmpistria128: A and B are cast via __m128i to __v16qi,
 * M and the result are cast to int. */
#define _mm_cmpistra(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistria128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
18320b57cec5SDimitry Andric 
18330b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string
18340b57cec5SDimitry Andric ///    data with implicitly defined lengths that is contained in source operands
18350b57cec5SDimitry Andric ///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
18360b57cec5SDimitry Andric ///    0.
18370b57cec5SDimitry Andric ///
18380b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
18390b57cec5SDimitry Andric ///
18400b57cec5SDimitry Andric /// \code
18410b57cec5SDimitry Andric /// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
18420b57cec5SDimitry Andric /// \endcode
18430b57cec5SDimitry Andric ///
18440b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
18450b57cec5SDimitry Andric /// instruction.
18460b57cec5SDimitry Andric ///
18470b57cec5SDimitry Andric /// \param A
18480b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
18490b57cec5SDimitry Andric ///    compared.
18500b57cec5SDimitry Andric /// \param B
18510b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
18520b57cec5SDimitry Andric ///    compared.
18530b57cec5SDimitry Andric /// \param M
18540b57cec5SDimitry Andric ///    An 8-bit immediate operand specifying whether the characters are bytes or
18550b57cec5SDimitry Andric ///    words and the type of comparison to perform. \n
18560b57cec5SDimitry Andric ///    Bits [1:0]: Determine source data format. \n
18570b57cec5SDimitry Andric ///      00: 16 unsigned bytes \n
18580b57cec5SDimitry Andric ///      01: 8 unsigned words \n
18590b57cec5SDimitry Andric ///      10: 16 signed bytes \n
18600b57cec5SDimitry Andric ///      11: 8 signed words \n
18610b57cec5SDimitry Andric ///    Bits [3:2]: Determine comparison type and aggregation method. \n
18620b57cec5SDimitry Andric ///      00: Subset: Each character in \a B is compared for equality with all
18630b57cec5SDimitry Andric ///          the characters in \a A. \n
18640b57cec5SDimitry Andric ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
18650b57cec5SDimitry Andric ///          basis is greater than or equal for even-indexed elements in \a A,
18660b57cec5SDimitry Andric ///          and less than or equal for odd-indexed elements in \a A. \n
18670b57cec5SDimitry Andric ///      10: Match: Compare each pair of corresponding characters in \a A and
18680b57cec5SDimitry Andric ///          \a B for equality. \n
18690b57cec5SDimitry Andric ///      11: Substring: Search \a B for substring matches of \a A. \n
18700b57cec5SDimitry Andric ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
18710b57cec5SDimitry Andric ///                mask of the comparison results. \n
18720b57cec5SDimitry Andric ///      00: No effect. \n
18730b57cec5SDimitry Andric ///      01: Negate the bit mask. \n
18740b57cec5SDimitry Andric ///      10: No effect. \n
18750b57cec5SDimitry Andric ///      11: Negate the bit mask only for bits with an index less than or equal
18760b57cec5SDimitry Andric ///          to the size of \a A or \a B.
18770b57cec5SDimitry Andric /// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
/* Wraps __builtin_ia32_pcmpistric128: A and B are cast via __m128i to __v16qi,
 * M and the result are cast to int. */
#define _mm_cmpistrc(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistric128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
18810b57cec5SDimitry Andric 
18820b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string
18830b57cec5SDimitry Andric ///    data with implicitly defined lengths that is contained in source operands
18840b57cec5SDimitry Andric ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
18850b57cec5SDimitry Andric ///
18860b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
18870b57cec5SDimitry Andric ///
18880b57cec5SDimitry Andric /// \code
18890b57cec5SDimitry Andric /// int _mm_cmpistro(__m128i A, __m128i B, const int M);
18900b57cec5SDimitry Andric /// \endcode
18910b57cec5SDimitry Andric ///
18920b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
18930b57cec5SDimitry Andric /// instruction.
18940b57cec5SDimitry Andric ///
18950b57cec5SDimitry Andric /// \param A
18960b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
18970b57cec5SDimitry Andric ///    compared.
18980b57cec5SDimitry Andric /// \param B
18990b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
19000b57cec5SDimitry Andric ///    compared.
19010b57cec5SDimitry Andric /// \param M
19020b57cec5SDimitry Andric ///    An 8-bit immediate operand specifying whether the characters are bytes or
19030b57cec5SDimitry Andric ///    words and the type of comparison to perform. \n
19040b57cec5SDimitry Andric ///    Bits [1:0]: Determine source data format. \n
19050b57cec5SDimitry Andric ///      00: 16 unsigned bytes \n
19060b57cec5SDimitry Andric ///      01: 8 unsigned words \n
19070b57cec5SDimitry Andric ///      10: 16 signed bytes \n
19080b57cec5SDimitry Andric ///      11: 8 signed words \n
19090b57cec5SDimitry Andric ///    Bits [3:2]: Determine comparison type and aggregation method. \n
19100b57cec5SDimitry Andric ///      00: Subset: Each character in \a B is compared for equality with all
19110b57cec5SDimitry Andric ///          the characters in \a A. \n
19120b57cec5SDimitry Andric ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
19130b57cec5SDimitry Andric ///          basis is greater than or equal for even-indexed elements in \a A,
19140b57cec5SDimitry Andric ///          and less than or equal for odd-indexed elements in \a A. \n
19150b57cec5SDimitry Andric ///      10: Match: Compare each pair of corresponding characters in \a A and
19160b57cec5SDimitry Andric ///          \a B for equality. \n
19170b57cec5SDimitry Andric ///      11: Substring: Search \a B for substring matches of \a A. \n
19180b57cec5SDimitry Andric ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
19190b57cec5SDimitry Andric ///                mask of the comparison results. \n
19200b57cec5SDimitry Andric ///      00: No effect. \n
19210b57cec5SDimitry Andric ///      01: Negate the bit mask. \n
19220b57cec5SDimitry Andric ///      10: No effect. \n
19230b57cec5SDimitry Andric ///      11: Negate the bit mask only for bits with an index less than or equal
19240b57cec5SDimitry Andric ///          to the size of \a A or \a B. \n
19250b57cec5SDimitry Andric /// \returns Returns bit 0 of the resulting bit mask.
/* Wraps __builtin_ia32_pcmpistrio128: A and B are cast via __m128i to __v16qi,
 * M and the result are cast to int. */
#define _mm_cmpistro(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistrio128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
19290b57cec5SDimitry Andric 
19300b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string
19310b57cec5SDimitry Andric ///    data with implicitly defined lengths that is contained in source operands
19320b57cec5SDimitry Andric ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
19330b57cec5SDimitry Andric ///    the maximum, otherwise, returns 0.
19340b57cec5SDimitry Andric ///
19350b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
19360b57cec5SDimitry Andric ///
19370b57cec5SDimitry Andric /// \code
19380b57cec5SDimitry Andric /// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
19390b57cec5SDimitry Andric /// \endcode
19400b57cec5SDimitry Andric ///
19410b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
19420b57cec5SDimitry Andric /// instruction.
19430b57cec5SDimitry Andric ///
19440b57cec5SDimitry Andric /// \param A
19450b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
19460b57cec5SDimitry Andric ///    compared.
19470b57cec5SDimitry Andric /// \param B
19480b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
19490b57cec5SDimitry Andric ///    compared.
19500b57cec5SDimitry Andric /// \param M
19510b57cec5SDimitry Andric ///    An 8-bit immediate operand specifying whether the characters are bytes or
19520b57cec5SDimitry Andric ///    words and the type of comparison to perform. \n
19530b57cec5SDimitry Andric ///    Bits [1:0]: Determine source data format. \n
19540b57cec5SDimitry Andric ///      00: 16 unsigned bytes \n
19550b57cec5SDimitry Andric ///      01: 8 unsigned words \n
19560b57cec5SDimitry Andric ///      10: 16 signed bytes \n
19570b57cec5SDimitry Andric ///      11: 8 signed words \n
19580b57cec5SDimitry Andric ///    Bits [3:2]: Determine comparison type and aggregation method. \n
19590b57cec5SDimitry Andric ///      00: Subset: Each character in \a B is compared for equality with all
19600b57cec5SDimitry Andric ///          the characters in \a A. \n
19610b57cec5SDimitry Andric ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
19620b57cec5SDimitry Andric ///          basis is greater than or equal for even-indexed elements in \a A,
19630b57cec5SDimitry Andric ///          and less than or equal for odd-indexed elements in \a A. \n
19640b57cec5SDimitry Andric ///      10: Match: Compare each pair of corresponding characters in \a A and
19650b57cec5SDimitry Andric ///          \a B for equality. \n
19660b57cec5SDimitry Andric ///      11: Substring: Search \a B for substring matches of \a A. \n
19670b57cec5SDimitry Andric ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
19680b57cec5SDimitry Andric ///                mask of the comparison results. \n
19690b57cec5SDimitry Andric ///      00: No effect. \n
19700b57cec5SDimitry Andric ///      01: Negate the bit mask. \n
19710b57cec5SDimitry Andric ///      10: No effect. \n
19720b57cec5SDimitry Andric ///      11: Negate the bit mask only for bits with an index less than or equal
19730b57cec5SDimitry Andric ///          to the size of \a A or \a B. \n
19740b57cec5SDimitry Andric /// \returns Returns 1 if the length of the string in \a A is less than the
19750b57cec5SDimitry Andric ///    maximum, otherwise, returns 0.
/* Wraps __builtin_ia32_pcmpistris128: A and B are cast via __m128i to __v16qi,
 * M and the result are cast to int. */
#define _mm_cmpistrs(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistris128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
19790b57cec5SDimitry Andric 
19800b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string
19810b57cec5SDimitry Andric ///    data with implicitly defined lengths that is contained in source operands
19820b57cec5SDimitry Andric ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
19830b57cec5SDimitry Andric ///    the maximum, otherwise, returns 0.
19840b57cec5SDimitry Andric ///
19850b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
19860b57cec5SDimitry Andric ///
19870b57cec5SDimitry Andric /// \code
19880b57cec5SDimitry Andric /// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
19890b57cec5SDimitry Andric /// \endcode
19900b57cec5SDimitry Andric ///
19910b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
19920b57cec5SDimitry Andric /// instruction.
19930b57cec5SDimitry Andric ///
19940b57cec5SDimitry Andric /// \param A
19950b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
19960b57cec5SDimitry Andric ///    compared.
19970b57cec5SDimitry Andric /// \param B
19980b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
19990b57cec5SDimitry Andric ///    compared.
20000b57cec5SDimitry Andric /// \param M
20010b57cec5SDimitry Andric ///    An 8-bit immediate operand specifying whether the characters are bytes or
20020b57cec5SDimitry Andric ///    words and the type of comparison to perform. \n
20030b57cec5SDimitry Andric ///    Bits [1:0]: Determine source data format. \n
20040b57cec5SDimitry Andric ///      00: 16 unsigned bytes \n
20050b57cec5SDimitry Andric ///      01: 8 unsigned words \n
20060b57cec5SDimitry Andric ///      10: 16 signed bytes \n
20070b57cec5SDimitry Andric ///      11: 8 signed words \n
20080b57cec5SDimitry Andric ///    Bits [3:2]: Determine comparison type and aggregation method. \n
20090b57cec5SDimitry Andric ///      00: Subset: Each character in \a B is compared for equality with all
20100b57cec5SDimitry Andric ///          the characters in \a A. \n
20110b57cec5SDimitry Andric ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
20120b57cec5SDimitry Andric ///          basis is greater than or equal for even-indexed elements in \a A,
20130b57cec5SDimitry Andric ///          and less than or equal for odd-indexed elements in \a A. \n
20140b57cec5SDimitry Andric ///      10: Match: Compare each pair of corresponding characters in \a A and
20150b57cec5SDimitry Andric ///          \a B for equality. \n
20160b57cec5SDimitry Andric ///      11: Substring: Search \a B for substring matches of \a A. \n
20170b57cec5SDimitry Andric ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
20180b57cec5SDimitry Andric ///                mask of the comparison results. \n
20190b57cec5SDimitry Andric ///      00: No effect. \n
20200b57cec5SDimitry Andric ///      01: Negate the bit mask. \n
20210b57cec5SDimitry Andric ///      10: No effect. \n
20220b57cec5SDimitry Andric ///      11: Negate the bit mask only for bits with an index less than or equal
20230b57cec5SDimitry Andric ///          to the size of \a A or \a B.
20240b57cec5SDimitry Andric /// \returns Returns 1 if the length of the string in \a B is less than the
20250b57cec5SDimitry Andric ///    maximum, otherwise, returns 0.
/* Wraps __builtin_ia32_pcmpistriz128: A and B are cast via __m128i to __v16qi,
 * M and the result are cast to int. */
#define _mm_cmpistrz(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistriz128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
20290b57cec5SDimitry Andric 
20300b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string
20310b57cec5SDimitry Andric ///    data with explicitly defined lengths that is contained in source operands
20320b57cec5SDimitry Andric ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
20330b57cec5SDimitry Andric ///    string in \a B is the maximum, otherwise, returns 0.
20340b57cec5SDimitry Andric ///
20350b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
20360b57cec5SDimitry Andric ///
20370b57cec5SDimitry Andric /// \code
20380b57cec5SDimitry Andric /// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
20390b57cec5SDimitry Andric /// \endcode
20400b57cec5SDimitry Andric ///
20410b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
20420b57cec5SDimitry Andric /// instruction.
20430b57cec5SDimitry Andric ///
20440b57cec5SDimitry Andric /// \param A
20450b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
20460b57cec5SDimitry Andric ///    compared.
20470b57cec5SDimitry Andric /// \param LA
20480b57cec5SDimitry Andric ///    An integer that specifies the length of the string in \a A.
20490b57cec5SDimitry Andric /// \param B
20500b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
20510b57cec5SDimitry Andric ///    compared.
20520b57cec5SDimitry Andric /// \param LB
20530b57cec5SDimitry Andric ///    An integer that specifies the length of the string in \a B.
20540b57cec5SDimitry Andric /// \param M
20550b57cec5SDimitry Andric ///    An 8-bit immediate operand specifying whether the characters are bytes or
20560b57cec5SDimitry Andric ///    words and the type of comparison to perform. \n
20570b57cec5SDimitry Andric ///    Bits [1:0]: Determine source data format. \n
20580b57cec5SDimitry Andric ///      00: 16 unsigned bytes \n
20590b57cec5SDimitry Andric ///      01: 8 unsigned words \n
20600b57cec5SDimitry Andric ///      10: 16 signed bytes \n
20610b57cec5SDimitry Andric ///      11: 8 signed words \n
20620b57cec5SDimitry Andric ///    Bits [3:2]: Determine comparison type and aggregation method. \n
20630b57cec5SDimitry Andric ///      00: Subset: Each character in \a B is compared for equality with all
20640b57cec5SDimitry Andric ///          the characters in \a A. \n
20650b57cec5SDimitry Andric ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
20660b57cec5SDimitry Andric ///          basis is greater than or equal for even-indexed elements in \a A,
20670b57cec5SDimitry Andric ///          and less than or equal for odd-indexed elements in \a A. \n
20680b57cec5SDimitry Andric ///      10: Match: Compare each pair of corresponding characters in \a A and
20690b57cec5SDimitry Andric ///          \a B for equality. \n
20700b57cec5SDimitry Andric ///      11: Substring: Search \a B for substring matches of \a A. \n
20710b57cec5SDimitry Andric ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
20720b57cec5SDimitry Andric ///                mask of the comparison results. \n
20730b57cec5SDimitry Andric ///      00: No effect. \n
20740b57cec5SDimitry Andric ///      01: Negate the bit mask. \n
20750b57cec5SDimitry Andric ///      10: No effect. \n
20760b57cec5SDimitry Andric ///      11: Negate the bit mask only for bits with an index less than or equal
20770b57cec5SDimitry Andric ///          to the size of \a A or \a B.
20780b57cec5SDimitry Andric /// \returns Returns 1 if the bit mask is zero and the length of the string in
20790b57cec5SDimitry Andric ///    \a B is the maximum, otherwise, returns 0.
/* Wraps __builtin_ia32_pcmpestria128: A and B are cast via __m128i to __v16qi;
 * LA, LB, M and the result are cast to int. */
#define _mm_cmpestra(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestria128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
20840b57cec5SDimitry Andric 
20850b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string
20860b57cec5SDimitry Andric ///    data with explicitly defined lengths that is contained in source operands
20870b57cec5SDimitry Andric ///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
20880b57cec5SDimitry Andric ///    returns 0.
20890b57cec5SDimitry Andric ///
20900b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
20910b57cec5SDimitry Andric ///
20920b57cec5SDimitry Andric /// \code
20930b57cec5SDimitry Andric /// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
20940b57cec5SDimitry Andric /// \endcode
20950b57cec5SDimitry Andric ///
20960b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
20970b57cec5SDimitry Andric /// instruction.
20980b57cec5SDimitry Andric ///
20990b57cec5SDimitry Andric /// \param A
21000b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
21010b57cec5SDimitry Andric ///    compared.
21020b57cec5SDimitry Andric /// \param LA
21030b57cec5SDimitry Andric ///    An integer that specifies the length of the string in \a A.
21040b57cec5SDimitry Andric /// \param B
21050b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
21060b57cec5SDimitry Andric ///    compared.
21070b57cec5SDimitry Andric /// \param LB
21080b57cec5SDimitry Andric ///    An integer that specifies the length of the string in \a B.
21090b57cec5SDimitry Andric /// \param M
21100b57cec5SDimitry Andric ///    An 8-bit immediate operand specifying whether the characters are bytes or
21110b57cec5SDimitry Andric ///    words and the type of comparison to perform. \n
21120b57cec5SDimitry Andric ///    Bits [1:0]: Determine source data format. \n
21130b57cec5SDimitry Andric ///      00: 16 unsigned bytes \n
21140b57cec5SDimitry Andric ///      01: 8 unsigned words \n
21150b57cec5SDimitry Andric ///      10: 16 signed bytes \n
21160b57cec5SDimitry Andric ///      11: 8 signed words \n
21170b57cec5SDimitry Andric ///    Bits [3:2]: Determine comparison type and aggregation method. \n
21180b57cec5SDimitry Andric ///      00: Subset: Each character in \a B is compared for equality with all
21190b57cec5SDimitry Andric ///          the characters in \a A. \n
21200b57cec5SDimitry Andric ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
21210b57cec5SDimitry Andric ///          basis is greater than or equal for even-indexed elements in \a A,
21220b57cec5SDimitry Andric ///          and less than or equal for odd-indexed elements in \a A. \n
21230b57cec5SDimitry Andric ///      10: Match: Compare each pair of corresponding characters in \a A and
21240b57cec5SDimitry Andric ///          \a B for equality. \n
21250b57cec5SDimitry Andric ///      11: Substring: Search \a B for substring matches of \a A. \n
21260b57cec5SDimitry Andric ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
21270b57cec5SDimitry Andric ///                mask of the comparison results. \n
21280b57cec5SDimitry Andric ///      00: No effect. \n
21290b57cec5SDimitry Andric ///      01: Negate the bit mask. \n
21300b57cec5SDimitry Andric ///      10: No effect. \n
21310b57cec5SDimitry Andric ///      11: Negate the bit mask only for bits with an index less than or equal
21320b57cec5SDimitry Andric ///          to the size of \a A or \a B. \n
21330b57cec5SDimitry Andric /// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
/* Wraps __builtin_ia32_pcmpestric128: A and B are cast via __m128i to __v16qi;
 * LA, LB, M and the result are cast to int. */
#define _mm_cmpestrc(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestric128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
21380b57cec5SDimitry Andric 
21390b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string
21400b57cec5SDimitry Andric ///    data with explicitly defined lengths that is contained in source operands
21410b57cec5SDimitry Andric ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
21420b57cec5SDimitry Andric ///
21430b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
21440b57cec5SDimitry Andric ///
21450b57cec5SDimitry Andric /// \code
21460b57cec5SDimitry Andric /// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
21470b57cec5SDimitry Andric /// \endcode
21480b57cec5SDimitry Andric ///
21490b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
21500b57cec5SDimitry Andric /// instruction.
21510b57cec5SDimitry Andric ///
21520b57cec5SDimitry Andric /// \param A
21530b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
21540b57cec5SDimitry Andric ///    compared.
21550b57cec5SDimitry Andric /// \param LA
21560b57cec5SDimitry Andric ///    An integer that specifies the length of the string in \a A.
21570b57cec5SDimitry Andric /// \param B
21580b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
21590b57cec5SDimitry Andric ///    compared.
21600b57cec5SDimitry Andric /// \param LB
21610b57cec5SDimitry Andric ///    An integer that specifies the length of the string in \a B.
21620b57cec5SDimitry Andric /// \param M
21630b57cec5SDimitry Andric ///    An 8-bit immediate operand specifying whether the characters are bytes or
21640b57cec5SDimitry Andric ///    words and the type of comparison to perform. \n
21650b57cec5SDimitry Andric ///    Bits [1:0]: Determine source data format. \n
21660b57cec5SDimitry Andric ///      00: 16 unsigned bytes \n
21670b57cec5SDimitry Andric ///      01: 8 unsigned words \n
21680b57cec5SDimitry Andric ///      10: 16 signed bytes \n
21690b57cec5SDimitry Andric ///      11: 8 signed words \n
21700b57cec5SDimitry Andric ///    Bits [3:2]: Determine comparison type and aggregation method. \n
21710b57cec5SDimitry Andric ///      00: Subset: Each character in \a B is compared for equality with all
21720b57cec5SDimitry Andric ///          the characters in \a A. \n
21730b57cec5SDimitry Andric ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
21740b57cec5SDimitry Andric ///          basis is greater than or equal for even-indexed elements in \a A,
21750b57cec5SDimitry Andric ///          and less than or equal for odd-indexed elements in \a A. \n
21760b57cec5SDimitry Andric ///      10: Match: Compare each pair of corresponding characters in \a A and
21770b57cec5SDimitry Andric ///          \a B for equality. \n
21780b57cec5SDimitry Andric ///      11: Substring: Search \a B for substring matches of \a A. \n
21790b57cec5SDimitry Andric ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
21800b57cec5SDimitry Andric ///                mask of the comparison results. \n
21810b57cec5SDimitry Andric ///      00: No effect. \n
21820b57cec5SDimitry Andric ///      01: Negate the bit mask. \n
21830b57cec5SDimitry Andric ///      10: No effect. \n
21840b57cec5SDimitry Andric ///      11: Negate the bit mask only for bits with an index less than or equal
21850b57cec5SDimitry Andric ///          to the size of \a A or \a B.
21860b57cec5SDimitry Andric /// \returns Returns bit 0 of the resulting bit mask.
/* Wraps __builtin_ia32_pcmpestrio128: A and B are cast via __m128i to __v16qi;
 * LA, LB, M and the result are cast to int. */
#define _mm_cmpestro(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestrio128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
21910b57cec5SDimitry Andric 
21920b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string
21930b57cec5SDimitry Andric ///    data with explicitly defined lengths that is contained in source operands
21940b57cec5SDimitry Andric ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
21950b57cec5SDimitry Andric ///    the maximum, otherwise, returns 0.
21960b57cec5SDimitry Andric ///
21970b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
21980b57cec5SDimitry Andric ///
21990b57cec5SDimitry Andric /// \code
22000b57cec5SDimitry Andric /// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
22010b57cec5SDimitry Andric /// \endcode
22020b57cec5SDimitry Andric ///
22030b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
22040b57cec5SDimitry Andric /// instruction.
22050b57cec5SDimitry Andric ///
22060b57cec5SDimitry Andric /// \param A
22070b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
22080b57cec5SDimitry Andric ///    compared.
22090b57cec5SDimitry Andric /// \param LA
22100b57cec5SDimitry Andric ///    An integer that specifies the length of the string in \a A.
22110b57cec5SDimitry Andric /// \param B
22120b57cec5SDimitry Andric ///    A 128-bit integer vector containing one of the source operands to be
22130b57cec5SDimitry Andric ///    compared.
22140b57cec5SDimitry Andric /// \param LB
22150b57cec5SDimitry Andric ///    An integer that specifies the length of the string in \a B.
22160b57cec5SDimitry Andric /// \param M
22170b57cec5SDimitry Andric ///    An 8-bit immediate operand specifying whether the characters are bytes or
22180b57cec5SDimitry Andric ///    words and the type of comparison to perform. \n
22190b57cec5SDimitry Andric ///    Bits [1:0]: Determine source data format. \n
22200b57cec5SDimitry Andric ///      00: 16 unsigned bytes \n
22210b57cec5SDimitry Andric ///      01: 8 unsigned words \n
22220b57cec5SDimitry Andric ///      10: 16 signed bytes \n
22230b57cec5SDimitry Andric ///      11: 8 signed words \n
22240b57cec5SDimitry Andric ///    Bits [3:2]: Determine comparison type and aggregation method. \n
22250b57cec5SDimitry Andric ///      00: Subset: Each character in \a B is compared for equality with all
22260b57cec5SDimitry Andric ///          the characters in \a A. \n
22270b57cec5SDimitry Andric ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
22280b57cec5SDimitry Andric ///          basis is greater than or equal for even-indexed elements in \a A,
22290b57cec5SDimitry Andric ///          and less than or equal for odd-indexed elements in \a A. \n
22300b57cec5SDimitry Andric ///      10: Match: Compare each pair of corresponding characters in \a A and
22310b57cec5SDimitry Andric ///          \a B for equality. \n
22320b57cec5SDimitry Andric ///      11: Substring: Search \a B for substring matches of \a A. \n
22330b57cec5SDimitry Andric ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
22340b57cec5SDimitry Andric ///                mask of the comparison results. \n
22350b57cec5SDimitry Andric ///      00: No effect. \n
22360b57cec5SDimitry Andric ///      01: Negate the bit mask. \n
22370b57cec5SDimitry Andric ///      10: No effect. \n
22380b57cec5SDimitry Andric ///      11: Negate the bit mask only for bits with an index less than or equal
22390b57cec5SDimitry Andric ///          to the size of \a A or \a B. \n
22400b57cec5SDimitry Andric /// \returns Returns 1 if the length of the string in \a A is less than the
22410b57cec5SDimitry Andric ///    maximum, otherwise, returns 0.
/* Wraps __builtin_ia32_pcmpestris128: A and B are cast via __m128i to __v16qi;
 * LA, LB, M and the result are cast to int. */
#define _mm_cmpestrs(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestris128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
22460b57cec5SDimitry Andric 
/// Compares string data of explicitly specified length held in source
///    operands \a A and \a B, under control of the immediate operand \a M.
///    The result is 1 when the length of the string in \a B is less than the
///    maximum, and 0 otherwise.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction.
///
/// \param A
///    A 128-bit integer vector holding one of the two source operands of the
///    comparison.
/// \param LA
///    An integer giving the length of the string in \a A.
/// \param B
///    A 128-bit integer vector holding one of the two source operands of the
///    comparison.
/// \param LB
///    An integer giving the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand that selects whether the characters are bytes
///    or words and which type of comparison is performed. \n
///    Bits [1:0]: Select the source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Select the comparison type and aggregation method. \n
///      00: Subset: Every character in \a B is tested for equality against
///          all the characters in \a A. \n
///      01: Ranges: Every character in \a B is compared to \a A. The
///          comparison is greater than or equal for even-indexed elements in
///          \a A, and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Each pair of corresponding characters in \a A and \a B is
///          compared for equality. \n
///      11: Substring: \a B is searched for substring matches of \a A. \n
///    Bits [5:4]: Select whether a one's complement is applied to the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns 1 if the length of the string in \a B is less than the maximum;
///    otherwise 0.
#define _mm_cmpestrz(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestriz128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA),                                        \
      (__v16qi)(__m128i)(B), (int)(LB),                                        \
      (int)(M)))
23000b57cec5SDimitry Andric 
23010b57cec5SDimitry Andric /* SSE4.2 Compare Packed Data -- Greater Than.  */
23020b57cec5SDimitry Andric /// Compares each of the corresponding 64-bit values of the 128-bit
23030b57cec5SDimitry Andric ///    integer vectors to determine if the values in the first operand are
23040b57cec5SDimitry Andric ///    greater than those in the second operand.
23050b57cec5SDimitry Andric ///
2306*0fca6ea1SDimitry Andric ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
2307*0fca6ea1SDimitry Andric ///
23080b57cec5SDimitry Andric /// \headerfile <x86intrin.h>
23090b57cec5SDimitry Andric ///
23100b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
23110b57cec5SDimitry Andric ///
23120b57cec5SDimitry Andric /// \param __V1
23130b57cec5SDimitry Andric ///    A 128-bit integer vector.
23140b57cec5SDimitry Andric /// \param __V2
23150b57cec5SDimitry Andric ///    A 128-bit integer vector.
23160b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the comparison results.
231781ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
231881ad6265SDimitry Andric                                                              __m128i __V2) {
23190b57cec5SDimitry Andric   return (__m128i)((__v2di)__V1 > (__v2di)__V2);
23200b57cec5SDimitry Andric }
23210b57cec5SDimitry Andric 
23220b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS
23230b57cec5SDimitry Andric 
23240b57cec5SDimitry Andric #include <popcntintrin.h>
23250b57cec5SDimitry Andric 
2326349cc55cSDimitry Andric #include <crc32intrin.h>
2327349cc55cSDimitry Andric 
23280b57cec5SDimitry Andric #endif /* __SMMINTRIN_H */
2329