10b57cec5SDimitry Andric /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== 20b57cec5SDimitry Andric * 30b57cec5SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric * See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric * 70b57cec5SDimitry Andric *===-----------------------------------------------------------------------=== 80b57cec5SDimitry Andric */ 90b57cec5SDimitry Andric 100b57cec5SDimitry Andric #ifndef __SMMINTRIN_H 110b57cec5SDimitry Andric #define __SMMINTRIN_H 120b57cec5SDimitry Andric 13349cc55cSDimitry Andric #if !defined(__i386__) && !defined(__x86_64__) 14349cc55cSDimitry Andric #error "This header is only meant to be used on x86 and x64 architecture" 15349cc55cSDimitry Andric #endif 16349cc55cSDimitry Andric 170b57cec5SDimitry Andric #include <tmmintrin.h> 180b57cec5SDimitry Andric 190b57cec5SDimitry Andric /* Define the default attributes for the functions in this file. */ 2081ad6265SDimitry Andric #define __DEFAULT_FN_ATTRS \ 215f757f3fSDimitry Andric __attribute__((__always_inline__, __nodebug__, \ 225f757f3fSDimitry Andric __target__("sse4.1,no-evex512"), __min_vector_width__(128))) 230b57cec5SDimitry Andric 240b57cec5SDimitry Andric /* SSE4 Rounding macros. 
*/ 250b57cec5SDimitry Andric #define _MM_FROUND_TO_NEAREST_INT 0x00 260b57cec5SDimitry Andric #define _MM_FROUND_TO_NEG_INF 0x01 270b57cec5SDimitry Andric #define _MM_FROUND_TO_POS_INF 0x02 280b57cec5SDimitry Andric #define _MM_FROUND_TO_ZERO 0x03 290b57cec5SDimitry Andric #define _MM_FROUND_CUR_DIRECTION 0x04 300b57cec5SDimitry Andric 310b57cec5SDimitry Andric #define _MM_FROUND_RAISE_EXC 0x00 320b57cec5SDimitry Andric #define _MM_FROUND_NO_EXC 0x08 330b57cec5SDimitry Andric 340b57cec5SDimitry Andric #define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) 350b57cec5SDimitry Andric #define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) 360b57cec5SDimitry Andric #define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) 370b57cec5SDimitry Andric #define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) 380b57cec5SDimitry Andric #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) 390b57cec5SDimitry Andric #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) 400b57cec5SDimitry Andric 410b57cec5SDimitry Andric /// Rounds up each element of the 128-bit vector of [4 x float] to an 420b57cec5SDimitry Andric /// integer and returns the rounded values in a 128-bit vector of 430b57cec5SDimitry Andric /// [4 x float]. 440b57cec5SDimitry Andric /// 450b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 460b57cec5SDimitry Andric /// 470b57cec5SDimitry Andric /// \code 480b57cec5SDimitry Andric /// __m128 _mm_ceil_ps(__m128 X); 490b57cec5SDimitry Andric /// \endcode 500b57cec5SDimitry Andric /// 510b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction. 520b57cec5SDimitry Andric /// 530b57cec5SDimitry Andric /// \param X 540b57cec5SDimitry Andric /// A 128-bit vector of [4 x float] values to be rounded up. 550b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the rounded values. 
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)

/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)

/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
                                  (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                   (M)))

/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
                                   (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
                                  (int)(M)))

/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
///
/// \param __V1
///    A 128-bit vector of [2 x double].
/// \param __V2
///    A 128-bit vector of [2 x double].
/// \param __M
///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    most significant bit of a copied value. When a mask bit is 0, the
///    corresponding 64-bit element in operand \a __V1 is copied to the same
///    position in the result. When a mask bit is 1, the corresponding 64-bit
///    element in operand \a __V2 is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
43681ad6265SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1, 43781ad6265SDimitry Andric __m128d __V2, 43881ad6265SDimitry Andric __m128d __M) { 4390b57cec5SDimitry Andric return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2, 4400b57cec5SDimitry Andric (__v2df)__M); 4410b57cec5SDimitry Andric } 4420b57cec5SDimitry Andric 4430b57cec5SDimitry Andric /// Returns a 128-bit vector of [4 x float] where the values are 4440b57cec5SDimitry Andric /// selected from either the first or second operand as specified by the 4450b57cec5SDimitry Andric /// third operand, the control mask. 4460b57cec5SDimitry Andric /// 4470b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 4480b57cec5SDimitry Andric /// 4490b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. 4500b57cec5SDimitry Andric /// 4510b57cec5SDimitry Andric /// \param __V1 4520b57cec5SDimitry Andric /// A 128-bit vector of [4 x float]. 4530b57cec5SDimitry Andric /// \param __V2 4540b57cec5SDimitry Andric /// A 128-bit vector of [4 x float]. 4550b57cec5SDimitry Andric /// \param __M 4560b57cec5SDimitry Andric /// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 4570b57cec5SDimitry Andric /// how the values are to be copied. The position of the mask bit corresponds 4580b57cec5SDimitry Andric /// to the most significant bit of a copied value. When a mask bit is 0, the 4590b57cec5SDimitry Andric /// corresponding 32-bit element in operand \a __V1 is copied to the same 4600b57cec5SDimitry Andric /// position in the result. When a mask bit is 1, the corresponding 32-bit 4610b57cec5SDimitry Andric /// element in operand \a __V2 is copied to the same position in the result. 4620b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the copied values. 
46381ad6265SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1, 46481ad6265SDimitry Andric __m128 __V2, 46581ad6265SDimitry Andric __m128 __M) { 4660b57cec5SDimitry Andric return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2, 4670b57cec5SDimitry Andric (__v4sf)__M); 4680b57cec5SDimitry Andric } 4690b57cec5SDimitry Andric 4700b57cec5SDimitry Andric /// Returns a 128-bit vector of [16 x i8] where the values are selected 4710b57cec5SDimitry Andric /// from either of the first or second operand as specified by the third 4720b57cec5SDimitry Andric /// operand, the control mask. 4730b57cec5SDimitry Andric /// 4740b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 4750b57cec5SDimitry Andric /// 4760b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. 4770b57cec5SDimitry Andric /// 4780b57cec5SDimitry Andric /// \param __V1 4790b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. 4800b57cec5SDimitry Andric /// \param __V2 4810b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. 4820b57cec5SDimitry Andric /// \param __M 4830b57cec5SDimitry Andric /// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying 4840b57cec5SDimitry Andric /// how the values are to be copied. The position of the mask bit corresponds 4850b57cec5SDimitry Andric /// to the most significant bit of a copied value. When a mask bit is 0, the 4860b57cec5SDimitry Andric /// corresponding 8-bit element in operand \a __V1 is copied to the same 4870b57cec5SDimitry Andric /// position in the result. When a mask bit is 1, the corresponding 8-bit 4880b57cec5SDimitry Andric /// element in operand \a __V2 is copied to the same position in the result. 4890b57cec5SDimitry Andric /// \returns A 128-bit vector of [16 x i8] containing the copied values. 
49081ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1, 49181ad6265SDimitry Andric __m128i __V2, 49281ad6265SDimitry Andric __m128i __M) { 4930b57cec5SDimitry Andric return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2, 4940b57cec5SDimitry Andric (__v16qi)__M); 4950b57cec5SDimitry Andric } 4960b57cec5SDimitry Andric 4970b57cec5SDimitry Andric /// Returns a 128-bit vector of [8 x i16] where the values are selected 4980b57cec5SDimitry Andric /// from either of the first or second operand as specified by the third 4990b57cec5SDimitry Andric /// operand, the control mask. 5000b57cec5SDimitry Andric /// 5010b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 5020b57cec5SDimitry Andric /// 5030b57cec5SDimitry Andric /// \code 5040b57cec5SDimitry Andric /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); 5050b57cec5SDimitry Andric /// \endcode 5060b57cec5SDimitry Andric /// 5070b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. 5080b57cec5SDimitry Andric /// 5090b57cec5SDimitry Andric /// \param V1 5100b57cec5SDimitry Andric /// A 128-bit vector of [8 x i16]. 5110b57cec5SDimitry Andric /// \param V2 5120b57cec5SDimitry Andric /// A 128-bit vector of [8 x i16]. 5130b57cec5SDimitry Andric /// \param M 5140b57cec5SDimitry Andric /// An immediate integer operand, with mask bits [7:0] specifying how the 5150b57cec5SDimitry Andric /// values are to be copied. The position of the mask bit corresponds to the 5160b57cec5SDimitry Andric /// index of a copied value. When a mask bit is 0, the corresponding 16-bit 5170b57cec5SDimitry Andric /// element in operand \a V1 is copied to the same position in the result. 5180b57cec5SDimitry Andric /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 5190b57cec5SDimitry Andric /// is copied to the same position in the result. 
5200b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the copied values. 5210b57cec5SDimitry Andric #define _mm_blend_epi16(V1, V2, M) \ 522349cc55cSDimitry Andric ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \ 523349cc55cSDimitry Andric (__v8hi)(__m128i)(V2), (int)(M))) 5240b57cec5SDimitry Andric 5250b57cec5SDimitry Andric /* SSE4 Dword Multiply Instructions. */ 5260b57cec5SDimitry Andric /// Multiples corresponding elements of two 128-bit vectors of [4 x i32] 5270b57cec5SDimitry Andric /// and returns the lower 32 bits of the each product in a 128-bit vector of 5280b57cec5SDimitry Andric /// [4 x i32]. 5290b57cec5SDimitry Andric /// 5300b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 5310b57cec5SDimitry Andric /// 5320b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. 5330b57cec5SDimitry Andric /// 5340b57cec5SDimitry Andric /// \param __V1 5350b57cec5SDimitry Andric /// A 128-bit integer vector. 5360b57cec5SDimitry Andric /// \param __V2 5370b57cec5SDimitry Andric /// A 128-bit integer vector. 5380b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the products of both operands. 53981ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1, 54081ad6265SDimitry Andric __m128i __V2) { 5410b57cec5SDimitry Andric return (__m128i)((__v4su)__V1 * (__v4su)__V2); 5420b57cec5SDimitry Andric } 5430b57cec5SDimitry Andric 5440b57cec5SDimitry Andric /// Multiplies corresponding even-indexed elements of two 128-bit 5450b57cec5SDimitry Andric /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 5460b57cec5SDimitry Andric /// containing the products. 5470b57cec5SDimitry Andric /// 5480b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 5490b57cec5SDimitry Andric /// 5500b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. 
5510b57cec5SDimitry Andric /// 5520b57cec5SDimitry Andric /// \param __V1 5530b57cec5SDimitry Andric /// A 128-bit vector of [4 x i32]. 5540b57cec5SDimitry Andric /// \param __V2 5550b57cec5SDimitry Andric /// A 128-bit vector of [4 x i32]. 5560b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the products of both 5570b57cec5SDimitry Andric /// operands. 55881ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1, 55981ad6265SDimitry Andric __m128i __V2) { 5600b57cec5SDimitry Andric return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2); 5610b57cec5SDimitry Andric } 5620b57cec5SDimitry Andric 5630b57cec5SDimitry Andric /* SSE4 Floating Point Dot Product Instructions. */ 5640b57cec5SDimitry Andric /// Computes the dot product of the two 128-bit vectors of [4 x float] 5650b57cec5SDimitry Andric /// and returns it in the elements of the 128-bit result vector of 5660b57cec5SDimitry Andric /// [4 x float]. 5670b57cec5SDimitry Andric /// 5680b57cec5SDimitry Andric /// The immediate integer operand controls which input elements 5690b57cec5SDimitry Andric /// will contribute to the dot product, and where the final results are 5700b57cec5SDimitry Andric /// returned. 5710b57cec5SDimitry Andric /// 5720b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 5730b57cec5SDimitry Andric /// 5740b57cec5SDimitry Andric /// \code 5750b57cec5SDimitry Andric /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); 5760b57cec5SDimitry Andric /// \endcode 5770b57cec5SDimitry Andric /// 5780b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction. 5790b57cec5SDimitry Andric /// 5800b57cec5SDimitry Andric /// \param X 5810b57cec5SDimitry Andric /// A 128-bit vector of [4 x float]. 5820b57cec5SDimitry Andric /// \param Y 5830b57cec5SDimitry Andric /// A 128-bit vector of [4 x float]. 
5840b57cec5SDimitry Andric /// \param M 5850b57cec5SDimitry Andric /// An immediate integer operand. Mask bits [7:4] determine which elements 5860b57cec5SDimitry Andric /// of the input vectors are used, with bit [4] corresponding to the lowest 5870b57cec5SDimitry Andric /// element and bit [7] corresponding to the highest element of each [4 x 5880b57cec5SDimitry Andric /// float] vector. If a bit is set, the corresponding elements from the two 5890b57cec5SDimitry Andric /// input vectors are used as an input for dot product; otherwise that input 5900b57cec5SDimitry Andric /// is treated as zero. Bits [3:0] determine which elements of the result 5910b57cec5SDimitry Andric /// will receive a copy of the final dot product, with bit [0] corresponding 5920b57cec5SDimitry Andric /// to the lowest element and bit [3] corresponding to the highest element of 5930b57cec5SDimitry Andric /// each [4 x float] subvector. If a bit is set, the dot product is returned 5940b57cec5SDimitry Andric /// in the corresponding element; otherwise that element is set to zero. 5950b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the dot product. 5960b57cec5SDimitry Andric #define _mm_dp_ps(X, Y, M) \ 59781ad6265SDimitry Andric ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M))) 5980b57cec5SDimitry Andric 5990b57cec5SDimitry Andric /// Computes the dot product of the two 128-bit vectors of [2 x double] 6000b57cec5SDimitry Andric /// and returns it in the elements of the 128-bit result vector of 6010b57cec5SDimitry Andric /// [2 x double]. 6020b57cec5SDimitry Andric /// 6030b57cec5SDimitry Andric /// The immediate integer operand controls which input 6040b57cec5SDimitry Andric /// elements will contribute to the dot product, and where the final results 6050b57cec5SDimitry Andric /// are returned. 
6060b57cec5SDimitry Andric /// 6070b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 6080b57cec5SDimitry Andric /// 6090b57cec5SDimitry Andric /// \code 6100b57cec5SDimitry Andric /// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M); 6110b57cec5SDimitry Andric /// \endcode 6120b57cec5SDimitry Andric /// 6130b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction. 6140b57cec5SDimitry Andric /// 6150b57cec5SDimitry Andric /// \param X 6160b57cec5SDimitry Andric /// A 128-bit vector of [2 x double]. 6170b57cec5SDimitry Andric /// \param Y 6180b57cec5SDimitry Andric /// A 128-bit vector of [2 x double]. 6190b57cec5SDimitry Andric /// \param M 6200b57cec5SDimitry Andric /// An immediate integer operand. Mask bits [5:4] determine which elements 6210b57cec5SDimitry Andric /// of the input vectors are used, with bit [4] corresponding to the lowest 6220b57cec5SDimitry Andric /// element and bit [5] corresponding to the highest element of each of [2 x 6230b57cec5SDimitry Andric /// double] vector. If a bit is set, the corresponding elements from the two 6240b57cec5SDimitry Andric /// input vectors are used as an input for dot product; otherwise that input 6250b57cec5SDimitry Andric /// is treated as zero. Bits [1:0] determine which elements of the result 6260b57cec5SDimitry Andric /// will receive a copy of the final dot product, with bit [0] corresponding 6270b57cec5SDimitry Andric /// to the lowest element and bit [1] corresponding to the highest element of 6280b57cec5SDimitry Andric /// each [2 x double] vector. If a bit is set, the dot product is returned in 6290b57cec5SDimitry Andric /// the corresponding element; otherwise that element is set to zero. 
6300b57cec5SDimitry Andric #define _mm_dp_pd(X, Y, M) \ 63181ad6265SDimitry Andric ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ 63281ad6265SDimitry Andric (M))) 6330b57cec5SDimitry Andric 6340b57cec5SDimitry Andric /* SSE4 Streaming Load Hint Instruction. */ 6350b57cec5SDimitry Andric /// Loads integer values from a 128-bit aligned memory location to a 6360b57cec5SDimitry Andric /// 128-bit integer vector. 6370b57cec5SDimitry Andric /// 6380b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 6390b57cec5SDimitry Andric /// 6400b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. 6410b57cec5SDimitry Andric /// 6420b57cec5SDimitry Andric /// \param __V 6430b57cec5SDimitry Andric /// A pointer to a 128-bit aligned memory location that contains the integer 6440b57cec5SDimitry Andric /// values. 6450b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the data stored at the 6460b57cec5SDimitry Andric /// specified memory location. 6470b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS 6485f757f3fSDimitry Andric _mm_stream_load_si128(const void *__V) { 6490b57cec5SDimitry Andric return (__m128i)__builtin_nontemporal_load((const __v2di *)__V); 6500b57cec5SDimitry Andric } 6510b57cec5SDimitry Andric 6520b57cec5SDimitry Andric /* SSE4 Packed Integer Min/Max Instructions. */ 6530b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of 6540b57cec5SDimitry Andric /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 6550b57cec5SDimitry Andric /// of the two values. 6560b57cec5SDimitry Andric /// 6570b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 6580b57cec5SDimitry Andric /// 6590b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. 
6600b57cec5SDimitry Andric /// 6610b57cec5SDimitry Andric /// \param __V1 6620b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. 6630b57cec5SDimitry Andric /// \param __V2 6640b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8] 6650b57cec5SDimitry Andric /// \returns A 128-bit vector of [16 x i8] containing the lesser values. 66681ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1, 66781ad6265SDimitry Andric __m128i __V2) { 66804eeddc0SDimitry Andric return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2); 6690b57cec5SDimitry Andric } 6700b57cec5SDimitry Andric 6710b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of 6720b57cec5SDimitry Andric /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 6730b57cec5SDimitry Andric /// greater value of the two. 6740b57cec5SDimitry Andric /// 6750b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 6760b57cec5SDimitry Andric /// 6770b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. 6780b57cec5SDimitry Andric /// 6790b57cec5SDimitry Andric /// \param __V1 6800b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. 6810b57cec5SDimitry Andric /// \param __V2 6820b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. 6830b57cec5SDimitry Andric /// \returns A 128-bit vector of [16 x i8] containing the greater values. 68481ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1, 68581ad6265SDimitry Andric __m128i __V2) { 68604eeddc0SDimitry Andric return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2); 6870b57cec5SDimitry Andric } 6880b57cec5SDimitry Andric 6890b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of 6900b57cec5SDimitry Andric /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 6910b57cec5SDimitry Andric /// value of the two. 
6920b57cec5SDimitry Andric /// 6930b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 6940b57cec5SDimitry Andric /// 6950b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. 6960b57cec5SDimitry Andric /// 6970b57cec5SDimitry Andric /// \param __V1 6980b57cec5SDimitry Andric /// A 128-bit vector of [8 x u16]. 6990b57cec5SDimitry Andric /// \param __V2 7000b57cec5SDimitry Andric /// A 128-bit vector of [8 x u16]. 7010b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x u16] containing the lesser values. 70281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1, 70381ad6265SDimitry Andric __m128i __V2) { 70404eeddc0SDimitry Andric return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2); 7050b57cec5SDimitry Andric } 7060b57cec5SDimitry Andric 7070b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of 7080b57cec5SDimitry Andric /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 7090b57cec5SDimitry Andric /// greater value of the two. 7100b57cec5SDimitry Andric /// 7110b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 7120b57cec5SDimitry Andric /// 7130b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. 7140b57cec5SDimitry Andric /// 7150b57cec5SDimitry Andric /// \param __V1 7160b57cec5SDimitry Andric /// A 128-bit vector of [8 x u16]. 7170b57cec5SDimitry Andric /// \param __V2 7180b57cec5SDimitry Andric /// A 128-bit vector of [8 x u16]. 7190b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x u16] containing the greater values. 
72081ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1, 72181ad6265SDimitry Andric __m128i __V2) { 72204eeddc0SDimitry Andric return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2); 7230b57cec5SDimitry Andric } 7240b57cec5SDimitry Andric 7250b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of 7260b57cec5SDimitry Andric /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 7270b57cec5SDimitry Andric /// value of the two. 7280b57cec5SDimitry Andric /// 7290b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 7300b57cec5SDimitry Andric /// 7310b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. 7320b57cec5SDimitry Andric /// 7330b57cec5SDimitry Andric /// \param __V1 7340b57cec5SDimitry Andric /// A 128-bit vector of [4 x i32]. 7350b57cec5SDimitry Andric /// \param __V2 7360b57cec5SDimitry Andric /// A 128-bit vector of [4 x i32]. 7370b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the lesser values. 73881ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1, 73981ad6265SDimitry Andric __m128i __V2) { 74004eeddc0SDimitry Andric return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2); 7410b57cec5SDimitry Andric } 7420b57cec5SDimitry Andric 7430b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of 7440b57cec5SDimitry Andric /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 7450b57cec5SDimitry Andric /// greater value of the two. 7460b57cec5SDimitry Andric /// 7470b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 7480b57cec5SDimitry Andric /// 7490b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. 
7500b57cec5SDimitry Andric /// 7510b57cec5SDimitry Andric /// \param __V1 7520b57cec5SDimitry Andric /// A 128-bit vector of [4 x i32]. 7530b57cec5SDimitry Andric /// \param __V2 7540b57cec5SDimitry Andric /// A 128-bit vector of [4 x i32]. 7550b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the greater values. 75681ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1, 75781ad6265SDimitry Andric __m128i __V2) { 75804eeddc0SDimitry Andric return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2); 7590b57cec5SDimitry Andric } 7600b57cec5SDimitry Andric 7610b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of 7620b57cec5SDimitry Andric /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 7630b57cec5SDimitry Andric /// value of the two. 7640b57cec5SDimitry Andric /// 7650b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 7660b57cec5SDimitry Andric /// 7670b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. 7680b57cec5SDimitry Andric /// 7690b57cec5SDimitry Andric /// \param __V1 7700b57cec5SDimitry Andric /// A 128-bit vector of [4 x u32]. 7710b57cec5SDimitry Andric /// \param __V2 7720b57cec5SDimitry Andric /// A 128-bit vector of [4 x u32]. 7730b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x u32] containing the lesser values. 77481ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1, 77581ad6265SDimitry Andric __m128i __V2) { 77604eeddc0SDimitry Andric return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2); 7770b57cec5SDimitry Andric } 7780b57cec5SDimitry Andric 7790b57cec5SDimitry Andric /// Compares the corresponding elements of two 128-bit vectors of 7800b57cec5SDimitry Andric /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 7810b57cec5SDimitry Andric /// greater value of the two. 
7820b57cec5SDimitry Andric /// 7830b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 7840b57cec5SDimitry Andric /// 7850b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. 7860b57cec5SDimitry Andric /// 7870b57cec5SDimitry Andric /// \param __V1 7880b57cec5SDimitry Andric /// A 128-bit vector of [4 x u32]. 7890b57cec5SDimitry Andric /// \param __V2 7900b57cec5SDimitry Andric /// A 128-bit vector of [4 x u32]. 7910b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x u32] containing the greater values. 79281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1, 79381ad6265SDimitry Andric __m128i __V2) { 79404eeddc0SDimitry Andric return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2); 7950b57cec5SDimitry Andric } 7960b57cec5SDimitry Andric 7970b57cec5SDimitry Andric /* SSE4 Insertion and Extraction from XMM Register Instructions. */ 7980b57cec5SDimitry Andric /// Takes the first argument \a X and inserts an element from the second 7990b57cec5SDimitry Andric /// argument \a Y as selected by the third argument \a N. That result then 8000b57cec5SDimitry Andric /// has elements zeroed out also as selected by the third argument \a N. The 8010b57cec5SDimitry Andric /// resulting 128-bit vector of [4 x float] is then returned. 8020b57cec5SDimitry Andric /// 8030b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 8040b57cec5SDimitry Andric /// 8050b57cec5SDimitry Andric /// \code 8060b57cec5SDimitry Andric /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 8070b57cec5SDimitry Andric /// \endcode 8080b57cec5SDimitry Andric /// 8090b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. 8100b57cec5SDimitry Andric /// 8110b57cec5SDimitry Andric /// \param X 8120b57cec5SDimitry Andric /// A 128-bit vector source operand of [4 x float]. 
With the exception of 8130b57cec5SDimitry Andric /// those bits in the result copied from parameter \a Y and zeroed by bits 8140b57cec5SDimitry Andric /// [3:0] of \a N, all bits from this parameter are copied to the result. 8150b57cec5SDimitry Andric /// \param Y 8160b57cec5SDimitry Andric /// A 128-bit vector source operand of [4 x float]. One single-precision 8170b57cec5SDimitry Andric /// floating-point element from this source, as determined by the immediate 8180b57cec5SDimitry Andric /// parameter, is copied to the result. 8190b57cec5SDimitry Andric /// \param N 8200b57cec5SDimitry Andric /// Specifies which bits from operand \a Y will be copied, which bits in the 821bdd1243dSDimitry Andric /// result they will be copied to, and which bits in the result will be 8220b57cec5SDimitry Andric /// cleared. The following assignments are made: \n 8230b57cec5SDimitry Andric /// Bits [7:6] specify the bits to copy from operand \a Y: \n 8240b57cec5SDimitry Andric /// 00: Selects bits [31:0] from operand \a Y. \n 8250b57cec5SDimitry Andric /// 01: Selects bits [63:32] from operand \a Y. \n 8260b57cec5SDimitry Andric /// 10: Selects bits [95:64] from operand \a Y. \n 8270b57cec5SDimitry Andric /// 11: Selects bits [127:96] from operand \a Y. \n 8280b57cec5SDimitry Andric /// Bits [5:4] specify the bits in the result to which the selected bits 8290b57cec5SDimitry Andric /// from operand \a Y are copied: \n 8300b57cec5SDimitry Andric /// 00: Copies the selected bits from \a Y to result bits [31:0]. \n 8310b57cec5SDimitry Andric /// 01: Copies the selected bits from \a Y to result bits [63:32]. \n 8320b57cec5SDimitry Andric /// 10: Copies the selected bits from \a Y to result bits [95:64]. \n 8330b57cec5SDimitry Andric /// 11: Copies the selected bits from \a Y to result bits [127:96]. \n 8340b57cec5SDimitry Andric /// Bits[3:0]: If any of these bits are set, the corresponding result 8350b57cec5SDimitry Andric /// element is cleared. 
8360b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the copied 8370b57cec5SDimitry Andric /// single-precision floating point elements from the operands. 8380b57cec5SDimitry Andric #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) 8390b57cec5SDimitry Andric 8400b57cec5SDimitry Andric /// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and 8410b57cec5SDimitry Andric /// returns it, using the immediate value parameter \a N as a selector. 8420b57cec5SDimitry Andric /// 8430b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 8440b57cec5SDimitry Andric /// 8450b57cec5SDimitry Andric /// \code 8460b57cec5SDimitry Andric /// int _mm_extract_ps(__m128 X, const int N); 8470b57cec5SDimitry Andric /// \endcode 8480b57cec5SDimitry Andric /// 8490b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c> 8500b57cec5SDimitry Andric /// instruction. 8510b57cec5SDimitry Andric /// 8520b57cec5SDimitry Andric /// \param X 8530b57cec5SDimitry Andric /// A 128-bit vector of [4 x float]. 8540b57cec5SDimitry Andric /// \param N 8550b57cec5SDimitry Andric /// An immediate value. Bits [1:0] determines which bits from the argument 8560b57cec5SDimitry Andric /// \a X are extracted and returned: \n 8570b57cec5SDimitry Andric /// 00: Bits [31:0] of parameter \a X are returned. \n 8580b57cec5SDimitry Andric /// 01: Bits [63:32] of parameter \a X are returned. \n 8590b57cec5SDimitry Andric /// 10: Bits [95:64] of parameter \a X are returned. \n 8600b57cec5SDimitry Andric /// 11: Bits [127:96] of parameter \a X are returned. 8610b57cec5SDimitry Andric /// \returns A 32-bit integer containing the extracted 32 bits of float data. 
#define _mm_extract_ps(X, N)                                                   \
  __builtin_bit_cast(                                                          \
      int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))

/* Miscellaneous insert and extract macros.  */
/* Extract a single-precision float from X at index N into D.  */
#define _MM_EXTRACT_FLOAT(D, X, N)                                             \
  do {                                                                         \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));          \
  } while (0)

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps.  X lands in bits [7:6], Y in bits
   [5:4] and Z in the low bits.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return.  */
#define _MM_PICK_OUT_PS(X, N)                                                  \
  _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index.  */
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
///    of an integer parameter \a I into an offset specified by the immediate
///    value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
///    result and then one of the sixteen elements in the result vector is
///    replaced by the lower 8 bits of \a I.
/// \param I
///    An integer. The lower 8 bits of this operand are written to the result
///    beginning at the offset specified by \a N.
/// \param N
///    An immediate value. Bits [3:0] specify the bit offset in the result at
///    which the lower 8 bits of \a I are written. \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    0010: Bits [23:16] of the result are used for insertion. \n
///    0011: Bits [31:24] of the result are used for insertion. \n
///    0100: Bits [39:32] of the result are used for insertion. \n
///    0101: Bits [47:40] of the result are used for insertion. \n
///    0110: Bits [55:48] of the result are used for insertion. \n
///    0111: Bits [63:56] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1001: Bits [79:72] of the result are used for insertion. \n
///    1010: Bits [87:80] of the result are used for insertion. \n
///    1011: Bits [95:88] of the result are used for insertion. \n
///    1100: Bits [103:96] of the result are used for insertion. \n
///    1101: Bits [111:104] of the result are used for insertion. \n
///    1110: Bits [119:112] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N)                                               \
  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I),      \
                                         (int)(N)))

/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 32-bit
///    integer parameter \a I at the offset specified by the immediate value
///    parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
///    result and then one of the four elements in the result vector is
///    replaced by \a I.
/// \param I
///    A 32-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bits [1:0] specify the bit offset in the result at
///    which the integer \a I is written. \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N)                                              \
  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I),        \
                                        (int)(N)))

#ifdef __x86_64__
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 64-bit
///    integer parameter \a I, using the immediate value parameter \a N as an
///    insertion location selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
///    result and then one of the two elements in the result vector is replaced
///    by \a I.
/// \param I
///    A 64-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bit [0] specifies the bit offset in the result at
///    which the integer \a I is written. \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N)                                              \
  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I),  \
                                        (int)(N)))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
/// Extracts an 8-bit element from the 128-bit integer vector of
///    [16 x i8], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] specify which 8-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    0000: Bits [7:0] of the parameter \a X are extracted. \n
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
///    1111: Bits [127:120] of the parameter \a X are extracted.
/// \returns  An unsigned integer, whose lower 8 bits are selected from the
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
#define _mm_extract_epi8(X, N)                                                 \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X),     \
                                                    (int)(N)))

/// Extracts a 32-bit element from the 128-bit integer vector of
///    [4 x i32], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] specify which 32-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    00: Bits [31:0] of the parameter \a X are extracted.
\n 10490b57cec5SDimitry Andric /// 01: Bits [63:32] of the parameter \a X are extracted. \n 10500b57cec5SDimitry Andric /// 10: Bits [95:64] of the parameter \a X are extracted. \n 10510b57cec5SDimitry Andric /// 11: Bits [127:96] of the parameter \a X are extracted. 10520b57cec5SDimitry Andric /// \returns An integer, whose lower 32 bits are selected from the 128-bit 10530b57cec5SDimitry Andric /// integer vector parameter and the remaining bits are assigned zeros. 10540b57cec5SDimitry Andric #define _mm_extract_epi32(X, N) \ 1055349cc55cSDimitry Andric ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))) 10560b57cec5SDimitry Andric 10570b57cec5SDimitry Andric /// Extracts a 64-bit element from the 128-bit integer vector of 10580b57cec5SDimitry Andric /// [2 x i64], using the immediate value parameter \a N as a selector. 10590b57cec5SDimitry Andric /// 10600b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 10610b57cec5SDimitry Andric /// 10620b57cec5SDimitry Andric /// \code 10630b57cec5SDimitry Andric /// long long _mm_extract_epi64(__m128i X, const int N); 10640b57cec5SDimitry Andric /// \endcode 10650b57cec5SDimitry Andric /// 106681ad6265SDimitry Andric /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction 106781ad6265SDimitry Andric /// in 64-bit mode. 10680b57cec5SDimitry Andric /// 10690b57cec5SDimitry Andric /// \param X 10700b57cec5SDimitry Andric /// A 128-bit integer vector. 10710b57cec5SDimitry Andric /// \param N 10720b57cec5SDimitry Andric /// An immediate value. Bit [0] specifies which 64-bit vector element from 10730b57cec5SDimitry Andric /// the argument \a X to return. \n 10740b57cec5SDimitry Andric /// 0: Bits [63:0] are returned. \n 10750b57cec5SDimitry Andric /// 1: Bits [127:64] are returned. \n 10760b57cec5SDimitry Andric /// \returns A 64-bit integer.
10770b57cec5SDimitry Andric #define _mm_extract_epi64(X, N) \ 1078349cc55cSDimitry Andric ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))) 10790b57cec5SDimitry Andric 10800b57cec5SDimitry Andric /* SSE4 128-bit Packed Integer Comparisons. */ 10810b57cec5SDimitry Andric /// Tests whether the specified bits in a 128-bit integer vector are all 10820b57cec5SDimitry Andric /// zeros. 10830b57cec5SDimitry Andric /// 10840b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 10850b57cec5SDimitry Andric /// 10860b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 10870b57cec5SDimitry Andric /// 10880b57cec5SDimitry Andric /// \param __M 10890b57cec5SDimitry Andric /// A 128-bit integer vector containing the bits to be tested. 10900b57cec5SDimitry Andric /// \param __V 10910b57cec5SDimitry Andric /// A 128-bit integer vector selecting which bits to test in operand \a __M. 10920b57cec5SDimitry Andric /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 109381ad6265SDimitry Andric static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, 109481ad6265SDimitry Andric __m128i __V) { 10950b57cec5SDimitry Andric return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 10960b57cec5SDimitry Andric } 10970b57cec5SDimitry Andric 10980b57cec5SDimitry Andric /// Tests whether the specified bits in a 128-bit integer vector are all 10990b57cec5SDimitry Andric /// ones. 11000b57cec5SDimitry Andric /// 11010b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 11020b57cec5SDimitry Andric /// 11030b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 11040b57cec5SDimitry Andric /// 11050b57cec5SDimitry Andric /// \param __M 11060b57cec5SDimitry Andric /// A 128-bit integer vector containing the bits to be tested. 
11070b57cec5SDimitry Andric /// \param __V 11080b57cec5SDimitry Andric /// A 128-bit integer vector selecting which bits to test in operand \a __M. 11090b57cec5SDimitry Andric /// \returns TRUE if the specified bits are all ones; FALSE otherwise. 111081ad6265SDimitry Andric static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, 111181ad6265SDimitry Andric __m128i __V) { 11120b57cec5SDimitry Andric return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 11130b57cec5SDimitry Andric } 11140b57cec5SDimitry Andric 11150b57cec5SDimitry Andric /// Tests whether the specified bits in a 128-bit integer vector are 11160b57cec5SDimitry Andric /// neither all zeros nor all ones. 11170b57cec5SDimitry Andric /// 11180b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 11190b57cec5SDimitry Andric /// 11200b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 11210b57cec5SDimitry Andric /// 11220b57cec5SDimitry Andric /// \param __M 11230b57cec5SDimitry Andric /// A 128-bit integer vector containing the bits to be tested. 11240b57cec5SDimitry Andric /// \param __V 11250b57cec5SDimitry Andric /// A 128-bit integer vector selecting which bits to test in operand \a __M. 11260b57cec5SDimitry Andric /// \returns TRUE if the specified bits are neither all zeros nor all ones; 11270b57cec5SDimitry Andric /// FALSE otherwise. 112881ad6265SDimitry Andric static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, 112981ad6265SDimitry Andric __m128i __V) { 11300b57cec5SDimitry Andric return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 11310b57cec5SDimitry Andric } 11320b57cec5SDimitry Andric 11330b57cec5SDimitry Andric /// Tests whether the specified bits in a 128-bit integer vector are all 11340b57cec5SDimitry Andric /// ones. 
11350b57cec5SDimitry Andric /// 11360b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 11370b57cec5SDimitry Andric /// 11380b57cec5SDimitry Andric /// \code 11390b57cec5SDimitry Andric /// int _mm_test_all_ones(__m128i V); 11400b57cec5SDimitry Andric /// \endcode 11410b57cec5SDimitry Andric /// 11420b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 11430b57cec5SDimitry Andric /// 11440b57cec5SDimitry Andric /// \param V 11450b57cec5SDimitry Andric /// A 128-bit integer vector containing the bits to be tested. 11460b57cec5SDimitry Andric /// \returns TRUE if the bits specified in the operand are all set to 1; FALSE 11470b57cec5SDimitry Andric /// otherwise. 11481ac55f4cSDimitry Andric #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1)) 11490b57cec5SDimitry Andric 11500b57cec5SDimitry Andric /// Tests whether the specified bits in a 128-bit integer vector are 11510b57cec5SDimitry Andric /// neither all zeros nor all ones. 11520b57cec5SDimitry Andric /// 11530b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 11540b57cec5SDimitry Andric /// 11550b57cec5SDimitry Andric /// \code 11560b57cec5SDimitry Andric /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); 11570b57cec5SDimitry Andric /// \endcode 11580b57cec5SDimitry Andric /// 11590b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 11600b57cec5SDimitry Andric /// 11610b57cec5SDimitry Andric /// \param M 11620b57cec5SDimitry Andric /// A 128-bit integer vector containing the bits to be tested. 11630b57cec5SDimitry Andric /// \param V 11640b57cec5SDimitry Andric /// A 128-bit integer vector selecting which bits to test in operand \a M. 11650b57cec5SDimitry Andric /// \returns TRUE if the specified bits are neither all zeros nor all ones; 11660b57cec5SDimitry Andric /// FALSE otherwise. 
11670b57cec5SDimitry Andric #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) 11680b57cec5SDimitry Andric 11690b57cec5SDimitry Andric /// Tests whether the specified bits in a 128-bit integer vector are all 11700b57cec5SDimitry Andric /// zeros. 11710b57cec5SDimitry Andric /// 11720b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 11730b57cec5SDimitry Andric /// 11740b57cec5SDimitry Andric /// \code 11750b57cec5SDimitry Andric /// int _mm_test_all_zeros(__m128i M, __m128i V); 11760b57cec5SDimitry Andric /// \endcode 11770b57cec5SDimitry Andric /// 11780b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 11790b57cec5SDimitry Andric /// 11800b57cec5SDimitry Andric /// \param M 11810b57cec5SDimitry Andric /// A 128-bit integer vector containing the bits to be tested. 11820b57cec5SDimitry Andric /// \param V 11830b57cec5SDimitry Andric /// A 128-bit integer vector selecting which bits to test in operand \a M. 11840b57cec5SDimitry Andric /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 11850b57cec5SDimitry Andric #define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V)) 11860b57cec5SDimitry Andric 11870b57cec5SDimitry Andric /* SSE4 64-bit Packed Integer Comparisons. */ 11880b57cec5SDimitry Andric /// Compares each of the corresponding 64-bit values of the 128-bit 11890b57cec5SDimitry Andric /// integer vectors for equality. 11900b57cec5SDimitry Andric /// 1191*0fca6ea1SDimitry Andric /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 1192*0fca6ea1SDimitry Andric /// 11930b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 11940b57cec5SDimitry Andric /// 11950b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction. 11960b57cec5SDimitry Andric /// 11970b57cec5SDimitry Andric /// \param __V1 11980b57cec5SDimitry Andric /// A 128-bit integer vector. 
11990b57cec5SDimitry Andric /// \param __V2 12000b57cec5SDimitry Andric /// A 128-bit integer vector. 12010b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the comparison results. 120281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1, 120381ad6265SDimitry Andric __m128i __V2) { 12040b57cec5SDimitry Andric return (__m128i)((__v2di)__V1 == (__v2di)__V2); 12050b57cec5SDimitry Andric } 12060b57cec5SDimitry Andric 12070b57cec5SDimitry Andric /* SSE4 Packed Integer Sign-Extension. */ 12080b57cec5SDimitry Andric /// Sign-extends each of the lower eight 8-bit integer elements of a 12090b57cec5SDimitry Andric /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 12100b57cec5SDimitry Andric /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 12110b57cec5SDimitry Andric /// are unused. 12120b57cec5SDimitry Andric /// 12130b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 12140b57cec5SDimitry Andric /// 12150b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. 12160b57cec5SDimitry Andric /// 12170b57cec5SDimitry Andric /// \param __V 121881ad6265SDimitry Andric /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 121981ad6265SDimitry Andric /// sign-extended to 16-bit values. 12200b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 122181ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { 12220b57cec5SDimitry Andric /* This function always performs a signed extension, but __v16qi is a char 12230b57cec5SDimitry Andric which may be signed or unsigned, so use __v16qs. 
*/ 122481ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 122581ad6265SDimitry Andric __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 122681ad6265SDimitry Andric 7), 122781ad6265SDimitry Andric __v8hi); 12280b57cec5SDimitry Andric } 12290b57cec5SDimitry Andric 12300b57cec5SDimitry Andric /// Sign-extends each of the lower four 8-bit integer elements of a 12310b57cec5SDimitry Andric /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 12320b57cec5SDimitry Andric /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 12330b57cec5SDimitry Andric /// vector are unused. 12340b57cec5SDimitry Andric /// 12350b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 12360b57cec5SDimitry Andric /// 12370b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. 12380b57cec5SDimitry Andric /// 12390b57cec5SDimitry Andric /// \param __V 12400b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 12410b57cec5SDimitry Andric /// sign-extended to 32-bit values. 12420b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 124381ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) { 12440b57cec5SDimitry Andric /* This function always performs a signed extension, but __v16qi is a char 12450b57cec5SDimitry Andric which may be signed or unsigned, so use __v16qs. 
*/ 124681ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 124781ad6265SDimitry Andric __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 12480b57cec5SDimitry Andric } 12490b57cec5SDimitry Andric 12500b57cec5SDimitry Andric /// Sign-extends each of the lower two 8-bit integer elements of a 12510b57cec5SDimitry Andric /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 12520b57cec5SDimitry Andric /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 12530b57cec5SDimitry Andric /// vector are unused. 12540b57cec5SDimitry Andric /// 12550b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 12560b57cec5SDimitry Andric /// 12570b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. 12580b57cec5SDimitry Andric /// 12590b57cec5SDimitry Andric /// \param __V 12600b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 12610b57cec5SDimitry Andric /// sign-extended to 64-bit values. 12620b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 126381ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { 12640b57cec5SDimitry Andric /* This function always performs a signed extension, but __v16qi is a char 12650b57cec5SDimitry Andric which may be signed or unsigned, so use __v16qs. */ 126681ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 126781ad6265SDimitry Andric __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 12680b57cec5SDimitry Andric } 12690b57cec5SDimitry Andric 12700b57cec5SDimitry Andric /// Sign-extends each of the lower four 16-bit integer elements of a 12710b57cec5SDimitry Andric /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 12720b57cec5SDimitry Andric /// a 128-bit vector of [4 x i32]. 
The upper four elements of the input 12730b57cec5SDimitry Andric /// vector are unused. 12740b57cec5SDimitry Andric /// 12750b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 12760b57cec5SDimitry Andric /// 12770b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. 12780b57cec5SDimitry Andric /// 12790b57cec5SDimitry Andric /// \param __V 12800b57cec5SDimitry Andric /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 12810b57cec5SDimitry Andric /// sign-extended to 32-bit values. 12820b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 128381ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { 128481ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 128581ad6265SDimitry Andric __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 12860b57cec5SDimitry Andric } 12870b57cec5SDimitry Andric 12880b57cec5SDimitry Andric /// Sign-extends each of the lower two 16-bit integer elements of a 12890b57cec5SDimitry Andric /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 12900b57cec5SDimitry Andric /// a 128-bit vector of [2 x i64]. The upper six elements of the input 12910b57cec5SDimitry Andric /// vector are unused. 12920b57cec5SDimitry Andric /// 12930b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 12940b57cec5SDimitry Andric /// 12950b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. 12960b57cec5SDimitry Andric /// 12970b57cec5SDimitry Andric /// \param __V 12980b57cec5SDimitry Andric /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 12990b57cec5SDimitry Andric /// sign-extended to 64-bit values. 13000b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 
130181ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { 130281ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 130381ad6265SDimitry Andric __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 13040b57cec5SDimitry Andric } 13050b57cec5SDimitry Andric 13060b57cec5SDimitry Andric /// Sign-extends each of the lower two 32-bit integer elements of a 13070b57cec5SDimitry Andric /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 13080b57cec5SDimitry Andric /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 13090b57cec5SDimitry Andric /// are unused. 13100b57cec5SDimitry Andric /// 13110b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 13120b57cec5SDimitry Andric /// 13130b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. 13140b57cec5SDimitry Andric /// 13150b57cec5SDimitry Andric /// \param __V 13160b57cec5SDimitry Andric /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 13170b57cec5SDimitry Andric /// sign-extended to 64-bit values. 13180b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 131981ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { 132081ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 132181ad6265SDimitry Andric __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 13220b57cec5SDimitry Andric } 13230b57cec5SDimitry Andric 13240b57cec5SDimitry Andric /* SSE4 Packed Integer Zero-Extension. */ 13250b57cec5SDimitry Andric /// Zero-extends each of the lower eight 8-bit integer elements of a 13260b57cec5SDimitry Andric /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 13270b57cec5SDimitry Andric /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 13280b57cec5SDimitry Andric /// are unused. 
13290b57cec5SDimitry Andric /// 13300b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 13310b57cec5SDimitry Andric /// 13320b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. 13330b57cec5SDimitry Andric /// 13340b57cec5SDimitry Andric /// \param __V 13350b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 13360b57cec5SDimitry Andric /// zero-extended to 16-bit values. 13370b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 133881ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) { 133981ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 134081ad6265SDimitry Andric __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 134181ad6265SDimitry Andric 7), 134281ad6265SDimitry Andric __v8hi); 13430b57cec5SDimitry Andric } 13440b57cec5SDimitry Andric 13450b57cec5SDimitry Andric /// Zero-extends each of the lower four 8-bit integer elements of a 13460b57cec5SDimitry Andric /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 13470b57cec5SDimitry Andric /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 13480b57cec5SDimitry Andric /// vector are unused. 13490b57cec5SDimitry Andric /// 13500b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 13510b57cec5SDimitry Andric /// 13520b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. 13530b57cec5SDimitry Andric /// 13540b57cec5SDimitry Andric /// \param __V 13550b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 13560b57cec5SDimitry Andric /// zero-extended to 32-bit values. 13570b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 
135881ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) { 135981ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 136081ad6265SDimitry Andric __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 13610b57cec5SDimitry Andric } 13620b57cec5SDimitry Andric 13630b57cec5SDimitry Andric /// Zero-extends each of the lower two 8-bit integer elements of a 13640b57cec5SDimitry Andric /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 13650b57cec5SDimitry Andric /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 13660b57cec5SDimitry Andric /// vector are unused. 13670b57cec5SDimitry Andric /// 13680b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 13690b57cec5SDimitry Andric /// 13700b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. 13710b57cec5SDimitry Andric /// 13720b57cec5SDimitry Andric /// \param __V 13730b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 13740b57cec5SDimitry Andric /// zero-extended to 64-bit values. 13750b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 137681ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) { 137781ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 137881ad6265SDimitry Andric __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 13790b57cec5SDimitry Andric } 13800b57cec5SDimitry Andric 13810b57cec5SDimitry Andric /// Zero-extends each of the lower four 16-bit integer elements of a 13820b57cec5SDimitry Andric /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 13830b57cec5SDimitry Andric /// a 128-bit vector of [4 x i32]. The upper four elements of the input 13840b57cec5SDimitry Andric /// vector are unused. 
13850b57cec5SDimitry Andric /// 13860b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 13870b57cec5SDimitry Andric /// 13880b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. 13890b57cec5SDimitry Andric /// 13900b57cec5SDimitry Andric /// \param __V 13910b57cec5SDimitry Andric /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 13920b57cec5SDimitry Andric /// zero-extended to 32-bit values. 13930b57cec5SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 139481ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) { 139581ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 139681ad6265SDimitry Andric __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 13970b57cec5SDimitry Andric } 13980b57cec5SDimitry Andric 13990b57cec5SDimitry Andric /// Zero-extends each of the lower two 16-bit integer elements of a 14000b57cec5SDimitry Andric /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 14010b57cec5SDimitry Andric /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 14020b57cec5SDimitry Andric /// are unused. 14030b57cec5SDimitry Andric /// 14040b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 14050b57cec5SDimitry Andric /// 14060b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. 14070b57cec5SDimitry Andric /// 14080b57cec5SDimitry Andric /// \param __V 14090b57cec5SDimitry Andric /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 14100b57cec5SDimitry Andric /// zero-extended to 64-bit values. 14110b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
141281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) { 141381ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 141481ad6265SDimitry Andric __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 14150b57cec5SDimitry Andric } 14160b57cec5SDimitry Andric 14170b57cec5SDimitry Andric /// Zero-extends each of the lower two 32-bit integer elements of a 14180b57cec5SDimitry Andric /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 14190b57cec5SDimitry Andric /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 14200b57cec5SDimitry Andric /// are unused. 14210b57cec5SDimitry Andric /// 14220b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 14230b57cec5SDimitry Andric /// 14240b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. 14250b57cec5SDimitry Andric /// 14260b57cec5SDimitry Andric /// \param __V 14270b57cec5SDimitry Andric /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 14280b57cec5SDimitry Andric /// zero-extended to 64-bit values. 14290b57cec5SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 143081ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) { 143181ad6265SDimitry Andric return (__m128i) __builtin_convertvector( 143281ad6265SDimitry Andric __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 14330b57cec5SDimitry Andric } 14340b57cec5SDimitry Andric 14350b57cec5SDimitry Andric /* SSE4 Pack with Unsigned Saturation. */ 1436*0fca6ea1SDimitry Andric /// Converts, with saturation, 32-bit signed integers from both 128-bit integer 1437*0fca6ea1SDimitry Andric /// vector operands into 16-bit unsigned integers, and returns the packed 1438*0fca6ea1SDimitry Andric /// result. 
1439*0fca6ea1SDimitry Andric /// 14400b57cec5SDimitry Andric /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 14410b57cec5SDimitry Andric /// 0x0000 are saturated to 0x0000. 14420b57cec5SDimitry Andric /// 14430b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 14440b57cec5SDimitry Andric /// 14450b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. 14460b57cec5SDimitry Andric /// 14470b57cec5SDimitry Andric /// \param __V1 1448*0fca6ea1SDimitry Andric /// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are 1449*0fca6ea1SDimitry Andric /// written to the lower 64 bits of the result. 14500b57cec5SDimitry Andric /// \param __V2 1451*0fca6ea1SDimitry Andric /// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are 1452*0fca6ea1SDimitry Andric /// written to the higher 64 bits of the result. 14530b57cec5SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the converted values. 145481ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, 145581ad6265SDimitry Andric __m128i __V2) { 14560b57cec5SDimitry Andric return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 14570b57cec5SDimitry Andric } 14580b57cec5SDimitry Andric 14590b57cec5SDimitry Andric /* SSE4 Multiple Packed Sums of Absolute Difference. */ 14600b57cec5SDimitry Andric /// Subtracts 8-bit unsigned integer values and computes the absolute 14610b57cec5SDimitry Andric /// values of the differences to the corresponding bits in the destination. 14620b57cec5SDimitry Andric /// Then sums of the absolute differences are returned according to the bit 14630b57cec5SDimitry Andric /// fields in the immediate operand. 
14640b57cec5SDimitry Andric /// 14650b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 14660b57cec5SDimitry Andric /// 14670b57cec5SDimitry Andric /// \code 14680b57cec5SDimitry Andric /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); 14690b57cec5SDimitry Andric /// \endcode 14700b57cec5SDimitry Andric /// 14710b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction. 14720b57cec5SDimitry Andric /// 14730b57cec5SDimitry Andric /// \param X 14740b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. 14750b57cec5SDimitry Andric /// \param Y 14760b57cec5SDimitry Andric /// A 128-bit vector of [16 x i8]. 14770b57cec5SDimitry Andric /// \param M 14780b57cec5SDimitry Andric /// An 8-bit immediate operand specifying how the absolute differences are to 14790b57cec5SDimitry Andric /// be calculated, according to the following algorithm: 14800b57cec5SDimitry Andric /// \code 14810b57cec5SDimitry Andric /// // M2 represents bit 2 of the immediate operand 14820b57cec5SDimitry Andric /// // M10 represents bits [1:0] of the immediate operand 14830b57cec5SDimitry Andric /// i = M2 * 4; 14840b57cec5SDimitry Andric /// j = M10 * 4; 14850b57cec5SDimitry Andric /// for (k = 0; k < 8; k = k + 1) { 14860b57cec5SDimitry Andric /// d0 = abs(X[i + k + 0] - Y[j + 0]); 14870b57cec5SDimitry Andric /// d1 = abs(X[i + k + 1] - Y[j + 1]); 14880b57cec5SDimitry Andric /// d2 = abs(X[i + k + 2] - Y[j + 2]); 14890b57cec5SDimitry Andric /// d3 = abs(X[i + k + 3] - Y[j + 3]); 14900b57cec5SDimitry Andric /// r[k] = d0 + d1 + d2 + d3; 14910b57cec5SDimitry Andric /// } 14920b57cec5SDimitry Andric /// \endcode 14930b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the sums of the sets of 14940b57cec5SDimitry Andric /// absolute differences between both operands. 
14950b57cec5SDimitry Andric #define _mm_mpsadbw_epu8(X, Y, M) \ 1496349cc55cSDimitry Andric ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ 1497349cc55cSDimitry Andric (__v16qi)(__m128i)(Y), (M))) 14980b57cec5SDimitry Andric 14990b57cec5SDimitry Andric /// Finds the minimum unsigned 16-bit element in the input 128-bit 15000b57cec5SDimitry Andric /// vector of [8 x u16] and returns it along with its index. 15010b57cec5SDimitry Andric /// 15020b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 15030b57cec5SDimitry Andric /// 15040b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> 15050b57cec5SDimitry Andric /// instruction. 15060b57cec5SDimitry Andric /// 15070b57cec5SDimitry Andric /// \param __V 15080b57cec5SDimitry Andric /// A 128-bit vector of [8 x u16]. 15090b57cec5SDimitry Andric /// \returns A 128-bit value where bits [15:0] contain the minimum value found 15100b57cec5SDimitry Andric /// in parameter \a __V, bits [18:16] contain the index of the minimum value 15110b57cec5SDimitry Andric /// and the remaining bits are set to 0. 151281ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) { 15130b57cec5SDimitry Andric return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V); 15140b57cec5SDimitry Andric } 15150b57cec5SDimitry Andric 15160b57cec5SDimitry Andric /* Handle the sse4.2 definitions here. */ 15170b57cec5SDimitry Andric 15180b57cec5SDimitry Andric /* These definitions are normally in nmmintrin.h, but gcc puts them in here 15190b57cec5SDimitry Andric so we'll do the same. */ 15200b57cec5SDimitry Andric 15210b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS 152281ad6265SDimitry Andric #define __DEFAULT_FN_ATTRS \ 152381ad6265SDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 15240b57cec5SDimitry Andric 15250b57cec5SDimitry Andric /* These specify the type of data that we're comparing.
*/ 15260b57cec5SDimitry Andric #define _SIDD_UBYTE_OPS 0x00 15270b57cec5SDimitry Andric #define _SIDD_UWORD_OPS 0x01 15280b57cec5SDimitry Andric #define _SIDD_SBYTE_OPS 0x02 15290b57cec5SDimitry Andric #define _SIDD_SWORD_OPS 0x03 15300b57cec5SDimitry Andric 15310b57cec5SDimitry Andric /* These specify the type of comparison operation. */ 15320b57cec5SDimitry Andric #define _SIDD_CMP_EQUAL_ANY 0x00 15330b57cec5SDimitry Andric #define _SIDD_CMP_RANGES 0x04 15340b57cec5SDimitry Andric #define _SIDD_CMP_EQUAL_EACH 0x08 15350b57cec5SDimitry Andric #define _SIDD_CMP_EQUAL_ORDERED 0x0c 15360b57cec5SDimitry Andric 15370b57cec5SDimitry Andric /* These macros specify the polarity of the operation. */ 15380b57cec5SDimitry Andric #define _SIDD_POSITIVE_POLARITY 0x00 15390b57cec5SDimitry Andric #define _SIDD_NEGATIVE_POLARITY 0x10 15400b57cec5SDimitry Andric #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 15410b57cec5SDimitry Andric #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 15420b57cec5SDimitry Andric 15430b57cec5SDimitry Andric /* These macros are used in _mm_cmpXstri() to specify the return. */ 15440b57cec5SDimitry Andric #define _SIDD_LEAST_SIGNIFICANT 0x00 15450b57cec5SDimitry Andric #define _SIDD_MOST_SIGNIFICANT 0x40 15460b57cec5SDimitry Andric 15470b57cec5SDimitry Andric /* These macros are used in _mm_cmpXstrm() to specify the return. */ 15480b57cec5SDimitry Andric #define _SIDD_BIT_MASK 0x00 15490b57cec5SDimitry Andric #define _SIDD_UNIT_MASK 0x40 15500b57cec5SDimitry Andric 15510b57cec5SDimitry Andric /* SSE4.2 Packed Comparison Intrinsics. */ 15520b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string 15530b57cec5SDimitry Andric /// data with implicitly defined lengths that is contained in source operands 15540b57cec5SDimitry Andric /// \a A and \a B. Returns a 128-bit integer vector representing the result 15550b57cec5SDimitry Andric /// mask of the comparison.
15560b57cec5SDimitry Andric /// 15570b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 15580b57cec5SDimitry Andric /// 15590b57cec5SDimitry Andric /// \code 15600b57cec5SDimitry Andric /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 15610b57cec5SDimitry Andric /// \endcode 15620b57cec5SDimitry Andric /// 15630b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> 15640b57cec5SDimitry Andric /// instruction. 15650b57cec5SDimitry Andric /// 15660b57cec5SDimitry Andric /// \param A 15670b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 15680b57cec5SDimitry Andric /// compared. 15690b57cec5SDimitry Andric /// \param B 15700b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 15710b57cec5SDimitry Andric /// compared. 15720b57cec5SDimitry Andric /// \param M 15730b57cec5SDimitry Andric /// An 8-bit immediate operand specifying whether the characters are bytes or 15740b57cec5SDimitry Andric /// words, the type of comparison to perform, and the format of the return 15750b57cec5SDimitry Andric /// value. \n 15760b57cec5SDimitry Andric /// Bits [1:0]: Determine source data format. \n 15770b57cec5SDimitry Andric /// 00: 16 unsigned bytes \n 15780b57cec5SDimitry Andric /// 01: 8 unsigned words \n 15790b57cec5SDimitry Andric /// 10: 16 signed bytes \n 15800b57cec5SDimitry Andric /// 11: 8 signed words \n 15810b57cec5SDimitry Andric /// Bits [3:2]: Determine comparison type and aggregation method. \n 15820b57cec5SDimitry Andric /// 00: Subset: Each character in \a B is compared for equality with all 15830b57cec5SDimitry Andric /// the characters in \a A. \n 15840b57cec5SDimitry Andric /// 01: Ranges: Each character in \a B is compared to \a A. 
The comparison 15850b57cec5SDimitry Andric /// basis is greater than or equal for even-indexed elements in \a A, 15860b57cec5SDimitry Andric /// and less than or equal for odd-indexed elements in \a A. \n 15870b57cec5SDimitry Andric /// 10: Match: Compare each pair of corresponding characters in \a A and 15880b57cec5SDimitry Andric /// \a B for equality. \n 15890b57cec5SDimitry Andric /// 11: Substring: Search \a B for substring matches of \a A. \n 15900b57cec5SDimitry Andric /// Bits [5:4]: Determine whether to perform a one's complement on the bit 15910b57cec5SDimitry Andric /// mask of the comparison results. \n 15920b57cec5SDimitry Andric /// 00: No effect. \n 15930b57cec5SDimitry Andric /// 01: Negate the bit mask. \n 15940b57cec5SDimitry Andric /// 10: No effect. \n 15950b57cec5SDimitry Andric /// 11: Negate the bit mask only for bits with an index less than or equal 15960b57cec5SDimitry Andric /// to the size of \a A or \a B. \n 15970b57cec5SDimitry Andric /// Bit [6]: Determines whether the result is zero-extended or expanded to 16 15980b57cec5SDimitry Andric /// bytes. \n 15990b57cec5SDimitry Andric /// 0: The result is zero-extended to 16 bytes. \n 16000b57cec5SDimitry Andric /// 1: The result is expanded to 16 bytes (this expansion is performed by 16010b57cec5SDimitry Andric /// repeating each bit 8 or 16 times). 16020b57cec5SDimitry Andric /// \returns Returns a 128-bit integer vector representing the result mask of 16030b57cec5SDimitry Andric /// the comparison. 16040b57cec5SDimitry Andric #define _mm_cmpistrm(A, B, M) \ 1605349cc55cSDimitry Andric ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ 1606349cc55cSDimitry Andric (__v16qi)(__m128i)(B), (int)(M))) 16070b57cec5SDimitry Andric 16080b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string 16090b57cec5SDimitry Andric /// data with implicitly defined lengths that is contained in source operands 16100b57cec5SDimitry Andric /// \a A and \a B. 
Returns an integer representing the result index of the 16110b57cec5SDimitry Andric /// comparison. 16120b57cec5SDimitry Andric /// 16130b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 16140b57cec5SDimitry Andric /// 16150b57cec5SDimitry Andric /// \code 16160b57cec5SDimitry Andric /// int _mm_cmpistri(__m128i A, __m128i B, const int M); 16170b57cec5SDimitry Andric /// \endcode 16180b57cec5SDimitry Andric /// 16190b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 16200b57cec5SDimitry Andric /// instruction. 16210b57cec5SDimitry Andric /// 16220b57cec5SDimitry Andric /// \param A 16230b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 16240b57cec5SDimitry Andric /// compared. 16250b57cec5SDimitry Andric /// \param B 16260b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 16270b57cec5SDimitry Andric /// compared. 16280b57cec5SDimitry Andric /// \param M 16290b57cec5SDimitry Andric /// An 8-bit immediate operand specifying whether the characters are bytes or 16300b57cec5SDimitry Andric /// words, the type of comparison to perform, and the format of the return 16310b57cec5SDimitry Andric /// value. \n 16320b57cec5SDimitry Andric /// Bits [1:0]: Determine source data format. \n 16330b57cec5SDimitry Andric /// 00: 16 unsigned bytes \n 16340b57cec5SDimitry Andric /// 01: 8 unsigned words \n 16350b57cec5SDimitry Andric /// 10: 16 signed bytes \n 16360b57cec5SDimitry Andric /// 11: 8 signed words \n 16370b57cec5SDimitry Andric /// Bits [3:2]: Determine comparison type and aggregation method. \n 16380b57cec5SDimitry Andric /// 00: Subset: Each character in \a B is compared for equality with all 16390b57cec5SDimitry Andric /// the characters in \a A. \n 16400b57cec5SDimitry Andric /// 01: Ranges: Each character in \a B is compared to \a A. 
The comparison 16410b57cec5SDimitry Andric /// basis is greater than or equal for even-indexed elements in \a A, 16420b57cec5SDimitry Andric /// and less than or equal for odd-indexed elements in \a A. \n 16430b57cec5SDimitry Andric /// 10: Match: Compare each pair of corresponding characters in \a A and 16440b57cec5SDimitry Andric /// \a B for equality. \n 16450b57cec5SDimitry Andric /// 11: Substring: Search \a B for substring matches of \a A. \n 16460b57cec5SDimitry Andric /// Bits [5:4]: Determine whether to perform a one's complement on the bit 16470b57cec5SDimitry Andric /// mask of the comparison results. \n 16480b57cec5SDimitry Andric /// 00: No effect. \n 16490b57cec5SDimitry Andric /// 01: Negate the bit mask. \n 16500b57cec5SDimitry Andric /// 10: No effect. \n 16510b57cec5SDimitry Andric /// 11: Negate the bit mask only for bits with an index less than or equal 16520b57cec5SDimitry Andric /// to the size of \a A or \a B. \n 16530b57cec5SDimitry Andric /// Bit [6]: Determines whether the index of the lowest set bit or the 16540b57cec5SDimitry Andric /// highest set bit is returned. \n 16550b57cec5SDimitry Andric /// 0: The index of the least significant set bit. \n 16560b57cec5SDimitry Andric /// 1: The index of the most significant set bit. \n 16570b57cec5SDimitry Andric /// \returns Returns an integer representing the result index of the comparison. 16580b57cec5SDimitry Andric #define _mm_cmpistri(A, B, M) \ 1659349cc55cSDimitry Andric ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ 1660349cc55cSDimitry Andric (__v16qi)(__m128i)(B), (int)(M))) 16610b57cec5SDimitry Andric 16620b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string 16630b57cec5SDimitry Andric /// data with explicitly defined lengths that is contained in source operands 16640b57cec5SDimitry Andric /// \a A and \a B. Returns a 128-bit integer vector representing the result 16650b57cec5SDimitry Andric /// mask of the comparison.
16660b57cec5SDimitry Andric /// 16670b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 16680b57cec5SDimitry Andric /// 16690b57cec5SDimitry Andric /// \code 16700b57cec5SDimitry Andric /// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M); 16710b57cec5SDimitry Andric /// \endcode 16720b57cec5SDimitry Andric /// 16730b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c> 16740b57cec5SDimitry Andric /// instruction. 16750b57cec5SDimitry Andric /// 16760b57cec5SDimitry Andric /// \param A 16770b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 16780b57cec5SDimitry Andric /// compared. 16790b57cec5SDimitry Andric /// \param LA 16800b57cec5SDimitry Andric /// An integer that specifies the length of the string in \a A. 16810b57cec5SDimitry Andric /// \param B 16820b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 16830b57cec5SDimitry Andric /// compared. 16840b57cec5SDimitry Andric /// \param LB 16850b57cec5SDimitry Andric /// An integer that specifies the length of the string in \a B. 16860b57cec5SDimitry Andric /// \param M 16870b57cec5SDimitry Andric /// An 8-bit immediate operand specifying whether the characters are bytes or 16880b57cec5SDimitry Andric /// words, the type of comparison to perform, and the format of the return 16890b57cec5SDimitry Andric /// value. \n 16900b57cec5SDimitry Andric /// Bits [1:0]: Determine source data format. \n 16910b57cec5SDimitry Andric /// 00: 16 unsigned bytes \n 16920b57cec5SDimitry Andric /// 01: 8 unsigned words \n 16930b57cec5SDimitry Andric /// 10: 16 signed bytes \n 16940b57cec5SDimitry Andric /// 11: 8 signed words \n 16950b57cec5SDimitry Andric /// Bits [3:2]: Determine comparison type and aggregation method. \n 16960b57cec5SDimitry Andric /// 00: Subset: Each character in \a B is compared for equality with all 16970b57cec5SDimitry Andric /// the characters in \a A. 
\n 16980b57cec5SDimitry Andric /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 16990b57cec5SDimitry Andric /// basis is greater than or equal for even-indexed elements in \a A, 17000b57cec5SDimitry Andric /// and less than or equal for odd-indexed elements in \a A. \n 17010b57cec5SDimitry Andric /// 10: Match: Compare each pair of corresponding characters in \a A and 17020b57cec5SDimitry Andric /// \a B for equality. \n 17030b57cec5SDimitry Andric /// 11: Substring: Search \a B for substring matches of \a A. \n 17040b57cec5SDimitry Andric /// Bits [5:4]: Determine whether to perform a one's complement on the bit 17050b57cec5SDimitry Andric /// mask of the comparison results. \n 17060b57cec5SDimitry Andric /// 00: No effect. \n 17070b57cec5SDimitry Andric /// 01: Negate the bit mask. \n 17080b57cec5SDimitry Andric /// 10: No effect. \n 17090b57cec5SDimitry Andric /// 11: Negate the bit mask only for bits with an index less than or equal 17100b57cec5SDimitry Andric /// to the size of \a A or \a B. \n 17110b57cec5SDimitry Andric /// Bit [6]: Determines whether the result is zero-extended or expanded to 16 17120b57cec5SDimitry Andric /// bytes. \n 17130b57cec5SDimitry Andric /// 0: The result is zero-extended to 16 bytes. \n 17140b57cec5SDimitry Andric /// 1: The result is expanded to 16 bytes (this expansion is performed by 17150b57cec5SDimitry Andric /// repeating each bit 8 or 16 times). \n 17160b57cec5SDimitry Andric /// \returns Returns a 128-bit integer vector representing the result mask of 17170b57cec5SDimitry Andric /// the comparison. 
17180b57cec5SDimitry Andric #define _mm_cmpestrm(A, LA, B, LB, M) \ 1719349cc55cSDimitry Andric ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \ 17200b57cec5SDimitry Andric (__v16qi)(__m128i)(B), (int)(LB), \ 1721349cc55cSDimitry Andric (int)(M))) 17220b57cec5SDimitry Andric 17230b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string 17240b57cec5SDimitry Andric /// data with explicitly defined lengths that is contained in source operands 17250b57cec5SDimitry Andric /// \a A and \a B. Returns an integer representing the result index of the 17260b57cec5SDimitry Andric /// comparison. 17270b57cec5SDimitry Andric /// 17280b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 17290b57cec5SDimitry Andric /// 17300b57cec5SDimitry Andric /// \code 17310b57cec5SDimitry Andric /// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M); 17320b57cec5SDimitry Andric /// \endcode 17330b57cec5SDimitry Andric /// 17340b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 17350b57cec5SDimitry Andric /// instruction. 17360b57cec5SDimitry Andric /// 17370b57cec5SDimitry Andric /// \param A 17380b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 17390b57cec5SDimitry Andric /// compared. 17400b57cec5SDimitry Andric /// \param LA 17410b57cec5SDimitry Andric /// An integer that specifies the length of the string in \a A. 17420b57cec5SDimitry Andric /// \param B 17430b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 17440b57cec5SDimitry Andric /// compared. 17450b57cec5SDimitry Andric /// \param LB 17460b57cec5SDimitry Andric /// An integer that specifies the length of the string in \a B. 
17470b57cec5SDimitry Andric /// \param M 17480b57cec5SDimitry Andric /// An 8-bit immediate operand specifying whether the characters are bytes or 17490b57cec5SDimitry Andric /// words, the type of comparison to perform, and the format of the return 17500b57cec5SDimitry Andric /// value. \n 17510b57cec5SDimitry Andric /// Bits [1:0]: Determine source data format. \n 17520b57cec5SDimitry Andric /// 00: 16 unsigned bytes \n 17530b57cec5SDimitry Andric /// 01: 8 unsigned words \n 17540b57cec5SDimitry Andric /// 10: 16 signed bytes \n 17550b57cec5SDimitry Andric /// 11: 8 signed words \n 17560b57cec5SDimitry Andric /// Bits [3:2]: Determine comparison type and aggregation method. \n 17570b57cec5SDimitry Andric /// 00: Subset: Each character in \a B is compared for equality with all 17580b57cec5SDimitry Andric /// the characters in \a A. \n 17590b57cec5SDimitry Andric /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 17600b57cec5SDimitry Andric /// basis is greater than or equal for even-indexed elements in \a A, 17610b57cec5SDimitry Andric /// and less than or equal for odd-indexed elements in \a A. \n 17620b57cec5SDimitry Andric /// 10: Match: Compare each pair of corresponding characters in \a A and 17630b57cec5SDimitry Andric /// \a B for equality. \n 17640b57cec5SDimitry Andric /// 11: Substring: Search \a B for substring matches of \a A. \n 17650b57cec5SDimitry Andric /// Bits [5:4]: Determine whether to perform a one's complement on the bit 17660b57cec5SDimitry Andric /// mask of the comparison results. \n 17670b57cec5SDimitry Andric /// 00: No effect. \n 17680b57cec5SDimitry Andric /// 01: Negate the bit mask. \n 17690b57cec5SDimitry Andric /// 10: No effect. \n 17700b57cec5SDimitry Andric /// 11: Negate the bit mask only for bits with an index less than or equal 17710b57cec5SDimitry Andric /// to the size of \a A or \a B.
\n 17720b57cec5SDimitry Andric /// Bit [6]: Determines whether the index of the lowest set bit or the 17730b57cec5SDimitry Andric /// highest set bit is returned. \n 17740b57cec5SDimitry Andric /// 0: The index of the least significant set bit. \n 17750b57cec5SDimitry Andric /// 1: The index of the most significant set bit. \n 17760b57cec5SDimitry Andric /// \returns Returns an integer representing the result index of the comparison. 17770b57cec5SDimitry Andric #define _mm_cmpestri(A, LA, B, LB, M) \ 1778349cc55cSDimitry Andric ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \ 17790b57cec5SDimitry Andric (__v16qi)(__m128i)(B), (int)(LB), \ 1780349cc55cSDimitry Andric (int)(M))) 17810b57cec5SDimitry Andric 17820b57cec5SDimitry Andric /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ 17830b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string 17840b57cec5SDimitry Andric /// data with implicitly defined lengths that is contained in source operands 17850b57cec5SDimitry Andric /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the 17860b57cec5SDimitry Andric /// string in \a B is the maximum, otherwise, returns 0. 17870b57cec5SDimitry Andric /// 17880b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 17890b57cec5SDimitry Andric /// 17900b57cec5SDimitry Andric /// \code 17910b57cec5SDimitry Andric /// int _mm_cmpistra(__m128i A, __m128i B, const int M); 17920b57cec5SDimitry Andric /// \endcode 17930b57cec5SDimitry Andric /// 17940b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 17950b57cec5SDimitry Andric /// instruction. 17960b57cec5SDimitry Andric /// 17970b57cec5SDimitry Andric /// \param A 17980b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 17990b57cec5SDimitry Andric /// compared. 
18000b57cec5SDimitry Andric /// \param B 18010b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 18020b57cec5SDimitry Andric /// compared. 18030b57cec5SDimitry Andric /// \param M 18040b57cec5SDimitry Andric /// An 8-bit immediate operand specifying whether the characters are bytes or 18050b57cec5SDimitry Andric /// words and the type of comparison to perform. \n 18060b57cec5SDimitry Andric /// Bits [1:0]: Determine source data format. \n 18070b57cec5SDimitry Andric /// 00: 16 unsigned bytes \n 18080b57cec5SDimitry Andric /// 01: 8 unsigned words \n 18090b57cec5SDimitry Andric /// 10: 16 signed bytes \n 18100b57cec5SDimitry Andric /// 11: 8 signed words \n 18110b57cec5SDimitry Andric /// Bits [3:2]: Determine comparison type and aggregation method. \n 18120b57cec5SDimitry Andric /// 00: Subset: Each character in \a B is compared for equality with all 18130b57cec5SDimitry Andric /// the characters in \a A. \n 18140b57cec5SDimitry Andric /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 18150b57cec5SDimitry Andric /// basis is greater than or equal for even-indexed elements in \a A, 18160b57cec5SDimitry Andric /// and less than or equal for odd-indexed elements in \a A. \n 18170b57cec5SDimitry Andric /// 10: Match: Compare each pair of corresponding characters in \a A and 18180b57cec5SDimitry Andric /// \a B for equality. \n 18190b57cec5SDimitry Andric /// 11: Substring: Search \a B for substring matches of \a A. \n 18200b57cec5SDimitry Andric /// Bits [5:4]: Determine whether to perform a one's complement on the bit 18210b57cec5SDimitry Andric /// mask of the comparison results. \n 18220b57cec5SDimitry Andric /// 00: No effect. \n 18230b57cec5SDimitry Andric /// 01: Negate the bit mask. \n 18240b57cec5SDimitry Andric /// 10: No effect. 
\n 18250b57cec5SDimitry Andric /// 11: Negate the bit mask only for bits with an index less than or equal 18260b57cec5SDimitry Andric /// to the size of \a A or \a B. \n 18270b57cec5SDimitry Andric /// \returns Returns 1 if the bit mask is zero and the length of the string in 18280b57cec5SDimitry Andric /// \a B is the maximum; otherwise, returns 0. 18290b57cec5SDimitry Andric #define _mm_cmpistra(A, B, M) \ 1830349cc55cSDimitry Andric ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ 1831349cc55cSDimitry Andric (__v16qi)(__m128i)(B), (int)(M))) 18320b57cec5SDimitry Andric 18330b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string 18340b57cec5SDimitry Andric /// data with implicitly defined lengths that is contained in source operands 18350b57cec5SDimitry Andric /// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns 18360b57cec5SDimitry Andric /// 0. 18370b57cec5SDimitry Andric /// 18380b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 18390b57cec5SDimitry Andric /// 18400b57cec5SDimitry Andric /// \code 18410b57cec5SDimitry Andric /// int _mm_cmpistrc(__m128i A, __m128i B, const int M); 18420b57cec5SDimitry Andric /// \endcode 18430b57cec5SDimitry Andric /// 18440b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 18450b57cec5SDimitry Andric /// instruction. 18460b57cec5SDimitry Andric /// 18470b57cec5SDimitry Andric /// \param A 18480b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 18490b57cec5SDimitry Andric /// compared. 18500b57cec5SDimitry Andric /// \param B 18510b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 18520b57cec5SDimitry Andric /// compared. 
18530b57cec5SDimitry Andric /// \param M 18540b57cec5SDimitry Andric /// An 8-bit immediate operand specifying whether the characters are bytes or 18550b57cec5SDimitry Andric /// words and the type of comparison to perform. \n 18560b57cec5SDimitry Andric /// Bits [1:0]: Determine source data format. \n 18570b57cec5SDimitry Andric /// 00: 16 unsigned bytes \n 18580b57cec5SDimitry Andric /// 01: 8 unsigned words \n 18590b57cec5SDimitry Andric /// 10: 16 signed bytes \n 18600b57cec5SDimitry Andric /// 11: 8 signed words \n 18610b57cec5SDimitry Andric /// Bits [3:2]: Determine comparison type and aggregation method. \n 18620b57cec5SDimitry Andric /// 00: Subset: Each character in \a B is compared for equality with all 18630b57cec5SDimitry Andric /// the characters in \a A. \n 18640b57cec5SDimitry Andric /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 18650b57cec5SDimitry Andric /// basis is greater than or equal for even-indexed elements in \a A, 18660b57cec5SDimitry Andric /// and less than or equal for odd-indexed elements in \a A. \n 18670b57cec5SDimitry Andric /// 10: Match: Compare each pair of corresponding characters in \a A and 18680b57cec5SDimitry Andric /// \a B for equality. \n 18690b57cec5SDimitry Andric /// 11: Substring: Search \a B for substring matches of \a A. \n 18700b57cec5SDimitry Andric /// Bits [5:4]: Determine whether to perform a one's complement on the bit 18710b57cec5SDimitry Andric /// mask of the comparison results. \n 18720b57cec5SDimitry Andric /// 00: No effect. \n 18730b57cec5SDimitry Andric /// 01: Negate the bit mask. \n 18740b57cec5SDimitry Andric /// 10: No effect. \n 18750b57cec5SDimitry Andric /// 11: Negate the bit mask only for bits with an index less than or equal 18760b57cec5SDimitry Andric /// to the size of \a A or \a B. 18770b57cec5SDimitry Andric /// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
18780b57cec5SDimitry Andric #define _mm_cmpistrc(A, B, M) \ 1879349cc55cSDimitry Andric ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ 1880349cc55cSDimitry Andric (__v16qi)(__m128i)(B), (int)(M))) 18810b57cec5SDimitry Andric 18820b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string 18830b57cec5SDimitry Andric /// data with implicitly defined lengths that is contained in source operands 18840b57cec5SDimitry Andric /// \a A and \a B. Returns bit 0 of the resulting bit mask. 18850b57cec5SDimitry Andric /// 18860b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 18870b57cec5SDimitry Andric /// 18880b57cec5SDimitry Andric /// \code 18890b57cec5SDimitry Andric /// int _mm_cmpistro(__m128i A, __m128i B, const int M); 18900b57cec5SDimitry Andric /// \endcode 18910b57cec5SDimitry Andric /// 18920b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 18930b57cec5SDimitry Andric /// instruction. 18940b57cec5SDimitry Andric /// 18950b57cec5SDimitry Andric /// \param A 18960b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 18970b57cec5SDimitry Andric /// compared. 18980b57cec5SDimitry Andric /// \param B 18990b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 19000b57cec5SDimitry Andric /// compared. 19010b57cec5SDimitry Andric /// \param M 19020b57cec5SDimitry Andric /// An 8-bit immediate operand specifying whether the characters are bytes or 19030b57cec5SDimitry Andric /// words and the type of comparison to perform. \n 19040b57cec5SDimitry Andric /// Bits [1:0]: Determine source data format. 
\n 19050b57cec5SDimitry Andric /// 00: 16 unsigned bytes \n 19060b57cec5SDimitry Andric /// 01: 8 unsigned words \n 19070b57cec5SDimitry Andric /// 10: 16 signed bytes \n 19080b57cec5SDimitry Andric /// 11: 8 signed words \n 19090b57cec5SDimitry Andric /// Bits [3:2]: Determine comparison type and aggregation method. \n 19100b57cec5SDimitry Andric /// 00: Subset: Each character in \a B is compared for equality with all 19110b57cec5SDimitry Andric /// the characters in \a A. \n 19120b57cec5SDimitry Andric /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 19130b57cec5SDimitry Andric /// basis is greater than or equal for even-indexed elements in \a A, 19140b57cec5SDimitry Andric /// and less than or equal for odd-indexed elements in \a A. \n 19150b57cec5SDimitry Andric /// 10: Match: Compare each pair of corresponding characters in \a A and 19160b57cec5SDimitry Andric /// \a B for equality. \n 19170b57cec5SDimitry Andric /// 11: Substring: Search \a B for substring matches of \a A. \n 19180b57cec5SDimitry Andric /// Bits [5:4]: Determine whether to perform a one's complement on the bit 19190b57cec5SDimitry Andric /// mask of the comparison results. \n 19200b57cec5SDimitry Andric /// 00: No effect. \n 19210b57cec5SDimitry Andric /// 01: Negate the bit mask. \n 19220b57cec5SDimitry Andric /// 10: No effect. \n 19230b57cec5SDimitry Andric /// 11: Negate the bit mask only for bits with an index less than or equal 19240b57cec5SDimitry Andric /// to the size of \a A or \a B. \n 19250b57cec5SDimitry Andric /// \returns Returns bit 0 of the resulting bit mask.
19260b57cec5SDimitry Andric #define _mm_cmpistro(A, B, M) \ 1927349cc55cSDimitry Andric ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ 1928349cc55cSDimitry Andric (__v16qi)(__m128i)(B), (int)(M))) 19290b57cec5SDimitry Andric 19300b57cec5SDimitry Andric /// Uses the immediate operand \a M to perform a comparison of string 19310b57cec5SDimitry Andric /// data with implicitly defined lengths that is contained in source operands 19320b57cec5SDimitry Andric /// \a A and \a B. Returns 1 if the length of the string in \a A is less than 19330b57cec5SDimitry Andric /// the maximum, otherwise, returns 0. 19340b57cec5SDimitry Andric /// 19350b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 19360b57cec5SDimitry Andric /// 19370b57cec5SDimitry Andric /// \code 19380b57cec5SDimitry Andric /// int _mm_cmpistrs(__m128i A, __m128i B, const int M); 19390b57cec5SDimitry Andric /// \endcode 19400b57cec5SDimitry Andric /// 19410b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 19420b57cec5SDimitry Andric /// instruction. 19430b57cec5SDimitry Andric /// 19440b57cec5SDimitry Andric /// \param A 19450b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 19460b57cec5SDimitry Andric /// compared. 19470b57cec5SDimitry Andric /// \param B 19480b57cec5SDimitry Andric /// A 128-bit integer vector containing one of the source operands to be 19490b57cec5SDimitry Andric /// compared. 19500b57cec5SDimitry Andric /// \param M 19510b57cec5SDimitry Andric /// An 8-bit immediate operand specifying whether the characters are bytes or 19520b57cec5SDimitry Andric /// words and the type of comparison to perform. \n 19530b57cec5SDimitry Andric /// Bits [1:0]: Determine source data format. 
\n 19540b57cec5SDimitry Andric /// 00: 16 unsigned bytes \n 19550b57cec5SDimitry Andric /// 01: 8 unsigned words \n 19560b57cec5SDimitry Andric /// 10: 16 signed bytes \n 19570b57cec5SDimitry Andric /// 11: 8 signed words \n 19580b57cec5SDimitry Andric /// Bits [3:2]: Determine comparison type and aggregation method. \n 19590b57cec5SDimitry Andric /// 00: Subset: Each character in \a B is compared for equality with all 19600b57cec5SDimitry Andric /// the characters in \a A. \n 19610b57cec5SDimitry Andric /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 19620b57cec5SDimitry Andric /// basis is greater than or equal for even-indexed elements in \a A, 19630b57cec5SDimitry Andric /// and less than or equal for odd-indexed elements in \a A. \n 19640b57cec5SDimitry Andric /// 10: Match: Compare each pair of corresponding characters in \a A and 19650b57cec5SDimitry Andric /// \a B for equality. \n 19660b57cec5SDimitry Andric /// 11: Substring: Search \a B for substring matches of \a A. \n 19670b57cec5SDimitry Andric /// Bits [5:4]: Determine whether to perform a one's complement on the bit 19680b57cec5SDimitry Andric /// mask of the comparison results. \n 19690b57cec5SDimitry Andric /// 00: No effect. \n 19700b57cec5SDimitry Andric /// 01: Negate the bit mask. \n 19710b57cec5SDimitry Andric /// 10: No effect. \n 19720b57cec5SDimitry Andric /// 11: Negate the bit mask only for bits with an index less than or equal 19730b57cec5SDimitry Andric /// to the size of \a A or \a B. \n 19740b57cec5SDimitry Andric /// \returns Returns 1 if the length of the string in \a A is less than the 19750b57cec5SDimitry Andric /// maximum, otherwise, returns 0. 
#define _mm_cmpistrs(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
22960b57cec5SDimitry Andric #define _mm_cmpestrz(A, LA, B, LB, M) \ 2297349cc55cSDimitry Andric ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ 22980b57cec5SDimitry Andric (__v16qi)(__m128i)(B), (int)(LB), \ 2299349cc55cSDimitry Andric (int)(M))) 23000b57cec5SDimitry Andric 23010b57cec5SDimitry Andric /* SSE4.2 Compare Packed Data -- Greater Than. */ 23020b57cec5SDimitry Andric /// Compares each of the corresponding 64-bit values of the 128-bit 23030b57cec5SDimitry Andric /// integer vectors to determine if the values in the first operand are 23040b57cec5SDimitry Andric /// greater than those in the second operand. 23050b57cec5SDimitry Andric /// 2306*0fca6ea1SDimitry Andric /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 2307*0fca6ea1SDimitry Andric /// 23080b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 23090b57cec5SDimitry Andric /// 23100b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction. 23110b57cec5SDimitry Andric /// 23120b57cec5SDimitry Andric /// \param __V1 23130b57cec5SDimitry Andric /// A 128-bit integer vector. 23140b57cec5SDimitry Andric /// \param __V2 23150b57cec5SDimitry Andric /// A 128-bit integer vector. 23160b57cec5SDimitry Andric /// \returns A 128-bit integer vector containing the comparison results. 231781ad6265SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1, 231881ad6265SDimitry Andric __m128i __V2) { 23190b57cec5SDimitry Andric return (__m128i)((__v2di)__V1 > (__v2di)__V2); 23200b57cec5SDimitry Andric } 23210b57cec5SDimitry Andric 23220b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS 23230b57cec5SDimitry Andric 23240b57cec5SDimitry Andric #include <popcntintrin.h> 23250b57cec5SDimitry Andric 2326349cc55cSDimitry Andric #include <crc32intrin.h> 2327349cc55cSDimitry Andric 23280b57cec5SDimitry Andric #endif /* __SMMINTRIN_H */ 2329