/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

24f4a2713aSLionel Sambuc #ifndef _SMMINTRIN_H
25f4a2713aSLionel Sambuc #define _SMMINTRIN_H
26f4a2713aSLionel Sambuc
27f4a2713aSLionel Sambuc #ifndef __SSE4_1__
28f4a2713aSLionel Sambuc #error "SSE4.1 instruction set not enabled"
29f4a2713aSLionel Sambuc #else
30f4a2713aSLionel Sambuc
31f4a2713aSLionel Sambuc #include <tmmintrin.h>
32f4a2713aSLionel Sambuc
/* SSE4 Rounding macros. */
/* Rounding-direction values for the M argument of the _mm_round_* macros. */
#define _MM_FROUND_TO_NEAREST_INT    0x00
#define _MM_FROUND_TO_NEG_INF        0x01
#define _MM_FROUND_TO_POS_INF        0x02
#define _MM_FROUND_TO_ZERO           0x03
#define _MM_FROUND_CUR_DIRECTION     0x04

/* Exception bit, OR-ed together with one of the direction values above. */
#define _MM_FROUND_RAISE_EXC         0x00
#define _MM_FROUND_NO_EXC            0x08

/* Ready-made direction | exception combinations. */
#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

/* Ceiling: the _mm_round_* macros with the mode fixed to _MM_FROUND_CEIL. */
#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/* Floor: the _mm_round_* macros with the mode fixed to _MM_FROUND_FLOOR. */
#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/* Round the packed floats in X using mode M (an _MM_FROUND_* value).
   X is evaluated once into a local; M is handed to the builtin unchanged.
   Implemented as a macro, presumably so M can reach the builtin as a
   constant expression. */
#define _mm_round_ps(X, M) __extension__ ({ \
  __m128 __X = (X); \
  (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })

/* Scalar single-precision rounding: forwards X, Y and mode M (an
   _MM_FROUND_* value) to __builtin_ia32_roundss.  X and Y are each
   evaluated once. */
#define _mm_round_ss(X, Y, M) __extension__ ({ \
  __m128 __X = (X); \
  __m128 __Y = (Y); \
  (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })

/* Round the packed doubles in X using mode M (an _MM_FROUND_* value).
   X is evaluated once into a local; M is handed to the builtin unchanged. */
#define _mm_round_pd(X, M) __extension__ ({ \
  __m128d __X = (X); \
  (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })

/* Scalar double-precision rounding: forwards X, Y and mode M (an
   _MM_FROUND_* value) to __builtin_ia32_roundsd.  X and Y are each
   evaluated once. */
#define _mm_round_sd(X, Y, M) __extension__ ({ \
  __m128d __X = (X); \
  __m128d __Y = (Y); \
  (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })

/* SSE4 Packed Blending Intrinsics. */
/* Blend two double vectors under mask M: for lane i (0..1) the shuffle index
   points into V2 when bit i of M is set and into V1 otherwise.  M appears
   several times in the expansion, so it should be free of side effects. */
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
  __m128d __V1 = (V1); \
  __m128d __V2 = (V2); \
  (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
                                   (((M) & 0x01) ? 2 : 0), \
                                   (((M) & 0x02) ? 3 : 1)); })

/* Blend two float vectors under mask M: for lane i (0..3) the shuffle index
   points into V2 when bit i of M is set and into V1 otherwise.  M appears
   several times in the expansion, so it should be free of side effects. */
#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
  __m128 __V1 = (V1); \
  __m128 __V2 = (V2); \
  (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
                                  (((M) & 0x01) ? 4 : 0), \
                                  (((M) & 0x02) ? 5 : 1), \
                                  (((M) & 0x04) ? 6 : 2), \
                                  (((M) & 0x08) ? 7 : 3)); })

95f4a2713aSLionel Sambuc static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_blendv_pd(__m128d __V1,__m128d __V2,__m128d __M)96f4a2713aSLionel Sambuc _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
97f4a2713aSLionel Sambuc {
98f4a2713aSLionel Sambuc return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
99f4a2713aSLionel Sambuc (__v2df)__M);
100f4a2713aSLionel Sambuc }
101f4a2713aSLionel Sambuc
102f4a2713aSLionel Sambuc static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_blendv_ps(__m128 __V1,__m128 __V2,__m128 __M)103f4a2713aSLionel Sambuc _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
104f4a2713aSLionel Sambuc {
105f4a2713aSLionel Sambuc return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
106f4a2713aSLionel Sambuc (__v4sf)__M);
107f4a2713aSLionel Sambuc }
108f4a2713aSLionel Sambuc
109f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_blendv_epi8(__m128i __V1,__m128i __V2,__m128i __M)110f4a2713aSLionel Sambuc _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
111f4a2713aSLionel Sambuc {
112f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
113f4a2713aSLionel Sambuc (__v16qi)__M);
114f4a2713aSLionel Sambuc }
115f4a2713aSLionel Sambuc
/* Blend two vectors of eight 16-bit lanes under mask M: for lane i (0..7) the
   shuffle index points into V2 when bit i of M is set and into V1 otherwise.
   M appears several times in the expansion, so it should be free of side
   effects. */
#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
  __m128i __V1 = (V1); \
  __m128i __V2 = (V2); \
  (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
                                   (((M) & 0x01) ?  8 : 0), \
                                   (((M) & 0x02) ?  9 : 1), \
                                   (((M) & 0x04) ? 10 : 2), \
                                   (((M) & 0x08) ? 11 : 3), \
                                   (((M) & 0x10) ? 12 : 4), \
                                   (((M) & 0x20) ? 13 : 5), \
                                   (((M) & 0x40) ? 14 : 6), \
                                   (((M) & 0x80) ? 15 : 7)); })

129f4a2713aSLionel Sambuc /* SSE4 Dword Multiply Instructions. */
130f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi32(__m128i __V1,__m128i __V2)131f4a2713aSLionel Sambuc _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
132f4a2713aSLionel Sambuc {
133f4a2713aSLionel Sambuc return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
134f4a2713aSLionel Sambuc }
135f4a2713aSLionel Sambuc
136f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epi32(__m128i __V1,__m128i __V2)137f4a2713aSLionel Sambuc _mm_mul_epi32 (__m128i __V1, __m128i __V2)
138f4a2713aSLionel Sambuc {
139f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
140f4a2713aSLionel Sambuc }
141f4a2713aSLionel Sambuc
/* SSE4 Floating Point Dot Product Instructions. */
/* Single-precision dot product: forwards X, Y and the control value M to
   __builtin_ia32_dpps.  X and Y are each evaluated once. */
#define _mm_dp_ps(X, Y, M) __extension__ ({ \
  __m128 __X = (X); \
  __m128 __Y = (Y); \
  (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })

/* Double-precision dot product: forwards X, Y and the control value M to
   __builtin_ia32_dppd.  X and Y are each evaluated once. */
#define _mm_dp_pd(X, Y, M) __extension__ ({\
  __m128d __X = (X); \
  __m128d __Y = (Y); \
  (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })

153f4a2713aSLionel Sambuc /* SSE4 Streaming Load Hint Instruction. */
154f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_stream_load_si128(__m128i * __V)155f4a2713aSLionel Sambuc _mm_stream_load_si128 (__m128i *__V)
156f4a2713aSLionel Sambuc {
157f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
158f4a2713aSLionel Sambuc }
159f4a2713aSLionel Sambuc
160f4a2713aSLionel Sambuc /* SSE4 Packed Integer Min/Max Instructions. */
161f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi8(__m128i __V1,__m128i __V2)162f4a2713aSLionel Sambuc _mm_min_epi8 (__m128i __V1, __m128i __V2)
163f4a2713aSLionel Sambuc {
164f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
165f4a2713aSLionel Sambuc }
166f4a2713aSLionel Sambuc
167f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi8(__m128i __V1,__m128i __V2)168f4a2713aSLionel Sambuc _mm_max_epi8 (__m128i __V1, __m128i __V2)
169f4a2713aSLionel Sambuc {
170f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
171f4a2713aSLionel Sambuc }
172f4a2713aSLionel Sambuc
173f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu16(__m128i __V1,__m128i __V2)174f4a2713aSLionel Sambuc _mm_min_epu16 (__m128i __V1, __m128i __V2)
175f4a2713aSLionel Sambuc {
176f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
177f4a2713aSLionel Sambuc }
178f4a2713aSLionel Sambuc
179f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu16(__m128i __V1,__m128i __V2)180f4a2713aSLionel Sambuc _mm_max_epu16 (__m128i __V1, __m128i __V2)
181f4a2713aSLionel Sambuc {
182f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
183f4a2713aSLionel Sambuc }
184f4a2713aSLionel Sambuc
185f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi32(__m128i __V1,__m128i __V2)186f4a2713aSLionel Sambuc _mm_min_epi32 (__m128i __V1, __m128i __V2)
187f4a2713aSLionel Sambuc {
188f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
189f4a2713aSLionel Sambuc }
190f4a2713aSLionel Sambuc
191f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi32(__m128i __V1,__m128i __V2)192f4a2713aSLionel Sambuc _mm_max_epi32 (__m128i __V1, __m128i __V2)
193f4a2713aSLionel Sambuc {
194f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
195f4a2713aSLionel Sambuc }
196f4a2713aSLionel Sambuc
197f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu32(__m128i __V1,__m128i __V2)198f4a2713aSLionel Sambuc _mm_min_epu32 (__m128i __V1, __m128i __V2)
199f4a2713aSLionel Sambuc {
200f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
201f4a2713aSLionel Sambuc }
202f4a2713aSLionel Sambuc
203f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu32(__m128i __V1,__m128i __V2)204f4a2713aSLionel Sambuc _mm_max_epu32 (__m128i __V1, __m128i __V2)
205f4a2713aSLionel Sambuc {
206f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
207f4a2713aSLionel Sambuc }
208f4a2713aSLionel Sambuc
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
/* Forwards X, Y and the control value N straight to
   __builtin_ia32_insertps128; _MM_MK_INSERTPS_NDX builds a suitable N. */
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
/* Yields, as an int, the raw bit pattern of float lane (N & 3) of X.  The
   union reinterprets the float's bits without a value conversion. */
#define _mm_extract_ps(X, N) (__extension__                      \
                              ({ union { int __i; float __f; } __t;  \
                                 __v4sf __a = (__v4sf)(X);       \
                                 __t.__f = __a[(N) & 3];                 \
                                 __t.__i;}))

/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D.  The index is
   reduced to (N) & 3, as in _mm_extract_ps, so it always names one of the
   four lanes. */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
                                                    (D) = __a[(N) & 3]; }))

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps.  X lands in bits 6 and up, Y in bits
   4 and up, and Z is OR-ed in unshifted. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return.
   Built from _mm_insert_ps into an all-zero vector, with source index N,
   destination index 0 and zeroing bits 0x0e. */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index. */
/* Copy of X with 8-bit lane (N & 15) replaced by I.  The result is cast back
   to __m128i so the macro's type does not depend on implicit (lax) vector
   conversions from __v16qi. */
#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                   __a[(N) & 15] = (I);             \
                                                   (__m128i)__a;}))
/* Copy of X with 32-bit lane (N & 3) replaced by I.  The result is cast back
   to __m128i so the macro's type does not depend on implicit (lax) vector
   conversions from __v4si. */
#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                    __a[(N) & 3] = (I);           \
                                                    (__m128i)__a;}))
#ifdef __x86_64__
/* Copy of X with 64-bit lane (N & 1) replaced by I; only provided when
   __x86_64__ is defined.  The result is explicitly of type __m128i. */
#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                    __a[(N) & 1] = (I);           \
                                                    (__m128i)__a;}))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
/* 8-bit lane (N & 15) of X; the (unsigned char) cast performs the zero
   extension before the value is widened to int. */
#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                 (int)(unsigned char) \
                                                     __a[(N) & 15];}))
/* 32-bit lane (N & 3) of X, yielded as the element type of __v4si. */
#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                  __a[(N) & 3];}))
#ifdef __x86_64__
/* 64-bit lane (N & 1) of X, yielded as the element type of __v2di; only
   provided when __x86_64__ is defined. */
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                  __a[(N) & 1];}))
#endif /* __x86_64__ */

256f4a2713aSLionel Sambuc /* SSE4 128-bit Packed Integer Comparisons. */
257f4a2713aSLionel Sambuc static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testz_si128(__m128i __M,__m128i __V)258f4a2713aSLionel Sambuc _mm_testz_si128(__m128i __M, __m128i __V)
259f4a2713aSLionel Sambuc {
260f4a2713aSLionel Sambuc return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
261f4a2713aSLionel Sambuc }
262f4a2713aSLionel Sambuc
263f4a2713aSLionel Sambuc static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testc_si128(__m128i __M,__m128i __V)264f4a2713aSLionel Sambuc _mm_testc_si128(__m128i __M, __m128i __V)
265f4a2713aSLionel Sambuc {
266f4a2713aSLionel Sambuc return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
267f4a2713aSLionel Sambuc }
268f4a2713aSLionel Sambuc
269f4a2713aSLionel Sambuc static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_si128(__m128i __M,__m128i __V)270f4a2713aSLionel Sambuc _mm_testnzc_si128(__m128i __M, __m128i __V)
271f4a2713aSLionel Sambuc {
272f4a2713aSLionel Sambuc return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
273f4a2713aSLionel Sambuc }
274f4a2713aSLionel Sambuc
/* Convenience forms of the PTEST helpers.  _mm_test_all_ones compares V
   against an all-ones vector built as _mm_cmpeq_epi32(V, V); V is captured in
   a local first so the argument expression is evaluated exactly once. */
#define _mm_test_all_ones(V) __extension__ ({ \
  __m128i __V = (V); \
  _mm_testc_si128(__V, _mm_cmpeq_epi32(__V, __V)); })
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))

279f4a2713aSLionel Sambuc /* SSE4 64-bit Packed Integer Comparisons. */
280f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi64(__m128i __V1,__m128i __V2)281f4a2713aSLionel Sambuc _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
282f4a2713aSLionel Sambuc {
283f4a2713aSLionel Sambuc return (__m128i)((__v2di)__V1 == (__v2di)__V2);
284f4a2713aSLionel Sambuc }
285f4a2713aSLionel Sambuc
286f4a2713aSLionel Sambuc /* SSE4 Packed Integer Sign-Extension. */
287f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi16(__m128i __V)288f4a2713aSLionel Sambuc _mm_cvtepi8_epi16(__m128i __V)
289f4a2713aSLionel Sambuc {
290f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
291f4a2713aSLionel Sambuc }
292f4a2713aSLionel Sambuc
293f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi32(__m128i __V)294f4a2713aSLionel Sambuc _mm_cvtepi8_epi32(__m128i __V)
295f4a2713aSLionel Sambuc {
296f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
297f4a2713aSLionel Sambuc }
298f4a2713aSLionel Sambuc
299f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi64(__m128i __V)300f4a2713aSLionel Sambuc _mm_cvtepi8_epi64(__m128i __V)
301f4a2713aSLionel Sambuc {
302f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
303f4a2713aSLionel Sambuc }
304f4a2713aSLionel Sambuc
305f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi32(__m128i __V)306f4a2713aSLionel Sambuc _mm_cvtepi16_epi32(__m128i __V)
307f4a2713aSLionel Sambuc {
308f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
309f4a2713aSLionel Sambuc }
310f4a2713aSLionel Sambuc
311f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi64(__m128i __V)312f4a2713aSLionel Sambuc _mm_cvtepi16_epi64(__m128i __V)
313f4a2713aSLionel Sambuc {
314f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
315f4a2713aSLionel Sambuc }
316f4a2713aSLionel Sambuc
317f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_epi64(__m128i __V)318f4a2713aSLionel Sambuc _mm_cvtepi32_epi64(__m128i __V)
319f4a2713aSLionel Sambuc {
320f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
321f4a2713aSLionel Sambuc }
322f4a2713aSLionel Sambuc
323f4a2713aSLionel Sambuc /* SSE4 Packed Integer Zero-Extension. */
324f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi16(__m128i __V)325f4a2713aSLionel Sambuc _mm_cvtepu8_epi16(__m128i __V)
326f4a2713aSLionel Sambuc {
327f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
328f4a2713aSLionel Sambuc }
329f4a2713aSLionel Sambuc
330f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi32(__m128i __V)331f4a2713aSLionel Sambuc _mm_cvtepu8_epi32(__m128i __V)
332f4a2713aSLionel Sambuc {
333f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
334f4a2713aSLionel Sambuc }
335f4a2713aSLionel Sambuc
336f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi64(__m128i __V)337f4a2713aSLionel Sambuc _mm_cvtepu8_epi64(__m128i __V)
338f4a2713aSLionel Sambuc {
339f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
340f4a2713aSLionel Sambuc }
341f4a2713aSLionel Sambuc
342f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi32(__m128i __V)343f4a2713aSLionel Sambuc _mm_cvtepu16_epi32(__m128i __V)
344f4a2713aSLionel Sambuc {
345f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
346f4a2713aSLionel Sambuc }
347f4a2713aSLionel Sambuc
348f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi64(__m128i __V)349f4a2713aSLionel Sambuc _mm_cvtepu16_epi64(__m128i __V)
350f4a2713aSLionel Sambuc {
351f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
352f4a2713aSLionel Sambuc }
353f4a2713aSLionel Sambuc
354f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu32_epi64(__m128i __V)355f4a2713aSLionel Sambuc _mm_cvtepu32_epi64(__m128i __V)
356f4a2713aSLionel Sambuc {
357f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
358f4a2713aSLionel Sambuc }
359f4a2713aSLionel Sambuc
360f4a2713aSLionel Sambuc /* SSE4 Pack with Unsigned Saturation. */
361f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi32(__m128i __V1,__m128i __V2)362f4a2713aSLionel Sambuc _mm_packus_epi32(__m128i __V1, __m128i __V2)
363f4a2713aSLionel Sambuc {
364f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
365f4a2713aSLionel Sambuc }
366f4a2713aSLionel Sambuc
/* SSE4 Multiple Packed Sums of Absolute Difference. */
/* Forwards X and Y, viewed as __v16qi, and the control value M to
   __builtin_ia32_mpsadbw128.  X and Y are each evaluated once. */
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
  __m128i __X = (X); \
  __m128i __Y = (Y); \
  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })

373f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_minpos_epu16(__m128i __V)374f4a2713aSLionel Sambuc _mm_minpos_epu16(__m128i __V)
375f4a2713aSLionel Sambuc {
376f4a2713aSLionel Sambuc return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
377f4a2713aSLionel Sambuc }
378f4a2713aSLionel Sambuc
379f4a2713aSLionel Sambuc /* These definitions are normally in nmmintrin.h, but gcc puts them in here
380f4a2713aSLionel Sambuc so we'll do the same. */
381f4a2713aSLionel Sambuc #ifdef __SSE4_2__
382f4a2713aSLionel Sambuc
/* The _SIDD_* values below occupy separate bit fields and are meant to be
   OR-ed together to form the control argument M of _mm_cmpXstrY(). */

/* These specify the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS                 0x00
#define _SIDD_UWORD_OPS                 0x01
#define _SIDD_SBYTE_OPS                 0x02
#define _SIDD_SWORD_OPS                 0x03

/* These specify the type of comparison operation. */
#define _SIDD_CMP_EQUAL_ANY             0x00
#define _SIDD_CMP_RANGES                0x04
#define _SIDD_CMP_EQUAL_EACH            0x08
#define _SIDD_CMP_EQUAL_ORDERED         0x0c

/* These macros specify the polarity of the operation. */
#define _SIDD_POSITIVE_POLARITY         0x00
#define _SIDD_NEGATIVE_POLARITY         0x10
#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30

/* These macros are used in _mm_cmpXstri() to specify the return. */
#define _SIDD_LEAST_SIGNIFICANT         0x00
#define _SIDD_MOST_SIGNIFICANT          0x40

/* These macros are used in _mm_cmpXstrm() to specify the return.  They share
   bit 0x40 with the _mm_cmpXstri() selectors above. */
#define _SIDD_BIT_MASK                  0x00
#define _SIDD_UNIT_MASK                 0x40

/* SSE4.2 Packed Comparison Intrinsics. */
/* Implicit-length string compares under control value M (built from the
   _SIDD_* values).  _mm_cmpistrm yields a __m128i, _mm_cmpistri an int.
   The operands are cast to the byte-vector type explicitly so the macros do
   not depend on implicit (lax) vector conversions. */
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/* Explicit-length string compares: LA and LB are passed alongside A and B,
   M is the control value (built from the _SIDD_* values).  _mm_cmpestrm
   yields a __m128i, _mm_cmpestri an int.  The operands are cast explicitly
   so the macros do not depend on implicit (lax) vector conversions. */
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
/* Implicit-length compares that yield a single int read from one flag; the
   trailing letter (a, c, o, s, z) picks the builtin and hence the flag.
   The operands are cast explicitly so the macros do not depend on implicit
   (lax) vector conversions. */
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/* Explicit-length compares that yield a single int read from one flag; the
   trailing letter (a, c, o, s, z) picks the builtin and hence the flag.
   The operands are cast explicitly so the macros do not depend on implicit
   (lax) vector conversions. */
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
#define _mm_cmpestrs(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
#define _mm_cmpestrz(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

441f4a2713aSLionel Sambuc /* SSE4.2 Compare Packed Data -- Greater Than. */
442f4a2713aSLionel Sambuc static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi64(__m128i __V1,__m128i __V2)443f4a2713aSLionel Sambuc _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
444f4a2713aSLionel Sambuc {
445f4a2713aSLionel Sambuc return (__m128i)((__v2di)__V1 > (__v2di)__V2);
446f4a2713aSLionel Sambuc }
447f4a2713aSLionel Sambuc
/* SSE4.2 Accumulate CRC32. */
/* Folds the byte __D into the running CRC value __C via
   __builtin_ia32_crc32qi and returns the updated value. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
  return __builtin_ia32_crc32qi(__C, __D);
}

/* Folds the 16-bit value __D into the running CRC value __C via
   __builtin_ia32_crc32hi and returns the updated value. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
  return __builtin_ia32_crc32hi(__C, __D);
}

/* Folds the 32-bit value __D into the running CRC value __C via
   __builtin_ia32_crc32si and returns the updated value. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
  return __builtin_ia32_crc32si(__C, __D);
}

#ifdef __x86_64__
/* Folds the 64-bit value __D into the running CRC value __C via
   __builtin_ia32_crc32di and returns the updated value; only provided when
   __x86_64__ is defined. */
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
  return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */

475f4a2713aSLionel Sambuc #ifdef __POPCNT__
476f4a2713aSLionel Sambuc #include <popcntintrin.h>
477f4a2713aSLionel Sambuc #endif
478f4a2713aSLionel Sambuc
479f4a2713aSLionel Sambuc #endif /* __SSE4_2__ */
480f4a2713aSLionel Sambuc #endif /* __SSE4_1__ */
481f4a2713aSLionel Sambuc
482f4a2713aSLionel Sambuc #endif /* _SMMINTRIN_H */
483