/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
2*0a6a1f1dSLionel Sambuc *
3*0a6a1f1dSLionel Sambuc * Permission is hereby granted, free of charge, to any person obtaining a copy
4*0a6a1f1dSLionel Sambuc * of this software and associated documentation files (the "Software"), to deal
5*0a6a1f1dSLionel Sambuc * in the Software without restriction, including without limitation the rights
6*0a6a1f1dSLionel Sambuc * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7*0a6a1f1dSLionel Sambuc * copies of the Software, and to permit persons to whom the Software is
8*0a6a1f1dSLionel Sambuc * furnished to do so, subject to the following conditions:
9*0a6a1f1dSLionel Sambuc *
10*0a6a1f1dSLionel Sambuc * The above copyright notice and this permission notice shall be included in
11*0a6a1f1dSLionel Sambuc * all copies or substantial portions of the Software.
12*0a6a1f1dSLionel Sambuc *
13*0a6a1f1dSLionel Sambuc * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14*0a6a1f1dSLionel Sambuc * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15*0a6a1f1dSLionel Sambuc * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16*0a6a1f1dSLionel Sambuc * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17*0a6a1f1dSLionel Sambuc * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18*0a6a1f1dSLionel Sambuc * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19*0a6a1f1dSLionel Sambuc * THE SOFTWARE.
20*0a6a1f1dSLionel Sambuc *
21*0a6a1f1dSLionel Sambuc *===-----------------------------------------------------------------------===
22*0a6a1f1dSLionel Sambuc */
/* Guard against direct inclusion: unless __IMMINTRIN_H is already defined,
 * compilation stops with the #error below, which points users at
 * <immintrin.h>. */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
#endif
26*0a6a1f1dSLionel Sambuc
27*0a6a1f1dSLionel Sambuc #ifndef __AVX512FINTRIN_H
28*0a6a1f1dSLionel Sambuc #define __AVX512FINTRIN_H
29*0a6a1f1dSLionel Sambuc
/* Public 64-byte vector types handed to and returned by the intrinsics. */
typedef float     __m512  __attribute__((__vector_size__(64)));
typedef double    __m512d __attribute__((__vector_size__(64)));
typedef long long __m512i __attribute__((__vector_size__(64)));

/* Element-typed 64-byte vectors; the wrappers in this header cast to these
 * before calling the builtins. */
typedef float     __v16sf __attribute__((__vector_size__(64)));
typedef double    __v8df  __attribute__((__vector_size__(64)));
typedef int       __v16si __attribute__((__vector_size__(64)));
typedef long long __v8di  __attribute__((__vector_size__(64)));

/* Mask types: passed as the mask argument of the *_mask builtins. */
typedef unsigned char  __mmask8;
typedef unsigned short __mmask16;
41*0a6a1f1dSLionel Sambuc
/* Rounding mode macros.
 *
 * Plain integer codes.  Within this header _MM_FROUND_CUR_DIRECTION is the
 * one passed as the trailing argument of the rounding-aware builtins
 * (max/min/sqrt/rndscale/fma wrappers).
 * NOTE(review): the names suggest nearest / toward -inf / toward +inf /
 * toward zero / "use the current rounding direction" -- confirm against the
 * ISA manual.  The replacement lists are kept token-for-token unchanged so
 * that an identical definition elsewhere remains a legal redefinition. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
48*0a6a1f1dSLionel Sambuc
49*0a6a1f1dSLionel Sambuc /* Create vectors with repeated elements */
50*0a6a1f1dSLionel Sambuc
51*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_setzero_si512(void)52*0a6a1f1dSLionel Sambuc _mm512_setzero_si512(void)
53*0a6a1f1dSLionel Sambuc {
54*0a6a1f1dSLionel Sambuc return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
55*0a6a1f1dSLionel Sambuc }
56*0a6a1f1dSLionel Sambuc
57*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_set1_epi32(__mmask16 __M,int __A)58*0a6a1f1dSLionel Sambuc _mm512_maskz_set1_epi32(__mmask16 __M, int __A)
59*0a6a1f1dSLionel Sambuc {
60*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
61*0a6a1f1dSLionel Sambuc (__v16si)
62*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
63*0a6a1f1dSLionel Sambuc __M);
64*0a6a1f1dSLionel Sambuc }
65*0a6a1f1dSLionel Sambuc
66*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_set1_epi64(__mmask8 __M,long long __A)67*0a6a1f1dSLionel Sambuc _mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
68*0a6a1f1dSLionel Sambuc {
69*0a6a1f1dSLionel Sambuc #ifdef __x86_64__
70*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
71*0a6a1f1dSLionel Sambuc (__v8di)
72*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
73*0a6a1f1dSLionel Sambuc __M);
74*0a6a1f1dSLionel Sambuc #else
75*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
76*0a6a1f1dSLionel Sambuc (__v8di)
77*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
78*0a6a1f1dSLionel Sambuc __M);
79*0a6a1f1dSLionel Sambuc #endif
80*0a6a1f1dSLionel Sambuc }
81*0a6a1f1dSLionel Sambuc
82*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_setzero_ps(void)83*0a6a1f1dSLionel Sambuc _mm512_setzero_ps(void)
84*0a6a1f1dSLionel Sambuc {
85*0a6a1f1dSLionel Sambuc return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
86*0a6a1f1dSLionel Sambuc 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
87*0a6a1f1dSLionel Sambuc }
88*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_setzero_pd(void)89*0a6a1f1dSLionel Sambuc _mm512_setzero_pd(void)
90*0a6a1f1dSLionel Sambuc {
91*0a6a1f1dSLionel Sambuc return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
92*0a6a1f1dSLionel Sambuc }
93*0a6a1f1dSLionel Sambuc
94*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_set1_ps(float __w)95*0a6a1f1dSLionel Sambuc _mm512_set1_ps(float __w)
96*0a6a1f1dSLionel Sambuc {
97*0a6a1f1dSLionel Sambuc return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
98*0a6a1f1dSLionel Sambuc __w, __w, __w, __w, __w, __w, __w, __w };
99*0a6a1f1dSLionel Sambuc }
100*0a6a1f1dSLionel Sambuc
101*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_set1_pd(double __w)102*0a6a1f1dSLionel Sambuc _mm512_set1_pd(double __w)
103*0a6a1f1dSLionel Sambuc {
104*0a6a1f1dSLionel Sambuc return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
105*0a6a1f1dSLionel Sambuc }
106*0a6a1f1dSLionel Sambuc
107*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_set1_epi32(int __s)108*0a6a1f1dSLionel Sambuc _mm512_set1_epi32(int __s)
109*0a6a1f1dSLionel Sambuc {
110*0a6a1f1dSLionel Sambuc return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
111*0a6a1f1dSLionel Sambuc __s, __s, __s, __s, __s, __s, __s, __s };
112*0a6a1f1dSLionel Sambuc }
113*0a6a1f1dSLionel Sambuc
114*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_set1_epi64(long long __d)115*0a6a1f1dSLionel Sambuc _mm512_set1_epi64(long long __d)
116*0a6a1f1dSLionel Sambuc {
117*0a6a1f1dSLionel Sambuc return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
118*0a6a1f1dSLionel Sambuc }
119*0a6a1f1dSLionel Sambuc
120*0a6a1f1dSLionel Sambuc static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_broadcastss_ps(__m128 __X)121*0a6a1f1dSLionel Sambuc _mm512_broadcastss_ps(__m128 __X)
122*0a6a1f1dSLionel Sambuc {
123*0a6a1f1dSLionel Sambuc float __f = __X[0];
124*0a6a1f1dSLionel Sambuc return (__v16sf){ __f, __f, __f, __f,
125*0a6a1f1dSLionel Sambuc __f, __f, __f, __f,
126*0a6a1f1dSLionel Sambuc __f, __f, __f, __f,
127*0a6a1f1dSLionel Sambuc __f, __f, __f, __f };
128*0a6a1f1dSLionel Sambuc }
129*0a6a1f1dSLionel Sambuc
130*0a6a1f1dSLionel Sambuc static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_broadcastsd_pd(__m128d __X)131*0a6a1f1dSLionel Sambuc _mm512_broadcastsd_pd(__m128d __X)
132*0a6a1f1dSLionel Sambuc {
133*0a6a1f1dSLionel Sambuc double __d = __X[0];
134*0a6a1f1dSLionel Sambuc return (__v8df){ __d, __d, __d, __d,
135*0a6a1f1dSLionel Sambuc __d, __d, __d, __d };
136*0a6a1f1dSLionel Sambuc }
137*0a6a1f1dSLionel Sambuc
138*0a6a1f1dSLionel Sambuc /* Cast between vector types */
139*0a6a1f1dSLionel Sambuc
140*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_castpd256_pd512(__m256d __a)141*0a6a1f1dSLionel Sambuc _mm512_castpd256_pd512(__m256d __a)
142*0a6a1f1dSLionel Sambuc {
143*0a6a1f1dSLionel Sambuc return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
144*0a6a1f1dSLionel Sambuc }
145*0a6a1f1dSLionel Sambuc
146*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_castps256_ps512(__m256 __a)147*0a6a1f1dSLionel Sambuc _mm512_castps256_ps512(__m256 __a)
148*0a6a1f1dSLionel Sambuc {
149*0a6a1f1dSLionel Sambuc return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
150*0a6a1f1dSLionel Sambuc -1, -1, -1, -1, -1, -1, -1, -1);
151*0a6a1f1dSLionel Sambuc }
152*0a6a1f1dSLionel Sambuc
153*0a6a1f1dSLionel Sambuc static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm512_castpd512_pd128(__m512d __a)154*0a6a1f1dSLionel Sambuc _mm512_castpd512_pd128(__m512d __a)
155*0a6a1f1dSLionel Sambuc {
156*0a6a1f1dSLionel Sambuc return __builtin_shufflevector(__a, __a, 0, 1);
157*0a6a1f1dSLionel Sambuc }
158*0a6a1f1dSLionel Sambuc
159*0a6a1f1dSLionel Sambuc static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm512_castps512_ps128(__m512 __a)160*0a6a1f1dSLionel Sambuc _mm512_castps512_ps128(__m512 __a)
161*0a6a1f1dSLionel Sambuc {
162*0a6a1f1dSLionel Sambuc return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
163*0a6a1f1dSLionel Sambuc }
164*0a6a1f1dSLionel Sambuc
165*0a6a1f1dSLionel Sambuc /* Arithmetic */
166*0a6a1f1dSLionel Sambuc
167*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_add_pd(__m512d __a,__m512d __b)168*0a6a1f1dSLionel Sambuc _mm512_add_pd(__m512d __a, __m512d __b)
169*0a6a1f1dSLionel Sambuc {
170*0a6a1f1dSLionel Sambuc return __a + __b;
171*0a6a1f1dSLionel Sambuc }
172*0a6a1f1dSLionel Sambuc
173*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_add_ps(__m512 __a,__m512 __b)174*0a6a1f1dSLionel Sambuc _mm512_add_ps(__m512 __a, __m512 __b)
175*0a6a1f1dSLionel Sambuc {
176*0a6a1f1dSLionel Sambuc return __a + __b;
177*0a6a1f1dSLionel Sambuc }
178*0a6a1f1dSLionel Sambuc
179*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_mul_pd(__m512d __a,__m512d __b)180*0a6a1f1dSLionel Sambuc _mm512_mul_pd(__m512d __a, __m512d __b)
181*0a6a1f1dSLionel Sambuc {
182*0a6a1f1dSLionel Sambuc return __a * __b;
183*0a6a1f1dSLionel Sambuc }
184*0a6a1f1dSLionel Sambuc
185*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_mul_ps(__m512 __a,__m512 __b)186*0a6a1f1dSLionel Sambuc _mm512_mul_ps(__m512 __a, __m512 __b)
187*0a6a1f1dSLionel Sambuc {
188*0a6a1f1dSLionel Sambuc return __a * __b;
189*0a6a1f1dSLionel Sambuc }
190*0a6a1f1dSLionel Sambuc
191*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_sub_pd(__m512d __a,__m512d __b)192*0a6a1f1dSLionel Sambuc _mm512_sub_pd(__m512d __a, __m512d __b)
193*0a6a1f1dSLionel Sambuc {
194*0a6a1f1dSLionel Sambuc return __a - __b;
195*0a6a1f1dSLionel Sambuc }
196*0a6a1f1dSLionel Sambuc
197*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_sub_ps(__m512 __a,__m512 __b)198*0a6a1f1dSLionel Sambuc _mm512_sub_ps(__m512 __a, __m512 __b)
199*0a6a1f1dSLionel Sambuc {
200*0a6a1f1dSLionel Sambuc return __a - __b;
201*0a6a1f1dSLionel Sambuc }
202*0a6a1f1dSLionel Sambuc
203*0a6a1f1dSLionel Sambuc static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_max_pd(__m512d __A,__m512d __B)204*0a6a1f1dSLionel Sambuc _mm512_max_pd(__m512d __A, __m512d __B)
205*0a6a1f1dSLionel Sambuc {
206*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
207*0a6a1f1dSLionel Sambuc (__v8df) __B,
208*0a6a1f1dSLionel Sambuc (__v8df)
209*0a6a1f1dSLionel Sambuc _mm512_setzero_pd (),
210*0a6a1f1dSLionel Sambuc (__mmask8) -1,
211*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
212*0a6a1f1dSLionel Sambuc }
213*0a6a1f1dSLionel Sambuc
214*0a6a1f1dSLionel Sambuc static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_max_ps(__m512 __A,__m512 __B)215*0a6a1f1dSLionel Sambuc _mm512_max_ps(__m512 __A, __m512 __B)
216*0a6a1f1dSLionel Sambuc {
217*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
218*0a6a1f1dSLionel Sambuc (__v16sf) __B,
219*0a6a1f1dSLionel Sambuc (__v16sf)
220*0a6a1f1dSLionel Sambuc _mm512_setzero_ps (),
221*0a6a1f1dSLionel Sambuc (__mmask16) -1,
222*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
223*0a6a1f1dSLionel Sambuc }
224*0a6a1f1dSLionel Sambuc
225*0a6a1f1dSLionel Sambuc static __inline __m512i
226*0a6a1f1dSLionel Sambuc __attribute__ ((__always_inline__, __nodebug__))
_mm512_max_epi32(__m512i __A,__m512i __B)227*0a6a1f1dSLionel Sambuc _mm512_max_epi32(__m512i __A, __m512i __B)
228*0a6a1f1dSLionel Sambuc {
229*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
230*0a6a1f1dSLionel Sambuc (__v16si) __B,
231*0a6a1f1dSLionel Sambuc (__v16si)
232*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
233*0a6a1f1dSLionel Sambuc (__mmask16) -1);
234*0a6a1f1dSLionel Sambuc }
235*0a6a1f1dSLionel Sambuc
236*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_max_epu32(__m512i __A,__m512i __B)237*0a6a1f1dSLionel Sambuc _mm512_max_epu32(__m512i __A, __m512i __B)
238*0a6a1f1dSLionel Sambuc {
239*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
240*0a6a1f1dSLionel Sambuc (__v16si) __B,
241*0a6a1f1dSLionel Sambuc (__v16si)
242*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
243*0a6a1f1dSLionel Sambuc (__mmask16) -1);
244*0a6a1f1dSLionel Sambuc }
245*0a6a1f1dSLionel Sambuc
246*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_max_epi64(__m512i __A,__m512i __B)247*0a6a1f1dSLionel Sambuc _mm512_max_epi64(__m512i __A, __m512i __B)
248*0a6a1f1dSLionel Sambuc {
249*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
250*0a6a1f1dSLionel Sambuc (__v8di) __B,
251*0a6a1f1dSLionel Sambuc (__v8di)
252*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
253*0a6a1f1dSLionel Sambuc (__mmask8) -1);
254*0a6a1f1dSLionel Sambuc }
255*0a6a1f1dSLionel Sambuc
256*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_max_epu64(__m512i __A,__m512i __B)257*0a6a1f1dSLionel Sambuc _mm512_max_epu64(__m512i __A, __m512i __B)
258*0a6a1f1dSLionel Sambuc {
259*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
260*0a6a1f1dSLionel Sambuc (__v8di) __B,
261*0a6a1f1dSLionel Sambuc (__v8di)
262*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
263*0a6a1f1dSLionel Sambuc (__mmask8) -1);
264*0a6a1f1dSLionel Sambuc }
265*0a6a1f1dSLionel Sambuc
266*0a6a1f1dSLionel Sambuc static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_min_pd(__m512d __A,__m512d __B)267*0a6a1f1dSLionel Sambuc _mm512_min_pd(__m512d __A, __m512d __B)
268*0a6a1f1dSLionel Sambuc {
269*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
270*0a6a1f1dSLionel Sambuc (__v8df) __B,
271*0a6a1f1dSLionel Sambuc (__v8df)
272*0a6a1f1dSLionel Sambuc _mm512_setzero_pd (),
273*0a6a1f1dSLionel Sambuc (__mmask8) -1,
274*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
275*0a6a1f1dSLionel Sambuc }
276*0a6a1f1dSLionel Sambuc
277*0a6a1f1dSLionel Sambuc static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_min_ps(__m512 __A,__m512 __B)278*0a6a1f1dSLionel Sambuc _mm512_min_ps(__m512 __A, __m512 __B)
279*0a6a1f1dSLionel Sambuc {
280*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
281*0a6a1f1dSLionel Sambuc (__v16sf) __B,
282*0a6a1f1dSLionel Sambuc (__v16sf)
283*0a6a1f1dSLionel Sambuc _mm512_setzero_ps (),
284*0a6a1f1dSLionel Sambuc (__mmask16) -1,
285*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
286*0a6a1f1dSLionel Sambuc }
287*0a6a1f1dSLionel Sambuc
288*0a6a1f1dSLionel Sambuc static __inline __m512i
289*0a6a1f1dSLionel Sambuc __attribute__ ((__always_inline__, __nodebug__))
_mm512_min_epi32(__m512i __A,__m512i __B)290*0a6a1f1dSLionel Sambuc _mm512_min_epi32(__m512i __A, __m512i __B)
291*0a6a1f1dSLionel Sambuc {
292*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
293*0a6a1f1dSLionel Sambuc (__v16si) __B,
294*0a6a1f1dSLionel Sambuc (__v16si)
295*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
296*0a6a1f1dSLionel Sambuc (__mmask16) -1);
297*0a6a1f1dSLionel Sambuc }
298*0a6a1f1dSLionel Sambuc
299*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_min_epu32(__m512i __A,__m512i __B)300*0a6a1f1dSLionel Sambuc _mm512_min_epu32(__m512i __A, __m512i __B)
301*0a6a1f1dSLionel Sambuc {
302*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
303*0a6a1f1dSLionel Sambuc (__v16si) __B,
304*0a6a1f1dSLionel Sambuc (__v16si)
305*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
306*0a6a1f1dSLionel Sambuc (__mmask16) -1);
307*0a6a1f1dSLionel Sambuc }
308*0a6a1f1dSLionel Sambuc
309*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_min_epi64(__m512i __A,__m512i __B)310*0a6a1f1dSLionel Sambuc _mm512_min_epi64(__m512i __A, __m512i __B)
311*0a6a1f1dSLionel Sambuc {
312*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
313*0a6a1f1dSLionel Sambuc (__v8di) __B,
314*0a6a1f1dSLionel Sambuc (__v8di)
315*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
316*0a6a1f1dSLionel Sambuc (__mmask8) -1);
317*0a6a1f1dSLionel Sambuc }
318*0a6a1f1dSLionel Sambuc
319*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_min_epu64(__m512i __A,__m512i __B)320*0a6a1f1dSLionel Sambuc _mm512_min_epu64(__m512i __A, __m512i __B)
321*0a6a1f1dSLionel Sambuc {
322*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
323*0a6a1f1dSLionel Sambuc (__v8di) __B,
324*0a6a1f1dSLionel Sambuc (__v8di)
325*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
326*0a6a1f1dSLionel Sambuc (__mmask8) -1);
327*0a6a1f1dSLionel Sambuc }
328*0a6a1f1dSLionel Sambuc
329*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_mul_epi32(__m512i __X,__m512i __Y)330*0a6a1f1dSLionel Sambuc _mm512_mul_epi32(__m512i __X, __m512i __Y)
331*0a6a1f1dSLionel Sambuc {
332*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
333*0a6a1f1dSLionel Sambuc (__v16si) __Y,
334*0a6a1f1dSLionel Sambuc (__v8di)
335*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
336*0a6a1f1dSLionel Sambuc (__mmask8) -1);
337*0a6a1f1dSLionel Sambuc }
338*0a6a1f1dSLionel Sambuc
339*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_mul_epu32(__m512i __X,__m512i __Y)340*0a6a1f1dSLionel Sambuc _mm512_mul_epu32(__m512i __X, __m512i __Y)
341*0a6a1f1dSLionel Sambuc {
342*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
343*0a6a1f1dSLionel Sambuc (__v16si) __Y,
344*0a6a1f1dSLionel Sambuc (__v8di)
345*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
346*0a6a1f1dSLionel Sambuc (__mmask8) -1);
347*0a6a1f1dSLionel Sambuc }
348*0a6a1f1dSLionel Sambuc
349*0a6a1f1dSLionel Sambuc static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_sqrt_pd(__m512d a)350*0a6a1f1dSLionel Sambuc _mm512_sqrt_pd(__m512d a)
351*0a6a1f1dSLionel Sambuc {
352*0a6a1f1dSLionel Sambuc return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)a,
353*0a6a1f1dSLionel Sambuc (__v8df) _mm512_setzero_pd (),
354*0a6a1f1dSLionel Sambuc (__mmask8) -1,
355*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
356*0a6a1f1dSLionel Sambuc }
357*0a6a1f1dSLionel Sambuc
358*0a6a1f1dSLionel Sambuc static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_sqrt_ps(__m512 a)359*0a6a1f1dSLionel Sambuc _mm512_sqrt_ps(__m512 a)
360*0a6a1f1dSLionel Sambuc {
361*0a6a1f1dSLionel Sambuc return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)a,
362*0a6a1f1dSLionel Sambuc (__v16sf) _mm512_setzero_ps (),
363*0a6a1f1dSLionel Sambuc (__mmask16) -1,
364*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
365*0a6a1f1dSLionel Sambuc }
366*0a6a1f1dSLionel Sambuc
367*0a6a1f1dSLionel Sambuc static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_rsqrt14_pd(__m512d __A)368*0a6a1f1dSLionel Sambuc _mm512_rsqrt14_pd(__m512d __A)
369*0a6a1f1dSLionel Sambuc {
370*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
371*0a6a1f1dSLionel Sambuc (__v8df)
372*0a6a1f1dSLionel Sambuc _mm512_setzero_pd (),
373*0a6a1f1dSLionel Sambuc (__mmask8) -1);}
374*0a6a1f1dSLionel Sambuc
375*0a6a1f1dSLionel Sambuc static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_rsqrt14_ps(__m512 __A)376*0a6a1f1dSLionel Sambuc _mm512_rsqrt14_ps(__m512 __A)
377*0a6a1f1dSLionel Sambuc {
378*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
379*0a6a1f1dSLionel Sambuc (__v16sf)
380*0a6a1f1dSLionel Sambuc _mm512_setzero_ps (),
381*0a6a1f1dSLionel Sambuc (__mmask16) -1);
382*0a6a1f1dSLionel Sambuc }
383*0a6a1f1dSLionel Sambuc
384*0a6a1f1dSLionel Sambuc static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt14_ss(__m128 __A,__m128 __B)385*0a6a1f1dSLionel Sambuc _mm_rsqrt14_ss(__m128 __A, __m128 __B)
386*0a6a1f1dSLionel Sambuc {
387*0a6a1f1dSLionel Sambuc return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
388*0a6a1f1dSLionel Sambuc (__v4sf) __B,
389*0a6a1f1dSLionel Sambuc (__v4sf)
390*0a6a1f1dSLionel Sambuc _mm_setzero_ps (),
391*0a6a1f1dSLionel Sambuc (__mmask8) -1);
392*0a6a1f1dSLionel Sambuc }
393*0a6a1f1dSLionel Sambuc
394*0a6a1f1dSLionel Sambuc static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt14_sd(__m128d __A,__m128d __B)395*0a6a1f1dSLionel Sambuc _mm_rsqrt14_sd(__m128d __A, __m128d __B)
396*0a6a1f1dSLionel Sambuc {
397*0a6a1f1dSLionel Sambuc return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
398*0a6a1f1dSLionel Sambuc (__v2df) __B,
399*0a6a1f1dSLionel Sambuc (__v2df)
400*0a6a1f1dSLionel Sambuc _mm_setzero_pd (),
401*0a6a1f1dSLionel Sambuc (__mmask8) -1);
402*0a6a1f1dSLionel Sambuc }
403*0a6a1f1dSLionel Sambuc
404*0a6a1f1dSLionel Sambuc static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_rcp14_pd(__m512d __A)405*0a6a1f1dSLionel Sambuc _mm512_rcp14_pd(__m512d __A)
406*0a6a1f1dSLionel Sambuc {
407*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
408*0a6a1f1dSLionel Sambuc (__v8df)
409*0a6a1f1dSLionel Sambuc _mm512_setzero_pd (),
410*0a6a1f1dSLionel Sambuc (__mmask8) -1);
411*0a6a1f1dSLionel Sambuc }
412*0a6a1f1dSLionel Sambuc
413*0a6a1f1dSLionel Sambuc static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_rcp14_ps(__m512 __A)414*0a6a1f1dSLionel Sambuc _mm512_rcp14_ps(__m512 __A)
415*0a6a1f1dSLionel Sambuc {
416*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
417*0a6a1f1dSLionel Sambuc (__v16sf)
418*0a6a1f1dSLionel Sambuc _mm512_setzero_ps (),
419*0a6a1f1dSLionel Sambuc (__mmask16) -1);
420*0a6a1f1dSLionel Sambuc }
421*0a6a1f1dSLionel Sambuc static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp14_ss(__m128 __A,__m128 __B)422*0a6a1f1dSLionel Sambuc _mm_rcp14_ss(__m128 __A, __m128 __B)
423*0a6a1f1dSLionel Sambuc {
424*0a6a1f1dSLionel Sambuc return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
425*0a6a1f1dSLionel Sambuc (__v4sf) __B,
426*0a6a1f1dSLionel Sambuc (__v4sf)
427*0a6a1f1dSLionel Sambuc _mm_setzero_ps (),
428*0a6a1f1dSLionel Sambuc (__mmask8) -1);
429*0a6a1f1dSLionel Sambuc }
430*0a6a1f1dSLionel Sambuc
431*0a6a1f1dSLionel Sambuc static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_rcp14_sd(__m128d __A,__m128d __B)432*0a6a1f1dSLionel Sambuc _mm_rcp14_sd(__m128d __A, __m128d __B)
433*0a6a1f1dSLionel Sambuc {
434*0a6a1f1dSLionel Sambuc return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
435*0a6a1f1dSLionel Sambuc (__v2df) __B,
436*0a6a1f1dSLionel Sambuc (__v2df)
437*0a6a1f1dSLionel Sambuc _mm_setzero_pd (),
438*0a6a1f1dSLionel Sambuc (__mmask8) -1);
439*0a6a1f1dSLionel Sambuc }
440*0a6a1f1dSLionel Sambuc
441*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_floor_ps(__m512 __A)442*0a6a1f1dSLionel Sambuc _mm512_floor_ps(__m512 __A)
443*0a6a1f1dSLionel Sambuc {
444*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
445*0a6a1f1dSLionel Sambuc _MM_FROUND_FLOOR,
446*0a6a1f1dSLionel Sambuc (__v16sf) __A, -1,
447*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
448*0a6a1f1dSLionel Sambuc }
449*0a6a1f1dSLionel Sambuc
450*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_floor_pd(__m512d __A)451*0a6a1f1dSLionel Sambuc _mm512_floor_pd(__m512d __A)
452*0a6a1f1dSLionel Sambuc {
453*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
454*0a6a1f1dSLionel Sambuc _MM_FROUND_FLOOR,
455*0a6a1f1dSLionel Sambuc (__v8df) __A, -1,
456*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
457*0a6a1f1dSLionel Sambuc }
458*0a6a1f1dSLionel Sambuc
459*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_ceil_ps(__m512 __A)460*0a6a1f1dSLionel Sambuc _mm512_ceil_ps(__m512 __A)
461*0a6a1f1dSLionel Sambuc {
462*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
463*0a6a1f1dSLionel Sambuc _MM_FROUND_CEIL,
464*0a6a1f1dSLionel Sambuc (__v16sf) __A, -1,
465*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
466*0a6a1f1dSLionel Sambuc }
467*0a6a1f1dSLionel Sambuc
468*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_ceil_pd(__m512d __A)469*0a6a1f1dSLionel Sambuc _mm512_ceil_pd(__m512d __A)
470*0a6a1f1dSLionel Sambuc {
471*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
472*0a6a1f1dSLionel Sambuc _MM_FROUND_CEIL,
473*0a6a1f1dSLionel Sambuc (__v8df) __A, -1,
474*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
475*0a6a1f1dSLionel Sambuc }
476*0a6a1f1dSLionel Sambuc
477*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ (( __always_inline__, __nodebug__))
_mm512_abs_epi64(__m512i __A)478*0a6a1f1dSLionel Sambuc _mm512_abs_epi64(__m512i __A)
479*0a6a1f1dSLionel Sambuc {
480*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
481*0a6a1f1dSLionel Sambuc (__v8di)
482*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
483*0a6a1f1dSLionel Sambuc (__mmask8) -1);
484*0a6a1f1dSLionel Sambuc }
485*0a6a1f1dSLionel Sambuc
486*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ (( __always_inline__, __nodebug__))
_mm512_abs_epi32(__m512i __A)487*0a6a1f1dSLionel Sambuc _mm512_abs_epi32(__m512i __A)
488*0a6a1f1dSLionel Sambuc {
489*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
490*0a6a1f1dSLionel Sambuc (__v16si)
491*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
492*0a6a1f1dSLionel Sambuc (__mmask16) -1);
493*0a6a1f1dSLionel Sambuc }
494*0a6a1f1dSLionel Sambuc
495*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_roundscale_ps(__m512 __A,const int __imm)496*0a6a1f1dSLionel Sambuc _mm512_roundscale_ps(__m512 __A, const int __imm)
497*0a6a1f1dSLionel Sambuc {
498*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
499*0a6a1f1dSLionel Sambuc (__v16sf) __A, -1,
500*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
501*0a6a1f1dSLionel Sambuc }
502*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_roundscale_pd(__m512d __A,const int __imm)503*0a6a1f1dSLionel Sambuc _mm512_roundscale_pd(__m512d __A, const int __imm)
504*0a6a1f1dSLionel Sambuc {
505*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
506*0a6a1f1dSLionel Sambuc (__v8df) __A, -1,
507*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
508*0a6a1f1dSLionel Sambuc }
509*0a6a1f1dSLionel Sambuc
510*0a6a1f1dSLionel Sambuc static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fmadd_pd(__m512d __A,__m512d __B,__m512d __C)511*0a6a1f1dSLionel Sambuc _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
512*0a6a1f1dSLionel Sambuc {
513*0a6a1f1dSLionel Sambuc return (__m512d)
514*0a6a1f1dSLionel Sambuc __builtin_ia32_vfmaddpd512_mask(__A,
515*0a6a1f1dSLionel Sambuc __B,
516*0a6a1f1dSLionel Sambuc __C,
517*0a6a1f1dSLionel Sambuc (__mmask8) -1,
518*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
519*0a6a1f1dSLionel Sambuc }
520*0a6a1f1dSLionel Sambuc
521*0a6a1f1dSLionel Sambuc static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fmsub_pd(__m512d __A,__m512d __B,__m512d __C)522*0a6a1f1dSLionel Sambuc _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
523*0a6a1f1dSLionel Sambuc {
524*0a6a1f1dSLionel Sambuc return (__m512d)
525*0a6a1f1dSLionel Sambuc __builtin_ia32_vfmsubpd512_mask(__A,
526*0a6a1f1dSLionel Sambuc __B,
527*0a6a1f1dSLionel Sambuc __C,
528*0a6a1f1dSLionel Sambuc (__mmask8) -1,
529*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
530*0a6a1f1dSLionel Sambuc }
531*0a6a1f1dSLionel Sambuc
532*0a6a1f1dSLionel Sambuc static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fnmadd_pd(__m512d __A,__m512d __B,__m512d __C)533*0a6a1f1dSLionel Sambuc _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
534*0a6a1f1dSLionel Sambuc {
535*0a6a1f1dSLionel Sambuc return (__m512d)
536*0a6a1f1dSLionel Sambuc __builtin_ia32_vfnmaddpd512_mask(__A,
537*0a6a1f1dSLionel Sambuc __B,
538*0a6a1f1dSLionel Sambuc __C,
539*0a6a1f1dSLionel Sambuc (__mmask8) -1,
540*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
541*0a6a1f1dSLionel Sambuc }
542*0a6a1f1dSLionel Sambuc
543*0a6a1f1dSLionel Sambuc static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_fmadd_ps(__m512 __A,__m512 __B,__m512 __C)544*0a6a1f1dSLionel Sambuc _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
545*0a6a1f1dSLionel Sambuc {
546*0a6a1f1dSLionel Sambuc return (__m512)
547*0a6a1f1dSLionel Sambuc __builtin_ia32_vfmaddps512_mask(__A,
548*0a6a1f1dSLionel Sambuc __B,
549*0a6a1f1dSLionel Sambuc __C,
550*0a6a1f1dSLionel Sambuc (__mmask16) -1,
551*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
552*0a6a1f1dSLionel Sambuc }
553*0a6a1f1dSLionel Sambuc
554*0a6a1f1dSLionel Sambuc static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_fmsub_ps(__m512 __A,__m512 __B,__m512 __C)555*0a6a1f1dSLionel Sambuc _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
556*0a6a1f1dSLionel Sambuc {
557*0a6a1f1dSLionel Sambuc return (__m512)
558*0a6a1f1dSLionel Sambuc __builtin_ia32_vfmsubps512_mask(__A,
559*0a6a1f1dSLionel Sambuc __B,
560*0a6a1f1dSLionel Sambuc __C,
561*0a6a1f1dSLionel Sambuc (__mmask16) -1,
562*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
563*0a6a1f1dSLionel Sambuc }
564*0a6a1f1dSLionel Sambuc
565*0a6a1f1dSLionel Sambuc static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_fnmadd_ps(__m512 __A,__m512 __B,__m512 __C)566*0a6a1f1dSLionel Sambuc _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
567*0a6a1f1dSLionel Sambuc {
568*0a6a1f1dSLionel Sambuc return (__m512)
569*0a6a1f1dSLionel Sambuc __builtin_ia32_vfnmaddps512_mask(__A,
570*0a6a1f1dSLionel Sambuc __B,
571*0a6a1f1dSLionel Sambuc __C,
572*0a6a1f1dSLionel Sambuc (__mmask16) -1,
573*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
574*0a6a1f1dSLionel Sambuc }
575*0a6a1f1dSLionel Sambuc
/* Vector permutations */
577*0a6a1f1dSLionel Sambuc
578*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_permutex2var_epi32(__m512i __A,__m512i __I,__m512i __B)579*0a6a1f1dSLionel Sambuc _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
580*0a6a1f1dSLionel Sambuc {
581*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
582*0a6a1f1dSLionel Sambuc /* idx */ ,
583*0a6a1f1dSLionel Sambuc (__v16si) __A,
584*0a6a1f1dSLionel Sambuc (__v16si) __B,
585*0a6a1f1dSLionel Sambuc (__mmask16) -1);
586*0a6a1f1dSLionel Sambuc }
587*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_permutex2var_epi64(__m512i __A,__m512i __I,__m512i __B)588*0a6a1f1dSLionel Sambuc _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
589*0a6a1f1dSLionel Sambuc {
590*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
591*0a6a1f1dSLionel Sambuc /* idx */ ,
592*0a6a1f1dSLionel Sambuc (__v8di) __A,
593*0a6a1f1dSLionel Sambuc (__v8di) __B,
594*0a6a1f1dSLionel Sambuc (__mmask8) -1);
595*0a6a1f1dSLionel Sambuc }
596*0a6a1f1dSLionel Sambuc
597*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_permutex2var_pd(__m512d __A,__m512i __I,__m512d __B)598*0a6a1f1dSLionel Sambuc _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
599*0a6a1f1dSLionel Sambuc {
600*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
601*0a6a1f1dSLionel Sambuc /* idx */ ,
602*0a6a1f1dSLionel Sambuc (__v8df) __A,
603*0a6a1f1dSLionel Sambuc (__v8df) __B,
604*0a6a1f1dSLionel Sambuc (__mmask8) -1);
605*0a6a1f1dSLionel Sambuc }
606*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_permutex2var_ps(__m512 __A,__m512i __I,__m512 __B)607*0a6a1f1dSLionel Sambuc _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
608*0a6a1f1dSLionel Sambuc {
609*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
610*0a6a1f1dSLionel Sambuc /* idx */ ,
611*0a6a1f1dSLionel Sambuc (__v16sf) __A,
612*0a6a1f1dSLionel Sambuc (__v16sf) __B,
613*0a6a1f1dSLionel Sambuc (__mmask16) -1);
614*0a6a1f1dSLionel Sambuc }
615*0a6a1f1dSLionel Sambuc
616*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_valign_epi64(__m512i __A,__m512i __B,const int __I)617*0a6a1f1dSLionel Sambuc _mm512_valign_epi64(__m512i __A, __m512i __B, const int __I)
618*0a6a1f1dSLionel Sambuc {
619*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_alignq512_mask((__v8di)__A,
620*0a6a1f1dSLionel Sambuc (__v8di)__B,
621*0a6a1f1dSLionel Sambuc __I,
622*0a6a1f1dSLionel Sambuc (__v8di)_mm512_setzero_si512(),
623*0a6a1f1dSLionel Sambuc (__mmask8) -1);
624*0a6a1f1dSLionel Sambuc }
625*0a6a1f1dSLionel Sambuc
626*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_valign_epi32(__m512i __A,__m512i __B,const int __I)627*0a6a1f1dSLionel Sambuc _mm512_valign_epi32(__m512i __A, __m512i __B, const int __I)
628*0a6a1f1dSLionel Sambuc {
629*0a6a1f1dSLionel Sambuc return (__m512i)__builtin_ia32_alignd512_mask((__v16si)__A,
630*0a6a1f1dSLionel Sambuc (__v16si)__B,
631*0a6a1f1dSLionel Sambuc __I,
632*0a6a1f1dSLionel Sambuc (__v16si)_mm512_setzero_si512(),
633*0a6a1f1dSLionel Sambuc (__mmask16) -1);
634*0a6a1f1dSLionel Sambuc }
635*0a6a1f1dSLionel Sambuc
/* Vector Blend */
637*0a6a1f1dSLionel Sambuc
638*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_blend_pd(__mmask8 __U,__m512d __A,__m512d __W)639*0a6a1f1dSLionel Sambuc _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
640*0a6a1f1dSLionel Sambuc {
641*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
642*0a6a1f1dSLionel Sambuc (__v8df) __W,
643*0a6a1f1dSLionel Sambuc (__mmask8) __U);
644*0a6a1f1dSLionel Sambuc }
645*0a6a1f1dSLionel Sambuc
646*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_blend_ps(__mmask16 __U,__m512 __A,__m512 __W)647*0a6a1f1dSLionel Sambuc _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
648*0a6a1f1dSLionel Sambuc {
649*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
650*0a6a1f1dSLionel Sambuc (__v16sf) __W,
651*0a6a1f1dSLionel Sambuc (__mmask16) __U);
652*0a6a1f1dSLionel Sambuc }
653*0a6a1f1dSLionel Sambuc
654*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_blend_epi64(__mmask8 __U,__m512i __A,__m512i __W)655*0a6a1f1dSLionel Sambuc _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
656*0a6a1f1dSLionel Sambuc {
657*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
658*0a6a1f1dSLionel Sambuc (__v8di) __W,
659*0a6a1f1dSLionel Sambuc (__mmask8) __U);
660*0a6a1f1dSLionel Sambuc }
661*0a6a1f1dSLionel Sambuc
662*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_blend_epi32(__mmask16 __U,__m512i __A,__m512i __W)663*0a6a1f1dSLionel Sambuc _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
664*0a6a1f1dSLionel Sambuc {
665*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
666*0a6a1f1dSLionel Sambuc (__v16si) __W,
667*0a6a1f1dSLionel Sambuc (__mmask16) __U);
668*0a6a1f1dSLionel Sambuc }
669*0a6a1f1dSLionel Sambuc
/* Compare */
671*0a6a1f1dSLionel Sambuc
672*0a6a1f1dSLionel Sambuc static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
_mm512_cmp_ps_mask(__m512 a,__m512 b,const int p)673*0a6a1f1dSLionel Sambuc _mm512_cmp_ps_mask(__m512 a, __m512 b, const int p)
674*0a6a1f1dSLionel Sambuc {
675*0a6a1f1dSLionel Sambuc return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) a,
676*0a6a1f1dSLionel Sambuc (__v16sf) b, p, (__mmask16) -1,
677*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
678*0a6a1f1dSLionel Sambuc }
679*0a6a1f1dSLionel Sambuc
680*0a6a1f1dSLionel Sambuc static __inline __mmask8 __attribute__ ((__always_inline__, __nodebug__))
_mm512_cmp_pd_mask(__m512d __X,__m512d __Y,const int __P)681*0a6a1f1dSLionel Sambuc _mm512_cmp_pd_mask(__m512d __X, __m512d __Y, const int __P)
682*0a6a1f1dSLionel Sambuc {
683*0a6a1f1dSLionel Sambuc return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
684*0a6a1f1dSLionel Sambuc (__v8df) __Y, __P,
685*0a6a1f1dSLionel Sambuc (__mmask8) -1,
686*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
687*0a6a1f1dSLionel Sambuc }
688*0a6a1f1dSLionel Sambuc
/* Conversion */
690*0a6a1f1dSLionel Sambuc
691*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvttps_epu32(__m512 __A)692*0a6a1f1dSLionel Sambuc _mm512_cvttps_epu32(__m512 __A)
693*0a6a1f1dSLionel Sambuc {
694*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
695*0a6a1f1dSLionel Sambuc (__v16si)
696*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
697*0a6a1f1dSLionel Sambuc (__mmask16) -1,
698*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
699*0a6a1f1dSLionel Sambuc }
700*0a6a1f1dSLionel Sambuc
701*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvt_roundepi32_ps(__m512i __A,const int __R)702*0a6a1f1dSLionel Sambuc _mm512_cvt_roundepi32_ps(__m512i __A, const int __R)
703*0a6a1f1dSLionel Sambuc {
704*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
705*0a6a1f1dSLionel Sambuc (__v16sf)
706*0a6a1f1dSLionel Sambuc _mm512_setzero_ps (),
707*0a6a1f1dSLionel Sambuc (__mmask16) -1,
708*0a6a1f1dSLionel Sambuc __R);
709*0a6a1f1dSLionel Sambuc }
710*0a6a1f1dSLionel Sambuc
711*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvt_roundepu32_ps(__m512i __A,const int __R)712*0a6a1f1dSLionel Sambuc _mm512_cvt_roundepu32_ps(__m512i __A, const int __R)
713*0a6a1f1dSLionel Sambuc {
714*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
715*0a6a1f1dSLionel Sambuc (__v16sf)
716*0a6a1f1dSLionel Sambuc _mm512_setzero_ps (),
717*0a6a1f1dSLionel Sambuc (__mmask16) -1,
718*0a6a1f1dSLionel Sambuc __R);
719*0a6a1f1dSLionel Sambuc }
720*0a6a1f1dSLionel Sambuc
721*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvtepi32_pd(__m256i __A)722*0a6a1f1dSLionel Sambuc _mm512_cvtepi32_pd(__m256i __A)
723*0a6a1f1dSLionel Sambuc {
724*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
725*0a6a1f1dSLionel Sambuc (__v8df)
726*0a6a1f1dSLionel Sambuc _mm512_setzero_pd (),
727*0a6a1f1dSLionel Sambuc (__mmask8) -1);
728*0a6a1f1dSLionel Sambuc }
729*0a6a1f1dSLionel Sambuc
730*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvtepu32_pd(__m256i __A)731*0a6a1f1dSLionel Sambuc _mm512_cvtepu32_pd(__m256i __A)
732*0a6a1f1dSLionel Sambuc {
733*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
734*0a6a1f1dSLionel Sambuc (__v8df)
735*0a6a1f1dSLionel Sambuc _mm512_setzero_pd (),
736*0a6a1f1dSLionel Sambuc (__mmask8) -1);
737*0a6a1f1dSLionel Sambuc }
738*0a6a1f1dSLionel Sambuc static __inline __m256 __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvt_roundpd_ps(__m512d __A,const int __R)739*0a6a1f1dSLionel Sambuc _mm512_cvt_roundpd_ps(__m512d __A, const int __R)
740*0a6a1f1dSLionel Sambuc {
741*0a6a1f1dSLionel Sambuc return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
742*0a6a1f1dSLionel Sambuc (__v8sf)
743*0a6a1f1dSLionel Sambuc _mm256_setzero_ps (),
744*0a6a1f1dSLionel Sambuc (__mmask8) -1,
745*0a6a1f1dSLionel Sambuc __R);
746*0a6a1f1dSLionel Sambuc }
747*0a6a1f1dSLionel Sambuc
748*0a6a1f1dSLionel Sambuc static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvtps_ph(__m512 __A,const int __I)749*0a6a1f1dSLionel Sambuc _mm512_cvtps_ph(__m512 __A, const int __I)
750*0a6a1f1dSLionel Sambuc {
751*0a6a1f1dSLionel Sambuc return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
752*0a6a1f1dSLionel Sambuc __I,
753*0a6a1f1dSLionel Sambuc (__v16hi)
754*0a6a1f1dSLionel Sambuc _mm256_setzero_si256 (),
755*0a6a1f1dSLionel Sambuc -1);
756*0a6a1f1dSLionel Sambuc }
757*0a6a1f1dSLionel Sambuc
758*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvtph_ps(__m256i __A)759*0a6a1f1dSLionel Sambuc _mm512_cvtph_ps(__m256i __A)
760*0a6a1f1dSLionel Sambuc {
761*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
762*0a6a1f1dSLionel Sambuc (__v16sf)
763*0a6a1f1dSLionel Sambuc _mm512_setzero_ps (),
764*0a6a1f1dSLionel Sambuc (__mmask16) -1,
765*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
766*0a6a1f1dSLionel Sambuc }
767*0a6a1f1dSLionel Sambuc
768*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_cvttps_epi32(__m512 a)769*0a6a1f1dSLionel Sambuc _mm512_cvttps_epi32(__m512 a)
770*0a6a1f1dSLionel Sambuc {
771*0a6a1f1dSLionel Sambuc return (__m512i)
772*0a6a1f1dSLionel Sambuc __builtin_ia32_cvttps2dq512_mask((__v16sf) a,
773*0a6a1f1dSLionel Sambuc (__v16si) _mm512_setzero_si512 (),
774*0a6a1f1dSLionel Sambuc (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
775*0a6a1f1dSLionel Sambuc }
776*0a6a1f1dSLionel Sambuc
777*0a6a1f1dSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm512_cvttpd_epi32(__m512d a)778*0a6a1f1dSLionel Sambuc _mm512_cvttpd_epi32(__m512d a)
779*0a6a1f1dSLionel Sambuc {
780*0a6a1f1dSLionel Sambuc return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) a,
781*0a6a1f1dSLionel Sambuc (__v8si)_mm256_setzero_si256(),
782*0a6a1f1dSLionel Sambuc (__mmask8) -1,
783*0a6a1f1dSLionel Sambuc _MM_FROUND_CUR_DIRECTION);
784*0a6a1f1dSLionel Sambuc }
785*0a6a1f1dSLionel Sambuc
786*0a6a1f1dSLionel Sambuc static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvtt_roundpd_epi32(__m512d __A,const int __R)787*0a6a1f1dSLionel Sambuc _mm512_cvtt_roundpd_epi32(__m512d __A, const int __R)
788*0a6a1f1dSLionel Sambuc {
789*0a6a1f1dSLionel Sambuc return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
790*0a6a1f1dSLionel Sambuc (__v8si)
791*0a6a1f1dSLionel Sambuc _mm256_setzero_si256 (),
792*0a6a1f1dSLionel Sambuc (__mmask8) -1,
793*0a6a1f1dSLionel Sambuc __R);
794*0a6a1f1dSLionel Sambuc }
795*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvtt_roundps_epi32(__m512 __A,const int __R)796*0a6a1f1dSLionel Sambuc _mm512_cvtt_roundps_epi32(__m512 __A, const int __R)
797*0a6a1f1dSLionel Sambuc {
798*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
799*0a6a1f1dSLionel Sambuc (__v16si)
800*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
801*0a6a1f1dSLionel Sambuc (__mmask16) -1,
802*0a6a1f1dSLionel Sambuc __R);
803*0a6a1f1dSLionel Sambuc }
804*0a6a1f1dSLionel Sambuc
805*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvt_roundps_epi32(__m512 __A,const int __R)806*0a6a1f1dSLionel Sambuc _mm512_cvt_roundps_epi32(__m512 __A, const int __R)
807*0a6a1f1dSLionel Sambuc {
808*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
809*0a6a1f1dSLionel Sambuc (__v16si)
810*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
811*0a6a1f1dSLionel Sambuc (__mmask16) -1,
812*0a6a1f1dSLionel Sambuc __R);
813*0a6a1f1dSLionel Sambuc }
814*0a6a1f1dSLionel Sambuc static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvt_roundpd_epi32(__m512d __A,const int __R)815*0a6a1f1dSLionel Sambuc _mm512_cvt_roundpd_epi32(__m512d __A, const int __R)
816*0a6a1f1dSLionel Sambuc {
817*0a6a1f1dSLionel Sambuc return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
818*0a6a1f1dSLionel Sambuc (__v8si)
819*0a6a1f1dSLionel Sambuc _mm256_setzero_si256 (),
820*0a6a1f1dSLionel Sambuc (__mmask8) -1,
821*0a6a1f1dSLionel Sambuc __R);
822*0a6a1f1dSLionel Sambuc }
823*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvt_roundps_epu32(__m512 __A,const int __R)824*0a6a1f1dSLionel Sambuc _mm512_cvt_roundps_epu32(__m512 __A, const int __R)
825*0a6a1f1dSLionel Sambuc {
826*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
827*0a6a1f1dSLionel Sambuc (__v16si)
828*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
829*0a6a1f1dSLionel Sambuc (__mmask16) -1,
830*0a6a1f1dSLionel Sambuc __R);
831*0a6a1f1dSLionel Sambuc }
832*0a6a1f1dSLionel Sambuc static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvt_roundpd_epu32(__m512d __A,const int __R)833*0a6a1f1dSLionel Sambuc _mm512_cvt_roundpd_epu32(__m512d __A, const int __R)
834*0a6a1f1dSLionel Sambuc {
835*0a6a1f1dSLionel Sambuc return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
836*0a6a1f1dSLionel Sambuc (__v8si)
837*0a6a1f1dSLionel Sambuc _mm256_setzero_si256 (),
838*0a6a1f1dSLionel Sambuc (__mmask8) -1,
839*0a6a1f1dSLionel Sambuc __R);
840*0a6a1f1dSLionel Sambuc }
841*0a6a1f1dSLionel Sambuc
/* Unpack and Interleave */
843*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_unpackhi_pd(__m512d __a,__m512d __b)844*0a6a1f1dSLionel Sambuc _mm512_unpackhi_pd(__m512d __a, __m512d __b)
845*0a6a1f1dSLionel Sambuc {
846*0a6a1f1dSLionel Sambuc return __builtin_shufflevector(__a, __b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
847*0a6a1f1dSLionel Sambuc }
848*0a6a1f1dSLionel Sambuc
849*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_unpacklo_pd(__m512d __a,__m512d __b)850*0a6a1f1dSLionel Sambuc _mm512_unpacklo_pd(__m512d __a, __m512d __b)
851*0a6a1f1dSLionel Sambuc {
852*0a6a1f1dSLionel Sambuc return __builtin_shufflevector(__a, __b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
853*0a6a1f1dSLionel Sambuc }
854*0a6a1f1dSLionel Sambuc
855*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_unpackhi_ps(__m512 __a,__m512 __b)856*0a6a1f1dSLionel Sambuc _mm512_unpackhi_ps(__m512 __a, __m512 __b)
857*0a6a1f1dSLionel Sambuc {
858*0a6a1f1dSLionel Sambuc return __builtin_shufflevector(__a, __b,
859*0a6a1f1dSLionel Sambuc 2, 18, 3, 19,
860*0a6a1f1dSLionel Sambuc 2+4, 18+4, 3+4, 19+4,
861*0a6a1f1dSLionel Sambuc 2+8, 18+8, 3+8, 19+8,
862*0a6a1f1dSLionel Sambuc 2+12, 18+12, 3+12, 19+12);
863*0a6a1f1dSLionel Sambuc }
864*0a6a1f1dSLionel Sambuc
865*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_unpacklo_ps(__m512 __a,__m512 __b)866*0a6a1f1dSLionel Sambuc _mm512_unpacklo_ps(__m512 __a, __m512 __b)
867*0a6a1f1dSLionel Sambuc {
868*0a6a1f1dSLionel Sambuc return __builtin_shufflevector(__a, __b,
869*0a6a1f1dSLionel Sambuc 0, 16, 1, 17,
870*0a6a1f1dSLionel Sambuc 0+4, 16+4, 1+4, 17+4,
871*0a6a1f1dSLionel Sambuc 0+8, 16+8, 1+8, 17+8,
872*0a6a1f1dSLionel Sambuc 0+12, 16+12, 1+12, 17+12);
873*0a6a1f1dSLionel Sambuc }
874*0a6a1f1dSLionel Sambuc
/* Bit Test */
876*0a6a1f1dSLionel Sambuc
877*0a6a1f1dSLionel Sambuc static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
_mm512_test_epi32_mask(__m512i __A,__m512i __B)878*0a6a1f1dSLionel Sambuc _mm512_test_epi32_mask(__m512i __A, __m512i __B)
879*0a6a1f1dSLionel Sambuc {
880*0a6a1f1dSLionel Sambuc return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
881*0a6a1f1dSLionel Sambuc (__v16si) __B,
882*0a6a1f1dSLionel Sambuc (__mmask16) -1);
883*0a6a1f1dSLionel Sambuc }
884*0a6a1f1dSLionel Sambuc
885*0a6a1f1dSLionel Sambuc static __inline __mmask8 __attribute__ ((__always_inline__, __nodebug__))
_mm512_test_epi64_mask(__m512i __A,__m512i __B)886*0a6a1f1dSLionel Sambuc _mm512_test_epi64_mask(__m512i __A, __m512i __B)
887*0a6a1f1dSLionel Sambuc {
888*0a6a1f1dSLionel Sambuc return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
889*0a6a1f1dSLionel Sambuc (__v8di) __B,
890*0a6a1f1dSLionel Sambuc (__mmask8) -1);
891*0a6a1f1dSLionel Sambuc }
892*0a6a1f1dSLionel Sambuc
/* SIMD load ops */
894*0a6a1f1dSLionel Sambuc
895*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_loadu_epi32(__mmask16 __U,void const * __P)896*0a6a1f1dSLionel Sambuc _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
897*0a6a1f1dSLionel Sambuc {
898*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *)__P,
899*0a6a1f1dSLionel Sambuc (__v16si)
900*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
901*0a6a1f1dSLionel Sambuc (__mmask16) __U);
902*0a6a1f1dSLionel Sambuc }
903*0a6a1f1dSLionel Sambuc
904*0a6a1f1dSLionel Sambuc static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_loadu_epi64(__mmask8 __U,void const * __P)905*0a6a1f1dSLionel Sambuc _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
906*0a6a1f1dSLionel Sambuc {
907*0a6a1f1dSLionel Sambuc return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *)__P,
908*0a6a1f1dSLionel Sambuc (__v8di)
909*0a6a1f1dSLionel Sambuc _mm512_setzero_si512 (),
910*0a6a1f1dSLionel Sambuc (__mmask8) __U);
911*0a6a1f1dSLionel Sambuc }
912*0a6a1f1dSLionel Sambuc
913*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_loadu_ps(__mmask16 __U,void const * __P)914*0a6a1f1dSLionel Sambuc _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
915*0a6a1f1dSLionel Sambuc {
916*0a6a1f1dSLionel Sambuc return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *)__P,
917*0a6a1f1dSLionel Sambuc (__v16sf)
918*0a6a1f1dSLionel Sambuc _mm512_setzero_ps (),
919*0a6a1f1dSLionel Sambuc (__mmask16) __U);
920*0a6a1f1dSLionel Sambuc }
921*0a6a1f1dSLionel Sambuc
922*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_loadu_pd(__mmask8 __U,void const * __P)923*0a6a1f1dSLionel Sambuc _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
924*0a6a1f1dSLionel Sambuc {
925*0a6a1f1dSLionel Sambuc return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *)__P,
926*0a6a1f1dSLionel Sambuc (__v8df)
927*0a6a1f1dSLionel Sambuc _mm512_setzero_pd (),
928*0a6a1f1dSLionel Sambuc (__mmask8) __U);
929*0a6a1f1dSLionel Sambuc }
930*0a6a1f1dSLionel Sambuc
931*0a6a1f1dSLionel Sambuc static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_loadu_pd(double const * __p)932*0a6a1f1dSLionel Sambuc _mm512_loadu_pd(double const *__p)
933*0a6a1f1dSLionel Sambuc {
934*0a6a1f1dSLionel Sambuc struct __loadu_pd {
935*0a6a1f1dSLionel Sambuc __m512d __v;
936*0a6a1f1dSLionel Sambuc } __attribute__((packed, may_alias));
937*0a6a1f1dSLionel Sambuc return ((struct __loadu_pd*)__p)->__v;
938*0a6a1f1dSLionel Sambuc }
939*0a6a1f1dSLionel Sambuc
940*0a6a1f1dSLionel Sambuc static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_loadu_ps(float const * __p)941*0a6a1f1dSLionel Sambuc _mm512_loadu_ps(float const *__p)
942*0a6a1f1dSLionel Sambuc {
943*0a6a1f1dSLionel Sambuc struct __loadu_ps {
944*0a6a1f1dSLionel Sambuc __m512 __v;
945*0a6a1f1dSLionel Sambuc } __attribute__((packed, may_alias));
946*0a6a1f1dSLionel Sambuc return ((struct __loadu_ps*)__p)->__v;
947*0a6a1f1dSLionel Sambuc }
948*0a6a1f1dSLionel Sambuc
/* SIMD store ops */
950*0a6a1f1dSLionel Sambuc
951*0a6a1f1dSLionel Sambuc static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_storeu_epi64(void * __P,__mmask8 __U,__m512i __A)952*0a6a1f1dSLionel Sambuc _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
953*0a6a1f1dSLionel Sambuc {
954*0a6a1f1dSLionel Sambuc __builtin_ia32_storedqudi512_mask ((__v8di *)__P, (__v8di) __A,
955*0a6a1f1dSLionel Sambuc (__mmask8) __U);
956*0a6a1f1dSLionel Sambuc }
957*0a6a1f1dSLionel Sambuc
958*0a6a1f1dSLionel Sambuc static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_storeu_epi32(void * __P,__mmask16 __U,__m512i __A)959*0a6a1f1dSLionel Sambuc _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
960*0a6a1f1dSLionel Sambuc {
961*0a6a1f1dSLionel Sambuc __builtin_ia32_storedqusi512_mask ((__v16si *)__P, (__v16si) __A,
962*0a6a1f1dSLionel Sambuc (__mmask16) __U);
963*0a6a1f1dSLionel Sambuc }
964*0a6a1f1dSLionel Sambuc
965*0a6a1f1dSLionel Sambuc static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_storeu_pd(void * __P,__mmask8 __U,__m512d __A)966*0a6a1f1dSLionel Sambuc _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
967*0a6a1f1dSLionel Sambuc {
968*0a6a1f1dSLionel Sambuc __builtin_ia32_storeupd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
969*0a6a1f1dSLionel Sambuc }
970*0a6a1f1dSLionel Sambuc
971*0a6a1f1dSLionel Sambuc static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_storeu_pd(void * __P,__m512d __A)972*0a6a1f1dSLionel Sambuc _mm512_storeu_pd(void *__P, __m512d __A)
973*0a6a1f1dSLionel Sambuc {
974*0a6a1f1dSLionel Sambuc __builtin_ia32_storeupd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)-1);
975*0a6a1f1dSLionel Sambuc }
976*0a6a1f1dSLionel Sambuc
977*0a6a1f1dSLionel Sambuc static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_storeu_ps(void * __P,__mmask16 __U,__m512 __A)978*0a6a1f1dSLionel Sambuc _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
979*0a6a1f1dSLionel Sambuc {
980*0a6a1f1dSLionel Sambuc __builtin_ia32_storeups512_mask ((__v16sf *)__P, (__v16sf) __A,
981*0a6a1f1dSLionel Sambuc (__mmask16) __U);
982*0a6a1f1dSLionel Sambuc }
983*0a6a1f1dSLionel Sambuc
984*0a6a1f1dSLionel Sambuc static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_storeu_ps(void * __P,__m512 __A)985*0a6a1f1dSLionel Sambuc _mm512_storeu_ps(void *__P, __m512 __A)
986*0a6a1f1dSLionel Sambuc {
987*0a6a1f1dSLionel Sambuc __builtin_ia32_storeups512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)-1);
988*0a6a1f1dSLionel Sambuc }
989*0a6a1f1dSLionel Sambuc
990*0a6a1f1dSLionel Sambuc static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_store_ps(void * __P,__m512 __A)991*0a6a1f1dSLionel Sambuc _mm512_store_ps(void *__P, __m512 __A)
992*0a6a1f1dSLionel Sambuc {
993*0a6a1f1dSLionel Sambuc *(__m512*)__P = __A;
994*0a6a1f1dSLionel Sambuc }
995*0a6a1f1dSLionel Sambuc
996*0a6a1f1dSLionel Sambuc static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_store_pd(void * __P,__m512d __A)997*0a6a1f1dSLionel Sambuc _mm512_store_pd(void *__P, __m512d __A)
998*0a6a1f1dSLionel Sambuc {
999*0a6a1f1dSLionel Sambuc *(__m512d*)__P = __A;
1000*0a6a1f1dSLionel Sambuc }
1001*0a6a1f1dSLionel Sambuc
/* Mask ops */
1003*0a6a1f1dSLionel Sambuc
1004*0a6a1f1dSLionel Sambuc static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
_mm512_knot(__mmask16 __M)1005*0a6a1f1dSLionel Sambuc _mm512_knot(__mmask16 __M)
1006*0a6a1f1dSLionel Sambuc {
1007*0a6a1f1dSLionel Sambuc return __builtin_ia32_knothi(__M);
1008*0a6a1f1dSLionel Sambuc }
1009*0a6a1f1dSLionel Sambuc
/* Integer compare */
1011*0a6a1f1dSLionel Sambuc
1012*0a6a1f1dSLionel Sambuc static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm512_cmpeq_epi32_mask(__m512i __a,__m512i __b)1013*0a6a1f1dSLionel Sambuc _mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
1014*0a6a1f1dSLionel Sambuc return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
1015*0a6a1f1dSLionel Sambuc (__mmask16)-1);
1016*0a6a1f1dSLionel Sambuc }
1017*0a6a1f1dSLionel Sambuc
1018*0a6a1f1dSLionel Sambuc static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm512_mask_cmpeq_epi32_mask(__mmask16 __u,__m512i __a,__m512i __b)1019*0a6a1f1dSLionel Sambuc _mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
1020*0a6a1f1dSLionel Sambuc return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
1021*0a6a1f1dSLionel Sambuc __u);
1022*0a6a1f1dSLionel Sambuc }
1023*0a6a1f1dSLionel Sambuc
1024*0a6a1f1dSLionel Sambuc static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm512_mask_cmpeq_epi64_mask(__mmask8 __u,__m512i __a,__m512i __b)1025*0a6a1f1dSLionel Sambuc _mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
1026*0a6a1f1dSLionel Sambuc return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
1027*0a6a1f1dSLionel Sambuc __u);
1028*0a6a1f1dSLionel Sambuc }
1029*0a6a1f1dSLionel Sambuc
1030*0a6a1f1dSLionel Sambuc static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm512_cmpeq_epi64_mask(__m512i __a,__m512i __b)1031*0a6a1f1dSLionel Sambuc _mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
1032*0a6a1f1dSLionel Sambuc return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
1033*0a6a1f1dSLionel Sambuc (__mmask8)-1);
1034*0a6a1f1dSLionel Sambuc }
1035*0a6a1f1dSLionel Sambuc
1036*0a6a1f1dSLionel Sambuc #endif // __AVX512FINTRIN_H
1037