/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2NIINTRIN_H
#define __AVX10_2NIINTRIN_H

#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
                 __min_vector_width__(256)))

/* VNNI FP16 */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W,
                                                           __m128h __A,
                                                           __m128h __B) {
  return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
                                           (__v8hf)__B);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U,
                                                                 __m128 __W,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_dpph_ps(__W, __A, __B),
                                             (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W,
                                                              __m256h __A,
                                                              __m256h __B) {
  return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
                                           (__v16hf)__B);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
      (__v8sf)_mm256_setzero_ps());
}

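/* Example: a minimal usage sketch for the FP16 dot-product intrinsics above,
 * assuming a target with AVX10.2-256 enabled; `a` and `b` are illustrative,
 * already-initialized vectors. Conceptually, each 32-bit accumulator lane i
 * is updated with the sum of two adjacent FP16 products:
 *   acc[i] += A[2*i] * B[2*i] + A[2*i+1] * B[2*i+1]
 *
 *   __m128  acc = _mm_setzero_ps();
 *   __m128h a = ...;   // eight FP16 values
 *   __m128h b = ...;
 *   acc = _mm_dpph_ps(acc, a, b);            // update all four FP32 lanes
 *   acc = _mm_mask_dpph_ps(acc, 0x5, a, b);  // merge-mask: lanes 0 and 2 only
 */
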
/* VMPSADBW */
#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm)                                 \
  ((__m128i)__builtin_ia32_selectw_128(                                        \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)),                \
      (__v8hi)(__m128i)(W)))

#define _mm_maskz_mpsadbw_epu8(U, A, B, imm)                                   \
  ((__m128i)__builtin_ia32_selectw_128(                                        \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)),                \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm)                              \
  ((__m256i)__builtin_ia32_selectw_256(                                        \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)),           \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm)                                \
  ((__m256i)__builtin_ia32_selectw_256(                                        \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)),           \
      (__v16hi)_mm256_setzero_si256()))

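/* Example: a sketch of the masked MPSADBW wrappers above; `a` and `b` are
 * illustrative operands, and the immediate selects the source blocks exactly
 * as in the unmasked _mm_mpsadbw_epu8.
 *
 *   __m128i sads = _mm_maskz_mpsadbw_epu8(0x0F, a, b, 0);  // keep words 0-3, zero words 4-7
 */
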
/* VNNI INT8 */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32(
    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32(
    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32(
    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

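/* Example: a sketch of the masked VNNI INT8 forms above. The unmasked
 * _mm_dpbssd_epi32 (an AVX-VNNI-INT8 intrinsic) accumulates four signed-byte
 * products into each 32-bit lane; the masked form keeps the update only in
 * the selected lanes. `a` and `b` are illustrative operands.
 *
 *   __m128i acc = _mm_setzero_si128();
 *   acc = _mm_mask_dpbssd_epi32(acc, 0x3, a, b);  // update lanes 0 and 1 only
 */
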
/* VNNI INT16 */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32(
    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32(
    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32(
    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

/* YMM Rounding */
#define _mm256_add_round_pd(A, B, R)                                           \
  ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A),               \
                                           (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_add_round_pd(W, U, A, B, R)                                \
  ((__m256d)__builtin_ia32_selectpd_256(                                       \
      (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)),               \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_add_round_pd(U, A, B, R)                                  \
  ((__m256d)__builtin_ia32_selectpd_256(                                       \
      (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)),               \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_add_round_ph(A, B, R)                                           \
  ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A),              \
                                           (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m256h)__builtin_ia32_selectph_256(                                       \
      (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)),             \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m256h)__builtin_ia32_selectph_256(                                       \
      (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)),             \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_add_round_ps(A, B, R)                                           \
  ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A),                 \
                                          (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_add_round_ps(W, U, A, B, R)                                \
  ((__m256)__builtin_ia32_selectps_256(                                        \
      (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)),               \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_add_round_ps(U, A, B, R)                                  \
  ((__m256)__builtin_ia32_selectps_256(                                        \
      (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)),               \
      (__v8sf)_mm256_setzero_ps()))

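/* Example: a sketch of the explicitly-rounded 256-bit adds above, with `x`
 * and `y` standing for already-initialized vectors. The rounding argument
 * must be a compile-time constant, e.g. one of the _MM_FROUND_* macros
 * provided via <immintrin.h>.
 *
 *   __m256d s = _mm256_add_round_pd(x, y, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 *   s = _mm256_maskz_add_round_pd(0xF, x, y, _MM_FROUND_CUR_DIRECTION);
 */
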
#define _mm256_cmp_round_pd_mask(A, B, P, R)                                   \
  ((__mmask8)__builtin_ia32_vcmppd256_round_mask(                              \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)-1,      \
      (int)(R)))

#define _mm256_mask_cmp_round_pd_mask(U, A, B, P, R)                           \
  ((__mmask8)__builtin_ia32_vcmppd256_round_mask(                              \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)(U),     \
      (int)(R)))

#define _mm256_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask16)__builtin_ia32_vcmpph256_round_mask(                             \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)-1,   \
      (int)(R)))

#define _mm256_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask16)__builtin_ia32_vcmpph256_round_mask(                             \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)(U),  \
      (int)(R)))

#define _mm256_cmp_round_ps_mask(A, B, P, R)                                   \
  ((__mmask8)__builtin_ia32_vcmpps256_round_mask(                              \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)-1,        \
      (int)(R)))

#define _mm256_mask_cmp_round_ps_mask(U, A, B, P, R)                           \
  ((__mmask8)__builtin_ia32_vcmpps256_round_mask(                              \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)(U),       \
      (int)(R)))

#define _mm256_cvt_roundepi32_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask(                            \
      (__v8si)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepi32_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask((__v8si)(A), (__v8hf)(W),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi32_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask(                            \
      (__v8si)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepi32_ps(A, R)                                         \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A),        \
                                                  (__v8sf)_mm256_setzero_ps(), \
                                                  (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundepi32_ps(W, U, A, R)                              \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask(                             \
      (__v8si)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi32_ps(U, A, R)                                \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A),        \
                                                  (__v8sf)_mm256_setzero_ps(), \
                                                  (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundpd_epi32(A, R)                                         \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1,         \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epi32(W, U, A, R)                              \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epi32(U, A, R)                                \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U),        \
      (int)(R)))

#define _mm256_cvt_roundpd_ph(A, R)                                            \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask(                            \
      (__v4df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundpd_ph(W, U, A, R)                                 \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask((__v4df)(A), (__v8hf)(W),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_ph(U, A, R)                                   \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask(                            \
      (__v4df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundpd_ps(A, R)                                            \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask(                             \
      (__v4df)(__m256d)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundpd_ps(W, U, A, R)                                 \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask(                             \
      (__v4df)(__m256d)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_ps(U, A, R)                                   \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask((__v4df)(__m256d)(A),        \
                                                  (__v4sf)_mm_setzero_ps(),    \
                                                  (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundpd_epi64(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1,      \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epi64(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epi64(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U),     \
      (int)(R)))

#define _mm256_cvt_roundpd_epu32(A, R)                                         \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1,         \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epu32(W, U, A, R)                              \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epu32(U, A, R)                                \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U),        \
      (int)(R)))

#define _mm256_cvt_roundpd_epu64(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1,      \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epu64(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epu64(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U),     \
      (int)(R)))

#define _mm256_cvt_roundph_epi32(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask(                            \
      (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epi32(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask((__v8hf)(A), (__v8si)(W),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epi32(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask(                            \
      (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundph_pd(A, R)                                            \
  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask(                            \
      (__v8hf)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundph_pd(W, U, A, R)                                 \
  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask((__v8hf)(A), (__v4df)(W),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_pd(U, A, R)                                   \
  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask(                            \
      (__v8hf)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm256_cvtx_roundph_ps(A, R)                                           \
  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask(                            \
      (__v8hf)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvtx_roundph_ps(W, U, A, R)                                \
  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask((__v8hf)(A), (__v8sf)(W),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtx_roundph_ps(U, A, R)                                  \
  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask(                            \
      (__v8hf)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundph_epi64(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask(                            \
      (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epi64(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask((__v8hf)(A), (__v4di)(W),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epi64(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask(                            \
      (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundph_epu32(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask(                           \
      (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epu32(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask((__v8hf)(A), (__v8su)(W),  \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epu32(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask(                           \
      (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundph_epu64(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask(                           \
      (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epu64(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask((__v8hf)(A), (__v4du)(W),  \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epu64(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask(                           \
      (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundph_epu16(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask(                            \
      (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epu16(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask((__v16hf)(A), (__v16hu)(W), \
                                                   (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epu16(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask(                            \
      (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U),           \
      (int)(R)))

#define _mm256_cvt_roundph_epi16(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask(                             \
      (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epi16(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask((__v16hf)(A), (__v16hi)(W),  \
                                                  (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epi16(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask(                             \
      (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U),           \
      (int)(R)))

#define _mm256_cvt_roundps_epi32(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask(                            \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epi32(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask(                            \
      (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epi32(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask(                            \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvt_roundps_pd(A, R)                                            \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1,        \
      (int)(R)))

#define _mm256_mask_cvt_roundps_pd(W, U, A, R)                                 \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_pd(U, A, R)                                   \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U),         \
      (int)(R)))

#define _mm256_cvt_roundps_ph(A, I)                                            \
  ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I),    \
                                             (__v8hi)_mm_undefined_si128(),    \
                                             (__mmask8)-1))

/* FIXME: We may use these in this way in the future.
#define _mm256_cvt_roundps_ph(A, I)                                            \
  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask(                            \
      (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_undefined_si128(),            \
      (__mmask8)-1))
#define _mm256_mask_cvt_roundps_ph(U, W, A, I)                                 \
  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask(                            \
      (__v8sf)(__m256)(A), (int)(I), (__v8hi)(__m128i)(U), (__mmask8)(W)))
#define _mm256_maskz_cvt_roundps_ph(W, A, I)                                   \
  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask(                            \
      (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_setzero_si128(),              \
      (__mmask8)(W))) */

#define _mm256_cvtx_roundps_ph(A, R)                                           \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask(                           \
      (__v8sf)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvtx_roundps_ph(W, U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask((__v8sf)(A), (__v8hf)(W),  \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtx_roundps_ph(U, A, R)                                  \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask(                           \
      (__v8sf)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundps_epi64(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epi64(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epi64(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvt_roundps_epu32(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask(                           \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epu32(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask(                           \
      (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epu32(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask(                           \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvt_roundps_epu64(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask(                           \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epu64(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask(                           \
      (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epu64(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask(                           \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvt_roundepi64_pd(A, R)                                         \
  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask(                            \
      (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1,         \
      (int)(R)))

#define _mm256_mask_cvt_roundepi64_pd(W, U, A, R)                              \
  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask(                            \
      (__v4di)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi64_pd(U, A, R)                                \
  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask(                            \
      (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U),        \
      (int)(R)))

#define _mm256_cvt_roundepi64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask(                            \
      (__v4di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepi64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask((__v4di)(A), (__v8hf)(W),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask(                            \
      (__v4di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepi64_ps(A, R)                                         \
  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask(                             \
      (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundepi64_ps(W, U, A, R)                              \
  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask(                             \
      (__v4di)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi64_ps(U, A, R)                                \
  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask((__v4di)(__m256i)(A),        \
                                                  (__v4sf)_mm_setzero_ps(),    \
                                                  (__mmask8)(U), (int)(R)))

#define _mm256_cvtt_roundpd_epi32(A, R)                                        \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1,         \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epi32(W, U, A, R)                             \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epi32(U, A, R)                               \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U),        \
      (int)(R)))

#define _mm256_cvtt_roundpd_epi64(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1,      \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epi64(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epi64(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U),     \
      (int)(R)))

#define _mm256_cvtt_roundpd_epu32(A, R)                                        \
  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask(                          \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1,         \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epu32(W, U, A, R)                             \
  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask(                          \
      (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epu32(U, A, R)                               \
  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask(                          \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U),        \
      (int)(R)))

#define _mm256_cvtt_roundpd_epu64(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask(                          \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1,      \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epu64(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask(                          \
      (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epu64(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask(                          \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U),     \
      (int)(R)))

#define _mm256_cvtt_roundph_epi32(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask(                           \
      (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epi32(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask((__v8hf)(A), (__v8si)(W),  \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epi32(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask(                           \
      (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvtt_roundph_epi64(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask(                           \
      (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epi64(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask((__v8hf)(A), (__v4di)(W),  \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epi64(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask(                           \
      (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvtt_roundph_epu32(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask(                          \
      (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epu32(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \
                                                     (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epu32(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask(                          \
      (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvtt_roundph_epu64(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask(                          \
      (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epu64(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \
                                                     (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epu64(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask(                          \
      (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvtt_roundph_epu16(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask(                           \
      (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epu16(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask(                           \
      (__v16hf)(A), (__v16hu)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epu16(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask(                           \
      (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U),           \
      (int)(R)))

#define _mm256_cvtt_roundph_epi16(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask(                            \
      (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epi16(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \
                                                   (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epi16(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask(                            \
      (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U),           \
      (int)(R)))

#define _mm256_cvtt_roundps_epi32(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask(                           \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epi32(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask(                           \
      (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epi32(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask(                           \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvtt_roundps_epi64(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask(                           \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epi64(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask(                           \
      (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epi64(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask(                           \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvtt_roundps_epu32(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask(                          \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epu32(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask(                          \
      (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epu32(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask(                          \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvtt_roundps_epu64(A, R)                                        \
  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask(                          \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epu64(W, U, A, R)                             \
  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask(                          \
      (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epu64(U, A, R)                               \
  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask(                          \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvt_roundepu32_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask(                           \
      (__v8su)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepu32_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask((__v8su)(A), (__v8hf)(W),  \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu32_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask(                           \
      (__v8su)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepu32_ps(A, R)                                         \
  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask(                            \
      (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1,         \
      (int)(R)))

#define _mm256_mask_cvt_roundepu32_ps(W, U, A, R)                              \
  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask(                            \
      (__v8su)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu32_ps(U, A, R)                                \
  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask(                            \
      (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U),        \
      (int)(R)))

#define _mm256_cvt_roundepu64_pd(A, R)                                         \
  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask(                           \
      (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1,         \
      (int)(R)))

#define _mm256_mask_cvt_roundepu64_pd(W, U, A, R)                              \
  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask(                           \
      (__v4du)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu64_pd(U, A, R)                                \
  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask(                           \
      (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U),        \
      (int)(R)))

#define _mm256_cvt_roundepu64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask(                           \
      (__v4du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepu64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask((__v4du)(A), (__v8hf)(W),  \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask(                           \
      (__v4du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepu64_ps(A, R)                                         \
  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask(                            \
      (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundepu64_ps(W, U, A, R)                              \
  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask(                            \
      (__v4du)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu64_ps(U, A, R)                                \
  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask((__v4du)(__m256i)(A),       \
                                                   (__v4sf)_mm_setzero_ps(),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepu16_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask(                            \
      (__v16hu)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1),           \
      (int)(R)))

#define _mm256_mask_cvt_roundepu16_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask((__v16hu)(A), (__v16hf)(W), \
                                                   (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu16_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask(                            \
      (__v16hu)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

#define _mm256_cvt_roundepi16_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask(                             \
1100       (__v16hi)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1),           \
1101       (int)(R)))
1102 
1103 #define _mm256_mask_cvt_roundepi16_ph(W, U, A, R)                              \
1104   ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask((__v16hi)(A), (__v16hf)(W),  \
1105                                                   (__mmask16)(U), (int)(R)))
1106 
1107 #define _mm256_maskz_cvt_roundepi16_ph(U, A, R)                                \
1108   ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask(                             \
1109       (__v16hi)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
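
/* Usage sketch (illustrative only; the helper name is arbitrary and the
   code assumes a translation unit compiled for AVX10.2-256): convert
   eight unsigned 32-bit integers to single precision, rounding toward
   negative infinity with exceptions suppressed.

     static inline __m256 cvtepu32_ps_rd(__m256i a) {
       return _mm256_cvt_roundepu32_ps(
           a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
     }
*/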
1110 
1111 #define _mm256_div_round_pd(A, B, R)                                           \
1112   ((__m256d)__builtin_ia32_vdivpd256_round((__v4df)(__m256d)(A),               \
1113                                            (__v4df)(__m256d)(B), (int)(R)))
1114 
1115 #define _mm256_mask_div_round_pd(W, U, A, B, R)                                \
1116   ((__m256d)__builtin_ia32_selectpd_256(                                       \
1117       (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)),               \
1118       (__v4df)(__m256d)(W)))
1119 
1120 #define _mm256_maskz_div_round_pd(U, A, B, R)                                  \
1121   ((__m256d)__builtin_ia32_selectpd_256(                                       \
1122       (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)),               \
1123       (__v4df)_mm256_setzero_pd()))
1124 
1125 #define _mm256_div_round_ph(A, B, R)                                           \
1126   ((__m256h)__builtin_ia32_vdivph256_round((__v16hf)(__m256h)(A),              \
1127                                            (__v16hf)(__m256h)(B), (int)(R)))
1128 
1129 #define _mm256_mask_div_round_ph(W, U, A, B, R)                                \
1130   ((__m256h)__builtin_ia32_selectph_256(                                       \
1131       (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)),             \
1132       (__v16hf)(__m256h)(W)))
1133 
1134 #define _mm256_maskz_div_round_ph(U, A, B, R)                                  \
1135   ((__m256h)__builtin_ia32_selectph_256(                                       \
1136       (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)),             \
1137       (__v16hf)_mm256_setzero_ph()))
1138 
1139 #define _mm256_div_round_ps(A, B, R)                                           \
1140   ((__m256)__builtin_ia32_vdivps256_round((__v8sf)(__m256)(A),                 \
1141                                           (__v8sf)(__m256)(B), (int)(R)))
1142 
1143 #define _mm256_mask_div_round_ps(W, U, A, B, R)                                \
1144   ((__m256)__builtin_ia32_selectps_256(                                        \
1145       (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)),               \
1146       (__v8sf)(__m256)(W)))
1147 
1148 #define _mm256_maskz_div_round_ps(U, A, B, R)                                  \
1149   ((__m256)__builtin_ia32_selectps_256(                                        \
1150       (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)),               \
1151       (__v8sf)_mm256_setzero_ps()))
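
/* Usage sketch (illustrative only; helper name is arbitrary, AVX10.2-256
   assumed): elementwise division with an explicit round-toward-zero mode
   and exceptions suppressed.

     static inline __m256 div_rtz(__m256 a, __m256 b) {
       return _mm256_div_round_ps(a, b,
                                  _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
     }
*/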
1152 
1153 #define _mm256_fcmadd_round_pch(A, B, C, R)                                    \
1154   ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3(                          \
1155       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C),        \
1156       (__mmask8)-1, (int)(R)))
1157 
1158 #define _mm256_mask_fcmadd_round_pch(A, U, B, C, R)                            \
1159   ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask(                           \
1160       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C),        \
1161       (__mmask8)(U), (int)(R)))
1162 
1163 #define _mm256_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
1164   ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3(                          \
1165       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C),        \
1166       (__mmask8)(U), (int)(R)))
1167 
1168 #define _mm256_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
1169   ((__m256h)__builtin_ia32_vfcmaddcph256_round_maskz(                          \
1170       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C),        \
1171       (__mmask8)(U), (int)(R)))
1172 
1173 #define _mm256_cmul_round_pch(A, B, R)                                         \
1174   ((__m256h)__builtin_ia32_vfcmulcph256_round_mask(                            \
1175       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B),                              \
1176       (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))
1177 
1178 #define _mm256_mask_cmul_round_pch(W, U, A, B, R)                              \
1179   ((__m256h)__builtin_ia32_vfcmulcph256_round_mask(                            \
1180       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W),        \
1181       (__mmask8)(U), (int)(R)))
1182 
1183 #define _mm256_maskz_cmul_round_pch(U, A, B, R)                                \
1184   ((__m256h)__builtin_ia32_vfcmulcph256_round_mask(                            \
1185       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B),                              \
1186       (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
1187 
1188 #define _mm256_fixupimm_round_pd(A, B, C, imm, R)                              \
1189   ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask(                          \
1190       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C),        \
1191       (int)(imm), (__mmask8)-1, (int)(R)))
1192 
1193 #define _mm256_mask_fixupimm_round_pd(A, U, B, C, imm, R)                      \
1194   ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask(                          \
1195       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C),        \
1196       (int)(imm), (__mmask8)(U), (int)(R)))
1197 
1198 #define _mm256_maskz_fixupimm_round_pd(U, A, B, C, imm, R)                     \
1199   ((__m256d)__builtin_ia32_vfixupimmpd256_round_maskz(                         \
1200       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C),        \
1201       (int)(imm), (__mmask8)(U), (int)(R)))
1202 
1203 #define _mm256_fixupimm_round_ps(A, B, C, imm, R)                              \
1204   ((__m256)__builtin_ia32_vfixupimmps256_round_mask(                           \
1205       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C),          \
1206       (int)(imm), (__mmask8)-1, (int)(R)))
1207 
1208 #define _mm256_mask_fixupimm_round_ps(A, U, B, C, imm, R)                      \
1209   ((__m256)__builtin_ia32_vfixupimmps256_round_mask(                           \
1210       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C),          \
1211       (int)(imm), (__mmask8)(U), (int)(R)))
1212 
1213 #define _mm256_maskz_fixupimm_round_ps(U, A, B, C, imm, R)                     \
1214   ((__m256)__builtin_ia32_vfixupimmps256_round_maskz(                          \
1215       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C),          \
1216       (int)(imm), (__mmask8)(U), (int)(R)))
1217 
1218 #define _mm256_fmadd_round_pd(A, B, C, R)                                      \
1219   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask(                             \
1220       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1221       (__mmask8)-1, (int)(R)))
1222 
1223 #define _mm256_mask_fmadd_round_pd(A, U, B, C, R)                              \
1224   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask(                             \
1225       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1226       (__mmask8)(U), (int)(R)))
1227 
1228 #define _mm256_mask3_fmadd_round_pd(A, B, C, U, R)                             \
1229   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3(                            \
1230       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1231       (__mmask8)(U), (int)(R)))
1232 
1233 #define _mm256_maskz_fmadd_round_pd(U, A, B, C, R)                             \
1234   ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz(                            \
1235       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1236       (__mmask8)(U), (int)(R)))
1237 
1238 #define _mm256_fmsub_round_pd(A, B, C, R)                                      \
1239   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask(                             \
1240       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C),       \
1241       (__mmask8)-1, (int)(R)))
1242 
1243 #define _mm256_mask_fmsub_round_pd(A, U, B, C, R)                              \
1244   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask(                             \
1245       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C),       \
1246       (__mmask8)(U), (int)(R)))
1247 
1248 #define _mm256_maskz_fmsub_round_pd(U, A, B, C, R)                             \
1249   ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz(                            \
1250       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C),       \
1251       (__mmask8)(U), (int)(R)))
1252 
1253 #define _mm256_fnmadd_round_pd(A, B, C, R)                                     \
1254   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask(                             \
1255       -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),       \
1256       (__mmask8)-1, (int)(R)))
1257 
1258 #define _mm256_mask3_fnmadd_round_pd(A, B, C, U, R)                            \
1259   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3(                            \
1260       -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),       \
1261       (__mmask8)(U), (int)(R)))
1262 
1263 #define _mm256_maskz_fnmadd_round_pd(U, A, B, C, R)                            \
1264   ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz(                            \
1265       -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),       \
1266       (__mmask8)(U), (int)(R)))
1267 
1268 #define _mm256_fnmsub_round_pd(A, B, C, R)                                     \
1269   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask(                             \
1270       -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C),      \
1271       (__mmask8)-1, (int)(R)))
1272 
1273 #define _mm256_maskz_fnmsub_round_pd(U, A, B, C, R)                            \
1274   ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz(                            \
1275       -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C),      \
1276       (__mmask8)(U), (int)(R)))
1277 
1278 #define _mm256_fmadd_round_ph(A, B, C, R)                                      \
1279   ((__m256h)__builtin_ia32_vfmaddph256_round_mask(                             \
1280       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1281       (__mmask16)-1, (int)(R)))
1282 
1283 #define _mm256_mask_fmadd_round_ph(A, U, B, C, R)                              \
1284   ((__m256h)__builtin_ia32_vfmaddph256_round_mask(                             \
1285       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1286       (__mmask16)(U), (int)(R)))
1287 
1288 #define _mm256_mask3_fmadd_round_ph(A, B, C, U, R)                             \
1289   ((__m256h)__builtin_ia32_vfmaddph256_round_mask3(                            \
1290       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1291       (__mmask16)(U), (int)(R)))
1292 
1293 #define _mm256_maskz_fmadd_round_ph(U, A, B, C, R)                             \
1294   ((__m256h)__builtin_ia32_vfmaddph256_round_maskz(                            \
1295       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1296       (__mmask16)(U), (int)(R)))
1297 
1298 #define _mm256_fmsub_round_ph(A, B, C, R)                                      \
1299   ((__m256h)__builtin_ia32_vfmaddph256_round_mask(                             \
1300       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C),    \
1301       (__mmask16)-1, (int)(R)))
1302 
1303 #define _mm256_mask_fmsub_round_ph(A, U, B, C, R)                              \
1304   ((__m256h)__builtin_ia32_vfmaddph256_round_mask(                             \
1305       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C),    \
1306       (__mmask16)(U), (int)(R)))
1307 
1308 #define _mm256_maskz_fmsub_round_ph(U, A, B, C, R)                             \
1309   ((__m256h)__builtin_ia32_vfmaddph256_round_maskz(                            \
1310       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C),    \
1311       (__mmask16)(U), (int)(R)))
1312 
1313 #define _mm256_fnmadd_round_ph(A, B, C, R)                                     \
1314   ((__m256h)__builtin_ia32_vfmaddph256_round_mask(                             \
1315       (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),    \
1316       (__mmask16)-1, (int)(R)))
1317 
1318 #define _mm256_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
1319   ((__m256h)__builtin_ia32_vfmaddph256_round_mask3(                            \
1320       -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),    \
1321       (__mmask16)(U), (int)(R)))
1322 
1323 #define _mm256_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
1324   ((__m256h)__builtin_ia32_vfmaddph256_round_maskz(                            \
1325       -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),    \
1326       (__mmask16)(U), (int)(R)))
1327 
1328 #define _mm256_fnmsub_round_ph(A, B, C, R)                                     \
1329   ((__m256h)__builtin_ia32_vfmaddph256_round_mask(                             \
1330       (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C),   \
1331       (__mmask16)-1, (int)(R)))
1332 
1333 #define _mm256_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
1334   ((__m256h)__builtin_ia32_vfmaddph256_round_maskz(                            \
1335       -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C),   \
1336       (__mmask16)(U), (int)(R)))
1337 
1338 #define _mm256_fmadd_round_ps(A, B, C, R)                                      \
1339   ((__m256)__builtin_ia32_vfmaddps256_round_mask(                              \
1340       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1341       (__mmask8)-1, (int)(R)))
1342 
1343 #define _mm256_mask_fmadd_round_ps(A, U, B, C, R)                              \
1344   ((__m256)__builtin_ia32_vfmaddps256_round_mask(                              \
1345       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1346       (__mmask8)(U), (int)(R)))
1347 
1348 #define _mm256_mask3_fmadd_round_ps(A, B, C, U, R)                             \
1349   ((__m256)__builtin_ia32_vfmaddps256_round_mask3(                             \
1350       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1351       (__mmask8)(U), (int)(R)))
1352 
1353 #define _mm256_maskz_fmadd_round_ps(U, A, B, C, R)                             \
1354   ((__m256)__builtin_ia32_vfmaddps256_round_maskz(                             \
1355       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1356       (__mmask8)(U), (int)(R)))
1357 
1358 #define _mm256_fmsub_round_ps(A, B, C, R)                                      \
1359   ((__m256)__builtin_ia32_vfmaddps256_round_mask(                              \
1360       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C),          \
1361       (__mmask8)-1, (int)(R)))
1362 
1363 #define _mm256_mask_fmsub_round_ps(A, U, B, C, R)                              \
1364   ((__m256)__builtin_ia32_vfmaddps256_round_mask(                              \
1365       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C),          \
1366       (__mmask8)(U), (int)(R)))
1367 
1368 #define _mm256_maskz_fmsub_round_ps(U, A, B, C, R)                             \
1369   ((__m256)__builtin_ia32_vfmaddps256_round_maskz(                             \
1370       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C),          \
1371       (__mmask8)(U), (int)(R)))
1372 
1373 #define _mm256_fnmadd_round_ps(A, B, C, R)                                     \
1374   ((__m256)__builtin_ia32_vfmaddps256_round_mask(                              \
1375       (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C),          \
1376       (__mmask8)-1, (int)(R)))
1377 
1378 #define _mm256_mask3_fnmadd_round_ps(A, B, C, U, R)                            \
1379   ((__m256)__builtin_ia32_vfmaddps256_round_mask3(                             \
1380       -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),          \
1381       (__mmask8)(U), (int)(R)))
1382 
1383 #define _mm256_maskz_fnmadd_round_ps(U, A, B, C, R)                            \
1384   ((__m256)__builtin_ia32_vfmaddps256_round_maskz(                             \
1385       -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),          \
1386       (__mmask8)(U), (int)(R)))
1387 
1388 #define _mm256_fnmsub_round_ps(A, B, C, R)                                     \
1389   ((__m256)__builtin_ia32_vfmaddps256_round_mask(                              \
1390       (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C),         \
1391       (__mmask8)-1, (int)(R)))
1392 
1393 #define _mm256_maskz_fnmsub_round_ps(U, A, B, C, R)                            \
1394   ((__m256)__builtin_ia32_vfmaddps256_round_maskz(                             \
1395       -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C),         \
1396       (__mmask8)(U), (int)(R)))
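
/* Usage sketch (illustrative only; helper name is arbitrary, AVX10.2-256
   assumed): fused a*b + c on four doubles with an explicit
   round-to-nearest mode and exceptions suppressed.

     static inline __m256d fmadd_rn(__m256d a, __m256d b, __m256d c) {
       return _mm256_fmadd_round_pd(
           a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
     }
*/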
1397 
1398 #define _mm256_fmadd_round_pch(A, B, C, R)                                     \
1399   ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3(                           \
1400       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C),        \
1401       (__mmask8)-1, (int)(R)))
1402 
1403 #define _mm256_mask_fmadd_round_pch(A, U, B, C, R)                             \
1404   ((__m256h)__builtin_ia32_vfmaddcph256_round_mask(                            \
1405       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C),        \
1406       (__mmask8)(U), (int)(R)))
1407 
1408 #define _mm256_mask3_fmadd_round_pch(A, B, C, U, R)                            \
1409   ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3(                           \
1410       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C),        \
1411       (__mmask8)(U), (int)(R)))
1412 
1413 #define _mm256_maskz_fmadd_round_pch(U, A, B, C, R)                            \
1414   ((__m256h)__builtin_ia32_vfmaddcph256_round_maskz(                           \
1415       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C),        \
1416       (__mmask8)(U), (int)(R)))
1417 
1418 #define _mm256_fmaddsub_round_pd(A, B, C, R)                                   \
1419   ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask(                          \
1420       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1421       (__mmask8)-1, (int)(R)))
1422 
1423 #define _mm256_mask_fmaddsub_round_pd(A, U, B, C, R)                           \
1424   ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask(                          \
1425       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1426       (__mmask8)(U), (int)(R)))
1427 
1428 #define _mm256_mask3_fmaddsub_round_pd(A, B, C, U, R)                          \
1429   ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask3(                         \
1430       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1431       (__mmask8)(U), (int)(R)))
1432 
1433 #define _mm256_maskz_fmaddsub_round_pd(U, A, B, C, R)                          \
1434   ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz(                         \
1435       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1436       (__mmask8)(U), (int)(R)))
1437 
1438 #define _mm256_fmsubadd_round_pd(A, B, C, R)                                   \
1439   ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask(                          \
1440       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C),       \
1441       (__mmask8)-1, (int)(R)))
1442 
1443 #define _mm256_mask_fmsubadd_round_pd(A, U, B, C, R)                           \
1444   ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask(                          \
1445       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C),       \
1446       (__mmask8)(U), (int)(R)))
1447 
1448 #define _mm256_maskz_fmsubadd_round_pd(U, A, B, C, R)                          \
1449   ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz(                         \
1450       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C),       \
1451       (__mmask8)(U), (int)(R)))
1452 
1453 #define _mm256_fmaddsub_round_ph(A, B, C, R)                                   \
1454   ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask(                          \
1455       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1456       (__mmask16)-1, (int)(R)))
1457 
1458 #define _mm256_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
1459   ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask(                          \
1460       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1461       (__mmask16)(U), (int)(R)))
1462 
1463 #define _mm256_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
1464   ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask3(                         \
1465       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1466       (__mmask16)(U), (int)(R)))
1467 
1468 #define _mm256_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
1469   ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz(                         \
1470       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1471       (__mmask16)(U), (int)(R)))
1472 
1473 #define _mm256_fmsubadd_round_ph(A, B, C, R)                                   \
1474   ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask(                          \
1475       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C),    \
1476       (__mmask16)-1, (int)(R)))
1477 
1478 #define _mm256_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
1479   ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask(                          \
1480       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C),    \
1481       (__mmask16)(U), (int)(R)))
1482 
1483 #define _mm256_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
1484   ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz(                         \
1485       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C),    \
1486       (__mmask16)(U), (int)(R)))
1487 
1488 #define _mm256_fmaddsub_round_ps(A, B, C, R)                                   \
1489   ((__m256)__builtin_ia32_vfmaddsubps256_round_mask(                           \
1490       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1491       (__mmask8)-1, (int)(R)))
1492 
1493 #define _mm256_mask_fmaddsub_round_ps(A, U, B, C, R)                           \
1494   ((__m256)__builtin_ia32_vfmaddsubps256_round_mask(                           \
1495       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1496       (__mmask8)(U), (int)(R)))
1497 
1498 #define _mm256_mask3_fmaddsub_round_ps(A, B, C, U, R)                          \
1499   ((__m256)__builtin_ia32_vfmaddsubps256_round_mask3(                          \
1500       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1501       (__mmask8)(U), (int)(R)))
1502 
1503 #define _mm256_maskz_fmaddsub_round_ps(U, A, B, C, R)                          \
1504   ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz(                          \
1505       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1506       (__mmask8)(U), (int)(R)))
1507 
1508 #define _mm256_fmsubadd_round_ps(A, B, C, R)                                   \
1509   ((__m256)__builtin_ia32_vfmaddsubps256_round_mask(                           \
1510       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C),          \
1511       (__mmask8)-1, (int)(R)))
1512 
1513 #define _mm256_mask_fmsubadd_round_ps(A, U, B, C, R)                           \
1514   ((__m256)__builtin_ia32_vfmaddsubps256_round_mask(                           \
1515       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C),          \
1516       (__mmask8)(U), (int)(R)))
1517 
1518 #define _mm256_maskz_fmsubadd_round_ps(U, A, B, C, R)                          \
1519   ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz(                          \
1520       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C),          \
1521       (__mmask8)(U), (int)(R)))

1522 #define _mm256_mask3_fmsub_round_pd(A, B, C, U, R)                             \
1523   ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3(                            \
1524       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1525       (__mmask8)(U), (int)(R)))
1526 
1527 #define _mm256_mask3_fmsubadd_round_pd(A, B, C, U, R)                          \
1528   ((__m256d)__builtin_ia32_vfmsubaddpd256_round_mask3(                         \
1529       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),        \
1530       (__mmask8)(U), (int)(R)))
1531 
1532 #define _mm256_mask_fnmadd_round_pd(A, U, B, C, R)                             \
1533   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask(                             \
1534       (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), (__v4df)(__m256d)(C),       \
1535       (__mmask8)(U), (int)(R)))
1536 
1537 #define _mm256_mask_fnmsub_round_pd(A, U, B, C, R)                             \
1538   ((__m256d)__builtin_ia32_vfmaddpd256_round_mask(                             \
1539       (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), -(__v4df)(__m256d)(C),      \
1540       (__mmask8)(U), (int)(R)))
1541 
1542 #define _mm256_mask3_fnmsub_round_pd(A, B, C, U, R)                            \
1543   ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3(                            \
1544       -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C),       \
1545       (__mmask8)(U), (int)(R)))
1546 
1547 #define _mm256_mask3_fmsub_round_ph(A, B, C, U, R)                             \
1548   ((__m256h)__builtin_ia32_vfmsubph256_round_mask3(                            \
1549       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1550       (__mmask16)(U), (int)(R)))
1551 
1552 #define _mm256_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
1553   ((__m256h)__builtin_ia32_vfmsubaddph256_round_mask3(                         \
1554       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),     \
1555       (__mmask16)(U), (int)(R)))
1556 
1557 #define _mm256_mask_fnmadd_round_ph(A, U, B, C, R)                             \
1558   ((__m256h)__builtin_ia32_vfmaddph256_round_mask(                             \
1559       (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),    \
1560       (__mmask16)(U), (int)(R)))
1561 
1562 #define _mm256_mask_fnmsub_round_ph(A, U, B, C, R)                             \
1563   ((__m256h)__builtin_ia32_vfmaddph256_round_mask(                             \
1564       (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C),   \
1565       (__mmask16)(U), (int)(R)))
1566 
1567 #define _mm256_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
1568   ((__m256h)__builtin_ia32_vfmsubph256_round_mask3(                            \
1569       -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C),    \
1570       (__mmask16)(U), (int)(R)))
1571 
1572 #define _mm256_mask3_fmsub_round_ps(A, B, C, U, R)                             \
1573   ((__m256)__builtin_ia32_vfmsubps256_round_mask3(                             \
1574       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1575       (__mmask8)(U), (int)(R)))
1576 
1577 #define _mm256_mask3_fmsubadd_round_ps(A, B, C, U, R)                          \
1578   ((__m256)__builtin_ia32_vfmsubaddps256_round_mask3(                          \
1579       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),           \
1580       (__mmask8)(U), (int)(R)))
1581 
1582 #define _mm256_mask_fnmadd_round_ps(A, U, B, C, R)                             \
1583   ((__m256)__builtin_ia32_vfmaddps256_round_mask(                              \
1584       (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C),          \
1585       (__mmask8)(U), (int)(R)))
1586 
1587 #define _mm256_mask_fnmsub_round_ps(A, U, B, C, R)                             \
1588   ((__m256)__builtin_ia32_vfmaddps256_round_mask(                              \
1589       (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C),         \
1590       (__mmask8)(U), (int)(R)))
1591 
1592 #define _mm256_mask3_fnmsub_round_ps(A, B, C, U, R)                            \
1593   ((__m256)__builtin_ia32_vfmsubps256_round_mask3(                             \
1594       -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C),          \
1595       (__mmask8)(U), (int)(R)))
1596 
1597 #define _mm256_mul_round_pch(A, B, R)                                          \
1598   ((__m256h)__builtin_ia32_vfmulcph256_round_mask(                             \
1599       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B),                              \
1600       (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))
1601 
1602 #define _mm256_mask_mul_round_pch(W, U, A, B, R)                               \
1603   ((__m256h)__builtin_ia32_vfmulcph256_round_mask(                             \
1604       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W),        \
1605       (__mmask8)(U), (int)(R)))
1606 
1607 #define _mm256_maskz_mul_round_pch(U, A, B, R)                                 \
1608   ((__m256h)__builtin_ia32_vfmulcph256_round_mask(                             \
1609       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B),                              \
1610       (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
1611 
1612 #define _mm256_getexp_round_pd(A, R)                                           \
1613   ((__m256d)__builtin_ia32_vgetexppd256_round_mask(                            \
1614       (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1,       \
1615       (int)(R)))
1616 
1617 #define _mm256_mask_getexp_round_pd(W, U, A, R)                                \
1618   ((__m256d)__builtin_ia32_vgetexppd256_round_mask(                            \
1619       (__v4df)(__m256d)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
1620 
1621 #define _mm256_maskz_getexp_round_pd(U, A, R)                                  \
1622   ((__m256d)__builtin_ia32_vgetexppd256_round_mask(                            \
1623       (__v4df)(__m256d)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U),        \
1624       (int)(R)))
1625 
1626 #define _mm256_getexp_round_ph(A, R)                                           \
1627   ((__m256h)__builtin_ia32_vgetexpph256_round_mask(                            \
1628       (__v16hf)(__m256h)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)-1,    \
1629       (int)(R)))
1630 
1631 #define _mm256_mask_getexp_round_ph(W, U, A, R)                                \
1632   ((__m256h)__builtin_ia32_vgetexpph256_round_mask(                            \
1633       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))
1634 
1635 #define _mm256_maskz_getexp_round_ph(U, A, R)                                  \
1636   ((__m256h)__builtin_ia32_vgetexpph256_round_mask(                            \
1637       (__v16hf)(__m256h)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U),     \
1638       (int)(R)))
1639 
1640 #define _mm256_getexp_round_ps(A, R)                                           \
1641   ((__m256)__builtin_ia32_vgetexpps256_round_mask(                             \
1642       (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)-1,        \
1643       (int)(R)))
1644 
1645 #define _mm256_mask_getexp_round_ps(W, U, A, R)                                \
1646   ((__m256)__builtin_ia32_vgetexpps256_round_mask(                             \
1647       (__v8sf)(__m256)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))
1648 
1649 #define _mm256_maskz_getexp_round_ps(U, A, R)                                  \
1650   ((__m256)__builtin_ia32_vgetexpps256_round_mask((__v8sf)(__m256)(A),         \
1651                                                   (__v8sf)_mm256_setzero_ps(), \
1652                                                   (__mmask8)(U), (int)(R)))
1653 
1654 #define _mm256_getmant_round_pd(A, B, C, R)                                    \
1655   ((__m256d)__builtin_ia32_vgetmantpd256_round_mask(                           \
1656       (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)),                           \
1657       (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
1658 
1659 #define _mm256_mask_getmant_round_pd(W, U, A, B, C, R)                         \
1660   ((__m256d)__builtin_ia32_vgetmantpd256_round_mask(                           \
1661       (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), (__v4df)(__m256d)(W),     \
1662       (__mmask8)(U), (int)(R)))
1663 
1664 #define _mm256_maskz_getmant_round_pd(U, A, B, C, R)                           \
1665   ((__m256d)__builtin_ia32_vgetmantpd256_round_mask(                           \
1666       (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)),                           \
1667       (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
1668 
1669 #define _mm256_getmant_round_ph(A, B, C, R)                                    \
1670   ((__m256h)__builtin_ia32_vgetmantph256_round_mask(                           \
1671       (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
1672       (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
1673 
1674 #define _mm256_mask_getmant_round_ph(W, U, A, B, C, R)                         \
1675   ((__m256h)__builtin_ia32_vgetmantph256_round_mask(                           \
1676       (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W),   \
1677       (__mmask16)(U), (int)(R)))
1678 
1679 #define _mm256_maskz_getmant_round_ph(U, A, B, C, R)                           \
1680   ((__m256h)__builtin_ia32_vgetmantph256_round_mask(                           \
1681       (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
1682       (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1683 
1684 #define _mm256_getmant_round_ps(A, B, C, R)                                    \
1685   ((__m256)__builtin_ia32_vgetmantps256_round_mask(                            \
1686       (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)),                            \
1687       (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))
1688 
1689 #define _mm256_mask_getmant_round_ps(W, U, A, B, C, R)                         \
1690   ((__m256)__builtin_ia32_vgetmantps256_round_mask(                            \
1691       (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), (__v8sf)(__m256)(W),       \
1692       (__mmask8)(U), (int)(R)))
1693 
1694 #define _mm256_maskz_getmant_round_ps(U, A, B, C, R)                           \
1695   ((__m256)__builtin_ia32_vgetmantps256_round_mask(                            \
1696       (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)),                            \
1697       (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
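
/* Usage sketch (illustrative only; helper name is arbitrary, AVX10.2-256
   assumed): extract mantissas normalized to the interval [1, 2), keeping
   the sign of the source, with floating-point exceptions suppressed (SAE).

     static inline __m256d getmant_1_2(__m256d a) {
       return _mm256_getmant_round_pd(a, _MM_MANT_NORM_1_2,
                                      _MM_MANT_SIGN_src, _MM_FROUND_NO_EXC);
     }
*/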
1698 
1699 #define _mm256_max_round_pd(A, B, R)                                           \
1700   ((__m256d)__builtin_ia32_vmaxpd256_round((__v4df)(__m256d)(A),               \
1701                                            (__v4df)(__m256d)(B), (int)(R)))
1702 
1703 #define _mm256_mask_max_round_pd(W, U, A, B, R)                                \
1704   ((__m256d)__builtin_ia32_selectpd_256(                                       \
1705       (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)),               \
1706       (__v4df)(__m256d)(W)))
1707 
1708 #define _mm256_maskz_max_round_pd(U, A, B, R)                                  \
1709   ((__m256d)__builtin_ia32_selectpd_256(                                       \
1710       (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)),               \
1711       (__v4df)_mm256_setzero_pd()))
1712 
1713 #define _mm256_max_round_ph(A, B, R)                                           \
1714   ((__m256h)__builtin_ia32_vmaxph256_round((__v16hf)(__m256h)(A),              \
1715                                            (__v16hf)(__m256h)(B), (int)(R)))
1716 
1717 #define _mm256_mask_max_round_ph(W, U, A, B, R)                                \
1718   ((__m256h)__builtin_ia32_selectph_256(                                       \
1719       (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)),             \
1720       (__v16hf)(__m256h)(W)))
1721 
1722 #define _mm256_maskz_max_round_ph(U, A, B, R)                                  \
1723   ((__m256h)__builtin_ia32_selectph_256(                                       \
1724       (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)),             \
1725       (__v16hf)_mm256_setzero_ph()))
1726 
1727 #define _mm256_max_round_ps(A, B, R)                                           \
1728   ((__m256)__builtin_ia32_vmaxps256_round((__v8sf)(__m256)(A),                 \
1729                                           (__v8sf)(__m256)(B), (int)(R)))
1730 
1731 #define _mm256_mask_max_round_ps(W, U, A, B, R)                                \
1732   ((__m256)__builtin_ia32_selectps_256(                                        \
1733       (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)),               \
1734       (__v8sf)(__m256)(W)))
1735 
1736 #define _mm256_maskz_max_round_ps(U, A, B, R)                                  \
1737   ((__m256)__builtin_ia32_selectps_256(                                        \
1738       (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)),               \
1739       (__v8sf)_mm256_setzero_ps()))
1740 
1741 #define _mm256_min_round_pd(A, B, R)                                           \
1742   ((__m256d)__builtin_ia32_vminpd256_round((__v4df)(__m256d)(A),               \
1743                                            (__v4df)(__m256d)(B), (int)(R)))
1744 
1745 #define _mm256_mask_min_round_pd(W, U, A, B, R)                                \
1746   ((__m256d)__builtin_ia32_selectpd_256(                                       \
1747       (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)),               \
1748       (__v4df)(__m256d)(W)))
1749 
1750 #define _mm256_maskz_min_round_pd(U, A, B, R)                                  \
1751   ((__m256d)__builtin_ia32_selectpd_256(                                       \
1752       (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)),               \
1753       (__v4df)_mm256_setzero_pd()))
1754 
1755 #define _mm256_min_round_ph(A, B, R)                                           \
1756   ((__m256h)__builtin_ia32_vminph256_round((__v16hf)(__m256h)(A),              \
1757                                            (__v16hf)(__m256h)(B), (int)(R)))
1758 
1759 #define _mm256_mask_min_round_ph(W, U, A, B, R)                                \
1760   ((__m256h)__builtin_ia32_selectph_256(                                       \
1761       (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)),             \
1762       (__v16hf)(__m256h)(W)))
1763 
1764 #define _mm256_maskz_min_round_ph(U, A, B, R)                                  \
1765   ((__m256h)__builtin_ia32_selectph_256(                                       \
1766       (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)),             \
1767       (__v16hf)_mm256_setzero_ph()))
1768 
1769 #define _mm256_min_round_ps(A, B, R)                                           \
1770   ((__m256)__builtin_ia32_vminps256_round((__v8sf)(__m256)(A),                 \
1771                                           (__v8sf)(__m256)(B), (int)(R)))
1772 
1773 #define _mm256_mask_min_round_ps(W, U, A, B, R)                                \
1774   ((__m256)__builtin_ia32_selectps_256(                                        \
1775       (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)),               \
1776       (__v8sf)(__m256)(W)))
1777 
1778 #define _mm256_maskz_min_round_ps(U, A, B, R)                                  \
1779   ((__m256)__builtin_ia32_selectps_256(                                        \
1780       (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)),               \
1781       (__v8sf)_mm256_setzero_ps()))
1782 
1783 #define _mm256_mul_round_pd(A, B, R)                                           \
1784   ((__m256d)__builtin_ia32_vmulpd256_round((__v4df)(__m256d)(A),               \
1785                                            (__v4df)(__m256d)(B), (int)(R)))
1786 
1787 #define _mm256_mask_mul_round_pd(W, U, A, B, R)                                \
1788   ((__m256d)__builtin_ia32_selectpd_256(                                       \
1789       (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)),               \
1790       (__v4df)(__m256d)(W)))
1791 
1792 #define _mm256_maskz_mul_round_pd(U, A, B, R)                                  \
1793   ((__m256d)__builtin_ia32_selectpd_256(                                       \
1794       (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)),               \
1795       (__v4df)_mm256_setzero_pd()))
1796 
1797 #define _mm256_mul_round_ph(A, B, R)                                           \
1798   ((__m256h)__builtin_ia32_vmulph256_round((__v16hf)(__m256h)(A),              \
1799                                            (__v16hf)(__m256h)(B), (int)(R)))
1800 
1801 #define _mm256_mask_mul_round_ph(W, U, A, B, R)                                \
1802   ((__m256h)__builtin_ia32_selectph_256(                                       \
1803       (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)),             \
1804       (__v16hf)(__m256h)(W)))
1805 
1806 #define _mm256_maskz_mul_round_ph(U, A, B, R)                                  \
1807   ((__m256h)__builtin_ia32_selectph_256(                                       \
1808       (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)),             \
1809       (__v16hf)_mm256_setzero_ph()))
1810 
1811 #define _mm256_mul_round_ps(A, B, R)                                           \
1812   ((__m256)__builtin_ia32_vmulps256_round((__v8sf)(__m256)(A),                 \
1813                                           (__v8sf)(__m256)(B), (int)(R)))
1814 
1815 #define _mm256_mask_mul_round_ps(W, U, A, B, R)                                \
1816   ((__m256)__builtin_ia32_selectps_256(                                        \
1817       (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)),               \
1818       (__v8sf)(__m256)(W)))
1819 
1820 #define _mm256_maskz_mul_round_ps(U, A, B, R)                                  \
1821   ((__m256)__builtin_ia32_selectps_256(                                        \
1822       (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)),               \
1823       (__v8sf)_mm256_setzero_ps()))
1824 
1825 #define _mm256_range_round_pd(A, B, C, R)                                      \
1826   ((__m256d)__builtin_ia32_vrangepd256_round_mask(                             \
1827       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
1828       (__v4df)_mm256_setzero_pd(), (__mmask8)-1, (int)(R)))
1829 
1830 #define _mm256_mask_range_round_pd(W, U, A, B, C, R)                           \
1831   ((__m256d)__builtin_ia32_vrangepd256_round_mask(                             \
1832       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
1833       (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
1834 
1835 #define _mm256_maskz_range_round_pd(U, A, B, C, R)                             \
1836   ((__m256d)__builtin_ia32_vrangepd256_round_mask(                             \
1837       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
1838       (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
1839 
1840 #define _mm256_range_round_ps(A, B, C, R)                                      \
1841   ((__m256)__builtin_ia32_vrangeps256_round_mask(                              \
1842       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C),                      \
1843       (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, (int)(R)))
1844 
1845 #define _mm256_mask_range_round_ps(W, U, A, B, C, R)                           \
1846   ((__m256)__builtin_ia32_vrangeps256_round_mask(                              \
1847       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
1848       (__mmask8)(U), (int)(R)))
1849 
1850 #define _mm256_maskz_range_round_ps(U, A, B, C, R)                             \
1851   ((__m256)__builtin_ia32_vrangeps256_round_mask(                              \
1852       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C),                      \
1853       (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
1854 
1855 #define _mm256_reduce_round_pd(A, B, R)                                        \
1856   ((__m256d)__builtin_ia32_vreducepd256_round_mask(                            \
1857       (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(),             \
1858       (__mmask8)-1, (int)(R)))
1859 
1860 #define _mm256_mask_reduce_round_pd(W, U, A, B, R)                             \
1861   ((__m256d)__builtin_ia32_vreducepd256_round_mask(                            \
1862       (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U),     \
1863       (int)(R)))
1864 
1865 #define _mm256_maskz_reduce_round_pd(U, A, B, R)                               \
1866   ((__m256d)__builtin_ia32_vreducepd256_round_mask(                            \
1867       (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(),             \
1868       (__mmask8)(U), (int)(R)))
1869 
1870 #define _mm256_mask_reduce_round_ph(W, U, A, imm, R)                           \
1871   ((__m256h)__builtin_ia32_vreduceph256_round_mask(                            \
1872       (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
1873       (__mmask16)(U), (int)(R)))
1874 
1875 #define _mm256_maskz_reduce_round_ph(U, A, imm, R)                             \
1876   ((__m256h)__builtin_ia32_vreduceph256_round_mask(                            \
1877       (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
1878       (__mmask16)(U), (int)(R)))
1879 
1880 #define _mm256_reduce_round_ph(A, imm, R)                                      \
1881   ((__m256h)__builtin_ia32_vreduceph256_round_mask(                            \
1882       (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(),       \
1883       (__mmask16)-1, (int)(R)))
1884 
1885 #define _mm256_reduce_round_ps(A, B, R)                                        \
1886   ((__m256)__builtin_ia32_vreduceps256_round_mask(                             \
1887       (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(),              \
1888       (__mmask8)-1, (int)(R)))
1889 
1890 #define _mm256_mask_reduce_round_ps(W, U, A, B, R)                             \
1891   ((__m256)__builtin_ia32_vreduceps256_round_mask(                             \
1892       (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U),       \
1893       (int)(R)))
1894 
1895 #define _mm256_maskz_reduce_round_ps(U, A, B, R)                               \
1896   ((__m256)__builtin_ia32_vreduceps256_round_mask(                             \
1897       (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(),              \
1898       (__mmask8)(U), (int)(R)))
1899 
1900 #define _mm256_roundscale_round_pd(A, imm, R)                                  \
1901   ((__m256d)__builtin_ia32_vrndscalepd256_round_mask(                          \
1902       (__v4df)(__m256d)(A), (int)(imm), (__v4df)_mm256_undefined_pd(),         \
1903       (__mmask8)-1, (int)(R)))
1904 
1905 #define _mm256_mask_roundscale_round_pd(A, B, C, imm, R)                       \
1906   ((__m256d)__builtin_ia32_vrndscalepd256_round_mask(                          \
1907       (__v4df)(__m256d)(C), (int)(imm), (__v4df)(__m256d)(A), (__mmask8)(B),   \
1908       (int)(R)))
1909 
1910 #define _mm256_maskz_roundscale_round_pd(A, B, imm, R)                         \
1911   ((__m256d)__builtin_ia32_vrndscalepd256_round_mask(                          \
1912       (__v4df)(__m256d)(B), (int)(imm), (__v4df)_mm256_setzero_pd(),           \
1913       (__mmask8)(A), (int)(R)))
1914 
1915 #define _mm256_roundscale_round_ph(A, imm, R)                                  \
1916   ((__m256h)__builtin_ia32_vrndscaleph256_round_mask(                          \
1917       (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(),       \
1918       (__mmask16)-1, (int)(R)))
1919 
1920 #define _mm256_mask_roundscale_round_ph(A, B, C, imm, R)                       \
1921   ((__m256h)__builtin_ia32_vrndscaleph256_round_mask(                          \
1922       (__v16hf)(__m256h)(C), (int)(imm), (__v16hf)(__m256h)(A),                \
1923       (__mmask16)(B), (int)(R)))
1924 
1925 #define _mm256_maskz_roundscale_round_ph(A, B, imm, R)                         \
1926   ((__m256h)__builtin_ia32_vrndscaleph256_round_mask(                          \
1927       (__v16hf)(__m256h)(B), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
1928       (__mmask16)(A), (int)(R)))
1929 
1930 #define _mm256_roundscale_round_ps(A, imm, R)                                  \
1931   ((__m256)__builtin_ia32_vrndscaleps256_round_mask(                           \
1932       (__v8sf)(__m256)(A), (int)(imm), (__v8sf)_mm256_undefined_ps(),          \
1933       (__mmask8)-1, (int)(R)))
1934 
1935 #define _mm256_mask_roundscale_round_ps(A, B, C, imm, R)                       \
1936   ((__m256)__builtin_ia32_vrndscaleps256_round_mask(                           \
1937       (__v8sf)(__m256)(C), (int)(imm), (__v8sf)(__m256)(A), (__mmask8)(B),     \
1938       (int)(R)))
1939 
1940 #define _mm256_maskz_roundscale_round_ps(A, B, imm, R)                         \
1941   ((__m256)__builtin_ia32_vrndscaleps256_round_mask(                           \
1942       (__v8sf)(__m256)(B), (int)(imm), (__v8sf)_mm256_setzero_ps(),            \
1943       (__mmask8)(A), (int)(R)))
1944 
#define _mm256_scalef_round_pd(A, B, R)                                        \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B),                              \
      (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_scalef_round_pd(W, U, A, B, R)                             \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_scalef_round_pd(U, A, B, R)                               \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

#define _mm256_scalef_round_ph(A, B, R)                                        \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask(                            \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B),                            \
      (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm256_mask_scalef_round_ph(W, U, A, B, R)                             \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask(                            \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_scalef_round_ph(U, A, B, R)                               \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask(                            \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B),                            \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

#define _mm256_scalef_round_ps(A, B, R)                                        \
  ((__m256)__builtin_ia32_vscalefps256_round_mask(                             \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_scalef_round_ps(W, U, A, B, R)                             \
  ((__m256)__builtin_ia32_vscalefps256_round_mask(                             \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(W),           \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_scalef_round_ps(U, A, B, R)                               \
  ((__m256)__builtin_ia32_vscalefps256_round_mask(                             \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_setzero_ps(),   \
      (__mmask8)(U), (int)(R)))

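/* Illustrative sketch, not part of the upstream header: the scalef macros
 * compute A * 2^floor(B) per element. The helper name and the rounding mode
 * are assumptions chosen for demonstration only. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
__avx10_example_scalef_rz_pd(__m256d __A, __m256d __B) {
  /* Scale each double in __A by a power of two taken from __B, rounding
   * toward zero with exceptions suppressed. */
  return _mm256_scalef_round_pd(__A, __B,
                                _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
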
#define _mm256_sqrt_round_pd(A, R)                                             \
  ((__m256d)__builtin_ia32_vsqrtpd256_round((__v4df)(__m256d)(A), (int)(R)))

#define _mm256_mask_sqrt_round_pd(W, U, A, R)                                  \
  ((__m256d)__builtin_ia32_selectpd_256(                                       \
      (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)),                   \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_sqrt_round_pd(U, A, R)                                    \
  ((__m256d)__builtin_ia32_selectpd_256(                                       \
      (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)),                   \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_sqrt_round_ph(A, R)                                             \
  ((__m256h)__builtin_ia32_vsqrtph256_round((__v16hf)(__m256h)(A), (int)(R)))

#define _mm256_mask_sqrt_round_ph(W, U, A, R)                                  \
  ((__m256h)__builtin_ia32_selectph_256(                                       \
      (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)),                 \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_sqrt_round_ph(U, A, R)                                    \
  ((__m256h)__builtin_ia32_selectph_256(                                       \
      (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)),                 \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_sqrt_round_ps(A, R)                                             \
  ((__m256)__builtin_ia32_vsqrtps256_round((__v8sf)(__m256)(A), (int)(R)))

#define _mm256_mask_sqrt_round_ps(W, U, A, R)                                  \
  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U),                          \
                                       (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
                                       (__v8sf)(__m256)(W)))

#define _mm256_maskz_sqrt_round_ps(U, A, R)                                    \
  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U),                          \
                                       (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
                                       (__v8sf)_mm256_setzero_ps()))

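/* Illustrative sketch, not part of the upstream header: merge-masked square
 * root with an explicit rounding mode. The helper name is an assumption;
 * lanes whose mask bit is clear keep the corresponding value from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
__avx10_example_mask_sqrt_rn_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  /* Round-to-nearest-even square root of the selected _Float16 lanes;
   * unselected lanes pass __W through unchanged. */
  return _mm256_mask_sqrt_round_ph(
      __W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
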
#define _mm256_sub_round_pd(A, B, R)                                           \
  ((__m256d)__builtin_ia32_vsubpd256_round((__v4df)(__m256d)(A),               \
                                           (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_sub_round_pd(W, U, A, B, R)                                \
  ((__m256d)__builtin_ia32_selectpd_256(                                       \
      (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)),               \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_sub_round_pd(U, A, B, R)                                  \
  ((__m256d)__builtin_ia32_selectpd_256(                                       \
      (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)),               \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_sub_round_ph(A, B, R)                                           \
  ((__m256h)__builtin_ia32_vsubph256_round((__v16hf)(__m256h)(A),              \
                                           (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m256h)__builtin_ia32_selectph_256(                                       \
      (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)),             \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m256h)__builtin_ia32_selectph_256(                                       \
      (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)),             \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_sub_round_ps(A, B, R)                                           \
  ((__m256)__builtin_ia32_vsubps256_round((__v8sf)(__m256)(A),                 \
                                          (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_sub_round_ps(W, U, A, B, R)                                \
  ((__m256)__builtin_ia32_selectps_256(                                        \
      (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)),               \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_sub_round_ps(U, A, B, R)                                  \
  ((__m256)__builtin_ia32_selectps_256(                                        \
      (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)),               \
      (__v8sf)_mm256_setzero_ps()))

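/* Illustrative sketch, not part of the upstream header: zero-masked
 * subtraction with round-toward-negative-infinity. The helper name is an
 * assumption; lanes whose mask bit is clear are zeroed. */
static __inline__ __m256 __DEFAULT_FN_ATTRS256
__avx10_example_maskz_sub_rd_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  /* Compute __A - __B in each selected lane, rounding toward -infinity;
   * unselected lanes are set to +0.0f. */
  return _mm256_maskz_sub_round_ps(
      __U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}
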
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128

#endif /* __AVX10_2NIINTRIN_H */
#endif /* __SSE2__ */