xref: /openbsd-src/gnu/llvm/clang/lib/Headers/avx512fp16intrin.h (revision 12c855180aad702bbcca06e0398d774beeafb155)
1*12c85518Srobert /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
2*12c85518Srobert  *
3*12c85518Srobert  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*12c85518Srobert  * See https://llvm.org/LICENSE.txt for license information.
5*12c85518Srobert  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*12c85518Srobert  *
7*12c85518Srobert  *===-----------------------------------------------------------------------===
8*12c85518Srobert  */
9*12c85518Srobert #ifndef __IMMINTRIN_H
10*12c85518Srobert #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
11*12c85518Srobert #endif
12*12c85518Srobert 
13*12c85518Srobert #ifdef __SSE2__
14*12c85518Srobert 
15*12c85518Srobert #ifndef __AVX512FP16INTRIN_H
16*12c85518Srobert #define __AVX512FP16INTRIN_H
17*12c85518Srobert 
18*12c85518Srobert /* Define the default attributes for the functions in this file. */
19*12c85518Srobert typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
20*12c85518Srobert typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
21*12c85518Srobert typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
22*12c85518Srobert 
23*12c85518Srobert /* Define the default attributes for the functions in this file. */
24*12c85518Srobert #define __DEFAULT_FN_ATTRS512                                                  \
25*12c85518Srobert   __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
26*12c85518Srobert                  __min_vector_width__(512)))
27*12c85518Srobert #define __DEFAULT_FN_ATTRS256                                                  \
28*12c85518Srobert   __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
29*12c85518Srobert                  __min_vector_width__(256)))
30*12c85518Srobert #define __DEFAULT_FN_ATTRS128                                                  \
31*12c85518Srobert   __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
32*12c85518Srobert                  __min_vector_width__(128)))
33*12c85518Srobert 
_mm512_cvtsh_h(__m512h __a)34*12c85518Srobert static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
35*12c85518Srobert   return __a[0];
36*12c85518Srobert }
37*12c85518Srobert 
_mm_setzero_ph(void)38*12c85518Srobert static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
39*12c85518Srobert   return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
40*12c85518Srobert }
41*12c85518Srobert 
_mm256_setzero_ph(void)42*12c85518Srobert static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
43*12c85518Srobert   return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
44*12c85518Srobert                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
45*12c85518Srobert }
46*12c85518Srobert 
_mm256_undefined_ph(void)47*12c85518Srobert static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
48*12c85518Srobert   return (__m256h)__builtin_ia32_undef256();
49*12c85518Srobert }
50*12c85518Srobert 
_mm512_setzero_ph(void)51*12c85518Srobert static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
52*12c85518Srobert   return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
53*12c85518Srobert                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
54*12c85518Srobert                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
55*12c85518Srobert }
56*12c85518Srobert 
_mm_undefined_ph(void)57*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
58*12c85518Srobert   return (__m128h)__builtin_ia32_undef128();
59*12c85518Srobert }
60*12c85518Srobert 
_mm512_undefined_ph(void)61*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
62*12c85518Srobert   return (__m512h)__builtin_ia32_undef512();
63*12c85518Srobert }
64*12c85518Srobert 
_mm512_set1_ph(_Float16 __h)65*12c85518Srobert static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
66*12c85518Srobert   return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
67*12c85518Srobert                             __h, __h, __h, __h, __h, __h, __h, __h,
68*12c85518Srobert                             __h, __h, __h, __h, __h, __h, __h, __h,
69*12c85518Srobert                             __h, __h, __h, __h, __h, __h, __h, __h};
70*12c85518Srobert }
71*12c85518Srobert 
72*12c85518Srobert static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1,_Float16 __h2,_Float16 __h3,_Float16 __h4,_Float16 __h5,_Float16 __h6,_Float16 __h7,_Float16 __h8,_Float16 __h9,_Float16 __h10,_Float16 __h11,_Float16 __h12,_Float16 __h13,_Float16 __h14,_Float16 __h15,_Float16 __h16,_Float16 __h17,_Float16 __h18,_Float16 __h19,_Float16 __h20,_Float16 __h21,_Float16 __h22,_Float16 __h23,_Float16 __h24,_Float16 __h25,_Float16 __h26,_Float16 __h27,_Float16 __h28,_Float16 __h29,_Float16 __h30,_Float16 __h31,_Float16 __h32)73*12c85518Srobert _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
74*12c85518Srobert               _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
75*12c85518Srobert               _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
76*12c85518Srobert               _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
77*12c85518Srobert               _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
78*12c85518Srobert               _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
79*12c85518Srobert               _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
80*12c85518Srobert               _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
81*12c85518Srobert   return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
82*12c85518Srobert                             __h25, __h24, __h23, __h22, __h21, __h20, __h19,
83*12c85518Srobert                             __h18, __h17, __h16, __h15, __h14, __h13, __h12,
84*12c85518Srobert                             __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
85*12c85518Srobert                             __h4,  __h3,  __h2,  __h1};
86*12c85518Srobert }
87*12c85518Srobert 
/* Construct a [32 x half] vector with arguments listed from the lowest
   element to the highest; implemented by reversing into _mm512_set_ph. */
#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
                (h5), (h4), (h3), (h2), (h1))
95*12c85518Srobert 
96*12c85518Srobert static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex h)97*12c85518Srobert _mm512_set1_pch(_Float16 _Complex h) {
98*12c85518Srobert   return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
99*12c85518Srobert }
100*12c85518Srobert 
_mm_castph_ps(__m128h __a)101*12c85518Srobert static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
102*12c85518Srobert   return (__m128)__a;
103*12c85518Srobert }
104*12c85518Srobert 
_mm256_castph_ps(__m256h __a)105*12c85518Srobert static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
106*12c85518Srobert   return (__m256)__a;
107*12c85518Srobert }
108*12c85518Srobert 
_mm512_castph_ps(__m512h __a)109*12c85518Srobert static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
110*12c85518Srobert   return (__m512)__a;
111*12c85518Srobert }
112*12c85518Srobert 
_mm_castph_pd(__m128h __a)113*12c85518Srobert static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
114*12c85518Srobert   return (__m128d)__a;
115*12c85518Srobert }
116*12c85518Srobert 
_mm256_castph_pd(__m256h __a)117*12c85518Srobert static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
118*12c85518Srobert   return (__m256d)__a;
119*12c85518Srobert }
120*12c85518Srobert 
_mm512_castph_pd(__m512h __a)121*12c85518Srobert static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
122*12c85518Srobert   return (__m512d)__a;
123*12c85518Srobert }
124*12c85518Srobert 
_mm_castph_si128(__m128h __a)125*12c85518Srobert static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
126*12c85518Srobert   return (__m128i)__a;
127*12c85518Srobert }
128*12c85518Srobert 
129*12c85518Srobert static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a)130*12c85518Srobert _mm256_castph_si256(__m256h __a) {
131*12c85518Srobert   return (__m256i)__a;
132*12c85518Srobert }
133*12c85518Srobert 
134*12c85518Srobert static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a)135*12c85518Srobert _mm512_castph_si512(__m512h __a) {
136*12c85518Srobert   return (__m512i)__a;
137*12c85518Srobert }
138*12c85518Srobert 
_mm_castps_ph(__m128 __a)139*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
140*12c85518Srobert   return (__m128h)__a;
141*12c85518Srobert }
142*12c85518Srobert 
_mm256_castps_ph(__m256 __a)143*12c85518Srobert static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
144*12c85518Srobert   return (__m256h)__a;
145*12c85518Srobert }
146*12c85518Srobert 
_mm512_castps_ph(__m512 __a)147*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
148*12c85518Srobert   return (__m512h)__a;
149*12c85518Srobert }
150*12c85518Srobert 
_mm_castpd_ph(__m128d __a)151*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
152*12c85518Srobert   return (__m128h)__a;
153*12c85518Srobert }
154*12c85518Srobert 
_mm256_castpd_ph(__m256d __a)155*12c85518Srobert static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
156*12c85518Srobert   return (__m256h)__a;
157*12c85518Srobert }
158*12c85518Srobert 
_mm512_castpd_ph(__m512d __a)159*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
160*12c85518Srobert   return (__m512h)__a;
161*12c85518Srobert }
162*12c85518Srobert 
_mm_castsi128_ph(__m128i __a)163*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
164*12c85518Srobert   return (__m128h)__a;
165*12c85518Srobert }
166*12c85518Srobert 
167*12c85518Srobert static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a)168*12c85518Srobert _mm256_castsi256_ph(__m256i __a) {
169*12c85518Srobert   return (__m256h)__a;
170*12c85518Srobert }
171*12c85518Srobert 
172*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a)173*12c85518Srobert _mm512_castsi512_ph(__m512i __a) {
174*12c85518Srobert   return (__m512h)__a;
175*12c85518Srobert }
176*12c85518Srobert 
177*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a)178*12c85518Srobert _mm256_castph256_ph128(__m256h __a) {
179*12c85518Srobert   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
180*12c85518Srobert }
181*12c85518Srobert 
182*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a)183*12c85518Srobert _mm512_castph512_ph128(__m512h __a) {
184*12c85518Srobert   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
185*12c85518Srobert }
186*12c85518Srobert 
187*12c85518Srobert static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a)188*12c85518Srobert _mm512_castph512_ph256(__m512h __a) {
189*12c85518Srobert   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
190*12c85518Srobert                                  12, 13, 14, 15);
191*12c85518Srobert }
192*12c85518Srobert 
193*12c85518Srobert static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a)194*12c85518Srobert _mm256_castph128_ph256(__m128h __a) {
195*12c85518Srobert   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
196*12c85518Srobert                                  -1, -1, -1, -1, -1);
197*12c85518Srobert }
198*12c85518Srobert 
199*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a)200*12c85518Srobert _mm512_castph128_ph512(__m128h __a) {
201*12c85518Srobert   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
202*12c85518Srobert                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
203*12c85518Srobert                                  -1, -1, -1, -1, -1, -1, -1, -1, -1);
204*12c85518Srobert }
205*12c85518Srobert 
206*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a)207*12c85518Srobert _mm512_castph256_ph512(__m256h __a) {
208*12c85518Srobert   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
209*12c85518Srobert                                  12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
210*12c85518Srobert                                  -1, -1, -1, -1, -1, -1, -1, -1);
211*12c85518Srobert }
212*12c85518Srobert 
213*12c85518Srobert /// Constructs a 256-bit floating-point vector of [16 x half] from a
214*12c85518Srobert ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
215*12c85518Srobert ///    contain the value of the source vector. The upper 384 bits are set
216*12c85518Srobert ///    to zero.
217*12c85518Srobert ///
218*12c85518Srobert /// \headerfile <x86intrin.h>
219*12c85518Srobert ///
220*12c85518Srobert /// This intrinsic has no corresponding instruction.
221*12c85518Srobert ///
222*12c85518Srobert /// \param __a
223*12c85518Srobert ///    A 128-bit vector of [8 x half].
224*12c85518Srobert /// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits
225*12c85518Srobert ///    contain the value of the parameter. The upper 384 bits are set to zero.
226*12c85518Srobert static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a)227*12c85518Srobert _mm256_zextph128_ph256(__m128h __a) {
228*12c85518Srobert   return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
229*12c85518Srobert                                  5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
230*12c85518Srobert }
231*12c85518Srobert 
232*12c85518Srobert /// Constructs a 512-bit floating-point vector of [32 x half] from a
233*12c85518Srobert ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
234*12c85518Srobert ///    contain the value of the source vector. The upper 384 bits are set
235*12c85518Srobert ///    to zero.
236*12c85518Srobert ///
237*12c85518Srobert /// \headerfile <x86intrin.h>
238*12c85518Srobert ///
239*12c85518Srobert /// This intrinsic has no corresponding instruction.
240*12c85518Srobert ///
241*12c85518Srobert /// \param __a
242*12c85518Srobert ///    A 128-bit vector of [8 x half].
243*12c85518Srobert /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
244*12c85518Srobert ///    contain the value of the parameter. The upper 384 bits are set to zero.
245*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a)246*12c85518Srobert _mm512_zextph128_ph512(__m128h __a) {
247*12c85518Srobert   return __builtin_shufflevector(
248*12c85518Srobert       __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
249*12c85518Srobert       13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
250*12c85518Srobert }
251*12c85518Srobert 
252*12c85518Srobert /// Constructs a 512-bit floating-point vector of [32 x half] from a
253*12c85518Srobert ///    256-bit floating-point vector of [16 x half]. The lower 256 bits
254*12c85518Srobert ///    contain the value of the source vector. The upper 256 bits are set
255*12c85518Srobert ///    to zero.
256*12c85518Srobert ///
257*12c85518Srobert /// \headerfile <x86intrin.h>
258*12c85518Srobert ///
259*12c85518Srobert /// This intrinsic has no corresponding instruction.
260*12c85518Srobert ///
261*12c85518Srobert /// \param __a
262*12c85518Srobert ///    A 256-bit vector of [16 x half].
263*12c85518Srobert /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
264*12c85518Srobert ///    contain the value of the parameter. The upper 256 bits are set to zero.
265*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a)266*12c85518Srobert _mm512_zextph256_ph512(__m256h __a) {
267*12c85518Srobert   return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
268*12c85518Srobert                                  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
269*12c85518Srobert                                  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
270*12c85518Srobert                                  29, 30, 31);
271*12c85518Srobert }
272*12c85518Srobert 
/* Scalar half-precision compare returning 0/1, with explicit predicate P and
   rounding/exception control R. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

/* Same, using the current rounding direction. */
#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
278*12c85518Srobert 
_mm_comieq_sh(__m128h A,__m128h B)279*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
280*12c85518Srobert                                                           __m128h B) {
281*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
282*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
283*12c85518Srobert }
284*12c85518Srobert 
_mm_comilt_sh(__m128h A,__m128h B)285*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
286*12c85518Srobert                                                           __m128h B) {
287*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
288*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
289*12c85518Srobert }
290*12c85518Srobert 
_mm_comile_sh(__m128h A,__m128h B)291*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
292*12c85518Srobert                                                           __m128h B) {
293*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
294*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
295*12c85518Srobert }
296*12c85518Srobert 
_mm_comigt_sh(__m128h A,__m128h B)297*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
298*12c85518Srobert                                                           __m128h B) {
299*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
300*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
301*12c85518Srobert }
302*12c85518Srobert 
_mm_comige_sh(__m128h A,__m128h B)303*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
304*12c85518Srobert                                                           __m128h B) {
305*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
306*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
307*12c85518Srobert }
308*12c85518Srobert 
_mm_comineq_sh(__m128h A,__m128h B)309*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
310*12c85518Srobert                                                            __m128h B) {
311*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
312*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
313*12c85518Srobert }
314*12c85518Srobert 
_mm_ucomieq_sh(__m128h A,__m128h B)315*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
316*12c85518Srobert                                                            __m128h B) {
317*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
318*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
319*12c85518Srobert }
320*12c85518Srobert 
_mm_ucomilt_sh(__m128h A,__m128h B)321*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
322*12c85518Srobert                                                            __m128h B) {
323*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
324*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
325*12c85518Srobert }
326*12c85518Srobert 
_mm_ucomile_sh(__m128h A,__m128h B)327*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
328*12c85518Srobert                                                            __m128h B) {
329*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
330*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
331*12c85518Srobert }
332*12c85518Srobert 
_mm_ucomigt_sh(__m128h A,__m128h B)333*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
334*12c85518Srobert                                                            __m128h B) {
335*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
336*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
337*12c85518Srobert }
338*12c85518Srobert 
_mm_ucomige_sh(__m128h A,__m128h B)339*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
340*12c85518Srobert                                                            __m128h B) {
341*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
342*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
343*12c85518Srobert }
344*12c85518Srobert 
_mm_ucomineq_sh(__m128h A,__m128h B)345*12c85518Srobert static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
346*12c85518Srobert                                                             __m128h B) {
347*12c85518Srobert   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
348*12c85518Srobert                                 _MM_FROUND_CUR_DIRECTION);
349*12c85518Srobert }
350*12c85518Srobert 
_mm512_add_ph(__m512h __A,__m512h __B)351*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
352*12c85518Srobert                                                               __m512h __B) {
353*12c85518Srobert   return (__m512h)((__v32hf)__A + (__v32hf)__B);
354*12c85518Srobert }
355*12c85518Srobert 
356*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)357*12c85518Srobert _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
358*12c85518Srobert   return (__m512h)__builtin_ia32_selectph_512(
359*12c85518Srobert       (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
360*12c85518Srobert }
361*12c85518Srobert 
362*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U,__m512h __A,__m512h __B)363*12c85518Srobert _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
364*12c85518Srobert   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
365*12c85518Srobert                                               (__v32hf)_mm512_add_ph(__A, __B),
366*12c85518Srobert                                               (__v32hf)_mm512_setzero_ph());
367*12c85518Srobert }
368*12c85518Srobert 
/* Addition with explicit rounding control R, plus merge- and zero-masked
   variants. */
#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
382*12c85518Srobert 
_mm512_sub_ph(__m512h __A,__m512h __B)383*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
384*12c85518Srobert                                                               __m512h __B) {
385*12c85518Srobert   return (__m512h)((__v32hf)__A - (__v32hf)__B);
386*12c85518Srobert }
387*12c85518Srobert 
388*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)389*12c85518Srobert _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
390*12c85518Srobert   return (__m512h)__builtin_ia32_selectph_512(
391*12c85518Srobert       (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
392*12c85518Srobert }
393*12c85518Srobert 
394*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U,__m512h __A,__m512h __B)395*12c85518Srobert _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
396*12c85518Srobert   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
397*12c85518Srobert                                               (__v32hf)_mm512_sub_ph(__A, __B),
398*12c85518Srobert                                               (__v32hf)_mm512_setzero_ph());
399*12c85518Srobert }
400*12c85518Srobert 
/* Subtraction with explicit rounding control R, plus merge- and zero-masked
   variants. */
#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
414*12c85518Srobert 
_mm512_mul_ph(__m512h __A,__m512h __B)415*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
416*12c85518Srobert                                                               __m512h __B) {
417*12c85518Srobert   return (__m512h)((__v32hf)__A * (__v32hf)__B);
418*12c85518Srobert }
419*12c85518Srobert 
420*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)421*12c85518Srobert _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
422*12c85518Srobert   return (__m512h)__builtin_ia32_selectph_512(
423*12c85518Srobert       (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
424*12c85518Srobert }
425*12c85518Srobert 
426*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U,__m512h __A,__m512h __B)427*12c85518Srobert _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
428*12c85518Srobert   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
429*12c85518Srobert                                               (__v32hf)_mm512_mul_ph(__A, __B),
430*12c85518Srobert                                               (__v32hf)_mm512_setzero_ph());
431*12c85518Srobert }
432*12c85518Srobert 
/* Multiplication with explicit rounding control R, plus merge- and
   zero-masked variants. */
#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
446*12c85518Srobert 
_mm512_div_ph(__m512h __A,__m512h __B)447*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
448*12c85518Srobert                                                               __m512h __B) {
449*12c85518Srobert   return (__m512h)((__v32hf)__A / (__v32hf)__B);
450*12c85518Srobert }
451*12c85518Srobert 
452*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)453*12c85518Srobert _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
454*12c85518Srobert   return (__m512h)__builtin_ia32_selectph_512(
455*12c85518Srobert       (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
456*12c85518Srobert }
457*12c85518Srobert 
458*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U,__m512h __A,__m512h __B)459*12c85518Srobert _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
460*12c85518Srobert   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
461*12c85518Srobert                                               (__v32hf)_mm512_div_ph(__A, __B),
462*12c85518Srobert                                               (__v32hf)_mm512_setzero_ph());
463*12c85518Srobert }
464*12c85518Srobert 
/* Division with explicit rounding control R, plus merge- and zero-masked
   variants. */
#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
478*12c85518Srobert 
/* Per-element minimum of 32 packed _Float16 values, using the current
   rounding direction (no embedded rounding override). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked minimum: inactive lanes take the value from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

/* Zero-masked minimum: inactive lanes are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}
497*12c85518Srobert 
/* SAE-control variants of the packed minimum; R must be a compile-time
   constant, hence macros. */
#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

/* Merge-masked form: inactive lanes take W. */
#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: inactive lanes are zeroed. */
#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
511*12c85518Srobert 
/* Per-element maximum of 32 packed _Float16 values, using the current
   rounding direction (no embedded rounding override). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked maximum: inactive lanes take the value from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

/* Zero-masked maximum: inactive lanes are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}
530*12c85518Srobert 
/* SAE-control variants of the packed maximum; R must be a compile-time
   constant, hence macros. */
#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

/* Merge-masked form: inactive lanes take W. */
#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: inactive lanes are zeroed. */
#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
544*12c85518Srobert 
/* Absolute value of 32 packed _Float16 elements: ANDing each 32-bit lane
   with 0x7FFF7FFF clears the sign bit of both _Float16 halves at once. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}
548*12c85518Srobert 
/* Complex conjugate of 16 packed complex-FP16 values.  Each complex number
   occupies one 32-bit lane (real in the low half, imaginary in the high
   half), so XORing with -0.0f flips only bit 31 of every lane, i.e. the
   sign of the imaginary component. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
}

/* Merge-masked conjugate: one mask bit per complex pair (32-bit lane);
   inactive pairs take the value from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

/* Zero-masked conjugate: inactive complex pairs are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}
565*12c85518Srobert 
/* Scalar FP16 add: only element 0 is computed; elements 1..7 pass
   through unchanged from __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

/* Merge-masked scalar add: if bit 0 of __U is clear, element 0 comes
   from __W instead of the sum. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

/* Zero-masked scalar add: element 0 is zeroed when bit 0 of __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}
586*12c85518Srobert 
/* Rounding-control variants of the scalar FP16 add; R must be a
   compile-time constant.  The unmasked form passes an all-ones mask. */
#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

/* Merge-masked form: element 0 comes from W when mask bit 0 is clear. */
#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked form: element 0 is zeroed when mask bit 0 is clear. */
#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
601*12c85518Srobert 
/* Scalar FP16 subtract: only element 0 is computed; elements 1..7 pass
   through unchanged from __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

/* Merge-masked scalar subtract: element 0 comes from __W when bit 0 of
   __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

/* Zero-masked scalar subtract: element 0 is zeroed when bit 0 of __U is
   clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}
622*12c85518Srobert 
/* Rounding-control variants of the scalar FP16 subtract; R must be a
   compile-time constant.  The unmasked form passes an all-ones mask. */
#define _mm_sub_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

/* Merge-masked form: element 0 comes from W when mask bit 0 is clear. */
#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked form: element 0 is zeroed when mask bit 0 is clear. */
#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
637*12c85518Srobert 
/* Scalar FP16 multiply: only element 0 is computed; elements 1..7 pass
   through unchanged from __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

/* Merge-masked scalar multiply: element 0 comes from __W when bit 0 of
   __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

/* Zero-masked scalar multiply: element 0 is zeroed when bit 0 of __U is
   clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}
658*12c85518Srobert 
/* Rounding-control variants of the scalar FP16 multiply; R must be a
   compile-time constant.  The unmasked form passes an all-ones mask. */
#define _mm_mul_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

/* Merge-masked form: element 0 comes from W when mask bit 0 is clear. */
#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked form: element 0 is zeroed when mask bit 0 is clear. */
#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
673*12c85518Srobert 
/* Scalar FP16 divide: only element 0 is computed; elements 1..7 pass
   through unchanged from __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

/* Merge-masked scalar divide: element 0 comes from __W when bit 0 of
   __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

/* Zero-masked scalar divide: element 0 is zeroed when bit 0 of __U is
   clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}
694*12c85518Srobert 
/* Rounding-control variants of the scalar FP16 divide; R must be a
   compile-time constant.  The unmasked form passes an all-ones mask. */
#define _mm_div_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

/* Merge-masked form: element 0 comes from W when mask bit 0 is clear. */
#define _mm_mask_div_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked form: element 0 is zeroed when mask bit 0 is clear. */
#define _mm_maskz_div_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
709*12c85518Srobert 
/* Scalar FP16 minimum of the low elements of __A and __B, using the
   current rounding direction; all-ones mask means unconditional write. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked scalar minimum: element 0 comes from __W when bit 0 of
   __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar minimum: element 0 is zeroed when bit 0 of __U is
   clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
733*12c85518Srobert 
/* SAE-control variants of the scalar FP16 minimum; R must be a
   compile-time constant.  The unmasked form passes an all-ones mask. */
#define _mm_min_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

/* Merge-masked form: element 0 comes from W when mask bit 0 is clear. */
#define _mm_mask_min_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked form: element 0 is zeroed when mask bit 0 is clear. */
#define _mm_maskz_min_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
748*12c85518Srobert 
/* Scalar FP16 maximum of the low elements of __A and __B, using the
   current rounding direction; all-ones mask means unconditional write. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked scalar maximum: element 0 comes from __W when bit 0 of
   __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar maximum: element 0 is zeroed when bit 0 of __U is
   clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
772*12c85518Srobert 
/* SAE-control variants of the scalar FP16 maximum; R must be a
   compile-time constant.  The unmasked form passes an all-ones mask. */
#define _mm_max_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

/* Merge-masked form: element 0 comes from W when mask bit 0 is clear. */
#define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked form: element 0 is zeroed when mask bit 0 is clear. */
#define _mm_maskz_max_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
787*12c85518Srobert 
/* Packed FP16 compare: P selects the comparison predicate and the result
   is one mask bit per element.  The _round_ forms take an explicit SAE
   control R; the plain forms use the current direction.  Macros because
   P and R must be compile-time constants. */
#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)-1, (int)(R)))

/* Zero-masked compare: result bits are ANDed with U via the builtin. */
#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
803*12c85518Srobert 
/* Scalar FP16 compare of the low elements of X and Y under predicate P,
   producing a 1-bit result in an __mmask8.  _round_ forms take an SAE
   control R; the plain forms use the current direction. */
#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)-1, (int)(R)))

/* Masked form: the result bit is qualified by mask M. */
#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P)                                               \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
      _MM_FROUND_CUR_DIRECTION))
// loads with vmovsh:

/* Load a single _Float16 from memory (no alignment requirement — the
   packed/may_alias struct makes the access unaligned and aliasing-safe)
   and zero elements 1..7 of the result. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}
831*12c85518Srobert 
/* Merge-masked scalar load: if bit 0 of __U is set, element 0 is loaded
   from __A, otherwise it is taken from __W.  The shuffle builds the
   pass-through source as { __W[0], 0, 0, 0, 0, 0, 0, 0 } (index 8 picks
   from the zero vector).  Only mask bit 0 is meaningful, hence __U & 1. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}

/* Zero-masked scalar load: element 0 is loaded when bit 0 of __U is set,
   zeroed otherwise; elements 1..7 are always zero. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}
845*12c85518Srobert 
/* Aligned load of 32 packed _Float16 values.  The plain dereference
   requires __p to satisfy __m512h's 64-byte alignment. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

/* Aligned load of 16 packed _Float16 values (32-byte alignment). */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

/* Aligned load of 8 packed _Float16 values (16-byte alignment). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}
859*12c85518Srobert 
/* Unaligned load of 32 packed _Float16 values.  Going through a
   packed/may_alias struct wrapping the 1-byte-aligned __m512h_u type
   drops the alignment requirement without violating strict aliasing. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

/* Unaligned load of 16 packed _Float16 values. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

/* Unaligned load of 8 packed _Float16 values. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}
882*12c85518Srobert 
// stores with vmovsh:

/* Store element 0 of __a to memory as a single _Float16 (unaligned,
   aliasing-safe via the packed/may_alias struct). */
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

/* Masked scalar store: element 0 of __A is written only when bit 0 of
   __U is set (only mask bit 0 applies, hence __U & 1). */
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}
897*12c85518Srobert 
/* Aligned store of 32 packed _Float16 values.  The plain assignment
   requires __P to satisfy __m512h's 64-byte alignment. */
static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

/* Aligned store of 16 packed _Float16 values (32-byte alignment). */
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

/* Aligned store of 8 packed _Float16 values (16-byte alignment). */
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}
912*12c85518Srobert 
_mm512_storeu_ph(void * __P,__m512h __A)913*12c85518Srobert static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
914*12c85518Srobert                                                               __m512h __A) {
915*12c85518Srobert   struct __storeu_ph {
916*12c85518Srobert     __m512h_u __v;
917*12c85518Srobert   } __attribute__((__packed__, __may_alias__));
918*12c85518Srobert   ((struct __storeu_ph *)__P)->__v = __A;
919*12c85518Srobert }
920*12c85518Srobert 
/* Store 16 FP16 elements to __P with no alignment requirement (see the
   512-bit variant for the packed/may_alias rationale).  */
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}
928*12c85518Srobert 
/* Store 8 FP16 elements to __P with no alignment requirement (see the
   512-bit variant for the packed/may_alias rationale).  */
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}
936*12c85518Srobert 
937*12c85518Srobert // moves with vmovsh:
_mm_move_sh(__m128h __a,__m128h __b)938*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
939*12c85518Srobert                                                             __m128h __b) {
940*12c85518Srobert   __a[0] = __b[0];
941*12c85518Srobert   return __a;
942*12c85518Srobert }
943*12c85518Srobert 
/* Masked scalar move: low element is __B[0] if bit 0 of __U is set,
   otherwise __W[0]; upper elements come from __A via _mm_move_sh then the
   select (selectsh only blends element 0).  */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}
950*12c85518Srobert 
/* Zero-masked scalar move: low element is __B[0] if bit 0 of __U is set,
   otherwise 0; upper elements come from __A.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}
957*12c85518Srobert 
958*12c85518Srobert // vmovw:
_mm_cvtsi16_si128(short __a)959*12c85518Srobert static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
960*12c85518Srobert   return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
961*12c85518Srobert }
962*12c85518Srobert 
_mm_cvtsi128_si16(__m128i __a)963*12c85518Srobert static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
964*12c85518Srobert   __v8hi __b = (__v8hi)__a;
965*12c85518Srobert   return __b[0];
966*12c85518Srobert }
967*12c85518Srobert 
/* Approximate reciprocal (vrcpph) of 32 FP16 elements; all-ones mask, the
   passthrough operand is undefined since no lane is masked off.  */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}
972*12c85518Srobert 
973*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W,__mmask32 __U,__m512h __A)974*12c85518Srobert _mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
975*12c85518Srobert   return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
976*12c85518Srobert                                                (__mmask32)__U);
977*12c85518Srobert }
978*12c85518Srobert 
979*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U,__m512h __A)980*12c85518Srobert _mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
981*12c85518Srobert   return (__m512h)__builtin_ia32_rcpph512_mask(
982*12c85518Srobert       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
983*12c85518Srobert }
984*12c85518Srobert 
/* Approximate reciprocal square root (vrsqrtph) of 32 FP16 elements;
   all-ones mask, passthrough undefined.  */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}
989*12c85518Srobert 
990*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W,__mmask32 __U,__m512h __A)991*12c85518Srobert _mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
992*12c85518Srobert   return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
993*12c85518Srobert                                                  (__mmask32)__U);
994*12c85518Srobert }
995*12c85518Srobert 
996*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U,__m512h __A)997*12c85518Srobert _mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
998*12c85518Srobert   return (__m512h)__builtin_ia32_rsqrtph512_mask(
999*12c85518Srobert       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
1000*12c85518Srobert }
1001*12c85518Srobert 
/* vgetmantph: extract the normalized mantissa of each FP16 element.  The
   immediate is ((C) << 2) | (B): B picks the normalization interval and C
   the sign control.  Unmasked form; passthrough is undefined.  */
#define _mm512_getmant_ph(A, B, C)                                             \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
      _MM_FROUND_CUR_DIRECTION))

/* Masked getmant: lanes with a zero bit in U keep the element of W.  */
#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/* Zero-masked getmant: masked-off lanes are zeroed.  */
#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/* Rounding-mode variants: R is an explicit _MM_FROUND_* value instead of
   the current MXCSR direction (getmant only uses R for SAE).  */
#define _mm512_getmant_round_ph(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1032*12c85518Srobert 
/* vgetexpph: extract the biased-unbiased exponent of each FP16 element as
   an FP16 value; unmasked, passthrough undefined.  */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}
1038*12c85518Srobert 
1039*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W,__mmask32 __U,__m512h __A)1040*12c85518Srobert _mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1041*12c85518Srobert   return (__m512h)__builtin_ia32_getexpph512_mask(
1042*12c85518Srobert       (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1043*12c85518Srobert }
1044*12c85518Srobert 
1045*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U,__m512h __A)1046*12c85518Srobert _mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
1047*12c85518Srobert   return (__m512h)__builtin_ia32_getexpph512_mask(
1048*12c85518Srobert       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1049*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
1050*12c85518Srobert }
1051*12c85518Srobert 
/* getexp with explicit rounding/SAE control R (unmasked, masked, and
   zero-masked forms).  */
#define _mm512_getexp_round_ph(A, R)                                           \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R)                                  \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))
1065*12c85518Srobert 
/* vscalefph: compute __A * 2^floor(__B) element-wise; unmasked, passthrough
   undefined.  */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}
1072*12c85518Srobert 
1073*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)1074*12c85518Srobert _mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
1075*12c85518Srobert   return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1076*12c85518Srobert                                                   (__v32hf)__W, (__mmask32)__U,
1077*12c85518Srobert                                                   _MM_FROUND_CUR_DIRECTION);
1078*12c85518Srobert }
1079*12c85518Srobert 
1080*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U,__m512h __A,__m512h __B)1081*12c85518Srobert _mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
1082*12c85518Srobert   return (__m512h)__builtin_ia32_scalefph512_mask(
1083*12c85518Srobert       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1084*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
1085*12c85518Srobert }
1086*12c85518Srobert 
/* scalef with explicit rounding control R (unmasked, masked, zero-masked).  */
#define _mm512_scalef_round_ph(A, B, R)                                        \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1101*12c85518Srobert 
/* vrndscaleph: round each FP16 element to the number of fraction bits given
   in the immediate B/imm.  The unmasked form reuses A as the (irrelevant)
   passthrough operand since the mask is all ones.  */
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1,   \
      _MM_FROUND_CUR_DIRECTION))

/* Masked roundscale: A is the passthrough, B the mask, C the source.  */
#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

/* Zero-masked roundscale: A is the mask, B the source.  */
#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

/* Explicit rounding/SAE variants.  */
#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
                                           (__v32hf)(__m512h)(A),              \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
                                           (__v32hf)_mm512_setzero_ph(),       \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
                                           (__v32hf)_mm512_undefined_ph(),     \
                                           (__mmask32)-1, (int)(R)))
1131*12c85518Srobert 
/* vreduceph: compute the reduction value (src - round(src)) of each FP16
   element, with the rounding precision/mode encoded in imm.  */
#define _mm512_reduce_ph(A, imm)                                               \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

/* Masked reduce: masked-off lanes keep the element of W.  */
#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/* Zero-masked reduce: masked-off lanes are zeroed.  */
#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/* Explicit rounding/SAE variants.  */
#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W),             \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))
1161*12c85518Srobert 
/* Scalar approximate reciprocal: low element is ~1/__B[0], upper elements
   come from __A; unmasked.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}
1167*12c85518Srobert 
/* Masked scalar reciprocal: if bit 0 of __U is clear the low result lane is
   taken from __W instead.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}
1175*12c85518Srobert 
/* Zero-masked scalar reciprocal: low result lane is zeroed when bit 0 of
   __U is clear.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
1182*12c85518Srobert 
/* Scalar approximate reciprocal square root: low element is ~1/sqrt(__B[0]),
   upper elements come from __A; unmasked.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}
1188*12c85518Srobert 
/* Masked scalar rsqrt: the low result lane falls back to __W when bit 0 of
   __U is clear.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}
1196*12c85518Srobert 
1197*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U,__m128h __A,__m128h __B)1198*12c85518Srobert _mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1199*12c85518Srobert   return (__m128h)__builtin_ia32_rsqrtsh_mask(
1200*12c85518Srobert       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1201*12c85518Srobert }
1202*12c85518Srobert 
/* Scalar getmant: low result lane is the normalized mantissa of B[0]
   (immediate formed as ((D) << 2) | (C)); upper lanes come from A.
   _round_ forms take an explicit SAE control R.  */
#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D)                                             \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

/* Masked: low lane falls back to W[0] when bit 0 of U is clear.  */
#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

/* Zero-masked: low lane is zeroed when bit 0 of U is clear.  */
#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1232*12c85518Srobert 
/* Scalar getexp of B[0] with explicit SAE control R; upper lanes from A.  */
#define _mm_getexp_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))
1237*12c85518Srobert 
/* Scalar getexp: low element is the exponent of __B[0] as FP16, upper
   elements come from __A; unmasked.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}
1244*12c85518Srobert 
1245*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1246*12c85518Srobert _mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1247*12c85518Srobert   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1248*12c85518Srobert       (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
1249*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
1250*12c85518Srobert }
1251*12c85518Srobert 
/* Masked scalar getexp with explicit SAE control R.  */
#define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))
1256*12c85518Srobert 
1257*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U,__m128h __A,__m128h __B)1258*12c85518Srobert _mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1259*12c85518Srobert   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1260*12c85518Srobert       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1261*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
1262*12c85518Srobert }
1263*12c85518Srobert 
/* Zero-masked scalar getexp with explicit SAE control R.  */
#define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1268*12c85518Srobert 
/* Scalar scalef (A[0] * 2^floor(B[0])) with explicit rounding control R.  */
#define _mm_scalef_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))
1273*12c85518Srobert 
_mm_scalef_sh(__m128h __A,__m128h __B)1274*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
1275*12c85518Srobert                                                               __m128h __B) {
1276*12c85518Srobert   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1277*12c85518Srobert       (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1278*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
1279*12c85518Srobert }
1280*12c85518Srobert 
1281*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1282*12c85518Srobert _mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1283*12c85518Srobert   return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1284*12c85518Srobert                                                      (__v8hf)__W, (__mmask8)__U,
1285*12c85518Srobert                                                      _MM_FROUND_CUR_DIRECTION);
1286*12c85518Srobert }
1287*12c85518Srobert 
/* Masked scalar scalef with explicit rounding control R.  */
#define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))
1292*12c85518Srobert 
1293*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U,__m128h __A,__m128h __B)1294*12c85518Srobert _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1295*12c85518Srobert   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1296*12c85518Srobert       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1297*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
1298*12c85518Srobert }
1299*12c85518Srobert 
/* Zero-masked scalar scalef with explicit rounding control R.  */
#define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1304*12c85518Srobert 
/* Scalar roundscale: round B[0] per the immediate; upper lanes from A.
   _round_ forms take an explicit rounding/SAE control R.  Note the builtin
   takes the mask before the immediate.  */
#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm)                                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

/* Masked: low lane falls back to W[0] when bit 0 of U is clear.  */
#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), (int)(R)))

/* Zero-masked: low lane is zeroed when bit 0 of U is clear.  */
#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), (int)(R)))
1334*12c85518Srobert 
/* Scalar reduce: compute B[0] - round(B[0]) per the immediate C; upper
   lanes from A.  _round_ forms take an explicit rounding/SAE control R.  */
#define _mm_reduce_sh(A, B, C)                                                 \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

/* Masked: low lane falls back to W[0] when bit 0 of U is clear.  */
#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

/* Zero-masked: low lane is zeroed when bit 0 of U is clear.  */
#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), (int)(R)))
1364*12c85518Srobert 
/* Packed square root of 32 fp16 elements.  The *_round_* macros take an
 * explicit rounding control R; masking is applied afterwards through
 * __builtin_ia32_selectph_512 (merge with W, or zero). */
#define _mm512_sqrt_round_ph(A, R)                                             \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)_mm512_setzero_ph()))

/* sqrt of all 32 fp16 elements of __A, current MXCSR rounding mode. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked sqrt: element i comes from sqrt(__A[i]) when mask bit i is
 * set, otherwise from __W[i]. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)(__m512h)(__W));
}

/* Zero-masked sqrt: elements with a clear mask bit are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)_mm512_setzero_ph());
}
1398*12c85518Srobert 
/* Scalar fp16 sqrt of the low element of B, upper elements copied from A.
 * Masked forms merge with W / zero on a clear mask bit 0; the macros take
 * an explicit rounding control R. */
#define _mm_sqrt_round_sh(A, B, R)                                             \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R)                                    \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

/* Unmasked scalar sqrt, current MXCSR rounding mode. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
                                                            __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}
1420*12c85518Srobert 
_mm_mask_sqrt_sh(__m128h __W,__mmask32 __U,__m128h __A,__m128h __B)1421*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
1422*12c85518Srobert                                                                  __mmask32 __U,
1423*12c85518Srobert                                                                  __m128h __A,
1424*12c85518Srobert                                                                  __m128h __B) {
1425*12c85518Srobert   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1426*12c85518Srobert       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1427*12c85518Srobert       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1428*12c85518Srobert }
1429*12c85518Srobert 
_mm_maskz_sqrt_sh(__mmask32 __U,__m128h __A,__m128h __B)1430*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
1431*12c85518Srobert                                                                   __m128h __A,
1432*12c85518Srobert                                                                   __m128h __B) {
1433*12c85518Srobert   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1434*12c85518Srobert       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1435*12c85518Srobert       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1436*12c85518Srobert }
1437*12c85518Srobert 
/* FP-class tests: each returned mask bit is the result of classifying the
 * corresponding fp16 element against the category immediate `imm`.
 * Packed 512-bit forms test 32 elements; the _sh forms test only the low
 * element.  The mask argument (or -1) pre-masks which elements are
 * tested. */
#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))
1453*12c85518Srobert 
/* Convert 8 packed doubles to 8 packed fp16 values (result in a 128-bit
 * vector).  Macros take an explicit rounding control R; the functions use
 * the current MXCSR rounding mode.  Merge-masked forms use W as the
 * pass-through source; maskz forms zero unselected elements. */
#define _mm512_cvt_roundpd_ph(A, R)                                            \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundpd_ph(W, U, A, R)                                 \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ph(U, A, R)                                   \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1484*12c85518Srobert 
/* Widening conversion: 8 packed fp16 values (low half of a 128-bit
 * vector) to 8 packed doubles.  The conversion is exact, but R still
 * controls SAE / exception suppression in the macro forms; the functions
 * use _MM_FROUND_CUR_DIRECTION. */
#define _mm512_cvt_roundph_pd(A, R)                                            \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
      (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_pd(W, U, A, R)                                 \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_pd(U, A, R)                                   \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
      (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1515*12c85518Srobert 
/* Scalar conversion fp16 -> float: the low element of B is converted to
 * single precision in the low result element; upper elements come from A.
 * Merge-masked forms take W as pass-through, maskz forms zero.  Macros
 * take an explicit rounding control R; the functions use the current
 * MXCSR rounding mode. */
#define _mm_cvt_roundsh_ss(A, B, R)                                            \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
                                               (__v4sf)_mm_undefined_ps(),     \
                                               (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R)                                 \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask(                                \
      (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_ss(U, A, B, R)                                   \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
                                               (__v4sf)_mm_setzero_ps(),       \
                                               (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
                                                            __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
                                                                 __mmask8 __U,
                                                                 __m128 __A,
                                                                 __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
                                                     (__v4sf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
                                                                  __m128 __A,
                                                                  __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1553*12c85518Srobert 
/* Scalar conversion float -> fp16: the low element of B is converted to
 * half precision in the low result element; upper elements come from A.
 * Merge-masked / zero-masked forms as above; macros take an explicit
 * rounding control R, the functions use the current MXCSR mode. */
#define _mm_cvt_roundss_sh(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
                                                (__v8hf)_mm_undefined_ph(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundss_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask(                               \
      (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sh(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
                                                (__v8hf)_mm_setzero_ph(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
                                                             __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
                                                                   __m128h __A,
                                                                   __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1591*12c85518Srobert 
/* Scalar conversion double -> fp16: the low element of B is converted to
 * half precision in the low result element; upper elements come from A.
 * Same masked / rounding-control scheme as the ss <-> sh conversions. */
#define _mm_cvt_roundsd_sh(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
                                                (__v8hf)_mm_undefined_ph(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask(                               \
      (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_sh(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
                                                (__v8hf)_mm_setzero_ph(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
                                                             __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1628*12c85518Srobert 
/* Scalar conversion fp16 -> double: the low element of B is widened to
 * double precision in the low result element; upper elements come from A.
 * Same masked / rounding-control scheme as the other scalar converts. */
#define _mm_cvt_roundsh_sd(A, B, R)                                            \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_undefined_pd(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R)                                 \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask(                               \
      (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_sd(U, A, B, R)                                   \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_setzero_pd(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
                                                             __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
                                                                  __mmask8 __U,
                                                                  __m128d __A,
                                                                  __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1665*12c85518Srobert 
/* Convert 32 packed fp16 values to 32 signed 16-bit integers, rounding
 * per R (macros) or the current MXCSR mode (functions).  Merge-masked
 * and zero-masked forms as elsewhere in this file. */
#define _mm512_cvt_roundph_epi16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_undefined_epi32(), \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_setzero_epi32(),   \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1699*12c85518Srobert 
/* Truncating ("cvtt") conversion: 32 packed fp16 values to 32 signed
 * 16-bit integers.  R in the macro forms is an SAE control; the
 * functions pass _MM_FROUND_CUR_DIRECTION. */
#define _mm512_cvtt_roundph_epi16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask(                                  \
      (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A),                     \
                                             (__v32hi)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1733*12c85518Srobert 
/* Convert 32 packed signed 16-bit integers to 32 packed fp16 values,
 * rounding per R (macros) or the current MXCSR mode (functions). */
#define _mm512_cvt_roundepi16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A),                      \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask(                                   \
      (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1766*12c85518Srobert 
/* Convert 32 packed fp16 values to 32 unsigned 16-bit integers, rounding
 * per R (macros) or the current MXCSR mode (functions). */
#define _mm512_cvt_roundph_epu16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask(                                  \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A),                     \
                                             (__v32hu)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
1793*12c85518Srobert 
1794*12c85518Srobert static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu16(__mmask32 __U,__m512h __A)1795*12c85518Srobert _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1796*12c85518Srobert   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1797*12c85518Srobert       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1798*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
1799*12c85518Srobert }
1800*12c85518Srobert 
/* Truncating ("cvtt") conversion of 32 FP16 elements to unsigned 16-bit
   integers with explicit rounding/SAE operand R.  */
#define _mm512_cvtt_roundph_epu16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask(                                 \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W),      \
                                              (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A),                    \
                                              (__v32hu)_mm512_setzero_epi32(), \
                                              (__mmask32)(U), (int)(R)))

/* Truncating conversion using the current rounding direction.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1834*12c85518Srobert 
/* Convert 32 unsigned 16-bit integers to FP16 with explicit rounding
   mode R.  */
#define _mm512_cvt_roundepu16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A),                     \
                                             (__v32hf)_mm512_undefined_ph(),   \
                                             (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask(                                  \
      (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

/* Same conversion using the current rounding direction.  */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1867*12c85518Srobert 
/* Widening conversion: 16 FP16 elements (a 256-bit vector) to 16 signed
   32-bit integers (a 512-bit vector), with explicit rounding mode R.  */
#define _mm512_cvt_roundph_epi32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask(                                  \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A),                     \
                                             (__v16si)_mm512_setzero_epi32(),  \
                                             (__mmask16)(U), (int)(R)))

/* Same conversion using the current rounding direction.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1901*12c85518Srobert 
/* Widening conversion: 16 FP16 elements to 16 unsigned 32-bit integers,
   with explicit rounding mode R.  */
#define _mm512_cvt_roundph_epu32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask(                                 \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A),                    \
                                              (__v16su)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

/* Same conversion using the current rounding direction.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1935*12c85518Srobert 
/* Narrowing conversion: 16 signed 32-bit integers (512-bit vector) to
   16 FP16 elements (256-bit vector), with explicit rounding mode R.  */
#define _mm512_cvt_roundepi32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A),                     \
                                             (__v16hf)_mm256_undefined_ph(),   \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask(                                  \
      (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

/* Same conversion using the current rounding direction.  */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1968*12c85518Srobert 
/* Narrowing conversion: 16 unsigned 32-bit integers to 16 FP16 elements,
   with explicit rounding mode R.  */
#define _mm512_cvt_roundepu32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask(                                 \
      (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

/* Same conversion using the current rounding direction.  */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2001*12c85518Srobert 
/* Truncating conversion: 16 FP16 elements to 16 signed 32-bit integers,
   with explicit rounding/SAE operand R.  */
#define _mm512_cvtt_roundph_epi32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask(                                 \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A),                    \
                                              (__v16si)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

/* Truncating conversion using the current rounding direction.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2035*12c85518Srobert 
/* Truncating conversion: 16 FP16 elements to 16 unsigned 32-bit integers,
   with explicit rounding/SAE operand R.  */
#define _mm512_cvtt_roundph_epu32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W),     \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U),           \
      (int)(R)))

/* Truncating conversion using the current rounding direction.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2069*12c85518Srobert 
/* Narrowing conversion: 8 signed 64-bit integers (512-bit vector) to
   8 FP16 elements (128-bit vector), with explicit rounding mode R.  */
#define _mm512_cvt_roundepi64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

/* Same conversion using the current rounding direction.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2101*12c85518Srobert 
/* Widening conversion: 8 FP16 elements (128-bit vector) to 8 signed
   64-bit integers (512-bit vector), with explicit rounding mode R.  */
#define _mm512_cvt_roundph_epi64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A),                      \
                                             (__v8di)_mm512_undefined_epi32(), \
                                             (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask(                                  \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

/* Same conversion using the current rounding direction.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2134*12c85518Srobert 
/* Narrowing conversion: 8 unsigned 64-bit integers to 8 FP16 elements,
   with explicit rounding mode R.  */
#define _mm512_cvt_roundepu64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

/* Same conversion using the current rounding direction.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2166*12c85518Srobert 
/* Widening conversion: 8 FP16 elements to 8 unsigned 64-bit integers,
   with explicit rounding mode R.  */
#define _mm512_cvt_roundph_epu64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

/* Same conversion using the current rounding direction.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2199*12c85518Srobert 
/* Truncating conversion: 8 FP16 elements to 8 signed 64-bit integers,
   with explicit rounding/SAE operand R.  */
#define _mm512_cvtt_roundph_epi64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

/* Truncating conversion using the current rounding direction.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: unselected elements come from __W.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: unselected elements are zeroed.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2232*12c85518Srobert 
/* Truncating conversion: 8 FP16 elements -> 8 unsigned 64-bit integers
   (VCVTTPH2UQQ).  Same masking/rounding conventions as the signed
   epi64 variants above. */
#define _mm512_cvtt_roundph_epu64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W),       \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

/* All lanes enabled; current rounding direction. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: disabled lanes keep the value from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: disabled lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2265*12c85518Srobert 
/* Scalar conversions: lowest FP16 element of a __m128h -> 32-bit
   integer.  Round macros take an explicit rounding-control R. */

/* FP16 -> signed int (VCVTSH2SI). */
#define _mm_cvt_roundsh_i32(A, R)                                              \
  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}

/* FP16 -> unsigned int (VCVTSH2USI). */
#define _mm_cvt_roundsh_u32(A, R)                                              \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvtsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2281*12c85518Srobert 
#ifdef __x86_64__
/* 64-bit scalar destinations are only available in 64-bit mode. */

/* Lowest FP16 element -> signed 64-bit integer (VCVTSH2SI, 64-bit form). */
#define _mm_cvt_roundsh_i64(A, R)                                              \
  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
                                               _MM_FROUND_CUR_DIRECTION);
}

/* Lowest FP16 element -> unsigned 64-bit integer (VCVTSH2USI, 64-bit form). */
#define _mm_cvt_roundsh_u64(A, R)                                              \
  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif // __x86_64__
2300*12c85518Srobert 
/* Unsigned 32-bit integer -> lowest FP16 lane of A, explicit rounding R
   (VCVTUSI2SH); upper lanes pass through from A. */
#define _mm_cvt_roundu32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2303*12c85518Srobert 
2304*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu32_sh(__m128h __A,unsigned int __B)2305*12c85518Srobert _mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2306*12c85518Srobert   __A[0] = __B;
2307*12c85518Srobert   return __A;
2308*12c85518Srobert }
2309*12c85518Srobert 
#ifdef __x86_64__
/* Unsigned 64-bit integer -> lowest FP16 lane, explicit rounding R
   (VCVTUSI2SH, 64-bit source); 64-bit mode only. */
#define _mm_cvt_roundu64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
                                        (int)(R)))

/* Store __B (implicitly converted to _Float16) in lane 0 of __A;
   upper lanes pass through. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
  __A[0] = __B;
  return __A;
}
#endif
2321*12c85518Srobert 
/* Signed 32-bit integer -> lowest FP16 lane of A, explicit rounding R
   (VCVTSI2SH); upper lanes pass through from A. */
#define _mm_cvt_roundi32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2324*12c85518Srobert 
_mm_cvti32_sh(__m128h __A,int __B)2325*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2326*12c85518Srobert                                                               int __B) {
2327*12c85518Srobert   __A[0] = __B;
2328*12c85518Srobert   return __A;
2329*12c85518Srobert }
2330*12c85518Srobert 
#ifdef __x86_64__
/* Signed 64-bit integer -> lowest FP16 lane, explicit rounding R
   (VCVTSI2SH, 64-bit source); 64-bit mode only. */
#define _mm_cvt_roundi64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))

/* Store __B (implicitly converted to _Float16) in lane 0 of __A;
   upper lanes pass through. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
                                                              long long __B) {
  __A[0] = __B;
  return __A;
}
#endif
2341*12c85518Srobert 
/* Truncating scalar conversion: lowest FP16 lane -> signed int
   (VCVTTSH2SI). */
#define _mm_cvtt_roundsh_i32(A, R)                                             \
  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
                                          _MM_FROUND_CUR_DIRECTION);
}
2349*12c85518Srobert 
#ifdef __x86_64__
/* Truncating scalar conversion: lowest FP16 lane -> signed 64-bit
   integer (VCVTTSH2SI, 64-bit form); 64-bit mode only. */
#define _mm_cvtt_roundsh_i64(A, R)                                             \
  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
                                                _MM_FROUND_CUR_DIRECTION);
}
#endif
2359*12c85518Srobert 
/* Truncating scalar conversion: lowest FP16 lane -> unsigned int
   (VCVTTSH2USI). */
#define _mm_cvtt_roundsh_u32(A, R)                                             \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvttsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
                                                    _MM_FROUND_CUR_DIRECTION);
}
2368*12c85518Srobert 
#ifdef __x86_64__
/* Truncating scalar conversion: lowest FP16 lane -> unsigned 64-bit
   integer (VCVTTSH2USI, 64-bit form); 64-bit mode only. */
#define _mm_cvtt_roundsh_u64(A, R)                                             \
  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif
2379*12c85518Srobert 
/* Widening conversion: 16 FP16 elements (__m256h) -> 16 single-precision
   floats (VCVTPH2PSX).  Same mask/maskz conventions as above. */
#define _mm512_cvtx_roundph_ps(A, R)                                           \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A),                     \
                                             (__v16sf)_mm512_undefined_ps(),   \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundph_ps(W, U, A, R)                                \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundph_ps(U, A, R)                                  \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask(                                  \
      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))

/* All lanes enabled; current rounding direction. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: disabled lanes keep the value from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: disabled lanes are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2411*12c85518Srobert 
/* Narrowing conversion: 16 single-precision floats (__m512) -> 16 FP16
   elements (__m256h) (VCVTPS2PHX). */
#define _mm512_cvtx_roundps_ph(A, R)                                           \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundps_ph(W, U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundps_ph(U, A, R)                                  \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask(                                 \
      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

/* All lanes enabled; current rounding direction. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: disabled lanes keep the value from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: disabled lanes are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2443*12c85518Srobert 
/* 512-bit FP16 fused multiply-add family with explicit rounding R; all
   forms funnel through the vfmaddph512 builtins, with operand negation
   selecting the variant:
     fmadd : A*B + C      fmsub : A*B - C
     fnmadd: -(A*B) + C   fnmsub: -(A*B) - C
   _mask merges from A, _mask3 merges from C, _maskz zeroes disabled
   lanes — which is why the negation sits on different operands in the
   masked forms below (the merged-in operand must stay un-negated). */
#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* fmsub = fmadd with C negated: A*B - C. */
#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* fnmadd = -(A*B) + C; the unmasked form negates B, the mask3/maskz
   forms negate A instead (mask3 must keep C intact). */
#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* fnmsub = -(A*B) - C. */
#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))
2503*12c85518Srobert 
/* Current-rounding-direction counterparts of the FMA round macros
   above; identical operand-negation and masking conventions. */

/* A*B + C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
                                                                __m512h __B,
                                                                __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* A*B + C, merging from __A where __U is clear. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* A*B + C, merging from __C where __U is clear. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* A*B + C, zeroing lanes where __U is clear. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* A*B - C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
                                                                __m512h __B,
                                                                __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* A*B - C, merging from __A where __U is clear. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* A*B - C, zeroing lanes where __U is clear. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* -(A*B) + C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* -(A*B) + C, merging from __C where __U is clear (so __A is negated). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* -(A*B) + C, zeroing lanes where __U is clear. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* -(A*B) - C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* -(A*B) - C, zeroing lanes where __U is clear. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2591*12c85518Srobert 
/* fmaddsub: fused multiply with C alternately subtracted/added across
   lanes (VFMADDSUBPH); fmsubadd is expressed as fmaddsub with C
   negated, flipping the per-lane add/sub pattern. */
#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2626*12c85518Srobert 
/* Current-rounding-direction counterparts of the fmaddsub/fmsubadd
   round macros above; same masking conventions (_mask merges from __A,
   _mask3 from __C, _maskz zeroes). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* fmsubadd = fmaddsub with __C negated. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2675*12c85518Srobert 
2676*12c85518Srobert #define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
2677*12c85518Srobert   ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
2678*12c85518Srobert       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
2679*12c85518Srobert       (__mmask32)(U), (int)(R)))
2680*12c85518Srobert 
2681*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2682*12c85518Srobert _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2683*12c85518Srobert   return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2684*12c85518Srobert                                                    (__v32hf)__C, (__mmask32)__U,
2685*12c85518Srobert                                                    _MM_FROUND_CUR_DIRECTION);
2686*12c85518Srobert }
2687*12c85518Srobert 
2688*12c85518Srobert #define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
2689*12c85518Srobert   ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
2690*12c85518Srobert       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
2691*12c85518Srobert       (__mmask32)(U), (int)(R)))
2692*12c85518Srobert 
2693*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2694*12c85518Srobert _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2695*12c85518Srobert   return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2696*12c85518Srobert       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2697*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
2698*12c85518Srobert }
2699*12c85518Srobert 
2700*12c85518Srobert #define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
2701*12c85518Srobert   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
2702*12c85518Srobert       (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
2703*12c85518Srobert       (__mmask32)(U), (int)(R)))
2704*12c85518Srobert 
2705*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2706*12c85518Srobert _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2707*12c85518Srobert   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2708*12c85518Srobert                                                   (__v32hf)__C, (__mmask32)__U,
2709*12c85518Srobert                                                   _MM_FROUND_CUR_DIRECTION);
2710*12c85518Srobert }
2711*12c85518Srobert 
2712*12c85518Srobert #define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
2713*12c85518Srobert   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
2714*12c85518Srobert       (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
2715*12c85518Srobert       (__mmask32)(U), (int)(R)))
2716*12c85518Srobert 
2717*12c85518Srobert #define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
2718*12c85518Srobert   ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
2719*12c85518Srobert       -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
2720*12c85518Srobert       (__mmask32)(U), (int)(R)))
2721*12c85518Srobert 
2722*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2723*12c85518Srobert _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2724*12c85518Srobert   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2725*12c85518Srobert                                                   -(__v32hf)__C, (__mmask32)__U,
2726*12c85518Srobert                                                   _MM_FROUND_CUR_DIRECTION);
2727*12c85518Srobert }
2728*12c85518Srobert 
2729*12c85518Srobert static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2730*12c85518Srobert _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2731*12c85518Srobert   return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2732*12c85518Srobert                                                    (__v32hf)__C, (__mmask32)__U,
2733*12c85518Srobert                                                    _MM_FROUND_CUR_DIRECTION);
2734*12c85518Srobert }
2735*12c85518Srobert 
_mm_fmadd_sh(__m128h __W,__m128h __A,__m128h __B)2736*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2737*12c85518Srobert                                                              __m128h __A,
2738*12c85518Srobert                                                              __m128h __B) {
2739*12c85518Srobert   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2740*12c85518Srobert                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2741*12c85518Srobert }
2742*12c85518Srobert 
_mm_mask_fmadd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2743*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2744*12c85518Srobert                                                                   __mmask8 __U,
2745*12c85518Srobert                                                                   __m128h __A,
2746*12c85518Srobert                                                                   __m128h __B) {
2747*12c85518Srobert   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2748*12c85518Srobert                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2749*12c85518Srobert }
2750*12c85518Srobert 
2751*12c85518Srobert #define _mm_fmadd_round_sh(A, B, C, R)                                         \
2752*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
2753*12c85518Srobert       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
2754*12c85518Srobert       (__mmask8)-1, (int)(R)))
2755*12c85518Srobert 
2756*12c85518Srobert #define _mm_mask_fmadd_round_sh(W, U, A, B, R)                                 \
2757*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
2758*12c85518Srobert       (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),        \
2759*12c85518Srobert       (__mmask8)(U), (int)(R)))
2760*12c85518Srobert 
2761*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2762*12c85518Srobert _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2763*12c85518Srobert   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2764*12c85518Srobert                                         (__mmask8)__U,
2765*12c85518Srobert                                         _MM_FROUND_CUR_DIRECTION);
2766*12c85518Srobert }
2767*12c85518Srobert 
2768*12c85518Srobert #define _mm_maskz_fmadd_round_sh(U, A, B, C, R)                                \
2769*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
2770*12c85518Srobert       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
2771*12c85518Srobert       (__mmask8)(U), (int)(R)))
2772*12c85518Srobert 
2773*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2774*12c85518Srobert _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2775*12c85518Srobert   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2776*12c85518Srobert                                         (__mmask8)__U,
2777*12c85518Srobert                                         _MM_FROUND_CUR_DIRECTION);
2778*12c85518Srobert }
2779*12c85518Srobert 
2780*12c85518Srobert #define _mm_mask3_fmadd_round_sh(W, X, Y, U, R)                                \
2781*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
2782*12c85518Srobert       (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
2783*12c85518Srobert       (__mmask8)(U), (int)(R)))
2784*12c85518Srobert 
_mm_fmsub_sh(__m128h __W,__m128h __A,__m128h __B)2785*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2786*12c85518Srobert                                                              __m128h __A,
2787*12c85518Srobert                                                              __m128h __B) {
2788*12c85518Srobert   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2789*12c85518Srobert                                                 -(__v8hf)__B, (__mmask8)-1,
2790*12c85518Srobert                                                 _MM_FROUND_CUR_DIRECTION);
2791*12c85518Srobert }
2792*12c85518Srobert 
_mm_mask_fmsub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2793*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2794*12c85518Srobert                                                                   __mmask8 __U,
2795*12c85518Srobert                                                                   __m128h __A,
2796*12c85518Srobert                                                                   __m128h __B) {
2797*12c85518Srobert   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2798*12c85518Srobert                                                 -(__v8hf)__B, (__mmask8)__U,
2799*12c85518Srobert                                                 _MM_FROUND_CUR_DIRECTION);
2800*12c85518Srobert }
2801*12c85518Srobert 
2802*12c85518Srobert #define _mm_fmsub_round_sh(A, B, C, R)                                         \
2803*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
2804*12c85518Srobert       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
2805*12c85518Srobert       (__mmask8)-1, (int)(R)))
2806*12c85518Srobert 
2807*12c85518Srobert #define _mm_mask_fmsub_round_sh(W, U, A, B, R)                                 \
2808*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
2809*12c85518Srobert       (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),       \
2810*12c85518Srobert       (__mmask8)(U), (int)(R)))
2811*12c85518Srobert 
2812*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2813*12c85518Srobert _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2814*12c85518Srobert   return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2815*12c85518Srobert                                                  -(__v8hf)__C, (__mmask8)__U,
2816*12c85518Srobert                                                  _MM_FROUND_CUR_DIRECTION);
2817*12c85518Srobert }
2818*12c85518Srobert 
2819*12c85518Srobert #define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
2820*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
2821*12c85518Srobert       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
2822*12c85518Srobert       (__mmask8)(U), (int)R))
2823*12c85518Srobert 
2824*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2825*12c85518Srobert _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2826*12c85518Srobert   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2827*12c85518Srobert                                         (__mmask8)__U,
2828*12c85518Srobert                                         _MM_FROUND_CUR_DIRECTION);
2829*12c85518Srobert }
2830*12c85518Srobert 
2831*12c85518Srobert #define _mm_mask3_fmsub_round_sh(W, X, Y, U, R)                                \
2832*12c85518Srobert   ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
2833*12c85518Srobert       (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
2834*12c85518Srobert       (__mmask8)(U), (int)(R)))
2835*12c85518Srobert 
_mm_fnmadd_sh(__m128h __W,__m128h __A,__m128h __B)2836*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2837*12c85518Srobert                                                               __m128h __A,
2838*12c85518Srobert                                                               __m128h __B) {
2839*12c85518Srobert   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2840*12c85518Srobert                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2841*12c85518Srobert }
2842*12c85518Srobert 
2843*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2844*12c85518Srobert _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2845*12c85518Srobert   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2846*12c85518Srobert                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2847*12c85518Srobert }
2848*12c85518Srobert 
2849*12c85518Srobert #define _mm_fnmadd_round_sh(A, B, C, R)                                        \
2850*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
2851*12c85518Srobert       (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
2852*12c85518Srobert       (__mmask8)-1, (int)(R)))
2853*12c85518Srobert 
2854*12c85518Srobert #define _mm_mask_fnmadd_round_sh(W, U, A, B, R)                                \
2855*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
2856*12c85518Srobert       (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),       \
2857*12c85518Srobert       (__mmask8)(U), (int)(R)))
2858*12c85518Srobert 
2859*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2860*12c85518Srobert _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2861*12c85518Srobert   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2862*12c85518Srobert                                         (__mmask8)__U,
2863*12c85518Srobert                                         _MM_FROUND_CUR_DIRECTION);
2864*12c85518Srobert }
2865*12c85518Srobert 
2866*12c85518Srobert #define _mm_maskz_fnmadd_round_sh(U, A, B, C, R)                               \
2867*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
2868*12c85518Srobert       (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
2869*12c85518Srobert       (__mmask8)(U), (int)(R)))
2870*12c85518Srobert 
2871*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2872*12c85518Srobert _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2873*12c85518Srobert   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2874*12c85518Srobert                                         (__mmask8)__U,
2875*12c85518Srobert                                         _MM_FROUND_CUR_DIRECTION);
2876*12c85518Srobert }
2877*12c85518Srobert 
2878*12c85518Srobert #define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R)                               \
2879*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
2880*12c85518Srobert       (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
2881*12c85518Srobert       (__mmask8)(U), (int)(R)))
2882*12c85518Srobert 
_mm_fnmsub_sh(__m128h __W,__m128h __A,__m128h __B)2883*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2884*12c85518Srobert                                                               __m128h __A,
2885*12c85518Srobert                                                               __m128h __B) {
2886*12c85518Srobert   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2887*12c85518Srobert                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2888*12c85518Srobert }
2889*12c85518Srobert 
2890*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2891*12c85518Srobert _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2892*12c85518Srobert   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2893*12c85518Srobert                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2894*12c85518Srobert }
2895*12c85518Srobert 
2896*12c85518Srobert #define _mm_fnmsub_round_sh(A, B, C, R)                                        \
2897*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
2898*12c85518Srobert       (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
2899*12c85518Srobert       (__mmask8)-1, (int)(R)))
2900*12c85518Srobert 
2901*12c85518Srobert #define _mm_mask_fnmsub_round_sh(W, U, A, B, R)                                \
2902*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
2903*12c85518Srobert       (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),      \
2904*12c85518Srobert       (__mmask8)(U), (int)(R)))
2905*12c85518Srobert 
2906*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2907*12c85518Srobert _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2908*12c85518Srobert   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2909*12c85518Srobert                                         (__mmask8)__U,
2910*12c85518Srobert                                         _MM_FROUND_CUR_DIRECTION);
2911*12c85518Srobert }
2912*12c85518Srobert 
2913*12c85518Srobert #define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                               \
2914*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
2915*12c85518Srobert       (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
2916*12c85518Srobert       (__mmask8)(U), (int)(R)))
2917*12c85518Srobert 
2918*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2919*12c85518Srobert _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2920*12c85518Srobert   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2921*12c85518Srobert                                         (__mmask8)__U,
2922*12c85518Srobert                                         _MM_FROUND_CUR_DIRECTION);
2923*12c85518Srobert }
2924*12c85518Srobert 
2925*12c85518Srobert #define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R)                               \
2926*12c85518Srobert   ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
2927*12c85518Srobert       (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
2928*12c85518Srobert       (__mmask8)(U), (int)(R)))
2929*12c85518Srobert 
_mm_fcmadd_sch(__m128h __A,__m128h __B,__m128h __C)2930*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2931*12c85518Srobert                                                                __m128h __B,
2932*12c85518Srobert                                                                __m128h __C) {
2933*12c85518Srobert   return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2934*12c85518Srobert                                                  (__v4sf)__C, (__mmask8)-1,
2935*12c85518Srobert                                                  _MM_FROUND_CUR_DIRECTION);
2936*12c85518Srobert }
2937*12c85518Srobert 
2938*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A,__mmask8 __U,__m128h __B,__m128h __C)2939*12c85518Srobert _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2940*12c85518Srobert   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2941*12c85518Srobert       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2942*12c85518Srobert }
2943*12c85518Srobert 
2944*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2945*12c85518Srobert _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2946*12c85518Srobert   return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2947*12c85518Srobert                                                   (__v4sf)__C, (__mmask8)__U,
2948*12c85518Srobert                                                   _MM_FROUND_CUR_DIRECTION);
2949*12c85518Srobert }
2950*12c85518Srobert 
2951*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A,__m128h __B,__m128h __C,__mmask8 __U)2952*12c85518Srobert _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2953*12c85518Srobert   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2954*12c85518Srobert       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2955*12c85518Srobert }
2956*12c85518Srobert 
2957*12c85518Srobert #define _mm_fcmadd_round_sch(A, B, C, R)                                       \
2958*12c85518Srobert   ((__m128h)__builtin_ia32_vfcmaddcsh_mask(                                    \
2959*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
2960*12c85518Srobert       (__mmask8)-1, (int)(R)))
2961*12c85518Srobert 
2962*12c85518Srobert #define _mm_mask_fcmadd_round_sch(A, U, B, C, R)                               \
2963*12c85518Srobert   ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask(                              \
2964*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
2965*12c85518Srobert       (__mmask8)(U), (int)(R)))
2966*12c85518Srobert 
2967*12c85518Srobert #define _mm_maskz_fcmadd_round_sch(U, A, B, C, R)                              \
2968*12c85518Srobert   ((__m128h)__builtin_ia32_vfcmaddcsh_maskz(                                   \
2969*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
2970*12c85518Srobert       (__mmask8)(U), (int)(R)))
2971*12c85518Srobert 
2972*12c85518Srobert #define _mm_mask3_fcmadd_round_sch(A, B, C, U, R)                              \
2973*12c85518Srobert   ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(                             \
2974*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
2975*12c85518Srobert       (__mmask8)(U), (int)(R)))
2976*12c85518Srobert 
_mm_fmadd_sch(__m128h __A,__m128h __B,__m128h __C)2977*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2978*12c85518Srobert                                                               __m128h __B,
2979*12c85518Srobert                                                               __m128h __C) {
2980*12c85518Srobert   return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2981*12c85518Srobert                                                 (__v4sf)__C, (__mmask8)-1,
2982*12c85518Srobert                                                 _MM_FROUND_CUR_DIRECTION);
2983*12c85518Srobert }
2984*12c85518Srobert 
2985*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A,__mmask8 __U,__m128h __B,__m128h __C)2986*12c85518Srobert _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2987*12c85518Srobert   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2988*12c85518Srobert       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2989*12c85518Srobert }
2990*12c85518Srobert 
2991*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2992*12c85518Srobert _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2993*12c85518Srobert   return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2994*12c85518Srobert                                                  (__v4sf)__C, (__mmask8)__U,
2995*12c85518Srobert                                                  _MM_FROUND_CUR_DIRECTION);
2996*12c85518Srobert }
2997*12c85518Srobert 
2998*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sch(__m128h __A,__m128h __B,__m128h __C,__mmask8 __U)2999*12c85518Srobert _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3000*12c85518Srobert   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3001*12c85518Srobert       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3002*12c85518Srobert }
3003*12c85518Srobert 
3004*12c85518Srobert #define _mm_fmadd_round_sch(A, B, C, R)                                        \
3005*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddcsh_mask(                                     \
3006*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
3007*12c85518Srobert       (__mmask8)-1, (int)(R)))
3008*12c85518Srobert 
3009*12c85518Srobert #define _mm_mask_fmadd_round_sch(A, U, B, C, R)                                \
3010*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddcsh_round_mask(                               \
3011*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
3012*12c85518Srobert       (__mmask8)(U), (int)(R)))
3013*12c85518Srobert 
3014*12c85518Srobert #define _mm_maskz_fmadd_round_sch(U, A, B, C, R)                               \
3015*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddcsh_maskz(                                    \
3016*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
3017*12c85518Srobert       (__mmask8)(U), (int)(R)))
3018*12c85518Srobert 
3019*12c85518Srobert #define _mm_mask3_fmadd_round_sch(A, B, C, U, R)                               \
3020*12c85518Srobert   ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3(                              \
3021*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
3022*12c85518Srobert       (__mmask8)(U), (int)(R)))
3023*12c85518Srobert 
_mm_fcmul_sch(__m128h __A,__m128h __B)3024*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3025*12c85518Srobert                                                               __m128h __B) {
3026*12c85518Srobert   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3027*12c85518Srobert       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3028*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
3029*12c85518Srobert }
3030*12c85518Srobert 
3031*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)3032*12c85518Srobert _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3033*12c85518Srobert   return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3034*12c85518Srobert                                                 (__v4sf)__W, (__mmask8)__U,
3035*12c85518Srobert                                                 _MM_FROUND_CUR_DIRECTION);
3036*12c85518Srobert }
3037*12c85518Srobert 
3038*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U,__m128h __A,__m128h __B)3039*12c85518Srobert _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3040*12c85518Srobert   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3041*12c85518Srobert       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3042*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
3043*12c85518Srobert }
3044*12c85518Srobert 
3045*12c85518Srobert #define _mm_fcmul_round_sch(A, B, R)                                           \
3046*12c85518Srobert   ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
3047*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
3048*12c85518Srobert       (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3049*12c85518Srobert 
3050*12c85518Srobert #define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
3051*12c85518Srobert   ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
3052*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
3053*12c85518Srobert       (__mmask8)(U), (int)(R)))
3054*12c85518Srobert 
3055*12c85518Srobert #define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
3056*12c85518Srobert   ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
3057*12c85518Srobert       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
3058*12c85518Srobert       (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3059*12c85518Srobert 
_mm_fmul_sch(__m128h __A,__m128h __B)3060*12c85518Srobert static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3061*12c85518Srobert                                                              __m128h __B) {
3062*12c85518Srobert   return (__m128h)__builtin_ia32_vfmulcsh_mask(
3063*12c85518Srobert       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3064*12c85518Srobert       _MM_FROUND_CUR_DIRECTION);
3065*12c85518Srobert }
3066*12c85518Srobert 
/* Merge-masked scalar FP16 complex multiply: result lanes not selected by
   __U are taken from __W. Rounding follows MXCSR. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                               (__v4sf)__W, (__mmask8)__U,
                                               _MM_FROUND_CUR_DIRECTION);
}
3075*12c85518Srobert 
/* Zero-masked scalar FP16 complex multiply: lanes not selected by __U are
   zeroed (passthrough is _mm_setzero_ph()). Rounding follows MXCSR. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3082*12c85518Srobert 
/* Scalar FP16 complex multiply with an explicit rounding mode R
   (unmasked: undefined passthrough, all-ones mask). */
#define _mm_fmul_round_sch(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

/* Merge-masked rounding variant: unselected lanes come from W. */
#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked rounding variant: unselected lanes are zeroed. */
#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3097*12c85518Srobert 
/* Packed FP16 complex conjugate multiply across all 16 complex lanes of
   __A and __B (vfcmulcph builtin). Unmasked; rounding follows MXCSR. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}
3104*12c85518Srobert 
/* Merge-masked packed FP16 complex conjugate multiply: complex lanes not
   selected by __U are taken from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__W, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
3111*12c85518Srobert 
/* Zero-masked packed FP16 complex conjugate multiply: complex lanes not
   selected by __U are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3118*12c85518Srobert 
/* Packed FP16 complex conjugate multiply with an explicit rounding mode R
   (unmasked). */
#define _mm512_fcmul_round_pch(A, B, R)                                        \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

/* Merge-masked rounding variant: unselected complex lanes come from W. */
#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

/* Zero-masked rounding variant: unselected complex lanes are zeroed. */
#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3133*12c85518Srobert 
/* Packed FP16 complex multiply across all 16 complex lanes of __A and __B
   (vfmulcph builtin). Unmasked; rounding follows MXCSR. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
                                                                __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}
3140*12c85518Srobert 
/* Merge-masked packed FP16 complex multiply: complex lanes not selected by
   __U are taken from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                  (__v16sf)__W, (__mmask16)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}
3147*12c85518Srobert 
/* Zero-masked packed FP16 complex multiply: complex lanes not selected by
   __U are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3154*12c85518Srobert 
/* Packed FP16 complex multiply with an explicit rounding mode R
   (unmasked). */
#define _mm512_fmul_round_pch(A, B, R)                                         \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

/* Merge-masked rounding variant: unselected complex lanes come from W. */
#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

/* Zero-masked rounding variant: unselected complex lanes are zeroed. */
#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3169*12c85518Srobert 
/* Packed FP16 complex conjugate multiply-add: __A (conjugate-)multiplied by
   __B, accumulated into __C (vfcmaddcph builtin, mask3 form). Unmasked;
   rounding follows MXCSR. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
                                                                  __m512h __B,
                                                                  __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}
3177*12c85518Srobert 
/* Merge-masked complex conjugate multiply-add; _mask form keeps lanes not
   selected by __U from the first source operand __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3184*12c85518Srobert 
/* Merge-masked complex conjugate multiply-add; _mask3 form keeps lanes not
   selected by __U from the accumulator __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3191*12c85518Srobert 
/* Zero-masked complex conjugate multiply-add: lanes not selected by __U are
   zeroed (_maskz builtin form). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3198*12c85518Srobert 
/* Complex conjugate multiply-add with an explicit rounding mode R
   (unmasked; _mask3 builtin with all-ones mask). */
#define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

/* Merge-masked rounding variant (_mask form: merges from A). */
#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

/* Merge-masked rounding variant (_mask3 form: merges from C). */
#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

/* Zero-masked rounding variant. */
#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))
3218*12c85518Srobert 
/* Packed FP16 complex multiply-add: __A * __B accumulated into __C
   (vfmaddcph builtin, mask3 form). Unmasked; rounding follows MXCSR. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
                                                    (__v16sf)__C, (__mmask16)-1,
                                                    _MM_FROUND_CUR_DIRECTION);
}
3226*12c85518Srobert 
/* Merge-masked complex multiply-add; _mask form keeps lanes not selected by
   __U from the first source operand __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__C, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
3233*12c85518Srobert 
/* Merge-masked complex multiply-add; _mask3 form keeps lanes not selected
   by __U from the accumulator __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3240*12c85518Srobert 
/* Zero-masked complex multiply-add: lanes not selected by __U are zeroed
   (_maskz builtin form). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3247*12c85518Srobert 
/* Complex multiply-add with an explicit rounding mode R (unmasked; _mask3
   builtin with all-ones mask). */
#define _mm512_fmadd_round_pch(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

/* Merge-masked rounding variant (_mask form: merges from A). */
#define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

/* Merge-masked rounding variant (_mask3 form: merges from C). */
#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

/* Zero-masked rounding variant. */
#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))
3267*12c85518Srobert 
/* Horizontal sum of all 32 FP16 elements of __W.
   -0.0 is the identity element for floating-point addition. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
}
3272*12c85518Srobert 
/* Horizontal product of all 32 FP16 elements of __W; 1.0 is the
   multiplicative identity used to seed the reduction. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W) {
  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
}
3277*12c85518Srobert 
/* Horizontal maximum of all 32 FP16 elements of __V. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmax_ph512(__V);
}
3282*12c85518Srobert 
/* Horizontal minimum of all 32 FP16 elements of __V. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmin_ph512(__V);
}
3287*12c85518Srobert 
/* Per-element blend of two FP16 vectors: elements whose bit is set in __U
   come from __W, the rest from __A (note __W/__A order in the select). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
                                              (__v32hf)__A);
}
3293*12c85518Srobert 
/* Two-source FP16 shuffle: each element of the 16-bit index vector __I
   selects an element from the concatenation of __A and __B (reuses the
   integer vpermi2varhi builtin since FP16 elements are 16 bits wide). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                 (__v32hi)__B);
}
3299*12c85518Srobert 
/* Single-source FP16 shuffle: permutes the elements of __B using the
   16-bit indices in __A (data first, indices second in the builtin). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}
3304*12c85518Srobert 
// intrinsics below are alias for f*mul_*ch
// (Intel documents both a "mul"/"cmul" and an "fmul"/"fcmul" spelling for
// the complex-multiply intrinsics; the former simply forward to the latter.)
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3340*12c85518Srobert 
3341*12c85518Srobert #undef __DEFAULT_FN_ATTRS128
3342*12c85518Srobert #undef __DEFAULT_FN_ATTRS256
3343*12c85518Srobert #undef __DEFAULT_FN_ATTRS512
3344*12c85518Srobert 
3345*12c85518Srobert #endif
3346*12c85518Srobert #endif
3347