/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9*12c85518Srobert 
10*12c85518Srobert #ifndef __IMMINTRIN_H
11*12c85518Srobert #error                                                                         \
12*12c85518Srobert     "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
13*12c85518Srobert #endif // __IMMINTRIN_H
14*12c85518Srobert 
15*12c85518Srobert #ifdef __SSE2__
16*12c85518Srobert 
17*12c85518Srobert #ifndef __AVXNECONVERTINTRIN_H
18*12c85518Srobert #define __AVXNECONVERTINTRIN_H
19*12c85518Srobert 
/* Define the default attributes for the functions in this file.
 *
 * Both variants mark the function always-inline and nodebug and require the
 * "avxneconvert" target feature; they differ only in the minimum vector
 * width they declare (128 vs. 256 bits).
 */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
                 __min_vector_width__(256)))
27*12c85518Srobert 
28*12c85518Srobert /// Convert scalar BF16 (16-bit) floating-point element
29*12c85518Srobert /// stored at memory locations starting at location \a __A to a
30*12c85518Srobert /// single-precision (32-bit) floating-point, broadcast it to packed
31*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
32*12c85518Srobert /// \a dst.
33*12c85518Srobert ///
34*12c85518Srobert /// \headerfile <x86intrin.h>
35*12c85518Srobert ///
36*12c85518Srobert /// \code
37*12c85518Srobert /// _mm_bcstnebf16_ps(const void *__A);
38*12c85518Srobert /// \endcode
39*12c85518Srobert ///
40*12c85518Srobert /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
41*12c85518Srobert ///
42*12c85518Srobert /// \param __A
43*12c85518Srobert ///    A pointer to a 16-bit memory location. The address of the memory
44*12c85518Srobert ///    location does not have to be aligned.
45*12c85518Srobert /// \returns
46*12c85518Srobert ///    A 128-bit vector of [4 x float].
47*12c85518Srobert ///
48*12c85518Srobert /// \code{.operation}
49*12c85518Srobert /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
50*12c85518Srobert /// FOR j := 0 to 3
51*12c85518Srobert ///   m := j*32
52*12c85518Srobert ///   dst[m+31:m] := b
53*12c85518Srobert /// ENDFOR
54*12c85518Srobert /// dst[MAX:128] := 0
55*12c85518Srobert /// \endcode
56*12c85518Srobert static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnebf16_ps(const void * __A)57*12c85518Srobert _mm_bcstnebf16_ps(const void *__A) {
58*12c85518Srobert   return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
59*12c85518Srobert }
60*12c85518Srobert 
61*12c85518Srobert /// Convert scalar BF16 (16-bit) floating-point element
62*12c85518Srobert /// stored at memory locations starting at location \a __A to a
63*12c85518Srobert /// single-precision (32-bit) floating-point, broadcast it to packed
64*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
65*12c85518Srobert /// \a dst.
66*12c85518Srobert ///
67*12c85518Srobert /// \headerfile <x86intrin.h>
68*12c85518Srobert ///
69*12c85518Srobert /// \code
70*12c85518Srobert /// _mm256_bcstnebf16_ps(const void *__A);
71*12c85518Srobert /// \endcode
72*12c85518Srobert ///
73*12c85518Srobert /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
74*12c85518Srobert ///
75*12c85518Srobert /// \param __A
76*12c85518Srobert ///    A pointer to a 16-bit memory location. The address of the memory
77*12c85518Srobert ///    location does not have to be aligned.
78*12c85518Srobert /// \returns
79*12c85518Srobert ///    A 256-bit vector of [8 x float].
80*12c85518Srobert ///
81*12c85518Srobert /// \code{.operation}
82*12c85518Srobert /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
83*12c85518Srobert /// FOR j := 0 to 7
84*12c85518Srobert ///   m := j*32
85*12c85518Srobert ///   dst[m+31:m] := b
86*12c85518Srobert /// ENDFOR
87*12c85518Srobert /// dst[MAX:256] := 0
88*12c85518Srobert /// \endcode
89*12c85518Srobert static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnebf16_ps(const void * __A)90*12c85518Srobert _mm256_bcstnebf16_ps(const void *__A) {
91*12c85518Srobert   return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
92*12c85518Srobert }
93*12c85518Srobert 
94*12c85518Srobert /// Convert scalar half-precision (16-bit) floating-point element
95*12c85518Srobert /// stored at memory locations starting at location \a __A to a
96*12c85518Srobert /// single-precision (32-bit) floating-point, broadcast it to packed
97*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
98*12c85518Srobert /// \a dst.
99*12c85518Srobert ///
100*12c85518Srobert /// \headerfile <x86intrin.h>
101*12c85518Srobert ///
102*12c85518Srobert /// \code
103*12c85518Srobert /// _mm_bcstnesh_ps(const void *__A);
104*12c85518Srobert /// \endcode
105*12c85518Srobert ///
106*12c85518Srobert /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
107*12c85518Srobert ///
108*12c85518Srobert /// \param __A
109*12c85518Srobert ///    A pointer to a 16-bit memory location. The address of the memory
110*12c85518Srobert ///    location does not have to be aligned.
111*12c85518Srobert /// \returns
112*12c85518Srobert ///    A 128-bit vector of [4 x float].
113*12c85518Srobert ///
114*12c85518Srobert /// \code{.operation}
115*12c85518Srobert /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
116*12c85518Srobert /// FOR j := 0 to 3
117*12c85518Srobert ///   m := j*32
118*12c85518Srobert ///   dst[m+31:m] := b
119*12c85518Srobert /// ENDFOR
120*12c85518Srobert /// dst[MAX:128] := 0
121*12c85518Srobert /// \endcode
122*12c85518Srobert static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnesh_ps(const void * __A)123*12c85518Srobert _mm_bcstnesh_ps(const void *__A) {
124*12c85518Srobert   return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
125*12c85518Srobert }
126*12c85518Srobert 
127*12c85518Srobert /// Convert scalar half-precision (16-bit) floating-point element
128*12c85518Srobert /// stored at memory locations starting at location \a __A to a
129*12c85518Srobert /// single-precision (32-bit) floating-point, broadcast it to packed
130*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
131*12c85518Srobert /// \a dst.
132*12c85518Srobert ///
133*12c85518Srobert /// \headerfile <x86intrin.h>
134*12c85518Srobert ///
135*12c85518Srobert /// \code
136*12c85518Srobert /// _mm256_bcstnesh_ps(const void *__A);
137*12c85518Srobert /// \endcode
138*12c85518Srobert ///
139*12c85518Srobert /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
140*12c85518Srobert ///
141*12c85518Srobert /// \param __A
142*12c85518Srobert ///    A pointer to a 16-bit memory location. The address of the memory
143*12c85518Srobert ///    location does not have to be aligned.
144*12c85518Srobert /// \returns
145*12c85518Srobert ///    A 256-bit vector of [8 x float].
146*12c85518Srobert ///
147*12c85518Srobert /// \code{.operation}
148*12c85518Srobert /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
149*12c85518Srobert /// FOR j := 0 to 7
150*12c85518Srobert ///   m := j*32
151*12c85518Srobert ///   dst[m+31:m] := b
152*12c85518Srobert /// ENDFOR
153*12c85518Srobert /// dst[MAX:256] := 0
154*12c85518Srobert /// \endcode
155*12c85518Srobert static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnesh_ps(const void * __A)156*12c85518Srobert _mm256_bcstnesh_ps(const void *__A) {
157*12c85518Srobert   return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
158*12c85518Srobert }
159*12c85518Srobert 
160*12c85518Srobert /// Convert packed BF16 (16-bit) floating-point even-indexed elements
161*12c85518Srobert /// stored at memory locations starting at location \a __A to packed
162*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
163*12c85518Srobert /// \a dst.
164*12c85518Srobert ///
165*12c85518Srobert /// \headerfile <x86intrin.h>
166*12c85518Srobert ///
167*12c85518Srobert /// \code
168*12c85518Srobert /// _mm_cvtneebf16_ps(const __m128bh *__A);
169*12c85518Srobert /// \endcode
170*12c85518Srobert ///
171*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
172*12c85518Srobert ///
173*12c85518Srobert /// \param __A
174*12c85518Srobert ///    A pointer to a 128-bit memory location containing 8 consecutive
175*12c85518Srobert ///    BF16 (16-bit) floating-point values.
176*12c85518Srobert /// \returns
177*12c85518Srobert ///    A 128-bit vector of [4 x float].
178*12c85518Srobert ///
179*12c85518Srobert /// \code{.operation}
180*12c85518Srobert /// FOR j := 0 to 3
181*12c85518Srobert /// 	k := j*2
182*12c85518Srobert /// 	i := k*16
183*12c85518Srobert /// 	m := j*32
184*12c85518Srobert /// 	dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
185*12c85518Srobert /// ENDFOR
186*12c85518Srobert /// dst[MAX:128] := 0
187*12c85518Srobert /// \endcode
188*12c85518Srobert static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneebf16_ps(const __m128bh * __A)189*12c85518Srobert _mm_cvtneebf16_ps(const __m128bh *__A) {
190*12c85518Srobert   return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
191*12c85518Srobert }
192*12c85518Srobert 
193*12c85518Srobert /// Convert packed BF16 (16-bit) floating-point even-indexed elements
194*12c85518Srobert /// stored at memory locations starting at location \a __A to packed
195*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
196*12c85518Srobert /// \a dst.
197*12c85518Srobert ///
198*12c85518Srobert /// \headerfile <x86intrin.h>
199*12c85518Srobert ///
200*12c85518Srobert /// \code
201*12c85518Srobert /// _mm256_cvtneebf16_ps(const __m256bh *__A);
202*12c85518Srobert /// \endcode
203*12c85518Srobert ///
204*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
205*12c85518Srobert ///
206*12c85518Srobert /// \param __A
207*12c85518Srobert ///    A pointer to a 256-bit memory location containing 16 consecutive
208*12c85518Srobert ///    BF16 (16-bit) floating-point values.
209*12c85518Srobert /// \returns
210*12c85518Srobert ///    A 256-bit vector of [8 x float].
211*12c85518Srobert ///
212*12c85518Srobert /// \code{.operation}
213*12c85518Srobert /// FOR j := 0 to 7
214*12c85518Srobert /// 	k := j*2
215*12c85518Srobert /// 	i := k*16
216*12c85518Srobert /// 	m := j*32
217*12c85518Srobert /// 	dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
218*12c85518Srobert /// ENDFOR
219*12c85518Srobert /// dst[MAX:256] := 0
220*12c85518Srobert /// \endcode
221*12c85518Srobert static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneebf16_ps(const __m256bh * __A)222*12c85518Srobert _mm256_cvtneebf16_ps(const __m256bh *__A) {
223*12c85518Srobert   return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
224*12c85518Srobert }
225*12c85518Srobert 
226*12c85518Srobert /// Convert packed half-precision (16-bit) floating-point even-indexed elements
227*12c85518Srobert /// stored at memory locations starting at location \a __A to packed
228*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
229*12c85518Srobert /// \a dst.
230*12c85518Srobert ///
231*12c85518Srobert /// \headerfile <x86intrin.h>
232*12c85518Srobert ///
233*12c85518Srobert /// \code
234*12c85518Srobert /// _mm_cvtneeph_ps(const __m128h *__A);
235*12c85518Srobert /// \endcode
236*12c85518Srobert ///
237*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
238*12c85518Srobert ///
239*12c85518Srobert /// \param __A
240*12c85518Srobert ///    A pointer to a 128-bit memory location containing 8 consecutive
241*12c85518Srobert ///    half-precision (16-bit) floating-point values.
242*12c85518Srobert /// \returns
243*12c85518Srobert ///    A 128-bit vector of [4 x float].
244*12c85518Srobert ///
245*12c85518Srobert /// \code{.operation}
246*12c85518Srobert /// FOR j := 0 to 3
247*12c85518Srobert /// 	k := j*2
248*12c85518Srobert /// 	i := k*16
249*12c85518Srobert /// 	m := j*32
250*12c85518Srobert /// 	dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
251*12c85518Srobert /// ENDFOR
252*12c85518Srobert /// dst[MAX:128] := 0
253*12c85518Srobert /// \endcode
254*12c85518Srobert static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneeph_ps(const __m128h * __A)255*12c85518Srobert _mm_cvtneeph_ps(const __m128h *__A) {
256*12c85518Srobert   return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
257*12c85518Srobert }
258*12c85518Srobert 
259*12c85518Srobert /// Convert packed half-precision (16-bit) floating-point even-indexed elements
260*12c85518Srobert /// stored at memory locations starting at location \a __A to packed
261*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
262*12c85518Srobert /// \a dst.
263*12c85518Srobert ///
264*12c85518Srobert /// \headerfile <x86intrin.h>
265*12c85518Srobert ///
266*12c85518Srobert /// \code
267*12c85518Srobert /// _mm256_cvtneeph_ps(const __m256h *__A);
268*12c85518Srobert /// \endcode
269*12c85518Srobert ///
270*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
271*12c85518Srobert ///
272*12c85518Srobert /// \param __A
273*12c85518Srobert ///    A pointer to a 256-bit memory location containing 16 consecutive
274*12c85518Srobert ///    half-precision (16-bit) floating-point values.
275*12c85518Srobert /// \returns
276*12c85518Srobert ///    A 256-bit vector of [8 x float].
277*12c85518Srobert ///
278*12c85518Srobert /// \code{.operation}
279*12c85518Srobert /// FOR j := 0 to 7
280*12c85518Srobert /// 	k := j*2
281*12c85518Srobert /// 	i := k*16
282*12c85518Srobert /// 	m := j*32
283*12c85518Srobert /// 	dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
284*12c85518Srobert /// ENDFOR
285*12c85518Srobert /// dst[MAX:256] := 0
286*12c85518Srobert /// \endcode
287*12c85518Srobert static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneeph_ps(const __m256h * __A)288*12c85518Srobert _mm256_cvtneeph_ps(const __m256h *__A) {
289*12c85518Srobert   return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
290*12c85518Srobert }
291*12c85518Srobert 
292*12c85518Srobert /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
293*12c85518Srobert /// stored at memory locations starting at location \a __A to packed
294*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
295*12c85518Srobert /// \a dst.
296*12c85518Srobert ///
297*12c85518Srobert /// \headerfile <x86intrin.h>
298*12c85518Srobert ///
299*12c85518Srobert /// \code
300*12c85518Srobert /// _mm_cvtneobf16_ps(const __m128bh *__A);
301*12c85518Srobert /// \endcode
302*12c85518Srobert ///
303*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
304*12c85518Srobert ///
305*12c85518Srobert /// \param __A
306*12c85518Srobert ///    A pointer to a 128-bit memory location containing 8 consecutive
307*12c85518Srobert ///    BF16 (16-bit) floating-point values.
308*12c85518Srobert /// \returns
309*12c85518Srobert ///    A 128-bit vector of [4 x float].
310*12c85518Srobert ///
311*12c85518Srobert /// \code{.operation}
312*12c85518Srobert /// FOR j := 0 to 3
313*12c85518Srobert /// 	k := j*2+1
314*12c85518Srobert /// 	i := k*16
315*12c85518Srobert /// 	m := j*32
316*12c85518Srobert /// 	dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
317*12c85518Srobert /// ENDFOR
318*12c85518Srobert /// dst[MAX:128] := 0
319*12c85518Srobert /// \endcode
320*12c85518Srobert static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneobf16_ps(const __m128bh * __A)321*12c85518Srobert _mm_cvtneobf16_ps(const __m128bh *__A) {
322*12c85518Srobert   return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
323*12c85518Srobert }
324*12c85518Srobert 
325*12c85518Srobert /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
326*12c85518Srobert /// stored at memory locations starting at location \a __A to packed
327*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
328*12c85518Srobert /// \a dst.
329*12c85518Srobert ///
330*12c85518Srobert /// \headerfile <x86intrin.h>
331*12c85518Srobert ///
332*12c85518Srobert /// \code
333*12c85518Srobert /// _mm256_cvtneobf16_ps(const __m256bh *__A);
334*12c85518Srobert /// \endcode
335*12c85518Srobert ///
336*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
337*12c85518Srobert ///
338*12c85518Srobert /// \param __A
339*12c85518Srobert ///    A pointer to a 256-bit memory location containing 16 consecutive
340*12c85518Srobert ///    BF16 (16-bit) floating-point values.
341*12c85518Srobert /// \returns
342*12c85518Srobert ///    A 256-bit vector of [8 x float].
343*12c85518Srobert ///
344*12c85518Srobert /// \code{.operation}
345*12c85518Srobert /// FOR j := 0 to 7
346*12c85518Srobert /// 	k := j*2+1
347*12c85518Srobert /// 	i := k*16
348*12c85518Srobert /// 	m := j*32
349*12c85518Srobert /// 	dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
350*12c85518Srobert /// ENDFOR
351*12c85518Srobert /// dst[MAX:256] := 0
352*12c85518Srobert /// \endcode
353*12c85518Srobert static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneobf16_ps(const __m256bh * __A)354*12c85518Srobert _mm256_cvtneobf16_ps(const __m256bh *__A) {
355*12c85518Srobert   return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
356*12c85518Srobert }
357*12c85518Srobert 
358*12c85518Srobert /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
359*12c85518Srobert /// stored at memory locations starting at location \a __A to packed
360*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
361*12c85518Srobert /// \a dst.
362*12c85518Srobert ///
363*12c85518Srobert /// \headerfile <x86intrin.h>
364*12c85518Srobert ///
365*12c85518Srobert /// \code
366*12c85518Srobert /// _mm_cvtneoph_ps(const __m128h *__A);
367*12c85518Srobert /// \endcode
368*12c85518Srobert ///
369*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
370*12c85518Srobert ///
371*12c85518Srobert /// \param __A
372*12c85518Srobert ///    A pointer to a 128-bit memory location containing 8 consecutive
373*12c85518Srobert ///    half-precision (16-bit) floating-point values.
374*12c85518Srobert /// \returns
375*12c85518Srobert ///    A 128-bit vector of [4 x float].
376*12c85518Srobert ///
377*12c85518Srobert /// \code{.operation}
378*12c85518Srobert /// FOR j := 0 to 3
379*12c85518Srobert /// 	k := j*2+1
380*12c85518Srobert /// 	i := k*16
381*12c85518Srobert /// 	m := j*32
382*12c85518Srobert /// 	dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
383*12c85518Srobert /// ENDFOR
384*12c85518Srobert /// dst[MAX:128] := 0
385*12c85518Srobert /// \endcode
386*12c85518Srobert static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneoph_ps(const __m128h * __A)387*12c85518Srobert _mm_cvtneoph_ps(const __m128h *__A) {
388*12c85518Srobert   return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
389*12c85518Srobert }
390*12c85518Srobert 
391*12c85518Srobert /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
392*12c85518Srobert /// stored at memory locations starting at location \a __A to packed
393*12c85518Srobert /// single-precision (32-bit) floating-point elements, and store the results in
394*12c85518Srobert /// \a dst.
395*12c85518Srobert ///
396*12c85518Srobert /// \headerfile <x86intrin.h>
397*12c85518Srobert ///
398*12c85518Srobert /// \code
399*12c85518Srobert /// _mm256_cvtneoph_ps(const __m256h *__A);
400*12c85518Srobert /// \endcode
401*12c85518Srobert ///
402*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
403*12c85518Srobert ///
404*12c85518Srobert /// \param __A
405*12c85518Srobert ///    A pointer to a 256-bit memory location containing 16 consecutive
406*12c85518Srobert ///    half-precision (16-bit) floating-point values.
407*12c85518Srobert /// \returns
408*12c85518Srobert ///    A 256-bit vector of [8 x float].
409*12c85518Srobert ///
410*12c85518Srobert /// \code{.operation}
411*12c85518Srobert /// FOR j := 0 to 7
412*12c85518Srobert /// 	k := j*2+1
413*12c85518Srobert /// 	i := k*16
414*12c85518Srobert /// 	m := j*32
415*12c85518Srobert /// 	dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
416*12c85518Srobert /// ENDFOR
417*12c85518Srobert /// dst[MAX:256] := 0
418*12c85518Srobert /// \endcode
419*12c85518Srobert static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneoph_ps(const __m256h * __A)420*12c85518Srobert _mm256_cvtneoph_ps(const __m256h *__A) {
421*12c85518Srobert   return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
422*12c85518Srobert }
423*12c85518Srobert 
424*12c85518Srobert /// Convert packed single-precision (32-bit) floating-point elements in \a __A
425*12c85518Srobert /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
426*12c85518Srobert /// dst.
427*12c85518Srobert ///
428*12c85518Srobert /// \headerfile <x86intrin.h>
429*12c85518Srobert ///
430*12c85518Srobert /// \code
431*12c85518Srobert /// _mm_cvtneps_avx_pbh(__m128 __A);
432*12c85518Srobert /// \endcode
433*12c85518Srobert ///
434*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
435*12c85518Srobert ///
436*12c85518Srobert /// \param __A
437*12c85518Srobert ///    A 128-bit vector of [4 x float].
438*12c85518Srobert /// \returns
439*12c85518Srobert ///    A 128-bit vector of [8 x bfloat].
440*12c85518Srobert ///
441*12c85518Srobert /// \code{.operation}
442*12c85518Srobert /// FOR j := 0 to 3
443*12c85518Srobert /// 	dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
444*12c85518Srobert /// ENDFOR
445*12c85518Srobert /// dst[MAX:128] := 0
446*12c85518Srobert /// \endcode
447*12c85518Srobert static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_avx_pbh(__m128 __A)448*12c85518Srobert _mm_cvtneps_avx_pbh(__m128 __A) {
449*12c85518Srobert   return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
450*12c85518Srobert }
451*12c85518Srobert 
452*12c85518Srobert /// Convert packed single-precision (32-bit) floating-point elements in \a __A
453*12c85518Srobert /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
454*12c85518Srobert /// dst.
455*12c85518Srobert ///
456*12c85518Srobert /// \headerfile <x86intrin.h>
457*12c85518Srobert ///
458*12c85518Srobert /// \code
459*12c85518Srobert /// _mm256_cvtneps_avx_pbh(__m256 __A);
460*12c85518Srobert /// \endcode
461*12c85518Srobert ///
462*12c85518Srobert /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
463*12c85518Srobert ///
464*12c85518Srobert /// \param __A
465*12c85518Srobert ///    A 256-bit vector of [8 x float].
466*12c85518Srobert /// \returns
467*12c85518Srobert ///    A 128-bit vector of [8 x bfloat].
468*12c85518Srobert ///
469*12c85518Srobert /// \code{.operation}
470*12c85518Srobert /// FOR j := 0 to 7
471*12c85518Srobert /// 	dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
472*12c85518Srobert /// ENDFOR
473*12c85518Srobert /// dst[MAX:128] := 0
474*12c85518Srobert /// \endcode
475*12c85518Srobert static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_avx_pbh(__m256 __A)476*12c85518Srobert _mm256_cvtneps_avx_pbh(__m256 __A) {
477*12c85518Srobert   return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
478*12c85518Srobert }
479*12c85518Srobert 
480*12c85518Srobert #undef __DEFAULT_FN_ATTRS128
481*12c85518Srobert #undef __DEFAULT_FN_ATTRS256
482*12c85518Srobert 
483*12c85518Srobert #endif // __AVXNECONVERTINTRIN_H
484*12c85518Srobert #endif // __SSE2__
485