/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9*7330f729Sjoerg
10*7330f729Sjoerg #if !defined __IMMINTRIN_H
11*7330f729Sjoerg #error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
12*7330f729Sjoerg #endif
13*7330f729Sjoerg
14*7330f729Sjoerg #ifndef __F16CINTRIN_H
15*7330f729Sjoerg #define __F16CINTRIN_H
16*7330f729Sjoerg
/* Default attribute sets applied to every inline function in this file.
 * Both force inlining, suppress debug info and enable the "f16c" target
 * feature for the function; they differ only in the minimum vector width
 * requested (128 or 256 bits). Both are #undef'd again at the end of the
 * header. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("f16c"),          \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("f16c"),          \
                 __min_vector_width__(256)))
22*7330f729Sjoerg
/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
 * but that's because icc can emulate these without f16c using a library call.
 * Since we don't do that, let's leave these in f16cintrin.h.
 */
27*7330f729Sjoerg
28*7330f729Sjoerg /// Converts a 16-bit half-precision float value into a 32-bit float
29*7330f729Sjoerg /// value.
30*7330f729Sjoerg ///
31*7330f729Sjoerg /// \headerfile <x86intrin.h>
32*7330f729Sjoerg ///
33*7330f729Sjoerg /// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
34*7330f729Sjoerg ///
35*7330f729Sjoerg /// \param __a
36*7330f729Sjoerg /// A 16-bit half-precision float value.
37*7330f729Sjoerg /// \returns The converted 32-bit float value.
38*7330f729Sjoerg static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)39*7330f729Sjoerg _cvtsh_ss(unsigned short __a)
40*7330f729Sjoerg {
41*7330f729Sjoerg __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
42*7330f729Sjoerg __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
43*7330f729Sjoerg return __r[0];
44*7330f729Sjoerg }
45*7330f729Sjoerg
/// Converts a 32-bit single-precision float value to a 16-bit
///    half-precision float value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _cvtss_sh(float a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// The value is placed in element 0 of a 4-element float vector whose other
/// elements are zero; element 0 of the converted vector is returned.
///
/// \param a
///    A 32-bit single-precision float value to be converted to a 16-bit
///    half-precision float value.
/// \param imm
///    An immediate value controlling rounding using bits [2:0]: \n
///    000: Nearest \n
///    001: Down \n
///    010: Up \n
///    011: Truncate \n
///    1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
/* The whole expansion and the `a` argument are parenthesized so the macro
 * behaves as a single primary expression at the use site (e.g. under sizeof
 * or next to a postfix operator) and so an argument containing a top-level
 * comma cannot spill into the other initializer slots. */
#define _cvtss_sh(a, imm) \
  ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){(a), 0, 0, 0}, \
                                                      (imm)))[0]))
71*7330f729Sjoerg
/// Converts a 128-bit vector containing 32-bit float values into a
///    128-bit vector containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
///    A 128-bit vector containing 32-bit float values.
/// \param imm
///    An immediate value controlling rounding using bits [2:0]: \n
///    000: Nearest \n
///    001: Down \n
///    010: Up \n
///    011: Truncate \n
///    1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing converted 16-bit half-precision float
///    values. The lower 64 bits are used to store the converted 16-bit
///    half-precision floating-point values.
/* The expansion is wrapped in parentheses so that a postfix operator written
 * after the macro call (for example a subscript) applies to the __m128i
 * result rather than to the builtin call underneath the cast. */
#define _mm_cvtps_ph(a, imm) \
  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
97*7330f729Sjoerg
98*7330f729Sjoerg /// Converts a 128-bit vector containing 16-bit half-precision float
99*7330f729Sjoerg /// values into a 128-bit vector containing 32-bit float values.
100*7330f729Sjoerg ///
101*7330f729Sjoerg /// \headerfile <x86intrin.h>
102*7330f729Sjoerg ///
103*7330f729Sjoerg /// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
104*7330f729Sjoerg ///
105*7330f729Sjoerg /// \param __a
106*7330f729Sjoerg /// A 128-bit vector containing 16-bit half-precision float values. The lower
107*7330f729Sjoerg /// 64 bits are used in the conversion.
108*7330f729Sjoerg /// \returns A 128-bit vector of [4 x float] containing converted float values.
109*7330f729Sjoerg static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)110*7330f729Sjoerg _mm_cvtph_ps(__m128i __a)
111*7330f729Sjoerg {
112*7330f729Sjoerg return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
113*7330f729Sjoerg }
114*7330f729Sjoerg
/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
///    containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
///    A 256-bit vector containing 32-bit single-precision float values to be
///    converted to 16-bit half-precision float values.
/// \param imm
///    An immediate value controlling rounding using bits [2:0]: \n
///    000: Nearest \n
///    001: Down \n
///    010: Up \n
///    011: Truncate \n
///    1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision
///    float values.
/* The expansion is wrapped in parentheses so that a postfix operator written
 * after the macro call (for example a subscript) applies to the __m128i
 * result rather than to the builtin call underneath the cast. */
#define _mm256_cvtps_ph(a, imm) \
  ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
140*7330f729Sjoerg
141*7330f729Sjoerg /// Converts a 128-bit vector containing 16-bit half-precision float
142*7330f729Sjoerg /// values into a 256-bit vector of [8 x float].
143*7330f729Sjoerg ///
144*7330f729Sjoerg /// \headerfile <x86intrin.h>
145*7330f729Sjoerg ///
146*7330f729Sjoerg /// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
147*7330f729Sjoerg ///
148*7330f729Sjoerg /// \param __a
149*7330f729Sjoerg /// A 128-bit vector containing 16-bit half-precision float values to be
150*7330f729Sjoerg /// converted to 32-bit single-precision float values.
151*7330f729Sjoerg /// \returns A vector of [8 x float] containing the converted 32-bit
152*7330f729Sjoerg /// single-precision float values.
153*7330f729Sjoerg static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)154*7330f729Sjoerg _mm256_cvtph_ps(__m128i __a)
155*7330f729Sjoerg {
156*7330f729Sjoerg return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
157*7330f729Sjoerg }
158*7330f729Sjoerg
159*7330f729Sjoerg #undef __DEFAULT_FN_ATTRS128
160*7330f729Sjoerg #undef __DEFAULT_FN_ATTRS256
161*7330f729Sjoerg
162*7330f729Sjoerg #endif /* __F16CINTRIN_H */
163