1*bdd1243dSDimitry Andric /*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
2*bdd1243dSDimitry Andric *
3*bdd1243dSDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*bdd1243dSDimitry Andric * See https://llvm.org/LICENSE.txt for license information.
5*bdd1243dSDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*bdd1243dSDimitry Andric *
7*bdd1243dSDimitry Andric *===-----------------------------------------------------------------------===
8*bdd1243dSDimitry Andric */
9*bdd1243dSDimitry Andric
10*bdd1243dSDimitry Andric #ifndef __IMMINTRIN_H
11*bdd1243dSDimitry Andric #error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
12*bdd1243dSDimitry Andric #endif
13*bdd1243dSDimitry Andric
14*bdd1243dSDimitry Andric #ifndef __AVXIFMAINTRIN_H
15*bdd1243dSDimitry Andric #define __AVXIFMAINTRIN_H
16*bdd1243dSDimitry Andric
17*bdd1243dSDimitry Andric /* Define the default attributes for the functions in this file. */
18*bdd1243dSDimitry Andric #define __DEFAULT_FN_ATTRS128 \
19*bdd1243dSDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
20*bdd1243dSDimitry Andric __min_vector_width__(128)))
21*bdd1243dSDimitry Andric #define __DEFAULT_FN_ATTRS256 \
22*bdd1243dSDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
23*bdd1243dSDimitry Andric __min_vector_width__(256)))
24*bdd1243dSDimitry Andric
// These intrinsics must use VEX encoding (AVX-IFMA), not EVEX.
26*bdd1243dSDimitry Andric
27*bdd1243dSDimitry Andric /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
28*bdd1243dSDimitry Andric /// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
29*bdd1243dSDimitry Andric /// unsigned integer from the intermediate result with the corresponding
30*bdd1243dSDimitry Andric /// unsigned 64-bit integer in \a __X, and store the results in \a dst.
31*bdd1243dSDimitry Andric ///
32*bdd1243dSDimitry Andric /// \headerfile <immintrin.h>
33*bdd1243dSDimitry Andric ///
34*bdd1243dSDimitry Andric /// \code
35*bdd1243dSDimitry Andric /// __m128i
36*bdd1243dSDimitry Andric /// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
37*bdd1243dSDimitry Andric /// \endcode
38*bdd1243dSDimitry Andric ///
39*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
40*bdd1243dSDimitry Andric ///
41*bdd1243dSDimitry Andric /// \return
42*bdd1243dSDimitry Andric /// return __m128i dst.
43*bdd1243dSDimitry Andric /// \param __X
44*bdd1243dSDimitry Andric /// A 128-bit vector of [2 x i64]
45*bdd1243dSDimitry Andric /// \param __Y
46*bdd1243dSDimitry Andric /// A 128-bit vector of [2 x i64]
47*bdd1243dSDimitry Andric /// \param __Z
48*bdd1243dSDimitry Andric /// A 128-bit vector of [2 x i64]
49*bdd1243dSDimitry Andric ///
50*bdd1243dSDimitry Andric /// \code{.operation}
51*bdd1243dSDimitry Andric /// FOR j := 0 to 1
52*bdd1243dSDimitry Andric /// i := j*64
53*bdd1243dSDimitry Andric /// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
54*bdd1243dSDimitry Andric /// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
55*bdd1243dSDimitry Andric /// ENDFOR
56*bdd1243dSDimitry Andric /// dst[MAX:128] := 0
57*bdd1243dSDimitry Andric /// \endcode
58*bdd1243dSDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52hi_avx_epu64(__m128i __X,__m128i __Y,__m128i __Z)59*bdd1243dSDimitry Andric _mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
60*bdd1243dSDimitry Andric return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
61*bdd1243dSDimitry Andric (__v2di)__Z);
62*bdd1243dSDimitry Andric }
63*bdd1243dSDimitry Andric
64*bdd1243dSDimitry Andric /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
65*bdd1243dSDimitry Andric /// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
66*bdd1243dSDimitry Andric /// unsigned integer from the intermediate result with the corresponding
67*bdd1243dSDimitry Andric /// unsigned 64-bit integer in \a __X, and store the results in \a dst.
68*bdd1243dSDimitry Andric ///
69*bdd1243dSDimitry Andric /// \headerfile <immintrin.h>
70*bdd1243dSDimitry Andric ///
71*bdd1243dSDimitry Andric /// \code
72*bdd1243dSDimitry Andric /// __m256i
73*bdd1243dSDimitry Andric /// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
74*bdd1243dSDimitry Andric /// \endcode
75*bdd1243dSDimitry Andric ///
76*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
77*bdd1243dSDimitry Andric ///
78*bdd1243dSDimitry Andric /// \return
79*bdd1243dSDimitry Andric /// return __m256i dst.
80*bdd1243dSDimitry Andric /// \param __X
81*bdd1243dSDimitry Andric /// A 256-bit vector of [4 x i64]
82*bdd1243dSDimitry Andric /// \param __Y
83*bdd1243dSDimitry Andric /// A 256-bit vector of [4 x i64]
84*bdd1243dSDimitry Andric /// \param __Z
85*bdd1243dSDimitry Andric /// A 256-bit vector of [4 x i64]
86*bdd1243dSDimitry Andric ///
87*bdd1243dSDimitry Andric /// \code{.operation}
88*bdd1243dSDimitry Andric /// FOR j := 0 to 3
89*bdd1243dSDimitry Andric /// i := j*64
90*bdd1243dSDimitry Andric /// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
91*bdd1243dSDimitry Andric /// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
92*bdd1243dSDimitry Andric /// ENDFOR
93*bdd1243dSDimitry Andric /// dst[MAX:256] := 0
94*bdd1243dSDimitry Andric /// \endcode
95*bdd1243dSDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52hi_avx_epu64(__m256i __X,__m256i __Y,__m256i __Z)96*bdd1243dSDimitry Andric _mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
97*bdd1243dSDimitry Andric return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
98*bdd1243dSDimitry Andric (__v4di)__Z);
99*bdd1243dSDimitry Andric }
100*bdd1243dSDimitry Andric
101*bdd1243dSDimitry Andric /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
102*bdd1243dSDimitry Andric /// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
103*bdd1243dSDimitry Andric /// unsigned integer from the intermediate result with the corresponding
104*bdd1243dSDimitry Andric /// unsigned 64-bit integer in \a __X, and store the results in \a dst.
105*bdd1243dSDimitry Andric ///
106*bdd1243dSDimitry Andric /// \headerfile <immintrin.h>
107*bdd1243dSDimitry Andric ///
108*bdd1243dSDimitry Andric /// \code
109*bdd1243dSDimitry Andric /// __m128i
110*bdd1243dSDimitry Andric /// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
111*bdd1243dSDimitry Andric /// \endcode
112*bdd1243dSDimitry Andric ///
113*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
114*bdd1243dSDimitry Andric ///
115*bdd1243dSDimitry Andric /// \return
116*bdd1243dSDimitry Andric /// return __m128i dst.
117*bdd1243dSDimitry Andric /// \param __X
118*bdd1243dSDimitry Andric /// A 128-bit vector of [2 x i64]
119*bdd1243dSDimitry Andric /// \param __Y
120*bdd1243dSDimitry Andric /// A 128-bit vector of [2 x i64]
121*bdd1243dSDimitry Andric /// \param __Z
122*bdd1243dSDimitry Andric /// A 128-bit vector of [2 x i64]
123*bdd1243dSDimitry Andric ///
124*bdd1243dSDimitry Andric /// \code{.operation}
125*bdd1243dSDimitry Andric /// FOR j := 0 to 1
126*bdd1243dSDimitry Andric /// i := j*64
127*bdd1243dSDimitry Andric /// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
128*bdd1243dSDimitry Andric /// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
129*bdd1243dSDimitry Andric /// ENDFOR
130*bdd1243dSDimitry Andric /// dst[MAX:128] := 0
131*bdd1243dSDimitry Andric /// \endcode
132*bdd1243dSDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52lo_avx_epu64(__m128i __X,__m128i __Y,__m128i __Z)133*bdd1243dSDimitry Andric _mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
134*bdd1243dSDimitry Andric return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
135*bdd1243dSDimitry Andric (__v2di)__Z);
136*bdd1243dSDimitry Andric }
137*bdd1243dSDimitry Andric
138*bdd1243dSDimitry Andric /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
139*bdd1243dSDimitry Andric /// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
140*bdd1243dSDimitry Andric /// unsigned integer from the intermediate result with the corresponding
141*bdd1243dSDimitry Andric /// unsigned 64-bit integer in \a __X, and store the results in \a dst.
142*bdd1243dSDimitry Andric ///
143*bdd1243dSDimitry Andric /// \headerfile <immintrin.h>
144*bdd1243dSDimitry Andric ///
145*bdd1243dSDimitry Andric /// \code
146*bdd1243dSDimitry Andric /// __m256i
147*bdd1243dSDimitry Andric /// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
148*bdd1243dSDimitry Andric /// \endcode
149*bdd1243dSDimitry Andric ///
150*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
151*bdd1243dSDimitry Andric ///
152*bdd1243dSDimitry Andric /// \return
153*bdd1243dSDimitry Andric /// return __m256i dst.
154*bdd1243dSDimitry Andric /// \param __X
155*bdd1243dSDimitry Andric /// A 256-bit vector of [4 x i64]
156*bdd1243dSDimitry Andric /// \param __Y
157*bdd1243dSDimitry Andric /// A 256-bit vector of [4 x i64]
158*bdd1243dSDimitry Andric /// \param __Z
159*bdd1243dSDimitry Andric /// A 256-bit vector of [4 x i64]
160*bdd1243dSDimitry Andric ///
161*bdd1243dSDimitry Andric /// \code{.operation}
162*bdd1243dSDimitry Andric /// FOR j := 0 to 3
163*bdd1243dSDimitry Andric /// i := j*64
164*bdd1243dSDimitry Andric /// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
165*bdd1243dSDimitry Andric /// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
166*bdd1243dSDimitry Andric /// ENDFOR
167*bdd1243dSDimitry Andric /// dst[MAX:256] := 0
168*bdd1243dSDimitry Andric /// \endcode
169*bdd1243dSDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52lo_avx_epu64(__m256i __X,__m256i __Y,__m256i __Z)170*bdd1243dSDimitry Andric _mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
171*bdd1243dSDimitry Andric return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
172*bdd1243dSDimitry Andric (__v4di)__Z);
173*bdd1243dSDimitry Andric }
174*bdd1243dSDimitry Andric #undef __DEFAULT_FN_ATTRS128
175*bdd1243dSDimitry Andric #undef __DEFAULT_FN_ATTRS256
176*bdd1243dSDimitry Andric
177*bdd1243dSDimitry Andric #endif // __AVXIFMAINTRIN_H
178