/* xref: /freebsd-src/contrib/llvm-project/clang/lib/Headers/avxvnniintrin.h (revision 81ad626541db97eb356e2c1d4a20eb2a26a766ab) */
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
24e8d8bef9SDimitry Andric #ifndef __IMMINTRIN_H
25e8d8bef9SDimitry Andric #error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
26e8d8bef9SDimitry Andric #endif
27e8d8bef9SDimitry Andric 
28e8d8bef9SDimitry Andric #ifndef __AVXVNNIINTRIN_H
29e8d8bef9SDimitry Andric #define __AVXVNNIINTRIN_H
30e8d8bef9SDimitry Andric 
31e8d8bef9SDimitry Andric /* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
32e8d8bef9SDimitry Andric /// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
33e8d8bef9SDimitry Andric /// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
34e8d8bef9SDimitry Andric /// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
35e8d8bef9SDimitry Andric /// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
36e8d8bef9SDimitry Andric /// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
37e8d8bef9SDimitry Andric /// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
38e8d8bef9SDimitry Andric /// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
39e8d8bef9SDimitry Andric /// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
40e8d8bef9SDimitry Andric 
41e8d8bef9SDimitry Andric /* Intrinsics with _avx_ prefix are for compatibility with msvc. */
42e8d8bef9SDimitry Andric /* Define the default attributes for the functions in this file. */
43e8d8bef9SDimitry Andric #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
44e8d8bef9SDimitry Andric #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
45e8d8bef9SDimitry Andric 
46e8d8bef9SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
47e8d8bef9SDimitry Andric /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
48e8d8bef9SDimitry Andric /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
49e8d8bef9SDimitry Andric /// in \a __S, and store the packed 32-bit results in DST.
50e8d8bef9SDimitry Andric ///
51e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
52e8d8bef9SDimitry Andric ///
53*81ad6265SDimitry Andric /// \code{.operation}
54e8d8bef9SDimitry Andric ///    FOR j := 0 to 7
55e8d8bef9SDimitry Andric ///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
56e8d8bef9SDimitry Andric ///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
57e8d8bef9SDimitry Andric ///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
58e8d8bef9SDimitry Andric ///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
59e8d8bef9SDimitry Andric ///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
60e8d8bef9SDimitry Andric ///    ENDFOR
61e8d8bef9SDimitry Andric ///    DST[MAX:256] := 0
62*81ad6265SDimitry Andric /// \endcode
63e8d8bef9SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_avx_epi32(__m256i __S,__m256i __A,__m256i __B)64e8d8bef9SDimitry Andric _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
65e8d8bef9SDimitry Andric {
66e8d8bef9SDimitry Andric   return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
67e8d8bef9SDimitry Andric }
68e8d8bef9SDimitry Andric 
69e8d8bef9SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
70e8d8bef9SDimitry Andric /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
71e8d8bef9SDimitry Andric /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
72e8d8bef9SDimitry Andric /// in \a __S using signed saturation, and store the packed 32-bit results in DST.
73e8d8bef9SDimitry Andric ///
74e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
75e8d8bef9SDimitry Andric ///
76*81ad6265SDimitry Andric /// \code{.operation}
77e8d8bef9SDimitry Andric ///    FOR j := 0 to 7
78e8d8bef9SDimitry Andric ///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
79e8d8bef9SDimitry Andric ///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
80e8d8bef9SDimitry Andric ///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
81e8d8bef9SDimitry Andric ///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
82e8d8bef9SDimitry Andric ///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
83e8d8bef9SDimitry Andric ///    ENDFOR
84e8d8bef9SDimitry Andric ///    DST[MAX:256] := 0
85*81ad6265SDimitry Andric /// \endcode
86e8d8bef9SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_avx_epi32(__m256i __S,__m256i __A,__m256i __B)87e8d8bef9SDimitry Andric _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
88e8d8bef9SDimitry Andric {
89e8d8bef9SDimitry Andric   return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
90e8d8bef9SDimitry Andric }
91e8d8bef9SDimitry Andric 
92e8d8bef9SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
93e8d8bef9SDimitry Andric /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
94e8d8bef9SDimitry Andric /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
95e8d8bef9SDimitry Andric ///  and store the packed 32-bit results in DST.
96e8d8bef9SDimitry Andric ///
97e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
98e8d8bef9SDimitry Andric ///
99*81ad6265SDimitry Andric /// \code{.operation}
100e8d8bef9SDimitry Andric ///    FOR j := 0 to 7
101e8d8bef9SDimitry Andric ///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
102e8d8bef9SDimitry Andric ///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
103e8d8bef9SDimitry Andric ///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
104e8d8bef9SDimitry Andric ///    ENDFOR
105e8d8bef9SDimitry Andric ///    DST[MAX:256] := 0
106*81ad6265SDimitry Andric /// \endcode
107e8d8bef9SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_avx_epi32(__m256i __S,__m256i __A,__m256i __B)108e8d8bef9SDimitry Andric _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
109e8d8bef9SDimitry Andric {
110e8d8bef9SDimitry Andric   return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
111e8d8bef9SDimitry Andric }
112e8d8bef9SDimitry Andric 
113e8d8bef9SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
114e8d8bef9SDimitry Andric /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
115e8d8bef9SDimitry Andric /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
116e8d8bef9SDimitry Andric /// using signed saturation, and store the packed 32-bit results in DST.
117e8d8bef9SDimitry Andric ///
118e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
119e8d8bef9SDimitry Andric ///
120*81ad6265SDimitry Andric /// \code{.operation}
121e8d8bef9SDimitry Andric ///    FOR j := 0 to 7
122e8d8bef9SDimitry Andric ///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
123e8d8bef9SDimitry Andric ///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
124e8d8bef9SDimitry Andric ///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
125e8d8bef9SDimitry Andric ///    ENDFOR
126e8d8bef9SDimitry Andric ///    DST[MAX:256] := 0
127*81ad6265SDimitry Andric /// \endcode
128e8d8bef9SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_avx_epi32(__m256i __S,__m256i __A,__m256i __B)129e8d8bef9SDimitry Andric _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
130e8d8bef9SDimitry Andric {
131e8d8bef9SDimitry Andric   return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
132e8d8bef9SDimitry Andric }
133e8d8bef9SDimitry Andric 
134e8d8bef9SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
135e8d8bef9SDimitry Andric /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
136e8d8bef9SDimitry Andric /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
137e8d8bef9SDimitry Andric /// in \a __S, and store the packed 32-bit results in DST.
138e8d8bef9SDimitry Andric ///
139e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
140e8d8bef9SDimitry Andric ///
141*81ad6265SDimitry Andric /// \code{.operation}
142e8d8bef9SDimitry Andric ///    FOR j := 0 to 3
143e8d8bef9SDimitry Andric ///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
144e8d8bef9SDimitry Andric ///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
145e8d8bef9SDimitry Andric ///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
146e8d8bef9SDimitry Andric ///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
147e8d8bef9SDimitry Andric ///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
148e8d8bef9SDimitry Andric ///    ENDFOR
149e8d8bef9SDimitry Andric ///    DST[MAX:128] := 0
150*81ad6265SDimitry Andric /// \endcode
151e8d8bef9SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_avx_epi32(__m128i __S,__m128i __A,__m128i __B)152e8d8bef9SDimitry Andric _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
153e8d8bef9SDimitry Andric {
154e8d8bef9SDimitry Andric   return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
155e8d8bef9SDimitry Andric }
156e8d8bef9SDimitry Andric 
157e8d8bef9SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
158e8d8bef9SDimitry Andric /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
159e8d8bef9SDimitry Andric /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
160e8d8bef9SDimitry Andric /// in \a __S using signed saturation, and store the packed 32-bit results in DST.
161e8d8bef9SDimitry Andric ///
162e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
163e8d8bef9SDimitry Andric ///
164*81ad6265SDimitry Andric /// \code{.operation}
165e8d8bef9SDimitry Andric ///    FOR j := 0 to 3
166e8d8bef9SDimitry Andric ///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
167e8d8bef9SDimitry Andric ///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
168e8d8bef9SDimitry Andric ///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
169e8d8bef9SDimitry Andric ///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
170e8d8bef9SDimitry Andric ///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
171e8d8bef9SDimitry Andric ///    ENDFOR
172e8d8bef9SDimitry Andric ///    DST[MAX:128] := 0
173*81ad6265SDimitry Andric /// \endcode
174e8d8bef9SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_avx_epi32(__m128i __S,__m128i __A,__m128i __B)175e8d8bef9SDimitry Andric _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
176e8d8bef9SDimitry Andric {
177e8d8bef9SDimitry Andric   return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
178e8d8bef9SDimitry Andric }
179e8d8bef9SDimitry Andric 
180e8d8bef9SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
181e8d8bef9SDimitry Andric /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
182e8d8bef9SDimitry Andric /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
183e8d8bef9SDimitry Andric /// and store the packed 32-bit results in DST.
184e8d8bef9SDimitry Andric ///
185e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
186e8d8bef9SDimitry Andric ///
187*81ad6265SDimitry Andric /// \code{.operation}
188e8d8bef9SDimitry Andric ///    FOR j := 0 to 3
189e8d8bef9SDimitry Andric ///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
190e8d8bef9SDimitry Andric ///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
191e8d8bef9SDimitry Andric ///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
192e8d8bef9SDimitry Andric ///    ENDFOR
193e8d8bef9SDimitry Andric ///    DST[MAX:128] := 0
194*81ad6265SDimitry Andric /// \endcode
195e8d8bef9SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_avx_epi32(__m128i __S,__m128i __A,__m128i __B)196e8d8bef9SDimitry Andric _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
197e8d8bef9SDimitry Andric {
198e8d8bef9SDimitry Andric   return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
199e8d8bef9SDimitry Andric }
200e8d8bef9SDimitry Andric 
201e8d8bef9SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
202e8d8bef9SDimitry Andric /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
203e8d8bef9SDimitry Andric /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
204e8d8bef9SDimitry Andric /// using signed saturation, and store the packed 32-bit results in DST.
205e8d8bef9SDimitry Andric ///
206e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
207e8d8bef9SDimitry Andric ///
208*81ad6265SDimitry Andric /// \code{.operation}
209e8d8bef9SDimitry Andric ///    FOR j := 0 to 3
210e8d8bef9SDimitry Andric ///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
211e8d8bef9SDimitry Andric ///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
212e8d8bef9SDimitry Andric ///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
213e8d8bef9SDimitry Andric ///    ENDFOR
214e8d8bef9SDimitry Andric ///    DST[MAX:128] := 0
215*81ad6265SDimitry Andric /// \endcode
216e8d8bef9SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_avx_epi32(__m128i __S,__m128i __A,__m128i __B)217e8d8bef9SDimitry Andric _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
218e8d8bef9SDimitry Andric {
219e8d8bef9SDimitry Andric   return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
220e8d8bef9SDimitry Andric }
221e8d8bef9SDimitry Andric 
222e8d8bef9SDimitry Andric #undef __DEFAULT_FN_ATTRS128
223e8d8bef9SDimitry Andric #undef __DEFAULT_FN_ATTRS256
224e8d8bef9SDimitry Andric 
225e8d8bef9SDimitry Andric #endif // __AVXVNNIINTRIN_H
226