xref: /llvm-project/clang/lib/Headers/amxavx512intrin.h (revision 48803bc8c7be25745a0e623e6753261c07281b06)
1 /*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===------------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
11 #endif // __IMMINTRIN_H
12 
13 #ifndef __AMX_AVX512INTRIN_H
14 #define __AMX_AVX512INTRIN_H
15 #if defined(__x86_64__) && defined(__SSE2__)
16 
// Attributes applied to every function defined in this header: force
// inlining, omit debug info, and enable the amx-avx512 and avx10.2-512
// target features for the function body.
#define __DEFAULT_FN_ATTRS_AVX512                                              \
  __attribute__((__always_inline__)) __attribute__((__nodebug__))              \
  __attribute__((__target__("amx-avx512,avx10.2-512")))
20 
/// Moves a row from a tile register to a zmm destination register, converting
///    the int32 source elements to fp32. The row of the tile is selected by a
///    32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512 _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the row of the source tile. Bits 15:0 hold the row index and
///    bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    The destination v16f32 data. Size is 64 Bytes.
#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
54 
/// Moves a row from a tile register to a zmm destination register, converting
///    the fp32 source elements to bf16. It places the resulting bf16 elements
///    in the high 16 bits within each dword. The row of the tile is selected
///    by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512bh _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.word[2*i+0] := 0
///         dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the row of the source tile. Bits 15:0 hold the row index and
///    bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    The destination v32bf16 data. Size is 64 Bytes.
#define _tile_cvtrowps2bf16h(tsrc, row)                                        \
  __builtin_ia32_tcvtrowps2bf16h(tsrc, row)
91 
/// Moves a row from a tile register to a zmm destination register, converting
///    the fp32 source elements to bf16. It places the resulting bf16 elements
///    in the low 16 bits within each dword. The row of the tile is selected
///    by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512bh _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.word[2*i+1] := 0
///         dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the row of the source tile. Bits 15:0 hold the row index and
///    bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    The destination v32bf16 data. Size is 64 Bytes.
#define _tile_cvtrowps2bf16l(tsrc, row)                                        \
  __builtin_ia32_tcvtrowps2bf16l(tsrc, row)
128 
/// Moves a row from a tile register to a zmm destination register, converting
///    the fp32 source elements to fp16. It places the resulting fp16 elements
///    in the high 16 bits within each dword. The row of the tile is selected
///    by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512h _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.word[2*i+0] := 0
///         dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the row of the source tile. Bits 15:0 hold the row index and
///    bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    The destination v32fp16 data. Size is 64 Bytes.
#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)
164 
/// Moves a row from a tile register to a zmm destination register, converting
///    the fp32 source elements to fp16. It places the resulting fp16 elements
///    in the low 16 bits within each dword. The row of the tile is selected
///    by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512h _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.word[2*i+1] := 0
///         dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the row of the source tile. Bits 15:0 hold the row index and
///    bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    The destination v32fp16 data. Size is 64 Bytes.
#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)
200 
/// Moves one row of tile data to a v16i32 destination without converting the
///    data. The row of the tile is selected by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512i _tile_movrow(__tile a, unsigned b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
///
/// \param a
///     The 1st source tile. Max size is 1024 Bytes.
/// \param b
///     The 2nd source r32. Size is 4 Bytes. Bits 15:0 hold the row index and
///     bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///     The destination v16i32 data. Size is 64 Bytes.
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL>>3
/// row_index := b&0xffff
/// row_chunk := ((b>>16)&0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes-1)
///     IF (row_chunk + i >= a.colsb)
///         dst.byte[i] := 0
///     ELSE
///         dst.byte[i] := a.row[row_index].byte[row_chunk+i]
///     FI
/// ENDFOR
/// \endcode
#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b)
232 
233 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
234 
235 static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
236     unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
237   return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
238 }
239 
240 static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
241 _tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n,
242                               _tile1024i src, unsigned u) {
243   return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u);
244 }
245 
246 static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
247 _tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
248                               _tile1024i src, unsigned u) {
249   return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u);
250 }
251 
252 static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
253     unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
254   return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
255 }
256 
257 static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
258     unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
259   return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
260 }
261 
262 static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
263     unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
264   return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
265 }
266 
267 /// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
268 /// elements to fp32. No SIMD exceptions are generated. Rounding is done as if
269 /// MXCSR.RC=RNE. Embedded rounding is not supported.
270 /// The row and chunk elements of tile is fetched from 32bit src1.
271 ///
272 /// \headerfile <immintrin.h>
273 ///
274 /// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
275 ///
276 /// \param src0
277 ///    The 1st source tile. Max size is 1024 Bytes.
278 /// \param src1
279 ///    The 2nd source r32. Size is 4 Bytes.
280 /// \returns
281 ///    The destination v16f32 data. Size is 64 Bytes.
282 __DEFAULT_FN_ATTRS_AVX512
283 static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
284   return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
285 }
286 
287 /// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
288 /// elements to bf16 at high 16-bits of each dword.
289 /// The row and chunk elements of tile is fetched from 32bit src1.
290 ///
291 /// \headerfile <immintrin.h>
292 ///
293 /// This intrinsic corresponds to the <c> TCVTROWPS2BF16H </c> instruction.
294 ///
295 /// \param src0
296 ///    The 1st source tile. Max size is 1024 Bytes.
297 /// \param src1
298 ///    The 2nd source r32. Size is 4 Bytes.
299 /// \returns
300 ///    The destination v32bf16 data. Size is 64 Bytes.
301 __DEFAULT_FN_ATTRS_AVX512
302 static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) {
303   return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1);
304 }
305 
306 /// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
307 /// elements to bf16 at low 16-bits of each dword.
308 /// The row and chunk elements of tile is fetched from 32bit src1.
309 ///
310 /// \headerfile <immintrin.h>
311 ///
312 /// This intrinsic corresponds to the <c> TCVTROWPS2BF16L </c> instruction.
313 ///
314 /// \param src0
315 ///    The 1st source tile. Max size is 1024 Bytes.
316 /// \param src1
317 ///    The 2nd source r32. Size is 4 Bytes.
318 /// \returns
319 ///    The destination v32bf16 data. Size is 64 Bytes.
320 __DEFAULT_FN_ATTRS_AVX512
321 static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) {
322   return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1);
323 }
324 
325 /// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
326 /// elements to fp16 at high 16-bits of each dword.
327 /// The row and chunk elements of tile is fetched from 32bit src1.
328 ///
329 /// \headerfile <immintrin.h>
330 ///
331 /// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
332 ///
333 /// \param src0
334 ///    The 1st source tile. Max size is 1024 Bytes.
335 /// \param src1
336 ///    The 2nd source r32. Size is 4 Bytes.
337 /// \returns
338 ///    The destination v32fp16 data. Size is 64 Bytes.
339 __DEFAULT_FN_ATTRS_AVX512
340 static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
341   return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
342 }
343 
344 /// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
345 /// elements to fp16 at low 16-bits of each dword.
346 /// The row and chunk elements of tile is fetched from 32bit src1.
347 ///
348 /// \headerfile <immintrin.h>
349 ///
350 /// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
351 ///
352 /// \param src0
353 ///    The 1st source tile. Max size is 1024 Bytes.
354 /// \param src1
355 ///    The 2nd source r32. Size is 4 Bytes.
356 /// \returns
357 ///    The destination v32fp16 data. Size is 64 Bytes.
358 __DEFAULT_FN_ATTRS_AVX512
359 static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
360   return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
361 }
362 
363 /// Move one row of a tile data to a v16f32 data.
364 /// The row of the tile is selected by a 32b GPR.
365 ///
366 /// \headerfile <immintrin.h>
367 ///
368 /// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
369 ///
370 /// \param src0
371 ///    The 1st source tile. Max size is 1024 Bytes.
372 /// \param src1
373 ///    The 2nd source r32. Size is 4 Bytes.
374 /// \returns
375 ///    The destination v16i32 data. Size is 64 Bytes.
376 __DEFAULT_FN_ATTRS_AVX512
377 static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
378   return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
379 }
380 
381 #endif // __x86_64__ && __SSE2__
382 #endif // __AMX_AVX512INTRIN_H
383