15ffd83dbSDimitry Andric /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
25ffd83dbSDimitry Andric *
35ffd83dbSDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric * See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric *
75ffd83dbSDimitry Andric *===------------------------------------------------------------------------===
85ffd83dbSDimitry Andric */
95ffd83dbSDimitry Andric
105ffd83dbSDimitry Andric #ifndef __IMMINTRIN_H
115ffd83dbSDimitry Andric #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
125ffd83dbSDimitry Andric #endif /* __IMMINTRIN_H */
135ffd83dbSDimitry Andric
145ffd83dbSDimitry Andric #ifndef __AMXINTRIN_H
155ffd83dbSDimitry Andric #define __AMXINTRIN_H
165ffd83dbSDimitry Andric #ifdef __x86_64__
175ffd83dbSDimitry Andric
18fe6060f1SDimitry Andric /* Define the default attributes for the functions in this file. */
19e8d8bef9SDimitry Andric #define __DEFAULT_FN_ATTRS_TILE \
205ffd83dbSDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
21fe6060f1SDimitry Andric #define __DEFAULT_FN_ATTRS_INT8 \
22fe6060f1SDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
23fe6060f1SDimitry Andric #define __DEFAULT_FN_ATTRS_BF16 \
24fe6060f1SDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
25*bdd1243dSDimitry Andric #define __DEFAULT_FN_ATTRS_FP16 \
26*bdd1243dSDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
275ffd83dbSDimitry Andric
285ffd83dbSDimitry Andric /// Load tile configuration from a 64-byte memory location specified by
295ffd83dbSDimitry Andric /// "mem_addr". The tile configuration includes the tile type palette, the
305ffd83dbSDimitry Andric /// number of bytes per row, and the number of rows. If the specified
315ffd83dbSDimitry Andric /// palette_id is zero, that signifies the init state for both the tile
325ffd83dbSDimitry Andric /// config and the tile data, and the tiles are zeroed. Any invalid
335ffd83dbSDimitry Andric /// configurations will result in #GP fault.
345ffd83dbSDimitry Andric ///
35fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
365ffd83dbSDimitry Andric ///
375ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
385ffd83dbSDimitry Andric ///
395ffd83dbSDimitry Andric /// \param __config
405ffd83dbSDimitry Andric /// A pointer to 512-bits configuration
41e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void * __config)42e8d8bef9SDimitry Andric _tile_loadconfig(const void *__config) {
435ffd83dbSDimitry Andric __builtin_ia32_tile_loadconfig(__config);
445ffd83dbSDimitry Andric }
455ffd83dbSDimitry Andric
465ffd83dbSDimitry Andric /// Stores the current tile configuration to a 64-byte memory location
475ffd83dbSDimitry Andric /// specified by "mem_addr". The tile configuration includes the tile type
485ffd83dbSDimitry Andric /// palette, the number of bytes per row, and the number of rows. If tiles
495ffd83dbSDimitry Andric /// are not configured, all zeroes will be stored to memory.
505ffd83dbSDimitry Andric ///
51fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
525ffd83dbSDimitry Andric ///
535ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
545ffd83dbSDimitry Andric ///
555ffd83dbSDimitry Andric /// \param __config
565ffd83dbSDimitry Andric /// A pointer to 512-bits configuration
57e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void * __config)58e8d8bef9SDimitry Andric _tile_storeconfig(void *__config) {
595ffd83dbSDimitry Andric __builtin_ia32_tile_storeconfig(__config);
605ffd83dbSDimitry Andric }
615ffd83dbSDimitry Andric
625ffd83dbSDimitry Andric /// Release the tile configuration to return to the init state, which
635ffd83dbSDimitry Andric /// releases all storage it currently holds.
645ffd83dbSDimitry Andric ///
65fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
665ffd83dbSDimitry Andric ///
675ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
_tile_release(void)68e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
695ffd83dbSDimitry Andric __builtin_ia32_tilerelease();
705ffd83dbSDimitry Andric }
715ffd83dbSDimitry Andric
725ffd83dbSDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
735ffd83dbSDimitry Andric /// destination tile "dst" using the tile configuration previously configured
745ffd83dbSDimitry Andric /// via "_tile_loadconfig".
755ffd83dbSDimitry Andric ///
76fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
775ffd83dbSDimitry Andric ///
785ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
795ffd83dbSDimitry Andric ///
805ffd83dbSDimitry Andric /// \param dst
815ffd83dbSDimitry Andric /// A destination tile. Max size is 1024 Bytes.
825ffd83dbSDimitry Andric /// \param base
835ffd83dbSDimitry Andric /// A pointer to base address.
845ffd83dbSDimitry Andric /// \param stride
855ffd83dbSDimitry Andric /// The stride between the rows' data to be loaded in memory.
865ffd83dbSDimitry Andric #define _tile_loadd(dst, base, stride) \
87e8d8bef9SDimitry Andric __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
88e8d8bef9SDimitry Andric (__SIZE_TYPE__)(stride))
895ffd83dbSDimitry Andric
905ffd83dbSDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
915ffd83dbSDimitry Andric /// destination tile "dst" using the tile configuration previously configured
925ffd83dbSDimitry Andric /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
935ffd83dbSDimitry Andric /// that the data will likely not be reused in the near future and the data
945ffd83dbSDimitry Andric /// caching can be optimized accordingly.
955ffd83dbSDimitry Andric ///
96fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
975ffd83dbSDimitry Andric ///
985ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
995ffd83dbSDimitry Andric ///
1005ffd83dbSDimitry Andric /// \param dst
1015ffd83dbSDimitry Andric /// A destination tile. Max size is 1024 Bytes.
1025ffd83dbSDimitry Andric /// \param base
1035ffd83dbSDimitry Andric /// A pointer to base address.
1045ffd83dbSDimitry Andric /// \param stride
1055ffd83dbSDimitry Andric /// The stride between the rows' data to be loaded in memory.
1065ffd83dbSDimitry Andric #define _tile_stream_loadd(dst, base, stride) \
107e8d8bef9SDimitry Andric __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
108e8d8bef9SDimitry Andric (__SIZE_TYPE__)(stride))
1095ffd83dbSDimitry Andric
1105ffd83dbSDimitry Andric /// Store the tile specified by "src" to memory specifieid by "base" address and
1115ffd83dbSDimitry Andric /// "stride" using the tile configuration previously configured via
1125ffd83dbSDimitry Andric /// "_tile_loadconfig".
1135ffd83dbSDimitry Andric ///
114fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1155ffd83dbSDimitry Andric ///
1165ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
1175ffd83dbSDimitry Andric ///
1185ffd83dbSDimitry Andric /// \param dst
1195ffd83dbSDimitry Andric /// A destination tile. Max size is 1024 Bytes.
1205ffd83dbSDimitry Andric /// \param base
1215ffd83dbSDimitry Andric /// A pointer to base address.
1225ffd83dbSDimitry Andric /// \param stride
1235ffd83dbSDimitry Andric /// The stride between the rows' data to be stored in memory.
1245ffd83dbSDimitry Andric #define _tile_stored(dst, base, stride) \
1255ffd83dbSDimitry Andric __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
1265ffd83dbSDimitry Andric
1275ffd83dbSDimitry Andric /// Zero the tile specified by "tdest".
1285ffd83dbSDimitry Andric ///
129fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1305ffd83dbSDimitry Andric ///
1315ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
1325ffd83dbSDimitry Andric ///
1335ffd83dbSDimitry Andric /// \param tile
1345ffd83dbSDimitry Andric /// The destination tile to be zero. Max size is 1024 Bytes.
1355ffd83dbSDimitry Andric #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
1365ffd83dbSDimitry Andric
1375ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1385ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
1395ffd83dbSDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
1405ffd83dbSDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
1415ffd83dbSDimitry Andric /// and store the 32-bit result back to tile "dst".
1425ffd83dbSDimitry Andric ///
143fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1445ffd83dbSDimitry Andric ///
1455ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
1465ffd83dbSDimitry Andric ///
1475ffd83dbSDimitry Andric /// \param dst
1485ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes.
1495ffd83dbSDimitry Andric /// \param src0
1505ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
1515ffd83dbSDimitry Andric /// \param src1
1525ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
153e8d8bef9SDimitry Andric #define _tile_dpbssd(dst, src0, src1) \
154e8d8bef9SDimitry Andric __builtin_ia32_tdpbssd((dst), (src0), (src1))
1555ffd83dbSDimitry Andric
1565ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1575ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
1585ffd83dbSDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
1595ffd83dbSDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
1605ffd83dbSDimitry Andric /// in "dst", and store the 32-bit result back to tile "dst".
1615ffd83dbSDimitry Andric ///
162fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1635ffd83dbSDimitry Andric ///
1645ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
1655ffd83dbSDimitry Andric ///
1665ffd83dbSDimitry Andric /// \param dst
1675ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes.
1685ffd83dbSDimitry Andric /// \param src0
1695ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
1705ffd83dbSDimitry Andric /// \param src1
1715ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
172e8d8bef9SDimitry Andric #define _tile_dpbsud(dst, src0, src1) \
173e8d8bef9SDimitry Andric __builtin_ia32_tdpbsud((dst), (src0), (src1))
1745ffd83dbSDimitry Andric
1755ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1765ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
1775ffd83dbSDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
1785ffd83dbSDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
1795ffd83dbSDimitry Andric /// and store the 32-bit result back to tile "dst".
1805ffd83dbSDimitry Andric ///
181fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1825ffd83dbSDimitry Andric ///
1835ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
1845ffd83dbSDimitry Andric ///
1855ffd83dbSDimitry Andric /// \param dst
1865ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes.
1875ffd83dbSDimitry Andric /// \param src0
1885ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
1895ffd83dbSDimitry Andric /// \param src1
1905ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
191e8d8bef9SDimitry Andric #define _tile_dpbusd(dst, src0, src1) \
192e8d8bef9SDimitry Andric __builtin_ia32_tdpbusd((dst), (src0), (src1))
1935ffd83dbSDimitry Andric
1945ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1955ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
1965ffd83dbSDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
1975ffd83dbSDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
1985ffd83dbSDimitry Andric /// "dst", and store the 32-bit result back to tile "dst".
1995ffd83dbSDimitry Andric ///
200fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
2015ffd83dbSDimitry Andric ///
2025ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
2035ffd83dbSDimitry Andric ///
2045ffd83dbSDimitry Andric /// \param dst
2055ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes.
2065ffd83dbSDimitry Andric /// \param src0
2075ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
2085ffd83dbSDimitry Andric /// \param src1
2095ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
210e8d8bef9SDimitry Andric #define _tile_dpbuud(dst, src0, src1) \
211e8d8bef9SDimitry Andric __builtin_ia32_tdpbuud((dst), (src0), (src1))
2125ffd83dbSDimitry Andric
2135ffd83dbSDimitry Andric /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
2145ffd83dbSDimitry Andric /// src1, accumulating the intermediate single-precision (32-bit) floating-point
2155ffd83dbSDimitry Andric /// elements with elements in "dst", and store the 32-bit result back to tile
2165ffd83dbSDimitry Andric /// "dst".
2175ffd83dbSDimitry Andric ///
218fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
2195ffd83dbSDimitry Andric ///
2205ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
2215ffd83dbSDimitry Andric ///
2225ffd83dbSDimitry Andric /// \param dst
2235ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes.
2245ffd83dbSDimitry Andric /// \param src0
2255ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
2265ffd83dbSDimitry Andric /// \param src1
2275ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
2285ffd83dbSDimitry Andric #define _tile_dpbf16ps(dst, src0, src1) \
2295ffd83dbSDimitry Andric __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
2305ffd83dbSDimitry Andric
231fe6060f1SDimitry Andric /// AMX tile register size can be configured, the maximum size is 16x64=1024
232fe6060f1SDimitry Andric /// bytes. Since there is no 2D type in llvm IR, we use vector type to
233fe6060f1SDimitry Andric /// represent 2D tile and the fixed size is maximum amx tile register size.
234e8d8bef9SDimitry Andric typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
235fe6060f1SDimitry Andric
236fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
237e8d8bef9SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m,unsigned short n,const void * base,__SIZE_TYPE__ stride)238e8d8bef9SDimitry Andric _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
239e8d8bef9SDimitry Andric __SIZE_TYPE__ stride) {
240e8d8bef9SDimitry Andric return __builtin_ia32_tileloadd64_internal(m, n, base,
241e8d8bef9SDimitry Andric (__SIZE_TYPE__)(stride));
242e8d8bef9SDimitry Andric }
243e8d8bef9SDimitry Andric
244fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
245fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loaddt1_internal(unsigned short m,unsigned short n,const void * base,__SIZE_TYPE__ stride)246fe6060f1SDimitry Andric _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
247fe6060f1SDimitry Andric __SIZE_TYPE__ stride) {
248fe6060f1SDimitry Andric return __builtin_ia32_tileloaddt164_internal(m, n, base,
249fe6060f1SDimitry Andric (__SIZE_TYPE__)(stride));
250fe6060f1SDimitry Andric }
251fe6060f1SDimitry Andric
252fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
253e8d8bef9SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)254e8d8bef9SDimitry Andric _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
255e8d8bef9SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) {
256e8d8bef9SDimitry Andric return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
257e8d8bef9SDimitry Andric }
258e8d8bef9SDimitry Andric
259fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
260fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)261fe6060f1SDimitry Andric _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
262fe6060f1SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) {
263fe6060f1SDimitry Andric return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
264fe6060f1SDimitry Andric }
265fe6060f1SDimitry Andric
266fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
267fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)268fe6060f1SDimitry Andric _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
269fe6060f1SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) {
270fe6060f1SDimitry Andric return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
271fe6060f1SDimitry Andric }
272fe6060f1SDimitry Andric
273fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
274fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)275fe6060f1SDimitry Andric _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
276fe6060f1SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) {
277fe6060f1SDimitry Andric return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
278fe6060f1SDimitry Andric }
279fe6060f1SDimitry Andric
280fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
281e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m,unsigned short n,void * base,__SIZE_TYPE__ stride,_tile1024i tile)282e8d8bef9SDimitry Andric _tile_stored_internal(unsigned short m, unsigned short n, void *base,
283e8d8bef9SDimitry Andric __SIZE_TYPE__ stride, _tile1024i tile) {
284e8d8bef9SDimitry Andric return __builtin_ia32_tilestored64_internal(m, n, base,
285e8d8bef9SDimitry Andric (__SIZE_TYPE__)(stride), tile);
286e8d8bef9SDimitry Andric }
287e8d8bef9SDimitry Andric
288fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
289fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)290fe6060f1SDimitry Andric _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
291fe6060f1SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) {
292fe6060f1SDimitry Andric return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
293fe6060f1SDimitry Andric }
294fe6060f1SDimitry Andric
295*bdd1243dSDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
296*bdd1243dSDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
_tile_dpfp16ps_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)297*bdd1243dSDimitry Andric _tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
298*bdd1243dSDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) {
299*bdd1243dSDimitry Andric return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
300*bdd1243dSDimitry Andric }
301*bdd1243dSDimitry Andric
302fe6060f1SDimitry Andric /// This struct pack the shape and tile data together for user. We suggest
303fe6060f1SDimitry Andric /// initializing the struct as early as possible, because compiler depends
304fe6060f1SDimitry Andric /// on the shape information to do configure. The constant value is preferred
305fe6060f1SDimitry Andric /// for optimization by compiler.
306e8d8bef9SDimitry Andric typedef struct __tile1024i_str {
307e8d8bef9SDimitry Andric const unsigned short row;
308e8d8bef9SDimitry Andric const unsigned short col;
309e8d8bef9SDimitry Andric _tile1024i tile;
310e8d8bef9SDimitry Andric } __tile1024i;
311e8d8bef9SDimitry Andric
312fe6060f1SDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
313fe6060f1SDimitry Andric /// destination tile "dst".
314fe6060f1SDimitry Andric ///
315fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
316fe6060f1SDimitry Andric ///
317fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
318fe6060f1SDimitry Andric ///
319fe6060f1SDimitry Andric /// \param dst
320fe6060f1SDimitry Andric /// A destination tile. Max size is 1024 Bytes.
321fe6060f1SDimitry Andric /// \param base
322fe6060f1SDimitry Andric /// A pointer to base address.
323fe6060f1SDimitry Andric /// \param stride
324fe6060f1SDimitry Andric /// The stride between the rows' data to be loaded in memory.
325e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE
__tile_loadd(__tile1024i * dst,const void * base,__SIZE_TYPE__ stride)326349cc55cSDimitry Andric static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
327e8d8bef9SDimitry Andric __SIZE_TYPE__ stride) {
328e8d8bef9SDimitry Andric dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
329e8d8bef9SDimitry Andric }
330e8d8bef9SDimitry Andric
331fe6060f1SDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
332fe6060f1SDimitry Andric /// destination tile "dst". This intrinsic provides a hint to the implementation
333fe6060f1SDimitry Andric /// that the data will likely not be reused in the near future and the data
334fe6060f1SDimitry Andric /// caching can be optimized accordingly.
335fe6060f1SDimitry Andric ///
336fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
337fe6060f1SDimitry Andric ///
338fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
339fe6060f1SDimitry Andric ///
340fe6060f1SDimitry Andric /// \param dst
341fe6060f1SDimitry Andric /// A destination tile. Max size is 1024 Bytes.
342fe6060f1SDimitry Andric /// \param base
343fe6060f1SDimitry Andric /// A pointer to base address.
344fe6060f1SDimitry Andric /// \param stride
345fe6060f1SDimitry Andric /// The stride between the rows' data to be loaded in memory.
346fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_TILE
__tile_stream_loadd(__tile1024i * dst,const void * base,__SIZE_TYPE__ stride)347349cc55cSDimitry Andric static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
348fe6060f1SDimitry Andric __SIZE_TYPE__ stride) {
349fe6060f1SDimitry Andric dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
350e8d8bef9SDimitry Andric }
351e8d8bef9SDimitry Andric
352fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
353fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
354fe6060f1SDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
355fe6060f1SDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
356fe6060f1SDimitry Andric /// and store the 32-bit result back to tile "dst".
357fe6060f1SDimitry Andric ///
358fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
359fe6060f1SDimitry Andric ///
360fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
361fe6060f1SDimitry Andric ///
362fe6060f1SDimitry Andric /// \param dst
363fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes.
364fe6060f1SDimitry Andric /// \param src0
365fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
366fe6060f1SDimitry Andric /// \param src1
367fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
368fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
__tile_dpbssd(__tile1024i * dst,__tile1024i src0,__tile1024i src1)369349cc55cSDimitry Andric static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
370fe6060f1SDimitry Andric __tile1024i src1) {
371fe6060f1SDimitry Andric dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
372fe6060f1SDimitry Andric src0.tile, src1.tile);
373fe6060f1SDimitry Andric }
374fe6060f1SDimitry Andric
375fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
376fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
377fe6060f1SDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
378fe6060f1SDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
379fe6060f1SDimitry Andric /// in "dst", and store the 32-bit result back to tile "dst".
380fe6060f1SDimitry Andric ///
381fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
382fe6060f1SDimitry Andric ///
383fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
384fe6060f1SDimitry Andric ///
385fe6060f1SDimitry Andric /// \param dst
386fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes.
387fe6060f1SDimitry Andric /// \param src0
388fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
389fe6060f1SDimitry Andric /// \param src1
390fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
391fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
__tile_dpbsud(__tile1024i * dst,__tile1024i src0,__tile1024i src1)392349cc55cSDimitry Andric static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
393fe6060f1SDimitry Andric __tile1024i src1) {
394fe6060f1SDimitry Andric dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
395fe6060f1SDimitry Andric src0.tile, src1.tile);
396fe6060f1SDimitry Andric }
397fe6060f1SDimitry Andric
398fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
399fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
400fe6060f1SDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
401fe6060f1SDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
402fe6060f1SDimitry Andric /// and store the 32-bit result back to tile "dst".
403fe6060f1SDimitry Andric ///
404fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
405fe6060f1SDimitry Andric ///
406fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
407fe6060f1SDimitry Andric ///
408fe6060f1SDimitry Andric /// \param dst
409fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes.
410fe6060f1SDimitry Andric /// \param src0
411fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
412fe6060f1SDimitry Andric /// \param src1
413fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
414fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
__tile_dpbusd(__tile1024i * dst,__tile1024i src0,__tile1024i src1)415349cc55cSDimitry Andric static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
416fe6060f1SDimitry Andric __tile1024i src1) {
417fe6060f1SDimitry Andric dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
418fe6060f1SDimitry Andric src0.tile, src1.tile);
419fe6060f1SDimitry Andric }
420fe6060f1SDimitry Andric
421fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
422fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
423fe6060f1SDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
424fe6060f1SDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
425fe6060f1SDimitry Andric /// "dst", and store the 32-bit result back to tile "dst".
426fe6060f1SDimitry Andric ///
427fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
428fe6060f1SDimitry Andric ///
429fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
430fe6060f1SDimitry Andric ///
431fe6060f1SDimitry Andric /// \param dst
432fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes.
433fe6060f1SDimitry Andric /// \param src0
434fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
435fe6060f1SDimitry Andric /// \param src1
436fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
437fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
__tile_dpbuud(__tile1024i * dst,__tile1024i src0,__tile1024i src1)438349cc55cSDimitry Andric static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
439fe6060f1SDimitry Andric __tile1024i src1) {
440fe6060f1SDimitry Andric dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
441fe6060f1SDimitry Andric src0.tile, src1.tile);
442fe6060f1SDimitry Andric }
443fe6060f1SDimitry Andric
444fe6060f1SDimitry Andric /// Store the tile specified by "src" to memory specifieid by "base" address and
445fe6060f1SDimitry Andric /// "stride".
446fe6060f1SDimitry Andric ///
447fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
448fe6060f1SDimitry Andric ///
449fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
450fe6060f1SDimitry Andric ///
451fe6060f1SDimitry Andric /// \param base
452fe6060f1SDimitry Andric /// A pointer to base address.
453fe6060f1SDimitry Andric /// \param stride
454fe6060f1SDimitry Andric /// The stride between the rows' data to be stored in memory.
455e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE
__tile_stored(void * base,__SIZE_TYPE__ stride,__tile1024i src)456349cc55cSDimitry Andric static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
457349cc55cSDimitry Andric __tile1024i src) {
458e8d8bef9SDimitry Andric _tile_stored_internal(src.row, src.col, base, stride, src.tile);
459e8d8bef9SDimitry Andric }
460e8d8bef9SDimitry Andric
461fe6060f1SDimitry Andric /// Zero the tile specified by "dst".
462fe6060f1SDimitry Andric ///
463fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
464fe6060f1SDimitry Andric ///
465fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
466fe6060f1SDimitry Andric ///
467fe6060f1SDimitry Andric /// \param dst
468fe6060f1SDimitry Andric /// The destination tile to be zero. Max size is 1024 Bytes.
469e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE
__tile_zero(__tile1024i * dst)470349cc55cSDimitry Andric static __inline__ void __tile_zero(__tile1024i *dst) {
471e8d8bef9SDimitry Andric dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
472e8d8bef9SDimitry Andric }
4735ffd83dbSDimitry Andric
474fe6060f1SDimitry Andric /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
475fe6060f1SDimitry Andric /// src1, accumulating the intermediate single-precision (32-bit) floating-point
476fe6060f1SDimitry Andric /// elements with elements in "dst", and store the 32-bit result back to tile
477fe6060f1SDimitry Andric /// "dst".
478fe6060f1SDimitry Andric ///
479fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
480fe6060f1SDimitry Andric ///
481fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
482fe6060f1SDimitry Andric ///
483fe6060f1SDimitry Andric /// \param dst
484fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes.
485fe6060f1SDimitry Andric /// \param src0
486fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
487fe6060f1SDimitry Andric /// \param src1
488fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
489fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_BF16
__tile_dpbf16ps(__tile1024i * dst,__tile1024i src0,__tile1024i src1)490349cc55cSDimitry Andric static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
491fe6060f1SDimitry Andric __tile1024i src1) {
492fe6060f1SDimitry Andric dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
493fe6060f1SDimitry Andric src0.tile, src1.tile);
494fe6060f1SDimitry Andric }
495fe6060f1SDimitry Andric
496*bdd1243dSDimitry Andric /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
497*bdd1243dSDimitry Andric /// src1, accumulating the intermediate single-precision (32-bit) floating-point
498*bdd1243dSDimitry Andric /// elements with elements in "dst", and store the 32-bit result back to tile
499*bdd1243dSDimitry Andric /// "dst".
500*bdd1243dSDimitry Andric ///
501*bdd1243dSDimitry Andric /// \headerfile <immintrin.h>
502*bdd1243dSDimitry Andric ///
503*bdd1243dSDimitry Andric /// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
504*bdd1243dSDimitry Andric ///
505*bdd1243dSDimitry Andric /// \param dst
506*bdd1243dSDimitry Andric /// The destination tile. Max size is 1024 Bytes.
507*bdd1243dSDimitry Andric /// \param src0
508*bdd1243dSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes.
509*bdd1243dSDimitry Andric /// \param src1
510*bdd1243dSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes.
511*bdd1243dSDimitry Andric __DEFAULT_FN_ATTRS_FP16
__tile_dpfp16ps(__tile1024i * dst,__tile1024i src0,__tile1024i src1)512*bdd1243dSDimitry Andric static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
513*bdd1243dSDimitry Andric __tile1024i src1) {
514*bdd1243dSDimitry Andric dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
515*bdd1243dSDimitry Andric src0.tile, src1.tile);
516*bdd1243dSDimitry Andric }
517*bdd1243dSDimitry Andric
518fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_TILE
519fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_INT8
520fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_BF16
521*bdd1243dSDimitry Andric #undef __DEFAULT_FN_ATTRS_FP16
522fe6060f1SDimitry Andric
5235ffd83dbSDimitry Andric #endif /* __x86_64__ */
5245ffd83dbSDimitry Andric #endif /* __AMXINTRIN_H */
525