Lines Matching +full:max +full:- +full:size
1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===------------------------------------------------------------------------===
20 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
22 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
24 __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
26 /// Load tile configuration from a 64-byte memory location specified by
38 /// A pointer to the 512-bit configuration
44 /// Stores the current tile configuration to a 64-byte memory location
54 /// A pointer to the 512-bit configuration
79 /// A destination tile. Max size is 1024 Bytes.
99 /// A destination tile. Max size is 1024 Bytes.
117 /// A destination tile. Max size is 1024 Bytes.
132 /// The destination tile to be zeroed. Max size is 1024 Bytes.
135 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
136 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
137 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
138 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
139 /// and store the 32-bit result back to tile "dst".
146 /// The destination tile. Max size is 1024 Bytes.
148 /// The 1st source tile. Max size is 1024 Bytes.
150 /// The 2nd source tile. Max size is 1024 Bytes.
154 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
155 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
156 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
157 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
158 /// in "dst", and store the 32-bit result back to tile "dst".
165 /// The destination tile. Max size is 1024 Bytes.
167 /// The 1st source tile. Max size is 1024 Bytes.
169 /// The 2nd source tile. Max size is 1024 Bytes.
173 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
174 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
175 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
176 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
177 /// and store the 32-bit result back to tile "dst".
184 /// The destination tile. Max size is 1024 Bytes.
186 /// The 1st source tile. Max size is 1024 Bytes.
188 /// The 2nd source tile. Max size is 1024 Bytes.
192 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
193 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
194 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
195 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
196 /// "dst", and store the 32-bit result back to tile "dst".
203 /// The destination tile. Max size is 1024 Bytes.
205 /// The 1st source tile. Max size is 1024 Bytes.
207 /// The 2nd source tile. Max size is 1024 Bytes.
211 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
212 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
213 /// elements with elements in "dst", and store the 32-bit result back to tile
221 /// The destination tile. Max size is 1024 Bytes.
223 /// The 1st source tile. Max size is 1024 Bytes.
225 /// The 2nd source tile. Max size is 1024 Bytes.
229 /// AMX tile register size can be configured; the maximum size is 16x64=1024
231 /// represent a 2D tile, and the fixed size is the maximum AMX tile register size.
313 /// A destination tile. Max size is 1024 Bytes.
321 dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
334 /// A destination tile. Max size is 1024 Bytes.
342 dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
345 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
346 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
347 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
348 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
349 /// and store the 32-bit result back to tile "dst".
356 /// The destination tile. Max size is 1024 Bytes.
358 /// The 1st source tile. Max size is 1024 Bytes.
360 /// The 2nd source tile. Max size is 1024 Bytes.
364 dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
368 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
369 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
370 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
371 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
372 /// in "dst", and store the 32-bit result back to tile "dst".
379 /// The destination tile. Max size is 1024 Bytes.
381 /// The 1st source tile. Max size is 1024 Bytes.
383 /// The 2nd source tile. Max size is 1024 Bytes.
387 dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
391 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
392 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
393 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
394 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
395 /// and store the 32-bit result back to tile "dst".
402 /// The destination tile. Max size is 1024 Bytes.
404 /// The 1st source tile. Max size is 1024 Bytes.
406 /// The 2nd source tile. Max size is 1024 Bytes.
410 dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
414 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
415 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
416 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
417 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
418 /// "dst", and store the 32-bit result back to tile "dst".
425 /// The destination tile. Max size is 1024 Bytes.
427 /// The 1st source tile. Max size is 1024 Bytes.
429 /// The 2nd source tile. Max size is 1024 Bytes.
433 dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
461 /// The destination tile to be zeroed. Max size is 1024 Bytes.
464 dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
467 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
468 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
469 /// elements with elements in "dst", and store the 32-bit result back to tile
477 /// The destination tile. Max size is 1024 Bytes.
479 /// The 1st source tile. Max size is 1024 Bytes.
481 /// The 2nd source tile. Max size is 1024 Bytes.
485 dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,