//===-- AMDGPU.td - AMDGPU dialect definitions -------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef AMDGPU
#define AMDGPU

include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/EnumAttr.td"
include "mlir/IR/OpBase.td"

def AMDGPU_Dialect : Dialect {
  let name = "amdgpu";
  let cppNamespace = "::mlir::amdgpu";
  let description = [{
    The `AMDGPU` dialect provides wrappers around AMD-specific functionality
    and LLVM intrinsics. These wrappers should be used in conjunction with
    more generic dialects, such as `gpu` and `vector`, when generating LLVM IR
    that will eventually be executed on AMD hardware.
  }];

  let dependentDialects = [
    "ROCDL::ROCDLDialect",
    "arith::ArithDialect",
    "gpu::GPUDialect"
  ];
  let useDefaultAttributePrinterParser = 1;
}

//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//

class AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
  Op<AMDGPU_Dialect, mnemonic, traits> {}

def AMDGPU_ExtPackedFp8Op :
    AMDGPU_Op<"ext_packed_fp8", [Pure]>,
    Arguments<(ins AnyTypeOf<[F8E5M2FNUZ, F8E4M3FNUZ,
        VectorOfLengthAndType<[1, 2, 3, 4], [F8E5M2FNUZ, F8E4M3FNUZ]>]>:$source,
      ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$index)>,
    Results<(outs F32:$res)> {
  let summary = "Extend one of a vector of packed fp8 values to a float";
  let description = [{
    Extend the value `source[index]` to a 32-bit float and return it.

    This rather unusual signature arises from the fact that AMD GPUs cannot
    easily work with sub-32-bit quantities, so the compiler intrinsics for
    extending 8-bit floats (which are, currently, the only way to work with
    this operation) take packed vectors of 4 such floats.

    If the passed-in vector has fewer than four elements, or the input is
    scalar, the remaining values in the `<4 x i8>` will be filled with
    undefined values as needed.
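
    An illustrative use, assuming `%v` holds a packed fp8 vector (the fp8
    types accepted depend on the target chipset):

    ```mlir
    %f = amdgpu.ext_packed_fp8 %v[0] : vector<4xf8E4M3FNUZ> to f32
    ```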
  }];
  let assemblyFormat = [{
    attr-dict $source `[` $index `]` `:` type($source) `to` type($res)
  }];
}

def AMDGPU_PackedTrunc2xFp8Op :
    AMDGPU_Op<"packed_trunc_2xfp8", [Pure, AttrSizedOperandSegments]>,
    Arguments<(ins F32:$sourceA,
      Optional<F32>:$sourceB,
      ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<1>]>:$wordIndex,
      Optional<FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>>:$existing)>,
    Results<(outs FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>:$res)> {
  let summary = "Round two floats into a packed vector of 8-bit floats";
  let description = [{
    Round the inputs `sourceA` and `sourceB` (which is undefined if not
    specified) into the low or high word (the bottom two or top two elements)
    of the returned vector, keeping the other two elements of `existing`
    unchanged if present (or undefined if it was not passed in).

    The reason for this odd signature is that AMD GPUs cannot easily work with
    sub-registers, and so the conversion intrinsics (which are currently the
    only way to work with 8-bit float types) take packed vectors of 4 8-bit
    values.
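
    An illustrative use, rounding two f32 values (`%a` and `%b` here) into
    the low word of a fresh vector:

    ```mlir
    %packed = amdgpu.packed_trunc_2xfp8 %a, %b into undef[word 0]
      : f32 to vector<4xf8E4M3FNUZ>
    ```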
  }];
  let assemblyFormat = [{
    attr-dict $sourceA `,` ($sourceB^):(`undef`)?
    `into` ($existing^):(`undef`)? `[` `word` $wordIndex `]`
    `:` type($sourceA) `to` type($res) (`into` type($existing)^)?
  }];
  let hasVerifier = 1;
}

def AMDGPU_PackedStochRoundFp8Op :
    AMDGPU_Op<"packed_stoch_round_fp8", [Pure]>,
    Arguments<(ins F32:$source,
      I32:$stochiasticParam,
      ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$storeIndex,
      Optional<FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>>:$existing)>,
    Results<(outs FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>:$res)> {
  let summary = "Round a float stochastically into a packed vector of 8-bit floats";
  let description = [{
    Round the input `source`, adding in `stochiasticParam`, and place it into
    the `storeIndex`th element of `res`.

    If `existing` is passed in, elements of `res` other than the one at
    `storeIndex` are copied from `existing`.

    The reason for this odd signature is that AMD GPUs cannot easily work with
    sub-registers, and so the conversion intrinsics (which are currently the
    only way to work with 8-bit float types) take packed vectors of 4 8-bit
    values.
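
    An illustrative use, where `%seed` supplies the stochastic-rounding
    parameter:

    ```mlir
    %packed = amdgpu.packed_stoch_round_fp8 %src + %seed into undef[0]
      : f32 to vector<4xf8E5M2FNUZ>
    ```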
  }];
  let assemblyFormat = [{
    attr-dict $source `+` $stochiasticParam
    `into` ($existing^):(`undef`)? `[` $storeIndex `]`
    `:` type($source) `to` type($res) (`into` type($existing)^)?
  }];
  let hasVerifier = 1;
}

/// Raw buffer load
def AMDGPU_RawBufferLoadOp :
    AMDGPU_Op<"raw_buffer_load", [AllElementTypesMatch<["value", "memref"]>,
      AttrSizedOperandSegments]>,
    Arguments<(ins Arg<AnyMemRef, "buffer to load from", [MemRead]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)>,
    Results<(outs AnyType:$value)> {

  let summary = "Raw Buffer load, exposing GCN features";
  let description = [{
    The `amdgpu.raw_buffer_load` op is a wrapper around the buffer load
    intrinsics available on AMD GPUs, including extensions in newer GPUs.

    The index into the buffer is computed as for `memref.load`, with the
    addition of `indexOffset` and `sgprOffset` (which **may or may not** be
    considered in bounds checks); it also includes any non-zero offset
    present on the memref type.

    All indices and offsets are in units of the memref's data type and are
    converted to bytes during lowering.

    When a load is out of bounds, the instruction returns zero.
    Partially-out-of-bounds loads have chipset-dependent behavior: whether
    reading 2 elements starting at index 7 of a `memref<8xf32>` returns the
    last element in the first vector component depends on the architecture.

    The memref struct is converted into a buffer resource (a V#) and the
    arguments are translated to intrinsic arguments as follows:
    - The base address of the buffer is the base address of the memref
    - The stride is 0 to enable raw mode
    - The number of records is the size of the memref, in bytes.
      In the case of dynamically-shaped memrefs, this is computed at runtime
      as `max_d(size(d) * stride(d)) * sizeof(elementType(memref))`
    - The offset enable bit is 1, the index enable bit is 0.
    - The thread ID addition bit is off
    - If `boundsCheck` is false and the target chipset is RDNA, OOB_SELECT is
      set to 2 to disable bounds checks, otherwise it is 3
    - The cache coherency bits are off
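
    An illustrative use, loading one f32 with a wave-uniform offset assumed
    to be in `%sgprOff`:

    ```mlir
    %val = amdgpu.raw_buffer_load %buf[%idx] sgprOffset %sgprOff
      : memref<128xf32>, i32 -> f32
    ```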
  }];
  let assemblyFormat = [{
    attr-dict $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($memref) (`,` type($indices)^)? `->` type($value)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

/// Raw buffer store
def AMDGPU_RawBufferStoreOp :
    AMDGPU_Op<"raw_buffer_store", [AllElementTypesMatch<["value", "memref"]>,
      AttrSizedOperandSegments]>,
    Arguments<(ins AnyType:$value,
                   Arg<AnyMemRef, "buffer to store to", [MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Store, exposing GCN features";
  let description = [{
    The `amdgpu.raw_buffer_store` op is a wrapper around the buffer store
    intrinsics available on AMD GPUs, including extensions in newer GPUs.

    The store index is computed as in `memref.store` with the addition of
    `indexOffset` (which is included for uniformity with atomics and may be
    useful when writing vectorized code) and `sgprOffset` (which is added
    after bounds checks and implicitly includes the offset of the memref type
    if non-zero). All index components are in terms of the elements of the
    memref, not bytes, and are scaled up appropriately.

    Out of bounds stores are ignored in hardware.
    Whether a vector write that includes some in-bounds and some out-of-bounds
    components is partially completed is chipset-dependent.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
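
    An illustrative use, storing one f32 element:

    ```mlir
    amdgpu.raw_buffer_store %val -> %buf[%idx] : f32 -> memref<128xf32>, i32
    ```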
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) (`,` type($indices)^)?
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

// Raw buffer atomic compare-and-swap
def AMDGPU_RawBufferAtomicCmpswapOp :
    AMDGPU_Op<"raw_buffer_atomic_cmpswap", [
      AttrSizedOperandSegments,
      AllTypesMatch<["src", "cmp", "value"]>,
      AllElementTypesMatch<["value", "memref"]>]>,
    Arguments<(ins AnyType:$src,
                   AnyType:$cmp,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)>,
    Results<(outs AnyType:$value)> {

  let summary = "Raw Buffer Atomic compare-and-swap";
  let description = [{
    The `amdgpu.raw_buffer_atomic_cmpswap` op is a wrapper around the
    buffer-based atomic compare-and-swap available on AMD GPUs.

    The index into the buffer is computed as for `memref.store` with the
    addition of `indexOffset` (which is used to aid in emitting vectorized
    code) and, if present, `sgprOffset` (which is added after bounds checks
    and includes any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size,
    not the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
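
    An illustrative use: `%swap` is written to the buffer element if it
    equals `%cmp`, and the prior value is returned either way:

    ```mlir
    %old = amdgpu.raw_buffer_atomic_cmpswap %swap, %cmp -> %buf[%idx]
      : i32 -> memref<128xi32>, i32
    ```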
  }];
  let assemblyFormat = [{
    attr-dict $src `,` $cmp `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

// Raw buffer atomic floating point add
def AMDGPU_RawBufferAtomicFaddOp :
    AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
      AttrSizedOperandSegments]>,
    Arguments<(ins AnyTypeOf<[F32, VectorOfLengthAndType<[2], [F16, BF16]>]>:$value,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Floating-point Atomic Add (MI-* only)";
  let description = [{
    The `amdgpu.raw_buffer_atomic_fadd` op is a wrapper around the
    buffer-based atomic floating point addition available on the MI-* series
    of AMD GPUs.

    The index into the buffer is computed as for `memref.store` with the
    addition of `indexOffset` (which is used to aid in emitting vectorized
    code) and, if present, `sgprOffset` (which is added after bounds checks
    and includes any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size,
    not the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
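
    An illustrative use, atomically adding an f32 to a buffer element:

    ```mlir
    amdgpu.raw_buffer_atomic_fadd %val -> %buf[%idx]
      : f32 -> memref<128xf32>, i32
    ```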
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

// Raw buffer atomic floating point max
def AMDGPU_RawBufferAtomicFmaxOp :
    AMDGPU_Op<"raw_buffer_atomic_fmax", [AllElementTypesMatch<["value", "memref"]>,
      AttrSizedOperandSegments]>,
    Arguments<(ins AnyTypeOf<[F32, F64]>:$value,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Floating-point Atomic Max (non-GFX9)";
  let description = [{
    The `amdgpu.raw_buffer_atomic_fmax` op is a wrapper around the
    buffer-based atomic floating point max available on AMD GPUs (except GFX9).

    The index into the buffer is computed as for `memref.store` with the
    addition of `indexOffset` (which is used to aid in emitting vectorized
    code) and, if present, `sgprOffset` (which is added after bounds checks
    and includes any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size,
    not the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
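
    An illustrative use:

    ```mlir
    amdgpu.raw_buffer_atomic_fmax %val -> %buf[%idx]
      : f32 -> memref<128xf32>, i32
    ```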
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

// Raw buffer atomic signed integer max
def AMDGPU_RawBufferAtomicSmaxOp :
    AMDGPU_Op<"raw_buffer_atomic_smax", [
      AttrSizedOperandSegments]>,
    Arguments<(ins I32:$value,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Signed Integer Atomic Max";
  let description = [{
    The `amdgpu.raw_buffer_atomic_smax` op is a wrapper around the
    buffer-based atomic signed integer max available on AMD GPUs.

    The index into the buffer is computed as for `memref.store` with the
    addition of `indexOffset` (which is used to aid in emitting vectorized
    code) and, if present, `sgprOffset` (which is added after bounds checks
    and includes any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size,
    not the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
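
    An illustrative use:

    ```mlir
    amdgpu.raw_buffer_atomic_smax %val -> %buf[%idx]
      : i32 -> memref<128xi32>, i32
    ```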
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

// Raw buffer atomic unsigned integer min
def AMDGPU_RawBufferAtomicUminOp :
    AMDGPU_Op<"raw_buffer_atomic_umin", [
      AttrSizedOperandSegments]>,
    Arguments<(ins I32:$value,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Unsigned Integer Atomic Min";
  let description = [{
    The `amdgpu.raw_buffer_atomic_umin` op is a wrapper around the
    buffer-based atomic unsigned integer min available on AMD GPUs.

    The index into the buffer is computed as for `memref.store` with the
    addition of `indexOffset` (which is used to aid in emitting vectorized
    code) and, if present, `sgprOffset` (which is added after bounds checks
    and includes any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size,
    not the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
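
    An illustrative use:

    ```mlir
    amdgpu.raw_buffer_atomic_umin %val -> %buf[%idx]
      : i32 -> memref<128xi32>, i32
    ```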
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
    "The possible permutations for a DPP operation",
    [
      I32EnumAttrCase<"quad_perm",  0>,
      I32EnumAttrCase<"row_shl",    1>,
      I32EnumAttrCase<"row_shr",    2>,
      I32EnumAttrCase<"row_ror",    3>,
      I32EnumAttrCase<"wave_shl",   4>,
      I32EnumAttrCase<"wave_shr",   5>,
      I32EnumAttrCase<"wave_ror",   6>,
      I32EnumAttrCase<"wave_rol",   7>,
      I32EnumAttrCase<"row_mirror", 8>,
      I32EnumAttrCase<"row_half_mirror", 9>,
      I32EnumAttrCase<"row_bcast_15", 10>,
      I32EnumAttrCase<"row_bcast_31", 11>
    ]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::amdgpu";
}

def AMDGPU_DPPPermAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_DPPPerm,
  "dpp_perm">;

def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result", "old", "src"]>]>,
  Arguments<(ins AnyType:$old,
                  AnyType:$src,
                  AMDGPU_DPPPermAttr:$kind,
                  OptionalAttr<AnyAttrOf<[I32Attr, ArrayAttr, UnitAttr]>>:$permArgument,
                  DefaultValuedAttr<I32Attr, "0xf">:$row_mask,
                  DefaultValuedAttr<I32Attr, "0xf">:$bank_mask,
                  DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl)> {
  let summary = "AMDGPU DPP operation";
  let description = [{
    This operation represents DPP functionality in a GPU program.
    DPP provides the following operations:
    - Full crossbar in a group of four (`quad_perm`)
    - Wavefront shift left by one lane (`wave_shl`)
    - Wavefront shift right by one lane (`wave_shr`)
    - Wavefront rotate right by one lane (`wave_ror`)
    - Wavefront rotate left by one lane (`wave_rol`)
    - Row shift left by 1–15 lanes (`row_shl`)
    - Row shift right by 1–15 lanes (`row_shr`)
    - Row rotate right by 1–15 lanes (`row_ror`)
    - Reverse within a row (`row_mirror`)
    - Reverse within a half-row (`row_half_mirror`)
    - Broadcast the 15th lane of each row to the next row (`row_bcast_15`)
    - Broadcast lane 31 to rows 2 and 3 (`row_bcast_31`)
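
    An illustrative use (a sketch: shift the values in each row by one lane;
    with the default masks and `bound_ctrl = false`, lanes that receive no
    source value retain `%old`):

    ```mlir
    %result = amdgpu.dpp %old %src row_shl(1 : i32) : f32
    ```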
  }];
  let results = (outs AnyType:$result);
  let assemblyFormat = [{
    $old $src $kind (`(` $permArgument^ `)`)? attr-dict `:` type($result)
  }];
  let hasVerifier = 1;
}

def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
  let summary = "Barrier that includes a wait for LDS memory operations.";
  let description = [{
    `amdgpu.lds_barrier` is both a barrier (all workitems in a workgroup must
    reach the barrier before any of them may proceed past it) and a wait for
    all operations that affect the Local Data Store (LDS) issued from that
    workgroup to complete before the workgroup may continue. Since the LDS is
    per-workgroup memory, this barrier may be used, for example, to ensure all
    workitems have written data to LDS before any workitem attempts to read
    from it.

    Note that `lds_barrier` does **not** force reads to or from global memory
    to complete before execution continues. Therefore, it should be used when
    operations on global memory can be issued far in advance of when their
    results are used (for example, by writing them to LDS).

    WARNING: On architectures that do not support the BackOffBarrier feature
    (those on which this barrier is implemented by emitting inline assembly),
    use of this operation will impede the usability of memory watches
    (including breakpoints set on variables) when debugging.
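
    An illustrative use (a sketch assuming `%lds` is a workgroup-local
    memref): all LDS writes complete before any workitem reads.

    ```mlir
    memref.store %val, %lds[%i] : memref<128xf32, #gpu.address_space<workgroup>>
    amdgpu.lds_barrier
    %v = memref.load %lds[%j] : memref<128xf32, #gpu.address_space<workgroup>>
    ```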
  }];
  let assemblyFormat = "attr-dict";
}

def AMDGPU_SchedBarrierOpOpt : I32BitEnumAttr<"sched_barrier_opt_enum",
    "The possible options for scheduling barriers",
    [
      I32BitEnumAttrCaseNone<"none">,
      I32BitEnumAttrCaseBit<"non_mem_non_sideffect", 0>,
      I32BitEnumAttrCaseBit<"valu", 1>,
      I32BitEnumAttrCaseBit<"salu", 2>,
      I32BitEnumAttrCaseBit<"mfma_wmma",  3>,
      I32BitEnumAttrCaseBit<"all_vmem",  4>,
      I32BitEnumAttrCaseBit<"vmem_read",  5>,
      I32BitEnumAttrCaseBit<"vmem_write", 6>,
      I32BitEnumAttrCaseBit<"all_ds", 7>,
      I32BitEnumAttrCaseBit<"ds_read", 8>,
      I32BitEnumAttrCaseBit<"ds_write", 9>,
      I32BitEnumAttrCaseBit<"transcendental", 10>
    ]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::amdgpu";
}

def AMDGPU_SchedBarrierOpOptAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_SchedBarrierOpOpt,
  "sched_barrier_opt"> {
  let assemblyFormat = "`<` $value `>`";
}

def AMDGPU_SchedBarrierOp :
  AMDGPU_Op<"sched_barrier">,
  Arguments<(ins AMDGPU_SchedBarrierOpOptAttr:$opts)>
  {
  let summary = "Barrier that restricts the backend scheduler's movement of instructions";
  let description = [{
    `amdgpu.sched_barrier` serves as a barrier that can be configured to
    restrict the movement of instructions through it, as specified by its
    `opts` attribute.
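
    An illustrative use: the first form blocks all movement across the
    barrier, while the second allows VALU instructions and VMEM accesses to
    be scheduled across it.

    ```mlir
    amdgpu.sched_barrier allow = <none>
    amdgpu.sched_barrier allow = <valu|all_vmem>
    ```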
  }];
  let assemblyFormat = [{
    `allow` `=` $opts attr-dict
  }];
}

def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
    "The possible permutations of the lanes storing B available in an MFMA",
    [
      I32EnumAttrCase<"none",            0>,
      I32EnumAttrCase<"bcast_first_32",  1>,
      I32EnumAttrCase<"bcast_second_32", 2>,
      I32EnumAttrCase<"rotate_16_right", 3>,
      I32EnumAttrCase<"bcast_first_16",  4>,
      I32EnumAttrCase<"bcast_second_16", 5>,
      I32EnumAttrCase<"bcast_third_16",  6>,
      I32EnumAttrCase<"bcast_fourth_16", 7>
    ]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::amdgpu";
}

def AMDGPU_MFMAPermBAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_MFMAPermB,
  "mfma_perm_b">;

// mfma
def MFMAInTypes : AnyTypeOf<[F32, F64, I32, I64,
                             VectorOfLengthAndType<[2], [F32]>,
                             VectorOfLengthAndType<[4], [F16]>,
                             VectorOfLengthAndType<[2, 4], [BF16]>,
                             VectorOfLengthAndType<[4, 8], [I8]>,
                             VectorOfLengthAndType<[8], [F8E5M2FNUZ, F8E4M3FNUZ]>]>;
def MFMAOutTypes : AnyTypeOf<[F64,
                              VectorOfLengthAndType<[4, 16, 32], [F32]>,
                              VectorOfLengthAndType<[4, 16, 32], [I32]>,
                              VectorOfLengthAndType<[4], [F64]>]>;
// wmma
def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[8, 16], [F16, BF16, I8, SI8, UI8, F8E4M3FN, F8E5M2]>]>;
def WMMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8], [F32, I32]>,
                              VectorOfLengthAndType<[8, 16], [F16, BF16]>]>;

def AMDGPU_MFMAOp :
    AMDGPU_Op<"mfma", [AllTypesMatch<["destC", "destD"]>,
                        Pure]>,
    Arguments<(ins
                   I32Attr:$m,
                   I32Attr:$n,
                   I32Attr:$k,
                   I32Attr:$blocks,
                   MFMAInTypes:$sourceA,
                   MFMAInTypes:$sourceB,
                   MFMAOutTypes:$destC,
                   DefaultValuedAttr<I32Attr, "0">:$cbsz,
                   DefaultValuedAttr<I32Attr, "0">:$abid,
                   DefaultValuedAttr<AMDGPU_MFMAPermBAttr,
                    "::mlir::amdgpu::MFMAPermB::none">:$blgp,
                   UnitAttr:$reducePrecision,
                   UnitAttr:$negateA,
                   UnitAttr:$negateB,
                   UnitAttr:$negateC)>,
    Results<(outs MFMAOutTypes: $destD)> {
  let summary = "MLIR wrapper for CDNA mfma instructions";
  let description = [{
    The `amdgpu.mfma` op is an MLIR wrapper around intrinsics
    for various `mfma` instructions in the CDNA architecture, which perform
    multiple outer products in order to allow fast matrix multiplication.

    The wrapper will select an appropriate `mfma` instruction, if one is
    available, based on the provided `m`, `k`, `n`, and `blocks` attributes,
    along with the types of the source and destination arguments.

    For information on the layouts of the input and output matrices (which are
    stored in `sourceA`, `sourceB`, `destC`, and `destD`), see the CDNA ISA
    documentation.

    The `cbsz`, `abid`, and `blgp` parameters control how the lanes of the
    wave are permuted when matrix data is being loaded: `blgp` can be any
    number of fixed permutations, `cbsz` specifies the log_2 of the number of
    chunks the lanes holding `sourceA` are split into, and `abid` selects one
    of those chunks.

    Note, this wrapper allows specifying `vector<4Kxi8>` arguments to MFMA
    intrinsics that take an integer type that is `4K` bytes wide. For example,
    one can provide a `vector<4xi8>` as an argument to an MFMA instruction
    that logically takes 4 i8s but whose intrinsics are specified to take an
    i32. In these cases, the bytes in the vector will be concatenated in
    little-endian order (that is, v[0] will go to arg[7:0], v[1] to
    arg[15:8], and so on).

    The `negateA`, `negateB`, and `negateC` flags are only supported for
    double-precision operations on gfx940+.
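
    An illustrative use (a sketch selecting the 32x32x8 f16 MFMA, whose
    per-lane operands are `vector<4xf16>` inputs accumulating into a
    `vector<16xf32>`):

    ```mlir
    %d = amdgpu.mfma %a * %b + %c {
      m = 32 : i32, n = 32 : i32, k = 8 : i32, blocks = 1 : i32
    } blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32>
    ```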
  }];
  let assemblyFormat = [{
    $sourceA `*` $sourceB `+` $destC
    attr-dict
    `blgp` `=` $blgp
    `:` type($sourceA) `,` type($sourceB) `,` type($destC)
  }];
  let hasVerifier = 1;
}

def AMDGPU_WMMAOp :
    AMDGPU_Op<"wmma", [AllTypesMatch<["destC", "destD"]>,
                       AllTypesMatch<["sourceA", "sourceB"]>,
                        Pure]>,
    Arguments<(ins
                   WMMAInTypes:$sourceA,
                   WMMAInTypes:$sourceB,
                   WMMAOutTypes:$destC,
                   DefaultValuedAttr<ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>, "0">:$subwordOffset,
                   UnitAttr:$unsignedA,
                   UnitAttr:$unsignedB,
                   UnitAttr:$clamp)>,
    Results<(outs WMMAOutTypes: $destD)> {
  let summary = "MLIR wrapper for RDNA3 wmma instructions";
  let description = [{
    The `amdgpu.wmma` op is an MLIR wrapper around intrinsics
    for various `wmma` instructions in the RDNA3 architecture, which perform
    a 16x16 matrix multiplication for different data types.

    When emitting f16->f16 (or bf16->bf16) wmma, the output is a 16xf16 (or
    16xbf16) vector containing only 8 valid values:
      - If `subwordOffset` is 0, then the output is stored at indices 0, 2, 4, ..., 14.
      - If `subwordOffset` is 1, then the output is stored at indices 1, 3, 5, ..., 15.

    `unsignedA` and `unsignedB` indicate that the `int8` LLVM inputs are
    unsigned.

    The `clamp` flag is used to saturate the output of type T to
    `numeric_limits<T>::max()` in case of overflow.
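
    An illustrative use (an f16 x f16 -> f32 16x16x16 multiply-accumulate):

    ```mlir
    %d = amdgpu.wmma %a * %b + %c
      : vector<16xf16>, vector<16xf16>, vector<8xf32>
    ```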
  }];
  let assemblyFormat = [{
    $sourceA `*` $sourceB `+` $destC
    attr-dict
    `:` type($sourceA) `,` type($sourceB) `,` type($destC)
  }];
  let hasVerifier = 1;
}

#endif // AMDGPU