//===-- NVGPU.td - NVGPU dialect operation definitions *- tablegen -*------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the basic operations for the NVGPU dialect.
//
// The NVGPU dialect provides a bridge between the target-agnostic GPU and
// Vector dialects and the lower-level NVVM dialect. This allows representing
// PTX-specific operations while using MLIR high-level concepts like memref
// and 2-D vector.
//
// Op semantics are based on the vendor-specific PTX definition:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
//
//===----------------------------------------------------------------------===//

#ifndef NVGPU
#define NVGPU

include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/OpBase.td"
include "mlir/IR/EnumAttr.td"

def NVGPU_Dialect : Dialect {
  let name = "nvgpu";
  let cppNamespace = "::mlir::nvgpu";
  let description = [{
    The `NVGPU` dialect provides a bridge between higher-level target-agnostic
    dialects (GPU and Vector) and the lower-level target-specific dialect
    (LLVM IR based NVVM dialect) for NVIDIA GPUs. This allows representing
    PTX-specific operations while using MLIR high-level dialects such as Memref
    and Vector for memory and target-specific register operands, respectively.
  }];

  let useDefaultTypePrinterParser = 1;
  let useDefaultAttributePrinterParser = 1;

  let extraClassDeclaration = [{
    /// Return true if the given MemRefType has an integer address
    /// space that matches the NVVM shared memory address space or
    /// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
    static bool hasSharedMemoryAddressSpace(MemRefType type);

    /// Return true if the given Attribute has an integer address
    /// space that matches the NVVM shared memory address space or
    /// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
    static bool isSharedMemoryAddressSpace(Attribute type);

    /// Defines the MemRef memory space attribute numeric value that indicates
    /// a memref is located in global memory. This should correspond to the
    /// value used in NVVM.
    static constexpr unsigned kGlobaldMemoryAddressSpace = 1;

    /// Defines the MemRef memory space attribute numeric value that indicates
    /// a memref is located in shared memory. This should correspond to the
    /// value used in NVVM.
    static constexpr unsigned kSharedMemoryAddressSpace = 3;
  }];
}

//===----------------------------------------------------------------------===//
// NVGPU Attribute Definitions
//===----------------------------------------------------------------------===//

def TensorMapSwizzleNone : I32EnumAttrCase<"SWIZZLE_NONE", 0, "none">;
def TensorMapSwizzle32B  : I32EnumAttrCase<"SWIZZLE_32B", 1, "swizzle_32b">;
def TensorMapSwizzle64B  : I32EnumAttrCase<"SWIZZLE_64B", 2, "swizzle_64b">;
def TensorMapSwizzle128B : I32EnumAttrCase<"SWIZZLE_128B", 3, "swizzle_128b">;
def TensorMapSwizzleKind : I32EnumAttr<"TensorMapSwizzleKind",
                                "Tensor map swizzling mode of shared memory banks",
  [ TensorMapSwizzleNone, TensorMapSwizzle32B, TensorMapSwizzle64B,
    TensorMapSwizzle128B]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapL2PromoNone : I32EnumAttrCase<"L2PROMO_NONE", 0, "none">;
def TensorMapL2Promo64B  : I32EnumAttrCase<"L2PROMO_64B", 1, "l2promo_64b">;
def TensorMapL2Promo128B : I32EnumAttrCase<"L2PROMO_128B", 2, "l2promo_128b">;
def TensorMapL2Promo256B : I32EnumAttrCase<"L2PROMO_256B", 3, "l2promo_256b">;
def TensorMapL2PromoKind : I32EnumAttr<"TensorMapL2PromoKind",
                                "Tensor map L2 promotion type",
  [ TensorMapL2PromoNone, TensorMapL2Promo64B, TensorMapL2Promo128B,
    TensorMapL2Promo256B]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapOOBZero : I32EnumAttrCase<"OOB_ZERO", 0, "zero">;
def TensorMapOOBNaN  : I32EnumAttrCase<"OOB_NAN", 1, "nan">;
def TensorMapOOBKind : I32EnumAttr<"TensorMapOOBKind",
                                "Tensor map out-of-bounds fill type",
  [ TensorMapOOBZero, TensorMapOOBNaN]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapInterleaveNone : I32EnumAttrCase<"INTERLEAVE_NONE", 0, "none">;
def TensorMapInterleave16B  : I32EnumAttrCase<"INTERLEAVE_16B", 1, "interleave_16b">;
def TensorMapInterleave32B  : I32EnumAttrCase<"INTERLEAVE_32B", 2, "interleave_32b">;
def TensorMapInterleaveKind : I32EnumAttr<"TensorMapInterleaveKind",
                                "Tensor map interleave layout type",
  [ TensorMapInterleaveNone, TensorMapInterleave16B, TensorMapInterleave32B]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def RcpApprox : I32EnumAttrCase<"APPROX", 0, "approx">;
def RcpRN     : I32EnumAttrCase<"RN", 1, "rn">;
def RcpRZ     : I32EnumAttrCase<"RZ", 2, "rz">;
def RcpRM     : I32EnumAttrCase<"RM", 3, "rm">;
def RcpRP     : I32EnumAttrCase<"RP", 4, "rp">;
def RcpRoundingMode   : I32EnumAttr<"RcpRoundingMode", "Rounding mode of rcp",
  [RcpApprox, RcpRN, RcpRZ, RcpRM, RcpRP]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapSwizzleAttr : EnumAttr<NVGPU_Dialect, TensorMapSwizzleKind, "swizzle">;
def TensorMapL2PromoAttr : EnumAttr<NVGPU_Dialect, TensorMapL2PromoKind, "l2promo">;
def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;

//===----------------------------------------------------------------------===//
// NVGPU Type Definitions
//===----------------------------------------------------------------------===//

class NVGPU_Type<string name, string typeMnemonic,
        list<Trait> traits = []> : TypeDef<NVGPU_Dialect, name, traits> {
  let mnemonic = typeMnemonic;
}

def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
                                        "device.async.token", []> {
  let summary = "device async token type";
  let description = [{
    `nvgpu.device.async.token` is a type returned by an asynchronous operation
    that runs on the GPU (device). It is used to establish an SSA-based link
    between the async operation (e.g. DeviceAsyncCopy) and operations that
    group or synchronize the async operations (e.g. DeviceAsyncCreateGroupOp,
    DeviceAsyncWaitOp).
  }];
}
def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> {
  let summary = "mbarrier barrier type";
  let description = [{
    This is the type for one or more mbarrier objects in shared memory that
    are used to synchronize a variable number of threads.

    If `num_barriers` is not set, the number of mbarrier objects is 1.

    An mbarrier object is 64 bits wide with 8-byte alignment. An mbarrier
    object can be initialized and invalidated.

    [See the PTX ISA for more details](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object)
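
    Example (illustrative; the memory space and barrier count below are
    placeholders):
    ```mlir
    !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 2>
    ```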
  }];
  let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers);
  let assemblyFormat = "`<` struct(params) `>`";
  let builders = [
    TypeBuilder<(ins "Attribute":$memorySpace), [{
      return $_get($_ctxt, memorySpace, 1);
    }]>
  ];
}

def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }

// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
  let summary = "TensorMap descriptor";
  let parameters = (ins "MemRefType":$tensor,
                        EnumParameter<TensorMapSwizzleKind>:$swizzle,
                        EnumParameter<TensorMapL2PromoKind>:$l2promo,
                        EnumParameter<TensorMapOOBKind>:$oob,
                        EnumParameter<TensorMapInterleaveKind>:$interleave);
  let description = [{
    `nvgpu.tensormap.descriptor` is a type that represents a TMA descriptor.
    It is a 128-byte object residing either in constant space or as a kernel
    parameter.
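
    Example (illustrative; the memref shape and the attribute values below are
    placeholders):
    ```mlir
    !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>,
                                swizzle = swizzle_128b,
                                l2promo = none,
                                oob = zero,
                                interleave = none>
    ```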
  }];
  let assemblyFormat = "`<` struct(params) `>`";
}

def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "warpgroup.descriptor", []> {
  let summary = "Warpgroup matrix descriptor type";
  let description = [{
  The descriptor specifies the properties of the matrix in shared memory that
  is a multiplicand in the matrix multiply and accumulate operation.

  The descriptor is a 64-bit value contained in a register with the following
  layout:
  ```
  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
  |   0-13  |14-15|   16-29   |30-31|   32-45   |46-48|49-51|   52-61   |62-63|
  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
  |  14bits |2bits|   14bits  |2bits|   14bits  |3bits|3bits|   10bits  |2bits|
  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
  | BaseAddr|  0  | LeadingDim|  0  |   Stride  |  0  |Offst|     0     |Swzle|
  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
  ```

  [See the PTX ISA for more details](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shared-memory-layout-matrix-descriptor)

  }];
  let parameters = (ins "MemRefType":$tensor);
  let assemblyFormat = "`<` struct(params) `>`";
}

def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.accumulator", []> {
  let parameters = (ins "VectorType":$fragmented);
  let assemblyFormat = "`<` struct(params) `>`";
  let description = [{
    This type represents the result matrix obtained from `nvgpu.warpgroup.mma`.
    The `$fragmented` type signifies the distributed or fragmented result
    vector that is collectively owned by all the threads in the warp-group
    that executed `nvgpu.warpgroup.mma`.
    [See the details of register fragment layout for accumulator matrix D]
    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
  }];
}

//===----------------------------------------------------------------------===//
// NVGPU Op Definitions
//===----------------------------------------------------------------------===//

class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
  Op<NVGPU_Dialect, mnemonic, traits> {}

def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
                                MemoryEffects<[MemRead]>,
                                PredOpTrait<"srcMemref and res have same element type",
                                            TCresVTEtIsSameAsOp<0, 0>>]> {
  let description = [{
    The `nvgpu.ldmatrix` op represents loading a matrix fragment from
    memory to registers. The source and result type must be compatible
    with lowering to the `nvvm.ldmatrix` instruction. This op represents
    the distributed version of a `vector.transfer_read` as an intermediate
    step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.

    This operation is meant to follow the semantics described here:
    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix

    Example:
    ```mlir
    %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
      memref<?x?xf16, 3> -> vector<4x2xf16>
    ```
  }];

  let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$srcMemref,
                           Variadic<Index>:$indices, BoolAttr:$transpose,
                           I32Attr:$numTiles);
  let results = (outs AnyVectorOfNonZeroRank:$res);
  let assemblyFormat = [{
    $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
  }];

  let hasVerifier = 1;
}

class NVGPU_MmaSyncOp<string mnemonic> :
        NVGPU_Op<mnemonic,  [Pure,
                             PredOpTrait<"matrixA and matrixB have same element type",
                                         TCopVTEtIsSameAs<0, 1>>]> {
  code extraBaseClassDeclaration = [{
    std::array<int64_t, 3> getMmaShapeAsArray() {
      ArrayAttr mmaShape = this->getMmaShape();
      assert(mmaShape.size() == 3 && "mmaShape should be three integers");
      return {::llvm::cast<IntegerAttr>(mmaShape[0]).getInt(),
              ::llvm::cast<IntegerAttr>(mmaShape[1]).getInt(),
              ::llvm::cast<IntegerAttr>(mmaShape[2]).getInt()};
    }
  }];

  let hasVerifier = 1;
}

def NVGPU_MmaSyncOp : NVGPU_MmaSyncOp<"mma.sync"> {
  let description = [{
    The `nvgpu.mma.sync` op represents the warp-level matrix-multiply-and-
    accumulate (mma) operation that is compatible with `nvvm.mma.sync`.
    The operand and result vector sizes represent the thread-level ownership
    of the warp-level mma operation shape. The `mmaShape` attribute holds the
    warp-level matrix-multiply shape.

    The `nvgpu.mma.sync` op serves as an intermediate point when lowering from
    `vector.contract` to `nvvm.mma.sync`.

    This operation is meant to follow the semantics described here:
      https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma

    Example:

    ```mlir
    %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
        (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
    ```
  }];
  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
                       AnyVectorOfNonZeroRank:$matrixB,
                       AnyVectorOfNonZeroRank:$matrixC,
                       I64ArrayAttr:$mmaShape,
                       OptionalAttr<UnitAttr>:$tf32Enabled);

  let results = (outs AnyVectorOfNonZeroRank:$res);

  let builders = [
    OpBuilder<(ins "Value":$matrixA,
                   "Value":$matrixB,
                   "Value":$matrixC,
                   "ArrayAttr":$mmaShape)>,
    OpBuilder<(ins "Value":$matrixA,
                   "Value":$matrixB,
                   "Value":$matrixC,
                   "ArrayRef<int64_t>":$mmaShape,
                   CArg<"bool", "false">:$tf32Enabled)>
  ];

  let assemblyFormat = [{
    `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
    `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
  }];

  let extraClassDeclaration = extraBaseClassDeclaration;
}

def NVGPU_MmaSparseSyncMetadataType : FixedVectorOfLengthAndType<[2], [I16]>,
                        BuildableType<"::mlir::VectorType::get("
                          "{2},$_builder.getI16Type())">;

def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
  let description = [{
  The `nvgpu.mma.sp.sync` operation performs a warp-distributed MMA operation
  where operand A is "structured sparse". In this case, the `matrixA` operand
  represents the (warp-distributed) non-zero values of operand A, and the
  `sparseMetadata` operand provides the indices.

  The full description of the sparsity storage format and distribution scheme
  is given in the PTX docs. This operation is meant to follow the semantics
  described in the PTX documentation here:
  https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma

  The way the indices are distributed among the threads in a warp is controlled
  by the optional `sparsitySelector` attribute, which is `0` by default. For
  more information, please consult the PTX documentation linked above.

  Example (targeting the f16 16x8x32 `mma.sp` PTX instruction):

  ```mlir
  nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
    (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  ```
  }];

  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
                       AnyVectorOfNonZeroRank:$matrixB,
                       AnyVectorOfNonZeroRank:$matrixC,
                       NVGPU_MmaSparseSyncMetadataType:$sparseMetadata,
                       I64ArrayAttr:$mmaShape,
                       DefaultValuedAttr<I32Attr, "0">:$sparsitySelector,
                       OptionalAttr<UnitAttr>:$tf32Enabled
                       );

  let results = (outs AnyVectorOfNonZeroRank:$res);

  let builders = [
    OpBuilder<(ins "Value":$matrixA,
                   "Value":$matrixB,
                   "Value":$matrixC,
                   "Value":$sparseMetadata,
                   "ArrayRef<int64_t>":$mmaShape)>
  ];

  let assemblyFormat = [{
    `(` $matrixA`,` $matrixB`,` $matrixC `)` `metadata` `(` $sparseMetadata `)` attr-dict
    `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
  }];

  let extraClassDeclaration = extraBaseClassDeclaration;
}

def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
                                       AttrSizedOperandSegments]> {
  let summary = "device-side asynchronous copy";
  let description = [{
    The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
    elements from the source (global memory) to the destination (shared memory)
    without blocking the thread. The async copy is added to a group.

    This op is meant to be used with `nvgpu.device_async_create_group` and
    `nvgpu.device_async_wait` to synchronize copies as explained in those ops'
    descriptions.

    The `bypassL1` attribute is a hint to the hardware to bypass the L1 cache
    during the async copy; this hint may be ignored by the hardware.

    The `dstElements` attribute is the total number of elements written to
    the destination (shared memory).

    The `srcElements` argument is the total number of elements read from
    the source (global memory).

    `srcElements` is an optional argument; when present, the op reads only
    `srcElements` elements from the source (global memory) and zero-fills
    the rest of the elements in the destination (shared memory).

    In order to do a copy and wait for the result, the following combination
    is needed:
    ```
    // copy 1.
    %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
    // copy 2.
    %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
    // group 1 contains copy 1 and copy 2.
    %token1 = nvgpu.device_async_create_group %cp1, %cp2
    // copy 3.
    %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
    // group 2 contains copy 3.
    %token2 = nvgpu.device_async_create_group %cp3
    // after the wait copy 1 and copy 2 are complete.
    nvgpu.device_async_wait %token1
    // after the wait copy 3 is complete.
    nvgpu.device_async_wait %token2
    ```

    Example:

    ```mlir
    %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
      memref<4x5xf32> to memref<2x7x5xf32, 3>
    ```
  }];
  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
  let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
                       Variadic<Index>:$dstIndices,
                       Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
                       Variadic<Index>:$srcIndices,
                       IndexAttr:$dstElements,
                       Optional<Index>:$srcElements,
                       OptionalAttr<UnitAttr>:$bypassL1);
  let assemblyFormat = [{
    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $dstElements (`,` $srcElements^)?
      attr-dict `:` type($src) `to` type($dst)
  }];
  let hasVerifier = 1;
}

def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
  let summary = "device side asynchronous create group operation";
  let description = [{
    The `nvgpu.device_async_create_group` op creates a group of memory accesses
    containing all the pending `device_async_copy` operations associated with
    the argument tokens. Each token can only be part of one group.

    It returns a token that can be used to wait until the group fully completes.

    This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
    as explained in those ops' descriptions.

    Groups are executed in the order they are created.

    Example:

    ```mlir
    %0 = nvgpu.device_async_create_group
    ```
  }];
  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
  let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
  let assemblyFormat = [{
    $inputTokens attr-dict
  }];
}

def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
  let summary = "Wait for async gpu ops to complete.";
  let description = [{
    The `nvgpu.device_async_wait` op will block the execution thread until the group
    associated with the source token is fully completed.

    The optional `$numGroups` attribute gives an upper bound on the number of
    groups that may still be uncompleted when the wait unblocks the thread. For
    example, if 16 async groups are pushed and `$numGroups` is set to 12, then
    the thread will unblock when 12 groups or fewer are in flight (4 groups
    have completed).

    Example:

    ```mlir
    nvgpu.device_async_wait %0
    ```
  }];
  let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
                       OptionalAttr<I32Attr>:$numGroups);
  let assemblyFormat = [{
    $asyncDependencies attr-dict
  }];
}

def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
  let summary = "Creates a `nvgpu.mbarrier` object.";
  let description = [{
    The Op generates one or more `mbarrier` objects. An `mbarrier` is a barrier
    created in shared memory that supports various synchronization behaviors
    for threads.

    The `mbarrier` object has the following type and alignment requirements:
      Type: .b64, Alignment: 8, Memory space: .shared

    Example:
    ```mlir
      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins);
  let results = (outs NVGPU_MBarrierGroup:$barriers);
  let assemblyFormat = [{
     attr-dict `->` type($barriers)
  }];
}

def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
  let summary = "Initialize the `nvgpu.mbarrier`.";
  let description = [{
    The Op initializes the `mbarrier` object with the given number of threads.

    Example:
    ```mlir
      %num_threads = gpu.block_dim x
      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
      nvgpu.mbarrier.init %barrier[%c0], %num_threads : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId, Optional<I1>:$predicate);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
}

def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
  let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
  let description = [{
    Checks whether the mbarrier object has completed the phase. It is a
    non-blocking instruction which tests for the completion of the phase.

    Example:
    ```mlir
      %isComplete = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId);
  let results = (outs I1:$waitComplete);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)";
}

def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
  let summary = "Performs an arrive operation on the `nvgpu.mbarrier`.";
  let description = [{
    The Op performs an arrive-on operation on the `mbarrier` object and returns
    a `nvgpu.mbarrier.token`.

    For more information, see
    https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object

    Example:
    ```mlir
      %token = nvgpu.mbarrier.arrive %barrier[%c0] : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId);
  let results = (outs NVGPU_MBarrierToken:$token);
  let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)";
}

def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
  let summary = "Performs a non-blocking arrive operation on the `nvgpu.mbarrier`.";
  let description = [{
    The Op performs an arrive-on operation on the `mbarrier` object and returns
    a `nvgpu.mbarrier.token`.

    The Op does not cause the `nvgpu.mbarrier` to complete its current phase.

    Example:
    ```mlir
      %token = nvgpu.mbarrier.arrive.nocomplete %barrier[%c0], %count : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId,
                       Index:$count);
  let results = (outs NVGPU_MBarrierToken:$token);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)";
}

def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
  let summary = "Performs an expect_tx operation on the `nvgpu.mbarrier`";
  let description = [{
    A thread executing the Op performs an expect-tx operation on the mbarrier
    object at the location specified by the address operand $barriers. The
    expect-tx operation, with a $txcount argument, increases the tx-count of
    an mbarrier object by the value specified by $txcount. This makes the
    current phase of the mbarrier object expect and track the completion of
    additional asynchronous transactions.

    The `$txcount` operand specifies the number of elements for the expect-tx
    operation.

    Example:
    ```mlir
      nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %txcount : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId, Optional<I1>:$predicate);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount  (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
}

def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
  let summary = "Waits for the `nvgpu.mbarrier` to complete its current phase.";
  let description = [{
    Checks whether the mbarrier object has completed the phase. It is a
    potentially blocking instruction which tests for the completion of the
    phase. A suspended thread resumes execution when the specified phase
    completes OR before the phase completes following a system-dependent time
    limit.

    The `$phaseParity` specifies either even phase (0) or odd phase (1) to
    wait.

    Example:
    ```mlir
      nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phaseParity, %ticks : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, Index:$ticks, Index:$mbarId);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";
}

def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
  let summary = "Prefetch given `nvgpu.tensormap.descriptor`";
  let description = [{
    The Op prefetches the cache line containing the given `$tensorMapDescriptor`
    for subsequent use by the `tma.async.load` instruction.
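
    Example (illustrative; the descriptor type below is a placeholder):
    ```mlir
      nvgpu.tma.prefetch.descriptor %tensorMap : !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
    ```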
  }];
  let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, Optional<I1>:$predicate);
  let assemblyFormat = [{
    $tensorMapDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
  }];
}

def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", [AttrSizedOperandSegments]> {
  let summary = "TMA asynchronous load";
  let description = [{
    The Op loads a tile of memory from global memory to shared memory using
    Tensor Memory Access (TMA).

    `$tensorMapDescriptor` is a tensor map descriptor that holds information
    about the tile shape. The descriptor is created by
    `nvgpu.tma.create.descriptor`.

    The Op uses the `$barriers` mbarrier-based completion mechanism.
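
    Example (illustrative; the memref shapes, descriptor attributes, and SSA
    names below are placeholders):
    ```mlir
      nvgpu.tma.async.load %tensorMap[%c0, %c0], %barriers[%c0] to %dst
        : !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>,
          !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
        -> memref<128x64xf16, 3>
    ```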
  }];
  let arguments = (ins  Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
                        NVGPU_MBarrierGroup:$barriers,
                        NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
                        Variadic<Index>:$coordinates,
                        Index:$mbarId,
                        Optional<I16>:$multicastMask,
                        Optional<I1>:$predicate);
  let assemblyFormat = [{
    $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]`
      `to` $dst
      (`multicast_mask` `=` $multicastMask^ )?
      (`,` `predicate` `=` $predicate^)?
      attr-dict `:` type($tensorMapDescriptor) `,` type($barriers)
      `->` type($dst)
  }];
  let hasVerifier = 1;

}

def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegments]> {
  let summary = "TMA asynchronous store";
  let description = [{
    The Op stores a tile of memory from shared memory to global memory using
    Tensor Memory Access (TMA).

    `$tensorMapDescriptor` is a tensor map descriptor that holds information
    about the tile shape. The descriptor is created by
    `nvgpu.tma.create.descriptor`.
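
    Example (illustrative; the memref shape and descriptor attributes below
    are placeholders):
    ```mlir
      nvgpu.tma.async.store %src to %tensorMap[%c0, %c0]
        : memref<128x64xf16, 3>
        -> !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
    ```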
  }];
  let arguments = (ins  Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
                        Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
                        Variadic<Index>:$coordinates,
                        Optional<I1>:$predicate);
  let assemblyFormat = [{
      $src `to` $tensorMapDescriptor `[` $coordinates `]`
      (`,` `predicate` `=` $predicate^)?
      attr-dict `:` type($src)
      `->` type($tensorMapDescriptor)
  }];
  let hasVerifier = 1;
}

def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
  let summary = "TMA create descriptor";
  let description = [{
    The Op creates a tensor map descriptor object representing a tiled memory
    region. To do so, it calls the CUDA Driver's `cuTensorMapEncodeTiled`. The
    descriptor is used by Tensor Memory Access (TMA).

    The `tensor` is the source tensor to be tiled.

    The `boxDimensions` argument specifies the size of the tiled memory region
    in each dimension.

    For more information, see:
    https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
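
    Example (illustrative; the box sizes and the resulting descriptor type
    below are placeholders):
    ```mlir
      %tensorMap = nvgpu.tma.create.descriptor %src box[%c128, %c64]
        : memref<*xf16>
        -> !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
    ```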
  }];

  let arguments = (ins AnyUnrankedMemRef:$tensor,
                       Variadic<Index>:$boxDimensions);
  let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
  let assemblyFormat = [{
         $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
  }];
  let hasVerifier = 1;
}

def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
  let summary = "Generate a warpgroup matrix descriptor";
  let description = [{
  This Op builds a `nvgpu.warpgroup.descriptor` that is used by
  `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and
  accumulate.

  The descriptor specifies the properties of the matrix in shared memory that
  is a multiplicand in the matrix multiply and accumulate operation.
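
  Example (illustrative; the memref and descriptor types below are
  placeholders):
  ```mlir
  %descA = nvgpu.warpgroup.generate.descriptor %shmemA, %tensorMapA
    : memref<128x64xf16, 3>,
      !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
    -> !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
  ```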
  }];
  let results = (outs NVGPU_WarpgroupMatrixDescriptor:$descriptor);
  let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor,
                       NVGPU_TensorMapDescriptor:$tensorMap);
  let assemblyFormat = [{$tensor `,` $tensorMap attr-dict `:` type($tensor) `,` type($tensorMap) `->` type($descriptor)}];
  let hasVerifier = 1;
}

def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
  let description = [{
    The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps)
    matrix-multiply-and-accumulate (mma) operation that is lowered to
    `nvvm.wgmma.mma_async`.

    The operands `descriptorA` and `descriptorB` are wgmma matrix descriptors
    that describe the properties of the matrices in shared memory. The result
    represents thread-level ownership of the warpgroup-level mma operation
    shape. The shape is deduced from the descriptor types and the output
    vector.

    The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete
    the given shape. Since the `nvvm.wgmma.mma_async` Op, like its
    corresponding PTX instruction, is asynchronous, this Op groups the
    `nvvm.wgmma.mma_async` Ops and surrounds them with `wgmma.fence.aligned`,
    `wgmma.commit.group.sync.aligned`, and `wgmma.wait.group.sync.aligned` Ops.

    Example:
    ```mlir
      %res = nvgpu.warpgroup.mma %descA, %descB, %acc:
               !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>,
               !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>,
               !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
               -> !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
    ```
  }];

  let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA,
                       NVGPU_WarpgroupMatrixDescriptor:$descriptorB,
                       DefaultValuedOptionalAttr<I64Attr, "1">:$waitGroup,
                       OptionalAttr<UnitAttr>:$transposeA,
                       OptionalAttr<UnitAttr>:$transposeB,
                       NVGPU_WarpgroupAccumulator:$matrixC);
  let results = (outs NVGPU_WarpgroupAccumulator:$matrixD);
  let assemblyFormat = [{
    $descriptorA`,` $descriptorB`,` $matrixC attr-dict
    `:` type($descriptorA) `,` type($descriptorB) `,` type($matrixC) `->` type($matrixD)
  }];
  let hasVerifier = 1;
}

def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
  let description = [{
    The `nvgpu.warpgroup.mma.store` op stores the fragmented result held in
    `$matrixD` to the given memref.

    [See the details of register fragment layout for accumulator matrix D]
    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)

    Note that the op must be run by a full warpgroup.
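
    Example (illustrative; the accumulator and memref types below are
    placeholders):
    ```mlir
      nvgpu.warpgroup.mma.store %matrixD, %dstMemref
        : !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
        to memref<128x128xf32, 3>
    ```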
  }];

  let arguments = (ins NVGPU_WarpgroupAccumulator:$matrixD,
                       Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);

  let assemblyFormat = [{
    $matrixD `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
  }];
  let hasVerifier = 1;
}

def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulator"> {
  let summary = "Initializes the accumulator matrix";

  let description = [{
    This Op generates and initializes the accumulator matrix for
    `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
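
    Example (illustrative; the accumulator type below is a placeholder):
    ```mlir
      %acc = nvgpu.warpgroup.mma.init.accumulator
               -> !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
    ```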
  }];
  let results = (outs NVGPU_WarpgroupAccumulator:$matrixC);
  let assemblyFormat = "attr-dict `->` type($matrixC)";
  let hasVerifier = 1;
}

def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
                                   SameOperandsAndResultType]> {
  let summary = "The reciprocal calculation for vector types";
  let description = [{
    Reciprocal calculation for `vector` types using `nvvm.rcp` Ops.

    Currently, only the `approx` rounding mode and `ftz` are supported, and
    only for the `f32` type.

    The input and output must be of the same vector type and shape.
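
    Example (illustrative; the vector shape below is a placeholder):
    ```mlir
      %out = nvgpu.rcp %in {rounding = approx, ftz} : vector<32x16xf32>
    ```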
  }];
  let arguments = (ins VectorOfNonZeroRankOf<[F32]>:$in,
                       DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
                       UnitAttr:$ftz);
  let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
  let assemblyFormat = [{
    $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}`
    attr-dict `:` type($out)
  }];
  let hasVerifier = 1;
}
#endif // NVGPU