//===-- NVGPU.td - NVGPU dialect operation definitions *- tablegen -*------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the basic operations for the NVGPU dialect.
//
// The NVGPU dialect provides a bridge between the target-agnostic GPU and
// Vector dialects and the lower-level NVVM dialect. This allows representing
// PTX-specific operations while using MLIR high-level concepts like memref
// and 2-D vector.
//
// Op semantics are based on the vendor-specific PTX definition:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
//
//===----------------------------------------------------------------------===//

#ifndef NVGPU
#define NVGPU

include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/OpBase.td"
include "mlir/IR/EnumAttr.td"

def NVGPU_Dialect : Dialect {
  let name = "nvgpu";
  let cppNamespace = "::mlir::nvgpu";
  let description = [{
    The `NVGPU` dialect provides a bridge between higher-level target-agnostic
    dialects (GPU and Vector) and the lower-level target-specific dialect
    (LLVM IR based NVVM dialect) for NVIDIA GPUs. This allows representing
    PTX-specific operations while using MLIR high-level dialects such as MemRef
    and Vector for memory and target-specific register operands, respectively.
  }];

  let useDefaultTypePrinterParser = 1;
  let useDefaultAttributePrinterParser = 1;

  let extraClassDeclaration = [{
    /// Return true if the given MemRefType has an integer address
    /// space that matches the NVVM shared memory address space or
    /// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
    static bool hasSharedMemoryAddressSpace(MemRefType type);

    /// Return true if the given Attribute has an integer address
    /// space that matches the NVVM shared memory address space or
    /// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
    static bool isSharedMemoryAddressSpace(Attribute type);

    /// Defines the MemRef memory space attribute numeric value that indicates
    /// a memref is located in global memory. This should correspond to the
    /// value used in NVVM.
    static constexpr unsigned kGlobaldMemoryAddressSpace = 1;

    /// Defines the MemRef memory space attribute numeric value that indicates
    /// a memref is located in shared memory. This should correspond to the
    /// value used in NVVM.
    static constexpr unsigned kSharedMemoryAddressSpace = 3;
  }];
}

//===----------------------------------------------------------------------===//
// NVGPU Attribute Definitions
//===----------------------------------------------------------------------===//

def TensorMapSwizzleNone : I32EnumAttrCase<"SWIZZLE_NONE", 0, "none">;
def TensorMapSwizzle32B : I32EnumAttrCase<"SWIZZLE_32B", 1, "swizzle_32b">;
def TensorMapSwizzle64B : I32EnumAttrCase<"SWIZZLE_64B", 2, "swizzle_64b">;
def TensorMapSwizzle128B : I32EnumAttrCase<"SWIZZLE_128B", 3, "swizzle_128b">;
def TensorMapSwizzleKind : I32EnumAttr<"TensorMapSwizzleKind",
    "Tensor map swizzling mode of shared memory banks",
    [ TensorMapSwizzleNone, TensorMapSwizzle32B, TensorMapSwizzle64B,
      TensorMapSwizzle128B]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapL2PromoNone : I32EnumAttrCase<"L2PROMO_NONE", 0, "none">;
def TensorMapL2Promo64B : I32EnumAttrCase<"L2PROMO_64B", 1, "l2promo_64b">;
def TensorMapL2Promo128B : I32EnumAttrCase<"L2PROMO_128B", 2, "l2promo_128b">;
def TensorMapL2Promo256B : I32EnumAttrCase<"L2PROMO_256B", 3, "l2promo_256b">;
def TensorMapL2PromoKind : I32EnumAttr<"TensorMapL2PromoKind",
    "Tensor map L2 promotion type",
    [ TensorMapL2PromoNone, TensorMapL2Promo64B, TensorMapL2Promo128B,
      TensorMapL2Promo256B]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapOOBZero : I32EnumAttrCase<"OOB_ZERO", 0, "zero">;
def TensorMapOOBNaN : I32EnumAttrCase<"OOB_NAN", 1, "nan">;
def TensorMapOOBKind : I32EnumAttr<"TensorMapOOBKind",
    "Tensor map out-of-bounds fill type",
    [ TensorMapOOBZero, TensorMapOOBNaN]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapInterleaveNone : I32EnumAttrCase<"INTERLEAVE_NONE", 0, "none">;
def TensorMapInterleave16B : I32EnumAttrCase<"INTERLEAVE_16B", 1, "interleave_16b">;
def TensorMapInterleave32B : I32EnumAttrCase<"INTERLEAVE_32B", 2, "interleave_32b">;
def TensorMapInterleaveKind : I32EnumAttr<"TensorMapInterleaveKind",
    "Tensor map interleave layout type",
    [ TensorMapInterleaveNone, TensorMapInterleave16B, TensorMapInterleave32B]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def RcpApprox : I32EnumAttrCase<"APPROX", 0, "approx">;
def RcpRN : I32EnumAttrCase<"RN", 1, "rn">;
def RcpRZ : I32EnumAttrCase<"RZ", 2, "rz">;
def RcpRM : I32EnumAttrCase<"RM", 3, "rm">;
def RcpRP : I32EnumAttrCase<"RP", 4, "rp">;
def RcpRoundingMode : I32EnumAttr<"RcpRoundingMode", "Rounding mode of rcp",
    [RcpApprox, RcpRN, RcpRZ, RcpRM, RcpRP]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapSwizzleAttr : EnumAttr<NVGPU_Dialect, TensorMapSwizzleKind, "swizzle">;
def TensorMapL2PromoAttr : EnumAttr<NVGPU_Dialect, TensorMapL2PromoKind, "l2promo">;
def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;

//===----------------------------------------------------------------------===//
// NVGPU Type Definitions
//===----------------------------------------------------------------------===//

class NVGPU_Type<string name, string typeMnemonic,
    list<Trait> traits = []> :
    TypeDef<NVGPU_Dialect, name, traits> {
  let mnemonic = typeMnemonic;
}

def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
    "device.async.token", []> {
  let summary = "device async token type";
  let description = [{
    `nvgpu.device.async.token` is a type returned by an asynchronous operation
    that runs on the GPU (device). It is used to establish an SSA-based link
    between the async operation (e.g. DeviceAsyncCopy) and operations that
    group or synchronize the async operations (e.g. DeviceAsyncCreateGroupOp,
    DeviceAsyncWaitOp).
  }];
}

def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> {
  let summary = "mbarrier barrier type";
  let description = [{
    This is the type for one or more mbarrier objects in shared memory that are
    used to synchronize a variable number of threads.

    If `num_barriers` is not set, the number of mbarrier objects is 1.

    An mbarrier object is 64 bits wide with 8-byte alignment. The mbarrier
    object can be initialized and invalidated.

    [See the PTX ISA for more details](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object)
  }];
  let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers);
  let assemblyFormat = "`<` struct(params) `>`";
  let builders = [
    TypeBuilder<(ins "Attribute":$memorySpace), [{
      return $_get($_ctxt, memorySpace, 1);
    }]>
  ];
}

def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }

// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
  let summary = "TensorMap descriptor";
  let parameters = (ins "MemRefType":$tensor,
                        EnumParameter<TensorMapSwizzleKind>:$swizzle,
                        EnumParameter<TensorMapL2PromoKind>:$l2promo,
                        EnumParameter<TensorMapOOBKind>:$oob,
                        EnumParameter<TensorMapInterleaveKind>:$interleave);
  let description = [{
    `nvgpu.tensormap.descriptor` is a type that represents a TMA descriptor.
    It is a 128-byte object that lives either in constant space or in a kernel
    parameter.
  }];
  let assemblyFormat = "`<` struct(params) `>`";
}

def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "warpgroup.descriptor", []> {
  let summary = "Warpgroup matrix descriptor type";
  let description = [{
    The descriptor specifies the properties of the matrix in shared memory that
    is a multiplicand in the matrix multiply and accumulate operation.
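
    For illustration, a descriptor type for a `128x64` `f16` tile held in
    shared memory (shapes chosen only as placeholders) is written as:

    ```mlir
    !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
    ```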

    The descriptor is a 64-bit value contained in a register with the
    following layout:
    ```
    +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
    |   0-13  |14-15|   16-29   |30-31|   32-45   |46-48|49-51|   52-61   |62-63|
    +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
    |  14bits |2bits|   14bits  |2bits|   14bits  |2bits|3bits|   10bits  |2bits|
    +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
    | BaseAddr|  0  | LeadingDim|  0  |   Stride  |  0  |Offst|     0     |Swzle|
    +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
    ```

    [See the PTX ISA for more details](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shared-memory-layout-matrix-descriptor)

  }];
  let parameters = (ins "MemRefType":$tensor);
  let assemblyFormat = "`<` struct(params) `>`";
}

def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.accumulator", []> {
  let parameters = (ins "VectorType":$fragmented);
  let assemblyFormat = "`<` struct(params) `>`";
  let description = [{
    This type represents the result matrix obtained from `nvgpu.warpgroup.mma`.
    The `$fragmented` type signifies the distributed or fragmented result
    vector that is collectively owned by all the threads in the warp-group
    that executed `nvgpu.warpgroup.mma`.
    [See the details of register fragment layout for accumulator matrix D](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
  }];
}

//===----------------------------------------------------------------------===//
// NVGPU Op Definitions
//===----------------------------------------------------------------------===//

class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
  Op<NVGPU_Dialect, mnemonic, traits> {}

def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
    MemoryEffects<[MemRead]>,
    PredOpTrait<"srcMemref and res have same element type",
                TCresVTEtIsSameAsOp<0, 0>>]> {
  let description = [{
    The `nvgpu.ldmatrix` op represents loading a matrix fragment from
    memory to registers. The source and result types must be compatible
    with lowering to the `nvvm.ldmatrix` instruction. This op represents
    the distributed version of a `vector.transfer_read` as an intermediate
    step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.

    This operation is meant to follow the semantics described here:
    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix

    Example:
    ```mlir
    %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
      memref<?x?xf16, 3> -> vector<4x2xf16>
    ```
  }];

  let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$srcMemref,
                       Variadic<Index>:$indices, BoolAttr:$transpose,
                       I32Attr:$numTiles);
  let results = (outs AnyVectorOfNonZeroRank:$res);
  let assemblyFormat = [{
    $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
  }];

  let hasVerifier = 1;
}

class NVGPU_MmaSyncOp<string mnemonic> :
    NVGPU_Op<mnemonic, [Pure,
                        PredOpTrait<"matrixA and matrixB have same element type",
                                    TCopVTEtIsSameAs<0, 1>>]> {
  code extraBaseClassDeclaration = [{
    std::array<int64_t, 3> getMmaShapeAsArray() {
      ArrayAttr mmaShape = this->getMmaShape();
      assert(mmaShape.size() == 3 && "mmaShape should be three integers");
      return {::llvm::cast<IntegerAttr>(mmaShape[0]).getInt(),
              ::llvm::cast<IntegerAttr>(mmaShape[1]).getInt(),
              ::llvm::cast<IntegerAttr>(mmaShape[2]).getInt()};
    }
  }];

  let hasVerifier = 1;
}

def NVGPU_MmaSyncOp : NVGPU_MmaSyncOp<"mma.sync"> {
  let description = [{
    The `nvgpu.mma.sync` op represents the warp-level
    matrix-multiply-and-accumulate (mma) operation that is compatible with
    `nvvm.mma.sync`. The operand and result vector sizes represent the
    thread-level ownership of the warp-level mma operation shape. The
    `mmaShape` attribute holds the warp-level matrix-multiply shape.

    The `nvgpu.mma.sync` op serves as an intermediate step in lowering from
    `vector.contract` to `nvvm.mma.sync`.

    This operation is meant to follow the semantics described here:
    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma

    Example:

    ```mlir
    %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
        (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
    ```
  }];
  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
                       AnyVectorOfNonZeroRank:$matrixB,
                       AnyVectorOfNonZeroRank:$matrixC,
                       I64ArrayAttr:$mmaShape,
                       OptionalAttr<UnitAttr>:$tf32Enabled);

  let results = (outs AnyVectorOfNonZeroRank:$res);

  let builders = [
    OpBuilder<(ins "Value":$matrixA,
                   "Value":$matrixB,
                   "Value":$matrixC,
                   "ArrayAttr":$mmaShape)>,
    OpBuilder<(ins "Value":$matrixA,
                   "Value":$matrixB,
                   "Value":$matrixC,
                   "ArrayRef<int64_t>":$mmaShape,
                   CArg<"bool", "false">:$tf32Enabled)>
  ];

  let assemblyFormat = [{
    `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
    `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
  }];

  let extraClassDeclaration = extraBaseClassDeclaration;
}

def NVGPU_MmaSparseSyncMetadataType : FixedVectorOfLengthAndType<[2], [I16]>,
    BuildableType<"::mlir::VectorType::get("
                  "{2},$_builder.getI16Type())">;

def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
  let description = [{
    The `nvgpu.mma.sp.sync` operation performs a warp-distributed MMA operation
    where operand A is "structured sparse".
    In this case, the `matrixA` operand
    represents the (warp-distributed) non-zero values of operand A, and the
    `sparseMetadata` operand provides the indices.

    The full description of the sparsity storage format and distribution
    scheme is given in the PTX docs. This operation is meant to follow the
    semantics described in the PTX documentation here:
    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma

    The way the indices are distributed among the threads in a warp is
    controlled by the optional `sparsitySelector` attribute, which is `0` by
    default. For more information, please consult the PTX documentation linked
    above.

    Example (targeting the f16 16x8x32 `mma.sp` PTX instruction):

    ```mlir
    nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
      (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
    ```
  }];

  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
                       AnyVectorOfNonZeroRank:$matrixB,
                       AnyVectorOfNonZeroRank:$matrixC,
                       NVGPU_MmaSparseSyncMetadataType:$sparseMetadata,
                       I64ArrayAttr:$mmaShape,
                       DefaultValuedAttr<I32Attr, "0">:$sparsitySelector,
                       OptionalAttr<UnitAttr>:$tf32Enabled
                       );

  let results = (outs AnyVectorOfNonZeroRank:$res);

  let builders = [
    OpBuilder<(ins "Value":$matrixA,
                   "Value":$matrixB,
                   "Value":$matrixC,
                   "Value":$sparseMetadata,
                   "ArrayRef<int64_t>":$mmaShape)>
  ];

  let assemblyFormat = [{
    `(` $matrixA`,` $matrixB`,` $matrixC `)` `metadata` `(` $sparseMetadata `)` attr-dict
    `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
  }];

  let extraClassDeclaration = extraBaseClassDeclaration;
}

def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
    AttrSizedOperandSegments]> {
  let summary = "device-side asynchronous copy";
  let description = [{
    The `nvgpu.device_async_copy` op initiates an asynchronous copy of elements
    from the source (global memory) to the destination (shared memory) without
    blocking the thread. The async copy is added to a group.

    This op is meant to be used with `nvgpu.device_async_create_group` and
    `nvgpu.device_async_wait` to synchronize copies as explained in those ops'
    descriptions.

    The `bypassL1` attribute is a hint to the hardware to bypass the L1 cache
    during the async copy; this hint may be ignored by the hardware.

    The `dstElements` attribute is the total number of elements written to
    the destination (shared memory).

    The `srcElements` argument is the total number of elements read from
    the source (global memory).

    `srcElements` is an optional argument; when present, the op only reads
    `srcElements` elements from the source (global memory) and zero-fills the
    rest of the elements in the destination (shared memory).

    In order to do a copy and wait for the result, we need the following
    combination:
    ```
    // copy 1.
    %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
    // copy 2.
    %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
    // group 1 contains copy 1 and copy 2.
    %token1 = nvgpu.device_async_create_group %cp1, %cp2
    // copy 3.
    %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
    // group 2 contains copy 3.
    %token2 = nvgpu.device_async_create_group %cp3
    // after the wait copy 1 and copy 2 are complete.
    nvgpu.device_async_wait %token1
    // after the wait copy 3 is complete.
    nvgpu.device_async_wait %token2
    ```

    Example:

    ```mlir
    %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
      memref<4x5xf32> to memref<2x7x5xf32, 3>
    ```
  }];
  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
  let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
                       Variadic<Index>:$dstIndices,
                       Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
                       Variadic<Index>:$srcIndices,
                       IndexAttr:$dstElements,
                       Optional<Index>:$srcElements,
                       OptionalAttr<UnitAttr>:$bypassL1);
  let assemblyFormat = [{
    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $dstElements (`,` $srcElements^)?
      attr-dict `:` type($src) `to` type($dst)
  }];
  let hasVerifier = 1;
}

def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
  let summary = "device side asynchronous create group operation";
  let description = [{
    The `nvgpu.device_async_create_group` op creates a group of memory accesses
    containing all the pending `device_async_copy` operations associated with
    the argument tokens. Each token can only be part of one group.

    It returns a token that can be used to wait until the group fully
    completes.

    This op is meant to be used with `nvgpu.device_async_wait` to synchronize
    copies as explained in that op's description.

    Groups are executed in the order they are created.

    Example:

    ```mlir
    %0 = nvgpu.device_async_create_group
    ```
  }];
  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
  let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
  let assemblyFormat = [{
    $inputTokens attr-dict
  }];
}

def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
  let summary = "Wait for async gpu ops to complete.";
  let description = [{
    The `nvgpu.device_async_wait` op will block the execution thread until the
    group associated with the source token is fully completed.

    The optional `$numGroups` attribute gives an upper bound on the number of
    groups that may still be uncompleted when the wait unblocks the thread.
    For example, if 16 async groups are pushed and `$numGroups` is set to 12,
    then the thread will unblock when 12 or fewer groups are in flight (at
    least 4 groups have completed).

    Example:

    ```mlir
    nvgpu.device_async_wait %0
    ```
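
    The same wait with the optional `numGroups` attribute set (the value 12
    here is only illustrative) would be written as:

    ```mlir
    nvgpu.device_async_wait %0 {numGroups = 12 : i32}
    ```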
  }];
  let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
                       OptionalAttr<I32Attr>:$numGroups);
  let assemblyFormat = [{
    $asyncDependencies attr-dict
  }];
}

def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
  let summary = "Creates a `nvgpu.mbarrier` object.";
  let description = [{
    The Op generates one or more `mbarrier` objects, which are barriers created
    in shared memory that support various synchronization behaviors for
    threads.

    The `mbarrier` object has the following type and alignment requirements:
      Type: .b64, Alignment: 8, Memory space: .shared

    Example:
    ```mlir
    %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins);
  let results = (outs NVGPU_MBarrierGroup:$barriers);
  let assemblyFormat = [{
    attr-dict `->` type($barriers)
  }];
}

def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
  let summary = "Initialize the `nvgpu.mbarrier`.";
  let description = [{
    The Op initializes the `mbarrier` object with the given number of threads.

    Example:
    ```mlir
    %num_threads = gpu.block_dim x
    %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
    nvgpu.mbarrier.init %barrier[%c0], %num_threads : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId, Optional<I1>:$predicate);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
}

def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
  let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
  let description = [{
    Checks whether the mbarrier object has completed the phase. It is a
    non-blocking instruction that tests for the completion of the phase.

    Example:
    ```mlir
    %isComplete = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId);
  let results = (outs I1:$waitComplete);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)";
}

def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
  let summary = "Performs an arrive operation on the `nvgpu.mbarrier`.";
  let description = [{
    The Op performs an arrive-on operation on the `mbarrier` object and returns
    a `nvgpu.mbarrier.token`.

    For more information, see
    https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object

    Example:
    ```mlir
    %token = nvgpu.mbarrier.arrive %barrier[%c0] : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId);
  let results = (outs NVGPU_MBarrierToken:$token);
  let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)";
}

def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
  let summary = "Performs a non-completing arrive operation on the `nvgpu.mbarrier`.";
  let description = [{
    The Op performs an arrive-on operation on the `mbarrier` object and returns
    a `nvgpu.mbarrier.token`.

    The Op does not cause the `nvgpu.mbarrier` to complete its current phase.

    Example:
    ```mlir
    %token = nvgpu.mbarrier.arrive.nocomplete %barrier[%c0], %count : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId,
                       Index:$count);
  let results = (outs NVGPU_MBarrierToken:$token);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)";
}

def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
  let summary = "Performs an expect_tx operation on the `nvgpu.mbarrier`";
  let description = [{
    A thread executing the Op performs an expect-tx operation on the mbarrier
    object at the location specified by the `$barriers` operand. The expect-tx
    operation, with a `$txcount` argument, increases the tx-count of the
    mbarrier object by the value specified by `$txcount`. This causes the
    current phase of the mbarrier object to expect and track the completion of
    additional asynchronous transactions.

    `$txcount` specifies the number of elements for the expect-tx operation.

    Example:
    ```mlir
    nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %ic0 : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId, Optional<I1>:$predicate);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
}

def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
  let summary = "Waits for the `nvgpu.mbarrier` to complete its current phase.";
  let description = [{
    Checks whether the mbarrier object has completed the phase. It is a
    potentially blocking instruction that tests for the completion of the
    phase. A suspended thread resumes execution when the specified phase
    completes OR before the phase completes, following a system-dependent time
    limit.

    The `$phaseParity` operand specifies either an even phase (0) or an odd
    phase (1) to wait for.

    Example:
    ```mlir
    nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phaseParity, %ticks : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, Index:$ticks, Index:$mbarId);
  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";
}

def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
  let summary = "Prefetch given `nvgpu.tensormap.descriptor`";
  let description = [{
    The Op brings the cache line containing the given `$tensorMapDescriptor`
    into cache for subsequent use by the `nvgpu.tma.async.load` Op.
  }];
  let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, Optional<I1>:$predicate);
  let assemblyFormat = [{
    $tensorMapDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
  }];
}

def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", [AttrSizedOperandSegments]> {
  let summary = "TMA asynchronous load";
  let description = [{
    The Op loads a tile memory region from global memory to shared memory using
    Tensor Memory Access (TMA).

    `$tensorMapDescriptor` is a tensor map descriptor that holds information
    about the tile shape. The descriptor is created by
    `nvgpu.tma.create.descriptor`.

    The Op uses the `$barriers` mbarrier-based completion mechanism.
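
    Example (illustrative; the coordinates, shapes, and tensor map parameters
    are placeholders):

    ```mlir
    nvgpu.tma.async.load %tensorMap[%c0, %c0], %barriers[%c0] to %shmem
        : !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>,
          !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
        -> memref<128x64xf16, 3>
    ```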
  }];
  let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
                       NVGPU_MBarrierGroup:$barriers,
                       NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
                       Variadic<Index>:$coordinates,
                       Index:$mbarId,
                       Optional<I16>:$multicastMask,
                       Optional<I1>:$predicate);
  let assemblyFormat = [{
    $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]`
      `to` $dst
      (`multicast_mask` `=` $multicastMask^ )?
      (`,` `predicate` `=` $predicate^)?
      attr-dict `:` type($tensorMapDescriptor) `,` type($barriers)
      `->` type($dst)
  }];
  let hasVerifier = 1;

}

def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegments]> {
  let summary = "TMA asynchronous store";
  let description = [{
    The Op stores a tile memory region from shared memory to global memory
    using Tensor Memory Access (TMA).

    `$tensorMapDescriptor` is a tensor map descriptor that holds information
    about the tile shape. The descriptor is created by
    `nvgpu.tma.create.descriptor`.
  }];
  let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
                       Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
                       Variadic<Index>:$coordinates,
                       Optional<I1>:$predicate);
  let assemblyFormat = [{
    $src `to` $tensorMapDescriptor `[` $coordinates `]`
      (`,` `predicate` `=` $predicate^)?
      attr-dict `:` type($src)
      `->` type($tensorMapDescriptor)
  }];
  let hasVerifier = 1;
}

def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
  let summary = "TMA create descriptor";
  let description = [{
    The Op creates a tensor map descriptor object representing a tiled memory
    region. To do so, it calls the CUDA driver's `cuTensorMapEncodeTiled` API.
    The descriptor is used by Tensor Memory Access (TMA).

    The `tensor` is the source tensor to be tiled.

    The `boxDimensions` is the size of the tiled memory region in each
    dimension.

    For more information, see:
    https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
  }];

  let arguments = (ins AnyUnrankedMemRef:$tensor,
                       Variadic<Index>:$boxDimensions);
  let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
  let assemblyFormat = [{
    $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
  }];
  let hasVerifier = 1;
}

def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
  let summary = "Generate a warpgroup matrix descriptor";
  let description = [{
    This Op builds a `nvgpu.warpgroup.descriptor` that is used by
    `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and
    accumulate.

    The descriptor specifies the properties of the matrix in shared memory that
    is a multiplicand in the matrix multiply and accumulate operation.
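
    Example (illustrative; the memref shape and the tensor map parameters are
    placeholders):

    ```mlir
    %descA = nvgpu.warpgroup.generate.descriptor %shmemA, %tensorMapA :
      memref<128x64xf16, 3>,
      !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
      -> !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
    ```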
  }];
  let results = (outs NVGPU_WarpgroupMatrixDescriptor:$descriptor);
  let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor,
                       NVGPU_TensorMapDescriptor:$tensorMap);
  let assemblyFormat = [{$tensor `,` $tensorMap attr-dict `:` type($tensor) `,` type($tensorMap) `->` type($descriptor)}];
  let hasVerifier = 1;
}

def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
  let description = [{
    The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps)
    matrix-multiply-and-accumulate (mma) operation that is lowered to
    `nvvm.wgmma.mma_async`.

    The operands are `descriptorA` and `descriptorB`, which are wgmma matrix
    descriptors that describe the properties of the matrices in shared memory.
    The result represents the thread-level ownership of the warpgroup-level mma
    operation shape. The shape is deduced from the descriptor types and the
    output vector.

    The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete
    the given shape. Because the `nvvm.wgmma.mma_async` Op, or its
    corresponding PTX instruction, is asynchronous, this Op groups the
    `nvvm.wgmma.mma_async` Ops and surrounds them with `wgmma.fence.aligned`,
    `wgmma.commit.group.sync.aligned`, and `wgmma.wait.group.sync.aligned` Ops.

    Example:
    ```mlir
    %res = nvgpu.warpgroup.mma %descA, %descB, %acc:
      !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>,
      !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>,
      !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
      ->
      !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
    ```
  }];

  let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA,
                       NVGPU_WarpgroupMatrixDescriptor:$descriptorB,
                       DefaultValuedOptionalAttr<I64Attr, "1">:$waitGroup,
                       OptionalAttr<UnitAttr>:$transposeA,
                       OptionalAttr<UnitAttr>:$transposeB,
                       NVGPU_WarpgroupAccumulator:$matrixC);
  let results = (outs NVGPU_WarpgroupAccumulator:$matrixD);
  let assemblyFormat = [{
    $descriptorA`,` $descriptorB`,` $matrixC attr-dict
    `:` type($descriptorA) `,` type($descriptorB) `,` type($matrixC) `->` type($matrixD)
  }];
  let hasVerifier = 1;
}

def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
  let description = [{
    The `nvgpu.warpgroup.mma.store` op stores the fragmented result in
    `$matrixD` to the given memref.

    [See the details of register fragment layout for accumulator matrix D](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)

    Note that the op must be run by a warpgroup.
  }];

  let arguments = (ins NVGPU_WarpgroupAccumulator:$matrixD,
                       Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);

  let assemblyFormat = [{
    $matrixD `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
  }];
  let hasVerifier = 1;
}

def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulator"> {
  let summary = "Initializes the accumulator matrix";

  let description = [{
    This Op generates and initializes the accumulator matrix that the
    `nvgpu.warpgroup.mma` op uses to perform matrix-multiply-and-accumulate.
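
    Example (illustrative; the accumulator shape is a placeholder):

    ```mlir
    %acc = nvgpu.warpgroup.mma.init.accumulator ->
      !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
    ```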
  }];
  let results = (outs NVGPU_WarpgroupAccumulator:$matrixC);
  let assemblyFormat = "attr-dict `->` type($matrixC)";
  let hasVerifier = 1;
}

def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
                                   SameOperandsAndResultType]> {
  let summary = "The reciprocal calculation for vector types";
  let description = [{
    Reciprocal calculation for `vector` types using `nvvm.rcp` Ops.

    Currently, only the `approx` rounding mode and `ftz` are supported, and
    only for the `f32` type.

    The input and output must be of the same vector type and shape.
  }];
  let arguments = (ins VectorOfNonZeroRankOf<[F32]>:$in,
                       DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
                       UnitAttr:$ftz);
  let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
  let assemblyFormat = [{
    $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}`
    attr-dict `:` type($out)
  }];
  let hasVerifier = 1;
}
#endif // NVGPU