1//===-- GPUOps.td - GPU dialect operation definitions ------*- tablegen -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// Defines some operations of the GPU dialect. 10// 11//===----------------------------------------------------------------------===// 12 13#ifndef GPU_OPS 14#define GPU_OPS 15 16include "mlir/Dialect/DLTI/DLTIBase.td" 17include "mlir/Dialect/GPU/IR/GPUBase.td" 18include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td" 19include "mlir/Dialect/GPU/IR/CompilationAttrs.td" 20include "mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td" 21include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td" 22include "mlir/IR/CommonTypeConstraints.td" 23include "mlir/IR/EnumAttr.td" 24include "mlir/IR/RegionKindInterface.td" 25include "mlir/IR/SymbolInterfaces.td" 26include "mlir/Interfaces/ControlFlowInterfaces.td" 27include "mlir/Interfaces/DataLayoutInterfaces.td" 28include "mlir/IR/OpAsmInterface.td" 29include "mlir/Interfaces/FunctionInterfaces.td" 30include "mlir/Interfaces/InferIntRangeInterface.td" 31include "mlir/Interfaces/InferTypeOpInterface.td" 32include "mlir/Interfaces/SideEffectInterfaces.td" 33 34//===----------------------------------------------------------------------===// 35// GPU Dialect operations. 36//===----------------------------------------------------------------------===// 37 38class GPU_Op<string mnemonic, list<Trait> traits = []> : 39 Op<GPU_Dialect, mnemonic, traits>; 40 41def GPU_Dimension : I32EnumAttr<"Dimension", 42 "a dimension, either 'x', 'y', or 'z'", 43 [ 44 I32EnumAttrCase<"x", 0>, 45 I32EnumAttrCase<"y", 1>, 46 I32EnumAttrCase<"z", 2> 47 ]>{ 48 let genSpecializedAttr = 0; 49 let cppNamespace = "::mlir::gpu"; 50} 51def GPU_DimensionAttr : EnumAttr<GPU_Dialect, GPU_Dimension, "dim">; 52 53class GPU_IndexOp<string mnemonic, list<Trait> traits = []> : 54 GPU_Op<mnemonic, !listconcat(traits, [ 55 Pure, 56 DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>, 57 DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>])>, 58 Arguments<(ins GPU_DimensionAttr:$dimension, 59 OptionalAttr<IndexAttr>:$upper_bound)>, Results<(outs Index)> { 60 let assemblyFormat = "$dimension (`upper_bound` $upper_bound^)? attr-dict"; 61 let extraClassDefinition = [{ 62 void $cppClass::getAsmResultNames( 63 llvm::function_ref<void(mlir::Value, mlir::StringRef)> setNameFn) { 64 auto dimStr = stringifyDimension(getDimensionAttr().getValue()); 65 auto opName = getOperationName(); 66 opName.consume_front("gpu."); 67 SmallString<8> resultName({opName, "_", dimStr}); 68 setNameFn(getResult(),resultName); 69 } 70 }]; 71 let builders = [ 72 OpBuilder<(ins "::mlir::gpu::Dimension":$dimension), [{ 73 build($_builder, $_state, dimension, /*upperBound=*/nullptr); 74 }]>, 75 OpBuilder<(ins "::mlir::Type":$resultType, "::mlir::gpu::Dimension":$dimension), [{ 76 build($_builder, $_state, resultType, dimension, /*upperBound=*/nullptr); 77 }]> 78 ]; 79} 80 81def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> { 82 let description = [{ 83 Returns the number of cluster identifiers per grid along 84 the x, y, or z `dimension`. 
    Example:

    ```mlir
    %cDimX = gpu.cluster_dim x
    ```

    If `upper_bound` is set, then executing (a lowering of) this operation in an
    environment where the number of clusters per grid is greater than
    `upper_bound` causes undefined behavior.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}

def GPU_ClusterDimBlocksOp : GPU_IndexOp<"cluster_dim_blocks"> {
  let description = [{
    Returns the number of thread blocks in the cluster along
    the x, y, or z `dimension`.

    Example:

    ```mlir
    %cDimBlocksX = gpu.cluster_dim_blocks x
    ```

    If `upper_bound` is set, then executing (a lowering of) this operation in an
    environment where the number of thread blocks per cluster is greater than
    `upper_bound` causes undefined behavior.

    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
  }];
}

def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
  let description = [{
    Returns the cluster id, i.e. the index of the current cluster within the
    grid along the x, y, or z `dimension`.

    Example:

    ```mlir
    %cIdY = gpu.cluster_id y
    ```

    If `upper_bound` is set, then executing (a lowering of) this operation in an
    environment where the number of clusters in the grid along `dimension` is
    greater than `upper_bound` causes undefined behavior.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}

def GPU_ClusterBlockIdOp : GPU_IndexOp<"cluster_block_id"> {
  let description = [{
    Returns the block id within the cluster along the x, y, or z `dimension`.

    Example:

    ```mlir
    %cBlockIdY = gpu.cluster_block_id y
    ```

    If `upper_bound` is set, then executing (a lowering of) this operation in an
    environment where the number of thread blocks per cluster along `dimension`
    is greater than `upper_bound` causes undefined behavior.

    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
  }];
}

def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
  let description = [{
    Returns the number of threads in the thread block (aka the block size) along
    the x, y, or z `dimension`.

    Example:

    ```mlir
    %bDimX = gpu.block_dim x
    ```

    If `known_block_size` is set on this operation's enclosing `gpu.func`,
    or `gpu.known_block_size` is set on an enclosing `FunctionOpInterface`
    implementor, or if the enclosing `gpu.launch` specifies a constant size for
    `dimension`'s blocks, these contextual facts may be used to infer that this
    operation has a constant value, though such a transformation will not be
    performed by canonicalization or the default constant folder. Executions which
    cause that constant-value assumption to be false incur undefined behavior.

    If `upper_bound` is set, executions where the block size along `dimension`
    exceeds `upper_bound` cause undefined behavior.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}
def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
  let description = [{
    Returns the block id, i.e. the index of the current block within the grid
    along the x, y, or z `dimension`.
    Example:

    ```mlir
    %bIdY = gpu.block_id y
    ```

    If `upper_bound` is set, or if one can be inferred from `known_grid_size`-type
    annotations in context, executions where the block index in `dimension` would
    be greater than or equal to that bound cause undefined behavior. `upper_bound`
    takes priority over bounds inferrable from context.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}
def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
  let description = [{
    Returns the number of thread blocks in the grid along the x, y, or z
    `dimension`.

    Example:

    ```mlir
    %gDimZ = gpu.grid_dim z
    ```

    If `known_grid_size` is set on this operation's enclosing `gpu.func`,
    or `gpu.known_grid_size` is set on an enclosing `FunctionOpInterface`
    implementor, or if the enclosing `gpu.launch` specifies a constant size for
    `dimension`'s grid length, these contextual facts may be used to infer that this
    operation has a constant value, though such a transformation will not be
    performed by canonicalization or the default constant folder. Executions which
    cause that constant-value assumption to be false incur undefined behavior.

    If `upper_bound` is set, executions where the grid size in `dimension` would
    exceed `upper_bound` cause undefined behavior.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}
def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
  let description = [{
    Returns the thread id, i.e. the index of the current thread within the block
    along the x, y, or z `dimension`.

    Example:

    ```mlir
    %tIdX = gpu.thread_id x
    ```

    If `upper_bound` is set, or if one can be inferred from `known_block_size`-type
    annotations in context, executions where the thread index would be greater
    than or equal to that bound cause undefined behavior.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}

def GPU_LaneIdOp : GPU_Op<"lane_id", [
      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
  let description = [{
    Returns the lane id within the subgroup (warp/wave).

    Example:
    ```mlir
    %laneId = gpu.lane_id
    ```

    If `upper_bound` is set, executions with more than `upper_bound` lanes per
    subgroup cause undefined behavior. In the absence of `upper_bound`,
    the lane id is still assumed to be non-negative and less than the
    target-independent `kMaxSubgroupSize` (currently 128).
  }];
  let arguments = (ins OptionalAttr<IndexAttr>:$upper_bound);
  let results = (outs Index:$result);
  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict";
}

def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
    Results<(outs Index:$result)> {
  let description = [{
    Returns the subgroup id, i.e., the index of the current subgroup within the
    workgroup.

    Example:

    ```mlir
    %sgId = gpu.subgroup_id : index
    ```

    Executions where there are more than `upper_bound` subgroups per workgroup
    cause undefined behavior. There is an implicit upper bound of `kMaxDim`
    (currently uint32_t::max).
  }];

  let assemblyFormat = "(`upper_bound` $upper_bound^)?
attr-dict `:` type($result)"; 285} 286 287def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> { 288 let description = [{ 289 Returns the unique global workitem/thread id, i.e., the unique index of the 290 current workitem/thread within all workgroups / grid along the x, y, or z 291 `dimension`. 292 293 Example: 294 295 ```mlir 296 %gidX = gpu.global_id x 297 %gidX = gpu.global_id x upper_bound 65536 298 ``` 299 300 The `upper_bound` attribute defines an upper bound analogously to the ones on 301 `thread_id` and `block_id`. If one is not set, the bound may be inferred from 302 a combination of `known_block_size` and `known_grid_size`-type annotations. 303 }]; 304} 305 306 307def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [ 308 Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>, 309 Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>, 310 Results<(outs Index:$result)> { 311 let description = [{ 312 Returns the number of subgroups within a workgroup. 313 314 Example: 315 316 ```mlir 317 %numSg = gpu.num_subgroups : index 318 ``` 319 320 If `upper_bound` is set, executions with more than `upper_bound` subgroups 321 per workgroup cause undefined behavior. There is a default upper bound of 322 `kMaxDim` (currently uint32_t::max). 323 }]; 324 325 let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)"; 326} 327 328def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [ 329 Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>, 330 Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>, 331 Results<(outs Index:$result)> { 332 let description = [{ 333 Returns the number of threads within a subgroup. 334 335 Example: 336 337 ```mlir 338 %sgSz = gpu.subgroup_size : index 339 ``` 340 341 Executions where the number of threads per subgroup exceed `upper_bound` cause 342 undefined behavior. When no `upper_bound` is specified, range analyses and 343 similar machinery assume the default bound of `kMaxSubgroupSize`, currently 344 128. 345 }]; 346 347 let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)"; 348} 349 350def GPU_OptionalDimSizeHintAttr : ConfinedAttr<OptionalAttr<DenseI32ArrayAttr>, 351 [AttrConstraint<Or<[IsNullAttr.predicate, DenseArrayCount<3>.predicate]>, 352 "with 3 elements (if present)">]>; 353 354def GPU_GPUFuncOp : GPU_Op<"func", [ 355 HasParent<"GPUModuleOp">, AutomaticAllocationScope, FunctionOpInterface, 356 IsolatedFromAbove, AffineScope 357 ]> { 358 let summary = "Function executable on a GPU"; 359 360 let description = [{ 361 Defines a function that can be executed on a GPU. This supports memory 362 attribution and its body has a particular execution model. 363 364 GPU functions are either kernels (as indicated by the `kernel` attribute) or 365 regular functions. The former can be launched from the host side, while the 366 latter are device side only. 367 368 The memory attribution defines SSA values that correspond to memory buffers 369 allocated in the memory hierarchy of the GPU (see below). 370 371 The operation has one attached region that corresponds to the body of the 372 function. The region arguments consist of the function arguments without 373 modification, followed by buffers defined in memory annotations. The body of 374 a GPU function, when launched, is executed by multiple work items. There are 375 no guarantees on the order in which work items execute, or on the connection 376 between them. In particular, work items are not necessarily executed in 377 lock-step. 
    Synchronization ops such as "gpu.barrier" should be used to
    coordinate work items. Declarations of GPU functions, i.e. not having the
    body region, are not supported.

    A function may optionally be annotated with the block and/or grid sizes
    that will be used when it is launched using the `known_block_size` and
    `known_grid_size` attributes, respectively. If set, these attributes must
    be arrays of three 32-bit integers giving the x, y, and z launch dimensions.
    Launching a kernel that has these annotations, or that calls a function with
    these annotations, using a block size or grid size other than what is specified
    is undefined behavior. These attributes may be set on non-`gpu.func` functions
    by using `gpu.known_block_size` or `gpu.known_grid_size`, but this carries
    the risk that they will be discarded.

    Syntax:

    ```
    op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->`
    function-result-list)?
           memory-attribution `kernel`? function-attributes? region

    memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
                           (`private` `(` ssa-id-and-type-list `)`)?
    ```

    Example:

    ```mlir
    gpu.func @foo(%arg0: index)
        workgroup(%workgroup: memref<32xf32, 3>)
        private(%private: memref<1xf32, 5>)
        kernel
        attributes {qux: "quux"} {
      gpu.return
    }
    ```

    The generic form illustrates the concept

    ```mlir
    "gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({
    ^bb0(%arg0: index, %workgroup: memref<32xf32, 3>,
         %private: memref<1xf32, 5>):
      "gpu.return"() : () -> ()
    }) : (index) -> ()
    ```

    Note the non-default memory spaces used in memref types in memory
    attribution.
  }];

  let arguments = (ins TypeAttrOf<FunctionType>:$function_type,
                       OptionalAttr<DictArrayAttr>:$arg_attrs,
                       OptionalAttr<DictArrayAttr>:$res_attrs,
                       OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
                       OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
                       GPU_OptionalDimSizeHintAttr:$known_block_size,
                       GPU_OptionalDimSizeHintAttr:$known_grid_size);
  let regions = (region AnyRegion:$body);

  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilder<(ins "StringRef":$name, "FunctionType":$type,
      CArg<"TypeRange", "{}">:$workgroupAttributions,
      CArg<"TypeRange", "{}">:$privateAttributions,
      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
  ];

  let extraClassDeclaration = [{
    /// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
    /// it is intended to be launched from host.
    bool isKernel() {
      return (*this)->getAttrOfType<UnitAttr>(
          GPUDialect::getKernelFuncAttrName()) != nullptr;
    }

    /// Returns the number of buffers located in the workgroup memory.
    unsigned getNumWorkgroupAttributions() {
      auto attr = (*this)->getAttrOfType<IntegerAttr>(
          getNumWorkgroupAttributionsAttrName());
      return attr ? attr.getInt() : 0;
    }

    /// Return the index of the first workgroup attribution in the block argument
    /// list.
    unsigned getFirstWorkgroupAttributionIndex() {
      return getFunctionType().getNumInputs();
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the workgroup memory.
    ArrayRef<BlockArgument> getWorkgroupAttributions() {
      auto begin =
          std::next(getBody().args_begin(), getFirstWorkgroupAttributionIndex());
      auto end = std::next(begin, getNumWorkgroupAttributions());
      return {begin, end};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// workgroup memory.
    BlockArgument addWorkgroupAttribution(Type type, Location loc);

    /// Get the workgroup attribution attribute dictionary for the attribution
    /// at index `index`, counted from the start of the workgroup attributions.
    DictionaryAttr getworkgroupAttributionAttrs(unsigned index);

    /// Set the workgroup attribution attribute dictionary for the attribution
    /// at index `index`, counted from the start of the workgroup attributions.
    void setworkgroupAttributionAttrs(unsigned index, DictionaryAttr value);

    /// Get an attribute for a workgroup attribution. `index` is counted
    /// from the start of the workgroup attributions, not the start of the block.
    Attribute getWorkgroupAttributionAttr(unsigned index, StringAttr name);
    Attribute getWorkgroupAttributionAttr(unsigned index, StringRef name) {
      return getWorkgroupAttributionAttr(index, StringAttr::get((*this)->getContext(), name));
    }

    /// Set an attribute for a workgroup attribution. `index` is counted
    /// from the start of the workgroup attributions, not the start of the block.
    /// A null `value` removes an attribution attribute.
    void setWorkgroupAttributionAttr(unsigned index, StringAttr name, Attribute value);
    void setWorkgroupAttributionAttr(unsigned index, StringRef name, Attribute value) {
      return setWorkgroupAttributionAttr(index, StringAttr::get((*this)->getContext(), name), value);
    }

    /// Returns the number of buffers located in the private memory.
    unsigned getNumPrivateAttributions() {
      return getBody().getNumArguments() - getFunctionType().getNumInputs() -
          getNumWorkgroupAttributions();
    }

    /// Returns the index of the first private buffer in the block argument list.
    unsigned getFirstPrivateAttributionIndex() {
      // Buffers on the private memory always come after buffers on the workgroup
      // memory.
      return getFunctionType().getNumInputs() + getNumWorkgroupAttributions();
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the private memory.
    ArrayRef<BlockArgument> getPrivateAttributions() {
      auto begin =
          std::next(getBody().args_begin(), getFirstPrivateAttributionIndex());
      return {begin, getBody().args_end()};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// private memory.
    BlockArgument addPrivateAttribution(Type type, Location loc);

    /// Get the private attribution attribute dictionary for the attribution
    /// at index `index`, counted from the start of the private attributions.
    DictionaryAttr getPrivateAttributionAttrs(unsigned index);

    /// Set the private attribution attribute dictionary for the attribution
    /// at index `index`, counted from the start of the private attributions.
    void setPrivateAttributionAttrs(unsigned index, DictionaryAttr value);

    /// Get an attribute for a private attribution.
    /// `index` is counted
    /// from the start of the private attributions, not the start of the block.
    Attribute getPrivateAttributionAttr(unsigned index, StringAttr name);
    Attribute getPrivateAttributionAttr(unsigned index, StringRef name) {
      return getPrivateAttributionAttr(index, StringAttr::get((*this)->getContext(), name));
    }

    /// Set an attribute for a private attribution. `index` is counted
    /// from the start of the private attributions, not the start of the block.
    /// A null `value` removes an attribute.
    void setPrivateAttributionAttr(unsigned index, StringAttr name, Attribute value);
    void setPrivateAttributionAttr(unsigned index, StringRef name, Attribute value) {
      return setPrivateAttributionAttr(index, StringAttr::get((*this)->getContext(), name), value);
    }

    /// Returns the name of the attribute containing the number of buffers
    /// located in the workgroup memory.
    static StringRef getNumWorkgroupAttributionsAttrName() {
      return "workgroup_attributions";
    }

    /// Returns the argument types of this function.
    ArrayRef<Type> getArgumentTypes() { return getFunctionType().getInputs(); }

    /// Returns the result types of this function.
    ArrayRef<Type> getResultTypes() { return getFunctionType().getResults(); }

    Region *getCallableRegion() { return &getBody(); }

    /// Returns the keywords used in the custom syntax for this Op.
    static StringRef getWorkgroupKeyword() { return "workgroup"; }
    static StringRef getPrivateKeyword() { return "private"; }
    static StringRef getKernelKeyword() { return "kernel"; }

    /// Hook for FunctionOpInterface verifier.
    LogicalResult verifyType();

    /// Verifies the body of the function.
    LogicalResult verifyBody();
  }];
  let hasCustomAssemblyFormat = 1;
}

def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
{
  let summary = "Get the memref for dynamic shared memory";

  let description = [{
    This operation provides a memref pointer to the start of dynamic shared
    memory, often referred to as workgroup memory. It's important to note that
    this dynamic shared memory needs to be allocated at kernel launch. One can
    conveniently utilize the `dynamic_shared_memory_size` parameter of
    `gpu.launch` for this purpose.
    Examples:
    ```mlir
    %0 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
    %1 = memref.view %0[%c8192][] : memref<?xi8, #gpu.address_space<workgroup>>
                        to memref<32x64xf32, #gpu.address_space<workgroup>>
    %2 = memref.view %0[%c16384][] : memref<?xi8, #gpu.address_space<workgroup>>
                        to memref<32x64xf32, #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins);
  let results = (outs Arg<MemRefRankOf<[I8], [1]>>:$resultMemref);
  let assemblyFormat = [{ attr-dict `:` type($resultMemref) }];
  let hasVerifier = 1;
}

def LaunchIndx : AnyTypeOf<[Index, I32, I64]>;

def GPU_LaunchFuncOp : GPU_Op<"launch_func", [
      GPU_AsyncOpInterface, AttrSizedOperandSegments,
      AllTypesMatch<["gridSizeX", "gridSizeY", "gridSizeZ", "blockSizeX",
                     "blockSizeY", "blockSizeZ"]>]>,
    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
               SymbolRefAttr:$kernel,
               LaunchIndx:$gridSizeX,
               LaunchIndx:$gridSizeY,
               LaunchIndx:$gridSizeZ,
               LaunchIndx:$blockSizeX,
               LaunchIndx:$blockSizeY,
               LaunchIndx:$blockSizeZ,
               Optional<LaunchIndx>:$clusterSizeX,
               Optional<LaunchIndx>:$clusterSizeY,
               Optional<LaunchIndx>:$clusterSizeZ,
               Optional<I32>:$dynamicSharedMemorySize,
               Variadic<AnyType>:$kernelOperands,
               Optional<AnyType>:$asyncObject)>,
    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
  let summary = "Launches a function as a GPU kernel";

  let description = [{
    Launch a kernel function on the specified grid of thread blocks.
    `gpu.launch` operations are lowered to `gpu.launch_func` operations by
    outlining the kernel body into a function in a dedicated module, which
    reflects the separate compilation process. The kernel function is required
    to have the `gpu.kernel` attribute. The module containing the kernel
    function is required to be a gpu.module. And finally, the module containing
    the kernel module (which thus cannot be the top-level module) is required
    to have the `gpu.container_module` attribute. The `gpu.launch_func`
    operation has a symbol attribute named `kernel` to identify the fully
    specified kernel function to launch (both the gpu.module and func).

    The `gpu.launch_func` supports async dependencies: the kernel does not start
    executing until the ops producing those async dependencies have completed.

    By default, the host implicitly blocks until kernel execution has
    completed. If the `async` keyword is present, the host does not block but
    instead a `!gpu.async.token` is returned. Other async GPU ops can take this
    token as a dependency.

    The operation requires at least the grid and block sizes along the x,y,z
    dimensions as arguments. When a lower-dimensional kernel is required,
    unused sizes must be explicitly set to `1`.

    The remaining operands are optional. The first optional operand corresponds
    to the amount of dynamic shared memory a kernel's workgroup should be
    allocated; when this operand is not present, a zero size is assumed.

    The remaining operands, if present, are passed as arguments to the kernel
    function.

    The `gpu.launch_func` also supports kernel launching with clusters if
    supported by the target architecture. The cluster size can be set by
    `clusterSizeX`, `clusterSizeY`, and `clusterSizeZ` arguments. When these
    arguments are present, the Op launches a kernel that clusters the given
    thread blocks.
    This feature is exclusive to certain architectures.

    Example:

    ```mlir
    module attributes {gpu.container_module} {

      // This module creates a separate compilation unit for the GPU compiler.
      gpu.module @kernels {
        func.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>)
            attributes { nvvm.kernel = true } {

          // Operations that produce block/thread IDs and dimensions are
          // injected when outlining the `gpu.launch` body to a function called
          // by `gpu.launch_func`.
          %tIdX = gpu.thread_id x
          %tIdY = gpu.thread_id y
          %tIdZ = gpu.thread_id z

          %bDimX = gpu.block_dim x
          %bDimY = gpu.block_dim y
          %bDimZ = gpu.block_dim z

          %bIdX = gpu.block_id x
          %bIdY = gpu.block_id y
          %bIdZ = gpu.block_id z

          %gDimX = gpu.grid_dim x
          %gDimY = gpu.grid_dim y
          %gDimZ = gpu.grid_dim z

          // (Optional) Cluster size only for supported architectures.
          %cIdX = gpu.cluster_id x
          %cIdY = gpu.cluster_id y
          %cIdZ = gpu.cluster_id z

          %cDimX = gpu.cluster_dim x
          %cDimY = gpu.cluster_dim y
          %cDimZ = gpu.cluster_dim z

          "some_op"(%bIdX, %tIdX) : (index, index) -> ()
          %42 = memref.load %arg1[%bIdX] : memref<?xf32, 1>
        }
      }

      %t0 = gpu.wait async
      gpu.launch_func
          async                           // (Optional) Don't block host, return token.
          [%t0]                           // (Optional) Execute only after %t0 has completed.
          @kernels::@kernel_1             // Kernel function.
          clusters in (%cst, %cst, %cst)  // (Optional) Cluster size only for supported architectures.
          blocks in (%cst, %cst, %cst)    // Grid size.
          threads in (%cst, %cst, %cst)   // Block size.
          dynamic_shared_memory_size %s   // (Optional) Amount of dynamic shared
                                          // memory to allocate for a workgroup.
          args(%arg0 : f32,               // (Optional) Kernel arguments.
               %arg1 : memref<?xf32, 1>)
    }
    ```
  }];

  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilder<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize,
      "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
      "ValueRange":$kernelOperands,
      CArg<"Type", "nullptr">:$asyncTokenType,
      CArg<"ValueRange", "{}">:$asyncDependencies,
      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>,
    OpBuilder<(ins "SymbolRefAttr":$kernel, "KernelDim3":$gridSize,
      "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
      "ValueRange":$kernelOperands,
      "Type":$asyncTokenType,
      CArg<"ValueRange", "{}">:$asyncDependencies,
      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>,
    OpBuilder<(ins "SymbolRefAttr":$kernel, "KernelDim3":$gridSize,
      "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
      "ValueRange":$kernelOperands,
      CArg<"Value", "nullptr">:$asyncObject,
      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>
  ];

  let extraClassDeclaration = [{
    /// The name of the kernel's containing module.
    StringAttr getKernelModuleName();

    /// The name of the kernel.
    StringAttr getKernelName();

    /// Returns true if cluster size is specified.
    bool hasClusterSize() {
      if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ())
        return true;
      return false;
    }

    /// The number of operands passed to the kernel function.
    unsigned getNumKernelOperands();

    /// The i-th operand passed to the kernel function.
    Value getKernelOperand(unsigned i);

    /// Get the SSA values passed as operands to specify the cluster size.
767 /// When the cluster sizes are not specified, it asserts. 768 KernelDim3 getClusterSizeOperandValues(); 769 770 /// Get the SSA values passed as operands to specify the grid size. 771 KernelDim3 getGridSizeOperandValues(); 772 773 /// Get the SSA values passed as operands to specify the block size. 774 KernelDim3 getBlockSizeOperandValues(); 775 776 // This needs to quietly verify if attributes with names defined below are 777 // present since it is run before the verifier of this op. 778 friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *, 779 NamedAttribute); 780 }]; 781 782 let assemblyFormat = [{ 783 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 784 (`<` $asyncObject^ `:` type($asyncObject) `>`)? 785 $kernel 786 ( `clusters` `in` ` ` `(` $clusterSizeX^ `,` $clusterSizeY `,` $clusterSizeZ `)` )? 787 `blocks` `in` ` ` `(` $gridSizeX `,` $gridSizeY `,` $gridSizeZ `)` 788 `threads` `in` ` ` `(` $blockSizeX `,` $blockSizeY `,` $blockSizeZ `)` 789 custom<LaunchDimType>(type($gridSizeX), ref($clusterSizeX), type($clusterSizeX), type($clusterSizeY), type($clusterSizeZ)) 790 (`dynamic_shared_memory_size` $dynamicSharedMemorySize^)? 791 custom<LaunchFuncOperands>($kernelOperands, type($kernelOperands)) attr-dict 792 }]; 793 let hasVerifier = 1; 794} 795 796def GPU_LaunchOp : GPU_Op<"launch", [ 797 AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface, 798 DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>, 799 RecursiveMemoryEffects]>, 800 Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies, 801 Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, 802 Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, 803 Optional<Index>:$clusterSizeX, 804 Optional<Index>:$clusterSizeY, 805 Optional<Index>:$clusterSizeZ, 806 Optional<I32>:$dynamicSharedMemorySize, 807 OptionalAttr<SymbolRefAttr>:$kernelFunc, 808 OptionalAttr<SymbolRefAttr>:$kernelModule)>, 809 Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> { 810 let summary = "GPU kernel launch operation"; 811 812 let description = [{ 813 Launch a kernel on the specified grid of thread blocks. The body of the 814 kernel is defined by the single region that this operation contains. The 815 operation takes an optional list of async dependencies followed by six 816 operands and an optional operand. 817 818 The `async` keyword indicates the kernel should be launched asynchronously; 819 the operation returns a new !gpu.async.token when the keyword is specified. 820 The kernel launched does not start executing until the ops producing its 821 async dependencies (optional operands) have completed. 822 823 The first three operands (following any async dependencies) are grid sizes 824 along the x,y,z dimensions and the following three are block sizes along the 825 x,y,z dimensions. When a lower-dimensional kernel is required, unused sizes 826 must be explicitly set to `1`. The last operand is optional and corresponds 827 to the amount of dynamic shared memory a kernel's workgroup should be 828 allocated; when this operand is not present, a zero size is assumed. 
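    For illustration, a minimal sketch of an asynchronous launch that also requests
    dynamic shared memory (assuming `%dep` is a `!gpu.async.token` and `%shmem_sz`
    is an `i32` defined earlier; the names are placeholders):

    ```mlir
    %token = gpu.launch async [%dep]
        blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
        threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
        dynamic_shared_memory_size %shmem_sz
    {
      // The body is executed by every thread of every block.
      "some_op"(%bx, %tx) : (index, index) -> ()
      gpu.terminator
    }
    ```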
    The body region has at least _twelve_ arguments, or _eighteen_ if cluster
    dimensions are present, grouped as follows:

    - three optional arguments that contain cluster identifiers along x,y,z
      dimensions;
    - three arguments that contain block identifiers along x,y,z dimensions;
    - three arguments that contain thread identifiers along x,y,z dimensions;
    - operands of the `gpu.launch` operation as is (i.e. the operands for
      grid and block sizes);
    - a variadic number of workgroup memory attributions;
    - a variadic number of private memory attributions.

    The `kernelFunc` and `kernelModule` attributes are optional and specify
    the kernel name and a module in which the kernel should be outlined.

    Syntax:

    ```
    operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
                             ( `clusters` `(` ssa-id-list `)` `in` ssa-reassignment )?
                             `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
                             `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                             (dynamic_shared_memory_size ssa-use)?
                             memory-attribution
                             region attr-dict?
    ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
    memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
                           (`private` `(` ssa-id-and-type-list `)`)?
    ```

    Example:

    ```mlir
    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
               threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) {
      // Block and thread identifiers, as well as block/grid sizes are
      // immediately usable inside body region.
      "some_op"(%bx, %tx) : (index, index) -> ()
      // Assuming %val1 is defined outside the gpu.launch region.
      %42 = memref.load %val1[%bx] : memref<?xf32, 1>
    }

    // Generic syntax explains how the pretty syntax maps to the IR structure.
    "gpu.launch"(%cst, %cst, %c1,  // Grid sizes.
                 %cst, %c1, %c1)   // Block sizes.

    {/*attributes*/}
    // All sizes and identifiers have "index" type.
    : (index, index, index, index, index, index) -> () {
    // The operation passes block and thread identifiers, followed by grid and
    // block sizes.
    ^bb0(%bx : index, %by : index, %bz : index,
         %tx : index, %ty : index, %tz : index,
         %num_bx : index, %num_by : index, %num_bz : index,
         %num_tx : index, %num_ty : index, %num_tz : index)
      "some_op"(%bx, %tx) : (index, index) -> ()
      %3 = "memref.load"(%val1, %bx) : (memref<?xf32, 1>, index) -> f32
    }

    // Launch with memory attributions.
    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
               threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
               workgroup(%workgroup: memref<32xf32, 3>)
               private(%private: memref<1xf32, 5>) {
      // Block and thread identifiers, as well as block/grid sizes are
      // immediately usable inside body region.
      "some_op"(%bx, %tx) : (index, index) -> ()
      // Assuming %val1 is defined outside the gpu.launch region.
      %42 = memref.load %workgroup[%bx] : memref<32xf32, 3>
    }

    // Launch with clusters.
    gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
               blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
               threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
    {
      // Cluster, block and thread identifiers, as well as cluster/block/grid
      // sizes are immediately usable inside body region.
908 "some_op"(%cx, %bx, %tx) : (index, index, index) -> () 909 } 910 ``` 911 912 Rationale: using operation/block arguments gives analyses a clear way of 913 understanding that a value has additional semantics (e.g., we will need to 914 know what value corresponds to threadIdx.x for coalescing). We can recover 915 these properties by analyzing the operations producing values, but it is 916 easier just to have that information by construction. 917 }]; 918 919 let regions = (region AnyRegion:$body); 920 921 let skipDefaultBuilders = 1; 922 923 let builders = [ 924 OpBuilder<(ins "Value":$gridSizeX, "Value":$gridSizeY, 925 "Value":$gridSizeZ, "Value":$blockSizeX, "Value":$blockSizeY, 926 "Value":$blockSizeZ, 927 CArg<"Value", "nullptr">:$dynamicSharedMemorySize, 928 CArg<"Type", "nullptr">:$asyncTokenType, 929 CArg<"ValueRange", "{}">:$asyncDependencies, 930 CArg<"TypeRange", "{}">:$workgroupAttributions, 931 CArg<"TypeRange", "{}">:$privateAttributions, 932 CArg<"Value", "nullptr">:$clusterSizeX, 933 CArg<"Value", "nullptr">:$clusterSizeY, 934 CArg<"Value", "nullptr">:$clusterSizeZ)> 935 ]; 936 937 let extraClassDeclaration = [{ 938 /// Get the SSA values corresponding to kernel block identifiers. 939 KernelDim3 getBlockIds(); 940 /// Get the SSA values corresponding to kernel thread identifiers. 941 KernelDim3 getThreadIds(); 942 /// Get the SSA values corresponding to kernel cluster identifiers. 943 std::optional<KernelDim3> getClusterIds(); 944 /// Get the SSA values corresponding to kernel grid size. 945 KernelDim3 getGridSize(); 946 /// Get the SSA values corresponding to kernel block size. 947 KernelDim3 getBlockSize(); 948 /// Get the SSA values corresponding to kernel cluster size. 949 std::optional<KernelDim3> getClusterSize(); 950 951 /// Get the SSA values passed as operands to specify the grid size. 952 KernelDim3 getGridSizeOperandValues(); 953 /// Get the SSA values passed as operands to specify the block size. 954 KernelDim3 getBlockSizeOperandValues(); 955 /// Get the SSA values passed as operands to specify the cluster size. 956 std::optional<KernelDim3> getClusterSizeOperandValues(); 957 958 static StringRef getBlocksKeyword() { return "blocks"; } 959 static StringRef getClustersKeyword() { return "clusters"; } 960 static StringRef getThreadsKeyword() { return "threads"; } 961 static StringRef getDynamicSharedMemorySizeKeyword() { 962 return "dynamic_shared_memory_size"; 963 } 964 965 /// The number of launch configuration operands, placed at the leading 966 /// positions of the operand list. 967 static constexpr unsigned kNumConfigOperands = 6; 968 969 /// The number of region attributes containing the launch configuration, 970 /// placed in the leading positions of the argument list. 971 static constexpr unsigned kNumConfigRegionAttributes = 12; 972 973 /// Returns true if cluster size is specified. 974 bool hasClusterSize() { 975 if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ()) 976 return true; 977 return false; 978 } 979 /// Returns the number of operands including cluster size 980 unsigned getNumConfigOperands() { 981 return kNumConfigOperands + (hasClusterSize() ? 3 : 0); 982 } 983 /// Returns the number of region attributes including cluster size 984 unsigned getNumConfigRegionAttributes() { 985 return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0); 986 } 987 988 /// Returns the keywords used in the custom syntax for this Op. 
989 static StringRef getWorkgroupKeyword() { return "workgroup"; } 990 static StringRef getPrivateKeyword() { return "private"; } 991 992 /// Returns the number of buffers located in the workgroup memory. 993 unsigned getNumWorkgroupAttributions() { 994 auto attr = (*this)->getAttrOfType<IntegerAttr>( 995 getNumWorkgroupAttributionsAttrName()); 996 return attr ? attr.getInt() : 0; 997 } 998 999 /// Returns a list of block arguments that correspond to buffers located in 1000 /// the workgroup memory 1001 ArrayRef<BlockArgument> getWorkgroupAttributions() { 1002 auto begin = 1003 std::next(getBody().args_begin(), getNumConfigRegionAttributes()); 1004 auto end = std::next(begin, getNumWorkgroupAttributions()); 1005 return {begin, end}; 1006 } 1007 1008 /// Adds a new block argument that corresponds to buffers located in 1009 /// workgroup memory. 1010 BlockArgument addWorkgroupAttribution(Type type, Location loc); 1011 1012 /// Returns the number of buffers located in the private memory. 1013 unsigned getNumPrivateAttributions() { 1014 return getBody().getNumArguments() - getNumConfigRegionAttributes() - 1015 getNumWorkgroupAttributions(); 1016 } 1017 1018 /// Returns a list of block arguments that correspond to buffers located in 1019 /// the private memory. 1020 ArrayRef<BlockArgument> getPrivateAttributions() { 1021 // Buffers on the private memory always come after buffers on the workgroup 1022 // memory. 1023 auto begin = 1024 std::next(getBody().args_begin(), 1025 getNumConfigRegionAttributes() + getNumWorkgroupAttributions()); 1026 return {begin, getBody().args_end()}; 1027 } 1028 1029 /// Adds a new block argument that corresponds to buffers located in 1030 /// private memory. 1031 BlockArgument addPrivateAttribution(Type type, Location loc); 1032 1033 /// Returns the name of the attribute containing the number of buffers 1034 /// located in the workgroup memory. 1035 static StringRef getNumWorkgroupAttributionsAttrName() { 1036 return "workgroup_attributions"; 1037 } 1038 }]; 1039 1040 let hasCanonicalizer = 1; 1041 let hasCustomAssemblyFormat = 1; 1042 let hasRegionVerifier = 1; 1043 let hasVerifier = 1; 1044} 1045 1046def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>, 1047 Arguments<(ins StrAttr:$format, 1048 Variadic<AnyTypeOf<[AnyInteger, Index, AnyFloat]>>:$args)> { 1049 let summary = "Device-side printf, as in CUDA or OpenCL, for debugging"; 1050 let description = [{ 1051 `gpu.printf` takes a literal format string `format` and an arbitrary number of 1052 scalar arguments that should be printed. 1053 1054 The format string is a C-style printf string, subject to any restrictions 1055 imposed by one's target platform. 1056 }]; 1057 let assemblyFormat = [{ 1058 $format attr-dict (`,` $args^ `:` type($args))? 1059 }]; 1060} 1061 1062def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, Pure, 1063 Terminator]>, 1064 Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> { 1065 let summary = "Terminator for GPU functions."; 1066 let description = [{ 1067 A terminator operation for regions that appear in the body of `gpu.func` 1068 functions. The operands to the `gpu.return` are the result values returned 1069 by an invocation of the `gpu.func`. 
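    For illustration, a minimal sketch of a non-kernel `gpu.func` returning a value
    (the function and value names are placeholders):

    ```mlir
    gpu.func @add_one(%arg0 : f32) -> f32 {
      %cst1 = arith.constant 1.0 : f32
      %sum = arith.addf %arg0, %cst1 : f32
      gpu.return %sum : f32
    }
    ```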
  }];

  let builders = [OpBuilder<(ins), [{ // empty}]>];

  let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
  let hasVerifier = 1;
}

def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">,
                                             Pure, Terminator]>,
    Arguments<(ins)>, Results<(outs)> {
  let summary = "Terminator for GPU launch regions.";
  let description = [{
    A terminator operation for regions that appear in the body of the
    `gpu.launch` operation. These regions are not expected to return any value
    so the terminator takes no operands.
  }];

  let assemblyFormat = "attr-dict";
}

def GPU_YieldOp : GPU_Op<"yield", [Pure, ReturnLike, Terminator]>,
    Arguments<(ins Variadic<AnyType>:$values)> {
  let summary = "GPU yield operation";
  let description = [{
    `gpu.yield` is a special terminator operation for blocks inside regions
    in gpu ops. It returns values to the immediately enclosing gpu op.

    Example:

    ```mlir
    gpu.yield %f0, %f1 : f32, f32
    ```
  }];

  let builders = [
    OpBuilder<(ins), [{ /* nothing to do */ }]>
  ];

  let assemblyFormat = "attr-dict ($values^ `:` type($values))?";
}

// These mirror the reduction combining kinds from the vector dialect.
def GPU_AllReduceOpAdd : I32EnumAttrCase<"ADD", 0, "add">;
def GPU_AllReduceOpMul : I32EnumAttrCase<"MUL", 1, "mul">;
def GPU_AllReduceOpMinUI : I32EnumAttrCase<"MINUI", 2, "minui">;
def GPU_AllReduceOpMinSI : I32EnumAttrCase<"MINSI", 3, "minsi">;
// Follows the `arith.minnumf` semantics.
def GPU_AllReduceOpMinnumF : I32EnumAttrCase<"MINNUMF", 4, "minnumf">;
def GPU_AllReduceOpMaxUI : I32EnumAttrCase<"MAXUI", 5, "maxui">;
def GPU_AllReduceOpMaxSI : I32EnumAttrCase<"MAXSI", 6, "maxsi">;
// Follows the `arith.maxnumf` semantics.
def GPU_AllReduceOpMaxnumF : I32EnumAttrCase<"MAXNUMF", 7, "maxnumf">;
def GPU_AllReduceOpAnd : I32EnumAttrCase<"AND", 8, "and">;
def GPU_AllReduceOpOr : I32EnumAttrCase<"OR", 9, "or">;
def GPU_AllReduceOpXor : I32EnumAttrCase<"XOR", 10, "xor">;
// Follows the `arith.minimumf` semantics.
def GPU_AllReduceOpMinimumF : I32EnumAttrCase<"MINIMUMF", 11, "minimumf">;
// Follows the `arith.maximumf` semantics.
def GPU_AllReduceOpMaximumF : I32EnumAttrCase<"MAXIMUMF", 12, "maximumf">;

def GPU_AllReduceOperation : I32EnumAttr<"AllReduceOperation",
    "built-in reduction operations supported by gpu.all_reduce.",
    [
      GPU_AllReduceOpAdd,
      GPU_AllReduceOpMul,
      GPU_AllReduceOpMinUI,
      GPU_AllReduceOpMinSI,
      GPU_AllReduceOpMinnumF,
      GPU_AllReduceOpMaxUI,
      GPU_AllReduceOpMaxSI,
      GPU_AllReduceOpMaxnumF,
      GPU_AllReduceOpAnd,
      GPU_AllReduceOpOr,
      GPU_AllReduceOpXor,
      GPU_AllReduceOpMinimumF,
      GPU_AllReduceOpMaximumF
    ]>{
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::gpu";
}

def AnyIntegerOrFloat : AnyTypeOf<[AnySignlessInteger, AnyFloat], "Integer or Float">;

def GPU_AllReduceOperationAttr : EnumAttr<GPU_Dialect, GPU_AllReduceOperation,
                                          "all_reduce_op">;

def GPU_AllReduceOp : GPU_Op<"all_reduce",
    [SameOperandsAndResultType, IsolatedFromAbove]> {
  let summary = "Reduce values among workgroup.";
  let description = [{
    The `all_reduce` op reduces the value of every work item across a local
    workgroup. The result is equal for all work items of a workgroup.
    For example, both

    ```mlir
    %1 = gpu.all_reduce add %0 {} : (f32) -> (f32)
    %2 = gpu.all_reduce %0 {
    ^bb(%lhs : f32, %rhs : f32):
      %sum = arith.addf %lhs, %rhs : f32
      "gpu.yield"(%sum) : (f32) -> ()
    } : (f32) -> (f32)
    ```

    compute the sum of each work item's %0 value. The first version specifies
    the accumulation as an operation, whereas the second version specifies the
    accumulation as a code region. The reduction operation must be one of:
    *  Integer types: `add`, `mul`, `minui`, `minsi`, `maxui`, `maxsi`, `and`,
       `or`, `xor`
    *  Floating point types: `add`, `mul`, `minnumf`, `maxnumf`, `minimumf`,
       `maximumf`

    If the `uniform` flag is set, either none or all work items of a workgroup
    need to execute this op in convergence.
  }];

  let arguments = (ins
    AnyIntegerOrFloat:$value,
    OptionalAttr<GPU_AllReduceOperationAttr>:$op,
    UnitAttr:$uniform
  );
  let results = (outs AnyIntegerOrFloat:$result);

  let regions = (region AnyRegion:$body);
  let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
                          (`uniform` $uniform^)? $body attr-dict
                          `:` functional-type(operands, results) }];

  let hasFolder = 1;
  let hasRegionVerifier = 1;
}

def AnyIntegerOrFloatOr1DVector :
    AnyTypeOf<[AnyIntegerOrFloat, VectorOfRankAndType<[1], [AnyIntegerOrFloat]>]>;

def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]> {
  let summary = "Reduce values among subgroup.";
  let description = [{
    The `subgroup_reduce` op reduces the values of lanes (work items) across a
    subgroup.

    The subgroup is divided into clusters starting at lane index 0. Within each
    cluster, there are `size` lanes, and the lane index advances by `stride`.
    A reduction is done for each cluster in parallel: every lane in the cluster
    is reduced, and the result is equal for all lanes in the cluster. If `size`
    is omitted, there is a single cluster covering the entire subgroup. If
    `stride` is omitted, the stride is 1 (the cluster's lanes are contiguous).

    When the reduced value is of a vector type, each vector element is reduced
    independently. Only 1-d vector types are allowed.

    Example:

    ```mlir
    %1 = gpu.subgroup_reduce add %a : (f32) -> f32
    %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> vector<4xf16>
    %3 = gpu.subgroup_reduce add %c cluster(size = 4) : (f32) -> f32
    %4 = gpu.subgroup_reduce add %c cluster(size = 4, stride = 2) : (f32) -> f32
    ```

    If the `uniform` flag is set, either none or all lanes of a subgroup need to
    execute this op in convergence.
    The reduction operation must be one of:
    *  Integer types: `add`, `mul`, `minui`, `minsi`, `maxui`, `maxsi`, `and`,
       `or`, `xor`
    *  Floating point types: `add`, `mul`, `minnumf`, `maxnumf`, `minimumf`,
       `maximumf`
  }];

  let arguments = (ins
    AnyIntegerOrFloatOr1DVector:$value,
    GPU_AllReduceOperationAttr:$op,
    UnitAttr:$uniform,
    OptionalAttr<I32Attr>:$cluster_size,
    DefaultValuedAttr<I32Attr,"1">:$cluster_stride
  );
  let results = (outs AnyIntegerOrFloatOr1DVector:$result);

  let builders = [
    OpBuilder<(ins "Value":$value,
                   "::mlir::gpu::AllReduceOperation":$op,
                   "bool":$uniform), [{
      build($_builder, $_state, value, op, uniform, std::nullopt);
    }]>,
    OpBuilder<(ins "Value":$value,
                   "::mlir::gpu::AllReduceOperation":$op,
                   "bool":$uniform,
                   "std::optional<uint32_t>":$cluster_size), [{
      build($_builder, $_state, value, op, uniform,
            cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr);
    }]>,
    OpBuilder<(ins "Value":$value,
                   "::mlir::gpu::AllReduceOperation":$op,
                   "bool":$uniform,
                   "std::optional<uint32_t>":$cluster_size,
                   "uint32_t":$cluster_stride), [{
      build($_builder, $_state, value, op, uniform,
            cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr,
            cluster_stride);
    }]>
  ];

  let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
                          (`uniform` $uniform^)?
                          (`cluster` `(` `size` `=` $cluster_size^ (`,` `stride` `=` $cluster_stride^)? `)`)?
                          attr-dict
                          `:` functional-type(operands, results) }];

  let hasFolder = 1;
  let hasVerifier = 1;
}

def GPU_ShuffleOpXor : I32EnumAttrCase<"XOR", 0, "xor">;
def GPU_ShuffleOpDown : I32EnumAttrCase<"DOWN", 1, "down">;
def GPU_ShuffleOpUp : I32EnumAttrCase<"UP", 2, "up">;
def GPU_ShuffleOpIdx : I32EnumAttrCase<"IDX", 3, "idx">;

def GPU_ShuffleMode : I32EnumAttr<"ShuffleMode",
    "Indexing modes supported by gpu.shuffle.",
    [
      GPU_ShuffleOpXor, GPU_ShuffleOpUp, GPU_ShuffleOpDown, GPU_ShuffleOpIdx,
    ]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::gpu";
}
def GPU_ShuffleModeAttr : EnumAttr<GPU_Dialect, GPU_ShuffleMode,
                                   "shuffle_mode">;

def GPU_ShuffleOp : GPU_Op<
    "shuffle", [Pure, AllTypesMatch<["value", "shuffleResult"]>]>,
    Arguments<(ins AnyIntegerOrFloatOr1DVector:$value, I32:$offset, I32:$width,
               GPU_ShuffleModeAttr:$mode)>,
    Results<(outs AnyIntegerOrFloatOr1DVector:$shuffleResult, I1:$valid)> {
  let summary = "Shuffles values within a subgroup.";
  let description = [{
    The "shuffle" op moves values across lanes (a.k.a., invocations,
    work items) within the same subgroup. The `width` argument specifies the
    number of lanes that participate in the shuffle, and must be uniform
    across all lanes. Further, the first `width` lanes of the subgroup must
    be active.

    The interpretation of the `offset` argument depends on the selected
    `mode`.

    Returns the `shuffleResult` and `true` if the current lane id is smaller
    than `width`, and an unspecified value and `false` otherwise.

    `xor` example:

    ```mlir
    %1, %2 = gpu.shuffle xor %0, %offset, %width : f32
    ```

    For lane `k`, returns the value `%0` from lane `k ^ offset`. Every lane
    trades its value with exactly one other lane.
    `down` example:

    ```mlir
    %cst1 = arith.constant 1 : i32
    %3, %4 = gpu.shuffle down %0, %cst1, %width : f32
    ```

    For lane `k`, returns the value from lane `(k + 1) % width`.

    `up` example:

    ```mlir
    %cst1 = arith.constant 1 : i32
    %5, %6 = gpu.shuffle up %0, %cst1, %width : f32
    ```

    For lane `k`, returns the value from lane `(k - 1) % width`.

    `idx` example:

    ```mlir
    %cst0 = arith.constant 0 : i32
    %7, %8 = gpu.shuffle idx %0, %cst0, %width : f32
    ```

    Broadcasts the value from lane 0 to all lanes.
  }];

  let assemblyFormat = [{
    $mode $value `,` $offset `,` $width attr-dict `:` type($value)
  }];

  let builders = [
    // Helper function that creates a shuffle with constant offset/width.
    OpBuilder<(ins "Value":$value, "int32_t":$offset, "int32_t":$width,
                   "ShuffleMode":$mode)>
  ];
}

def GPU_BarrierOp : GPU_Op<"barrier"> {
  let summary = "Synchronizes all work items of a workgroup.";
  let description = [{
    The "barrier" op synchronizes all work items of a workgroup. It is used
    to coordinate communication between the work items of the workgroup.

    ```mlir
    gpu.barrier
    ```

    waits until all work items in the workgroup have reached this point
    and all memory accesses made by these work items prior to the op are
    visible to all work items in the workgroup. Data hazards between work items
    accessing the same memory can be avoided by synchronizing work items
    in-between these accesses.

    Either none or all work items of a workgroup need to execute this op
    in convergence.
  }];
  let assemblyFormat = "attr-dict";
  let hasCanonicalizer = 1;
}

def GPU_GPUModuleOp : GPU_Op<"module", [
      DataLayoutOpInterface, HasDefaultDLTIDataLayout, IsolatedFromAbove,
      NoRegionArguments, SymbolTable, Symbol] # GraphRegionNoTerminator.traits> {
  let summary = "A top level compilation unit containing code to be run on a GPU.";
  let description = [{
    A GPU module contains code that is intended to be run on a GPU. A host device
    can launch this code through a gpu.launch_func that creates a fully
    qualified symbol through the gpu.module's symbol and a gpu.func symbol
    contained in the gpu.module.

    The module's top-level scope is modeled by a single region with a single
    block. GPU modules are required to have a name that is used for symbol
    resolution by the gpu.launch_func operation.

    Using an op with a region to define a GPU module enables "embedding" GPU
    modules with SIMT execution models in other dialects in a clean manner and
    allows filtering of code regions to execute passes on only code intended to
    or not intended to be run on the separate device.

    Modules can contain zero or more target attributes. These attributes encode
    how to transform modules into binary strings and are used by the
    `gpu-module-to-binary` pass to transform modules into GPU binaries.

    Modules can contain an optional `OffloadingTranslationAttr` attribute. This
    attribute will be used during the `gpu-module-to-binary` pass to specify the
    `OffloadingTranslationAttr` used when creating the `gpu.binary` operation.

    ```
    gpu.module @symbol_name {
      gpu.func {}
        ...
    }
    // Module with offloading handler and target attributes.
    gpu.module @symbol_name2 <#gpu.select_object<1>> [
        #nvvm.target,
        #rocdl.target<chip = "gfx90a">] {
      gpu.func {}
        ...
    }
    ```
  }];
  let builders = [
    OpBuilder<(ins "StringRef":$name,
                   CArg<"ArrayAttr", "{}">:$targets,
                   CArg<"Attribute", "{}">:$handler)>,
    OpBuilder<(ins "StringRef":$name,
                   "ArrayRef<Attribute>":$targets,
                   CArg<"Attribute", "{}">:$handler)>
  ];

  let arguments = (ins
    SymbolNameAttr:$sym_name,
    OptionalAttr<GPUNonEmptyTargetArrayAttr>:$targets,
    OptionalAttr<OffloadingTranslationAttr>:$offloadingHandler);
  let regions = (region SizedRegion<1>:$bodyRegion);
  let assemblyFormat = [{
    $sym_name
    (`<` $offloadingHandler^ `>`)?
    ($targets^)?
    attr-dict-with-keyword $bodyRegion
  }];

  // We need to ensure the block inside the region is properly terminated;
  // the auto-generated builders do not guarantee that.
  let skipDefaultBuilders = 1;

  let extraClassDeclaration = [{
    /// Checks if `target` is in the `targets` list.
    bool hasTarget(Attribute target);

    /// Sets the targets of the module.
    void setTargets(ArrayRef<TargetAttrInterface> targets);
  }];
}

def GPU_BinaryOp : GPU_Op<"binary", [Symbol]>, Arguments<(ins
    SymbolNameAttr:$sym_name,
    OptionalAttr<OffloadingTranslationAttr>:$offloadingHandler,
    ConfinedAttr<GPUObjectArrayAttr, [ArrayMinCount<1>]>:$objects)
  > {
  let summary = "An Op for storing serialized GPU binary objects.";
  let description = [{
    GPU binaries provide a semantic mechanism for storing GPU objects,
    e.g. the result of compiling a GPU module to an object file.

    This operation has three arguments:
    - The name of the binary.
    - An optional attribute implementing the offloading LLVM translation interface.
    - An array of GPU object attributes.

    During translation, the offloading attribute will be called for translating
    GPU `binary` and `launch_func` operations. The default offloading handler is
    `#gpu.select_object`, which selects the first object from the array and
    embeds it as a string.

    Examples:
    ```
    // Selects the first object.
    gpu.binary @myobject [#gpu.object<...>, #gpu.object<...>]
    // Uses `#foo.my_handler` for handling the binary during translation.
    gpu.binary @myobject <#foo.my_handler> [#gpu.object<...>, #gpu.object<...>]
    // Selects the object with the `#rocdl.target` target attribute.
    gpu.binary @myobject <#gpu.select_object<#rocdl.target>> [#gpu.object<...>, #gpu.object<#rocdl.target, ...>]
    ```
  }];
  let builders = [
    OpBuilder<(ins "StringRef":$name,
                   "Attribute":$offloadingHandler,
                   "ArrayAttr":$objects)>,
    OpBuilder<(ins "StringRef":$name,
                   "Attribute":$offloadingHandler,
                   "ArrayRef<Attribute>":$objects)>
  ];
  let skipDefaultBuilders = 1;
  let assemblyFormat = [{
    $sym_name custom<OffloadingHandler>($offloadingHandler) attr-dict $objects
  }];
}

def GPU_HostRegisterOp : GPU_Op<"host_register">,
    Arguments<(ins AnyUnrankedMemRef:$value)> {
  let summary = "Registers a memref for access from device.";
  let description = [{
    This op maps the provided host buffer into the device address space.

    This operation may not be supported in every environment; there is not yet a
    way to check at runtime whether this feature is supported.
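    For illustration, a minimal sketch of registering a host buffer (the op takes an
    unranked memref, so a `memref.cast` is shown; the names are placeholders):

    ```mlir
    %buf = memref.alloc() : memref<64xf32>
    %unranked = memref.cast %buf : memref<64xf32> to memref<*xf32>
    gpu.host_register %unranked : memref<*xf32>
    ```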
1517 1518 Writes from the host are guaranteed to be visible to device kernels that are 1519 launched afterwards. Writes from the device are guaranteed to be visible on 1520 the host after synchronizing with the device kernel completion. 1521 }]; 1522 1523 let assemblyFormat = "$value attr-dict `:` type($value)"; 1524} 1525 1526def GPU_HostUnregisterOp : GPU_Op<"host_unregister">, 1527 Arguments<(ins AnyUnrankedMemRef:$value)> { 1528 let summary = "Unregisters a memref for access from device."; 1529 let description = [{ 1530 This op unmaps the provided host buffer from the device address space. 1531 1532 This operation may not be supported in every environment, there is not yet a 1533 way to check at runtime whether this feature is supported. 1534 }]; 1535 1536 let assemblyFormat = "$value attr-dict `:` type($value)"; 1537} 1538 1539def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> { 1540 let summary = "Wait for async gpu ops to complete."; 1541 let description = [{ 1542 This op synchronizes the host or the device with a list of dependent ops. 1543 1544 If the op contains the `async` keyword, it returns a new async token which 1545 is synchronized with the op arguments. This new token is merely a shortcut 1546 to the argument list, and one could replace the uses of the result with the 1547 arguments for the same effect. The async version of this op is primarily 1548 used to make each async token have a single use during lowering and 1549 thereby make forks in async execution explicit. Example usage: 1550 1551 ```mlir 1552 %t0 = gpu.foo async : !gpu.async.token 1553 %t1 = gpu.bar async : !gpu.async.token 1554 %t2 = gpu.wait async [%t0, %t1] 1555 // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just 1556 // as if the async dependencies were [%t0, %t1]. 1557 %t3 = gpu.baz async [%t2] 1558 ``` 1559 1560 If the op does not contain the `async` keyword, it does not return a new 1561 async token but blocks until all ops producing the async dependency tokens 1562 finished execution. All dependent memory operations are visible to the host 1563 once this op completes. Example usage: 1564 1565 ```mlir 1566 %t0 = gpu.foo async : !gpu.async.token 1567 %t1 = gpu.bar async : !gpu.async.token 1568 // The gpu.wait op blocks until gpu.foo and gpu.bar have completed. 1569 gpu.wait [%t0, %t1] 1570 ``` 1571 }]; 1572 1573 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies); 1574 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 1575 1576 let assemblyFormat = [{ 1577 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict 1578 }]; 1579 1580 let hasCanonicalizer = 1; 1581} 1582 1583def GPU_AllocOp : GPU_Op<"alloc", [ 1584 GPU_AsyncOpInterface, 1585 AttrSizedOperandSegments 1586 ]> { 1587 1588 let summary = "GPU memory allocation operation."; 1589 let description = [{ 1590 The `gpu.alloc` operation allocates a region of memory on the GPU. It is 1591 similar to the `memref.alloc` op, but supports asynchronous GPU execution. 1592 1593 The op does not execute before all async dependencies have finished 1594 executing. 1595 1596 If the `async` keyword is present, the op is executed asynchronously (i.e. 1597 it does not block until the execution has finished on the device). In 1598 that case, it also returns a !gpu.async.token. 1599 1600 If the `host_shared` keyword is present, the memory will be allocated in a 1601 memory accessible both on host and on device. 
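    For comparison, a minimal synchronous sketch (the `%sz` index value is an
    illustrative assumption) omits the `async` keyword and therefore produces
    no token:

    ```mlir
    %memref = gpu.alloc (%sz) : memref<?xf32, 1>
    ```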
1602 1603 Example: 1604 1605 ```mlir 1606 %memref, %token = gpu.alloc async [%dep] host_shared (%width) : memref<64x?xf32, 1> 1607 ``` 1608 }]; 1609 1610 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 1611 Variadic<Index>:$dynamicSizes, Variadic<Index>:$symbolOperands, 1612 UnitAttr:$hostShared); 1613 let results = (outs Res<AnyMemRef, "", [MemAllocAt<0, FullEffect>]>:$memref, 1614 Optional<GPU_AsyncToken>:$asyncToken); 1615 1616 let extraClassDeclaration = [{ 1617 MemRefType getType() { return ::llvm::cast<MemRefType>(getMemref().getType()); } 1618 }]; 1619 1620 let assemblyFormat = [{ 1621 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) (` ` `host_shared` $hostShared^)? ` ` 1622 `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? attr-dict `:` type($memref) 1623 }]; 1624 1625 let hasVerifier = 1; 1626 let hasCanonicalizer = 1; 1627} 1628 1629def GPU_DeallocOp : GPU_Op<"dealloc", [GPU_AsyncOpInterface]> { 1630 1631 let summary = "GPU memory deallocation operation"; 1632 1633 let description = [{ 1634 The `gpu.dealloc` operation frees the region of memory referenced by a 1635 memref which was originally created by the `gpu.alloc` operation. It is 1636 similar to the `memref.dealloc` op, but supports asynchronous GPU execution. 1637 1638 The op does not execute before all async dependencies have finished 1639 executing. 1640 1641 If the `async` keyword is present, the op is executed asynchronously (i.e. 1642 it does not block until the execution has finished on the device). In 1643 that case, it returns a !gpu.async.token. 1644 1645 Example: 1646 1647 ```mlir 1648 %token = gpu.dealloc async [%dep] %memref : memref<8x64xf32, 1> 1649 ``` 1650 }]; 1651 1652 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 1653 Arg<AnyMemRef, "", [MemFreeAt<0, FullEffect>]>:$memref); 1654 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 1655 1656 let assemblyFormat = [{ 1657 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 1658 $memref attr-dict `:` type($memref) 1659 }]; 1660} 1661 1662def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface]> { 1663 1664 let summary = "GPU memcpy operation"; 1665 1666 let description = [{ 1667 The `gpu.memcpy` operation copies the content of one memref to another. 1668 1669 The op does not execute before all async dependencies have finished 1670 executing. 1671 1672 If the `async` keyword is present, the op is executed asynchronously (i.e. 1673 it does not block until the execution has finished on the device). In 1674 that case, it returns a !gpu.async.token. 1675 1676 Example: 1677 1678 ```mlir 1679 %token = gpu.memcpy async [%dep] %dst, %src : memref<?xf32, 1>, memref<?xf32> 1680 ``` 1681 }]; 1682 1683 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 1684 Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst, 1685 Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src); 1686 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 1687 1688 let assemblyFormat = [{ 1689 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 1690 $dst`,` $src `:` type($dst)`,` type($src) attr-dict 1691 }]; 1692 let hasFolder = 1; 1693 let hasVerifier = 1; 1694 let hasCanonicalizer = 1; 1695} 1696 1697def GPU_MemsetOp : GPU_Op<"memset", 1698 [GPU_AsyncOpInterface, AllElementTypesMatch<["dst", "value"]>]> { 1699 1700 let summary = "GPU memset operation"; 1701 1702 let description = [{ 1703 The `gpu.memset` operation sets the content of memref to a scalar value. 
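    Because of the `AllElementTypesMatch` constraint on `dst` and `value`, the
    scalar operand must have the same element type as the memref. A minimal
    sketch (the `%dst16` buffer is an illustrative assumption):

    ```mlir
    %zero = arith.constant 0.0 : f16
    gpu.memset %dst16, %zero : memref<32xf16, 1>, f16
    ```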
1704 1705 The op does not execute before all async dependencies have finished 1706 executing. 1707 1708 If the `async` keyword is present, the op is executed asynchronously (i.e. 1709 it does not block until the execution has finished on the device). In 1710 that case, it returns a !gpu.async.token. 1711 1712 Example: 1713 1714 ```mlir 1715 %token = gpu.memset async [%dep] %dst, %value : memref<?xf32, 1>, f32 1716 ``` 1717 }]; 1718 1719 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 1720 Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst, 1721 Arg<AnyType, "">:$value); 1722 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 1723 1724 let assemblyFormat = [{ 1725 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 1726 $dst`,` $value `:` type($dst)`,` type($value) attr-dict 1727 }]; 1728 let hasFolder = 1; 1729} 1730 1731def GPU_SetDefaultDeviceOp : GPU_Op<"set_default_device", 1732 [MemoryEffects<[MemWrite]>]>, 1733 Arguments<(ins I32:$devIndex)> { 1734 let summary = "Set default GPU for operations after this by index"; 1735 let description = [{ 1736 Operation that sets the current default GPU, using a zero-based index 1737 into the set of GPUs on the system. The default GPU setting may be 1738 thread-local. 1739 }]; 1740 let assemblyFormat = "attr-dict $devIndex"; 1741} 1742 1743def GPU_SubgroupMmaLoadMatrixOp : GPU_Op<"subgroup_mma_load_matrix", 1744 [MemoryEffects<[MemRead]>]>{ 1745 1746 let summary = "GPU warp synchronous matrix load"; 1747 1748 let description = [{ 1749 The `gpu.subgroup_mma_load_matrix` operation loads a matrix collectively 1750 using all the threads in a subgroup. 1751 1752 This operation takes a memref as its first operand: it is the source matrix 1753 from which data is to be loaded. The op returns a `!gpu.mma_matrix`. The 1754 source memref can be in global memory or shared memory. The load address is 1755 determined using `indices`. The matrix being loaded into is the result. The 1756 `leadDimension` attribute specifies the leading dimension size of the source 1757 matrix which eventually allows the lowering to determine the size of each 1758 row. If the `transpose` attribute is present then the op does a transposed load. 1759 1760 For integer types, the resulting `!gpu.mma_matrix` type needs to specify the 1761 signedness of the data if the matrix type is an `A` or `B` operand for 1762 `gpu.subgroup_mma_compute`. 1763 1764 This op is often meant to be used along with `gpu.subgroup_mma_store_matrix` and 1765 `gpu.subgroup_mma_compute`. 1766 1767 Example: 1768 1769 ```mlir 1770 %0 = gpu.subgroup_mma_load_matrix src[%i,%j] : {leadDimension = 32 : i32} 1771 : memref<32x32xf16, 3>, !gpu.mma_matrix<16x16xf16, "AOp"> 1772 ``` 1773 }]; 1774 1775 let arguments = (ins Arg<GPU_MMAMemRef, "", 1776 [MemReadAt<0, FullEffect>]>:$srcMemref, 1777 Variadic<Index>:$indices, 1778 IndexAttr:$leadDimension, 1779 OptionalAttr<UnitAttr>:$transpose); 1780 1781 let results = (outs GPU_MMAMatrix:$res); 1782 1783 let assemblyFormat = [{ 1784 $srcMemref`[`$indices`]` attr-dict `:` type($srcMemref) `->` type($res) 1785 }]; 1786 let hasVerifier = 1; 1787} 1788 1789def GPU_SubgroupMmaStoreMatrixOp : GPU_Op<"subgroup_mma_store_matrix", 1790 [MemoryEffects<[MemWrite]>]>{ 1791 1792 let summary = "GPU warp synchronous matrix store"; 1793 1794 let description = [{ 1795 The `gpu.subgroup_mma_store_matrix` operation stores a matrix collectively 1796 using all the threads in a subgroup. 
1797
1798 This operation takes a `!gpu.mma_matrix` and a memref as operands.
1799 `!gpu.mma_matrix` is the source value containing the data to be stored into the
1800 destination memref which can be in global or shared memory. The store address
1801 is determined using the indices provided. The `leadDimension` attribute
1802 specifies the leading dimension of the destination matrix. If the
1803 `transpose` attribute is present then the op does a transposed store.
1804
1805 This op is often meant to be used along with `gpu.subgroup_mma_load_matrix` and
1806 `gpu.subgroup_mma_compute`.
1807
1808 Example:
1809
1810 ```mlir
1811 gpu.subgroup_mma_store_matrix %D, %sg[%i,%j] : { leadDimension = 32 : i32}
1812 : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, 3>
1813 ```
1814 }];
1815
1816 let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, I32, F16, F32]>>:$src,
1817 Arg<GPU_MMAMemRef, "",[MemWriteAt<0, FullEffect>]>:$dstMemref,
1818 Variadic<Index>:$indices,
1819 IndexAttr:$leadDimension,
1820 OptionalAttr<UnitAttr>:$transpose);
1821
1822 let assemblyFormat = [{
1823 $src`,` $dstMemref`[`$indices`]` attr-dict `:` type($src)`,` type($dstMemref)
1824 }];
1825 let hasVerifier = 1;
1826}
1827
1828def GPU_SubgroupMmaComputeOp
1829 : GPU_Op<"subgroup_mma_compute", [Pure, AllTypesMatch<["opC", "res"]>]> {
1830
1831 let summary = "GPU warp synchronous matrix multiply accumulate";
1832
1833 let description = [{
1834 The `gpu.subgroup_mma_compute` operation performs a matrix-multiply accumulate (mma)
1835 operation using all the threads in a subgroup.
1836
1837 This operation takes three `!gpu.mma_matrix`s as arguments: these hold the `A`,
1838 `B`, and `C` operands for the mma operation. The operation performed is represented
1839 as `C += A * B`. The op returns a `!gpu.mma_matrix` which contains the result of
1840 the operation held by all threads in a subgroup. `a_transpose` or
1841 `b_transpose`, if present, signify that the respective operand was loaded in a
1842 transposed manner. The transpose attributes are required to map to the correct
1843 underlying intrinsics, but they currently do not seem to affect correctness
1844 even when they are absent, provided the operands were loaded correctly using
1845 the `transpose` attribute of the `gpu.subgroup_mma_load_matrix` op.
1846
1847 For integer types, the `A` and `B` matrices carry their signedness with their
1848 types. The accumulator type is expected to be signless and implies a signed integer
1849 with a greater width than the other two operands.
1850
1851 This op is meant to be used along with `gpu.subgroup_mma_store_matrix` and
1852 `gpu.subgroup_mma_load_matrix` ops.
1853
1854 Example:
1855
1856 ```mlir
1857 %D = gpu.subgroup_mma_compute %A, %B, %C :
1858 !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp">
1859 -> !gpu.mma_matrix<16x16xf16, "COp">
1860 ```
1861 }];
1862
1863 let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, F16, F32]>>:$opA,
1864 Arg<MMAMatrixOf<[SI8, UI8, F16, F32]>>:$opB,
1865 Arg<MMAMatrixOf<[I32, F16, F32]>>:$opC,
1866 OptionalAttr<UnitAttr>:$a_transpose,
1867 OptionalAttr<UnitAttr>:$b_transpose);
1868
1869 let results = (outs GPU_MMAMatrix : $res);
1870
1871 let assemblyFormat = [{
1872 $opA`,` $opB`,` $opC attr-dict `:` type($opA)`,` type($opB) `->` type($res)
1873 }];
1874 let hasVerifier = 1;
1875}
1876
1877def GPU_SubgroupMmaConstantMatrixOp : GPU_Op<"subgroup_mma_constant_matrix",
1878 [Pure,
1879 TypesMatchWith<"value type matches element type of mma_matrix",
1880 "res", "value",
1881 "::llvm::cast<gpu::MMAMatrixType>($_self).getElementType()">]>{
1882
1883 let summary = "GPU warp synchronous constant matrix";
1884
1885 let description = [{
1886 The `gpu.subgroup_mma_constant_matrix` creates a `!gpu.mma_matrix` with
1887 constant elements.
1888
1889 The operation takes a scalar value and returns a `!gpu.mma_matrix` where
1890 each element is equal to the operand constant. The destination
1891 mma_matrix type must have an element type equal to the constant type. Since
1892 the layout of `!gpu.mma_matrix` is opaque, this only supports setting all the
1893 elements to the same value.
1894
1895 This op is meant to be used along with `gpu.subgroup_mma_compute`.
1896
1897 Example:
1898
1899 ```mlir
1900 %0 = gpu.subgroup_mma_constant_matrix %a :
1901 !gpu.mma_matrix<16x16xf16, "AOp">
1902 %1 = gpu.subgroup_mma_constant_matrix %b :
1903 !gpu.mma_matrix<16x16xf32, "COp">
1904 ```
1905 }];
1906
1907 let arguments = (ins AnyTypeOf<[SI8, UI8, I32, F16, F32]>:$value);
1908
1909 let results = (outs GPU_MMAMatrix:$res);
1910
1911 let extraClassDeclaration = [{
1912 gpu::MMAMatrixType getType() {
1913 return ::llvm::cast<gpu::MMAMatrixType>(getRes().getType());
1914 }
1915 }];
1916
1917 let assemblyFormat = [{
1918 $value attr-dict `:` type($res)
1919 }];
1920}
1921
1922def GPU_ElementwiseOpAddF : I32EnumAttrCase<"ADDF", 0, "addf">;
1923def GPU_ElementwiseOpMulF : I32EnumAttrCase<"MULF", 1, "mulf">;
1924def GPU_ElementwiseOpSUBF : I32EnumAttrCase<"SUBF", 2, "subf">;
1925def GPU_ElementwiseOpMaxF : I32EnumAttrCase<"MAXF", 3, "maxf">;
1926def GPU_ElementwiseOpMinF : I32EnumAttrCase<"MINF", 4, "minf">;
1927def GPU_ElementwiseOpDivF : I32EnumAttrCase<"DIVF", 5, "divf">;
1928def GPU_ElementwiseOpAddI : I32EnumAttrCase<"ADDI", 6, "addi">;
1929def GPU_ElementwiseOpMulI : I32EnumAttrCase<"MULI", 7, "muli">;
1930def GPU_ElementwiseOpSUBI : I32EnumAttrCase<"SUBI", 8, "subi">;
1931def GPU_ElementwiseOpDivS : I32EnumAttrCase<"DIVS", 9, "divs">;
1932def GPU_ElementwiseOpDivU : I32EnumAttrCase<"DIVU", 10, "divu">;
1933def GPU_ElementwiseOpNEGF : I32EnumAttrCase<"NEGATEF", 11, "negatef">;
1934def GPU_ElementwiseOpNEGS : I32EnumAttrCase<"NEGATES", 12, "negates">;
1935def GPU_ElementwiseOpEXTF : I32EnumAttrCase<"EXTF", 13, "extf">;
1936
1937def MMAElementWise : I32EnumAttr<"MMAElementwiseOp",
1938 "elementwise operation to apply to mma matrix", [
1939 GPU_ElementwiseOpAddF,
1940 GPU_ElementwiseOpMulF,
1941 GPU_ElementwiseOpSUBF,
1942 GPU_ElementwiseOpMaxF,
1943 GPU_ElementwiseOpMinF,
1944 GPU_ElementwiseOpDivF,
1945 GPU_ElementwiseOpAddI,
1946 GPU_ElementwiseOpMulI,
1947 GPU_ElementwiseOpSUBI,
1948 GPU_ElementwiseOpDivS,
1949
GPU_ElementwiseOpDivU,
1950 GPU_ElementwiseOpNEGF,
1951 GPU_ElementwiseOpNEGS,
1952 GPU_ElementwiseOpEXTF
1953 ]> {
1954 let genSpecializedAttr = 0;
1955 let cppNamespace = "::mlir::gpu";
1956}
1957def MMAElementWiseAttr : EnumAttr<GPU_Dialect, MMAElementWise,
1958 "mma_element_wise">;
1959
1960def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise",
1961 [Pure,
1962 AllTypesMatch<["args"]>]>{
1963
1964 let summary = "GPU warp elementwise operation on a matrix";
1965
1966 let description = [{
1967 The `gpu.subgroup_mma_elementwise` takes `!gpu.mma_matrix` inputs and
1968 computes a new `!gpu.mma_matrix` by applying an elementwise operation to each
1969 element.
1970
1971 Since the operation is elementwise and the matrix type must match, the
1972 matrix elements are processed independently of the matrix layout.
1973
1974 This op is meant to be used along with `gpu.subgroup_mma_compute`.
1975
1976 Example:
1977
1978 ```mlir
1979 %0 = gpu.subgroup_mma_elementwise addf %A, %B :
1980 (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">)
1981 -> !gpu.mma_matrix<16x16xf16, "COp">
1982 ```
1983 }];
1984
1985 let arguments = (ins Variadic<GPU_MMAMatrix>:$args,
1986 MMAElementWiseAttr:$opType);
1987
1988 let results = (outs GPU_MMAMatrix:$res);
1989
1990 let extraClassDeclaration = [{
1991 gpu::MMAMatrixType getType() {
1992 return ::llvm::cast<gpu::MMAMatrixType>(getRes().getType());
1993 }
1994 }];
1995
1996 let assemblyFormat = [{
1997 $opType $args attr-dict `:` functional-type($args, $res)
1998 }];
1999}
2000
2001//
2002// Operation on sparse matrices, called from the host
2003// (currently lowers to cuSparse for CUDA only, no ROCM lowering).
2004//
2005
2006def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
2007 let summary = "Create dense tensor operation";
2008 let description = [{
2009 The `gpu.create_dn_tensor` operation initializes a dense tensor from
2010 the given values buffer and sizes. The buffer must already be copied
2011 from the host to the device prior to using this operation. The
2012 operation returns a handle to the dense tensor descriptor.
2013
2014 If the `async` keyword is present, the op is executed asynchronously (i.e.
2015 it does not block until the execution has finished on the device). In
2016 that case, it returns a !gpu.async.token in addition to the environment.
2017
2018 Example:
2019
2020 ```mlir
2021 %dmat, %token = gpu.create_dn_tensor async [%dep] %mem, %dims : index, index into memref<?xf64>
2022 ```
2023 }];
2024
2025 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2026 AnyMemRef:$memref,
2027 Variadic<Index>:$dims);
2028 let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor, Optional<GPU_AsyncToken>:$asyncToken);
2029
2030 let assemblyFormat = [{
2031 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2032 $memref `,` $dims attr-dict `:` type($dims) `into` type($memref)
2033 }];
2034}
2035
2036def GPU_DestroyDnTensorOp : GPU_Op<"destroy_dn_tensor", [GPU_AsyncOpInterface]> {
2037 let summary = "Destroy dense tensor operation";
2038 let description = [{
2039 The `gpu.destroy_dn_tensor` operation releases all resources of a dense
2040 tensor represented by a handle that was previously created by a
2041 `gpu.create_dn_tensor` operation.
2042
2043 If the `async` keyword is present, the op is executed asynchronously (i.e.
2044 it does not block until the execution has finished on the device).
In 2045 that case, it returns a !gpu.async.token in addition to the environment. 2046 2047 Example: 2048 2049 ```mlir 2050 %token = gpu.destroy_dn_tensor async [%dep] %dnTensor 2051 ``` 2052 }]; 2053 2054 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2055 Arg<GPU_SparseDnTensorHandle>:$dnTensor); 2056 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 2057 2058 let assemblyFormat = [{ 2059 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2060 $dnTensor attr-dict 2061 }]; 2062} 2063 2064def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> { 2065 let summary = "Create sparse matrix in COO format operation"; 2066 let description = [{ 2067 The `gpu.create_coo` operation initializes a sparse matrix in COO format 2068 with the given sizes from the given index and values buffers. The buffers 2069 must already be copied from the host to the device prior to using this 2070 operation. The operation returns a handle to the sparse matrix descriptor. 2071 Note that this operation builds the COO in SoA format. 2072 2073 If the `async` keyword is present, the op is executed asynchronously (i.e. 2074 it does not block until the execution has finished on the device). In 2075 that case, it returns a !gpu.async.token in addition to the environment. 2076 2077 Example: 2078 2079 ```mlir 2080 %spmat, %token = gpu.create_coo async [%dep] %rows, %cols, %nnz, %rowIdx, 2081 %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64> 2082 ``` 2083 }]; 2084 2085 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2086 Index:$rows, 2087 Index:$cols, 2088 Index:$nnz, 2089 AnyMemRef:$rowIdxs, 2090 AnyMemRef:$colIdxs, 2091 AnyMemRef:$values); 2092 let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, 2093 Optional<GPU_AsyncToken>:$asyncToken); 2094 2095 let assemblyFormat = [{ 2096 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2097 $rows `,` $cols `,` $nnz `,` $rowIdxs `,` $colIdxs `,` $values attr-dict 2098 `:` type($rowIdxs) `,` type($colIdxs) `,` type($values) 2099 }]; 2100} 2101 2102def GPU_CreateCooAoSOp : GPU_Op<"create_coo_aos", [GPU_AsyncOpInterface]> { 2103 let summary = "Create sparse matrix in COO format operation (AoS)"; 2104 let description = [{ 2105 The `gpu.create_coo_aos` operation initializes a sparse matrix in COO format 2106 with the given sizes from the given index and values buffers. The buffers 2107 must already be copied from the host to the device prior to using this 2108 operation. The operation returns a handle to the sparse matrix descriptor. 2109 Unlike the default `gpu.create_coo` operation, this operation builds the 2110 COO format from a single index buffer in AoS format (note that this 2111 feature has been deprecated in cuSparse 11.2). 2112 2113 If the `async` keyword is present, the op is executed asynchronously (i.e. 2114 it does not block until the execution has finished on the device). In 2115 that case, it returns a !gpu.async.token in addition to the environment. 
2116 2117 Example: 2118 2119 ```mlir 2120 %spmat, %token = gpu.create_coo_aos async [%dep] %rows, %cols, %nnz, %idxs, 2121 %values : memref<?xindex>, memref<?xf64> 2122 ``` 2123 }]; 2124 2125 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2126 Index:$rows, 2127 Index:$cols, 2128 Index:$nnz, 2129 AnyMemRef:$idxs, 2130 AnyMemRef:$values); 2131 let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, 2132 Optional<GPU_AsyncToken>:$asyncToken); 2133 2134 let assemblyFormat = [{ 2135 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2136 $rows `,` $cols `,` $nnz `,` $idxs `,` $values attr-dict 2137 `:` type($idxs) `,` type($values) 2138 }]; 2139} 2140 2141def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> { 2142 let summary = "Create sparse matrix in CSR format operation"; 2143 let description = [{ 2144 The `gpu.create_csr` operation initializes a sparse matrix in CSR format 2145 with the given sizes from the given position, index, and values buffers. 2146 The buffers must already be copied from the host to the device prior to 2147 using this operation. The operation returns a handle to the sparse 2148 matrix descriptor. 2149 2150 The CSR format has exactly the same memory layout as its transpose 2151 in CSC format (and vice versa). 2152 2153 If the `async` keyword is present, the op is executed asynchronously (i.e. 2154 it does not block until the execution has finished on the device). In 2155 that case, it returns a !gpu.async.token in addition to the environment. 2156 2157 Example: 2158 2159 ```mlir 2160 %spmat, %token = gpu.create_csr async [%dep] %rows, %cols, %nnz, %rowPos, 2161 %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64> 2162 ``` 2163 }]; 2164 2165 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2166 Index:$rows, 2167 Index:$cols, 2168 Index:$nnz, 2169 AnyMemRef:$rowPos, 2170 AnyMemRef:$colIdxs, 2171 AnyMemRef:$values); 2172 let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, 2173 Optional<GPU_AsyncToken>:$asyncToken); 2174 2175 let assemblyFormat = [{ 2176 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2177 $rows `,` $cols `,` $nnz `,` $rowPos `,` $colIdxs `,` $values attr-dict 2178 `:` type($rowPos) `,` type($colIdxs) `,` type($values) 2179 }]; 2180} 2181 2182def GPU_CreateCscOp : GPU_Op<"create_csc", [GPU_AsyncOpInterface]> { 2183 let summary = "Create sparse matrix in CSC format operation"; 2184 let description = [{ 2185 The `gpu.create_csc` operation initializes a sparse matrix in CSC format 2186 with the given sizes from the given position, index, and values buffers. 2187 The buffers must already be copied from the host to the device prior to 2188 using this operation. The operation returns a handle to the sparse 2189 matrix descriptor. 2190 2191 The CSC format has exactly the same memory layout as its transpose 2192 in CSR format (and vice versa). 2193 2194 If the `async` keyword is present, the op is executed asynchronously (i.e. 2195 it does not block until the execution has finished on the device). In 2196 that case, it returns a !gpu.async.token in addition to the environment. 
2197 2198 Example: 2199 2200 ```mlir 2201 %spmat, %token = gpu.create_csc async [%dep] %rows, %cols, %nnz, %colPos, 2202 %rowIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64> 2203 ``` 2204 }]; 2205 2206 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2207 Index:$rows, 2208 Index:$cols, 2209 Index:$nnz, 2210 AnyMemRef:$colPos, 2211 AnyMemRef:$rowIdxs, 2212 AnyMemRef:$values); 2213 let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, 2214 Optional<GPU_AsyncToken>:$asyncToken); 2215 2216 let assemblyFormat = [{ 2217 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2218 $rows `,` $cols `,` $nnz `,` $colPos `,` $rowIdxs `,` $values attr-dict 2219 `:` type($colPos) `,` type($rowIdxs) `,` type($values) 2220 }]; 2221} 2222 2223def GPU_CreateBsrOp : GPU_Op<"create_bsr", [GPU_AsyncOpInterface]> { 2224 let summary = "Create sparse matrix in BSR format operation"; 2225 let description = [{ 2226 The `gpu.create_bsr` operation initializes a sparse matrix in BSR format 2227 with the given sizes for the matrix and blocks from the given position, 2228 index, and values buffers. The buffers must already be copied from the 2229 host to the device prior to using this operation. The operation returns 2230 a handle to the sparse matrix descriptor. 2231 2232 The BSR format is similar to CSR, where the column indices represent 2233 two-dimensional blocks instead of a single matrix entry. Note that this 2234 operation (currently) only supports storage with **square** blocks, 2235 i.e., `rBlockSize == cBlockSize`. 2236 2237 If the `async` keyword is present, the op is executed asynchronously (i.e. 2238 it does not block until the execution has finished on the device). In 2239 that case, it returns a !gpu.async.token in addition to the environment. 2240 2241 Example: 2242 2243 ```mlir 2244 %spmat, %token = gpu.create_bsr async [%dep] 2245 %brows, %bcols, %bnnz, %rBlockSize, %cBlockSize, 2246 %bRowPos, %bColIdxs, %values : memref<?xindex>, memref<?xindex>, memref<?xf64> 2247 ``` 2248 }]; 2249 2250 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2251 Index:$brows, 2252 Index:$bcols, 2253 Index:$bnnz, 2254 Index:$rBlockSize, 2255 Index:$cBlockSize, 2256 AnyMemRef:$bRowPos, 2257 AnyMemRef:$bColIdxs, 2258 AnyMemRef:$values); 2259 let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, 2260 Optional<GPU_AsyncToken>:$asyncToken); 2261 2262 let assemblyFormat = [{ 2263 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2264 $brows `,` $bcols `,` $bnnz `,` $rBlockSize `,` $cBlockSize `,` 2265 $bRowPos `,` $bColIdxs `,` $values attr-dict 2266 `:` type($bRowPos) `,` type($bColIdxs) `,` type($values) 2267 }]; 2268} 2269 2270def GPU_Prune2To4SpMatFlag : I32EnumAttr<"Prune2To4SpMatFlag", 2271 "pruning strategy for 2:4 sparse matrix", 2272 [ 2273 I32EnumAttrCase<"NONE", 0>, 2274 I32EnumAttrCase<"PRUNE_ONLY", 1>, 2275 I32EnumAttrCase<"PRUNE_AND_CHECK", 2>, 2276 ]> { 2277 let genSpecializedAttr = 0; 2278 let cppNamespace = GPU_Dialect.cppNamespace; 2279} 2280 2281def GPU_Prune2To4SpMatFlagAttr : EnumAttr<GPU_Dialect, GPU_Prune2To4SpMatFlag, 2282 "prune_2to4_spmat_flag">{ 2283 let defaultValue = "Prune2To4SpMatFlag::PRUNE_AND_CHECK"; 2284} 2285 2286 2287def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]> { 2288 let summary = "Create sparse matrix with 2:4 sparsity operation"; 2289 let description = [{ 2290 The `gpu.create_2to4_spmat` operation initializes a sparse matrix in dense 2291 format with 2:4 sparsity. 
2292 The buffers must already be copied from the host to the device prior to 2293 using this operation. The operation returns a handle to the sparse 2294 matrix descriptor. 2295 2296 If the `async` keyword is present, the op is executed asynchronously (i.e. 2297 it does not block until the execution has finished on the device). In 2298 that case, it returns a !gpu.async.token in addition to the environment. 2299 2300 Example: 2301 2302 ```mlir 2303 %spmat, %token = gpu.create_2to4_spmat async [%dep] {PRUNE_AND_CHECK} %rows, %cols, %mem: memref<?xf64> 2304 ``` 2305 }]; 2306 2307 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2308 Index:$rows, 2309 Index:$cols, 2310 GPU_Prune2To4SpMatFlagAttr:$pruneFlag, 2311 AnyMemRef:$memref); 2312 let results = (outs Res<GPU_SparseSpMatHandle>:$spMat, 2313 Optional<GPU_AsyncToken>:$asyncToken); 2314 2315 let assemblyFormat = [{ 2316 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2317 `{` $pruneFlag `}` $rows `,` $cols `,` $memref attr-dict `:` type($memref) 2318 }]; 2319} 2320 2321def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> { 2322 let summary = "Destroy sparse matrix operation"; 2323 let description = [{ 2324 The `gpu.destroy_sp_mat` operation releases all resources of a sparse 2325 matrix represented by a handle that was previously created by a 2326 one of the sparse matrix creation operations. 2327 2328 If the `async` keyword is present, the op is executed asynchronously (i.e. 2329 it does not block until the execution has finished on the device). In 2330 that case, it returns a !gpu.async.token in addition to the environment. 2331 2332 Example: 2333 2334 ```mlir 2335 %token = gpu.destroy_sp_mat async [%dep] %spmat 2336 ``` 2337 }]; 2338 2339 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2340 Arg<GPU_SparseSpMatHandle>:$spmat); 2341 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 2342 2343 let assemblyFormat = [{ 2344 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $spmat attr-dict 2345 }]; 2346} 2347 2348// To avoid coupling this dialect with cusparse.h specifics, we hardcoded magic 2349// literals in this enum. Note that this should be kept in sync with 2350// cusparseOperation_t in cusparse.h: 2351// typedef enum { 2352// CUSPARSE_OPERATION_NON_TRANSPOSE = 0, 2353// CUSPARSE_OPERATION_TRANSPOSE = 1, 2354// CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 2355// } cusparseOperation_t; 2356// TODO: find a proper way to keep them in sync? 2357def GPU_TransposeMode : I32EnumAttr<"TransposeMode", 2358 "transpose mode of sparse matrix supported by sparse tensor ops", 2359 [ 2360 I32EnumAttrCase<"NON_TRANSPOSE", 0>, 2361 I32EnumAttrCase<"TRANSPOSE", 1>, 2362 I32EnumAttrCase<"CONJUGATE_TRANSPOSE", 2>, 2363 ]> { 2364 let genSpecializedAttr = 0; 2365 let cppNamespace = GPU_Dialect.cppNamespace; 2366} 2367 2368def GPU_TransposeModeAttr : EnumAttr<GPU_Dialect, GPU_TransposeMode, 2369 "mat_transpose_mode">{ 2370 let defaultValue = "TransposeMode::NON_TRANSPOSE"; 2371} 2372 2373def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> { 2374 let summary = "Precompute buffersize for SpMV operation"; 2375 let description = [{ 2376 The `gpu.spmv_buffer_size` operation returns the buffer size required 2377 to perform the SpMV operation on the given sparse matrix and dense vectors. 2378 The operation expects handles returned by previous sparse operations 2379 to construct an environment and the operands for SpMV. 
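    A synchronous sketch of how this op typically pairs with `gpu.alloc` and
    `gpu.spmv` (the `%spmatA`, `%dnX`, and `%dnY` handles and the `f64` compute
    type are illustrative assumptions):

    ```mlir
    %bufferSz = gpu.spmv_buffer_size %spmatA, %dnX, %dnY into f64
    %buffer = gpu.alloc (%bufferSz) : memref<?xi8>
    gpu.spmv %spmatA, %dnX, %dnY, %buffer : memref<?xi8> into f64
    ```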
2380 2381 If the `async` keyword is present, the op is executed asynchronously (i.e. 2382 it does not block until the execution has finished on the device). In 2383 that case, it returns a !gpu.async.token in addition to the environment. 2384 2385 The matrix arguments can also be associated with one of the following 2386 operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value 2387 is NON_TRANSPOSE. 2388 2389 Example: 2390 2391 ```mlir 2392 %buffersz, %token = gpu.spmv_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY into f32 2393 ``` 2394 }]; 2395 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2396 GPU_TransposeModeAttr:$modeA, 2397 GPU_SparseSpMatHandle:$spmatA, 2398 GPU_SparseDnTensorHandle:$dnX, 2399 GPU_SparseDnTensorHandle:$dnY, 2400 TypeAttr:$computeType); 2401 let results = (outs Res<Index>:$bufferSz, 2402 Optional<GPU_AsyncToken>:$asyncToken); 2403 2404 let builders = [OpBuilder<(ins 2405 "Type":$bufferSz, 2406 "Type":$asyncToken, 2407 "ValueRange":$asyncDependencies, 2408 "Value":$spmatA, 2409 "Value":$dnX, 2410 "Value":$dnY, 2411 "Type":$computeType) 2412 , [{ 2413 auto modeA = gpu::TransposeMode::NON_TRANSPOSE; 2414 return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, 2415 modeA, spmatA, dnX, dnY, computeType);}]> 2416 ]; 2417 2418 let assemblyFormat = [{ 2419 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2420 $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY attr-dict `into` $computeType 2421 }]; 2422} 2423 2424def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> { 2425 let summary = "SpMV operation"; 2426 let description = [{ 2427 The `gpu.spmv` operation performs the SpMV operation on the given sparse matrix, 2428 dense vectors, and buffer. The operation expects handles returned by previous 2429 sparse operations to construct an environment and the operands for SpMV. The 2430 buffer must have been allocated on the device. 2431 2432 If the `async` keyword is present, the op is executed asynchronously (i.e. 2433 it does not block until the execution has finished on the device). In 2434 that case, it returns a !gpu.async.token in addition to the environment. 2435 2436 The matrix arguments can also be associated with one of the following 2437 operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value 2438 is NON_TRANSPOSE. 2439 2440 Example: 2441 2442 ```mlir 2443 %token = gpu.spmv async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY : memref<?xf64> into bf16 2444 ``` 2445 }]; 2446 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2447 GPU_TransposeModeAttr:$modeA, 2448 GPU_SparseSpMatHandle:$spmatA, 2449 GPU_SparseDnTensorHandle:$dnX, 2450 GPU_SparseDnTensorHandle:$dnY, 2451 TypeAttr:$computeType, 2452 AnyMemRef:$buffer); 2453 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 2454 2455 let builders = [OpBuilder<(ins 2456 "Type":$asyncToken, 2457 "ValueRange":$asyncDependencies, 2458 "Value":$spmatA, 2459 "Value":$dnX, 2460 "Value":$dnY, 2461 "Type":$computeType, 2462 "Value":$buffer), [{ 2463 auto modeA = gpu::TransposeMode::NON_TRANSPOSE; 2464 return build($_builder, $_state, asyncToken, asyncDependencies, modeA, 2465 spmatA, dnX, dnY, computeType, buffer);}]> 2466 ]; 2467 2468 let assemblyFormat = [{ 2469 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2470 $spmatA (`{` $modeA^ `}`)? 
`,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType 2471 }]; 2472} 2473 2474def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface, AttrSizedResultSegments]> { 2475 let summary = "Precompute buffersize for SpMM operation"; 2476 let description = [{ 2477 The `gpu.spmm_buffer_size` operation returns the buffer size required 2478 to perform the SpMM operation on the given sparse and dense matrix. 2479 The operation expects handles returned by previous sparse operations 2480 to construct an environment and the operands for SpMM. 2481 2482 If the `async` keyword is present, the op is executed asynchronously (i.e. 2483 it does not block until the execution has finished on the device). In 2484 that case, it returns a !gpu.async.token in addition to the environment. 2485 2486 The matrix arguments can also be associated with one of the following 2487 operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value 2488 is NON_TRANSPOSE. 2489 2490 Example: 2491 2492 ```mlir 2493 %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 2494 ``` 2495 }]; 2496 2497 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2498 GPU_TransposeModeAttr:$modeA, 2499 GPU_TransposeModeAttr:$modeB, 2500 GPU_SparseSpMatHandle:$spmatA, 2501 GPU_SparseDnTensorHandle:$dnmatB, 2502 GPU_SparseDnTensorHandle:$dnmatC, 2503 TypeAttr:$computeType); 2504 let results = (outs Variadic<Index>:$bufferSzs, 2505 Optional<GPU_AsyncToken>:$asyncToken); 2506 2507 let builders = [OpBuilder<(ins 2508 "Type":$bufferSzs, 2509 "Type":$asyncToken, 2510 "ValueRange":$asyncDependencies, 2511 "Value":$spmatA, 2512 "Value":$dnmatB, 2513 "Value":$dnmatC, 2514 "Type":$computeType), [{ 2515 auto modeA = gpu::TransposeMode::NON_TRANSPOSE; 2516 auto modeB = gpu::TransposeMode::NON_TRANSPOSE; 2517 return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies, 2518 modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> 2519 ]; 2520 2521 let assemblyFormat = [{ 2522 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2523 $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType 2524 }]; 2525} 2526 2527def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> { 2528 let summary = "SpMM operation"; 2529 let description = [{ 2530 The `gpu.spmm` operation performs the SpMM operation on the given sparse and 2531 dense matrix, and buffer. The operation expects handles returned by previous 2532 sparse operations to construct an environment and the operands for SpMM. The 2533 buffer must have been allocated on the device. 2534 2535 If the `async` keyword is present, the op is executed asynchronously (i.e. 2536 it does not block until the execution has finished on the device). In 2537 that case, it returns a !gpu.async.token in addition to the environment. 2538 2539 The matrix arguments can also be associated with one of the following 2540 operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value 2541 is NON_TRANSPOSE. 
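    A synchronous sketch of how the buffer obtained from `gpu.spmm_buffer_size`
    is typically threaded through this op (handle names and the single-buffer
    case are illustrative assumptions):

    ```mlir
    %bufferSz = gpu.spmm_buffer_size %spmatA, %dnmatB, %dnmatC : index into f32
    %buffer = gpu.alloc (%bufferSz) : memref<?xi8>
    gpu.spmm %spmatA, %dnmatB, %dnmatC, %buffer : memref<?xi8> into f32
    ```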
2542 2543 Example: 2544 2545 ```mlir 2546 %token = gpu.spmm async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 2547 ``` 2548 }]; 2549 2550 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2551 GPU_TransposeModeAttr:$modeA, 2552 GPU_TransposeModeAttr:$modeB, 2553 GPU_SparseSpMatHandle:$spmatA, 2554 GPU_SparseDnTensorHandle:$dnmatB, 2555 GPU_SparseDnTensorHandle:$dnmatC, 2556 TypeAttr:$computeType, 2557 Variadic<AnyMemRef>:$buffers); 2558 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 2559 2560 let builders = [OpBuilder<(ins 2561 "Type":$asyncToken, 2562 "ValueRange":$asyncDependencies, 2563 "Value":$spmatA, 2564 "Value":$dnmatB, 2565 "Value":$dnmatC, 2566 "Type":$computeType, 2567 "ValueRange":$buffers), [{ 2568 auto modeA = gpu::TransposeMode::NON_TRANSPOSE; 2569 auto modeB = gpu::TransposeMode::NON_TRANSPOSE; 2570 return build($_builder, $_state, asyncToken, asyncDependencies, modeA, 2571 modeB, spmatA, dnmatB, dnmatC, computeType, buffers);}]> 2572 ]; 2573 2574 let assemblyFormat = [{ 2575 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2576 $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType 2577 }]; 2578} 2579 2580def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]> { 2581 let summary = "Precompute buffersize for SDDMM operation"; 2582 let description = [{ 2583 The `gpu.sddmm_buffer_size` operation returns the buffer size required 2584 to perform the SDDMM operation on the given sparse and dense matrices. 2585 The operation expects handles returned by previous sparse operations 2586 to construct an environment and the operands for SDDMM. 2587 2588 If the `async` keyword is present, the op is executed asynchronously (i.e. 2589 it does not block until the execution has finished on the device). In 2590 that case, it returns a !gpu.async.token in addition to the environment. 2591 2592 Example: 2593 2594 ```mlir 2595 %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 2596 ``` 2597 2598 The matrix arguments can also be associated with one of the following 2599 operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value 2600 is NON_TRANSPOSE. 2601 }]; 2602 2603 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2604 GPU_TransposeModeAttr:$modeA, 2605 GPU_TransposeModeAttr:$modeB, 2606 GPU_SparseDnTensorHandle:$dnmatA, 2607 GPU_SparseDnTensorHandle:$dnmatB, 2608 GPU_SparseSpMatHandle:$spmatC, 2609 TypeAttr:$computeType); 2610 let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken); 2611 2612 let builders = [OpBuilder<(ins 2613 "Type":$bufferSz, 2614 "Type":$asyncToken, 2615 "ValueRange":$asyncDependencies, 2616 "Value":$dnmatA, 2617 "Value":$dnmatB, 2618 "Value":$spmatC, 2619 "Type":$computeType), [{ 2620 auto modeA = gpu::TransposeMode::NON_TRANSPOSE; 2621 auto modeB = gpu::TransposeMode::NON_TRANSPOSE; 2622 return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, 2623 modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> 2624 ]; 2625 2626 let assemblyFormat = [{ 2627 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2628 $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $spmatC attr-dict `into` $computeType 2629 }]; 2630} 2631 2632def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> { 2633 let summary = "SDDMM operation"; 2634 let description = [{ 2635 The `gpu.sddmm` operation performs the SDDMM operation on the given sparse and 2636 dense matrices, and buffer. The operation expects handles returned by previous 2637 sparse operations to construct an environment and the operands for SDDMM. The 2638 buffer must have been allocated on the device. 2639 2640 If the `async` keyword is present, the op is executed asynchronously (i.e. 2641 it does not block until the execution has finished on the device). In 2642 that case, it returns a !gpu.async.token in addition to the environment. 2643 2644 Example: 2645 2646 ```mlir 2647 %token = gpu.sddmm async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 2648 ``` 2649 2650 The matrix arguments can also be associated with one of the following 2651 operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value 2652 is NON_TRANSPOSE. 2653 }]; 2654 2655 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2656 GPU_TransposeModeAttr:$modeA, 2657 GPU_TransposeModeAttr:$modeB, 2658 GPU_SparseDnTensorHandle:$dnmatA, 2659 GPU_SparseDnTensorHandle:$dnmatB, 2660 GPU_SparseSpMatHandle:$spmatC, 2661 TypeAttr:$computeType, 2662 AnyMemRef:$buffer); 2663 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 2664 2665 let builders = [OpBuilder<(ins 2666 "Type":$asyncToken, 2667 "ValueRange":$asyncDependencies, 2668 "Value":$dnmatA, 2669 "Value":$dnmatB, 2670 "Value":$spmatC, 2671 "Type":$computeType, 2672 "Value":$buffer), [{ 2673 auto modeA = gpu::TransposeMode::NON_TRANSPOSE; 2674 auto modeB = gpu::TransposeMode::NON_TRANSPOSE; 2675 return build($_builder, $_state, asyncToken, asyncDependencies, modeA, 2676 modeB, dnmatA, dnmatB, spmatC, computeType, buffer);}]> 2677 ]; 2678 2679 let assemblyFormat = [{ 2680 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2681 $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType 2682 }]; 2683} 2684 2685def GPU_SpGEMMWorkEstimationOrComputeKind : I32EnumAttr<"SpGEMMWorkEstimationOrComputeKind", 2686 "choose whether spgemm_work_estimation_or_compute does work estimation or compute", 2687 [ 2688 I32EnumAttrCase<"WORK_ESTIMATION", 0>, 2689 I32EnumAttrCase<"COMPUTE", 1>, 2690 ]> { 2691 let genSpecializedAttr = 0; 2692 let cppNamespace = GPU_Dialect.cppNamespace; 2693} 2694 2695def GPU_SpGEMMWorkEstimationOrComputeKindAttr : EnumAttr<GPU_Dialect, 2696 GPU_SpGEMMWorkEstimationOrComputeKind, 2697 "spgemm_work_estimation_or_compute_kind"> {} 2698 2699def GPU_SpGEMMCreateDescrOp : GPU_Op<"spgemm_create_descr", [GPU_AsyncOpInterface]> { 2700 let summary = "SpGEMM Create Descr operation"; 2701 let description = [{ 2702 The `gpu.spgemm_create_descr` creates a descriptor for the SpGEMM operation. 2703 The descriptor describes the SpGEMM operation and stores the internal data 2704 throughout the computation. It needs to be passed as an argument to 2705 spgemm_* operations. 2706 2707 If the `async` keyword is present, the op is executed asynchronously (i.e. 2708 it does not block until the execution has finished on the device). In 2709 that case, it returns a `!gpu.async.token` in addition to the environment. 
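    The descriptor is typically released with `gpu.spgemm_destroy_descr` once
    the computation is finished; a minimal sketch of that pairing (token and
    dependency names are illustrative):

    ```mlir
    %spgemmDesc, %t0 = gpu.spgemm_create_descr async [%dep]
    // ... work estimation, compute, and copy steps ...
    %t1 = gpu.spgemm_destroy_descr async [%t0] %spgemmDesc
    ```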
2710 2711 Example: 2712 2713 ```mlir 2714 %desc, %token = gpu.spgemm_create_descr async [%dep] 2715 ``` 2716 }]; 2717 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies); 2718 let results = (outs GPU_SparseSpGEMMOpHandle:$desc, 2719 Optional<GPU_AsyncToken>:$asyncToken); 2720 let assemblyFormat = [{ 2721 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2722 attr-dict 2723 }]; 2724} 2725 2726def GPU_SpGEMMDestroyDescrOp : GPU_Op<"spgemm_destroy_descr", [GPU_AsyncOpInterface]> { 2727 let summary = "SpGEMM Destroy Descr operation"; 2728 let description = [{ 2729 The `gpu.spgemm_destroy_descr` destroys the SpGEMM operation descriptor. 2730 2731 If the `async` keyword is present, the op is executed asynchronously (i.e. 2732 it does not block until the execution has finished on the device). In 2733 that case, it returns a `!gpu.async.token` in addition to the environment. 2734 2735 Example: 2736 2737 ```mlir 2738 %token = gpu.spgemm_destroy_descr async [%dep] %desc 2739 ``` 2740 }]; 2741 2742 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2743 GPU_SparseSpGEMMOpHandle:$desc); 2744 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 2745 let assemblyFormat = [{ 2746 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2747 $desc attr-dict 2748 }]; 2749} 2750 2751def GPU_SpGEMMWorkEstimationOrComputeOp : GPU_Op<"spgemm_work_estimation_or_compute", [GPU_AsyncOpInterface]> { 2752 let summary = "SpGEMM work estimation operation"; 2753 let description = [{ 2754 The `gpu.spgemm_work_estimation_or_compute` is used to call 2755 cusparseSpGEMM_workEstimation or cusparseSpGEMM_compute. Both of them are 2756 for both determining the buffer size and performing the actual computation. 2757 The operation expects handles returned by previous sparse operations to 2758 construct an environment and the operands for SpGEMM. 2759 The buffer must have been allocated on the device. 2760 2761 C' = alpha * op(A) * op(B) + beta * C 2762 2763 If the `async` keyword is present, the op is executed asynchronously (i.e. 2764 it does not block until the execution has finished on the device). In 2765 that case, it returns a `!gpu.async.token` in addition to the environment. 2766 2767 Example: 2768 2769 ```mlir 2770 %bufferSz, %token = gpu.spgemm_work_estimation_or_compute async [%dep] {COMPUTE} 2771 %desc, %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE}, 2772 %spmatC, %spgemmDesc, %c0, %alloc: f32 into 2773 memref<0xi8> 2774 ``` 2775 2776 The matrix arguments can also be associated with one of the following 2777 operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value 2778 is NON_TRANSPOSE. 
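    A sketch of the usual two-call pattern for a single phase (here, work
    estimation); the matrix handles, `%spgemmDesc`, the zero-sized `%alloc0`
    buffer, and `%c0` are illustrative assumptions:

    ```mlir
    // First call: pass a zero-sized buffer to query the required size.
    %sz, %t1 = gpu.spgemm_work_estimation_or_compute async [%t0] {WORK_ESTIMATION}
        %spmatA, %spmatB, %spmatC, %spgemmDesc, %c0, %alloc0 : f32 into memref<0xi8>
    // Second call: repeat the same phase with a buffer of at least %sz bytes.
    %sz2, %t2 = gpu.spgemm_work_estimation_or_compute async [%t1] {WORK_ESTIMATION}
        %spmatA, %spmatB, %spmatC, %spgemmDesc, %sz, %buf : f32 into memref<?xi8>
    ```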
2779 }]; 2780 2781 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2782 GPU_SparseSpGEMMOpHandle:$desc, 2783 GPU_TransposeModeAttr:$modeA, 2784 GPU_TransposeModeAttr:$modeB, 2785 GPU_SparseSpMatHandle:$spmatA, 2786 GPU_SparseSpMatHandle:$spmatB, 2787 GPU_SparseSpMatHandle:$spmatC, 2788 TypeAttr:$computeType, 2789 Index:$bufferSz, 2790 AnyMemRef:$buffer, 2791 GPU_SpGEMMWorkEstimationOrComputeKindAttr:$kind); 2792 let results = (outs Res<Index>:$bufferSzNew, 2793 Optional<GPU_AsyncToken>:$asyncToken); 2794 2795 let builders = [OpBuilder<(ins 2796 "Type":$bufferSzNew, 2797 "Type":$asyncToken, 2798 "ValueRange":$asyncDependencies, 2799 "Value":$desc, 2800 "Value":$spmatA, 2801 "Value":$spmatB, 2802 "Value":$spmatC, 2803 "Type":$computeType, 2804 "Value":$bufferSz, 2805 "Value":$buffer), [{ 2806 auto modeA = gpu::TransposeMode::NON_TRANSPOSE; 2807 auto modeB = gpu::TransposeMode::NON_TRANSPOSE; 2808 auto kind = gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION; 2809 return build($_builder, $_state, bufferSzNew, asyncToken, asyncDependencies, desc, 2810 modeA, modeB, spmatA, spmatB, spmatC, computeType, bufferSz, buffer, kind);}]> 2811 ]; 2812 2813 let assemblyFormat = [{ 2814 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2815 `{` $kind `}` $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $desc `,` $bufferSz `,` $buffer attr-dict `:` $computeType `into` type($buffer) 2816 }]; 2817} 2818 2819def GPU_SpGEMMCopyOp : GPU_Op<"spgemm_copy", [GPU_AsyncOpInterface]> { 2820 let summary = "SpGEMM copy operation"; 2821 let description = [{ 2822 The `gpu.spgemm_copy` operation copies the sparse matrix result of 2823 a SpGEMM computation. 2824 2825 If the `async` keyword is present, the op is executed asynchronously (i.e. 2826 it does not block until the execution has finished on the device). In 2827 that case, it returns a `!gpu.async.token` in addition to the environment. 2828 2829 Example: 2830 2831 ```mlir 2832 gpu.spgemm_copy %spmatA, %spmatB, %spmatC, %spgemmDesc: f32 2833 ``` 2834 2835 The matrix arguments can also be associated with one of the following 2836 operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value 2837 is NON_TRANSPOSE. 2838 }]; 2839 2840 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 2841 GPU_SparseSpGEMMOpHandle:$desc, 2842 GPU_TransposeModeAttr:$modeA, 2843 GPU_TransposeModeAttr:$modeB, 2844 GPU_SparseSpMatHandle:$spmatA, 2845 GPU_SparseSpMatHandle:$spmatB, 2846 GPU_SparseSpMatHandle:$spmatC, 2847 TypeAttr:$computeType); 2848 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 2849 2850 let builders = [OpBuilder<(ins 2851 "Type":$asyncToken, 2852 "ValueRange":$asyncDependencies, 2853 "Value":$desc, 2854 "Value":$spmatA, 2855 "Value":$spmatB, 2856 "Value":$spmatC, 2857 "Type":$computeType), [{ 2858 auto modeA = gpu::TransposeMode::NON_TRANSPOSE; 2859 auto modeB = gpu::TransposeMode::NON_TRANSPOSE; 2860 return build($_builder, $_state, asyncToken, asyncDependencies, desc, 2861 modeA, modeB, spmatA, spmatB, spmatC, computeType);}]> 2862 ]; 2863 2864 let assemblyFormat = [{ 2865 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 2866 $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? 
`,` $spmatC `,` $desc attr-dict `:` $computeType
2867 }];
2868}
2869
2870def GPU_SpMatGetSizeOp : GPU_Op<"spmat_get_size", [GPU_AsyncOpInterface]> {
2871 let summary = "SpMat get size operation";
2872 let description = [{
2873 The `gpu.spmat_get_size` operation retrieves the number of rows, number of
2874 columns, and number of non-zero elements of a sparse matrix.
2875
2876 If the `async` keyword is present, the op is executed asynchronously (i.e.
2877 it does not block until the execution has finished on the device). In
2878 that case, it returns a `!gpu.async.token` in addition to the environment.
2879
2880 Example:
2881
2882 ```mlir
2883 %rows, %cols, %nnz, %token = gpu.spmat_get_size async [%dep] %spmatC
2884 ```
2885 }];
2886
2887 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2888 GPU_SparseSpMatHandle:$spmat);
2889 let results = (outs Index:$rows,
2890 Index:$cols,
2891 Index:$nnz,
2892 Optional<GPU_AsyncToken>:$asyncToken);
2893
2894 let assemblyFormat = [{
2895 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2896 $spmat attr-dict
2897 }];
2898}
2899
2900def GPU_SetCsrPointersOp : GPU_Op<"set_csr_pointers", [GPU_AsyncOpInterface]> {
2901 let summary = "Set CSR pointers operation";
2902 let description = [{
2903 The `gpu.set_csr_pointers` operation assigns the given positions, coordinates,
2904 and values buffers, which reside on the device, directly to the given sparse
2905 matrix descriptor in CSR format.
2906
2907 If the `async` keyword is present, the op is executed asynchronously (i.e.
2908 it does not block until the execution has finished on the device). In
2909 that case, it returns a `!gpu.async.token` in addition to the environment.
2910
2911 Example:
2912
2913 ```mlir
2914 %token = gpu.set_csr_pointers async [%dep] %spmat, %positions, %coordinates, %values
2915 : memref<?xindex>, memref<?xindex>, memref<?xf32>
2916 ```
2917 }];
2918
2919 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2920 Arg<GPU_SparseSpMatHandle>:$spmat,
2921 AnyMemRef:$positions,
2922 AnyMemRef:$coordinates,
2923 AnyMemRef:$values);
2924 let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
2925
2926 let assemblyFormat = [{
2927 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2928 $spmat `,` $positions `,` $coordinates `,` $values attr-dict
2929 `:` type($positions) `,` type($coordinates) `,` type($values)
2930 }];
2931}
2932
2933def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
2934 [DeclareOpInterfaceMethods<RegionBranchOpInterface, ["areTypesCompatible"]>,
2935 SingleBlockImplicitTerminator<"gpu::YieldOp">,
2936 RecursiveMemoryEffects]> {
2937 let summary = "Executes operations in the associated region on thread #0 of a "
2938 "SPMD program";
2939 let description = [{
2940 `warp_execute_on_lane_0` is an operation used to bridge the gap between
2941 vector programming and SPMD programming models like GPU SIMT. It allows
2942 trivially converting a region of vector code meant to run on multiple threads
2943 into a valid SPMD region, and then allows incrementally transforming it to
2944 distribute vector operations across the threads.
2945
2946 Any code present in the region is only executed on the first thread/lane,
2947 as determined by the `laneid` operand. The `laneid` operand is an integer ID between
2948 [0, `warp_size`). The `warp_size` attribute indicates the number of lanes in
2949 a warp.
2950
2951 Operands are vector values distributed on all lanes that may be used by
2952 the single lane execution.
The matching region argument is a vector of all
2953 the values of those lanes available to the single active lane. The
2954 distributed dimension is implicit based on the shape of the operand and
2955 argument. The properties of the distribution may be described by extra
2956 attributes (e.g. affine map).
2957
2958 Return values are distributed on all lanes using `laneid` as the index. The
2959 vector is distributed based on the shape ratio between the vector type of
2960 the yield and the result type.
2961 If the shapes are the same, the value is broadcast to all lanes.
2962 In the future, the distribution can be made more explicit using affine maps
2963 and will support having multiple IDs.
2964
2965 Therefore the `warp_execute_on_lane_0` operation allows implicitly copying
2966 between lane 0 and the other lanes of the warp. When distributing a vector
2967 from lane 0 to all the lanes, the data are distributed in a block cyclic way.
2968 For example `vector<64xf32>` gets distributed on 32 threads and maps to
2969 `vector<2xf32>` where thread 0 contains vector[0] and vector[1].
2970
2971 During lowering, values passed as operands and return values need to be
2972 visible to different lanes within the warp. This would usually be done by
2973 going through memory.
2974
2975 The region is *not* isolated from above. For values coming from the parent
2976 region that do not go through the operands, only the lane 0 value will be
2977 accessible, so this generally only makes sense for uniform values.
2978
2979 Example:
2980 ```
2981 // Execute in parallel on all threads/lanes.
2982 gpu.warp_execute_on_lane_0 (%laneid)[32] {
2983 // Serial code running only on thread/lane 0.
2984 ...
2985 }
2986 // Execute in parallel on all threads/lanes.
2987 ```
2988
2989 This may be lowered to an scf.if region as below:
2990 ```
2991 // Execute in parallel on all threads/lanes.
2992 %cnd = arith.cmpi eq, %laneid, %c0 : index
2993 scf.if %cnd {
2994 // Serial code running only on thread/lane 0.
2995 ...
2996 }
2997 // Execute in parallel on all threads/lanes.
2998 ```
2999
3000 When the region has operands and/or return values:
3001 ```
3002 // Execute in parallel on all threads/lanes.
3003 %0 = gpu.warp_execute_on_lane_0(%laneid)[32]
3004 args(%v0 : vector<4xi32>) -> (vector<1xf32>) {
3005 ^bb0(%arg0 : vector<128xi32>):
3006 // Serial code running only on thread/lane 0.
3007 ...
3008 gpu.yield %1 : vector<32xf32>
3009 }
3010 // Execute in parallel on all threads/lanes.
3011 ```
3012
3013 Values at the region boundary would go through memory:
3014 ```
3015 // Execute in parallel on all threads/lanes.
3016 ...
3017 // Store the data from each thread into memory, then synchronize.
3018 %tmp0 = memref.alloc() : memref<128xf32>
3019 %tmp1 = memref.alloc() : memref<32xf32>
3020 %cnd = arith.cmpi eq, %laneid, %c0 : index
3021 vector.store %v0, %tmp0[%laneid] : memref<128xf32>, vector<4xf32>
3022 some_synchronization_primitive
3023 scf.if %cnd {
3024 // Serialized code running only on thread 0.
3025 // Load the data from all the threads into a register from thread 0. This
3026 // allows thread 0 to access the data from all the threads.
3027 %arg0 = vector.load %tmp0[%c0] : memref<128xf32>, vector<128xf32>
3028 ...
3029 // Store the data from thread 0 into memory.
3030 vector.store %1, %tmp1[%c0] : memref<32xf32>, vector<32xf32>
3031 }
3032 // Synchronization and load the data in a block cyclic way so that the
3033 // vector is distributed on all threads.
3034 some_synchronization_primitive
3035 %0 = vector.load %tmp1[%laneid] : memref<32xf32>, vector<1xf32>
3036 // Execute in parallel on all threads/lanes.
3037 ```
3038
3039 }];
3040
3041 let hasVerifier = 1;
3042 let hasCustomAssemblyFormat = 1;
3043 let arguments = (ins Index:$laneid, I64Attr:$warp_size,
3044 Variadic<AnyType>:$args);
3045 let results = (outs Variadic<AnyType>:$results);
3046 let regions = (region SizedRegion<1>:$warpRegion);
3047
3048 let skipDefaultBuilders = 1;
3049 let builders = [
3050 OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
3051 "int64_t":$warpSize)>,
3052 // `blockArgTypes` are different from the `args` types, as they represent
3053 // all the `args` instances visible to lane 0. Therefore we need to
3054 // explicitly pass the types.
3055 OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
3056 "int64_t":$warpSize, "ValueRange":$args,
3057 "TypeRange":$blockArgTypes)>
3058 ];
3059
3060 let extraClassDeclaration = [{
3061 bool isDefinedOutsideOfRegion(Value value) {
3062 return !getRegion().isAncestor(value.getParentRegion());
3063 }
3064 }];
3065}
3066
3067#endif // GPU_OPS
3068