//===-- GPUOps.td - GPU dialect operation definitions ------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Defines some operations of the GPU dialect.
//
//===----------------------------------------------------------------------===//

#ifndef GPU_OPS
#define GPU_OPS

include "mlir/Dialect/DLTI/DLTIBase.td"
include "mlir/Dialect/GPU/IR/GPUBase.td"
include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
include "mlir/Dialect/GPU/IR/CompilationAttrs.td"
include "mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td"
include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td"
include "mlir/IR/CommonTypeConstraints.td"
include "mlir/IR/EnumAttr.td"
include "mlir/IR/RegionKindInterface.td"
include "mlir/IR/SymbolInterfaces.td"
include "mlir/Interfaces/ControlFlowInterfaces.td"
include "mlir/Interfaces/DataLayoutInterfaces.td"
include "mlir/IR/OpAsmInterface.td"
include "mlir/Interfaces/FunctionInterfaces.td"
include "mlir/Interfaces/InferIntRangeInterface.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"

//===----------------------------------------------------------------------===//
// GPU Dialect operations.
//===----------------------------------------------------------------------===//

class GPU_Op<string mnemonic, list<Trait> traits = []> :
    Op<GPU_Dialect, mnemonic, traits>;

def GPU_Dimension : I32EnumAttr<"Dimension",
    "a dimension, either 'x', 'y', or 'z'",
    [
      I32EnumAttrCase<"x", 0>,
      I32EnumAttrCase<"y", 1>,
      I32EnumAttrCase<"z", 2>
    ]>{
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::gpu";
}
def GPU_DimensionAttr : EnumAttr<GPU_Dialect, GPU_Dimension, "dim">;

class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
    GPU_Op<mnemonic, !listconcat(traits, [
        Pure,
        DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
        DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>])>,
    Arguments<(ins GPU_DimensionAttr:$dimension,
                   OptionalAttr<IndexAttr>:$upper_bound)>, Results<(outs Index)> {
  let assemblyFormat = "$dimension (`upper_bound` $upper_bound^)? attr-dict";
  let extraClassDefinition = [{
    void $cppClass::getAsmResultNames(
        llvm::function_ref<void(mlir::Value, mlir::StringRef)> setNameFn) {
      auto dimStr = stringifyDimension(getDimensionAttr().getValue());
      auto opName = getOperationName();
      opName.consume_front("gpu.");
      SmallString<8> resultName({opName, "_", dimStr});
      setNameFn(getResult(), resultName);
    }
  }];
  let builders = [
    OpBuilder<(ins "::mlir::gpu::Dimension":$dimension), [{
      build($_builder, $_state, dimension, /*upperBound=*/nullptr);
    }]>,
    OpBuilder<(ins "::mlir::Type":$resultType, "::mlir::gpu::Dimension":$dimension), [{
      build($_builder, $_state, resultType, dimension, /*upperBound=*/nullptr);
    }]>
  ];
}

def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
  let description = [{
    Returns the number of cluster identifiers per grid along
    the x, y, or z `dimension`.

    Example:

    ```mlir
    %cDimX = gpu.cluster_dim x
    ```

    If `upper_bound` is set, then executing (a lowering of) this operation in an
    environment where the clusters per grid is greater than `upper_bound` causes
    undefined behavior.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}

def GPU_ClusterDimBlocksOp : GPU_IndexOp<"cluster_dim_blocks"> {
  let description = [{
    Returns the number of thread blocks in the cluster along
    the x, y, or z `dimension`.

    Example:

    ```mlir
    %cDimBlocksX = gpu.cluster_dim_blocks x
    ```

    If `upper_bound` is set, then executing (a lowering of) this operation in an
    environment where the thread blocks per cluster is greater than `upper_bound`
    causes undefined behavior.

    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
  }];
}

def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
  let description = [{
    Returns the cluster id, i.e. the index of the current cluster within the
    grid along the x, y, or z `dimension`.

    Example:

    ```mlir
    %cIdY = gpu.cluster_id y
    ```

    If `upper_bound` is set, then executing (a lowering of) this operation in an
    environment where the number of clusters in the grid along `dimension` is
    greater than `upper_bound` causes undefined behavior.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}

def GPU_ClusterBlockIdOp : GPU_IndexOp<"cluster_block_id"> {
  let description = [{
    Returns the block id within the cluster along the x, y, or z `dimension`.

    Example:

    ```mlir
    %cBlockIdY = gpu.cluster_block_id y
    ```

    If `upper_bound` is set, then executing (a lowering of) this operation in an
    environment where the number of thread blocks per cluster along `dimension`
    is greater than `upper_bound` causes undefined behavior.

    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
  }];
}

def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
  let description = [{
    Returns the number of threads in the thread block (aka the block size) along
    the x, y, or z `dimension`.

    Example:

    ```mlir
    %bDimX = gpu.block_dim x
    ```

    If `known_block_size` is set on this operation's enclosing `gpu.func`,
    or `gpu.known_block_size` is set on an enclosing `FunctionOpInterface`
    implementor, or if the enclosing `gpu.launch` specifies a constant size for
    `dimension`'s blocks, these contextual facts may be used to infer that this
    operation has a constant value, though such a transformation will not be
    performed by canonicalization or the default constant folder. Executions which
    cause that constant-value assumption to be false incur undefined behavior.

    If `upper_bound` is set, executions where the block size along `dimension`
    exceeds `upper_bound` cause undefined behavior.
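
    For instance, a minimal sketch of the bounded form, following the
    `upper_bound` syntax of the index operations (the `1024` value is
    illustrative):

    ```mlir
    %bDimX_bounded = gpu.block_dim x upper_bound 1024
    ```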

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}
def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
  let description = [{
    Returns the block id, i.e. the index of the current block within the grid
    along the x, y, or z `dimension`.

    Example:

    ```mlir
    %bIdY = gpu.block_id y
    ```

    If `upper_bound` is set, or if one can be inferred from `known_grid_size`-type
    annotations in context, executions where the block index in `dimension` would
    be greater than or equal to that bound cause undefined behavior. `upper_bound`
    takes priority over bounds inferrable from context.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}
def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
  let description = [{
    Returns the number of thread blocks in the grid along the x, y, or z
    `dimension`.

    Example:

    ```mlir
    %gDimZ = gpu.grid_dim z
    ```

    If `known_grid_size` is set on this operation's enclosing `gpu.func`,
    or `gpu.known_grid_size` is set on an enclosing `FunctionOpInterface`
    implementor, or if the enclosing `gpu.launch` specifies a constant size for
    `dimension`'s grid length, these contextual facts may be used to infer that this
    operation has a constant value, though such a transformation will not be
    performed by canonicalization or the default constant folder. Executions which
    cause that constant-value assumption to be false incur undefined behavior.

    If `upper_bound` is set, executions where the grid size in `dimension` would
    exceed `upper_bound` cause undefined behavior.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}
def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
  let description = [{
    Returns the thread id, i.e. the index of the current thread within the block
    along the x, y, or z `dimension`.

    Example:

    ```mlir
    %tIdX = gpu.thread_id x
    ```

    If `upper_bound` is set, or if one can be inferred from `known_block_size`-type
    annotations in context, executions where the thread index would be greater
    than or equal to that bound cause undefined behavior.

    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  }];
}

def GPU_LaneIdOp : GPU_Op<"lane_id", [
      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
  let description = [{
    Returns the lane id within the subgroup (warp/wave).

    Example:
    ```mlir
    %laneId = gpu.lane_id
    ```

    If `upper_bound` is set, executions with more than `upper_bound` lanes per
    subgroup cause undefined behavior. In the absence of `upper_bound`,
    the lane id is still assumed to be non-negative and less than the
    target-independent `kMaxSubgroupSize` (currently 128).
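
    For instance, a minimal sketch of the bounded form, following the assembly
    format below (the `32` value is illustrative):

    ```mlir
    %laneId = gpu.lane_id upper_bound 32
    ```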
  }];
  let arguments = (ins OptionalAttr<IndexAttr>:$upper_bound);
  let results = (outs Index:$result);
  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict";
}

def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
    Results<(outs Index:$result)> {
  let description = [{
    Returns the subgroup id, i.e., the index of the current subgroup within the
    workgroup.

    Example:

    ```mlir
    %sgId = gpu.subgroup_id : index
    ```

    Executions where there are more than `upper_bound` subgroups per workgroup
    cause undefined behavior. There is an implicit upper bound of `kMaxDim`
    (currently uint32_t::max).
  }];

  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
}

def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
  let description = [{
    Returns the unique global workitem/thread id, i.e., the unique index of the
    current workitem/thread within all workgroups / grid along the x, y, or z
    `dimension`.

    Example:

    ```mlir
    %gidX = gpu.global_id x
    %gidX = gpu.global_id x upper_bound 65536
    ```

    The `upper_bound` attribute defines an upper bound analogously to the ones on
    `thread_id` and `block_id`. If one is not set, the bound may be inferred from
    a combination of `known_block_size` and `known_grid_size`-type annotations.
  }];
}

def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
    Results<(outs Index:$result)> {
  let description = [{
    Returns the number of subgroups within a workgroup.

    Example:

    ```mlir
    %numSg = gpu.num_subgroups : index
    ```

    If `upper_bound` is set, executions with more than `upper_bound` subgroups
    per workgroup cause undefined behavior. There is a default upper bound of
    `kMaxDim` (currently uint32_t::max).
  }];

  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
}

def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
    Results<(outs Index:$result)> {
  let description = [{
    Returns the number of threads within a subgroup.

    Example:

    ```mlir
    %sgSz = gpu.subgroup_size : index
    ```

    Executions where the number of threads per subgroup exceeds `upper_bound`
    cause undefined behavior. When no `upper_bound` is specified, range analyses
    and similar machinery assume the default bound of `kMaxSubgroupSize`,
    currently 128.
  }];

  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
}

def GPU_OptionalDimSizeHintAttr : ConfinedAttr<OptionalAttr<DenseI32ArrayAttr>,
  [AttrConstraint<Or<[IsNullAttr.predicate, DenseArrayCount<3>.predicate]>,
    "with 3 elements (if present)">]>;

def GPU_GPUFuncOp : GPU_Op<"func", [
    HasParent<"GPUModuleOp">, AutomaticAllocationScope, FunctionOpInterface,
    IsolatedFromAbove, AffineScope
  ]> {
  let summary = "Function executable on a GPU";

  let description = [{
    Defines a function that can be executed on a GPU. This supports memory
    attribution and its body has a particular execution model.

    GPU functions are either kernels (as indicated by the `kernel` attribute) or
    regular functions. The former can be launched from the host side, while the
    latter are device side only.

    The memory attribution defines SSA values that correspond to memory buffers
    allocated in the memory hierarchy of the GPU (see below).

    The operation has one attached region that corresponds to the body of the
    function. The region arguments consist of the function arguments without
    modification, followed by buffers defined in memory annotations. The body of
    a GPU function, when launched, is executed by multiple work items. There are
    no guarantees on the order in which work items execute, or on the connection
    between them. In particular, work items are not necessarily executed in
    lock-step. Synchronization ops such as "gpu.barrier" should be used to
    coordinate work items. Declarations of GPU functions, i.e. not having the
    body region, are not supported.

    A function may optionally be annotated with the block and/or grid sizes
    that will be used when it is launched using the `known_block_size` and
    `known_grid_size` attributes, respectively. If set, these attributes must
    be arrays of three 32-bit integers giving the x, y, and z launch dimensions.
    Launching a kernel that has these annotations, or that calls a function with
    these annotations, using a block size or grid size other than what is specified
    is undefined behavior. These attributes may be set on non-`gpu.func` functions
    by using `gpu.known_block_size` or `gpu.known_grid_size`, but this carries
    the risk that they will be discarded.
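
    For instance, a minimal sketch of a kernel annotated this way (the function
    name and launch dimensions are illustrative):

    ```mlir
    gpu.func @annotated_kernel() kernel
        attributes {known_block_size = array<i32: 128, 1, 1>,
                    known_grid_size = array<i32: 1024, 1, 1>} {
      gpu.return
    }
    ```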

    Syntax:

    ```
    op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->`
    function-result-list)?
           memory-attribution `kernel`? function-attributes? region

    memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
                           (`private` `(` ssa-id-and-type-list `)`)?
    ```

    Example:

    ```mlir
    gpu.func @foo(%arg0: index)
        workgroup(%workgroup: memref<32xf32, 3>)
        private(%private: memref<1xf32, 5>)
        kernel
        attributes {qux: "quux"} {
      gpu.return
    }
    ```

    The generic form illustrates the concept:

    ```mlir
    "gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({
    ^bb0(%arg0: index, %workgroup: memref<32xf32, 3>,
         %private: memref<1xf32, 5>):
      "gpu.return"() : () -> ()
    }) : (index) -> ()
    ```

    Note the non-default memory spaces used in memref types in memory
    attribution.
  }];

  let arguments = (ins TypeAttrOf<FunctionType>:$function_type,
                       OptionalAttr<DictArrayAttr>:$arg_attrs,
                       OptionalAttr<DictArrayAttr>:$res_attrs,
                       OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
                       OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
                       GPU_OptionalDimSizeHintAttr:$known_block_size,
                       GPU_OptionalDimSizeHintAttr:$known_grid_size);
  let regions = (region AnyRegion:$body);

  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilder<(ins "StringRef":$name, "FunctionType":$type,
      CArg<"TypeRange", "{}">:$workgroupAttributions,
      CArg<"TypeRange", "{}">:$privateAttributions,
      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
  ];

  let extraClassDeclaration = [{
    /// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
    /// it is intended to be launched from host.
    bool isKernel() {
      return (*this)->getAttrOfType<UnitAttr>(
          GPUDialect::getKernelFuncAttrName()) != nullptr;
    }

    /// Returns the number of buffers located in the workgroup memory.
    unsigned getNumWorkgroupAttributions() {
      auto attr = (*this)->getAttrOfType<IntegerAttr>(
          getNumWorkgroupAttributionsAttrName());
      return attr ? attr.getInt() : 0;
    }

    /// Return the index of the first workgroup attribution in the block argument
    /// list.
    unsigned getFirstWorkgroupAttributionIndex() {
      return getFunctionType().getNumInputs();
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the workgroup memory
    ArrayRef<BlockArgument> getWorkgroupAttributions() {
      auto begin =
          std::next(getBody().args_begin(), getFirstWorkgroupAttributionIndex());
      auto end = std::next(begin, getNumWorkgroupAttributions());
      return {begin, end};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// workgroup memory.
    BlockArgument addWorkgroupAttribution(Type type, Location loc);

    /// Get the workgroup attribution attribute dictionary for the attribution
    /// at index `index`, counted from the start of the workgroup attributions.
    DictionaryAttr getworkgroupAttributionAttrs(unsigned index);

    /// Set the workgroup attribution attribute dictionary for the attribution
    /// at index `index`, counted from the start of the workgroup attributions.
    void setworkgroupAttributionAttrs(unsigned index, DictionaryAttr value);

    /// Get an attribute for a workgroup attribution. `index` is counted
    /// from the start of the workgroup attributions, not the start of the block.
    Attribute getWorkgroupAttributionAttr(unsigned index, StringAttr name);
    Attribute getWorkgroupAttributionAttr(unsigned index, StringRef name) {
      return getWorkgroupAttributionAttr(index, StringAttr::get((*this)->getContext(), name));
    }

    /// Set an attribute for a workgroup attribution. `index` is counted
    /// from the start of the workgroup attributions, not the start of the block.
    /// A null `value` removes an attribution attribute.
    void setWorkgroupAttributionAttr(unsigned index, StringAttr name, Attribute value);
    void setWorkgroupAttributionAttr(unsigned index, StringRef name, Attribute value) {
      return setWorkgroupAttributionAttr(index, StringAttr::get((*this)->getContext(), name), value);
    }

    /// Returns the number of buffers located in the private memory.
    unsigned getNumPrivateAttributions() {
      return getBody().getNumArguments() - getFunctionType().getNumInputs() -
          getNumWorkgroupAttributions();
    }

    /// Returns the index of the first private buffer in the block argument list.
    unsigned getFirstPrivateAttributionIndex() {
      // Buffers on the private memory always come after buffers on the workgroup
      // memory.
      return getFunctionType().getNumInputs() + getNumWorkgroupAttributions();
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the private memory.
    ArrayRef<BlockArgument> getPrivateAttributions() {
      auto begin =
          std::next(getBody().args_begin(), getFirstPrivateAttributionIndex());
      return {begin, getBody().args_end()};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// private memory.
    BlockArgument addPrivateAttribution(Type type, Location loc);

    /// Get the private attribution attribute dictionary for the attribution
    /// at index `index`, counted from the start of the private attributions.
    DictionaryAttr getPrivateAttributionAttrs(unsigned index);

    /// Set the private attribution attribute dictionary for the attribution
    /// at index `index`, counted from the start of the private attributions.
    void setPrivateAttributionAttrs(unsigned index, DictionaryAttr value);

    /// Get an attribute for a private attribution. `index` is counted
    /// from the start of the private attributions, not the start of the block.
    Attribute getPrivateAttributionAttr(unsigned index, StringAttr name);
    Attribute getPrivateAttributionAttr(unsigned index, StringRef name) {
      return getPrivateAttributionAttr(index, StringAttr::get((*this)->getContext(), name));
    }

    /// Set an attribute for a private attribution. `index` is counted
    /// from the start of the private attributions, not the start of the block.
    /// A null `value` removes an attribute.
    void setPrivateAttributionAttr(unsigned index, StringAttr name, Attribute value);
    void setPrivateAttributionAttr(unsigned index, StringRef name, Attribute value) {
      return setPrivateAttributionAttr(index, StringAttr::get((*this)->getContext(), name), value);
    }

    /// Returns the name of the attribute containing the number of buffers
    /// located in the workgroup memory.
    static StringRef getNumWorkgroupAttributionsAttrName() {
      return "workgroup_attributions";
    }

    /// Returns the argument types of this function.
    ArrayRef<Type> getArgumentTypes() { return getFunctionType().getInputs(); }

    /// Returns the result types of this function.
    ArrayRef<Type> getResultTypes() { return getFunctionType().getResults(); }

    Region *getCallableRegion() { return &getBody(); }

    /// Returns the keywords used in the custom syntax for this Op.
    static StringRef getWorkgroupKeyword() { return "workgroup"; }
    static StringRef getPrivateKeyword() { return "private"; }
    static StringRef getKernelKeyword() { return "kernel"; }

    /// Hook for FunctionOpInterface verifier.
    LogicalResult verifyType();

    /// Verifies the body of the function.
    LogicalResult verifyBody();
  }];
  let hasCustomAssemblyFormat = 1;
}

def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
{
  let summary = "Get the memref for dynamic shared memory";

  let description = [{
    This operation provides a memref pointer to the start of dynamic shared
    memory, often referred to as workgroup memory. It's important to note that
    this dynamic shared memory needs to be allocated at kernel launch. One can
    conveniently utilize the `dynamic_shared_memory_size` parameter of
    `gpu.launch` for this purpose.

    Examples:
    ```mlir
    %0 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
    %1 = memref.view %0[%c8192][] : memref<?xi8, #gpu.address_space<workgroup>>
                            to memref<32x64xf32, #gpu.address_space<workgroup>>
    %2 = memref.view %0[%c16384][] : memref<?xi8, #gpu.address_space<workgroup>>
                            to memref<32x64xf32, #gpu.address_space<workgroup>>
    ```
  }];
  let arguments = (ins);
  let results = (outs Arg<MemRefRankOf<[I8], [1]>>:$resultMemref);
  let assemblyFormat = [{ attr-dict `:` type($resultMemref) }];
  let hasVerifier = 1;
}

def LaunchIndx : AnyTypeOf<[Index, I32, I64]>;

def GPU_LaunchFuncOp : GPU_Op<"launch_func", [
      GPU_AsyncOpInterface, AttrSizedOperandSegments,
      AllTypesMatch<["gridSizeX", "gridSizeY", "gridSizeZ", "blockSizeX",
                     "blockSizeY", "blockSizeZ"]>]>,
    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
               SymbolRefAttr:$kernel,
               LaunchIndx:$gridSizeX,
               LaunchIndx:$gridSizeY,
               LaunchIndx:$gridSizeZ,
               LaunchIndx:$blockSizeX,
               LaunchIndx:$blockSizeY,
               LaunchIndx:$blockSizeZ,
               Optional<LaunchIndx>:$clusterSizeX,
               Optional<LaunchIndx>:$clusterSizeY,
               Optional<LaunchIndx>:$clusterSizeZ,
               Optional<I32>:$dynamicSharedMemorySize,
               Variadic<AnyType>:$kernelOperands,
               Optional<AnyType>:$asyncObject)>,
    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
  let summary = "Launches a function as a GPU kernel";

  let description = [{
    Launch a kernel function on the specified grid of thread blocks.
    `gpu.launch` operations are lowered to `gpu.launch_func` operations by
    outlining the kernel body into a function in a dedicated module, which
    reflects the separate compilation process. The kernel function is required
    to have the `gpu.kernel` attribute. The module containing the kernel
    function is required to be a gpu.module. And finally, the module containing
    the kernel module (which thus cannot be the top-level module) is required
    to have the `gpu.container_module` attribute. The `gpu.launch_func`
    operation has a symbol attribute named `kernel` to identify the fully
    specified kernel function to launch (both the gpu.module and func).

    The `gpu.launch_func` supports async dependencies: the kernel does not start
    executing until the ops producing those async dependencies have completed.

    By default, the host implicitly blocks until kernel execution has
    completed. If the `async` keyword is present, the host does not block but
    instead a `!gpu.async.token` is returned. Other async GPU ops can take this
    token as dependency.

    The operation requires at least the grid and block sizes along the x,y,z
    dimensions as arguments. When a lower-dimensional kernel is required,
    unused sizes must be explicitly set to `1`.

    The remaining operands are optional. The first optional operand corresponds
    to the amount of dynamic shared memory a kernel's workgroup should be
    allocated; when this operand is not present, a zero size is assumed.

    The remaining operands if present are passed as arguments to the kernel
    function.

    The `gpu.launch_func` also supports kernel launching with clusters if
    supported by the target architecture. The cluster size can be set by
    `clusterSizeX`, `clusterSizeY`, and `clusterSizeZ` arguments. When these
    arguments are present, the Op launches a kernel that clusters the given
    thread blocks. This feature is exclusive to certain architectures.

    Example:

    ```mlir
    module attributes {gpu.container_module} {

      // This module creates a separate compilation unit for the GPU compiler.
      gpu.module @kernels {
        func.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>)
            attributes { nvvm.kernel = true } {

          // Operations that produce block/thread IDs and dimensions are
          // injected when outlining the `gpu.launch` body to a function called
          // by `gpu.launch_func`.
          %tIdX = gpu.thread_id x
          %tIdY = gpu.thread_id y
          %tIdZ = gpu.thread_id z

          %bDimX = gpu.block_dim x
          %bDimY = gpu.block_dim y
          %bDimZ = gpu.block_dim z

          %bIdX = gpu.block_id x
          %bIdY = gpu.block_id y
          %bIdZ = gpu.block_id z

          %gDimX = gpu.grid_dim x
          %gDimY = gpu.grid_dim y
          %gDimZ = gpu.grid_dim z

          // (Optional) Cluster size, only for supported architectures.
          %cIdX = gpu.cluster_id x
          %cIdY = gpu.cluster_id y
          %cIdZ = gpu.cluster_id z

          %cDimX = gpu.cluster_dim x
          %cDimY = gpu.cluster_dim y
          %cDimZ = gpu.cluster_dim z

          "some_op"(%bIdX, %tIdX) : (index, index) -> ()
          %42 = load %arg1[%bIdX] : memref<?xf32, 1>
        }
      }

      %t0 = gpu.wait async
      gpu.launch_func
          async                           // (Optional) Don't block host, return token.
          [%t0]                           // (Optional) Execute only after %t0 has completed.
          @kernels::@kernel_1             // Kernel function.
          clusters in (%cst, %cst, %cst)  // (Optional) Cluster size, only for supported architectures.
          blocks in (%cst, %cst, %cst)    // Grid size.
          threads in (%cst, %cst, %cst)   // Block size.
          dynamic_shared_memory_size %s   // (Optional) Amount of dynamic shared
                                          // memory to allocate for a workgroup.
          args(%arg0 : f32,               // (Optional) Kernel arguments.
               %arg1 : memref<?xf32, 1>)
    }
    ```
  }];

  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilder<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize,
      "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
      "ValueRange":$kernelOperands,
      CArg<"Type", "nullptr">:$asyncTokenType,
      CArg<"ValueRange", "{}">:$asyncDependencies,
      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>,
    OpBuilder<(ins "SymbolRefAttr":$kernel, "KernelDim3":$gridSize,
      "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
      "ValueRange":$kernelOperands,
      "Type":$asyncTokenType,
      CArg<"ValueRange", "{}">:$asyncDependencies,
      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>,
    OpBuilder<(ins "SymbolRefAttr":$kernel, "KernelDim3":$gridSize,
      "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
      "ValueRange":$kernelOperands,
      CArg<"Value", "nullptr">:$asyncObject,
      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>
  ];

  let extraClassDeclaration = [{
    /// The name of the kernel's containing module.
    StringAttr getKernelModuleName();

    /// The name of the kernel.
    StringAttr getKernelName();

    /// Returns true if cluster size is specified.
    bool hasClusterSize() {
      if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ())
        return true;
      return false;
    }

    /// The number of operands passed to the kernel function.
    unsigned getNumKernelOperands();

    /// The i-th operand passed to the kernel function.
    Value getKernelOperand(unsigned i);

    /// Get the SSA values passed as operands to specify the cluster size.
    /// When the cluster sizes are not specified, it asserts.
    KernelDim3 getClusterSizeOperandValues();

    /// Get the SSA values passed as operands to specify the grid size.
    KernelDim3 getGridSizeOperandValues();

    /// Get the SSA values passed as operands to specify the block size.
    KernelDim3 getBlockSizeOperandValues();

    // This needs to quietly verify if attributes with names defined below are
    // present since it is run before the verifier of this op.
    friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *,
                                                              NamedAttribute);
  }];

  let assemblyFormat = [{
      custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
      (`<` $asyncObject^ `:` type($asyncObject) `>`)?
      $kernel
      ( `clusters` `in` ` ` `(` $clusterSizeX^ `,` $clusterSizeY `,` $clusterSizeZ `)` )?
      `blocks` `in` ` ` `(` $gridSizeX `,` $gridSizeY `,` $gridSizeZ `)`
      `threads` `in` ` ` `(` $blockSizeX `,` $blockSizeY `,` $blockSizeZ `)`
      custom<LaunchDimType>(type($gridSizeX), ref($clusterSizeX), type($clusterSizeX), type($clusterSizeY), type($clusterSizeZ))
      (`dynamic_shared_memory_size` $dynamicSharedMemorySize^)?
      custom<LaunchFuncOperands>($kernelOperands, type($kernelOperands)) attr-dict
  }];
  let hasVerifier = 1;
}

def GPU_LaunchOp : GPU_Op<"launch", [
      AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface,
      DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
      RecursiveMemoryEffects]>,
    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
               Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
               Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
               Optional<Index>:$clusterSizeX,
               Optional<Index>:$clusterSizeY,
               Optional<Index>:$clusterSizeZ,
               Optional<I32>:$dynamicSharedMemorySize,
               OptionalAttr<SymbolRefAttr>:$kernelFunc,
               OptionalAttr<SymbolRefAttr>:$kernelModule)>,
    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
  let summary = "GPU kernel launch operation";

  let description = [{
    Launch a kernel on the specified grid of thread blocks. The body of the
    kernel is defined by the single region that this operation contains. The
    operation takes an optional list of async dependencies followed by six
    operands and an optional operand.

    The `async` keyword indicates the kernel should be launched asynchronously;
    the operation returns a new !gpu.async.token when the keyword is specified.
    The kernel launched does not start executing until the ops producing its
    async dependencies (optional operands) have completed.

    The first three operands (following any async dependencies) are grid sizes
    along the x,y,z dimensions and the following three are block sizes along the
    x,y,z dimensions. When a lower-dimensional kernel is required, unused sizes
    must be explicitly set to `1`. The last operand is optional and corresponds
    to the amount of dynamic shared memory a kernel's workgroup should be
    allocated; when this operand is not present, a zero size is assumed.

    The body region has at least _twelve_ arguments, or _eighteen_ if cluster
    dimensions are present, grouped as follows:

    -   three optional arguments that contain cluster identifiers along x,y,z
        dimensions;
    -   three arguments that contain block identifiers along x,y,z dimensions;
    -   three arguments that contain thread identifiers along x,y,z dimensions;
    -   operands of the `gpu.launch` operation as is (i.e. the operands for
        grid and block sizes).
    -   a variadic number of Workgroup memory attributions.
    -   a variadic number of Private memory attributions.

    The `kernelFunc` and `kernelModule` attributes are optional and specify
    the kernel name and a module in which the kernel should be outlined.

    Syntax:

    ```
    operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
                             ( `clusters` `(` ssa-id-list `)` `in` ssa-reassignment )?
                             `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
                             `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                             (dynamic_shared_memory_size ssa-use)?
                             memory-attribution
                             region attr-dict?
    ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
    memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
                           (`private` `(` ssa-id-and-type-list `)`)?
    ```

    Example:

    ```mlir
    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
               threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) {
      // Block and thread identifiers, as well as block/grid sizes are
      // immediately usable inside body region.
      "some_op"(%bx, %tx) : (index, index) -> ()
      // Assuming %val1 is defined outside the gpu.launch region.
      %42 = load %val1[%bx] : memref<?xf32, 1>
    }

    // Generic syntax explains how the pretty syntax maps to the IR structure.
    "gpu.launch"(%cst, %cst, %c1,  // Grid sizes.
                 %cst, %c1, %c1)   // Block sizes.

        {/*attributes*/}
        // All sizes and identifiers have "index" size.
        : (index, index, index, index, index, index) -> () {
    // The operation passes block and thread identifiers, followed by grid and
    // block sizes.
    ^bb0(%bx : index, %by : index, %bz : index,
         %tx : index, %ty : index, %tz : index,
         %num_bx : index, %num_by : index, %num_bz : index,
         %num_tx : index, %num_ty : index, %num_tz : index):
      "some_op"(%bx, %tx) : (index, index) -> ()
      %3 = "memref.load"(%val1, %bx) : (memref<?xf32, 1>, index) -> f32
    }

    // Launch with memory attributions.
    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
               threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
               workgroup(%workgroup: memref<32xf32, 3>)
               private(%private: memref<1xf32, 5>) {
      // Block and thread identifiers, as well as block/grid sizes are
      // immediately usable inside body region.
      "some_op"(%bx, %tx) : (index, index) -> ()
      // Assuming %val1 is defined outside the gpu.launch region.
      %42 = load %workgroup[%bx] : memref<32xf32, 3>
    }

    // Launch with clusters.
    gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
               blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
               threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
    {
      // Cluster, block and thread identifiers, as well as cluster/block/grid
      // sizes are immediately usable inside body region.
      "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
    }
    ```

    Rationale: using operation/block arguments gives analyses a clear way of
    understanding that a value has additional semantics (e.g., we will need to
    know what value corresponds to threadIdx.x for coalescing). We can recover
    these properties by analyzing the operations producing values, but it is
    easier just to have that information by construction.
  }];

  let regions = (region AnyRegion:$body);

  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilder<(ins "Value":$gridSizeX, "Value":$gridSizeY,
      "Value":$gridSizeZ, "Value":$blockSizeX, "Value":$blockSizeY,
      "Value":$blockSizeZ,
      CArg<"Value", "nullptr">:$dynamicSharedMemorySize,
      CArg<"Type", "nullptr">:$asyncTokenType,
      CArg<"ValueRange", "{}">:$asyncDependencies,
      CArg<"TypeRange", "{}">:$workgroupAttributions,
      CArg<"TypeRange", "{}">:$privateAttributions,
      CArg<"Value", "nullptr">:$clusterSizeX,
      CArg<"Value", "nullptr">:$clusterSizeY,
      CArg<"Value", "nullptr">:$clusterSizeZ)>
  ];

  let extraClassDeclaration = [{
    /// Get the SSA values corresponding to kernel block identifiers.
    KernelDim3 getBlockIds();
    /// Get the SSA values corresponding to kernel thread identifiers.
    KernelDim3 getThreadIds();
    /// Get the SSA values corresponding to kernel cluster identifiers.
    std::optional<KernelDim3> getClusterIds();
    /// Get the SSA values corresponding to kernel grid size.
    KernelDim3 getGridSize();
    /// Get the SSA values corresponding to kernel block size.
    KernelDim3 getBlockSize();
    /// Get the SSA values corresponding to kernel cluster size.
    std::optional<KernelDim3> getClusterSize();

    /// Get the SSA values passed as operands to specify the grid size.
    KernelDim3 getGridSizeOperandValues();
    /// Get the SSA values passed as operands to specify the block size.
    KernelDim3 getBlockSizeOperandValues();
    /// Get the SSA values passed as operands to specify the cluster size.
    std::optional<KernelDim3> getClusterSizeOperandValues();

    static StringRef getBlocksKeyword() { return "blocks"; }
    static StringRef getClustersKeyword() { return "clusters"; }
    static StringRef getThreadsKeyword() { return "threads"; }
    static StringRef getDynamicSharedMemorySizeKeyword() {
      return "dynamic_shared_memory_size";
    }

    /// The number of launch configuration operands, placed at the leading
    /// positions of the operand list.
    static constexpr unsigned kNumConfigOperands = 6;

    /// The number of region attributes containing the launch configuration,
    /// placed in the leading positions of the argument list.
    static constexpr unsigned kNumConfigRegionAttributes = 12;

    /// Returns true if cluster size is specified.
    bool hasClusterSize() {
      if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ())
        return true;
      return false;
    }
    /// Returns the number of operands including cluster size
    unsigned getNumConfigOperands() {
      return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
    }
    /// Returns the number of region attributes including cluster size
    unsigned getNumConfigRegionAttributes() {
      return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
    }

    /// Returns the keywords used in the custom syntax for this Op.
    static StringRef getWorkgroupKeyword() { return "workgroup"; }
    static StringRef getPrivateKeyword() { return "private"; }

    /// Returns the number of buffers located in the workgroup memory.
    unsigned getNumWorkgroupAttributions() {
      auto attr = (*this)->getAttrOfType<IntegerAttr>(
          getNumWorkgroupAttributionsAttrName());
      return attr ? attr.getInt() : 0;
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the workgroup memory
    ArrayRef<BlockArgument> getWorkgroupAttributions() {
      auto begin =
          std::next(getBody().args_begin(), getNumConfigRegionAttributes());
      auto end = std::next(begin, getNumWorkgroupAttributions());
      return {begin, end};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// workgroup memory.
    BlockArgument addWorkgroupAttribution(Type type, Location loc);

    /// Returns the number of buffers located in the private memory.
    unsigned getNumPrivateAttributions() {
      return getBody().getNumArguments() - getNumConfigRegionAttributes() -
          getNumWorkgroupAttributions();
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the private memory.
    ArrayRef<BlockArgument> getPrivateAttributions() {
      // Buffers on the private memory always come after buffers on the workgroup
      // memory.
      auto begin =
          std::next(getBody().args_begin(),
                    getNumConfigRegionAttributes() + getNumWorkgroupAttributions());
      return {begin, getBody().args_end()};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// private memory.
    BlockArgument addPrivateAttribution(Type type, Location loc);

    /// Returns the name of the attribute containing the number of buffers
    /// located in the workgroup memory.
    static StringRef getNumWorkgroupAttributionsAttrName() {
      return "workgroup_attributions";
    }
  }];

  let hasCanonicalizer = 1;
  let hasCustomAssemblyFormat = 1;
  let hasRegionVerifier = 1;
  let hasVerifier = 1;
}

def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,
  Arguments<(ins StrAttr:$format,
                Variadic<AnyTypeOf<[AnyInteger, Index, AnyFloat]>>:$args)> {
  let summary = "Device-side printf, as in CUDA or OpenCL, for debugging";
  let description = [{
    `gpu.printf` takes a literal format string `format` and an arbitrary number of
    scalar arguments that should be printed.

    The format string is a C-style printf string, subject to any restrictions
    imposed by one's target platform.
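
    Example (a minimal sketch following the assembly format below; the format
    string and the `%tIdX` operand are illustrative):

    ```mlir
    gpu.printf "Hello from thread %d\n", %tIdX : index
    ```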
  }];
  let assemblyFormat = [{
    $format attr-dict (`,` $args^ `:` type($args))?
  }];
}

def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, Pure,
                                     Terminator]>,
    Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> {
  let summary = "Terminator for GPU functions.";
  let description = [{
    A terminator operation for regions that appear in the body of `gpu.func`
    functions. The operands to the `gpu.return` are the result values returned
    by an invocation of the `gpu.func`.
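
    Example (an illustrative sketch; the function below is made up):

    ```mlir
    gpu.func @identity(%arg0 : f32) -> f32 {
      gpu.return %arg0 : f32
    }
    ```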
  }];

  let builders = [OpBuilder<(ins), [{ // empty}]>];

  let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
  let hasVerifier = 1;
}

def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">,
                                             Pure, Terminator]>,
    Arguments<(ins)>, Results<(outs)> {
  let summary = "Terminator for GPU launch regions.";
  let description = [{
    A terminator operation for regions that appear in the body of the
    `gpu.launch` operation. These regions are not expected to return any value
    so the terminator takes no operands.
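
    Example (an illustrative sketch; the `%c1` values are assumed to be defined
    elsewhere):

    ```mlir
    gpu.launch blocks(%bx, %by, %bz) in (%sbx = %c1, %sby = %c1, %sbz = %c1)
               threads(%tx, %ty, %tz) in (%stx = %c1, %sty = %c1, %stz = %c1) {
      gpu.terminator
    }
    ```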
  }];

  let assemblyFormat = "attr-dict";
}

def GPU_YieldOp : GPU_Op<"yield", [Pure, ReturnLike, Terminator]>,
    Arguments<(ins Variadic<AnyType>:$values)> {
  let summary = "GPU yield operation";
  let description = [{
    `gpu.yield` is a special terminator operation for blocks inside regions
    in gpu ops. It returns values to the immediately enclosing gpu op.

    Example:

    ```mlir
    gpu.yield %f0, %f1 : f32, f32
    ```
  }];

  let builders = [
    OpBuilder<(ins), [{ /* nothing to do */ }]>
  ];

  let assemblyFormat = "attr-dict ($values^ `:` type($values))?";
}

// These mirror the reduction combining kinds from the vector dialect.
def GPU_AllReduceOpAdd : I32EnumAttrCase<"ADD", 0, "add">;
def GPU_AllReduceOpMul : I32EnumAttrCase<"MUL", 1, "mul">;
def GPU_AllReduceOpMinUI : I32EnumAttrCase<"MINUI", 2, "minui">;
def GPU_AllReduceOpMinSI : I32EnumAttrCase<"MINSI", 3, "minsi">;
// Follows the `arith.minnumf` semantics.
def GPU_AllReduceOpMinnumF : I32EnumAttrCase<"MINNUMF", 4, "minnumf">;
def GPU_AllReduceOpMaxUI : I32EnumAttrCase<"MAXUI", 5, "maxui">;
def GPU_AllReduceOpMaxSI : I32EnumAttrCase<"MAXSI", 6, "maxsi">;
// Follows the `arith.maxnumf` semantics.
def GPU_AllReduceOpMaxnumF : I32EnumAttrCase<"MAXNUMF", 7, "maxnumf">;
def GPU_AllReduceOpAnd : I32EnumAttrCase<"AND", 8, "and">;
def GPU_AllReduceOpOr  : I32EnumAttrCase<"OR",  9, "or">;
def GPU_AllReduceOpXor : I32EnumAttrCase<"XOR", 10, "xor">;
// Follows the `arith.minimumf` semantics.
def GPU_AllReduceOpMinimumF : I32EnumAttrCase<"MINIMUMF", 11, "minimumf">;
// Follows the `arith.maximumf` semantics.
def GPU_AllReduceOpMaximumF : I32EnumAttrCase<"MAXIMUMF", 12, "maximumf">;

def GPU_AllReduceOperation : I32EnumAttr<"AllReduceOperation",
    "built-in reduction operations supported by gpu.all_reduce.",
    [
      GPU_AllReduceOpAdd,
      GPU_AllReduceOpMul,
      GPU_AllReduceOpMinUI,
      GPU_AllReduceOpMinSI,
      GPU_AllReduceOpMinnumF,
      GPU_AllReduceOpMaxUI,
      GPU_AllReduceOpMaxSI,
      GPU_AllReduceOpMaxnumF,
      GPU_AllReduceOpAnd,
      GPU_AllReduceOpOr,
      GPU_AllReduceOpXor,
      GPU_AllReduceOpMinimumF,
      GPU_AllReduceOpMaximumF
    ]>{
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::gpu";
}

def AnyIntegerOrFloat : AnyTypeOf<[AnySignlessInteger, AnyFloat], "Integer or Float">;

def GPU_AllReduceOperationAttr : EnumAttr<GPU_Dialect, GPU_AllReduceOperation,
                                          "all_reduce_op">;

def GPU_AllReduceOp : GPU_Op<"all_reduce",
    [SameOperandsAndResultType, IsolatedFromAbove]> {
  let summary = "Reduce values among workgroup.";
  let description = [{
    The `all_reduce` op reduces the value of every work item across a local
    workgroup. The result is equal for all work items of a workgroup.

    For example, both

    ```mlir
    %1 = gpu.all_reduce add %0 {} : (f32) -> (f32)
    %2 = gpu.all_reduce %0 {
    ^bb(%lhs : f32, %rhs : f32):
      %sum = arith.addf %lhs, %rhs : f32
      "gpu.yield"(%sum) : (f32) -> ()
    } : (f32) -> (f32)
    ```

    compute the sum of each work item's %0 value. The first version specifies
    the accumulation as operation, whereas the second version specifies the
    accumulation as code region. The reduction operation must be one of:
    *  Integer types: `add`, `mul`, `minui`, `minsi`, `maxui`, `maxsi`, `and`,
       `or`, `xor`
    *  Floating point types: `add`, `mul`, `minnumf`, `maxnumf`, `minimumf`,
       `maximumf`

    If the `uniform` flag is set, either none or all work items of a workgroup
    need to execute this op in convergence.
  }];

  let arguments = (ins
    AnyIntegerOrFloat:$value,
    OptionalAttr<GPU_AllReduceOperationAttr>:$op,
    UnitAttr:$uniform
  );
  let results = (outs AnyIntegerOrFloat:$result);

  let regions = (region AnyRegion:$body);
  let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
                          (`uniform` $uniform^)? $body attr-dict
                          `:` functional-type(operands, results) }];

  let hasFolder = 1;
  let hasRegionVerifier = 1;
}

def AnyIntegerOrFloatOr1DVector :
  AnyTypeOf<[AnyIntegerOrFloat, VectorOfRankAndType<[1], [AnyIntegerOrFloat]>]>;

def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]> {
  let summary = "Reduce values among subgroup.";
  let description = [{
    The `subgroup_reduce` op reduces the values of lanes (work items) across a
    subgroup.

    The subgroup is divided into clusters starting at lane index 0. Within each
    cluster, there are `size` lanes, and the lane index advances by `stride`.
    A reduction is done for each cluster in parallel: every lane in the cluster
    is reduced, and the result is equal for all lanes in the cluster. If `size`
    is omitted, there is a single cluster covering the entire subgroup. If
    `stride` is omitted, the stride is 1 (the cluster's lanes are contiguous).

    When the reduced value is of a vector type, each vector element is reduced
    independently. Only 1-d vector types are allowed.

    Example:

    ```mlir
    %1 = gpu.subgroup_reduce add %a : (f32) -> f32
    %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> vector<4xf16>
    %3 = gpu.subgroup_reduce add %c cluster(size = 4) : (f32) -> f32
    %4 = gpu.subgroup_reduce add %c cluster(size = 4, stride = 2) : (f32) -> f32
    ```

    If the `uniform` flag is set, either none or all lanes of a subgroup need to
    execute this op in convergence.

    The reduction operation must be one of:
    *  Integer types: `add`, `mul`, `minui`, `minsi`, `maxui`, `maxsi`, `and`,
       `or`, `xor`
    *  Floating point types: `add`, `mul`, `minnumf`, `maxnumf`, `minimumf`,
       `maximumf`
  }];

  let arguments = (ins
    AnyIntegerOrFloatOr1DVector:$value,
    GPU_AllReduceOperationAttr:$op,
    UnitAttr:$uniform,
    OptionalAttr<I32Attr>:$cluster_size,
    DefaultValuedAttr<I32Attr,"1">:$cluster_stride
  );
  let results = (outs AnyIntegerOrFloatOr1DVector:$result);

  let builders = [
    OpBuilder<(ins "Value":$value,
               "::mlir::gpu::AllReduceOperation":$op,
               "bool":$uniform), [{
      build($_builder, $_state, value, op, uniform, std::nullopt);
    }]>,
    OpBuilder<(ins "Value":$value,
               "::mlir::gpu::AllReduceOperation":$op,
               "bool":$uniform,
               "std::optional<uint32_t>":$cluster_size), [{
      build($_builder, $_state, value, op, uniform,
            cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr);
    }]>,
    OpBuilder<(ins "Value":$value,
               "::mlir::gpu::AllReduceOperation":$op,
               "bool":$uniform,
               "std::optional<uint32_t>":$cluster_size,
               "uint32_t":$cluster_stride), [{
      build($_builder, $_state, value, op, uniform,
            cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr,
            cluster_stride);
    }]>
  ];

  let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
                          (`uniform` $uniform^)?
                          (`cluster` `(` `size` `=` $cluster_size^ (`,` `stride` `=` $cluster_stride^)? `)`)?
                          attr-dict
                          `:` functional-type(operands, results) }];

  let hasFolder = 1;
  let hasVerifier = 1;
}

def GPU_ShuffleOpXor  : I32EnumAttrCase<"XOR",  0, "xor">;
def GPU_ShuffleOpDown : I32EnumAttrCase<"DOWN", 1, "down">;
def GPU_ShuffleOpUp   : I32EnumAttrCase<"UP",   2, "up">;
def GPU_ShuffleOpIdx  : I32EnumAttrCase<"IDX",  3, "idx">;

def GPU_ShuffleMode : I32EnumAttr<"ShuffleMode",
    "Indexing modes supported by gpu.shuffle.",
    [
      GPU_ShuffleOpXor, GPU_ShuffleOpUp, GPU_ShuffleOpDown, GPU_ShuffleOpIdx,
    ]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::gpu";
}
def GPU_ShuffleModeAttr : EnumAttr<GPU_Dialect, GPU_ShuffleMode,
                                   "shuffle_mode">;

def GPU_ShuffleOp : GPU_Op<
    "shuffle", [Pure, AllTypesMatch<["value", "shuffleResult"]>]>,
    Arguments<(ins AnyIntegerOrFloatOr1DVector:$value, I32:$offset, I32:$width,
               GPU_ShuffleModeAttr:$mode)>,
    Results<(outs AnyIntegerOrFloatOr1DVector:$shuffleResult, I1:$valid)> {
  let summary = "Shuffles values within a subgroup.";
1306  let description = [{
1307    The "shuffle" op moves values to a across lanes (a.k.a., invocations,
1308    work items) within the same subgroup. The `width` argument specifies the
1309    number of lanes that participate in the shuffle, and must be uniform
1310    across all lanes. Further, the first `width` lanes of the subgroup must
1311    be active.
1312
1313    The interpretation of the `offset` argument depends on the selected
1314    `mode`.
1315
1316    Returns the `shuffleResult` and `true` if the current lane id is smaller
1317    than `width`, and an unspecified value and `false` otherwise.
1318
1319    `xor` example:
1320
1321    ```mlir
1322    %1, %2 = gpu.shuffle xor %0, %offset, %width : f32
1323    ```
1324
1325    For lane `k`, returns the value `%0` from lane `k ^ offset`. Every lane
1326    trades value with exactly one other lane.
1327
1328    `down` example:
1329
1330    ```mlir
1331    %cst1 = arith.constant 1 : i32
1332    %3, %4 = gpu.shuffle down %0, %cst1, %width : f32
1333    ```
1334
1335    For lane `k`, returns the value from lane `(k + 1) % width`.
1336
1337    `up` example:
1338
1339    ```mlir
1340    %cst1 = arith.constant 1 : i32
1341    %5, %6 = gpu.shuffle up %0, %cst1, %width : f32
1342    ```
1343
1344    For lane `k`, returns the value from lane `(k - 1) % width`.
1345
1346    `idx` example:
1347
1348    ```mlir
1349    %cst0 = arith.constant 0 : i32
1350    %7, %8 = gpu.shuffle idx %0, %cst0, %width : f32
1351    ```
1352
1353    Broadcasts the value from lane 0 to all lanes.
1354  }];
1355
1356  let assemblyFormat = [{
1357    $mode $value `,` $offset `,` $width attr-dict `:` type($value)
1358  }];
1359
1360  let builders = [
1361    // Helper function that creates a shuffle with constant offset/width.
1362    OpBuilder<(ins "Value":$value, "int32_t":$offset, "int32_t":$width,
1363                   "ShuffleMode":$mode)>
1364  ];
1365}
1366
1367def GPU_BarrierOp : GPU_Op<"barrier"> {
1368  let summary = "Synchronizes all work items of a workgroup.";
1369  let description = [{
1370    The "barrier" op synchronizes all work items of a workgroup. It is used
1371    to coordinate communication between the work items of the workgroup.
1372
1373    ```mlir
1374    gpu.barrier
1375    ```
1376
1377    waits until all work items in the workgroup have reached this point
1378    and all memory accesses made by these work items prior to the op are
1379    visible to all work items in the workgroup. Data hazards between work items
1380    accessing the same memory can be avoided by synchronizing work items
1381    in-between these accesses.
1382
1383    Either none or all work items of a workgroup need to execute this op
1384    in convergence.
1385  }];
1386  let assemblyFormat = "attr-dict";
1387  let hasCanonicalizer = 1;
1388}
1389
1390def GPU_GPUModuleOp : GPU_Op<"module", [
1391      DataLayoutOpInterface, HasDefaultDLTIDataLayout, IsolatedFromAbove,
1392      NoRegionArguments, SymbolTable, Symbol] # GraphRegionNoTerminator.traits> {
1393  let summary = "A top level compilation unit containing code to be run on a GPU.";
1394  let description = [{
1395    A GPU module contains code that is intended to be run on a GPU. A host
1396    device can launch this code through a gpu.launch_func operation that creates
1397    a fully qualified symbol from the gpu.module's symbol and a gpu.func symbol
1398    contained in the gpu.module.
1399
1400    The module's top-level scope is modeled by a single region with a single
1401    block. GPU modules are required to have a name that is used for symbol
1402    resolution by the gpu.launch_func operation.
1403
1404    Using an op with a region to define a GPU module enables "embedding" GPU
1405    modules with SIMT execution models in other dialects in a clean manner and
1406    allows filtering of code regions so that passes can be run only on the code
1407    that is (or is not) intended to run on the separate device.
1408
1409    Modules can contain zero or more target attributes. These attributes encode
1410    how to transform modules into binary strings and are used by the
1411    `gpu-module-to-binary` pass to transform modules into GPU binaries.
1412
1413    Modules can contain an optional `OffloadingTranslationAttr` attribute. This
1414    attribute will be used during the `gpu-module-to-binary` pass to specify the
1415    `OffloadingTranslationAttr` used when creating the `gpu.binary` operation.
1416
1417    ```
1418    gpu.module @symbol_name {
1419      gpu.func {}
1420        ...
1421    }
1422    // Module with offloading handler and target attributes.
1423    gpu.module @symbol_name2 <#gpu.select_object<1>> [
1424        #nvvm.target,
1425        #rocdl.target<chip = "gfx90a">] {
1426      gpu.func {}
1427        ...
1428    }
1429    ```
1430  }];
1431  let builders = [
1432    OpBuilder<(ins "StringRef":$name,
1433                   CArg<"ArrayAttr", "{}">:$targets,
1434                   CArg<"Attribute", "{}">:$handler)>,
1435    OpBuilder<(ins "StringRef":$name,
1436                   "ArrayRef<Attribute>":$targets,
1437                   CArg<"Attribute", "{}">:$handler)>
1438  ];
1439
1440  let arguments = (ins
1441      SymbolNameAttr:$sym_name,
1442      OptionalAttr<GPUNonEmptyTargetArrayAttr>:$targets,
1443      OptionalAttr<OffloadingTranslationAttr>:$offloadingHandler);
1444  let regions = (region SizedRegion<1>:$bodyRegion);
1445  let assemblyFormat = [{
1446    $sym_name
1447    (`<` $offloadingHandler^ `>`)?
1448    ($targets^)?
1449    attr-dict-with-keyword $bodyRegion
1450  }];
1451
1452  // We need to ensure the block inside the region is properly terminated;
1453  // the auto-generated builders do not guarantee that.
1454  let skipDefaultBuilders = 1;
1455
1456  let extraClassDeclaration = [{
1457    /// Checks if `target` is in the `targets` list.
1458    bool hasTarget(Attribute target);
1459
1460    /// Sets the targets of the module.
1461    void setTargets(ArrayRef<TargetAttrInterface> targets);
1462  }];
1463}
1464
1465def GPU_BinaryOp : GPU_Op<"binary", [Symbol]>, Arguments<(ins
1466      SymbolNameAttr:$sym_name,
1467      OptionalAttr<OffloadingTranslationAttr>:$offloadingHandler,
1468      ConfinedAttr<GPUObjectArrayAttr, [ArrayMinCount<1>]>:$objects)
1469    > {
1470  let summary = "An Op for storing serialized GPU binary objects.";
1471  let description = [{
1472    GPU binaries provide a semantic mechanism for storing GPU objects,
1473    e.g. the result of compiling a GPU module to an object file.
1474
1475    This operation has 3 arguments:
1476     - The name of the binary.
1477     - An optional attribute implementing the offloading LLVM translation interface.
1478     - An array of GPU object attributes.
1479
1480    During translation, the offloading attribute will be called for translating
1481    GPU `binary` and `launch_func` operations. The default offloading handler is
1482    `#gpu.select_object`; this handler selects the first object from the array
1483    and embeds it as a string.
1484
1485    Examples:
1486    ```
1487      // Selects the first object.
1488      gpu.binary @myobject [#gpu.object<...>, #gpu.object<...>]
1489      // Uses the `#foo.my_handler` for handling the binary during translation.
1490      gpu.binary @myobject <#foo.my_handler> [#gpu.object<...>, #gpu.object<...>]
1491      // Selects the object with the `#rocdl.target` target attribute.
1492      gpu.binary @myobject <#gpu.select_object<#rocdl.target>> [#gpu.object<...>, #gpu.object<#rocdl.target, ...>]
1493    ```
1494  }];
1495  let builders = [
1496    OpBuilder<(ins "StringRef":$name,
1497                   "Attribute":$offloadingHandler,
1498                   "ArrayAttr":$objects)>,
1499    OpBuilder<(ins "StringRef":$name,
1500                   "Attribute":$offloadingHandler,
1501                   "ArrayRef<Attribute>":$objects)>
1502  ];
1503  let skipDefaultBuilders = 1;
1504  let assemblyFormat = [{
1505    $sym_name custom<OffloadingHandler>($offloadingHandler) attr-dict $objects
1506  }];
1507}
1508
1509def GPU_HostRegisterOp : GPU_Op<"host_register">,
1510    Arguments<(ins AnyUnrankedMemRef:$value)> {
1511  let summary = "Registers a memref for access from device.";
1512  let description = [{
1513    This op maps the provided host buffer into the device address space.
1514
1515    This operation may not be supported in every environment; there is currently
1516    no way to check at runtime whether this feature is supported.
1517
1518    Writes from the host are guaranteed to be visible to device kernels that are
1519    launched afterwards. Writes from the device are guaranteed to be visible on
1520    the host after synchronizing with the device kernel completion.
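
    Example (a minimal usage sketch; `%buf` is a hypothetical host-allocated
    buffer that is cast to an unranked memref before registration):

    ```mlir
    %buf = memref.alloc() : memref<8xf32>
    %cast = memref.cast %buf : memref<8xf32> to memref<*xf32>
    gpu.host_register %cast : memref<*xf32>
    ```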
1521  }];
1522
1523  let assemblyFormat = "$value attr-dict `:` type($value)";
1524}
1525
1526def GPU_HostUnregisterOp : GPU_Op<"host_unregister">,
1527    Arguments<(ins AnyUnrankedMemRef:$value)> {
1528  let summary = "Unregisters a memref for access from device.";
1529  let description = [{
1530    This op unmaps the provided host buffer from the device address space.
1531
1532    This operation may not be supported in every environment; there is currently
1533    no way to check at runtime whether this feature is supported.
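
    Example (a minimal usage sketch; `%cast` is assumed to be an unranked memref
    that was previously registered with `gpu.host_register`):

    ```mlir
    gpu.host_unregister %cast : memref<*xf32>
    ```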
1534  }];
1535
1536  let assemblyFormat = "$value attr-dict `:` type($value)";
1537}
1538
1539def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
1540  let summary = "Wait for async gpu ops to complete.";
1541  let description = [{
1542    This op synchronizes the host or the device with a list of dependent ops.
1543
1544    If the op contains the `async` keyword, it returns a new async token which
1545    is synchronized with the op arguments. This new token is merely a shortcut
1546    to the argument list, and one could replace the uses of the result with the
1547    arguments for the same effect. The async version of this op is primarily
1548    used to make each async token have a single use during lowering and
1549    thereby make forks in async execution explicit. Example usage:
1550
1551    ```mlir
1552    %t0 = gpu.foo async : !gpu.async.token
1553    %t1 = gpu.bar async : !gpu.async.token
1554    %t2 = gpu.wait async [%t0, %t1]
1555    // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just
1556    // as if the async dependencies were [%t0, %t1].
1557    %t3 = gpu.baz async [%t2]
1558    ```
1559
1560    If the op does not contain the `async` keyword, it does not return a new
1561    async token but blocks until all ops producing the async dependency tokens
1562    finished execution. All dependent memory operations are visible to the host
1563    once this op completes. Example usage:
1564
1565    ```mlir
1566    %t0 = gpu.foo async : !gpu.async.token
1567    %t1 = gpu.bar async : !gpu.async.token
1568    // The gpu.wait op blocks until gpu.foo and gpu.bar have completed.
1569    gpu.wait [%t0, %t1]
1570    ```
1571  }];
1572
1573  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
1574  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
1575
1576  let assemblyFormat = [{
1577    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
1578  }];
1579
1580  let hasCanonicalizer = 1;
1581}
1582
1583def GPU_AllocOp : GPU_Op<"alloc", [
1584    GPU_AsyncOpInterface,
1585    AttrSizedOperandSegments
1586  ]> {
1587
1588  let summary = "GPU memory allocation operation.";
1589  let description = [{
1590    The `gpu.alloc` operation allocates a region of memory on the GPU. It is
1591    similar to the `memref.alloc` op, but supports asynchronous GPU execution.
1592
1593    The op does not execute before all async dependencies have finished
1594    executing.
1595
1596    If the `async` keyword is present, the op is executed asynchronously (i.e.
1597    it does not block until the execution has finished on the device). In
1598    that case, it also returns a !gpu.async.token.
1599
1600    If the `host_shared` keyword is present, the memory will be allocated in a
1601    memory space that is accessible both from the host and from the device.
1602
1603    Example:
1604
1605    ```mlir
1606    %memref, %token = gpu.alloc async [%dep] host_shared (%width) : memref<64x?xf32, 1>
1607    ```
1608  }];
1609
1610  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
1611                   Variadic<Index>:$dynamicSizes, Variadic<Index>:$symbolOperands,
1612                   UnitAttr:$hostShared);
1613  let results = (outs Res<AnyMemRef, "", [MemAllocAt<0, FullEffect>]>:$memref,
1614                 Optional<GPU_AsyncToken>:$asyncToken);
1615
1616  let extraClassDeclaration = [{
1617    MemRefType getType() { return ::llvm::cast<MemRefType>(getMemref().getType()); }
1618  }];
1619
1620  let assemblyFormat = [{
1621    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) (` ` `host_shared` $hostShared^)? ` `
1622    `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? attr-dict `:` type($memref)
1623  }];
1624
1625  let hasVerifier = 1;
1626  let hasCanonicalizer = 1;
1627}
1628
1629def GPU_DeallocOp : GPU_Op<"dealloc", [GPU_AsyncOpInterface]> {
1630
1631  let summary = "GPU memory deallocation operation";
1632
1633  let description = [{
1634    The `gpu.dealloc` operation frees the region of memory referenced by a
1635    memref which was originally created by the `gpu.alloc` operation. It is
1636    similar to the `memref.dealloc` op, but supports asynchronous GPU execution.
1637
1638    The op does not execute before all async dependencies have finished
1639    executing.
1640
1641    If the `async` keyword is present, the op is executed asynchronously (i.e.
1642    it does not block until the execution has finished on the device). In
1643    that case, it returns a !gpu.async.token.
1644
1645    Example:
1646
1647    ```mlir
1648    %token = gpu.dealloc async [%dep] %memref : memref<8x64xf32, 1>
1649    ```
1650  }];
1651
1652  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
1653                   Arg<AnyMemRef, "", [MemFreeAt<0, FullEffect>]>:$memref);
1654  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
1655
1656  let assemblyFormat = [{
1657    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
1658    $memref attr-dict `:` type($memref)
1659  }];
1660}
1661
1662def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface]> {
1663
1664  let summary = "GPU memcpy operation";
1665
1666  let description = [{
1667    The `gpu.memcpy` operation copies the content of one memref to another.
1668
1669    The op does not execute before all async dependencies have finished
1670    executing.
1671
1672    If the `async` keyword is present, the op is executed asynchronously (i.e.
1673    it does not block until the execution has finished on the device). In
1674    that case, it returns a !gpu.async.token.
1675
1676    Example:
1677
1678    ```mlir
1679    %token = gpu.memcpy async [%dep] %dst, %src : memref<?xf32, 1>, memref<?xf32>
1680    ```
1681  }];
1682
1683  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
1684                   Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
1685                   Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src);
1686  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
1687
1688  let assemblyFormat = [{
1689    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
1690    $dst`,` $src `:` type($dst)`,` type($src) attr-dict
1691  }];
1692  let hasFolder = 1;
1693  let hasVerifier = 1;
1694  let hasCanonicalizer = 1;
1695}
1696
1697def GPU_MemsetOp : GPU_Op<"memset",
1698  [GPU_AsyncOpInterface, AllElementTypesMatch<["dst", "value"]>]> {
1699
1700  let summary = "GPU memset operation";
1701
1702  let description = [{
1703    The `gpu.memset` operation sets the content of memref to a scalar value.
1704
1705    The op does not execute before all async dependencies have finished
1706    executing.
1707
1708    If the `async` keyword is present, the op is executed asynchronously (i.e.
1709    it does not block until the execution has finished on the device). In
1710    that case, it returns a !gpu.async.token.
1711
1712    Example:
1713
1714    ```mlir
1715    %token = gpu.memset async [%dep] %dst, %value : memref<?xf32, 1>, f32
1716    ```
1717  }];
1718
1719  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
1720                   Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
1721                   Arg<AnyType, "">:$value);
1722  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
1723
1724  let assemblyFormat = [{
1725    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
1726    $dst`,` $value `:` type($dst)`,` type($value) attr-dict
1727  }];
1728  let hasFolder = 1;
1729}
1730
1731def GPU_SetDefaultDeviceOp : GPU_Op<"set_default_device",
1732                                    [MemoryEffects<[MemWrite]>]>,
1733    Arguments<(ins I32:$devIndex)> {
1734  let summary = "Set default GPU for operations after this by index";
1735  let description = [{
1736    Operation that sets the current default GPU, using a zero-based index
1737    into the set of GPUs on the system. The default GPU setting may be
1738    thread-local.
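
    Example (a minimal usage sketch; the zero-based device index is produced
    with `arith.constant`):

    ```mlir
    %c0 = arith.constant 0 : i32
    gpu.set_default_device %c0
    ```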
1739  }];
1740  let assemblyFormat = "attr-dict $devIndex";
1741}
1742
1743def GPU_SubgroupMmaLoadMatrixOp : GPU_Op<"subgroup_mma_load_matrix",
1744    [MemoryEffects<[MemRead]>]>{
1745
1746  let summary = "GPU warp synchronous matrix load";
1747
1748  let description = [{
1749    The `gpu.subgroup_mma_load_matrix` operation loads a matrix collectively
1750    using all the threads in a subgroup.
1751
1752    This operation takes a memref as its first operand: it is the source matrix
1753    from which data is to be loaded. The op returns a `!gpu.mma_matrix`. The
1754    source memref can be in global memory or shared memory. The load address is
1755    determined using `indices`. The result of the op is the loaded matrix. The
1756    `leadDimension` attribute specifies the leading dimension size of the source
1757    matrix which eventually allows the lowering to determine the size of each
1758    row.  If the `transpose` attribute is present then the op does a transposed load.
1759
1760    For integer types, the resulting `!gpu.mma_matrix` type needs to specify the
1761    signedness of the data if the matrix type is an `A` or `B` operand for
1762    `gpu.subgroup_mma_compute`.
1763
1764    This op is often meant to be used along with `gpu.subgroup_mma_store_matrix` and
1765    `gpu.subgroup_mma_compute`.
1766
1767    Example:
1768
1769    ```mlir
1770     %0 = gpu.subgroup_mma_load_matrix %src[%i, %j] {leadDimension = 32 : index}
1771          : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
1772    ```
1773  }];
1774
1775  let arguments = (ins Arg<GPU_MMAMemRef, "",
1776                          [MemReadAt<0, FullEffect>]>:$srcMemref,
1777                  Variadic<Index>:$indices,
1778                  IndexAttr:$leadDimension,
1779                  OptionalAttr<UnitAttr>:$transpose);
1780
1781  let results = (outs GPU_MMAMatrix:$res);
1782
1783  let assemblyFormat = [{
1784    $srcMemref`[`$indices`]` attr-dict `:` type($srcMemref) `->` type($res)
1785  }];
1786  let hasVerifier = 1;
1787}
1788
1789def GPU_SubgroupMmaStoreMatrixOp : GPU_Op<"subgroup_mma_store_matrix",
1790    [MemoryEffects<[MemWrite]>]>{
1791
1792  let summary = "GPU warp synchronous matrix store";
1793
1794  let description = [{
1795    The `gpu.subgroup_mma_store_matrix` operation stores a matrix collectively
1796    using all the threads in a subgroup.
1797
1798    This operation takes a `!gpu.mma_matrix` and a memref as operands.
1799    `!gpu.mma_matrix` is the source value containing the data to be stored into the
1800    destination memref which can be in global or shared memory.  The store address
1801    is determined using the indices provided. The `leadDimension` attribute
1802    specifies the leading dimension of the destination matrix. If the
1803    `transpose` attribute is present then the op does a transposed store.
1804
1805    This op is often meant to be used along with `gpu.subgroup_mma_load_matrix` and
1806    `gpu.subgroup_mma_compute`.
1807
1808    Example:
1809
1810    ```mlir
1811    gpu.subgroup_mma_store_matrix %D, %sg[%i, %j] {leadDimension = 32 : index}
1812                    : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, 3>
1813    ```
1814  }];
1815
1816  let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, I32, F16, F32]>>:$src,
1817                  Arg<GPU_MMAMemRef, "",[MemWriteAt<0, FullEffect>]>:$dstMemref,
1818                  Variadic<Index>:$indices,
1819                  IndexAttr:$leadDimension,
1820                  OptionalAttr<UnitAttr>:$transpose);
1821
1822  let assemblyFormat = [{
1823    $src`,` $dstMemref`[`$indices`]` attr-dict `:` type($src)`,` type($dstMemref)
1824  }];
1825  let hasVerifier = 1;
1826}
1827
1828def GPU_SubgroupMmaComputeOp
1829    : GPU_Op<"subgroup_mma_compute", [Pure, AllTypesMatch<["opC", "res"]>]> {
1830
1831  let summary = "GPU warp synchronous matrix multiply accumulate";
1832
1833  let description = [{
1834    The `gpu.subgroup_mma_compute` operation performs a matrix-multiply accumulate (mma)
1835    operation using all the threads in a subgroup.
1836
1837    This operation takes three `!gpu.mma_matrix`s as arguments: these hold the `A`,
1838    `B`, and `C` operands for the mma operation. The operation performed is represented
1839    as `C += A * B`. The op returns a `!gpu.mma_matrix` which contains the result of
1840    the operation held by all threads in a subgroup. `a_transpose` or
1841    `b_transpose`, if present, signify that the respective operand was loaded in a
1842    transposed manner. The transpose attributes are required to map to the correct
1843    underlying intrinsics, but they currently do not seem to affect correctness
1844    even when absent, provided that the operands were loaded correctly using
1845    the `transpose` attribute in the `gpu.subgroup_mma_load_matrix` op.
1846
1847    For integer types, the `A` and `B` matrices carry their signedness with their
1848    types. The accumulator type is expected to be signless and imply a signed integer
1849    with a greater width than the other two operands.
1850
1851    This op is meant to be used along with `gpu.subgroup_mma_store_matrix` and
1852    `gpu.subgroup_mma_load_matrix` ops.
1853
1854    Example:
1855
1856    ```mlir
1857    %D = gpu.subgroup_mma_compute %A, %B, %C :
1858      !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp">
1859      -> !gpu.mma_matrix<16x16xf16, "COp">
1860    ```
1861  }];
1862
1863  let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, F16, F32]>>:$opA,
1864                  Arg<MMAMatrixOf<[SI8, UI8, F16, F32]>>:$opB,
1865                  Arg<MMAMatrixOf<[I32, F16, F32]>>:$opC,
1866                  OptionalAttr<UnitAttr>:$a_transpose,
1867                  OptionalAttr<UnitAttr>:$b_transpose);
1868
1869  let results = (outs GPU_MMAMatrix : $res);
1870
1871  let assemblyFormat = [{
1872    $opA`,` $opB`,` $opC attr-dict `:` type($opA)`,` type($opB) `->` type($res)
1873  }];
1874  let hasVerifier = 1;
1875}
1876
1877def GPU_SubgroupMmaConstantMatrixOp : GPU_Op<"subgroup_mma_constant_matrix",
1878    [Pure,
1879     TypesMatchWith<"value type matches element type of mma_matrix",
1880                    "res", "value",
1881                    "::llvm::cast<gpu::MMAMatrixType>($_self).getElementType()">]>{
1882
1883  let summary = "GPU warp synchronous constant matrix";
1884
1885  let description = [{
1886    The `gpu.subgroup_mma_constant_matrix` op creates a `!gpu.mma_matrix` with
1887    constant elements.
1888
1889    The operation takes a scalar input and returns a `!gpu.mma_matrix` where
1890    each element is equal to the operand constant. The destination mma_matrix
1891    type must have an element type equal to the constant type. Since the layout
1892    of `!gpu.mma_matrix` is opaque, this op only supports setting all the
1893    elements to the same value.
1894
1895    This op is meant to be used along with `gpu.subgroup_mma_compute`.
1896
1897    Example:
1898
1899    ```mlir
1900     %0 = gpu.subgroup_mma_constant_matrix %a :
1901       !gpu.mma_matrix<16x16xf16, "AOp">
1902     %1 = gpu.subgroup_mma_constant_matrix %b :
1903       !gpu.mma_matrix<16x16xf32, "COp">
1904    ```
1905  }];
1906
1907  let arguments = (ins AnyTypeOf<[SI8, UI8, I32, F16, F32]>:$value);
1908
1909  let results = (outs GPU_MMAMatrix:$res);
1910
1911  let extraClassDeclaration = [{
1912    gpu::MMAMatrixType getType() {
1913      return ::llvm::cast<gpu::MMAMatrixType>(getRes().getType());
1914    }
1915  }];
1916
1917  let assemblyFormat = [{
1918    $value attr-dict `:` type($res)
1919  }];
1920}
1921
1922def GPU_ElementwiseOpAddF  : I32EnumAttrCase<"ADDF", 0, "addf">;
1923def GPU_ElementwiseOpMulF  : I32EnumAttrCase<"MULF", 1, "mulf">;
1924def GPU_ElementwiseOpSUBF  : I32EnumAttrCase<"SUBF", 2, "subf">;
1925def GPU_ElementwiseOpMaxF : I32EnumAttrCase<"MAXF", 3, "maxf">;
1926def GPU_ElementwiseOpMinF : I32EnumAttrCase<"MINF", 4, "minf">;
1927def GPU_ElementwiseOpDivF : I32EnumAttrCase<"DIVF", 5, "divf">;
1928def GPU_ElementwiseOpAddI  : I32EnumAttrCase<"ADDI", 6, "addi">;
1929def GPU_ElementwiseOpMulI  : I32EnumAttrCase<"MULI", 7, "muli">;
1930def GPU_ElementwiseOpSUBI  : I32EnumAttrCase<"SUBI", 8, "subi">;
1931def GPU_ElementwiseOpDivS : I32EnumAttrCase<"DIVS", 9, "divs">;
1932def GPU_ElementwiseOpDivU : I32EnumAttrCase<"DIVU", 10, "divu">;
1933def GPU_ElementwiseOpNEGF : I32EnumAttrCase<"NEGATEF", 11, "negatef">;
1934def GPU_ElementwiseOpNEGS : I32EnumAttrCase<"NEGATES", 12, "negates">;
1935def GPU_ElementwiseOpEXTF : I32EnumAttrCase<"EXTF", 13, "extf">;
1936
1937def MMAElementWise : I32EnumAttr<"MMAElementwiseOp",
1938  "elementwise operation to apply to mma matrix", [
1939    GPU_ElementwiseOpAddF,
1940    GPU_ElementwiseOpMulF,
1941    GPU_ElementwiseOpSUBF,
1942    GPU_ElementwiseOpMaxF,
1943    GPU_ElementwiseOpMinF,
1944    GPU_ElementwiseOpDivF,
1945    GPU_ElementwiseOpAddI,
1946    GPU_ElementwiseOpMulI,
1947    GPU_ElementwiseOpSUBI,
1948    GPU_ElementwiseOpDivS,
1949    GPU_ElementwiseOpDivU,
1950    GPU_ElementwiseOpNEGF,
1951    GPU_ElementwiseOpNEGS,
1952    GPU_ElementwiseOpEXTF
1953  ]> {
1954  let genSpecializedAttr = 0;
1955  let cppNamespace = "::mlir::gpu";
1956}
1957def MMAElementWiseAttr : EnumAttr<GPU_Dialect, MMAElementWise,
1958                                  "mma_element_wise">;
1959
1960def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise",
1961    [Pure,
1962     AllTypesMatch<["args"]>]>{
1963
1964  let summary = "GPU warp elementwise operation on a matrix";
1965
1966  let description = [{
1967    The `gpu.subgroup_mma_elementwise` op takes `!gpu.mma_matrix` inputs and
1968    computes a new `!gpu.mma_matrix` by applying an elementwise operation to
1969    each element.
1970
1971    Since the operation is elementwise and the matrix types must match, the
1972    matrix elements are processed independently of the matrix layout.
1973
1974    This op is meant to be used along with `gpu.subgroup_mma_compute`.
1975
1976    Example:
1977
1978    ```mlir
1979     %0 = gpu.subgroup_mma_elementwise addf %A, %B :
1980      (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">)
1981      -> !gpu.mma_matrix<16x16xf16, "COp">
1982    ```
1983  }];
1984
1985  let arguments = (ins Variadic<GPU_MMAMatrix>:$args,
1986                       MMAElementWiseAttr:$opType);
1987
1988  let results = (outs GPU_MMAMatrix:$res);
1989
1990  let extraClassDeclaration = [{
1991    gpu::MMAMatrixType getType() {
1992      return ::llvm::cast<gpu::MMAMatrixType>(getRes().getType());
1993    }
1994  }];
1995
1996  let assemblyFormat = [{
1997    $opType $args attr-dict `:` functional-type($args, $res)
1998  }];
1999}
2000
2001//
2002// Operation on sparse matrices, called from the host
2003// (currently lowers to cuSparse for CUDA only, no ROCM lowering).
2004//
2005
2006def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
2007  let summary = "Create dense tensor operation";
2008  let description = [{
2009    The `gpu.create_dn_tensor` operation initializes a dense tensor from
2010    the given values buffer and sizes. The buffer must already be copied
2011    from the host to the device prior to using this operation. The
2012    operation returns a handle to the dense tensor descriptor.
2013
2014    If the `async` keyword is present, the op is executed asynchronously (i.e.
2015    it does not block until the execution has finished on the device). In
2016    that case, it returns a !gpu.async.token in addition to the environment.
2017
2018    Example:
2019
2020    ```mlir
2021    %dmat, %token = gpu.create_dn_tensor async [%dep] %mem, %dims : index, index into memref<?xf64>
2022    ```
2023  }];
2024
2025  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2026                       AnyMemRef:$memref,
2027                       Variadic<Index>:$dims);
2028  let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor, Optional<GPU_AsyncToken>:$asyncToken);
2029
2030  let assemblyFormat = [{
2031    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2032    $memref `,` $dims attr-dict `:` type($dims) `into` type($memref)
2033  }];
2034}
2035
2036def GPU_DestroyDnTensorOp : GPU_Op<"destroy_dn_tensor", [GPU_AsyncOpInterface]> {
2037  let summary = "Destroy dense tensor operation";
2038  let description = [{
2039    The `gpu.destroy_dn_tensor` operation releases all resources of a dense
2040    tensor represented by a handle that was previously created by a
2041    `gpu.create_dn_tensor` operation.
2042
2043    If the `async` keyword is present, the op is executed asynchronously (i.e.
2044    it does not block until the execution has finished on the device). In
2045    that case, it returns a !gpu.async.token in addition to the environment.
2046
2047    Example:
2048
2049    ```mlir
2050    %token = gpu.destroy_dn_tensor async [%dep] %dnTensor
2051    ```
2052  }];
2053
2054  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2055                       Arg<GPU_SparseDnTensorHandle>:$dnTensor);
2056  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
2057
2058  let assemblyFormat = [{
2059    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2060    $dnTensor attr-dict
2061  }];
2062}
2063
2064def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
2065  let summary = "Create sparse matrix in COO format operation";
2066  let description = [{
2067    The `gpu.create_coo` operation initializes a sparse matrix in COO format
2068    with the given sizes from the given index and values buffers. The buffers
2069    must already be copied from the host to the device prior to using this
2070    operation. The operation returns a handle to the sparse matrix descriptor.
2071    Note that this operation builds the COO in SoA format.
2072
2073    If the `async` keyword is present, the op is executed asynchronously (i.e.
2074    it does not block until the execution has finished on the device). In
2075    that case, it returns a !gpu.async.token in addition to the environment.
2076
2077    Example:
2078
2079    ```mlir
2080    %spmat, %token = gpu.create_coo async [%dep] %rows, %cols, %nnz, %rowIdx,
2081        %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
2082    ```
2083  }];
2084
2085  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2086                       Index:$rows,
2087                       Index:$cols,
2088                       Index:$nnz,
2089                       AnyMemRef:$rowIdxs,
2090                       AnyMemRef:$colIdxs,
2091                       AnyMemRef:$values);
2092  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
2093                      Optional<GPU_AsyncToken>:$asyncToken);
2094
2095  let assemblyFormat = [{
2096    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2097    $rows `,` $cols `,` $nnz `,` $rowIdxs `,` $colIdxs `,` $values attr-dict
2098    `:` type($rowIdxs) `,` type($colIdxs) `,` type($values)
2099  }];
2100}
2101
2102def GPU_CreateCooAoSOp : GPU_Op<"create_coo_aos", [GPU_AsyncOpInterface]> {
2103  let summary = "Create sparse matrix in COO format operation (AoS)";
2104  let description = [{
2105    The `gpu.create_coo_aos` operation initializes a sparse matrix in COO format
2106    with the given sizes from the given index and values buffers. The buffers
2107    must already be copied from the host to the device prior to using this
2108    operation. The operation returns a handle to the sparse matrix descriptor.
2109    Unlike the default `gpu.create_coo` operation, this operation builds the
2110    COO format from a single index buffer in AoS format (note that this
2111    feature has been deprecated in cuSparse 11.2).
2112
2113    If the `async` keyword is present, the op is executed asynchronously (i.e.
2114    it does not block until the execution has finished on the device). In
2115    that case, it returns a !gpu.async.token in addition to the environment.
2116
2117    Example:
2118
2119    ```mlir
2120    %spmat, %token = gpu.create_coo_aos async [%dep] %rows, %cols, %nnz, %idxs,
2121        %values : memref<?xindex>, memref<?xf64>
2122    ```
2123  }];
2124
2125  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2126                   Index:$rows,
2127                   Index:$cols,
2128                   Index:$nnz,
2129                   AnyMemRef:$idxs,
2130                   AnyMemRef:$values);
2131  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
2132                      Optional<GPU_AsyncToken>:$asyncToken);
2133
2134  let assemblyFormat = [{
2135    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2136    $rows `,` $cols `,` $nnz `,` $idxs `,` $values attr-dict
2137    `:` type($idxs) `,` type($values)
2138  }];
2139}
2140
2141def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
2142  let summary = "Create sparse matrix in CSR format operation";
2143  let description = [{
2144    The `gpu.create_csr` operation initializes a sparse matrix in CSR format
2145    with the given sizes from the given position, index, and values buffers.
2146    The buffers must already be copied from the host to the device prior to
2147    using this operation. The operation returns a handle to the sparse
2148    matrix descriptor.
2149
2150    The CSR format has exactly the same memory layout as its transpose
2151    in CSC format (and vice versa).
2152
2153    If the `async` keyword is present, the op is executed asynchronously (i.e.
2154    it does not block until the execution has finished on the device). In
2155    that case, it returns a !gpu.async.token in addition to the environment.
2156
2157    Example:
2158
2159    ```mlir
2160    %spmat, %token = gpu.create_csr async [%dep] %rows, %cols, %nnz, %rowPos,
2161        %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
2162    ```
2163  }];
2164
2165  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2166                   Index:$rows,
2167                   Index:$cols,
2168                   Index:$nnz,
2169                   AnyMemRef:$rowPos,
2170                   AnyMemRef:$colIdxs,
2171                   AnyMemRef:$values);
2172  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
2173                      Optional<GPU_AsyncToken>:$asyncToken);
2174
2175  let assemblyFormat = [{
2176    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2177    $rows `,` $cols `,` $nnz `,` $rowPos `,` $colIdxs `,` $values attr-dict
2178    `:` type($rowPos) `,` type($colIdxs) `,` type($values)
2179  }];
2180}
2181
2182def GPU_CreateCscOp : GPU_Op<"create_csc", [GPU_AsyncOpInterface]> {
2183  let summary = "Create sparse matrix in CSC format operation";
2184  let description = [{
2185    The `gpu.create_csc` operation initializes a sparse matrix in CSC format
2186    with the given sizes from the given position, index, and values buffers.
2187    The buffers must already be copied from the host to the device prior to
2188    using this operation. The operation returns a handle to the sparse
2189    matrix descriptor.
2190
2191    The CSC format has exactly the same memory layout as its transpose
2192    in CSR format (and vice versa).
2193
2194    If the `async` keyword is present, the op is executed asynchronously (i.e.
2195    it does not block until the execution has finished on the device). In
2196    that case, it returns a !gpu.async.token in addition to the environment.
2197
2198    Example:
2199
2200    ```mlir
2201    %spmat, %token = gpu.create_csc async [%dep] %rows, %cols, %nnz, %colPos,
2202        %rowIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
2203    ```
2204  }];
2205
2206  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2207                   Index:$rows,
2208                   Index:$cols,
2209                   Index:$nnz,
2210                   AnyMemRef:$colPos,
2211                   AnyMemRef:$rowIdxs,
2212                   AnyMemRef:$values);
2213  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
2214                      Optional<GPU_AsyncToken>:$asyncToken);
2215
2216  let assemblyFormat = [{
2217    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2218    $rows `,` $cols `,` $nnz `,` $colPos `,` $rowIdxs `,` $values attr-dict
2219    `:` type($colPos) `,` type($rowIdxs) `,` type($values)
2220  }];
2221}
2222
2223def GPU_CreateBsrOp : GPU_Op<"create_bsr", [GPU_AsyncOpInterface]> {
2224  let summary = "Create sparse matrix in BSR format operation";
2225  let description = [{
2226    The `gpu.create_bsr` operation initializes a sparse matrix in BSR format
2227    with the given sizes for the matrix and blocks from the given position,
2228    index, and values buffers. The buffers must already be copied from the
2229    host to the device prior to using this operation. The operation returns
2230    a handle to the sparse matrix descriptor.
2231
2232    The BSR format is similar to CSR, where the column indices represent
2233    two-dimensional blocks instead of a single matrix entry. Note that this
2234    operation (currently) only supports storage with **square** blocks,
2235    i.e., `rBlockSize == cBlockSize`.
2236
2237    If the `async` keyword is present, the op is executed asynchronously (i.e.
2238    it does not block until the execution has finished on the device). In
2239    that case, it returns a !gpu.async.token in addition to the environment.
2240
2241    Example:
2242
2243    ```mlir
2244    %spmat, %token = gpu.create_bsr async [%dep]
2245       %brows, %bcols, %bnnz, %rBlockSize, %cBlockSize,
2246       %bRowPos, %bColIdxs, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
2247    ```
2248  }];
2249
2250  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2251                   Index:$brows,
2252                   Index:$bcols,
2253                   Index:$bnnz,
2254                   Index:$rBlockSize,
2255                   Index:$cBlockSize,
2256                   AnyMemRef:$bRowPos,
2257                   AnyMemRef:$bColIdxs,
2258                   AnyMemRef:$values);
2259  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
2260                      Optional<GPU_AsyncToken>:$asyncToken);
2261
2262  let assemblyFormat = [{
2263    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2264    $brows `,` $bcols `,` $bnnz `,` $rBlockSize `,` $cBlockSize `,`
2265    $bRowPos `,` $bColIdxs `,` $values attr-dict
2266    `:` type($bRowPos) `,` type($bColIdxs) `,` type($values)
2267  }];
2268}
2269
2270def GPU_Prune2To4SpMatFlag : I32EnumAttr<"Prune2To4SpMatFlag",
2271  "pruning strategy for 2:4 sparse matrix",
2272  [
2273    I32EnumAttrCase<"NONE", 0>,
2274    I32EnumAttrCase<"PRUNE_ONLY", 1>,
2275    I32EnumAttrCase<"PRUNE_AND_CHECK", 2>,
2276  ]> {
2277    let genSpecializedAttr = 0;
2278    let cppNamespace = GPU_Dialect.cppNamespace;
2279}
2280
2281def GPU_Prune2To4SpMatFlagAttr : EnumAttr<GPU_Dialect, GPU_Prune2To4SpMatFlag,
2282                                   "prune_2to4_spmat_flag">{
2283  let defaultValue = "Prune2To4SpMatFlag::PRUNE_AND_CHECK";
2284}
2285
2286
2287def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]> {
2288  let summary = "Create sparse matrix with 2:4 sparsity operation";
2289  let description = [{
2290    The `gpu.create_2to4_spmat` operation initializes a sparse matrix in dense
2291    format with 2:4 sparsity.
2292    The buffers must already be copied from the host to the device prior to
2293    using this operation. The operation returns a handle to the sparse
2294    matrix descriptor.
2295
2296    If the `async` keyword is present, the op is executed asynchronously (i.e.
2297    it does not block until the execution has finished on the device). In
2298    that case, it returns a !gpu.async.token in addition to the environment.
2299
2300    Example:
2301
2302    ```mlir
2303    %spmat, %token = gpu.create_2to4_spmat async [%dep] {PRUNE_AND_CHECK} %rows, %cols, %mem: memref<?xf64>
2304    ```
2305  }];
2306
2307  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2308                       Index:$rows,
2309                       Index:$cols,
2310                       GPU_Prune2To4SpMatFlagAttr:$pruneFlag,
2311                       AnyMemRef:$memref);
2312  let results = (outs Res<GPU_SparseSpMatHandle>:$spMat,
2313                      Optional<GPU_AsyncToken>:$asyncToken);
2314
2315  let assemblyFormat = [{
2316    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2317     `{` $pruneFlag `}` $rows `,` $cols `,` $memref attr-dict `:` type($memref)
2318  }];
2319}
2320
2321def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> {
2322  let summary = "Destroy sparse matrix operation";
2323  let description = [{
2324    The `gpu.destroy_sp_mat` operation releases all resources of a sparse
2325    matrix represented by a handle that was previously created by
2326    one of the sparse matrix creation operations.
2327
2328    If the `async` keyword is present, the op is executed asynchronously (i.e.
2329    it does not block until the execution has finished on the device). In
2330    that case, it returns a !gpu.async.token in addition to the environment.
2331
2332    Example:
2333
2334    ```mlir
2335    %token = gpu.destroy_sp_mat async [%dep] %spmat
2336    ```
2337  }];
2338
2339  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2340                       Arg<GPU_SparseSpMatHandle>:$spmat);
2341  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
2342
2343  let assemblyFormat = [{
2344    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $spmat attr-dict
2345  }];
2346}
2347
2348// To avoid coupling this dialect with cusparse.h specifics, we hardcoded magic
2349// literals in this enum. Note that this should be kept in sync with
2350// cusparseOperation_t in cusparse.h:
2351// typedef enum {
2352// CUSPARSE_OPERATION_NON_TRANSPOSE       = 0,
2353// CUSPARSE_OPERATION_TRANSPOSE           = 1,
2354// CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
2355// } cusparseOperation_t;
2356// TODO: find a proper way to keep them in sync?
2357def GPU_TransposeMode : I32EnumAttr<"TransposeMode",
2358    "transpose mode of sparse matrix supported by sparse tensor ops",
2359    [
2360      I32EnumAttrCase<"NON_TRANSPOSE", 0>,
2361      I32EnumAttrCase<"TRANSPOSE", 1>,
2362      I32EnumAttrCase<"CONJUGATE_TRANSPOSE", 2>,
2363    ]> {
2364      let genSpecializedAttr = 0;
2365      let cppNamespace = GPU_Dialect.cppNamespace;
2366}
2367
2368def GPU_TransposeModeAttr : EnumAttr<GPU_Dialect, GPU_TransposeMode,
2369                                   "mat_transpose_mode">{
2370  let defaultValue = "TransposeMode::NON_TRANSPOSE";
2371}
2372
2373def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
2374  let summary = "Precompute buffersize for SpMV operation";
2375  let description = [{
2376    The `gpu.spmv_buffer_size` operation returns the buffer size required
2377    to perform the SpMV operation on the given sparse matrix and dense vectors.
2378    The operation expects handles returned by previous sparse operations
2379    to construct an environment and the operands for SpMV.
2380
2381    If the `async` keyword is present, the op is executed asynchronously (i.e.
2382    it does not block until the execution has finished on the device). In
2383    that case, it returns a !gpu.async.token in addition to the environment.
2384
2385    The matrix arguments can also be associated with one of the following
2386    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
2387    is NON_TRANSPOSE.
2388
2389    Example:
2390
2391    ```mlir
2392    %buffersz, %token = gpu.spmv_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY into f32
2393    ```
2394  }];
2395  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2396                       GPU_TransposeModeAttr:$modeA,
2397                       GPU_SparseSpMatHandle:$spmatA,
2398                       GPU_SparseDnTensorHandle:$dnX,
2399                       GPU_SparseDnTensorHandle:$dnY,
2400                       TypeAttr:$computeType);
2401  let results = (outs Res<Index>:$bufferSz,
2402                      Optional<GPU_AsyncToken>:$asyncToken);
2403
2404  let builders = [OpBuilder<(ins
2405      "Type":$bufferSz,
2406      "Type":$asyncToken,
2407      "ValueRange":$asyncDependencies,
2408      "Value":$spmatA,
2409      "Value":$dnX,
2410      "Value":$dnY,
2411      "Type":$computeType)
2412      , [{
2413    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
2414    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
2415                 modeA, spmatA, dnX, dnY, computeType);}]>
2416  ];
2417
2418  let assemblyFormat = [{
2419    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2420    $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY attr-dict  `into` $computeType
2421  }];
2422}
2423
2424def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
2425  let summary = "SpMV operation";
2426  let description = [{
2427    The `gpu.spmv` operation performs the SpMV operation on the given sparse matrix,
2428    dense vectors, and buffer.  The operation expects handles returned by previous
2429    sparse operations to construct an environment and the operands for SpMV. The
2430    buffer must have been allocated on the device.
2431
2432    If the `async` keyword is present, the op is executed asynchronously (i.e.
2433    it does not block until the execution has finished on the device). In
2434    that case, it returns a !gpu.async.token in addition to the environment.
2435
2436    The matrix arguments can also be associated with one of the following
2437    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
2438    is NON_TRANSPOSE.
2439
2440    Example:
2441
2442    ```mlir
2443    %token = gpu.spmv async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY : memref<?xf64> into bf16
2444    ```
2445  }];
2446  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2447                       GPU_TransposeModeAttr:$modeA,
2448                       GPU_SparseSpMatHandle:$spmatA,
2449                       GPU_SparseDnTensorHandle:$dnX,
2450                       GPU_SparseDnTensorHandle:$dnY,
2451                       TypeAttr:$computeType,
2452                       AnyMemRef:$buffer);
2453  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
2454
2455  let builders = [OpBuilder<(ins
2456      "Type":$asyncToken,
2457      "ValueRange":$asyncDependencies,
2458      "Value":$spmatA,
2459      "Value":$dnX,
2460      "Value":$dnY,
2461      "Type":$computeType,
2462      "Value":$buffer), [{
2463    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
2464    return build($_builder, $_state, asyncToken, asyncDependencies, modeA,
2465                 spmatA, dnX, dnY, computeType, buffer);}]>
2466  ];
2467
2468  let assemblyFormat = [{
2469    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2470    $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType
2471  }];
2472}
2473
2474def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface, AttrSizedResultSegments]> {
2475  let summary = "Precompute buffersize for SpMM operation";
2476  let description = [{
2477    The `gpu.spmm_buffer_size` operation returns the buffer size required
2478    to perform the SpMM operation on the given sparse and dense matrix.
2479    The operation expects handles returned by previous sparse operations
2480    to construct an environment and the operands for SpMM.
2481
2482    If the `async` keyword is present, the op is executed asynchronously (i.e.
2483    it does not block until the execution has finished on the device). In
2484    that case, it returns a !gpu.async.token in addition to the environment.
2485
2486    The matrix arguments can also be associated with one of the following
2487    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
2488    is NON_TRANSPOSE.
2489
2490    Example:
2491
2492    ```mlir
2493    %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32
2494    ```
2495  }];
2496
2497  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2498                       GPU_TransposeModeAttr:$modeA,
2499                       GPU_TransposeModeAttr:$modeB,
2500                       GPU_SparseSpMatHandle:$spmatA,
2501                       GPU_SparseDnTensorHandle:$dnmatB,
2502                       GPU_SparseDnTensorHandle:$dnmatC,
2503                       TypeAttr:$computeType);
2504  let results = (outs Variadic<Index>:$bufferSzs,
2505                      Optional<GPU_AsyncToken>:$asyncToken);
2506
2507  let builders = [OpBuilder<(ins
2508      "Type":$bufferSzs,
2509      "Type":$asyncToken,
2510      "ValueRange":$asyncDependencies,
2511      "Value":$spmatA,
2512      "Value":$dnmatB,
2513      "Value":$dnmatC,
2514      "Type":$computeType), [{
2515    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
2516    auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
2517    return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies,
2518                 modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]>
2519  ];
2520
2521  let assemblyFormat = [{
2522    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2523    $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType
2524  }];
2525}
2526
2527def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
2528  let summary = "SpMM operation";
2529  let description = [{
2530    The `gpu.spmm` operation performs the SpMM operation on the given sparse and
2531    dense matrix, and buffer.  The operation expects handles returned by previous
2532    sparse operations to construct an environment and the operands for SpMM. The
2533    buffer must have been allocated on the device.
2534
2535    If the `async` keyword is present, the op is executed asynchronously (i.e.
2536    it does not block until the execution has finished on the device). In
2537    that case, it returns a !gpu.async.token in addition to the environment.
2538
2539    The matrix arguments can also be associated with one of the following
2540    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
2541    is NON_TRANSPOSE.
2542
2543    Example:
2544
2545    ```mlir
2546    %token = gpu.spmm async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32
2547    ```
2548  }];
2549
2550  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2551                       GPU_TransposeModeAttr:$modeA,
2552                       GPU_TransposeModeAttr:$modeB,
2553                       GPU_SparseSpMatHandle:$spmatA,
2554                       GPU_SparseDnTensorHandle:$dnmatB,
2555                       GPU_SparseDnTensorHandle:$dnmatC,
2556                       TypeAttr:$computeType,
2557                       Variadic<AnyMemRef>:$buffers);
2558  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
2559
2560  let builders = [OpBuilder<(ins
2561      "Type":$asyncToken,
2562      "ValueRange":$asyncDependencies,
2563      "Value":$spmatA,
2564      "Value":$dnmatB,
2565      "Value":$dnmatC,
2566      "Type":$computeType,
2567      "ValueRange":$buffers), [{
2568    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
2569    auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
2570    return build($_builder, $_state, asyncToken, asyncDependencies, modeA,
2571                 modeB, spmatA, dnmatB, dnmatC, computeType, buffers);}]>
2572  ];
2573
2574  let assemblyFormat = [{
2575    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2576    $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType
2577  }];
2578}
2579
2580def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]> {
2581  let summary = "Precompute buffersize for SDDMM operation";
2582  let description = [{
2583    The `gpu.sddmm_buffer_size` operation returns the buffer size required
2584    to perform the SDDMM operation on the given sparse and dense matrices.
2585    The operation expects handles returned by previous sparse operations
2586    to construct an environment and the operands for SDDMM.
2587
2588    If the `async` keyword is present, the op is executed asynchronously (i.e.
2589    it does not block until the execution has finished on the device). In
2590    that case, it returns a !gpu.async.token in addition to the environment.
2591
2592    Example:
2593
2594    ```mlir
2595    %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32
2596    ```
2597
2598    The matrix arguments can also be associated with one of the following
2599    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
2600    is NON_TRANSPOSE.
2601  }];
2602
2603  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2604                   GPU_TransposeModeAttr:$modeA,
2605                   GPU_TransposeModeAttr:$modeB,
2606                   GPU_SparseDnTensorHandle:$dnmatA,
2607                   GPU_SparseDnTensorHandle:$dnmatB,
2608                   GPU_SparseSpMatHandle:$spmatC,
2609                   TypeAttr:$computeType);
2610  let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken);
2611
2612  let builders = [OpBuilder<(ins
2613      "Type":$bufferSz,
2614      "Type":$asyncToken,
2615      "ValueRange":$asyncDependencies,
2616      "Value":$dnmatA,
2617      "Value":$dnmatB,
2618      "Value":$spmatC,
2619      "Type":$computeType), [{
2620    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
2621    auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
2622    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
2623                 modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]>
2624  ];
2625
2626  let assemblyFormat = [{
2627    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2628    $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC attr-dict `into` $computeType
2629  }];
2630}
2631
2632def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> {
2633  let summary = "SDDMM operation";
2634  let description = [{
2635    The `gpu.sddmm` operation performs the SDDMM operation on the given sparse and
2636    dense matrices, and buffer.  The operation expects handles returned by previous
2637    sparse operations to construct an environment and the operands for SDDMM. The
2638    buffer must have been allocated on the device.
2639
2640    If the `async` keyword is present, the op is executed asynchronously (i.e.
2641    it does not block until the execution has finished on the device). In
2642    that case, it returns a !gpu.async.token in addition to the environment.
2643
2644    Example:
2645
2646    ```mlir
2647    %token = gpu.sddmm async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32
2648    ```
2649
2650    The matrix arguments can also be associated with one of the following
2651    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
2652    is NON_TRANSPOSE.
2653  }];
2654
2655  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2656                   GPU_TransposeModeAttr:$modeA,
2657                   GPU_TransposeModeAttr:$modeB,
2658                   GPU_SparseDnTensorHandle:$dnmatA,
2659                   GPU_SparseDnTensorHandle:$dnmatB,
2660                   GPU_SparseSpMatHandle:$spmatC,
2661                   TypeAttr:$computeType,
2662                   AnyMemRef:$buffer);
2663  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
2664
2665  let builders = [OpBuilder<(ins
2666    "Type":$asyncToken,
2667    "ValueRange":$asyncDependencies,
2668    "Value":$dnmatA,
2669    "Value":$dnmatB,
2670    "Value":$spmatC,
2671    "Type":$computeType,
2672    "Value":$buffer), [{
2673  auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
2674  auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
2675  return build($_builder, $_state, asyncToken, asyncDependencies, modeA,
2676                modeB, dnmatA, dnmatB, spmatC, computeType, buffer);}]>
2677  ];
2678
2679  let assemblyFormat = [{
2680    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2681    $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType
2682  }];
2683}
2684
2685def GPU_SpGEMMWorkEstimationOrComputeKind : I32EnumAttr<"SpGEMMWorkEstimationOrComputeKind",
2686    "choose whether spgemm_work_estimation_or_compute does work estimation or compute",
2687    [
2688      I32EnumAttrCase<"WORK_ESTIMATION", 0>,
2689      I32EnumAttrCase<"COMPUTE", 1>,
2690    ]> {
2691      let genSpecializedAttr = 0;
2692      let cppNamespace = GPU_Dialect.cppNamespace;
2693}
2694
2695def GPU_SpGEMMWorkEstimationOrComputeKindAttr : EnumAttr<GPU_Dialect,
2696    GPU_SpGEMMWorkEstimationOrComputeKind,
2697    "spgemm_work_estimation_or_compute_kind"> {}
2698
2699def GPU_SpGEMMCreateDescrOp : GPU_Op<"spgemm_create_descr", [GPU_AsyncOpInterface]> {
2700  let summary = "SpGEMM Create Descr operation";
2701  let description = [{
2702    The `gpu.spgemm_create_descr` creates a descriptor for the SpGEMM operation.
2703    The descriptor describes the SpGEMM operation and stores the internal data
2704    throughout the computation. It needs to be passed as an argument to
2705    spgemm_* operations.
2706
2707    If the `async` keyword is present, the op is executed asynchronously (i.e.
2708    it does not block until the execution has finished on the device). In
2709    that case, it returns a `!gpu.async.token` in addition to the environment.
2710
2711    Example:
2712
2713    ```mlir
2714    %desc, %token = gpu.spgemm_create_descr async [%dep]
2715    ```
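
    A typical SpGEMM sequence chains the `spgemm_*` operations. A simplified
    sketch, with sparse matrix creation and buffer management elided and
    illustrative value names:

    ```mlir
    %spgemmDesc, %t0 = gpu.spgemm_create_descr async [%dep]
    // ... call gpu.spgemm_work_estimation_or_compute with {WORK_ESTIMATION} and
    // then with {COMPUTE}, allocating buffers of the returned sizes in between;
    // let %t1 be the token of the last call ...
    %t2 = gpu.spgemm_copy async [%t1] %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
    %t3 = gpu.spgemm_destroy_descr async [%t2] %spgemmDesc
    ```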
2716  }];
2717  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
2718  let results = (outs GPU_SparseSpGEMMOpHandle:$desc,
2719                      Optional<GPU_AsyncToken>:$asyncToken);
2720  let assemblyFormat = [{
2721    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2722    attr-dict
2723  }];
2724}
2725
2726def GPU_SpGEMMDestroyDescrOp : GPU_Op<"spgemm_destroy_descr", [GPU_AsyncOpInterface]> {
2727  let summary = "SpGEMM Destroy Descr operation";
2728  let description = [{
2729    The `gpu.spgemm_destroy_descr` destroys the SpGEMM operation descriptor.
2730
2731    If the `async` keyword is present, the op is executed asynchronously (i.e.
2732    it does not block until the execution has finished on the device). In
2733    that case, it returns a `!gpu.async.token` in addition to the environment.
2734
2735    Example:
2736
2737    ```mlir
2738    %token = gpu.spgemm_destroy_descr async [%dep] %desc
2739    ```
2740  }];
2741
2742  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2743                       GPU_SparseSpGEMMOpHandle:$desc);
2744  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
2745  let assemblyFormat = [{
2746    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2747    $desc attr-dict
2748  }];
2749}
2750
2751def GPU_SpGEMMWorkEstimationOrComputeOp : GPU_Op<"spgemm_work_estimation_or_compute", [GPU_AsyncOpInterface]> {
2752  let summary = "SpGEMM work estimation or compute operation";
2753  let description = [{
2754    The `gpu.spgemm_work_estimation_or_compute` operation is used to call either
2755    `cusparseSpGEMM_workEstimation` or `cusparseSpGEMM_compute`. Each of these is
2756    used both to determine the buffer size and to perform the actual computation.
2757    The operation expects handles returned by previous sparse operations to
2758    construct an environment and the operands for SpGEMM.
2759    The buffer must have been allocated on the device.
2760
2761    `C' = alpha * op(A) * op(B) + beta * C`
2762
2763    If the `async` keyword is present, the op is executed asynchronously (i.e.
2764    it does not block until the execution has finished on the device). In
2765    that case, it returns a `!gpu.async.token` in addition to the environment.
2766
2767    Example:
2768
2769    ```mlir
2770    %bufferSz, %token = gpu.spgemm_work_estimation_or_compute async [%dep] {COMPUTE}
2771                          %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE},
2772                          %spmatC, %spgemmDesc, %c0, %alloc: f32 into
2773                          memref<0xi8>
2774    ```
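
    The `kind` attribute selects between the two phases. A work estimation call
    uses the same form (a sketch with illustrative value names):

    ```mlir
    %bufferSz, %token = gpu.spgemm_work_estimation_or_compute async [%dep] {WORK_ESTIMATION}
                          %spmatA, %spmatB, %spmatC, %spgemmDesc,
                          %c0, %alloc: f32 into memref<0xi8>
    ```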
2775
2776    The matrix arguments can also be associated with one of the following
2777    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
2778    is NON_TRANSPOSE.
2779  }];
2780
2781  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2782                       GPU_SparseSpGEMMOpHandle:$desc,
2783                       GPU_TransposeModeAttr:$modeA,
2784                       GPU_TransposeModeAttr:$modeB,
2785                       GPU_SparseSpMatHandle:$spmatA,
2786                       GPU_SparseSpMatHandle:$spmatB,
2787                       GPU_SparseSpMatHandle:$spmatC,
2788                       TypeAttr:$computeType,
2789                       Index:$bufferSz,
2790                       AnyMemRef:$buffer,
2791                       GPU_SpGEMMWorkEstimationOrComputeKindAttr:$kind);
2792  let results = (outs Res<Index>:$bufferSzNew,
2793                      Optional<GPU_AsyncToken>:$asyncToken);
2794
2795  let builders = [OpBuilder<(ins
2796    "Type":$bufferSzNew,
2797    "Type":$asyncToken,
2798    "ValueRange":$asyncDependencies,
2799    "Value":$desc,
2800    "Value":$spmatA,
2801    "Value":$spmatB,
2802    "Value":$spmatC,
2803    "Type":$computeType,
2804    "Value":$bufferSz,
2805    "Value":$buffer), [{
2806  auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
2807  auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
2808  auto kind = gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION;
2809  return build($_builder, $_state, bufferSzNew, asyncToken, asyncDependencies, desc,
2810               modeA, modeB, spmatA, spmatB, spmatC, computeType, bufferSz, buffer, kind);}]>
2811  ];
2812
2813  let assemblyFormat = [{
2814    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2815    `{` $kind `}` $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $desc `,` $bufferSz `,` $buffer  attr-dict `:` $computeType `into` type($buffer)
2816  }];
2817}
2818
2819def GPU_SpGEMMCopyOp : GPU_Op<"spgemm_copy", [GPU_AsyncOpInterface]> {
2820  let summary = "SpGEMM copy operation";
2821  let description = [{
2822    The `gpu.spgemm_copy` operation copies the sparse matrix result of
2823    a SpGEMM computation.
2824
2825    If the `async` keyword is present, the op is executed asynchronously (i.e.
2826    it does not block until the execution has finished on the device). In
2827    that case, it returns a `!gpu.async.token` in addition to the environment.
2828
2829    Example:
2830
2831    ```mlir
2832    gpu.spgemm_copy %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
2833    ```
2834
2835    The matrix arguments can also be associated with one of the following
2836    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
2837    is NON_TRANSPOSE.
2838  }];
2839
2840  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2841                       GPU_SparseSpGEMMOpHandle:$desc,
2842                       GPU_TransposeModeAttr:$modeA,
2843                       GPU_TransposeModeAttr:$modeB,
2844                       GPU_SparseSpMatHandle:$spmatA,
2845                       GPU_SparseSpMatHandle:$spmatB,
2846                       GPU_SparseSpMatHandle:$spmatC,
2847                       TypeAttr:$computeType);
2848  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
2849
2850  let builders = [OpBuilder<(ins
2851    "Type":$asyncToken,
2852    "ValueRange":$asyncDependencies,
2853    "Value":$desc,
2854    "Value":$spmatA,
2855    "Value":$spmatB,
2856    "Value":$spmatC,
2857    "Type":$computeType), [{
2858  auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
2859  auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
2860  return build($_builder, $_state, asyncToken, asyncDependencies, desc,
2861               modeA, modeB, spmatA, spmatB, spmatC, computeType);}]>
2862  ];
2863
2864  let assemblyFormat = [{
2865    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2866    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $desc attr-dict `:` $computeType
2867  }];
2868}
2869
2870def GPU_SpMatGetSizeOp : GPU_Op<"spmat_get_size", [GPU_AsyncOpInterface]> {
2871  let summary = "SpMat get size operation";
2872  let description = [{
2873    The `gpu.spmat_get_size` operation retrieves the number of rows, number of
2874    columns, and number of non-zero elements of a sparse matrix.
2875
2876    If the `async` keyword is present, the op is executed asynchronously (i.e.
2877    it does not block until the execution has finished on the device). In
2878    that case, it returns a `!gpu.async.token` in addition to the environment.
2879
2880    Example:
2881
2882    ```mlir
2883    %rows, %cols, %nnz, %token = gpu.spmat_get_size async [%dep] %spmatC
2884    ```
2885  }];
2886
2887  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2888                       GPU_SparseSpMatHandle:$spmat);
2889  let results = (outs Index:$rows,
2890                      Index:$cols,
2891                      Index:$nnz,
2892                      Optional<GPU_AsyncToken>:$asyncToken);
2893
2894  let assemblyFormat = [{
2895    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2896    $spmat attr-dict
2897  }];
2898}
2899
2900def GPU_SetCsrPointersOp : GPU_Op<"set_csr_pointers", [GPU_AsyncOpInterface]> {
2901  let summary = "Set CSR pointers operation";
2902  let description = [{
2903    The `gpu.set_csr_pointers` operation assigns the given positions, coordinates,
2904    and values buffers, which reside on the device, directly to the given sparse
2905    matrix descriptor in CSR format.
2906
2907    If the `async` keyword is present, the op is executed asynchronously (i.e.
2908    it does not block until the execution has finished on the device). In
2909    that case, it returns a `!gpu.async.token` in addition to the environment.
2910
2911    Example:
2912
2913    ```mlir
2914    %token = gpu.set_csr_pointers async [%dep] %spmat, %positions, %coordinates, %values
2915          : memref<?xindex>, memref<?xindex>, memref<?xf32>
2916    ```
2917  }];
2918
2919  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
2920                       Arg<GPU_SparseSpMatHandle>:$spmat,
2921                       AnyMemRef:$positions,
2922                       AnyMemRef:$coordinates,
2923                       AnyMemRef:$values);
2924  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
2925
2926  let assemblyFormat = [{
2927    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
2928      $spmat `,` $positions `,` $coordinates `,` $values attr-dict
2929        `:` type($positions) `,` type($coordinates) `,` type($values)
2930  }];
2931}
2932
2933def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
2934      [DeclareOpInterfaceMethods<RegionBranchOpInterface, ["areTypesCompatible"]>,
2935       SingleBlockImplicitTerminator<"gpu::YieldOp">,
2936       RecursiveMemoryEffects]> {
2937  let summary = "Executes operations in the associated region on thread #0 of a"
2938                " SPMD program";
2939  let description = [{
2940    `warp_execute_on_lane_0` is an operation used to bridge the gap between
2941    vector programming and SPMD programming models like GPU SIMT. It allows one to
2942    trivially convert a region of vector code meant to run on multiple threads
2943    into a valid SPMD region, and then allows incremental transformations that
2944    distribute vector operations across the threads.
2945
2946    Any code present in the region is only executed on the first thread/lane,
2947    as identified by the `laneid` operand. The `laneid` operand is an integer ID
2948    in the range [0, `warp_size`). The `warp_size` attribute indicates the number
2949    of lanes in a warp.
2950
2951    Operands are vector values distributed on all lanes that may be used by
2952    the single-lane execution. The matching region argument is a vector of all
2953    the values of those lanes, available to the single active lane. The
2954    distributed dimension is implicit, based on the shapes of the operand and
2955    argument. The properties of the distribution may be described by extra
2956    attributes (e.g. an affine map).
2957
2958    Return values are distributed on all lanes using `laneid` as the index. The
2959    vector is distributed based on the shape ratio between the vector type of
2960    the yield and the result type.
2961    If the shapes are the same, the value is broadcast to all lanes.
2962    In the future the distribution can be made more explicit using affine maps
2963    and will support multiple IDs.
2964
2965    Therefore, the `warp_execute_on_lane_0` operation allows implicitly copying
2966    between lane 0 and the other lanes of the warp. When distributing a vector
2967    from lane 0 to all the lanes, the data are distributed in a block-cyclic way.
2968    For example, `vector<64xf32>` distributed on 32 threads maps to
2969    `vector<2xf32>`, where thread 0 contains vector[0] and vector[1].
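
    A sketch of this 64-element case (the `%laneid` and `%full` value names are
    illustrative):

    ```
    %0 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
      // Only lane 0 executes this and produces the full 64-element vector.
      ...
      gpu.yield %full : vector<64xf32>
    }
    // Each lane now holds a contiguous vector<2xf32> slice: lane 0 has
    // elements [0, 1], lane 1 has elements [2, 3], and so on.
    ```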
2970
2971    During lowering, values passed as operands and return values need to be
2972    visible to different lanes within the warp. This would usually be done by
2973    going through memory.
2974
2975    The region is *not* isolated from above. For values coming from the parent
2976    region that do not go through the operands, only the lane 0 value will be
2977    accessible, so this generally only makes sense for uniform values.
2978
2979    Example:
2980    ```
2981    // Execute in parallel on all threads/lanes.
2982    gpu.warp_execute_on_lane_0 (%laneid)[32] {
2983      // Serial code running only on thread/lane 0.
2984      ...
2985    }
2986    // Execute in parallel on all threads/lanes.
2987    ```
2988
2989    This may be lowered to an scf.if region as below:
2990    ```
2991      // Execute in parallel on all threads/lanes.
2992      %cnd = arith.cmpi eq, %laneid, %c0 : index
2993      scf.if %cnd {
2994        // Serial code running only on thread/lane 0.
2995        ...
2996      }
2997      // Execute in parallel on all threads/lanes.
2998    ```
2999
3000    When the region has operands and/or return values:
3001    ```
3002    // Execute in parallel on all threads/lanes.
3003    %0 = gpu.warp_execute_on_lane_0(%laneid)[32]
3004    args(%v0 : vector<4xi32>) -> (vector<1xf32>) {
3005    ^bb0(%arg0 : vector<128xi32>) :
3006      // Serial code running only on thread/lane 0.
3007      ...
3008      gpu.yield %1 : vector<32xf32>
3009    }
3010    // Execute in parallel on all threads/lanes.
3011    ```
3012
3013    During lowering, the values at the region boundary would go through memory:
3014    ```
3015    // Execute in parallel on all threads/lanes.
3016    ...
3017    // Store the data from each thread into memory and synchronize.
3018    %tmp0 = memref.alloc() : memref<128xf32>
3019    %tmp1 = memref.alloc() : memref<32xf32>
3020    %cnd = arith.cmpi eq, %laneid, %c0 : index
3021    vector.store %v0, %tmp0[%laneid] : memref<128xf32>, vector<4xf32>
3022    some_synchronization_primitive
3023    scf.if %cnd {
3024      // Serialized code running only on thread 0.
3025      // Load the data from all the threads into a register from thread 0. This
3026      // allows thread 0 to access data from all the threads.
3027      %arg0 = vector.load %tmp0[%c0] : memref<128xf32>, vector<128xf32>
3028      ...
3029      // Store the data from thread 0 into memory.
3030      vector.store %1, %tmp1[%c0] : memref<32xf32>, vector<32xf32>
3031    }
3032    // Synchronize and load the data in a block-cyclic way so that the
3033    // vector is distributed on all threads.
3034    some_synchronization_primitive
3035    %0 = vector.load %tmp1[%laneid] : memref<32xf32>, vector<32xf32>
3036    // Execute in parallel on all threads/lanes.
3037    ```
3038
3039  }];
3040
3041  let hasVerifier = 1;
3042  let hasCustomAssemblyFormat = 1;
3043  let arguments = (ins Index:$laneid, I64Attr:$warp_size,
3044                       Variadic<AnyType>:$args);
3045  let results = (outs Variadic<AnyType>:$results);
3046  let regions = (region SizedRegion<1>:$warpRegion);
3047
3048  let skipDefaultBuilders = 1;
3049  let builders = [
3050    OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
3051                   "int64_t":$warpSize)>,
3052    // `blockArgTypes` are different from the `args` types, as they
3053    // represent all the `args` instances visible to lane 0. Therefore we need
3054    // to explicitly pass the types.
3055    OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
3056                   "int64_t":$warpSize, "ValueRange":$args,
3057                   "TypeRange":$blockArgTypes)>
3058  ];
3059
3060  let extraClassDeclaration = [{
3061    bool isDefinedOutsideOfRegion(Value value) {
3062      return !getRegion().isAncestor(value.getParentRegion());
3063    }
3064  }];
3065}
3066
3067#endif // GPU_OPS
3068