xref: /llvm-project/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td (revision 776ac21c7f95e092759ba39e5533aad90d63c86e)
1//===- GPUTransformOps.td - GPU transform ops --------------*- tablegen -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef GPU_TRANSFORM_OPS
10#define GPU_TRANSFORM_OPS
11
12include "mlir/Dialect/Transform/IR/TransformDialect.td"
13include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
14include "mlir/Interfaces/SideEffectInterfaces.td"
15include "mlir/IR/OpBase.td"
16
17//===----------------------------------------------------------------------===//
18// Apply...ConversionPatternsOp
19//===----------------------------------------------------------------------===//
20
// Pattern-descriptor op: registers the GPU-to-NVVM conversion patterns with
// an enclosing transform.apply_conversion_patterns op. Implements
// verifyTypeConverter to reject incompatible type converters.
def ApplyGPUToNVVMConversionPatternsOp : Op<Transform_Dialect,
    "apply_conversion_patterns.gpu.gpu_to_nvvm",
    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
                               ["verifyTypeConverter"]>]> {
  let description = [{
    Collects patterns that convert GPU dialect ops to NVVM dialect ops. These
    patterns require an "LLVMTypeConverter".
  }];
  let assemblyFormat = "attr-dict";
}
31
// Pattern-descriptor op: registers the GPU wmma-to-NVVM conversion patterns.
// NOTE(review): the def name spells "Wwma" (likely intended "Wmma"); renaming
// would change the generated C++ class name and break users — left as-is.
def ApplyGPUWwmaToNVVMConversionPatternsOp : Op<Transform_Dialect,
    "apply_conversion_patterns.gpu.gpu_wmma_to_nvvm",
    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
                               ["verifyTypeConverter"]>]> {
  let description = [{
    Collects patterns that convert GPU dialect ops related to wmma ops
    to NVVM dialect ops.
    These patterns require an "LLVMTypeConverter".
  }];
  let assemblyFormat = "attr-dict";
}
43
// Pattern-descriptor op: registers the GPU subgroup_reduce-to-NVVM conversion
// patterns. The description previously duplicated the wmma op's text; it now
// matches this op's mnemonic (gpu_subgroup_reduce_to_nvvm).
def ApplyGPUSubgroupReduceToNVVMConversionPatternsOp : Op<Transform_Dialect,
    "apply_conversion_patterns.gpu.gpu_subgroup_reduce_to_nvvm",
    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
                               ["verifyTypeConverter"]>]> {
  let description = [{
    Collects patterns that convert GPU dialect ops related to subgroup_reduce
    ops to NVVM dialect ops.
    These patterns require an "LLVMTypeConverter".
  }];
  let assemblyFormat = "attr-dict";
}
55
56//===----------------------------------------------------------------------===//
57// Apply...PatternsOp
58//===----------------------------------------------------------------------===//
59
// Pattern-descriptor op: registers a bundle of GPU-dialect rewrite patterns
// (all-reduce, global-id, shuffle rewriters) with an enclosing
// transform.apply_patterns op.
def ApplyGPURewritePatternsOp : Op<Transform_Dialect,
    "apply_patterns.gpu.gpu_rewrite_patterns",
    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
  let description = [{
    Collects GPU rewrite patterns comprising:
      1. GpuAllReduceRewrite patterns
      2. GpuGlobalIdRewriter patterns
      3. GpuShuffleRewriter patterns
  }];
  let assemblyFormat = "attr-dict";
}
71
// Pattern-descriptor op: registers vector-unrolling patterns targeting the
// native subgroup MMA shape. Custom assembly prints the shape as `[m, n, k]`.
def ApplyUnrollVectorsSubgroupMmaOp : Op<Transform_Dialect,
    "apply_patterns.gpu.unroll_vectors_subgroup_mma",
    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
  let description = [{
    Unrolls contractions to the target `m`, `n`, and `k` native vector size,
    along with other vector operations based on expected usage. `transfer_read`
    ops unroll based on the extract slice shape introduced by unrolling the
    contractions, while elementwise and `transfer_write` ops unroll to the shape of
    the C matrix (`m x n`).

    This operation applies to pure vector operations and should be applied before
    lowering to subgroup_mma ops.
  }];

  // Native MMA tile sizes: m/n are the C-matrix dims, k the contraction dim.
  let arguments = (ins I64Attr:$m,
                       I64Attr:$n,
                       I64Attr:$k);

  let assemblyFormat = [{
    `[` $m `,` $n `,` $k `]` attr-dict
  }];
}
94
// Pattern-descriptor op: registers patterns that remove gpu.barrier ops whose
// enforced memory-effect dependencies are already enforced elsewhere.
def EliminateBarriersOp :
  Op<Transform_Dialect, "apply_patterns.gpu.eliminate_barriers",
    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
  let description = [{
    Removes unnecessary GPU barriers from the function. If a barrier does not
    enforce any conflicting pair of memory effects, including a pair that is
    enforced by another barrier, it is unnecessary and can be removed.

    The approach is based on "High-Performance GPU-to-CPU Transpilation and
    Optimization via High-Level Parallel Constructs" by  Moses, Ivanov,
    Domke, Endo, Doerfert, and Zinenko in PPoPP 2023. Specifically, it
    analyzes the memory effects of the operations before and after the given
    barrier and checks if the barrier enforces any of the memory
    effect-induced dependencies that aren't already enforced by another
    barrier.

    For example, in the following code

    ```mlir
      store %A
      barrier  // enforces load-after-store
      load %A
      barrier  // load-after-store already enforced by the previous barrier
      load %A
    ```

    the second barrier can be removed.
  }];

  let assemblyFormat = [{ attr-dict }];
}
126
// Transform op: maps every gpu.thread-mapped scf.forall nested in a
// gpu.launch to GPU threads, consuming and re-producing the launch handle.
// Fixed a typo in the description ("`sync_after_distribute`attribute").
def MapNestedForallToThreads :
  Op<Transform_Dialect, "gpu.map_nested_forall_to_threads",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformEachOpTrait,
     TransformOpInterface]> {
  let description = [{
      Target the `gpu.launch op` and rewrite all `scf.forall` nested in it to
      distributed `gpu.thread_id` attribute.

      The operation searches for `scf.forall` ops nested under `target` and maps
      each such op to GPU threads.

      `scf.forall` induction variables are rewritten to `gpu.thread_id` according
      to the `mapping` attribute.

      Different types of mappings attributes are supported:
        - the block_dims is a list of integers that specifies the number of
          threads in each dimension. This is a mandatory attribute that is used
          to constrain the number of threads in each dimension. If an
          `scf.forall` op is mapped to fewer threads, predication occurs.
        - the warp_dims is a list of integers that specifies the number of
          warps in each dimension. This is an optional attribute that is used
          to constrain the number of warps in each dimension. When present, this
          attribute must be specified in a way that is compatible with the
          block_dims attribute. If an `scf.forall` op is mapped to fewer warps,
          predication occurs.

      Dynamic `scf.forall` trip counts are currently not supported.
      Dynamic block dim sizes are currently not supported.

      Only **bufferized** `scf.forall` are currently supported.
      Only `scf.forall` distributed to **at most 3 dimensions** are
      currently supported.

      The `sync_after_distribute` attribute controls whether a `gpu.barrier` is
      inserted after each scf.forall op. At this time, this is an all or nothing
      choice. This will need to be tightened in the future.

      The operation alters the block size of the given gpu_launch using the
      mandatory block_dims argument.

      #### Return modes:

      This operation ignores non-`gpu_launch` ops and drops them in the return.

      If any scf.forall with tensors is found, the transform definitely
      fails.

      If all the `scf.forall` operations with gpu.thread mapping contained
      within the `LaunchOp` referred to by the `target` handle lower to GPU
      properly, the transform succeeds. Otherwise the transform definitely
      fails.

      scf.forall operations with mappings other than gpu.thread are
      ignored.

      The returned handle points to the same LaunchOp operand, consuming it and
      producing a new SSA value to satisfy chaining and linearity of the IR
      properties.

      #### Example:

      ```
      gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
                 threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
        scf.forall (%i, %j) in (7, 9) {
          ... // body 1
        } {mapping = [#gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z>]}
        scf.forall (%i) in (12) {
          ... // body 2
        } {mapping = [#gpu.thread<x>]}
        gpu.terminator
      }
      ```

      is translated to:

      ```
      %bdimX = arith.constant 12 : index
      %bdimY = arith.constant 9 : index
      gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
             threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
        if (threadIdx.x < 9 && threadIdx.y < 7) {
          ... // body 1
        }
        gpu.barrier
        if (threadIdx.y < 1) {
          ... // body 2
        }
        gpu.barrier
        gpu.terminator
      }
      ```
    }];

  // $target is consumed (FunctionalStyleTransformOpTrait); block_dims is
  // required in practice, sync_after_distribute/warp_size have defaults.
  let arguments = (ins TransformHandleTypeInterface:$target,
                   DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$block_dims,
                   DefaultValuedAttr<BoolAttr, "true">:$sync_after_distribute,
                   DefaultValuedAttr<I64Attr, "32">:$warp_size);
  let results = (outs TransformHandleTypeInterface:$result);

  let assemblyFormat = [{
    $target
    `block_dims` `=` $block_dims
    (`sync_after_distribute` `=` $sync_after_distribute^)?
    (`warp_size` `=` $warp_size^)?
    attr-dict
    `:` functional-type($target, $result)
  }];
  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::Operation *target,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}
245
// Transform op: maps the top-level scf.forall in (or above) a gpu.launch to
// GPU blocks. Description fixes: `grid_dims` sets the *grid* size, not the
// block size (copy-paste from MapNestedForallToThreads), and a stray comma
// in "Dynamic, `scf.forall`".
def MapForallToBlocks :
  Op<Transform_Dialect, "gpu.map_forall_to_blocks",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformOpInterface,
     TransformEachOpTrait]> {
  let description = [{
    Target the gpu_launch op and rewrite the top level `scf.forall`
    to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute
    is set, then first generates `gpu_launch` and moves the top level
    `scf.forall` inside.

    The operation searches top level `scf.forall` ops under
    `gpu_launch` and maps each such op to GPU blocks. Mapping is
    one-to-one and the induction variables of `scf.forall` are
    rewritten to gpu.block_id according to the `thread_dim_mapping` attribute.

    Dynamic `scf.forall` trip counts are currently not supported.
    Dynamic block dim sizes are currently not supported.

    Only **bufferized** scf.forall are currently supported.
    Only scf.forall distributed to **at most 3 dimensions** are
    currently supported.

    The operation alters the grid size of the given gpu_launch using the
    grid_dims argument.

    #### Return modes:

    This operation ignores non-gpu_launch ops and drops them in the return.

    If any scf.forall with tensors is found, the transform definitely
    fails.

    If all the `scf.forall` operations contained within the LaunchOp
    referred to by the `target` handle lower to GPU properly, the
    transform succeeds. Otherwise the transform definitely fails.

    The returned handle points to the same LaunchOp operand, consuming it and
    producing a new SSA value to satisfy chaining and linearity of the IR
    properties.
  }];

  // $target is consumed; grid_dims is optional, generate_gpu_launch wraps a
  // bare top-level scf.forall in a fresh gpu.launch first.
  let arguments = (ins TransformHandleTypeInterface:$target,
                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$grid_dims,
                   UnitAttr:$generate_gpu_launch);
  let results = (outs TransformHandleTypeInterface:$result);

  let assemblyFormat = [{
    $target
    (`generate_gpu_launch` $generate_gpu_launch^)?
    (`grid_dims` `=` $grid_dims^)?
    attr-dict
    `:` functional-type($target, $result)
  }];
  let hasVerifier = 1;

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::Operation *target,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}
311
312#endif // GPU_TRANSFORM_OPS
313