//===- GPUTransformOps.td - GPU transform ops --------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef GPU_TRANSFORM_OPS
#define GPU_TRANSFORM_OPS

include "mlir/Dialect/Transform/IR/TransformDialect.td"
include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"

//===----------------------------------------------------------------------===//
// Apply...ConversionPatternsOp
//===----------------------------------------------------------------------===//

def ApplyGPUToNVVMConversionPatternsOp : Op<Transform_Dialect,
    "apply_conversion_patterns.gpu.gpu_to_nvvm",
    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
                               ["verifyTypeConverter"]>]> {
  let description = [{
    Collects patterns that convert GPU dialect ops to NVVM dialect ops. These
    patterns require an "LLVMTypeConverter".
  }];
  let assemblyFormat = "attr-dict";
}

def ApplyGPUWwmaToNVVMConversionPatternsOp : Op<Transform_Dialect,
    "apply_conversion_patterns.gpu.gpu_wmma_to_nvvm",
    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
                               ["verifyTypeConverter"]>]> {
  let description = [{
    Collects patterns that convert GPU dialect ops related to wmma ops
    to NVVM dialect ops.
    These patterns require an "LLVMTypeConverter".
  }];
  let assemblyFormat = "attr-dict";
}

def ApplyGPUSubgroupReduceToNVVMConversionPatternsOp : Op<Transform_Dialect,
    "apply_conversion_patterns.gpu.gpu_subgroup_reduce_to_nvvm",
    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
                               ["verifyTypeConverter"]>]> {
  let description = [{
    Collects patterns that convert GPU dialect ops related to subgroup reduce
    ops to NVVM dialect ops.
    These patterns require an "LLVMTypeConverter".
  }];
  let assemblyFormat = "attr-dict";
}

//===----------------------------------------------------------------------===//
// Apply...PatternsOp
//===----------------------------------------------------------------------===//

def ApplyGPURewritePatternsOp : Op<Transform_Dialect,
    "apply_patterns.gpu.gpu_rewrite_patterns",
    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
  let description = [{
    Collects GPU rewrite patterns comprising:
      1. GpuAllReduceRewrite patterns
      2. GpuGlobalIdRewriter patterns
      3. GpuShuffleRewriter patterns
  }];
  let assemblyFormat = "attr-dict";
}

def ApplyUnrollVectorsSubgroupMmaOp : Op<Transform_Dialect,
    "apply_patterns.gpu.unroll_vectors_subgroup_mma",
    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
  let description = [{
    Unrolls contractions to the target `m`, `n`, and `k` native vector size,
    along with other vector operations based on expected usage. `transfer_read`
    ops unroll based on the extract slice shape introduced by unrolling the
    contractions, while elementwise and `transfer_write` ops unroll to the
    shape of the C matrix (`m x n`).

    This operation applies to pure vector operations and should be applied
    before lowering to subgroup_mma ops.
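
    For example, to unroll towards a native 16x16x16 `subgroup_mma` shape,
    these patterns could be applied as follows (the `%func` handle and the
    chosen sizes are illustrative only):

    ```mlir
    transform.apply_patterns to %func {
      transform.apply_patterns.gpu.unroll_vectors_subgroup_mma [16, 16, 16]
    } : !transform.any_op
    ```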
  }];

  let arguments = (ins I64Attr:$m,
                       I64Attr:$n,
                       I64Attr:$k);

  let assemblyFormat = [{
    `[` $m `,` $n `,` $k `]` attr-dict
  }];
}

def EliminateBarriersOp :
    Op<Transform_Dialect, "apply_patterns.gpu.eliminate_barriers",
       [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
  let description = [{
    Removes unnecessary GPU barriers from the function. If a barrier does not
    enforce any conflicting pair of memory effects, including a pair that is
    enforced by another barrier, it is unnecessary and can be removed.

    The approach is based on "High-Performance GPU-to-CPU Transpilation and
    Optimization via High-Level Parallel Constructs" by Moses, Ivanov,
    Domke, Endo, Doerfert, and Zinenko in PPoPP 2023. Specifically, it
    analyzes the memory effects of the operations before and after the given
    barrier and checks if the barrier enforces any of the memory
    effect-induced dependencies that aren't already enforced by another
    barrier.

    For example, in the following code

    ```mlir
    store %A
    barrier // enforces load-after-store
    load %A
    barrier // load-after-store already enforced by the previous barrier
    load %A
    ```

    the second barrier can be removed.
  }];

  let assemblyFormat = [{ attr-dict }];
}

def MapNestedForallToThreads :
    Op<Transform_Dialect, "gpu.map_nested_forall_to_threads",
       [FunctionalStyleTransformOpTrait,
        MemoryEffectsOpInterface,
        TransformEachOpTrait,
        TransformOpInterface]> {
  let description = [{
    Target the `gpu.launch` op and rewrite all `scf.forall` ops nested in it
    to distributed `gpu.thread_id` attributes.

    The operation searches for `scf.forall` ops nested under `target` and maps
    each such op to GPU threads.

    `scf.forall` induction variables are rewritten to `gpu.thread_id` according
    to the `mapping` attribute.

    Different types of mapping attributes are supported:
      - `block_dims` is a list of integers that specifies the number of
        threads in each dimension. This is a mandatory attribute that is used
        to constrain the number of threads in each dimension. If an
        `scf.forall` op is mapped to fewer threads, predication occurs.
      - `warp_dims` is a list of integers that specifies the number of
        warps in each dimension. This is an optional attribute that is used
        to constrain the number of warps in each dimension. When present, this
        attribute must be specified in a way that is compatible with the
        `block_dims` attribute. If an `scf.forall` op is mapped to fewer warps,
        predication occurs.

    Dynamic `scf.forall` trip counts are currently not supported.
    Dynamic block dim sizes are currently not supported.

    Only **bufferized** `scf.forall` ops are currently supported.
    Only `scf.forall` ops distributed to **at most 3 dimensions** are
    currently supported.

    The `sync_after_distribute` attribute controls whether a `gpu.barrier` is
    inserted after each `scf.forall` op. At this time, this is an
    all-or-nothing choice. This will need to be tightened in the future.

    The operation alters the block size of the given `gpu.launch` using the
    mandatory `block_dims` argument.

    #### Return modes:

    This operation ignores non-`gpu.launch` ops and drops them in the return.

    If any `scf.forall` with tensors is found, the transform definitely
    fails.

    If all the `scf.forall` operations with gpu.thread mapping contained
    within the `LaunchOp` referred to by the `target` handle lower to GPU
    properly, the transform succeeds. Otherwise the transform definitely
    fails.

    `scf.forall` operations with mappings other than gpu.thread are
    ignored.

    The returned handle points to the same LaunchOp operand, consuming it and
    producing a new SSA value to satisfy the chaining and linearity properties
    of the IR.

    #### Example:

    ```
    gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
               threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
      scf.forall (%i, %j) in (7, 9) {
        ... // body 1
      } {mapping = [#gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z>]}
      scf.forall (%i) in (12) {
        ... // body 2
      } {mapping = [#gpu.thread<x>]}
      gpu.terminator
    }
    ```

    is translated to:

    ```
    %bdimX = arith.constant 12 : index
    %bdimY = arith.constant 9 : index
    gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
               threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
      if (threadIdx.x < 7 && threadIdx.y < 9) {
        ... // body 1
      }
      gpu.barrier
      if (threadIdx.y < 1) {
        ... // body 2
      }
      gpu.barrier
      gpu.terminator
    }
    ```
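
    A possible invocation from the transform dialect, assuming `%gpu_launch`
    is a handle to a `gpu.launch` op (the handle name and block dimensions are
    illustrative only):

    ```mlir
    %mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch
        block_dims = [12, 9, 1] : (!transform.any_op) -> !transform.any_op
    ```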
  }];

  let arguments = (ins TransformHandleTypeInterface:$target,
                   DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$block_dims,
                   DefaultValuedAttr<BoolAttr, "true">:$sync_after_distribute,
                   DefaultValuedAttr<I64Attr, "32">:$warp_size);
  let results = (outs TransformHandleTypeInterface:$result);

  let assemblyFormat = [{
    $target
    `block_dims` `=` $block_dims
    (`sync_after_distribute` `=` $sync_after_distribute^)?
    (`warp_size` `=` $warp_size^)?
    attr-dict
    `:` functional-type($target, $result)
  }];
  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::Operation *target,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

def MapForallToBlocks :
    Op<Transform_Dialect, "gpu.map_forall_to_blocks",
       [FunctionalStyleTransformOpTrait,
        MemoryEffectsOpInterface,
        TransformOpInterface,
        TransformEachOpTrait]> {
  let description = [{
    Target the `gpu.launch` op and rewrite the top-level `scf.forall` to
    distributed `gpu.block_id` attributes. If the `generate_gpu_launch`
    attribute is set, a `gpu.launch` op is first generated and the top-level
    `scf.forall` is moved inside it.

    The operation searches for top-level `scf.forall` ops under the
    `gpu.launch` and maps each such op to GPU blocks. Mapping is
    one-to-one and the induction variables of `scf.forall` are
    rewritten to `gpu.block_id` according to the `mapping` attribute.

    Dynamic `scf.forall` trip counts are currently not supported.
    Dynamic block dim sizes are currently not supported.

    Only **bufferized** `scf.forall` ops are currently supported.
    Only `scf.forall` ops distributed to **at most 3 dimensions** are
    currently supported.

    The operation alters the grid size of the given `gpu.launch` using the
    `grid_dims` argument.

    #### Return modes:

    This operation ignores non-`gpu.launch` ops and drops them in the return.

    If any `scf.forall` with tensors is found, the transform definitely
    fails.

    If all the `scf.forall` operations contained within the LaunchOp
    referred to by the `target` handle lower to GPU properly, the
    transform succeeds. Otherwise the transform definitely fails.

    The returned handle points to the same LaunchOp operand, consuming it and
    producing a new SSA value to satisfy the chaining and linearity properties
    of the IR.
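
    A possible invocation from the transform dialect, assuming `%func` is a
    handle to a function containing a top-level `scf.forall` (the handle name
    and grid dimensions are illustrative only):

    ```mlir
    %gpu_launch = transform.gpu.map_forall_to_blocks %func
        generate_gpu_launch grid_dims = [16, 16, 1]
        : (!transform.any_op) -> !transform.any_op
    ```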
  }];

  let arguments = (ins TransformHandleTypeInterface:$target,
                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$grid_dims,
                   UnitAttr:$generate_gpu_launch);
  let results = (outs TransformHandleTypeInterface:$result);

  let assemblyFormat = [{
    $target
    (`generate_gpu_launch` $generate_gpu_launch^)?
    (`grid_dims` `=` $grid_dims^)?
    attr-dict
    `:` functional-type($target, $result)
  }];
  let hasVerifier = 1;

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::Operation *target,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

#endif // GPU_TRANSFORM_OPS