1 //===- Utils.cpp - Utils for GPU transform ops ----------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "mlir/Dialect/GPU/TransformOps/Utils.h" 10 11 #include "mlir/Dialect/Affine/IR/AffineOps.h" 12 #include "mlir/Dialect/Arith/IR/Arith.h" 13 #include "mlir/Dialect/Func/IR/FuncOps.h" 14 #include "mlir/Dialect/GPU/IR/GPUDialect.h" 15 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" 16 #include "mlir/Dialect/MemRef/IR/MemRef.h" 17 #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" 18 #include "mlir/Dialect/SCF/IR/SCF.h" 19 #include "mlir/Dialect/Transform/IR/TransformDialect.h" 20 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" 21 #include "mlir/Dialect/Utils/IndexingUtils.h" 22 #include "mlir/Dialect/Vector/IR/VectorOps.h" 23 #include "mlir/IR/AffineExpr.h" 24 #include "mlir/IR/Builders.h" 25 #include "mlir/IR/BuiltinAttributes.h" 26 #include "mlir/IR/IRMapping.h" 27 #include "mlir/IR/MLIRContext.h" 28 #include "mlir/IR/OpDefinition.h" 29 #include "mlir/IR/Value.h" 30 #include "mlir/IR/Visitors.h" 31 #include "mlir/Support/LLVM.h" 32 #include "llvm/ADT/STLExtras.h" 33 #include "llvm/ADT/SmallVector.h" 34 #include "llvm/ADT/TypeSwitch.h" 35 #include "llvm/Support/Debug.h" 36 37 using namespace mlir; 38 using namespace mlir::gpu; 39 using namespace mlir::transform; 40 using namespace mlir::transform::gpu; 41 42 #define DEBUG_TYPE "gpu-transforms" 43 44 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ") 45 #define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n") 46 #define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ") 47 48 /// Return a flattened thread id for the workgroup with given sizes. 49 template <typename ThreadOrBlockIdOp> 50 static Value buildLinearId(RewriterBase &rewriter, Location loc, 51 ArrayRef<OpFoldResult> originalBasisOfr) { 52 LLVM_DEBUG(llvm::interleaveComma( 53 originalBasisOfr, 54 DBGS() << "----buildLinearId with originalBasisOfr: "); 55 llvm::dbgs() << "\n"); 56 assert(originalBasisOfr.size() == 3 && "expected 3 sizes"); 57 IndexType indexType = rewriter.getIndexType(); 58 AffineExpr tx, ty, tz, bdx, bdy; 59 bindDims(rewriter.getContext(), tx, ty, tz); 60 bindSymbols(rewriter.getContext(), bdx, bdy); 61 SmallVector<OpFoldResult> vals{ 62 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x) 63 .getResult(), 64 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y) 65 .getResult(), 66 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z) 67 .getResult(), 68 originalBasisOfr[0], originalBasisOfr[1]}; 69 OpFoldResult ofr = affine::makeComposedFoldedAffineApply( 70 rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals); 71 return getValueOrCreateConstantIndexOp(rewriter, loc, ofr); 72 } 73 74 /// Create a linear id builder that takes the `originalBasisOfr` and decompose 75 /// it in the basis of `forallMappingSizes`. The linear id builder returns an 76 /// n-D vector of ids for indexing and 1-D size + id for predicate generation. 77 template <typename ThreadOrBlockIdOp> 78 static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity = 1) { 79 auto res = [multiplicity](RewriterBase &rewriter, Location loc, 80 ArrayRef<int64_t> forallMappingSizes, 81 ArrayRef<int64_t> originalBasis) { 82 SmallVector<OpFoldResult> originalBasisOfr = 83 getAsIndexOpFoldResult(rewriter.getContext(), originalBasis); 84 OpFoldResult linearId = 85 buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr); 86 // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in 87 // "row-major" order. 88 SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes)); 89 SmallVector<int64_t> strides = computeStrides(reverseBasisSizes); 90 AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext()); 91 OpFoldResult scaledLinearId = affine::makeComposedFoldedAffineApply( 92 rewriter, loc, d0.floorDiv(multiplicity), {linearId}); 93 SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides); 94 SmallVector<Value> ids; 95 // Reverse back to be in [0 .. n] order. 96 for (AffineExpr e : llvm::reverse(delinearizingExprs)) { 97 ids.push_back( 98 affine::makeComposedAffineApply(rewriter, loc, e, {scaledLinearId})); 99 } 100 101 // clang-format off 102 LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes, 103 DBGS() << "--delinearization basis: "); 104 llvm::dbgs() << "\n"; 105 llvm::interleaveComma(strides, 106 DBGS() << "--delinearization strides: "); 107 llvm::dbgs() << "\n"; 108 llvm::interleaveComma(delinearizingExprs, 109 DBGS() << "--delinearization exprs: "); 110 llvm::dbgs() << "\n"; 111 llvm::interleaveComma(ids, DBGS() << "--ids: "); 112 llvm::dbgs() << "\n";); 113 // clang-format on 114 115 // Return n-D ids for indexing and 1-D size + id for predicate generation. 116 return IdBuilderResult{ 117 /*mappingIdOps=*/ids, 118 /*availableMappingSizes=*/ 119 SmallVector<int64_t>{computeProduct(originalBasis)}, 120 // `forallMappingSizes` iterate in the scaled basis, they need to be 121 // scaled back into the original basis to provide tight 122 // activeMappingSizes quantities for predication. 123 /*activeMappingSizes=*/ 124 SmallVector<int64_t>{computeProduct(forallMappingSizes) * 125 multiplicity}, 126 /*activeIdOps=*/SmallVector<Value>{cast<Value>(linearId)}}; 127 }; 128 129 return res; 130 } 131 132 /// Create a simple 3-D id builder that takes the `originalBasisOfr` 133 /// The 3-D id builder returns a 3-D vector of ids for indexing and 3-D sizes 134 /// + ids for predicate generation. 135 template <typename ThreadOrBlockIdOp> 136 static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) { 137 auto res = [multiplicity](RewriterBase &rewriter, Location loc, 138 ArrayRef<int64_t> forallMappingSizes, 139 ArrayRef<int64_t> originalBasis) { 140 IndexType indexType = rewriter.getIndexType(); 141 SmallVector<Value> ids{ 142 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x), 143 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y), 144 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)}; 145 // In the 3-D mapping case, scale the first dimension by the multiplicity. 146 SmallVector<Value> scaledIds = ids; 147 AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext()); 148 scaledIds[0] = cast<Value>(affine::makeComposedFoldedAffineApply( 149 rewriter, loc, d0.floorDiv(multiplicity), {scaledIds[0]})); 150 // In the 3-D mapping case, unscale the first dimension by the multiplicity. 151 SmallVector<int64_t> forallMappingSizeInOriginalBasis(forallMappingSizes); 152 forallMappingSizeInOriginalBasis[0] *= multiplicity; 153 return IdBuilderResult{ 154 /*mappingIdOps=*/scaledIds, 155 /*availableMappingSizes=*/SmallVector<int64_t>{originalBasis}, 156 // `forallMappingSizes` iterate in the scaled basis, they need to be 157 // scaled back into the original basis to provide tight 158 // activeMappingSizes quantities for predication. 159 /*activeMappingSizes=*/ 160 SmallVector<int64_t>{forallMappingSizeInOriginalBasis}, 161 /*activeIdOps=*/ids}; 162 }; 163 return res; 164 } 165 166 namespace mlir { 167 namespace transform { 168 namespace gpu { 169 170 GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping, 171 const MappingIdBuilderFnType &fn) 172 : mappingAttributes(), idBuilder() { 173 if (useLinearMapping) { 174 for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0), 175 e = getMaxEnumValForMappingId(); 176 d <= e; ++d) 177 mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value())); 178 } else { 179 for (uint64_t d = static_cast<uint64_t>(MappingId::DimX), 180 e = static_cast<uint64_t>(MappingId::DimZ); 181 d <= e; ++d) 182 mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value())); 183 } 184 } 185 186 GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping) 187 : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) { 188 return GPUBlockMappingAttr::get(ctx, id); 189 }) { 190 idBuilder = useLinearMapping 191 ? commonLinearIdBuilderFn<BlockIdOp>(/*multiplicity=*/1) 192 : common3DIdBuilderFn<BlockIdOp>(/*multiplicity=*/1); 193 } 194 195 GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize, 196 bool useLinearMapping) 197 : GpuIdBuilder(ctx, useLinearMapping, 198 [](MLIRContext *ctx, MappingId id) { 199 return GPUWarpgroupMappingAttr::get(ctx, id); 200 }), 201 warpSize(warpSize) { 202 idBuilder = useLinearMapping 203 ? commonLinearIdBuilderFn<ThreadIdOp>( 204 /*multiplicity=*/kNumWarpsPerGroup * warpSize) 205 : common3DIdBuilderFn<ThreadIdOp>( 206 /*multiplicity=*/kNumWarpsPerGroup * warpSize); 207 } 208 209 GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize, 210 bool useLinearMapping) 211 : GpuIdBuilder(ctx, useLinearMapping, 212 [](MLIRContext *ctx, MappingId id) { 213 return GPUWarpMappingAttr::get(ctx, id); 214 }), 215 warpSize(warpSize) { 216 idBuilder = 217 useLinearMapping 218 ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize) 219 : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize); 220 } 221 222 GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping) 223 : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) { 224 return GPUThreadMappingAttr::get(ctx, id); 225 }) { 226 idBuilder = useLinearMapping 227 ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1) 228 : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1); 229 } 230 231 DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp, 232 std::optional<int64_t> gridDimX, 233 std::optional<int64_t> gridDimY, 234 std::optional<int64_t> gridDimZ, 235 std::optional<int64_t> blockDimX, 236 std::optional<int64_t> blockDimY, 237 std::optional<int64_t> blockDimZ) { 238 239 // TODO: pass a configuration object to set the limits properly. 240 static constexpr int maxTotalBlockdim = 1024; 241 static constexpr int maxBlockdimx = 1024; 242 static constexpr int maxBlockdimy = 1024; 243 static constexpr int maxBlockdimz = 64; 244 static constexpr int maxTotalGriddim = 2147483647; 245 static constexpr int maxGriddimx = 2147483647; 246 static constexpr int maxGriddimy = 65535; 247 static constexpr int maxGriddimz = 65535; 248 249 if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > 250 maxTotalBlockdim || 251 (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) > 252 maxTotalGriddim || 253 blockDimX.value_or(1) > maxBlockdimx || 254 blockDimY.value_or(1) > maxBlockdimy || 255 blockDimZ.value_or(1) > maxBlockdimz || 256 gridDimY.value_or(1) > maxGriddimy || 257 gridDimZ.value_or(1) > maxGriddimz || 258 gridDimX.value_or(1) > maxGriddimx) { 259 return transformOp.emitSilenceableError() 260 << "Trying to launch a GPU kernel with grid_dims = (" 261 << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", " 262 << gridDimZ.value_or(1) << ") block_dims = (" 263 << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", " 264 << blockDimZ.value_or(1) << "). It is larger than the limits."; 265 } 266 return DiagnosedSilenceableFailure::success(); 267 } 268 269 DiagnosedSilenceableFailure createGpuLaunch( 270 RewriterBase &rewriter, Location loc, TransformOpInterface transformOp, 271 LaunchOp &launchOp, std::optional<int64_t> gridDimX, 272 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ, 273 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY, 274 std::optional<int64_t> blockDimZ) { 275 DiagnosedSilenceableFailure diag = 276 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, 277 blockDimY, blockDimZ); 278 if (!diag.succeeded()) 279 return diag; 280 281 auto createConst = [&](int dim) { 282 return rewriter.create<arith::ConstantIndexOp>(loc, dim); 283 }; 284 OpBuilder::InsertionGuard guard(rewriter); 285 Value one = createConst(1); 286 Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one; 287 Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one; 288 Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one; 289 Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one; 290 Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one; 291 Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one; 292 launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ, 293 blkSizeX, blkSizeY, blkSizeZ); 294 rewriter.setInsertionPointToEnd(&launchOp.getBody().front()); 295 rewriter.create<TerminatorOp>(loc); 296 return DiagnosedSilenceableFailure::success(); 297 } 298 299 /// Alter kernel configuration of the given kernel. 300 DiagnosedSilenceableFailure alterGpuLaunch( 301 RewriterBase &rewriter, LaunchOp gpuLaunch, 302 TransformOpInterface transformOp, std::optional<int64_t> gridDimX, 303 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ, 304 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY, 305 std::optional<int64_t> blockDimZ) { 306 DiagnosedSilenceableFailure diag = 307 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, 308 blockDimY, blockDimZ); 309 if (!diag.succeeded()) 310 return diag; 311 312 KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues(); 313 OpBuilder::InsertionGuard guard(rewriter); 314 rewriter.setInsertionPointAfterValue(currentBlockdim.x); 315 auto createConstValue = [&](int dim) { 316 return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(), 317 dim); 318 }; 319 320 if (gridDimX.has_value()) 321 gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value())); 322 if (gridDimY.has_value()) 323 gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value())); 324 if (gridDimZ.has_value()) 325 gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value())); 326 if (blockDimX.has_value()) 327 gpuLaunch.getBlockSizeXMutable().assign( 328 createConstValue(blockDimX.value())); 329 if (blockDimY.has_value()) 330 gpuLaunch.getBlockSizeYMutable().assign( 331 createConstValue(blockDimY.value())); 332 if (blockDimZ.has_value()) 333 gpuLaunch.getBlockSizeZMutable().assign( 334 createConstValue(blockDimZ.value())); 335 return DiagnosedSilenceableFailure::success(); 336 } 337 338 } // namespace gpu 339 } // namespace transform 340 } // namespace mlir 341