//===- Utils.cpp - Utils for GPU transform ops ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/TransformOps/Utils.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"

using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::transform;
using namespace mlir::transform::gpu;

#define DEBUG_TYPE "gpu-transforms"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")
#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")

/// Return a flattened thread id for the workgroup with given sizes.
template <typename ThreadOrBlockIdOp>
static Value buildLinearId(RewriterBase &rewriter, Location loc,
                           ArrayRef<OpFoldResult> originalBasisOfr) {
  LLVM_DEBUG(llvm::interleaveComma(
                 originalBasisOfr,
                 DBGS() << "----buildLinearId with originalBasisOfr: ");
             llvm::dbgs() << "\n");
  assert(originalBasisOfr.size() == 3 && "expected 3 sizes");
  IndexType indexType = rewriter.getIndexType();
  AffineExpr tx, ty, tz, bdx, bdy;
  bindDims(rewriter.getContext(), tx, ty, tz);
  bindSymbols(rewriter.getContext(), bdx, bdy);
  SmallVector<OpFoldResult> vals{
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)
          .getResult(),
      originalBasisOfr[0], originalBasisOfr[1]};
  OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
      rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals);
  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
}

/// Create a linear id builder that takes the `originalBasisOfr` and decomposes
/// it in the basis of `forallMappingSizes`. The linear id builder returns an
/// n-D vector of ids for indexing and 1-D size + id for predicate generation.
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    SmallVector<OpFoldResult> originalBasisOfr =
        getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
    OpFoldResult linearId =
        buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);
    // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
    // "row-major" order.
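    // For example (illustrative values only): forallMappingSizes = {4, 8, 2}
    // reverses to {2, 8, 4}, for which computeStrides returns {32, 4, 1}.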
    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    OpFoldResult scaledLinearId = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {linearId});
    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
    SmallVector<Value> ids;
    // Reverse back to be in [0 .. n] order.
    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
      ids.push_back(
          affine::makeComposedAffineApply(rewriter, loc, e, {scaledLinearId}));
    }

    // clang-format off
    LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
                                     DBGS() << "--delinearization basis: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(strides,
                                     DBGS() << "--delinearization strides: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(delinearizingExprs,
                                     DBGS() << "--delinearization exprs: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(ids, DBGS() << "--ids: ");
               llvm::dbgs() << "\n";);
    // clang-format on

    // Return n-D ids for indexing and 1-D size + id for predicate generation.
    return IdBuilderResult{
        /*mappingIdOps=*/ids,
        /*availableMappingSizes=*/
        SmallVector<int64_t>{computeProduct(originalBasis)},
        // `forallMappingSizes` iterate in the scaled basis, they need to be
        // scaled back into the original basis to provide tight
        // activeMappingSizes quantities for predication.
        /*activeMappingSizes=*/
        SmallVector<int64_t>{computeProduct(forallMappingSizes) * multiplicity},
        /*activeIdOps=*/SmallVector<Value>{cast<Value>(linearId)}};
  };

  return res;
}

/// Create a simple 3-D id builder that takes the `originalBasisOfr`.
/// The 3-D id builder returns a 3-D vector of ids for indexing and 3-D sizes
/// + ids for predicate generation.
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    IndexType indexType = rewriter.getIndexType();
    SmallVector<Value> ids{
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)};
    // In the 3-D mapping case, scale the first dimension by the multiplicity.
    SmallVector<Value> scaledIds = ids;
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    scaledIds[0] = cast<Value>(affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {scaledIds[0]}));
    // In the 3-D mapping case, unscale the first dimension by the multiplicity.
    SmallVector<int64_t> forallMappingSizeInOriginalBasis(forallMappingSizes);
    forallMappingSizeInOriginalBasis[0] *= multiplicity;
    return IdBuilderResult{
        /*mappingIdOps=*/scaledIds,
        /*availableMappingSizes=*/SmallVector<int64_t>{originalBasis},
        // `forallMappingSizes` iterate in the scaled basis, they need to be
        // scaled back into the original basis to provide tight
        // activeMappingSizes quantities for predication.
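        // For example (illustrative, assuming a warp mapping with
        // multiplicity = warpSize = 32): forallMappingSizes = {2, 1, 1} in the
        // scaled basis becomes active sizes {64, 1, 1} in the thread basis.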
        /*activeMappingSizes=*/
        SmallVector<int64_t>{forallMappingSizeInOriginalBasis},
        /*activeIdOps=*/ids};
  };
  return res;
}

namespace mlir {
namespace transform {
namespace gpu {

GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                           const MappingIdBuilderFnType &fn)
    : mappingAttributes(), idBuilder() {
  if (useLinearMapping) {
    for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0),
                  e = getMaxEnumValForMappingId();
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  } else {
    for (uint64_t d = static_cast<uint64_t>(MappingId::DimX),
                  e = static_cast<uint64_t>(MappingId::DimZ);
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  }
}

GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUBlockMappingAttr::get(ctx, id);
      }) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<BlockIdOp>(/*multiplicity=*/1)
                  : common3DIdBuilderFn<BlockIdOp>(/*multiplicity=*/1);
}

GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                             bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping,
                   [](MLIRContext *ctx, MappingId id) {
                     return GPUWarpgroupMappingAttr::get(ctx, id);
                   }),
      warpSize(warpSize) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize)
                  : common3DIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize);
}

GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                   bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping,
                   [](MLIRContext *ctx, MappingId id) {
                     return GPUWarpMappingAttr::get(ctx, id);
                   }),
      warpSize(warpSize) {
  idBuilder =
      useLinearMapping
          ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize)
          : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize);
}

GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUThreadMappingAttr::get(ctx, id);
      }) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1)
                  : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1);
}

DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                           std::optional<int64_t> gridDimX,
                                           std::optional<int64_t> gridDimY,
                                           std::optional<int64_t> gridDimZ,
                                           std::optional<int64_t> blockDimX,
                                           std::optional<int64_t> blockDimY,
                                           std::optional<int64_t> blockDimZ) {
  // TODO: pass a configuration object to set the limits properly.
  static constexpr int maxTotalBlockdim = 1024;
  static constexpr int maxBlockdimx = 1024;
  static constexpr int maxBlockdimy = 1024;
  static constexpr int maxBlockdimz = 64;
  static constexpr int maxTotalGriddim = 2147483647;
  static constexpr int maxGriddimx = 2147483647;
  static constexpr int maxGriddimy = 65535;
  static constexpr int maxGriddimz = 65535;

  if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
          maxTotalBlockdim ||
      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
          maxTotalGriddim ||
      blockDimX.value_or(1) > maxBlockdimx ||
      blockDimY.value_or(1) > maxBlockdimy ||
      blockDimZ.value_or(1) > maxBlockdimz ||
      gridDimY.value_or(1) > maxGriddimy ||
      gridDimZ.value_or(1) > maxGriddimz ||
      gridDimX.value_or(1) > maxGriddimx) {
    return transformOp.emitSilenceableError()
           << "Trying to launch a GPU kernel with grid_dims = ("
           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
           << gridDimZ.value_or(1) << ") block_dims = ("
           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
           << blockDimZ.value_or(1) << "). It is larger than the limits.";
  }
  return DiagnosedSilenceableFailure::success();
}

/// Create an empty gpu.launch op with the requested grid and block sizes
/// (defaulting each missing dimension to 1) and insert a terminator.
DiagnosedSilenceableFailure createGpuLaunch(
    RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
    LaunchOp &launchOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  auto createConst = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
  };
  OpBuilder::InsertionGuard guard(rewriter);
  Value one = createConst(1);
  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
                                       blkSizeX, blkSizeY, blkSizeZ);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<TerminatorOp>(loc);
  return DiagnosedSilenceableFailure::success();
}

/// Alter kernel configuration of the given kernel.
DiagnosedSilenceableFailure alterGpuLaunch(
    RewriterBase &rewriter, LaunchOp gpuLaunch,
    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
  auto createConstValue = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
                                                   dim);
  };

  if (gridDimX.has_value())
    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
  if (gridDimY.has_value())
    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
  if (gridDimZ.has_value())
    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
  if (blockDimX.has_value())
    gpuLaunch.getBlockSizeXMutable().assign(
        createConstValue(blockDimX.value()));
  if (blockDimY.has_value())
    gpuLaunch.getBlockSizeYMutable().assign(
        createConstValue(blockDimY.value()));
  if (blockDimZ.has_value())
    gpuLaunch.getBlockSizeZMutable().assign(
        createConstValue(blockDimZ.value()));
  return DiagnosedSilenceableFailure::success();
}

} // namespace gpu
} // namespace transform
} // namespace mlir