1 //===- SCFToGPUPass.cpp - Convert a loop nest to a GPU kernel -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" 10 #include "../PassDetail.h" 11 #include "mlir/Conversion/SCFToGPU/SCFToGPU.h" 12 #include "mlir/Dialect/Affine/IR/AffineOps.h" 13 #include "mlir/Dialect/GPU/GPUDialect.h" 14 #include "mlir/Dialect/SCF/SCF.h" 15 #include "mlir/Dialect/StandardOps/IR/Ops.h" 16 #include "mlir/Transforms/DialectConversion.h" 17 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/Support/CommandLine.h" 20 21 #define PASS_NAME "convert-scf-to-gpu" 22 #define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu" 23 24 using namespace mlir; 25 using namespace mlir::scf; 26 27 namespace { 28 // A pass that traverses top-level loops in the function and converts them to 29 // GPU launch operations. Nested launches are not allowed, so this does not 30 // walk the function recursively to avoid considering nested loops. 31 struct ForLoopMapper : public ConvertSimpleSCFToGPUBase<ForLoopMapper> { 32 ForLoopMapper() = default; 33 ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims) { 34 this->numBlockDims = numBlockDims; 35 this->numThreadDims = numThreadDims; 36 } 37 38 void runOnFunction() override { 39 for (Operation &op : llvm::make_early_inc_range(getFunction().getOps())) { 40 if (auto forOp = dyn_cast<AffineForOp>(&op)) { 41 if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims, 42 numThreadDims))) 43 signalPassFailure(); 44 } else if (auto forOp = dyn_cast<ForOp>(&op)) { 45 if (failed( 46 convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims))) 47 signalPassFailure(); 48 } 49 } 50 } 51 }; 52 53 // A pass that traverses top-level loops in the function and convertes them to 54 // GPU launch operations. The top-level loops itself does not have to be 55 // perfectly nested. The only requirement is that there be as many perfectly 56 // nested loops as the size of `numWorkGroups`. Within these any loop nest has 57 // to be perfectly nested upto depth equal to size of `workGroupSize`. 58 struct ImperfectlyNestedForLoopMapper 59 : public ConvertSCFToGPUBase<ImperfectlyNestedForLoopMapper> { 60 ImperfectlyNestedForLoopMapper() = default; 61 ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups, 62 ArrayRef<int64_t> workGroupSize) { 63 this->numWorkGroups = numWorkGroups; 64 this->workGroupSize = workGroupSize; 65 } 66 67 void runOnFunction() override { 68 // Insert the num work groups and workgroup sizes as constant values. This 69 // pass is only used for testing. 70 FuncOp funcOp = getFunction(); 71 OpBuilder builder(funcOp.getOperation()->getRegion(0)); 72 SmallVector<Value, 3> numWorkGroupsVal, workGroupSizeVal; 73 for (auto val : numWorkGroups) { 74 auto constOp = builder.create<ConstantOp>( 75 funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val)); 76 numWorkGroupsVal.push_back(constOp); 77 } 78 for (auto val : workGroupSize) { 79 auto constOp = builder.create<ConstantOp>( 80 funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val)); 81 workGroupSizeVal.push_back(constOp); 82 } 83 for (ForOp forOp : llvm::make_early_inc_range(funcOp.getOps<ForOp>())) { 84 if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal, 85 workGroupSizeVal))) { 86 return signalPassFailure(); 87 } 88 } 89 } 90 }; 91 92 struct ParallelLoopToGpuPass 93 : public ConvertParallelLoopToGpuBase<ParallelLoopToGpuPass> { 94 void runOnOperation() override { 95 OwningRewritePatternList patterns; 96 populateParallelLoopToGPUPatterns(patterns, &getContext()); 97 ConversionTarget target(getContext()); 98 target.addLegalDialect<StandardOpsDialect>(); 99 target.addLegalDialect<AffineDialect>(); 100 target.addLegalDialect<gpu::GPUDialect>(); 101 target.addLegalDialect<scf::SCFDialect>(); 102 target.addIllegalOp<scf::ParallelOp>(); 103 if (failed(applyPartialConversion(getOperation(), target, patterns))) 104 signalPassFailure(); 105 } 106 }; 107 108 } // namespace 109 110 std::unique_ptr<OperationPass<FuncOp>> 111 mlir::createSimpleSCFToGPUPass(unsigned numBlockDims, unsigned numThreadDims) { 112 return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims); 113 } 114 std::unique_ptr<OperationPass<FuncOp>> mlir::createSimpleSCFToGPUPass() { 115 return std::make_unique<ForLoopMapper>(); 116 } 117 118 std::unique_ptr<OperationPass<FuncOp>> 119 mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups, 120 ArrayRef<int64_t> workGroupSize) { 121 return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups, 122 workGroupSize); 123 } 124 std::unique_ptr<OperationPass<FuncOp>> mlir::createLoopToGPUPass() { 125 return std::make_unique<ImperfectlyNestedForLoopMapper>(); 126 } 127 128 std::unique_ptr<Pass> mlir::createParallelLoopToGpuPass() { 129 return std::make_unique<ParallelLoopToGpuPass>(); 130 } 131