xref: /llvm-project/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp (revision 4ead2cf76c4a1df260e7dff0fa767074bae6e2b8)
1 //===- SCFToGPUPass.cpp - Convert a loop nest to a GPU kernel -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h"
10 #include "../PassDetail.h"
11 #include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
12 #include "mlir/Dialect/Affine/IR/AffineOps.h"
13 #include "mlir/Dialect/GPU/GPUDialect.h"
14 #include "mlir/Dialect/SCF/SCF.h"
15 #include "mlir/Dialect/StandardOps/IR/Ops.h"
16 #include "mlir/Transforms/DialectConversion.h"
17 
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/Support/CommandLine.h"
20 
21 #define PASS_NAME "convert-scf-to-gpu"
22 #define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"
23 
24 using namespace mlir;
25 using namespace mlir::scf;
26 
27 namespace {
28 // A pass that traverses top-level loops in the function and converts them to
29 // GPU launch operations.  Nested launches are not allowed, so this does not
30 // walk the function recursively to avoid considering nested loops.
31 struct ForLoopMapper : public ConvertSimpleSCFToGPUBase<ForLoopMapper> {
32   ForLoopMapper() = default;
33   ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims) {
34     this->numBlockDims = numBlockDims;
35     this->numThreadDims = numThreadDims;
36   }
37 
38   void runOnFunction() override {
39     for (Operation &op : llvm::make_early_inc_range(getFunction().getOps())) {
40       if (auto forOp = dyn_cast<AffineForOp>(&op)) {
41         if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
42                                                     numThreadDims)))
43           signalPassFailure();
44       } else if (auto forOp = dyn_cast<ForOp>(&op)) {
45         if (failed(
46                 convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims)))
47           signalPassFailure();
48       }
49     }
50   }
51 };
52 
53 // A pass that traverses top-level loops in the function and convertes them to
54 // GPU launch operations. The top-level loops itself does not have to be
55 // perfectly nested. The only requirement is that there be as many perfectly
56 // nested loops as the size of `numWorkGroups`. Within these any loop nest has
57 // to be perfectly nested upto depth equal to size of `workGroupSize`.
58 struct ImperfectlyNestedForLoopMapper
59     : public ConvertSCFToGPUBase<ImperfectlyNestedForLoopMapper> {
60   ImperfectlyNestedForLoopMapper() = default;
61   ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
62                                  ArrayRef<int64_t> workGroupSize) {
63     this->numWorkGroups = numWorkGroups;
64     this->workGroupSize = workGroupSize;
65   }
66 
67   void runOnFunction() override {
68     // Insert the num work groups and workgroup sizes as constant values. This
69     // pass is only used for testing.
70     FuncOp funcOp = getFunction();
71     OpBuilder builder(funcOp.getOperation()->getRegion(0));
72     SmallVector<Value, 3> numWorkGroupsVal, workGroupSizeVal;
73     for (auto val : numWorkGroups) {
74       auto constOp = builder.create<ConstantOp>(
75           funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
76       numWorkGroupsVal.push_back(constOp);
77     }
78     for (auto val : workGroupSize) {
79       auto constOp = builder.create<ConstantOp>(
80           funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
81       workGroupSizeVal.push_back(constOp);
82     }
83     for (ForOp forOp : llvm::make_early_inc_range(funcOp.getOps<ForOp>())) {
84       if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal,
85                                         workGroupSizeVal))) {
86         return signalPassFailure();
87       }
88     }
89   }
90 };
91 
92 struct ParallelLoopToGpuPass
93     : public ConvertParallelLoopToGpuBase<ParallelLoopToGpuPass> {
94   void runOnOperation() override {
95     OwningRewritePatternList patterns;
96     populateParallelLoopToGPUPatterns(patterns, &getContext());
97     ConversionTarget target(getContext());
98     target.addLegalDialect<StandardOpsDialect>();
99     target.addLegalDialect<AffineDialect>();
100     target.addLegalDialect<gpu::GPUDialect>();
101     target.addLegalDialect<scf::SCFDialect>();
102     target.addIllegalOp<scf::ParallelOp>();
103     if (failed(applyPartialConversion(getOperation(), target, patterns)))
104       signalPassFailure();
105   }
106 };
107 
108 } // namespace
109 
/// Creates a ForLoopMapper pass that maps top-level loop nests to GPU
/// launches, using `numBlockDims` block dimensions and `numThreadDims`
/// thread dimensions.
std::unique_ptr<OperationPass<FuncOp>>
mlir::createSimpleSCFToGPUPass(unsigned numBlockDims, unsigned numThreadDims) {
  return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims);
}
/// Overload that leaves the block/thread dimension counts at the pass's
/// default option values.
std::unique_ptr<OperationPass<FuncOp>> mlir::createSimpleSCFToGPUPass() {
  return std::make_unique<ForLoopMapper>();
}
117 
/// Creates an ImperfectlyNestedForLoopMapper pass with the given workgroup
/// counts and sizes; see that pass for the loop-nesting requirements.
std::unique_ptr<OperationPass<FuncOp>>
mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
                          ArrayRef<int64_t> workGroupSize) {
  return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                          workGroupSize);
}
/// Overload that leaves the workgroup counts and sizes at the pass's default
/// option values.
std::unique_ptr<OperationPass<FuncOp>> mlir::createLoopToGPUPass() {
  return std::make_unique<ImperfectlyNestedForLoopMapper>();
}
127 
/// Creates a pass that converts scf.parallel operations into gpu.launch
/// operations via dialect conversion.
std::unique_ptr<Pass> mlir::createParallelLoopToGpuPass() {
  return std::make_unique<ParallelLoopToGpuPass>();
}
131