//===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect kernel outlining pass.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/Passes.h"

#include "mlir/AsmParser/AsmParser.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"

namespace mlir {
#define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONS
#define GEN_PASS_DEF_GPUKERNELOUTLINING
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;

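/// Creates one operation of type `OpTy` (e.g. gpu::BlockIdOp) for each of the
/// x, y and z dimensions and appends the results to `values`.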
template <typename OpTy>
static void createForAllDimensions(OpBuilder &builder, Location loc,
                                   SmallVectorImpl<Value> &values) {
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
    values.push_back(builder.create<OpTy>(loc, builder.getIndexType(), dim));
}

/// Adds operations generating block/thread ids and grid/block dimensions at
/// the beginning of the `launchFuncOpBody` region. Adds a mapping from each
/// argument of the entry block of `launchOpBody` to the corresponding result
/// value of the added operations.
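/// The leading twelve entry-block arguments of `launchOpBody` are mapped, in
/// order, to the results of gpu.block_id {x,y,z}, gpu.thread_id {x,y,z},
/// gpu.grid_dim {x,y,z} and gpu.block_dim {x,y,z}, matching the order in
/// which the index operations are created below.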
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                     Region &launchOpBody,
                                     BlockAndValueMapping &map) {
  OpBuilder builder(loc->getContext());
  Block &firstBlock = launchOpBody.front();
  builder.setInsertionPointToStart(&launchFuncOpBody.front());
  SmallVector<Value, 12> indexOps;
  createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
  // Map the leading 12 block arguments (block/thread ids followed by
  // grid/block dimensions) to the results of the index operations created
  // above.
  for (const auto &indexOp : enumerate(indexOps))
    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}

/// Identifies operations that are beneficial to sink into kernels. These
/// operations must not have side effects, as otherwise sinking (and hence
/// duplicating) them is not legal.
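/// For example (illustrative IR; the operands are assumed to be defined
/// elsewhere):
///   %c0 = arith.constant 0 : index
///   %dim = memref.dim %mem, %c0 : memref<?xf32>
///   %cmp = arith.cmpi slt, %dim, %c42 : index
/// are all likely index computations, whereas loads, stores and other
/// side-effecting operations never match.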
static bool isLikelyAnIndexComputation(Operation *op) {
  return matchPattern(op, m_Constant()) ||
         isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
}

/// For a given operation `op`, computes whether it is beneficial to sink the
/// operation into the kernel. An operation can be sunk if doing so does not
/// introduce new kernel arguments. Whether a value is already available in the
/// kernel (and hence does not introduce new arguments) is checked by
/// querying `existingDependencies` and `availableValues`.
/// If an operand is not yet available, we recursively check whether it can be
/// made available by sinking its defining op.
/// Operations that are identified for sinking are added to `beneficiaryOps` in
/// the order they should appear in the kernel. Furthermore, `availableValues`
/// is updated with results that will be available after sinking the identified
/// ops.
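/// For example (an illustrative sketch, assuming %mem is also used inside the
/// launch body and hence is an existing dependency):
///   %c0 = arith.constant 0 : index
///   %sz = memref.dim %mem, %c0 : memref<?xf32>
/// Sinking the memref.dim first recurses into %c0, which is a constant and
/// can itself be sunk, and then accepts %mem as an existing dependency; both
/// ops end up in `beneficiaryOps` in def-before-use order.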
static bool extractBeneficiaryOps(
    Operation *op, const SetVector<Value> &existingDependencies,
    SetVector<Operation *> &beneficiaryOps,
    llvm::SmallPtrSetImpl<Value> &availableValues,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  if (beneficiaryOps.count(op))
    return true;

  if (!isSinkingBeneficiary(op))
    return false;

  for (Value operand : op->getOperands()) {
    // It is already visible in the kernel, keep going.
    if (availableValues.count(operand))
      continue;
    // Else check whether it can be made available via sinking or already is a
    // dependency.
    Operation *definingOp = operand.getDefiningOp();
    if ((!definingOp || !extractBeneficiaryOps(definingOp, existingDependencies,
                                               beneficiaryOps, availableValues,
                                               isSinkingBeneficiary)) &&
        !existingDependencies.count(operand))
      return false;
  }
  // We will sink the operation, mark its results as now available.
  beneficiaryOps.insert(op);
  for (Value result : op->getResults())
    availableValues.insert(result);
  return true;
}

LogicalResult mlir::sinkOperationsIntoLaunchOp(
    gpu::LaunchOp launchOp,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  assert(isSinkingBeneficiary);
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  SetVector<Value> sinkCandidates;
  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);

  SetVector<Operation *> toBeSunk;
  llvm::SmallPtrSet<Value, 4> availableValues;
  for (Value operand : sinkCandidates) {
    Operation *operandOp = operand.getDefiningOp();
    if (!operandOp)
      continue;
    extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues,
                          isSinkingBeneficiary);
  }

  // Insert operations so that the defs get cloned before uses.
  BlockAndValueMapping map;
  OpBuilder builder(launchOpBody);
  for (Operation *op : toBeSunk) {
    Operation *clonedOp = builder.clone(*op, map);
    // Only replace uses within the launch op.
    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
                                 launchOp.getBody());
  }
  return success();
}
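
// Example use (a sketch): a pass can combine this entry point with a custom
// predicate to control exactly which operations are sunk, e.g.:
//
//   getOperation()->walk([](gpu::LaunchOp launch) {
//     (void)sinkOperationsIntoLaunchOp(
//         launch, [](Operation *op) { return matchPattern(op, m_Constant()); });
//   });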

/// Outlines the body of the `gpu.launch` operation into a kernel function,
/// replacing `gpu.terminator` operations with `gpu.return` in the generated
/// function.
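/// A schematic example of the rewrite (types, launch bounds and the other
/// index operations are elided; names are illustrative):
///
///   gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz) in (...) {
///     "use"(%tx, %outside_value) : (index, f32) -> ()
///     gpu.terminator
///   }
///
/// is outlined into
///
///   gpu.func @foo_kernel(%arg0: f32) kernel {
///     %0 = gpu.thread_id x
///     "use"(%0, %arg0) : (index, f32) -> ()
///     gpu.return
///   }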
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            SetVector<Value> &operands) {
  Location loc = launchOp.getLoc();
  // Create a builder with no insertion point; insertion will happen
  // separately due to symbol table manipulation.
  OpBuilder builder(launchOp.getContext());
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  getUsedValuesDefinedAbove(launchOpBody, operands);

  // Create the gpu.func operation.
  SmallVector<Type, 4> kernelOperandTypes;
  kernelOperandTypes.reserve(operands.size());
  for (Value operand : operands) {
    kernelOperandTypes.push_back(operand.getType());
  }
  FunctionType type =
      FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
  outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());
  BlockAndValueMapping map;

  // Map the arguments corresponding to the launch parameters like blockIdx,
  // threadIdx, etc.
  Region &outlinedFuncBody = outlinedFunc.getBody();
  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);

  // Map arguments from gpu.launch region to the arguments of the gpu.func
  // operation.
  Block &entryBlock = outlinedFuncBody.front();
  for (const auto &operand : enumerate(operands))
    map.map(operand.value(), entryBlock.getArgument(operand.index()));

  // Clone the region of the gpu.launch operation into the gpu.func operation.
  // TODO: If cloneInto can be modified such that if a mapping for
  // a block exists, that block will be used to clone operations into (at the
  // end of the block), instead of creating a new block, this would be much
  // cleaner.
  launchOpBody.cloneInto(&outlinedFuncBody, map);

  // Branch from entry of the gpu.func operation to the block that is cloned
  // from the entry block of the gpu.launch operation.
  Block &launchOpEntry = launchOpBody.front();
  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
  builder.setInsertionPointToEnd(&entryBlock);
  builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);

  outlinedFunc.walk([](gpu::TerminatorOp op) {
    OpBuilder replacer(op);
    replacer.create<gpu::ReturnOp>(op.getLoc());
    op.erase();
  });
  return outlinedFunc;
}

gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
                                       StringRef kernelFnName,
                                       llvm::SmallVectorImpl<Value> &operands) {
  DenseSet<Value> inputOperandSet;
  inputOperandSet.insert(operands.begin(), operands.end());
  SetVector<Value> operandSet(operands.begin(), operands.end());
  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
  for (auto operand : operandSet) {
    if (!inputOperandSet.count(operand))
      operands.push_back(operand);
  }
  return funcOp;
}

/// Replaces a `gpu.launch` operation with a `gpu.launch_func` operation
/// launching `kernelFunc`. The kernel func contains the body of the
/// `gpu.launch` with constant region arguments inlined.
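/// Schematically (operands, sizes and names are illustrative):
///
///   gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
///              threads(%tx, %ty, %tz) in (%sx = %c4, %sy = %c1, %sz = %c1) { ... }
///
/// becomes
///
///   gpu.launch_func @foo_kernel::@foo_kernel
///       blocks in (%c1, %c1, %c1) threads in (%c4, %c1, %c1)
///       args(%operand : f32)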
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
  OpBuilder builder(launchOp);
  // The launch op has an optional dynamic shared memory size. If it doesn't
  // exist, we use zero.
  Value asyncToken = launchOp.getAsyncToken();
  auto launchFunc = builder.create<gpu::LaunchFuncOp>(
      launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
      launchOp.getBlockSizeOperandValues(),
      launchOp.getDynamicSharedMemorySize(), operands,
      asyncToken ? asyncToken.getType() : nullptr,
      launchOp.getAsyncDependencies());
  launchOp.replaceAllUsesWith(launchFunc);
  launchOp.erase();
}

namespace {
/// Pass that moves ops that are likely index computations into the gpu.launch
/// body.
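/// For example (schematic), a constant defined above the launch:
///
///   %c0 = arith.constant 0 : index
///   gpu.launch ... {
///     "use"(%c0) : (index) -> ()
///     gpu.terminator
///   }
///
/// is cloned into the launch body, so that later kernel outlining does not
/// have to pass it in as a kernel argument:
///
///   gpu.launch ... {
///     %0 = arith.constant 0 : index
///     "use"(%0) : (index) -> ()
///     gpu.terminator
///   }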
class GpuLaunchSinkIndexComputationsPass
    : public impl::GpuLaunchSinkIndexComputationsBase<
          GpuLaunchSinkIndexComputationsPass> {
public:
  void runOnOperation() override {
    Operation *op = getOperation();
    if (op->walk([](gpu::LaunchOp launch) {
            // Pull in operations that can be sunk.
            if (failed(sinkOperationsIntoLaunchOp(launch,
                                                  isLikelyAnIndexComputation)))
              return WalkResult::interrupt();

            return WalkResult::advance();
          }).wasInterrupted())
      signalPassFailure();
  }
};

/// Pass that moves the kernel of each LaunchOp into its separate nested module.
///
/// This pass moves the kernel code of each LaunchOp into a function created
/// inside a nested module, and replaces the original LaunchOp with a
/// LaunchFuncOp that invokes the outlined kernel by symbol.
///
/// The gpu.modules are intended to be compiled to a binary blob (such as a
/// cubin) independently in a separate pass, which can then be loaded and
/// executed when the corresponding gpu.launch_func runs.
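/// The resulting IR is schematically (names are illustrative):
///
///   module attributes {gpu.container_module} {
///     func.func @foo(%arg0: f32) {
///       gpu.launch_func @foo_kernel::@foo_kernel
///           blocks in (...) threads in (...) args(%arg0 : f32)
///       return
///     }
///     gpu.module @foo_kernel {
///       gpu.func @foo_kernel(%arg0: f32) kernel { ... }
///     }
///   }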
class GpuKernelOutliningPass
    : public impl::GpuKernelOutliningBase<GpuKernelOutliningPass> {
public:
  GpuKernelOutliningPass(StringRef dlStr) {
    if (!dlStr.empty() && !dataLayoutStr.hasValue())
      dataLayoutStr = dlStr.str();
  }

  GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
      : GpuKernelOutliningBase(other), dataLayoutSpec(other.dataLayoutSpec) {
    dataLayoutStr = other.dataLayoutStr.getValue();
  }

  LogicalResult initialize(MLIRContext *context) override {
    // Initialize the data layout specification from the data layout string.
    if (!dataLayoutStr.empty()) {
      Attribute resultAttr = mlir::parseAttribute(dataLayoutStr, context);
      if (!resultAttr)
        return failure();

      dataLayoutSpec = resultAttr.dyn_cast<DataLayoutSpecInterface>();
      if (!dataLayoutSpec)
        return failure();
    }

    return success();
  }

  void runOnOperation() override {
    SymbolTable symbolTable(getOperation());
    bool modified = false;
    for (auto func : getOperation().getOps<func::FuncOp>()) {
      // Insert just after the function.
      Block::iterator insertPt(func->getNextNode());
      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        SetVector<Value> operands;
        std::string kernelFnName =
            Twine(op->getParentOfType<func::FuncOp>().getName(), "_kernel")
                .str();

        gpu::GPUFuncOp outlinedFunc =
            outlineKernelFuncImpl(op, kernelFnName, operands);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
        modified = true;
        return WalkResult::advance();
      });
      if (funcWalkResult.wasInterrupted())
        return signalPassFailure();
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                              UnitAttr::get(&getContext()));
  }

private:
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
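  /// For example (schematic): if the kernel body contains a call such as
  /// `func.call @helper(...)`, the definition of @helper is looked up in
  /// `parentSymbolTable`, cloned into the new gpu.module, and then scanned
  /// transitively for further symbol uses.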
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into
    // a SymbolTable by the caller. SymbolTable needs to be refactored to
    // prevent manual building of Ops with symbols in code using SymbolTables
    // and then this needs to use the OpBuilder.
    auto *context = getOperation().getContext();
    OpBuilder builder(context);
    auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                         kernelFunc.getName());

    // If a valid data layout spec was provided, attach it to the kernel module.
    // Otherwise, the default data layout will be used.
    if (dataLayoutSpec)
      kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);

    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

    SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (std::optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              symbolUse.getSymbolRef().cast<FlatSymbolRefAttr>().getValue();
          if (symbolTable.lookup(symbolName))
            continue;

          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }

  Option<std::string> dataLayoutStr{
      *this, "data-layout-str",
      llvm::cl::desc("String containing the data layout specification to be "
                     "attached to the GPU kernel module")};

  DataLayoutSpecInterface dataLayoutSpec;
};

} // namespace

std::unique_ptr<Pass> mlir::createGpuLauchSinkIndexComputationsPass() {
  return std::make_unique<GpuLaunchSinkIndexComputationsPass>();
}

std::unique_ptr<OperationPass<ModuleOp>>
mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) {
  return std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
}