1 //===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===// 2 // 3 // Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the GPU dialect kernel outlining pass. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "mlir/Dialect/GPU/GPUDialect.h" 14 #include "mlir/Dialect/GPU/Passes.h" 15 #include "mlir/Dialect/StandardOps/Ops.h" 16 #include "mlir/IR/BlockAndValueMapping.h" 17 #include "mlir/IR/Builders.h" 18 #include "mlir/IR/SymbolTable.h" 19 #include "mlir/Pass/Pass.h" 20 21 using namespace mlir; 22 23 template <typename OpTy> 24 static void createForAllDimensions(OpBuilder &builder, Location loc, 25 SmallVectorImpl<Value> &values) { 26 for (StringRef dim : {"x", "y", "z"}) { 27 Value v = builder.create<OpTy>(loc, builder.getIndexType(), 28 builder.getStringAttr(dim)); 29 values.push_back(v); 30 } 31 } 32 33 // Add operations generating block/thread ids and grid/block dimensions at the 34 // beginning of the `body` region and replace uses of the respective function 35 // arguments. 36 static void injectGpuIndexOperations(Location loc, Region &body) { 37 OpBuilder builder(loc->getContext()); 38 Block &firstBlock = body.front(); 39 builder.setInsertionPointToStart(&firstBlock); 40 SmallVector<Value, 12> indexOps; 41 createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps); 42 createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps); 43 createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps); 44 createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps); 45 // Replace the leading 12 function args with the respective thread/block index 46 // operations. Iterate backwards since args are erased and indices change. 47 for (int i = 11; i >= 0; --i) { 48 firstBlock.getArgument(i)->replaceAllUsesWith(indexOps[i]); 49 firstBlock.eraseArgument(i); 50 } 51 } 52 53 static bool isInliningBeneficiary(Operation *op) { 54 return isa<ConstantOp>(op) || isa<DimOp>(op); 55 } 56 57 // Move arguments of the given kernel function into the function if this reduces 58 // the number of kernel arguments. 59 static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc, 60 gpu::LaunchFuncOp launch) { 61 OpBuilder kernelBuilder(kernelFunc.getBody()); 62 auto &firstBlock = kernelFunc.getBody().front(); 63 SmallVector<Value, 8> newLaunchArgs; 64 BlockAndValueMapping map; 65 for (int i = 0, e = launch.getNumKernelOperands(); i < e; ++i) { 66 map.map(launch.getKernelOperand(i), kernelFunc.getArgument(i)); 67 } 68 for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) { 69 auto operandOp = launch.getKernelOperand(i)->getDefiningOp(); 70 if (!operandOp || !isInliningBeneficiary(operandOp)) { 71 newLaunchArgs.push_back(launch.getKernelOperand(i)); 72 continue; 73 } 74 // Only inline operations that do not create new arguments. 75 if (!llvm::all_of(operandOp->getOperands(), 76 [map](Value value) { return map.contains(value); })) { 77 continue; 78 } 79 auto clone = kernelBuilder.clone(*operandOp, map); 80 firstBlock.getArgument(i)->replaceAllUsesWith(clone->getResult(0)); 81 firstBlock.eraseArgument(i); 82 } 83 if (newLaunchArgs.size() == launch.getNumKernelOperands()) 84 return launch; 85 86 std::reverse(newLaunchArgs.begin(), newLaunchArgs.end()); 87 OpBuilder LaunchBuilder(launch); 88 SmallVector<Type, 8> newArgumentTypes; 89 newArgumentTypes.reserve(firstBlock.getNumArguments()); 90 for (auto value : firstBlock.getArguments()) { 91 newArgumentTypes.push_back(value->getType()); 92 } 93 kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {})); 94 auto newLaunch = LaunchBuilder.create<gpu::LaunchFuncOp>( 95 launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(), 96 launch.getBlockSizeOperandValues(), newLaunchArgs); 97 launch.erase(); 98 return newLaunch; 99 } 100 101 // Outline the `gpu.launch` operation body into a kernel function. Replace 102 // `gpu.return` operations by `std.return` in the generated function. 103 static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) { 104 Location loc = launchOp.getLoc(); 105 // Create a builder with no insertion point, insertion will happen separately 106 // due to symbol table manipulation. 107 OpBuilder builder(launchOp.getContext()); 108 109 SmallVector<Type, 4> kernelOperandTypes(launchOp.getKernelOperandTypes()); 110 FunctionType type = 111 FunctionType::get(kernelOperandTypes, {}, launchOp.getContext()); 112 std::string kernelFuncName = 113 Twine(launchOp.getParentOfType<FuncOp>().getName(), "_kernel").str(); 114 auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFuncName, type); 115 outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(), 116 builder.getUnitAttr()); 117 outlinedFunc.body().takeBody(launchOp.body()); 118 injectGpuIndexOperations(loc, outlinedFunc.body()); 119 return outlinedFunc; 120 } 121 122 // Replace `gpu.launch` operations with an `gpu.launch_func` operation launching 123 // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with 124 // constant region arguments inlined. 125 static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp, 126 gpu::GPUFuncOp kernelFunc) { 127 OpBuilder builder(launchOp); 128 auto launchFuncOp = builder.create<gpu::LaunchFuncOp>( 129 launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(), 130 launchOp.getBlockSizeOperandValues(), launchOp.getKernelOperandValues()); 131 inlineBeneficiaryOps(kernelFunc, launchFuncOp); 132 launchOp.erase(); 133 } 134 135 namespace { 136 137 /// Pass that moves the kernel of each LaunchOp into its separate nested module. 138 /// 139 /// This pass moves the kernel code of each LaunchOp into a function created 140 /// inside a nested module. It also creates an external function of the same 141 /// name in the parent module. 142 /// 143 /// The kernel modules are intended to be compiled to a cubin blob independently 144 /// in a separate pass. The external functions can then be annotated with the 145 /// symbol of the cubin accessor function. 146 class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> { 147 public: 148 void runOnModule() override { 149 SymbolTable symbolTable(getModule()); 150 bool modified = false; 151 for (auto func : getModule().getOps<FuncOp>()) { 152 // Insert just after the function. 153 Block::iterator insertPt(func.getOperation()->getNextNode()); 154 func.walk([&](gpu::LaunchOp op) { 155 gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op); 156 157 // Create nested module and insert outlinedFunc. The module will 158 // originally get the same name as the function, but may be renamed on 159 // insertion into the parent module. 160 auto kernelModule = createKernelModule(outlinedFunc, symbolTable); 161 symbolTable.insert(kernelModule, insertPt); 162 163 // Potentially changes signature, pulling in constants. 164 convertToLaunchFuncOp(op, outlinedFunc); 165 modified = true; 166 }); 167 } 168 169 // If any new module was inserted in this module, annotate this module as 170 // a container module. 171 if (modified) 172 getModule().setAttr(gpu::GPUDialect::getContainerModuleAttrName(), 173 UnitAttr::get(&getContext())); 174 } 175 176 private: 177 // Returns a module containing kernelFunc and all callees (recursive). 178 ModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, 179 const SymbolTable &parentSymbolTable) { 180 auto context = getModule().getContext(); 181 Builder builder(context); 182 auto kernelModule = 183 ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName()); 184 kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(), 185 builder.getUnitAttr()); 186 SymbolTable symbolTable(kernelModule); 187 symbolTable.insert(kernelFunc); 188 189 SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc}; 190 while (!symbolDefWorklist.empty()) { 191 if (Optional<SymbolTable::UseRange> symbolUses = 192 SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) { 193 for (SymbolTable::SymbolUse symbolUse : *symbolUses) { 194 StringRef symbolName = 195 symbolUse.getSymbolRef().cast<FlatSymbolRefAttr>().getValue(); 196 if (symbolTable.lookup(symbolName)) 197 continue; 198 199 Operation *symbolDefClone = 200 parentSymbolTable.lookup(symbolName)->clone(); 201 symbolDefWorklist.push_back(symbolDefClone); 202 symbolTable.insert(symbolDefClone); 203 } 204 } 205 } 206 207 return kernelModule; 208 } 209 }; 210 211 } // namespace 212 213 std::unique_ptr<OpPassBase<ModuleOp>> mlir::createGpuKernelOutliningPass() { 214 return std::make_unique<GpuKernelOutliningPass>(); 215 } 216 217 static PassRegistration<GpuKernelOutliningPass> 218 pass("gpu-kernel-outlining", 219 "Outline gpu.launch bodies to kernel functions."); 220