//===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements the GPU dialect kernel outlining pass.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/StandardOps/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Pass/Pass.h"

using namespace mlir;

// Creates one `OpTy` index operation per GPU dimension ("x", "y", "z") at the
// current insertion point of `builder` and appends the three results, in that
// dimension order, to `values`.
template <typename OpTy>
static void createForAllDimensions(OpBuilder &builder, Location loc,
                                   SmallVectorImpl<Value *> &values) {
  for (StringRef dim : {"x", "y", "z"}) {
    Value *v = builder.create<OpTy>(loc, builder.getIndexType(),
                                    builder.getStringAttr(dim));
    values.push_back(v);
  }
}

// Add operations generating block/thread ids and grid/block dimensions at the
// beginning of `kernelFunc` and replace uses of the respective function args.
44 static void injectGpuIndexOperations(Location loc, FuncOp kernelFunc) { 45 OpBuilder OpBuilder(kernelFunc.getBody()); 46 SmallVector<Value *, 12> indexOps; 47 createForAllDimensions<gpu::BlockIdOp>(OpBuilder, loc, indexOps); 48 createForAllDimensions<gpu::ThreadIdOp>(OpBuilder, loc, indexOps); 49 createForAllDimensions<gpu::GridDimOp>(OpBuilder, loc, indexOps); 50 createForAllDimensions<gpu::BlockDimOp>(OpBuilder, loc, indexOps); 51 // Replace the leading 12 function args with the respective thread/block index 52 // operations. Iterate backwards since args are erased and indices change. 53 for (int i = 11; i >= 0; --i) { 54 auto &firstBlock = kernelFunc.front(); 55 firstBlock.getArgument(i)->replaceAllUsesWith(indexOps[i]); 56 firstBlock.eraseArgument(i); 57 } 58 } 59 60 static bool isInliningBeneficiary(Operation *op) { 61 return isa<ConstantOp>(op) || isa<DimOp>(op); 62 } 63 64 // Move arguments of the given kernel function into the function if this reduces 65 // the number of kernel arguments. 66 static gpu::LaunchFuncOp inlineBeneficiaryOps(FuncOp kernelFunc, 67 gpu::LaunchFuncOp launch) { 68 OpBuilder kernelBuilder(kernelFunc.getBody()); 69 auto &firstBlock = kernelFunc.getBody().front(); 70 llvm::SmallVector<Value *, 8> newLaunchArgs; 71 BlockAndValueMapping map; 72 for (int i = 0, e = launch.getNumKernelOperands(); i < e; ++i) { 73 map.map(launch.getKernelOperand(i), kernelFunc.getArgument(i)); 74 } 75 for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) { 76 auto operandOp = launch.getKernelOperand(i)->getDefiningOp(); 77 if (!operandOp || !isInliningBeneficiary(operandOp)) { 78 newLaunchArgs.push_back(launch.getKernelOperand(i)); 79 continue; 80 } 81 // Only inline operations that do not create new arguments. 
82 if (!llvm::all_of(operandOp->getOperands(), 83 [map](Value *value) { return map.contains(value); })) { 84 continue; 85 } 86 auto clone = kernelBuilder.clone(*operandOp, map); 87 firstBlock.getArgument(i)->replaceAllUsesWith(clone->getResult(0)); 88 firstBlock.eraseArgument(i); 89 } 90 if (newLaunchArgs.size() == launch.getNumKernelOperands()) 91 return launch; 92 93 std::reverse(newLaunchArgs.begin(), newLaunchArgs.end()); 94 OpBuilder LaunchBuilder(launch); 95 SmallVector<Type, 8> newArgumentTypes; 96 newArgumentTypes.reserve(firstBlock.getNumArguments()); 97 for (auto value : firstBlock.getArguments()) { 98 newArgumentTypes.push_back(value->getType()); 99 } 100 kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {})); 101 auto newLaunch = LaunchBuilder.create<gpu::LaunchFuncOp>( 102 launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(), 103 launch.getBlockSizeOperandValues(), newLaunchArgs); 104 launch.erase(); 105 return newLaunch; 106 } 107 108 // Outline the `gpu.launch` operation body into a kernel function. Replace 109 // `gpu.return` operations by `std.return` in the generated function. 
// Creates a new function containing the body of `launchOp`, taking the
// launch's kernel operands as arguments (returns nothing), and rewrites
// `gpu.return` terminators to `std.return`. The launch op itself is left in
// place (its body region is moved out, not copied).
static FuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
  Location loc = launchOp.getLoc();
  SmallVector<Type, 4> kernelOperandTypes(launchOp.getKernelOperandTypes());
  FunctionType type =
      FunctionType::get(kernelOperandTypes, {}, launchOp.getContext());
  // Name the kernel after the enclosing function with a "_kernel" suffix.
  std::string kernelFuncName =
      Twine(launchOp.getParentOfType<FuncOp>().getName(), "_kernel").str();
  FuncOp outlinedFunc = FuncOp::create(loc, kernelFuncName, type);
  // Move (not clone) the launch body into the new function.
  outlinedFunc.getBody().takeBody(launchOp.body());
  Builder builder(launchOp.getContext());
  // Tag the function as a GPU kernel entry point for downstream passes.
  outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                       builder.getUnitAttr());
  injectGpuIndexOperations(loc, outlinedFunc);
  // `gpu.return` is only valid inside a launch region; now that the body lives
  // in a regular function, replace each one with `std.return`.
  outlinedFunc.walk([](gpu::ReturnOp op) {
    OpBuilder replacer(op);
    replacer.create<ReturnOp>(op.getLoc());
    op.erase();
  });
  return outlinedFunc;
}

// Replace `gpu.launch` operations with a `gpu.launch_func` operation launching
// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
// constant region arguments inlined.
static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp, FuncOp kernelFunc) {
  OpBuilder builder(launchOp);
  auto launchFuncOp = builder.create<gpu::LaunchFuncOp>(
      launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
      launchOp.getBlockSizeOperandValues(), launchOp.getKernelOperandValues());
  // May erase `launchFuncOp` and create a replacement with fewer operands if
  // constants/dims were pulled into the kernel body.
  inlineBeneficiaryOps(kernelFunc, launchFuncOp);
  launchOp.erase();
}

namespace {

/// Pass that moves the kernel of each LaunchOp into its separate nested module.
///
/// This pass moves the kernel code of each LaunchOp into a function created
/// inside a nested module. It also creates an external function of the same
/// name in the parent module.
///
/// The kernel modules are intended to be compiled to a cubin blob independently
/// in a separate pass. The external functions can then be annotated with the
/// symbol of the cubin accessor function.
class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
public:
  void runOnModule() override {
    SymbolTable symbolTable(getModule());
    bool modified = false;
    for (auto func : getModule().getOps<FuncOp>()) {
      // Insert just after the function. The iterator is captured before the
      // walk so all kernel modules for this function land at the same spot.
      Block::iterator insertPt(func.getOperation()->getNextNode());
      func.walk([&](gpu::LaunchOp op) {
        FuncOp outlinedFunc = outlineKernelFunc(op);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc);
        modified = true;
      });
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getModule().setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                          UnitAttr::get(&getContext()));
  }

private:
  // Returns a module containing kernelFunc and all callees (recursive).
  ModuleOp createKernelModule(FuncOp kernelFunc,
                              const SymbolTable &parentSymbolTable) {
    auto context = getModule().getContext();
    Builder builder(context);
    auto kernelModule =
        ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName());
    kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(),
                         builder.getUnitAttr());
    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

    // Worklist of symbol definitions whose symbol uses still need scanning;
    // each referenced-but-missing symbol is cloned from the parent module
    // exactly once (the lookup() guard skips already-copied definitions).
    llvm::SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (Optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              symbolUse.getSymbolRef().cast<FlatSymbolRefAttr>().getValue();
          if (symbolTable.lookup(symbolName))
            continue;

          // Clone the definition and queue it so its own symbol uses are
          // processed transitively.
          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }
};

} // namespace

std::unique_ptr<OpPassBase<ModuleOp>> mlir::createGpuKernelOutliningPass() {
  return std::make_unique<GpuKernelOutliningPass>();
}

static PassRegistration<GpuKernelOutliningPass>
    pass("gpu-kernel-outlining",
         "Outline gpu.launch bodies to kernel functions.");