1 //===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===// 2 // 3 // Copyright 2019 The MLIR Authors. 4 // 5 // Licensed under the Apache License, Version 2.0 (the "License"); 6 // you may not use this file except in compliance with the License. 7 // You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // ============================================================================= 17 // 18 // This file implements the GPU dialect kernel outlining pass. 19 // 20 //===----------------------------------------------------------------------===// 21 22 #include "mlir/Dialect/GPU/GPUDialect.h" 23 #include "mlir/Dialect/GPU/Passes.h" 24 #include "mlir/Dialect/StandardOps/Ops.h" 25 #include "mlir/IR/BlockAndValueMapping.h" 26 #include "mlir/IR/Builders.h" 27 #include "mlir/Pass/Pass.h" 28 29 using namespace mlir; 30 31 template <typename OpTy> 32 static void createForAllDimensions(OpBuilder &builder, Location loc, 33 SmallVectorImpl<Value *> &values) { 34 for (StringRef dim : {"x", "y", "z"}) { 35 Value *v = builder.create<OpTy>(loc, builder.getIndexType(), 36 builder.getStringAttr(dim)); 37 values.push_back(v); 38 } 39 } 40 41 // Add operations generating block/thread ids and gird/block dimensions at the 42 // beginning of `kernelFunc` and replace uses of the respective function args. 43 static void injectGpuIndexOperations(Location loc, FuncOp kernelFunc) { 44 OpBuilder OpBuilder(kernelFunc.getBody()); 45 SmallVector<Value *, 12> indexOps; 46 createForAllDimensions<gpu::BlockIdOp>(OpBuilder, loc, indexOps); 47 createForAllDimensions<gpu::ThreadIdOp>(OpBuilder, loc, indexOps); 48 createForAllDimensions<gpu::GridDimOp>(OpBuilder, loc, indexOps); 49 createForAllDimensions<gpu::BlockDimOp>(OpBuilder, loc, indexOps); 50 // Replace the leading 12 function args with the respective thread/block index 51 // operations. Iterate backwards since args are erased and indices change. 52 for (int i = 11; i >= 0; --i) { 53 auto &firstBlock = kernelFunc.front(); 54 firstBlock.getArgument(i)->replaceAllUsesWith(indexOps[i]); 55 firstBlock.eraseArgument(i); 56 } 57 } 58 59 // Move all constant arguments of the given kernel function into the function, 60 // thereby reducing the number of kernel arguments. 61 static gpu::LaunchFuncOp inlineConstants(FuncOp kernelFunc, 62 gpu::LaunchFuncOp launch) { 63 OpBuilder kernelBuilder(kernelFunc.getBody()); 64 auto &firstBlock = kernelFunc.getBody().front(); 65 llvm::SmallVector<Value *, 8> newLaunchArgs; 66 for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) { 67 auto operandOp = launch.getKernelOperand(i)->getDefiningOp(); 68 auto constant = dyn_cast_or_null<ConstantOp>(operandOp); 69 if (!constant) { 70 newLaunchArgs.push_back(launch.getKernelOperand(i)); 71 continue; 72 } 73 auto newConstant = kernelBuilder.clone(*operandOp); 74 firstBlock.getArgument(i)->replaceAllUsesWith(newConstant->getResult(0)); 75 firstBlock.eraseArgument(i); 76 } 77 if (newLaunchArgs.size() == launch.getNumKernelOperands()) 78 return launch; 79 80 std::reverse(newLaunchArgs.begin(), newLaunchArgs.end()); 81 OpBuilder LaunchBuilder(launch); 82 SmallVector<Type, 8> newArgumentTypes; 83 newArgumentTypes.reserve(firstBlock.getNumArguments()); 84 for (auto value : firstBlock.getArguments()) { 85 newArgumentTypes.push_back(value->getType()); 86 } 87 kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {})); 88 auto newLaunch = LaunchBuilder.create<gpu::LaunchFuncOp>( 89 launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(), 90 launch.getBlockSizeOperandValues(), newLaunchArgs); 91 launch.erase(); 92 return newLaunch; 93 } 94 95 // Outline the `gpu.launch` operation body into a kernel function. Replace 96 // `gpu.return` operations by `std.return` in the generated function. 97 static FuncOp outlineKernelFunc(gpu::LaunchOp launchOp) { 98 Location loc = launchOp.getLoc(); 99 SmallVector<Type, 4> kernelOperandTypes(launchOp.getKernelOperandTypes()); 100 FunctionType type = 101 FunctionType::get(kernelOperandTypes, {}, launchOp.getContext()); 102 std::string kernelFuncName = 103 Twine(launchOp.getParentOfType<FuncOp>().getName(), "_kernel").str(); 104 FuncOp outlinedFunc = FuncOp::create(loc, kernelFuncName, type); 105 outlinedFunc.getBody().takeBody(launchOp.getBody()); 106 Builder builder(launchOp.getContext()); 107 outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(), 108 builder.getUnitAttr()); 109 injectGpuIndexOperations(loc, outlinedFunc); 110 outlinedFunc.walk([](gpu::ReturnOp op) { 111 OpBuilder replacer(op); 112 replacer.create<ReturnOp>(op.getLoc()); 113 op.erase(); 114 }); 115 return outlinedFunc; 116 } 117 118 // Replace `gpu.launch` operations with an `gpu.launch_func` operation launching 119 // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with 120 // constant region arguments inlined. 121 static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp, FuncOp kernelFunc) { 122 OpBuilder builder(launchOp); 123 SmallVector<Value *, 4> kernelOperandValues( 124 launchOp.getKernelOperandValues()); 125 auto launchFuncOp = builder.create<gpu::LaunchFuncOp>( 126 launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(), 127 launchOp.getBlockSizeOperandValues(), kernelOperandValues); 128 inlineConstants(kernelFunc, launchFuncOp); 129 launchOp.erase(); 130 } 131 132 namespace { 133 134 /// Pass that moves the kernel of each LaunchOp into its separate nested module. 135 /// 136 /// This pass moves the kernel code of each LaunchOp into a function created 137 /// inside a nested module. It also creates an external function of the same 138 /// name in the parent module. 139 /// 140 /// The kernel modules are intended to be compiled to a cubin blob independently 141 /// in a separate pass. The external functions can then be annotated with the 142 /// symbol of the cubin accessor function. 143 class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> { 144 public: 145 void runOnModule() override { 146 ModuleManager moduleManager(getModule()); 147 bool modified = false; 148 for (auto func : getModule().getOps<FuncOp>()) { 149 // Insert just after the function. 150 Block::iterator insertPt(func.getOperation()->getNextNode()); 151 func.walk([&](gpu::LaunchOp op) { 152 FuncOp outlinedFunc = outlineKernelFunc(op); 153 154 // Create nested module and insert outlinedFunc. The module will 155 // originally get the same name as the function, but may be renamed on 156 // insertion into the parent module. 157 auto kernelModule = createKernelModule(outlinedFunc, moduleManager); 158 moduleManager.insert(insertPt, kernelModule); 159 160 // Potentially changes signature, pulling in constants. 161 convertToLaunchFuncOp(op, outlinedFunc); 162 modified = true; 163 }); 164 } 165 166 // If any new module was inserted in this module, annotate this module as 167 // a container module. 168 if (modified) 169 getModule().setAttr(gpu::GPUDialect::getContainerModuleAttrName(), 170 UnitAttr::get(&getContext())); 171 } 172 173 private: 174 // Returns a module containing kernelFunc and all callees (recursive). 175 ModuleOp createKernelModule(FuncOp kernelFunc, 176 const ModuleManager &parentModuleManager) { 177 auto context = getModule().getContext(); 178 Builder builder(context); 179 auto kernelModule = 180 ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName()); 181 kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(), 182 builder.getUnitAttr()); 183 ModuleManager moduleManager(kernelModule); 184 185 llvm::SmallVector<FuncOp, 8> funcsToInsert = {kernelFunc}; 186 while (!funcsToInsert.empty()) { 187 FuncOp func = funcsToInsert.pop_back_val(); 188 moduleManager.insert(func); 189 190 // TODO(b/141098412): Support any op with a callable interface. 191 func.walk([&](CallOp call) { 192 auto callee = call.callee(); 193 if (moduleManager.lookupSymbol<FuncOp>(callee)) 194 return; 195 196 auto calleeFromParent = 197 parentModuleManager.lookupSymbol<FuncOp>(callee); 198 funcsToInsert.push_back(calleeFromParent.clone()); 199 }); 200 } 201 202 return kernelModule; 203 } 204 }; 205 206 } // namespace 207 208 std::unique_ptr<OpPassBase<ModuleOp>> mlir::createGpuKernelOutliningPass() { 209 return std::make_unique<GpuKernelOutliningPass>(); 210 } 211 212 static PassRegistration<GpuKernelOutliningPass> 213 pass("gpu-kernel-outlining", 214 "Outline gpu.launch bodies to kernel functions."); 215