1 //===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the GPU dialect kernel outlining pass. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "mlir/Dialect/GPU/GPUDialect.h" 14 #include "mlir/Dialect/GPU/Passes.h" 15 #include "mlir/Dialect/StandardOps/Ops.h" 16 #include "mlir/IR/BlockAndValueMapping.h" 17 #include "mlir/IR/Builders.h" 18 #include "mlir/IR/SymbolTable.h" 19 #include "mlir/Pass/Pass.h" 20 21 using namespace mlir; 22 23 template <typename OpTy> 24 static void createForAllDimensions(OpBuilder &builder, Location loc, 25 SmallVectorImpl<Value> &values) { 26 for (StringRef dim : {"x", "y", "z"}) { 27 Value v = builder.create<OpTy>(loc, builder.getIndexType(), 28 builder.getStringAttr(dim)); 29 values.push_back(v); 30 } 31 } 32 33 // Add operations generating block/thread ids and grid/block dimensions at the 34 // beginning of the `body` region and replace uses of the respective function 35 // arguments. 36 static void injectGpuIndexOperations(Location loc, Region &body) { 37 OpBuilder builder(loc->getContext()); 38 Block &firstBlock = body.front(); 39 builder.setInsertionPointToStart(&firstBlock); 40 SmallVector<Value, 12> indexOps; 41 createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps); 42 createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps); 43 createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps); 44 createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps); 45 // Replace the leading 12 function args with the respective thread/block index 46 // operations. 
Iterate backwards since args are erased and indices change. 47 for (int i = 11; i >= 0; --i) { 48 firstBlock.getArgument(i).replaceAllUsesWith(indexOps[i]); 49 firstBlock.eraseArgument(i); 50 } 51 } 52 53 static bool isInliningBeneficiary(Operation *op) { 54 return isa<ConstantOp>(op) || isa<DimOp>(op); 55 } 56 57 // Move arguments of the given kernel function into the function if this reduces 58 // the number of kernel arguments. 59 static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc, 60 gpu::LaunchFuncOp launch) { 61 OpBuilder kernelBuilder(kernelFunc.getBody()); 62 auto &firstBlock = kernelFunc.getBody().front(); 63 SmallVector<Value, 8> newLaunchArgs; 64 BlockAndValueMapping map; 65 for (int i = 0, e = launch.getNumKernelOperands(); i < e; ++i) { 66 map.map(launch.getKernelOperand(i), kernelFunc.getArgument(i)); 67 } 68 for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) { 69 auto operandOp = launch.getKernelOperand(i).getDefiningOp(); 70 if (!operandOp || !isInliningBeneficiary(operandOp)) { 71 newLaunchArgs.push_back(launch.getKernelOperand(i)); 72 continue; 73 } 74 // Only inline operations that do not create new arguments. 
75 if (!llvm::all_of(operandOp->getOperands(), 76 [map](Value value) { return map.contains(value); })) { 77 continue; 78 } 79 auto clone = kernelBuilder.clone(*operandOp, map); 80 firstBlock.getArgument(i).replaceAllUsesWith(clone->getResult(0)); 81 firstBlock.eraseArgument(i); 82 } 83 if (newLaunchArgs.size() == launch.getNumKernelOperands()) 84 return launch; 85 86 std::reverse(newLaunchArgs.begin(), newLaunchArgs.end()); 87 OpBuilder LaunchBuilder(launch); 88 SmallVector<Type, 8> newArgumentTypes; 89 newArgumentTypes.reserve(firstBlock.getNumArguments()); 90 for (auto value : firstBlock.getArguments()) { 91 newArgumentTypes.push_back(value.getType()); 92 } 93 kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {})); 94 auto newLaunch = LaunchBuilder.create<gpu::LaunchFuncOp>( 95 launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(), 96 launch.getBlockSizeOperandValues(), newLaunchArgs); 97 launch.erase(); 98 return newLaunch; 99 } 100 101 // Outline the `gpu.launch` operation body into a kernel function. Replace 102 // `gpu.terminator` operations by `gpu.return` in the generated function. 103 static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) { 104 Location loc = launchOp.getLoc(); 105 // Create a builder with no insertion point, insertion will happen separately 106 // due to symbol table manipulation. 
107 OpBuilder builder(launchOp.getContext()); 108 109 SmallVector<Type, 4> kernelOperandTypes(launchOp.getKernelOperandTypes()); 110 FunctionType type = 111 FunctionType::get(kernelOperandTypes, {}, launchOp.getContext()); 112 std::string kernelFuncName = 113 Twine(launchOp.getParentOfType<FuncOp>().getName(), "_kernel").str(); 114 auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFuncName, type); 115 outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(), 116 builder.getUnitAttr()); 117 outlinedFunc.body().takeBody(launchOp.body()); 118 injectGpuIndexOperations(loc, outlinedFunc.body()); 119 outlinedFunc.walk([](gpu::TerminatorOp op) { 120 OpBuilder replacer(op); 121 replacer.create<gpu::ReturnOp>(op.getLoc()); 122 op.erase(); 123 }); 124 125 return outlinedFunc; 126 } 127 128 // Replace `gpu.launch` operations with an `gpu.launch_func` operation launching 129 // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with 130 // constant region arguments inlined. 131 static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp, 132 gpu::GPUFuncOp kernelFunc) { 133 OpBuilder builder(launchOp); 134 auto launchFuncOp = builder.create<gpu::LaunchFuncOp>( 135 launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(), 136 launchOp.getBlockSizeOperandValues(), launchOp.getKernelOperandValues()); 137 inlineBeneficiaryOps(kernelFunc, launchFuncOp); 138 launchOp.erase(); 139 } 140 141 namespace { 142 143 /// Pass that moves the kernel of each LaunchOp into its separate nested module. 144 /// 145 /// This pass moves the kernel code of each LaunchOp into a function created 146 /// inside a nested module. It also creates an external function of the same 147 /// name in the parent module. 148 /// 149 /// The gpu.modules are intended to be compiled to a cubin blob independently in 150 /// a separate pass. The external functions can then be annotated with the 151 /// symbol of the cubin accessor function. 
class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
public:
  void runOnModule() override {
    SymbolTable symbolTable(getModule());
    bool modified = false;
    for (auto func : getModule().getOps<FuncOp>()) {
      // Insert just after the function, so the nested kernel module ends up
      // next to the function it was outlined from.
      Block::iterator insertPt(func.getOperation()->getNextNode());
      func.walk([&](gpu::LaunchOp op) {
        gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc);
        modified = true;
      });
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getModule().setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                          UnitAttr::get(&getContext()));
  }

private:
  // Returns a gpu.module containing kernelFunc and all callees (recursive).
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into
    // a SymbolTable by the caller. SymbolTable needs to be refactored to
    // prevent manual building of Ops with symbols in code using SymbolTables
    // and then this needs to use the OpBuilder.
    auto context = getModule().getContext();
    Builder builder(context);
    OperationState state(kernelFunc.getLoc(),
                         gpu::GPUModuleOp::getOperationName());
    gpu::GPUModuleOp::build(&builder, state, kernelFunc.getName());
    // Manually create the module op (not via OpBuilder, see TODO above) so
    // the caller controls where it gets inserted.
    auto kernelModule = cast<gpu::GPUModuleOp>(Operation::create(state));
    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

    // Worklist of cloned symbol definitions whose own symbol uses still need
    // to be resolved; pulls in every transitively referenced symbol so the
    // nested module is self-contained.
    SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (Optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              symbolUse.getSymbolRef().cast<FlatSymbolRefAttr>().getValue();
          // Already cloned into the kernel module; nothing to do.
          if (symbolTable.lookup(symbolName))
            continue;

          // Clone the referenced symbol from the parent module and queue it so
          // its own symbol uses are processed as well.
          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }
};

} // namespace

std::unique_ptr<OpPassBase<ModuleOp>> mlir::createGpuKernelOutliningPass() {
  return std::make_unique<GpuKernelOutliningPass>();
}

static PassRegistration<GpuKernelOutliningPass>
    pass("gpu-kernel-outlining",
         "Outline gpu.launch bodies to kernel functions.");