1 //===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the GPU dialect pattern rewriters that make GPU op 10 // within a region execute asynchronously. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "PassDetail.h" 15 #include "mlir/Dialect/Async/IR/Async.h" 16 #include "mlir/Dialect/GPU/GPUDialect.h" 17 #include "mlir/Dialect/GPU/Passes.h" 18 #include "mlir/Dialect/GPU/Utils.h" 19 #include "mlir/Dialect/StandardOps/IR/Ops.h" 20 #include "mlir/IR/BlockAndValueMapping.h" 21 #include "mlir/IR/Builders.h" 22 #include "mlir/IR/PatternMatch.h" 23 #include "mlir/IR/SymbolTable.h" 24 #include "mlir/Support/LLVM.h" 25 #include "mlir/Transforms/RegionUtils.h" 26 #include "llvm/ADT/TypeSwitch.h" 27 28 using namespace mlir; 29 namespace { 30 class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> { 31 struct ThreadTokenCallback; 32 struct DeferWaitCallback; 33 struct SingleTokenUseCallback; 34 void runOnFunction() override; 35 }; 36 } // namespace 37 38 static bool isTerminator(Operation *op) { 39 return op->mightHaveTrait<OpTrait::IsTerminator>(); 40 } 41 static bool hasSideEffects(Operation *op) { 42 return !MemoryEffectOpInterface::hasNoEffect(op); 43 } 44 45 // Region walk callback which makes GPU ops implementing the AsyncOpInterface 46 // execute asynchronously. 47 struct GpuAsyncRegionPass::ThreadTokenCallback { 48 ThreadTokenCallback(MLIRContext &context) : builder(&context) {} 49 50 // If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to 51 // create a current token (unless it already exists), and 'thread' that token 52 // through the `op` so that it executes asynchronously. 53 // 54 // If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to 55 // host-synchronize execution. A `!gpu.async.token` will therefore only be 56 // used inside of its block and GPU execution will always synchronize with 57 // the host at block boundaries. 58 WalkResult operator()(Operation *op) { 59 if (isa<gpu::LaunchOp>(op)) 60 return op->emitOpError("replace with gpu.launch_func first"); 61 if (isa<gpu::WaitOp>(op)) 62 return op->emitOpError("unexpected pre-existing gpu.wait"); 63 builder.setInsertionPoint(op); 64 if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op)) 65 return rewriteAsyncOp(asyncOp); // Replace GPU op with async version. 66 if (!currentToken) 67 return success(); 68 // Insert host synchronization before terminator or op with side effects. 69 if (isTerminator(op) || hasSideEffects(op)) 70 currentToken = createWaitOp(op->getLoc(), Type(), {currentToken}); 71 return success(); 72 } 73 74 private: 75 // Replaces asyncOp with a clone that returns a token. 76 LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) { 77 auto *op = asyncOp.getOperation(); 78 if (asyncOp.getAsyncToken()) 79 // TODO: Support ops that are already async. 80 return op->emitOpError("is already async"); 81 82 auto tokenType = builder.getType<gpu::AsyncTokenType>(); 83 84 // If there is no current token, insert a `gpu.wait async` without 85 // dependencies to create one. 86 if (!currentToken) 87 currentToken = createWaitOp(op->getLoc(), tokenType, {}); 88 asyncOp.addAsyncDependency(currentToken); 89 90 // Clone the op to return a token in addition to the other results. 91 SmallVector<Type, 1> resultTypes; 92 resultTypes.reserve(1 + op->getNumResults()); 93 copy(op->getResultTypes(), std::back_inserter(resultTypes)); 94 resultTypes.push_back(tokenType); 95 auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes, 96 op->getOperands(), op->getAttrDictionary(), 97 op->getSuccessors(), op->getNumRegions()); 98 99 // Clone regions into new op. 100 BlockAndValueMapping mapping; 101 for (auto pair : llvm::zip_first(op->getRegions(), newOp->getRegions())) 102 std::get<0>(pair).cloneInto(&std::get<1>(pair), mapping); 103 104 // Replace the op with the async clone. 105 auto results = newOp->getResults(); 106 currentToken = results.back(); 107 builder.insert(newOp); 108 op->replaceAllUsesWith(results.drop_back()); 109 op->erase(); 110 111 return success(); 112 } 113 114 Value createWaitOp(Location loc, Type resultType, ValueRange operands) { 115 return builder.create<gpu::WaitOp>(loc, resultType, operands).asyncToken(); 116 } 117 118 OpBuilder builder; 119 120 // The token that represents the current asynchronous dependency. It's valid 121 // range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op. 122 // In between, each gpu::AsyncOpInterface depends on the current token and 123 // produces the new one. 124 Value currentToken = {}; 125 }; 126 127 /// Erases `executeOp` and returns a clone with additional `results`. 128 async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp, 129 ValueRange results) { 130 // Add values to async.yield op. 131 Operation *yieldOp = executeOp.getBody()->getTerminator(); 132 yieldOp->insertOperands(yieldOp->getNumOperands(), results); 133 134 // Construct new result type list with additional types. 135 SmallVector<Type, 2> resultTypes; 136 resultTypes.reserve(executeOp.getNumResults() + results.size()); 137 transform(executeOp.getResultTypes(), std::back_inserter(resultTypes), 138 [](Type type) { 139 // Extract value type from !async.value. 140 if (auto valueType = type.dyn_cast<async::ValueType>()) 141 return valueType.getValueType(); 142 assert(type.isa<async::TokenType>() && "expected token type"); 143 return type; 144 }); 145 transform(results, std::back_inserter(resultTypes), 146 [](Value value) { return value.getType(); }); 147 148 // Clone executeOp with the extra results. 149 OpBuilder builder(executeOp); 150 auto newOp = builder.create<async::ExecuteOp>( 151 executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/, 152 executeOp.dependencies(), executeOp.operands()); 153 BlockAndValueMapping mapper; 154 newOp.getRegion().getBlocks().clear(); 155 executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper); 156 157 // Replace executeOp with cloned one. 158 executeOp.getOperation()->replaceAllUsesWith( 159 newOp.getResults().drop_back(results.size())); 160 executeOp.erase(); 161 162 return newOp; 163 } 164 165 // Callback for `async.execute` ops which tries to push the contained 166 // synchronous `gpu.wait` op to the dependencies of the `async.execute`. 167 struct GpuAsyncRegionPass::DeferWaitCallback { 168 // If the `executeOp`s token is used only in `async.execute` or `async.await` 169 // ops, add the region's last `gpu.wait` op to the worklist if it is 170 // synchronous and is the last op with side effects. 171 void operator()(async::ExecuteOp executeOp) { 172 if (!areAllUsersExecuteOrAwait(executeOp.token())) 173 return; 174 // async.execute's region is currently restricted to one block. 175 for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) { 176 if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) { 177 if (!waitOp.asyncToken()) 178 worklist.push_back(waitOp); 179 return; 180 } 181 if (hasSideEffects(&op)) 182 return; 183 } 184 } 185 186 // The destructor performs the actual rewrite work. 187 ~DeferWaitCallback() { 188 for (size_t i = 0; i < worklist.size(); ++i) { 189 auto waitOp = worklist[i]; 190 auto executeOp = waitOp->getParentOfType<async::ExecuteOp>(); 191 192 // Erase `gpu.wait` and return async dependencies from execute op instead. 193 SmallVector<Value, 4> dependencies = waitOp.asyncDependencies(); 194 waitOp.erase(); 195 executeOp = addExecuteResults(executeOp, dependencies); 196 197 // Add the async dependency to each user of the `async.execute` token. 198 auto asyncTokens = executeOp.getResults().take_back(dependencies.size()); 199 for (Operation *user : executeOp.token().getUsers()) 200 addAsyncDependencyAfter(asyncTokens, user); 201 } 202 } 203 204 private: 205 // Returns whether all token users are either 'async.execute' or 'async.await' 206 // ops. This is used as a requirement for pushing 'gpu.wait' ops from a 207 // 'async.execute' body to it's users. Specifically, we do not allow 208 // terminator users, because it could mean that the `async.execute` is inside 209 // control flow code. 210 static bool areAllUsersExecuteOrAwait(Value token) { 211 return !token.use_empty() && 212 llvm::all_of(token.getUsers(), [](Operation *user) { 213 return isa<async::ExecuteOp, async::AwaitOp>(user); 214 }); 215 } 216 217 // Add the `asyncToken` as dependency as needed after `op`. 218 void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) { 219 OpBuilder builder(op->getContext()); 220 auto loc = op->getLoc(); 221 222 Block::iterator it; 223 SmallVector<Value, 1> tokens; 224 tokens.reserve(asyncTokens.size()); 225 TypeSwitch<Operation *>(op) 226 .Case<async::AwaitOp>([&](auto awaitOp) { 227 // Add async.await ops to wait for the !gpu.async.tokens. 228 builder.setInsertionPointAfter(op); 229 for (auto asyncToken : asyncTokens) 230 tokens.push_back( 231 builder.create<async::AwaitOp>(loc, asyncToken).result()); 232 // Set `it` after the inserted async.await ops. 233 it = builder.getInsertionPoint(); 234 }) 235 .Case<async::ExecuteOp>([&](auto executeOp) { 236 // Set `it` to the beginning of the region and add asyncTokens to the 237 // async.execute operands. 238 it = executeOp.getBody()->begin(); 239 executeOp.operandsMutable().append(asyncTokens); 240 SmallVector<Type, 1> tokenTypes( 241 asyncTokens.size(), builder.getType<gpu::AsyncTokenType>()); 242 copy(executeOp.getBody()->addArguments(tokenTypes), 243 std::back_inserter(tokens)); 244 }); 245 246 // Advance `it` to terminator or op with side-effects. 247 it = std::find_if(it, Block::iterator(), [](Operation &op) { 248 return isTerminator(&op) || hasSideEffects(&op); 249 }); 250 251 // If `op` implements the AsyncOpInterface, add `token` to the list of async 252 // dependencies. 253 if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) { 254 for (auto token : tokens) 255 asyncOp.addAsyncDependency(token); 256 return; 257 } 258 259 // Otherwise, insert a gpu.wait before 'it'. 260 builder.setInsertionPoint(it->getBlock(), it); 261 auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens); 262 263 // If the new waitOp is at the end of an async.execute region, add it to the 264 // worklist. 'operator()(executeOp)' would do the same, but this is faster. 265 auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp()); 266 if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) && 267 !it->getNextNode()) 268 worklist.push_back(waitOp); 269 } 270 271 SmallVector<gpu::WaitOp, 8> worklist; 272 }; 273 274 // Callback for `async.execute` ops which repeats !gpu.async.token results 275 // so that each of them is only used once. 276 struct GpuAsyncRegionPass::SingleTokenUseCallback { 277 void operator()(async::ExecuteOp executeOp) { 278 // Extract !gpu.async.token results which have multiple uses. 279 auto multiUseResults = 280 llvm::make_filter_range(executeOp.results(), [](OpResult result) { 281 if (result.use_empty() || result.hasOneUse()) 282 return false; 283 auto valueType = result.getType().dyn_cast<async::ValueType>(); 284 return valueType && 285 valueType.getValueType().isa<gpu::AsyncTokenType>(); 286 }); 287 if (multiUseResults.empty()) 288 return; 289 290 // Indices within !async.execute results (i.e. without the async.token). 291 SmallVector<int, 4> indices; 292 transform(multiUseResults, std::back_inserter(indices), 293 [](OpResult result) { 294 return result.getResultNumber() - 1; // Index without token. 295 }); 296 297 for (auto index : indices) { 298 assert(!executeOp.results()[index].getUses().empty()); 299 // Repeat async.yield token result, one for each use after the first one. 300 auto uses = llvm::drop_begin(executeOp.results()[index].getUses()); 301 auto count = std::distance(uses.begin(), uses.end()); 302 auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator()); 303 SmallVector<Value, 4> operands(count, yieldOp.getOperand(index)); 304 executeOp = addExecuteResults(executeOp, operands); 305 // Update 'uses' to refer to the new executeOp. 306 uses = llvm::drop_begin(executeOp.results()[index].getUses()); 307 auto results = executeOp.results().take_back(count); 308 for (auto pair : llvm::zip(uses, results)) 309 std::get<0>(pair).set(std::get<1>(pair)); 310 } 311 } 312 }; 313 314 // Replaces synchronous GPU ops in the op's region with asynchronous ones and 315 // inserts the necessary synchronization (as gpu.wait ops). Assumes sequential 316 // execution semantics and that no GPU ops are asynchronous yet. 317 void GpuAsyncRegionPass::runOnFunction() { 318 if (getFunction() 319 .getRegion() 320 .walk(ThreadTokenCallback(getContext())) 321 .wasInterrupted()) 322 return signalPassFailure(); 323 324 // Collect gpu.wait ops that we can move out of async.execute regions. 325 getFunction().getRegion().walk(DeferWaitCallback()); 326 // Makes each !gpu.async.token returned from async.execute op have single use. 327 getFunction().getRegion().walk(SingleTokenUseCallback()); 328 } 329 330 std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() { 331 return std::make_unique<GpuAsyncRegionPass>(); 332 } 333