//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect pattern rewriters that make GPU ops
// within a region execute asynchronously.
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/TypeSwitch.h"

using namespace mlir;
namespace {
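// Pass that rewrites the synchronous GPU ops inside a function into
// asynchronous ones, in three walks over the function body:
//  1. ThreadTokenCallback threads a !gpu.async.token through the GPU ops and
//     inserts gpu.wait ops where host synchronization is required.
//  2. DeferWaitCallback pushes trailing synchronous gpu.wait ops out of
//     async.execute regions to the users of the execute op's token.
//  3. SingleTokenUseCallback replicates the !gpu.async.token results of
//     async.execute ops so that each result has at most one use.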
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
  struct ThreadTokenCallback;
  struct DeferWaitCallback;
  struct SingleTokenUseCallback;
  void runOnFunction() override;
};
} // namespace

static bool isTerminator(Operation *op) {
  return op->mightHaveTrait<OpTrait::IsTerminator>();
}
static bool hasSideEffects(Operation *op) {
  return !MemoryEffectOpInterface::hasNoEffect(op);
}

// Region walk callback which makes GPU ops implementing the AsyncOpInterface
// execute asynchronously.
struct GpuAsyncRegionPass::ThreadTokenCallback {
  ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

  // If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
  // create a current token (unless it already exists), and 'thread' that token
  // through `op` so that it executes asynchronously.
  //
  // If `op` is a terminator or an op with side effects, insert a `gpu.wait` to
  // host-synchronize execution. A `!gpu.async.token` is therefore only ever
  // used inside its block, and GPU execution always synchronizes with the
  // host at block boundaries.
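  //
  // Rough sketch of the overall effect (the exact textual IR syntax may
  // differ between MLIR versions; names are illustrative only):
  //
  //   %0 = gpu.alloc() : memref<16xf32>
  //   gpu.launch_func @kernels::@fill ... args(%0 : memref<16xf32>)
  //   return
  //
  // becomes
  //
  //   %t0 = gpu.wait async
  //   %0, %t1 = gpu.alloc async [%t0] () : memref<16xf32>
  //   %t2 = gpu.launch_func async [%t1] @kernels::@fill ... args(%0 : ...)
  //   gpu.wait [%t2]  // host synchronization before the terminator
  //   return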
  WalkResult operator()(Operation *op) {
    if (isa<gpu::LaunchOp>(op))
      return op->emitOpError("replace with gpu.launch_func first");
    if (isa<gpu::WaitOp>(op))
      return op->emitOpError("unexpected pre-existing gpu.wait");
    builder.setInsertionPoint(op);
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
      return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
    if (!currentToken)
      return success();
    // Insert host synchronization before terminator or op with side effects.
    if (isTerminator(op) || hasSideEffects(op))
      currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
    return success();
  }

private:
  // Replaces asyncOp with a clone that returns a token.
  LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
    auto *op = asyncOp.getOperation();
    if (asyncOp.getAsyncToken())
      // TODO: Support ops that are already async.
      return op->emitOpError("is already async");

    auto tokenType = builder.getType<gpu::AsyncTokenType>();

    // If there is no current token, insert a `gpu.wait async` without
    // dependencies to create one.
    if (!currentToken)
      currentToken = createWaitOp(op->getLoc(), tokenType, {});
    asyncOp.addAsyncDependency(currentToken);

    // Clone the op to return a token in addition to the other results.
    SmallVector<Type, 1> resultTypes;
    resultTypes.reserve(1 + op->getNumResults());
    copy(op->getResultTypes(), std::back_inserter(resultTypes));
    resultTypes.push_back(tokenType);
    auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
                                    op->getOperands(), op->getAttrDictionary(),
                                    op->getSuccessors(), op->getNumRegions());

    // Clone regions into new op.
    BlockAndValueMapping mapping;
    for (auto pair : llvm::zip_first(op->getRegions(), newOp->getRegions()))
      std::get<0>(pair).cloneInto(&std::get<1>(pair), mapping);

    // Replace the op with the async clone.
    auto results = newOp->getResults();
    currentToken = results.back();
    builder.insert(newOp);
    op->replaceAllUsesWith(results.drop_back());
    op->erase();

    return success();
  }

  Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
    return builder.create<gpu::WaitOp>(loc, resultType, operands).asyncToken();
  }

  OpBuilder builder;

  // The token that represents the current asynchronous dependency. Its valid
  // range starts with a `gpu.wait async` op and ends with a `gpu.wait` op.
  // In between, each gpu::AsyncOpInterface op depends on the current token and
  // produces a new one.
  Value currentToken = {};
};

/// Erases `executeOp` and returns a clone with additional `results`.
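/// Note: `async.execute` implicitly produces a leading `!async.token` result
/// and wraps all other results in `!async.value<...>`; the ExecuteOp builder
/// only takes the unwrapped value types, which is why the token type is
/// dropped from `resultTypes` below.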
static async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp,
                                          ValueRange results) {
  // Add values to async.yield op.
  Operation *yieldOp = executeOp.getBody()->getTerminator();
  yieldOp->insertOperands(yieldOp->getNumOperands(), results);

  // Construct new result type list with additional types.
  SmallVector<Type, 2> resultTypes;
  resultTypes.reserve(executeOp.getNumResults() + results.size());
  transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
            [](Type type) {
              // Extract value type from !async.value.
              if (auto valueType = type.dyn_cast<async::ValueType>())
                return valueType.getValueType();
              assert(type.isa<async::TokenType>() && "expected token type");
              return type;
            });
  transform(results, std::back_inserter(resultTypes),
            [](Value value) { return value.getType(); });

  // Clone executeOp with the extra results.
  OpBuilder builder(executeOp);
  auto newOp = builder.create<async::ExecuteOp>(
      executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
      executeOp.dependencies(), executeOp.operands());
  BlockAndValueMapping mapper;
  newOp.getRegion().getBlocks().clear();
  executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);

  // Replace executeOp with cloned one.
  executeOp.getOperation()->replaceAllUsesWith(
      newOp.getResults().drop_back(results.size()));
  executeOp.erase();

  return newOp;
}

// Callback for `async.execute` ops which tries to push the contained
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
struct GpuAsyncRegionPass::DeferWaitCallback {
  // If the `executeOp`'s token is used only in `async.execute` or `async.await`
  // ops, add the region's last `gpu.wait` op to the worklist if it is
  // synchronous and is the last op with side effects.
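  //
  // Rough sketch of the deferred rewrite (exact textual IR syntax may differ):
  //
  //   %a = async.execute {
  //     ...
  //     gpu.wait [%t]  // synchronous wait at the end of the region
  //     async.yield
  //   }
  //   async.await %a : !async.token
  //
  // becomes
  //
  //   %a, %gpu = async.execute {
  //     ...
  //     async.yield %t : !gpu.async.token
  //   }
  //   async.await %a : !async.token
  //   %t2 = async.await %gpu : !async.value<!gpu.async.token>
  //   gpu.wait [%t2]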
  void operator()(async::ExecuteOp executeOp) {
    if (!areAllUsersExecuteOrAwait(executeOp.token()))
      return;
    // async.execute's region is currently restricted to one block.
    for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
      if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
        if (!waitOp.asyncToken())
          worklist.push_back(waitOp);
        return;
      }
      if (hasSideEffects(&op))
        return;
    }
  }

  // The destructor performs the actual rewrite work.
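  // Deferring the rewrite until after the walk over the function has finished
  // avoids erasing and recreating async.execute ops while they are being
  // walked, and it allows addAsyncDependencyAfter to append newly created
  // gpu.wait ops to the worklist.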
  ~DeferWaitCallback() {
    for (size_t i = 0; i < worklist.size(); ++i) {
      auto waitOp = worklist[i];
      auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();

      // Erase `gpu.wait` and return async dependencies from execute op instead.
      SmallVector<Value, 4> dependencies = waitOp.asyncDependencies();
      waitOp.erase();
      executeOp = addExecuteResults(executeOp, dependencies);

      // Add the async dependency to each user of the `async.execute` token.
      auto asyncTokens = executeOp.getResults().take_back(dependencies.size());
      for (Operation *user : executeOp.token().getUsers())
        addAsyncDependencyAfter(asyncTokens, user);
    }
  }

private:
  // Returns whether all token users are either 'async.execute' or 'async.await'
  // ops. This is used as a requirement for pushing 'gpu.wait' ops from an
  // 'async.execute' body to its users. Specifically, we do not allow
  // terminator users, because it could mean that the `async.execute` is inside
  // control flow code.
  static bool areAllUsersExecuteOrAwait(Value token) {
    return !token.use_empty() &&
           llvm::all_of(token.getUsers(), [](Operation *user) {
             return isa<async::ExecuteOp, async::AwaitOp>(user);
           });
  }

  // Add the `asyncTokens` as dependencies, as needed, after `op`.
  void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
    OpBuilder builder(op->getContext());
    auto loc = op->getLoc();

    Block::iterator it;
    SmallVector<Value, 1> tokens;
    tokens.reserve(asyncTokens.size());
    TypeSwitch<Operation *>(op)
        .Case<async::AwaitOp>([&](auto awaitOp) {
          // Add async.await ops to wait for the !gpu.async.tokens.
          builder.setInsertionPointAfter(op);
          for (auto asyncToken : asyncTokens)
            tokens.push_back(
                builder.create<async::AwaitOp>(loc, asyncToken).result());
          // Set `it` after the inserted async.await ops.
          it = builder.getInsertionPoint();
        })
        .Case<async::ExecuteOp>([&](auto executeOp) {
          // Set `it` to the beginning of the region and add asyncTokens to the
          // async.execute operands.
          it = executeOp.getBody()->begin();
          executeOp.operandsMutable().append(asyncTokens);
          SmallVector<Type, 1> tokenTypes(
              asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
          copy(executeOp.getBody()->addArguments(tokenTypes),
               std::back_inserter(tokens));
        });

    // Advance `it` to terminator or op with side-effects.
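    // (Every block visited here ends with a terminator, so the search below
    // always stops at that terminator at the latest and never walks past the
    // end of the block.)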
    it = std::find_if(it, Block::iterator(), [](Operation &op) {
      return isTerminator(&op) || hasSideEffects(&op);
    });

    // If the op at `it` implements the AsyncOpInterface, add the tokens to its
    // list of async dependencies.
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
      for (auto token : tokens)
        asyncOp.addAsyncDependency(token);
      return;
    }

    // Otherwise, insert a gpu.wait before 'it'.
    builder.setInsertionPoint(it->getBlock(), it);
    auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);

    // If the new waitOp is at the end of an async.execute region, add it to the
    // worklist. 'operator()(executeOp)' would do the same, but this is faster.
    auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
    if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
        !it->getNextNode())
      worklist.push_back(waitOp);
  }

  SmallVector<gpu::WaitOp, 8> worklist;
};

// Callback for `async.execute` ops which replicates !gpu.async.token results
// so that each of them is used only once.
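//
// Rough sketch (exact textual IR syntax may differ): if the token yielded by
//
//   %a, %gpu = async.execute ... {
//     ...
//     async.yield %t : !gpu.async.token
//   }
//
// has two uses, the async.yield is extended to `async.yield %t, %t`, the
// async.execute op gets an additional !async.value<!gpu.async.token> result,
// and the second use is rewired to that new result.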
struct GpuAsyncRegionPass::SingleTokenUseCallback {
  void operator()(async::ExecuteOp executeOp) {
    // Extract !gpu.async.token results which have multiple uses.
    auto multiUseResults =
        llvm::make_filter_range(executeOp.results(), [](OpResult result) {
          if (result.use_empty() || result.hasOneUse())
            return false;
          auto valueType = result.getType().dyn_cast<async::ValueType>();
          return valueType &&
                 valueType.getValueType().isa<gpu::AsyncTokenType>();
        });
    if (multiUseResults.empty())
      return;

    // Indices into the async.execute results (i.e. excluding the !async.token).
    SmallVector<int, 4> indices;
    transform(multiUseResults, std::back_inserter(indices),
              [](OpResult result) {
                return result.getResultNumber() - 1; // Index without token.
              });

    for (auto index : indices) {
      assert(!executeOp.results()[index].getUses().empty());
      // Repeat async.yield token result, one for each use after the first one.
      auto uses = llvm::drop_begin(executeOp.results()[index].getUses());
      auto count = std::distance(uses.begin(), uses.end());
      auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator());
      SmallVector<Value, 4> operands(count, yieldOp.getOperand(index));
      executeOp = addExecuteResults(executeOp, operands);
      // Update 'uses' to refer to the new executeOp.
      uses = llvm::drop_begin(executeOp.results()[index].getUses());
      auto results = executeOp.results().take_back(count);
      for (auto pair : llvm::zip(uses, results))
        std::get<0>(pair).set(std::get<1>(pair));
    }
  }
};

// Replaces synchronous GPU ops in the function's region with asynchronous ones
// and inserts the necessary synchronization (as gpu.wait ops). Assumes
// sequential execution semantics and that no GPU ops are asynchronous yet.
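//
// The pass is typically run as `mlir-opt --gpu-async-region`, after gpu.launch
// ops have been outlined into gpu.launch_func (e.g. with
// --gpu-kernel-outlining), since gpu.launch is rejected by ThreadTokenCallback.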
void GpuAsyncRegionPass::runOnFunction() {
  if (getFunction()
          .getRegion()
          .walk(ThreadTokenCallback(getContext()))
          .wasInterrupted())
    return signalPassFailure();

  // Collect gpu.wait ops that we can move out of async.execute regions.
  getFunction().getRegion().walk(DeferWaitCallback());
  // Make each !gpu.async.token returned from async.execute have a single use.
  getFunction().getRegion().walk(SingleTokenUseCallback());
}

std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
  return std::make_unique<GpuAsyncRegionPass>();
}