//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel.  The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

using llvm::seq;

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}
static SmallVector<Value, 1> getLowerBoundOperands(ForOp forOp) {
  SmallVector<Value, 1> bounds(1, forOp.lowerBound());
  return bounds;
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}
static SmallVector<Value, 1> getUpperBoundOperands(ForOp forOp) {
  SmallVector<Value, 1> bounds(1, forOp.upperBound());
  return bounds;
}

// Get a Value that corresponds to the loop step.  If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep());
}
static Value getOrCreateStep(ForOp forOp, OpBuilder &) { return forOp.step(); }

// Get a Value for the loop lower bound.  If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}
static Value getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
  return forOp.lowerBound();
}

// Get a Value for the loop upper bound.  If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}
static Value getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
  return forOp.upperBound();
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
template <typename OpTy>
static LogicalResult checkLoopNestMappableImpl(OpTy forOp, unsigned numDims) {
  Region &limit = forOp.region();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

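    // A perfectly nested body contains exactly two operations: the nested
    // loop and the terminator.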
    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<OpTy>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

template <typename OpTy>
static LogicalResult checkLoopNestMappable(OpTy forOp, unsigned numBlockDims,
                                           unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

template <typename OpTy>
static LogicalResult checkLoopOpMappable(OpTy forOp, unsigned numBlockDims,
                                         unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  if (numBlockDims != numThreadDims) {
    // TODO(ravishankarm) : This can probably be relaxed by having a one-trip
    // loop for the missing dimension, but there is no reason to handle this
    // case for now.
    return forOp.emitError(
        "mismatch in block dimensions and thread dimensions");
  }

  // Check that the forOp contains perfectly nested loops for numBlockDims.
  if (failed(checkLoopNestMappableImpl(forOp, numBlockDims))) {
    return failure();
  }

  // Get to the innermost loop.
  for (auto i : seq<unsigned>(0, numBlockDims - 1)) {
    forOp = cast<OpTy>(&forOp.getBody()->front());
    (void)i;
  }

  // The forOp now points to the body of the innermost loop mapped to blocks.
  for (Operation &op : *forOp.getBody()) {
    // If the operation is a loop, check that it is mappable to workItems.
    if (auto innerLoop = dyn_cast<OpTy>(&op)) {
      if (failed(checkLoopNestMappableImpl(innerLoop, numThreadDims))) {
        return failure();
      }
      continue;
    }
    // TODO(ravishankarm) : If it is not a loop op, it is assumed that the
    // statement is executed by all threads. It might be a collective operation
    // or some side-effect-free instruction. We have to decide on the
    // "allowable" statements and check for those here.
  }
  return success();
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct LoopToGpuConverter {
  template <typename OpTy>
  Optional<OpTy> collectBounds(OpTy forOp, unsigned numLoops);

  template <typename OpTy>
  void createLaunch(OpTy rootForOp, OpTy innermostForOp, unsigned numBlockDims,
                    unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Return true if the value is obviously a constant "one".
static bool isConstantOne(Value value) {
  if (auto def = value.getDefiningOp<ConstantIndexOp>())
    return def.getValue() == 1;
  return false;
}

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, llvm::None on failure.
template <typename OpTy>
Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
                                                 unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  OpTy currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

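    // The launch dimension along this loop is (upperBound - lowerBound),
    // divided by the step when the step is not the constant 1.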
    Value range =
        builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<OpTy>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

/// Given `nDims` perfectly nested loops rooted at `rootForOp`, convert them to
/// be partitioned across workgroups or workitems. The values of the
/// workgroup/workitem ids along each dimension are passed in with `ids`. The
/// number of workgroups/workitems along each dimension is passed in with
/// `nids`. The innermost loop is mapped to the x-dimension, the next innermost
/// to the y-dimension, and so on.
template <typename OpTy>
static OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value> ids,
                                 ArrayRef<Value> nids) {
  auto nDims = ids.size();
  assert(nDims == nids.size());
  for (auto dim : llvm::seq<unsigned>(0, nDims)) {
    // TODO(ravishankarm): Don't always need to generate a loop here. If nids >=
    // number of iterations of the original loop, this becomes an if
    // condition. Though that does rely on how the workgroup/workitem sizes are
    // specified to begin with.
    mapLoopToProcessorIds(rootForOp, ids[dim], nids[dim]);
    if (dim != nDims - 1) {
      rootForOp = cast<OpTy>(rootForOp.getBody()->front());
    }
  }
  return rootForOp;
}

/// Utility method to convert the gpu::KernelDim3 objects representing the id
/// of each workgroup/workitem and the number of workgroups/workitems along a
/// dimension of the launch into containers.
static void packIdAndNumId(gpu::KernelDim3 kernelIds,
                           gpu::KernelDim3 kernelNids, unsigned nDims,
                           SmallVectorImpl<Value> &ids,
                           SmallVectorImpl<Value> &nids) {
  assert(nDims <= 3 && "invalid number of launch dimensions");
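  // Note that the ids are listed in z, y, x order: taking the trailing nDims
  // entries below makes the x dimension correspond to the innermost mapped
  // loop (see createGPULaunchLoops).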
  std::array<Value, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
  std::array<Value, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
  ids.clear();
  ids.append(std::next(allIds.begin(), allIds.size() - nDims), allIds.end());
  nids.clear();
  nids.append(std::next(allNids.begin(), allNids.size() - nDims),
              allNids.end());
}

/// Generate the body of the launch operation.
template <typename OpTy>
static LogicalResult
createLaunchBody(OpBuilder &builder, OpTy rootForOp, gpu::LaunchOp launchOp,
                 unsigned numBlockDims, unsigned numThreadDims) {
  OpBuilder::InsertionGuard bodyInsertionGuard(builder);
  builder.setInsertionPointToEnd(&launchOp.body().front());
  auto terminatorOp = builder.create<gpu::TerminatorOp>(launchOp.getLoc());

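  // Move the loop nest being converted into the launch body, right before the
  // terminator.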
  rootForOp.getOperation()->moveBefore(terminatorOp);
  SmallVector<Value, 3> workgroupID, numWorkGroups;
  packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims,
                 workgroupID, numWorkGroups);

  // Partition the loop for mapping to workgroups.
  auto loopOp = createGPULaunchLoops(rootForOp, workgroupID, numWorkGroups);

  // Iterate over the body of the loopOp and collect the loops to partition
  // across workitems.
  SmallVector<OpTy, 1> threadRootForOps;
  for (Operation &op : *loopOp.getBody()) {
    if (auto threadRootForOp = dyn_cast<OpTy>(&op)) {
      threadRootForOps.push_back(threadRootForOp);
    }
  }

  SmallVector<Value, 3> workItemID, workGroupSize;
  packIdAndNumId(launchOp.getThreadIds(), launchOp.getBlockSize(),
                 numThreadDims, workItemID, workGroupSize);
  for (auto &loopOp : threadRootForOps) {
    builder.setInsertionPoint(loopOp);
    createGPULaunchLoops(loopOp, workItemID, workGroupSize);
  }
  return success();
}

// Convert the computation rooted at `rootForOp` into a GPU kernel with the
// given workgroup size and number of workgroups.
template <typename OpTy>
static LogicalResult createLaunchFromOp(OpTy rootForOp,
                                        ArrayRef<Value> numWorkGroups,
                                        ArrayRef<Value> workGroupSizes) {
  OpBuilder builder(rootForOp.getOperation());
  if (numWorkGroups.size() > 3) {
    return rootForOp.emitError("invalid ")
           << numWorkGroups.size() << "-D workgroup specification";
  }
  auto loc = rootForOp.getLoc();
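  // Extend the launch configuration to 3-D, using the constant 1 for any
  // dimension that was not specified.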
  Value one = builder.create<ConstantOp>(
      loc, builder.getIntegerAttr(builder.getIndexType(), 1));
  SmallVector<Value, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
  for (auto numWorkGroup : enumerate(numWorkGroups)) {
    numWorkGroups3D[numWorkGroup.index()] = numWorkGroup.value();
  }
  for (auto workGroupSize : enumerate(workGroupSizes)) {
    workGroupSize3D[workGroupSize.index()] = workGroupSize.value();
  }

  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), numWorkGroups3D[0], numWorkGroups3D[1],
      numWorkGroups3D[2], workGroupSize3D[0], workGroupSize3D[1],
      workGroupSize3D[2]);
  if (failed(createLaunchBody(builder, rootForOp, launchOp,
                              numWorkGroups.size(), workGroupSizes.size()))) {
    return failure();
  }

  return success();
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed to
// the kernel, and to have (numBlockDims + numThreadDims) perfectly nested loops
// between "rootForOp" and "innermostForOp".
// TODO(ravishankarm) : This method can be modified to use the
// createLaunchFromOp method, since that is a strict generalization of this
// method.
template <typename OpTy>
void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
                                      unsigned numBlockDims,
                                      unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation.  If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne = (numBlockDims < 3 || numThreadDims < 3)
                       ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
                       : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block.  Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
  launchOp.body().front().getOperations().splice(
      launchOp.body().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead.  Loops
  // may iterate from LB with step S whereas GPU thread/block ids always iterate
  // from 0 to N with step 1.  Therefore, loop induction variables are replaced
  // with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.body().front());
  auto lbArgumentIt = lbs.begin();
  auto stepArgumentIt = steps.begin();
  for (auto en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
template <typename OpTy>
static LogicalResult convertLoopNestToGPULaunch(OpTy forOp,
                                                unsigned numBlockDims,
                                                unsigned numThreadDims) {
  if (failed(checkLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  LoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

// Generic loop to GPU kernel conversion function when the loop is imperfectly
// nested. The workgroup size and number of workgroups are provided as inputs.
template <typename OpTy>
static LogicalResult convertLoopToGPULaunch(OpTy forOp,
                                            ArrayRef<Value> numWorkGroups,
                                            ArrayRef<Value> workGroupSize) {
  if (failed(checkLoopOpMappable(forOp, numWorkGroups.size(),
                                 workGroupSize.size()))) {
    return failure();
  }
  return createLaunchFromOp(forOp, numWorkGroups, workGroupSize);
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

LogicalResult mlir::convertLoopNestToGPULaunch(ForOp forOp,
                                               unsigned numBlockDims,
                                               unsigned numThreadDims) {
  return ::convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

LogicalResult mlir::convertLoopToGPULaunch(scf::ForOp forOp,
                                           ArrayRef<Value> numWorkGroups,
                                           ArrayRef<Value> workGroupSizes) {
  return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<ConstantIndexOp>()) {
    return op;
  }

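  // Any constant result of an affine.min is a valid static upper bound, since
  // the minimum is no larger than any of its results.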
  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.map().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) {
        return rewriter.create<ConstantIndexOp>(minOp.getLoc(),
                                                constExpr.getValue());
      }
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by a negative value, so abort in this case.
        if (lhs.getValue() < 0 || rhs.getValue() < 0)
          return {};

        return rewriter.create<ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.getValue() * rhs.getValue());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

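// Returns the index used for the hardware id of `processor`. The same index is
// used both to read the corresponding gpu.launch body argument (in
// processParallelLoop) and to set the matching grid/block size operand (in
// matchAndRewrite).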
static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the given
/// `scf.parallel` operation on index substitutions and the operations to be
/// inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and affine map from the
///   mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the operations from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope level up.
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp,
    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO(herhut): Verify that this is a valid GPU mapping.
  // Processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp.getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO(herhut): Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

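  // A value is launch independent if it is defined in the region that contains
  // the launch op, or in one of that region's ancestors, i.e., outside of the
  // launch body.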
  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp.getParentRegion());
  };

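  // Values that are not launch independent can still be used if they are
  // defined by a constant op: the constant is recreated at the current
  // insertion point.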
  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (ConstantOp constOp = val.getDefiningOp<ConstantOp>())
      return rewriter.create<ConstantOp>(constOp.getLoc(), constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(),
                               parallelOp.lowerBound(), parallelOp.upperBound(),
                               parallelOp.step())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>();
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = gpu::getProcessor(annotation);

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop iv.
      Value operand = launchOp.body().front().getArgument(
          getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.map().getValue().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
      // If there was also a bound, insert that, too.
      // TODO(herhut): Check that we do not assign bounds twice.
      if (annotation.bound().getValue()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower-bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper-bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return parallelOp.emitOpError()
                     << "cannot derive loop-invariant upper bound for number "
                        "of iterations";
            }
          }
          // Compute the number of iterations needed. We compute this as the
          // affine expression (upperBound - lowerBound) ceilDiv step. We use
          // affine.apply here so that it composes nicely with the provided map.
          AffineMap stepMap =
              AffineMap::get(0, 3,
                             ((rewriter.getAffineSymbolExpr(0) -
                               rewriter.getAffineSymbolExpr(1))
                                  .ceilDiv(rewriter.getAffineSymbolExpr(2))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.bound().getValue().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // TODO(herhut,ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (bounds.find(processor) != bounds.end()) {
            return parallelOp.emitOpError()
                   << "cannot redefine the bound for processor "
                   << static_cast<int64_t>(processor);
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          CmpIOp pred = rewriter.create<CmpIOp>(
              loc, CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of the
          // if body again. We use the launchOp here, as that cannot be part of
          // the body's operations.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the loop
      // body again. We use the launchOp here, as that cannot be part of the
      // body's operations.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }
  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via a `DictionaryAttribute` named
/// `mapping`, which has three entries:
///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///               thread dimensions and 6 is sequential.
///  - map : An affine map that is used to pre-process hardware ids before
///          substitution.
///  - bound : An affine map that is used to compute the bound of the hardware
///            id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated, as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute the
/// launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne = rewriter.create<ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.body().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.body().front());

  BlockAndValueMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) ||
                         clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
                                             MLIRContext *ctx) {
  patterns.insert<ParallelToGpuLaunchLowering>(ctx);
}