//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel.  The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step.  If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep());
}

// Get a Value for the loop lower bound.  If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound.  If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
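// As an illustrative sketch (a hand-written example, not taken from a test),
// a nest such as
//
//   affine.for %i = 0 to %n {
//     affine.for %j = 0 to 42 {
//       "use"(%i, %j) : (index, index) -> ()
//     }
//   }
//
// is mappable to two dimensions: the loops are perfectly nested and all
// bounds are computable above the outermost loop.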
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.region();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter {
  Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Return true if the value is obviously a constant "one".
static bool isConstantOne(Value value) {
  if (auto def = value.getDefiningOp<ConstantIndexOp>())
    return def.getValue() == 1;
  return false;
}

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, llvm::None on failure.
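// For illustration (a hand-written example, not tied to a particular test), a
// loop
//   affine.for %i = %lb to %ub step 4 { ... }
// contributes range (%ub - %lb) / 4, lower bound %lb and step 4; with %lb = 0
// and %ub = 40 the recorded range is 10.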
Optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

    Value range =
        builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed to
// the kernel, and to have (numBlockDims + numThreadDims) perfectly nested loops
// between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation.  If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne = (numBlockDims < 3 || numThreadDims < 3)
                       ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
                       : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block.  Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
  launchOp.body().front().getOperations().splice(
      launchOp.body().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead.  Loops
  // may iterate from LB with step S whereas GPU thread/block ids always iterate
  // from 0 to N with step 1.  Therefore, loop induction variables are replaced
  // with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.body().front());
  auto lbArgumentIt = lbs.begin();
  auto stepArgumentIt = steps.begin();
  for (auto en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
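/// For example (a hand-written sketch), given
///   %ub = affine.min affine_map<(d0) -> (d0, 64)>(%n)
/// this returns a constant 64, because one result of the min map is the
/// constant expression 64; the derived value is an upper bound, not the exact
/// trip count.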
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.map().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) {
        return rewriter.create<ConstantIndexOp>(minOp.getLoc(),
                                                constExpr.getValue());
      }
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<MulIOp>()) {
    // Recurse into both factors; guard against a failed derivation before
    // querying the defining op, since the recursion may return a null Value.
    Value lhsBound = deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter);
    Value rhsBound =
        lhsBound ? deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                 : Value();
    if (!lhsBound || !rhsBound)
      return {};
    if (auto lhs = dyn_cast_or_null<ConstantIndexOp>(lhsBound.getDefiningOp()))
      if (auto rhs =
              dyn_cast_or_null<ConstantIndexOp>(rhsBound.getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by a negative value, so abort in this case.
        if (lhs.getValue() < 0 || rhs.getValue() < 0)
          return {};

        return rewriter.create<ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.getValue() * rhs.getValue());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the given
/// `scf.parallel` operation on index substitutions and the operations to be
/// inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and affine map from the
///   mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the operations from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
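/// For reference, an annotated one-dimensional loop typically looks like the
/// following hand-written sketch (the attribute is normally produced by the
/// ParallelLoopMapper utilities):
///   scf.parallel (%i) = (%c0) to (%n) step (%c1) {
///     ...
///   } {mapping = [{processor = 0, map = affine_map<(d0) -> (d0)>,
///                  bound = affine_map<(d0) -> (d0)>}]}
/// Here processor 0 maps the dimension to the block id along x, with identity
/// `map` and `bound` maps.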
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp,
    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (ConstantOp constOp = val.getDefiningOp<ConstantOp>())
      return rewriter.create<ConstantOp>(constOp.getLoc(), constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(),
                               parallelOp.lowerBound(), parallelOp.upperBound(),
                               parallelOp.step())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>();
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = gpu::getProcessor(annotation);

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop iv.
      Value operand =
          launchOp.body().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
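      // For instance (hypothetical values), composing the identity annotation
      // map (d0) -> (d0) with lowerAndStep and applying the result to the
      // hardware id %id, step %step and lower bound %lb yields
      // %id * %step + %lb.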
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.map().getValue().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.bound().getValue()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower-bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper-bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression (upperBound - lowerBound) ceilDiv step. We use
          // affine.apply here so that it composes nicely with the provided map.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.bound().getValue().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // TODO(herhut, ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (bounds.find(processor) != bounds.end()) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          CmpIOp pred = rewriter.create<CmpIOp>(
              loc, CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of the
          // if body again. We use the launchOp here, as that cannot be part of
          // the body's operations.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the loop
      // body again. We use the launchOp here, as that cannot be part of the
      // body's operations.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp.getAttrs()) {
    if (namedAttr.first == gpu::getMappingAttrName() ||
        namedAttr.first == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.first, namedAttr.second);
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via an array attribute named `mapping`
/// that contains one dictionary per mapped loop dimension; each dictionary has
/// three entries:
///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///               thread dimensions and 6 is sequential.
///  - map : An affine map that is used to pre-process hardware ids before
///          substitution.
///  - bound : An affine map that is used to compute the bound of the hardware
///            id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute the
/// launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
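///
/// As a schematic example (hand-written and simplified; the actual output also
/// contains the materialized bound computations), an annotated loop such as
///
///   scf.parallel (%i) = (%c0) to (%n) step (%c1) {
///     "compute"(%i) : (index) -> ()
///     scf.yield
///   } {mapping = [{processor = 0, map = affine_map<(d0) -> (d0)>,
///                  bound = affine_map<(d0) -> (d0)>}]}
///
/// becomes, roughly,
///
///   gpu.launch blocks(%bx, %by, %bz) in (%gx = %n, %gy = %c1, %gz = %c1)
///              threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
///     %i = affine.apply
///            affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%bx)[%c1, %c0]
///     "compute"(%i) : (index) -> ()
///     gpu.terminator
///   }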
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne = rewriter.create<ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.body().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.body().front());

  BlockAndValueMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side-effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) ||
                         clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side-effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we have succeeded in creating the launch operation, also update
  // the bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
                                             MLIRContext *ctx) {
  patterns.insert<ParallelToGpuLaunchLowering>(ctx);
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->getAttr(gpu::getMappingAttrName());
  });
}
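
// A minimal usage sketch (hypothetical pass boilerplate; the exact set of
// additional legal dialects is an assumption, not part of this file):
//
//   OwningRewritePatternList patterns;
//   populateParallelLoopToGPUPatterns(patterns, &getContext());
//   ConversionTarget target(getContext());
//   target.addLegalDialect<gpu::GPUDialect>(); // plus any other legal dialects
//   configureParallelLoopToGPULegality(target);
//   if (failed(applyPartialConversion(getOperation(), target,
//                                     std::move(patterns))))
//     signalPassFailure();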