//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel.  The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//
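
// As a rough illustration of the affine path (a hand-written sketch, not a
// verbatim test case; names are made up), mapping the outer loop of
//
//   affine.for %i = 0 to %n {
//     affine.for %j = 0 to 64 {
//       ... body ...
//     }
//   }
//
// to one block and one thread dimension yields a `gpu.launch` whose grid and
// block sizes come from the loop ranges and whose body recomputes %i and %j
// from the block and thread ids.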

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"
#include <optional>

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::affine;
using namespace mlir::scf;

// Name of the internal attribute used to mark operations visited during the
// conversion.
//
// NOTE: The conversion originally used the following legality criterion:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the pattern may reject some cases based on a more detailed analysis of
// the `mapping` attribute.
// To avoid dialect conversion failures due to non-converted illegal operations,
// we use this extra Unit attribute as a marker that the operation was checked
// by the pattern and should be considered legal in subsequent legality checks.
// The `finalizeParallelLoopToGPUConversion` function cleans up these extra
// attributes and is supposed to be called after the dialect conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step.  If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<arith::ConstantIndexOp>(forOp.getLoc(),
                                                forOp.getStep());
}

// Get a Value for the loop lower bound.  If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound.  If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
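//
// For example (a hand-written sketch, not a verbatim test case), with
// numDims = 2 the following nest is mappable because the inner loop is the
// only operation in the outer body (besides the implicit terminator) and both
// sets of bounds are defined above the outermost loop:
//
//   affine.for %i = 0 to 128 {
//     affine.for %j = 0 to 64 {
//       ... arbitrary body ...
//     }
//   }
//
// A nest whose inner bounds depend on %i would be rejected.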
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.getRegion();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter {
  std::optional<AffineForOp> collectBounds(AffineForOp forOp,
                                           unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, std::nullopt on failure.
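//
// Conceptually (a sketch of the recorded quantities, not additional IR), for
// each mapped loop this records
//   range = upperBound - lowerBound   (divided by the step when step != 1)
// together with the lower bound, step and induction variable, which
// createLaunch later uses to rebuild the original iteration values from
// hardware ids.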
std::optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return std::nullopt;
    }

    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
                                                upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      range = builder.create<arith::DivSIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed
// to the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation.  If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block.  Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, std::nullopt);
  launchOp.getBody().front().getOperations().splice(
      launchOp.getBody().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead.  Loops
  // may iterate from LB with step S whereas GPU thread/block ids always iterate
  // from 0 to N with step 1.  Therefore, loop induction variables are replaced
  // with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
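///
/// For example (a hand-written sketch, not a verbatim test case), for an
/// upper bound produced by tiling such as
///   %ub = affine.min affine_map<(d0)[s0] -> (32, s0 - d0)>(%i)[%n]
/// this returns an `arith.constant 32 : index`. The constant may
/// over-approximate the actual bound, which callers compensate for by
/// predicating the generated body (see processParallelLoop).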
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) {
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(),
                                                       constExpr.getValue());
      }
    }
  }

  if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
    for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
      if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
        return staticBound;
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by mixed signs, so abort in this case.
        if ((lhs.value() < 0) != (rhs.value() < 0))
          return {};

        return rewriter.create<arith::ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.value() * rhs.value());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the given
/// `scf.parallel` operation on index substitutions and the operations to be
/// inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and the affine map from
///   the mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
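///
/// For illustration (a hand-written sketch; the attribute syntax is
/// abbreviated and may not match the printer exactly), a dimension annotated
/// with
///   #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0),
///                     bound = (d0) -> (d0)>
/// gets its induction variable rewritten to `block_id_x * step + lowerBound`
/// and contributes `(upperBound - lowerBound) ceilDiv step` as the grid size
/// along x.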
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
    SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation =
        dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop iv.
      Value operand =
          launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.getMap().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.getBound()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower-bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper-bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as the
          // affine expression (upperBound - lowerBound) ceilDiv step. We use
          // affine.apply here so that it composes nicely with the provided map.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.getBound().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // TODO(herhut, ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (bounds.contains(processor)) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of the
          // if body again. We use the launchOp here, as that cannot be part of
          // the body's instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the loop
      // body again. We use the launchOp here, as that cannot be part of the
      // body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via a `DictionaryAttribute` named
/// `mapping`, which has three entries:
///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///               thread dimensions and 6 is sequential.
///  - map : An affine map that is used to pre-process hardware ids before
///          substitution.
///  - bound : An affine map that is used to compute the bound of the hardware
///            id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute the
/// launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
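///
/// Roughly (a hand-written sketch, not a verbatim test case; the mapping
/// attribute is abbreviated), an annotated loop such as
///
///   scf.parallel (%i) = (%lb) to (%ub) step (%s) {
///     ... body ...
///   } {mapping = [#gpu.loop_dim_map<processor = thread_x, ...>]}
///
/// becomes a `gpu.launch` whose blockSizeX is derived from the iteration
/// count, with the body cloned into the launch region and uses of %i replaced
/// by `thread_id_x * %s + %lb` (guarded by an `scf.if` when the derived bound
/// is imprecise).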
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne =
      rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.getBody().front());

  IRMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side-effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |=
          !isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side-effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
           parallelOp->hasAttr(kVisitedAttrName);
  });
}

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}
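
// A typical driver sequence (a sketch of how these entry points are usually
// wired into a pass; the surrounding pass boilerplate and the exact legality
// configuration are assumptions, not prescribed by this file):
//
//   RewritePatternSet patterns(op->getContext());
//   populateParallelLoopToGPUPatterns(patterns);
//   ConversionTarget target(*op->getContext());
//   target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
//   configureParallelLoopToGPULegality(target);
//   if (failed(applyPartialConversion(op, target, std::move(patterns))))
//     return signalPassFailure();
//   finalizeParallelLoopToGPUConversion(op);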