//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep());
}

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}
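
// Illustrative only (not part of the conversion logic): the affine path below
// expects a perfectly nested loop band whose bounds can be computed above the
// outermost loop. A nest that maps to one block and one thread dimension
// looks roughly like the following; the op names and bounds are made up for
// the example:
//
//   affine.for %i = 0 to %n {
//     affine.for %j = 0 to 128 {
//       "compute"(%i, %j) : (index, index) -> ()
//     }
//   }
//
// A nest where an inner bound depends on an outer induction variable (e.g.
// `affine.for %j = 0 to %i`) is rejected by the mappability checks below.
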
// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.region();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter {
  Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Return true if the value is obviously a constant "one".
static bool isConstantOne(Value value) {
  if (auto def = value.getDefiningOp<ConstantIndexOp>())
    return def.getValue() == 1;
  return false;
}

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, llvm::None on failure.
Optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

    Value range =
        builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}
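
// For illustration only: for a loop like `affine.for %i = %lb to %ub step 4`,
// collectBounds materializes, above the outermost loop, IR along the lines of
//
//   %lbv   = ...                     // lowered affine lower bound
//   %ubv   = ...                     // lowered affine upper bound
//   %diff  = subi %ubv, %lbv
//   %c4    = constant 4 : index
//   %range = divi_signed %diff, %c4
//
// and records %range, %lbv, %c4 and the induction variable for use in
// createLaunch. The exact op spellings are an assumption and depend on the
// MLIR revision; only the builder calls above are authoritative.
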
// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed
// to the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne = (numBlockDims < 3 || numThreadDims < 3)
                       ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
                       : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
  launchOp.body().front().getOperations().splice(
      launchOp.body().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always
  // iterate from 0 to N with step 1. Therefore, loop induction variables are
  // replaced with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.body().front());
  auto lbArgumentIt = lbs.begin();
  auto stepArgumentIt = steps.begin();
  for (auto en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}
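
// A minimal usage sketch (illustrative, not part of this file): the affine
// entry point above can be driven from a function pass. The pass name and the
// loop-collection strategy are assumptions for the example; only
// convertAffineLoopNestToGPULaunch is taken from the API above.
//
//   struct MapTopLevelLoopsToGpu
//       : public PassWrapper<MapTopLevelLoopsToGpu, FunctionPass> {
//     void runOnFunction() override {
//       // Collect top-level loops first; converting while walking would
//       // invalidate the traversal.
//       SmallVector<AffineForOp, 4> topLevelLoops;
//       for (Operation &op : getFunction().getBody().front())
//         if (auto forOp = dyn_cast<AffineForOp>(&op))
//           topLevelLoops.push_back(forOp);
//       // Map one loop to blocks and one to threads.
//       for (AffineForOp forOp : topLevelLoops)
//         if (failed(convertAffineLoopNestToGPULaunch(
//                 forOp, /*numBlockDims=*/1, /*numThreadDims=*/1)))
//           signalPassFailure();
//     }
//   };
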
namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.map().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) {
        return rewriter.create<ConstantIndexOp>(minOp.getLoc(),
                                                constExpr.getValue());
      }
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by a negative value, so abort in this case.
        if (lhs.getValue() < 0 || rhs.getValue() < 0)
          return {};

        return rewriter.create<ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.getValue() * rhs.getValue());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}
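
// For illustration only: the lowering below is driven by a per-dimension
// mapping attribute attached to `scf.parallel`. The exact textual syntax
// depends on the MLIR revision, but a two-dimensional loop mapped to block x
// and thread x looks roughly like:
//
//   scf.parallel (%i, %j) = (%c0, %c0) to (%n, %m) step (%c1, %c1) {
//     ...
//   } {mapping = [
//        {processor = 0, map = affine_map<(d0) -> (d0)>,
//         bound = affine_map<(d0) -> (d0)>},
//        {processor = 3, map = affine_map<(d0) -> (d0)>,
//         bound = affine_map<(d0) -> (d0)>}]}
//
// Processor ids 0-2 select the block dimensions, 3-5 the thread dimensions,
// and 6 requests a sequential loop; `map` post-processes the hardware id and
// `bound` computes the launch bound from the iteration count.
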
/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations
/// to be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and affine map from the
///   mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch operation` accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp,
    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (ConstantOp constOp = val.getDefiningOp<ConstantOp>())
      return rewriter.create<ConstantOp>(constOp.getLoc(), constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(),
                               parallelOp.lowerBound(), parallelOp.upperBound(),
                               parallelOp.step())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>();
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = gpu::getProcessor(annotation);

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop
      // iv.
      Value operand =
          launchOp.body().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.map().getValue().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.bound().getValue()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower-bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper-bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression ceilDiv (upperBound - lowerBound) step. We use
          // affine.apply here so that it composes nicely with the provided
          // map.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.bound().getValue().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (bounds.find(processor) != bounds.end()) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          CmpIOp pred = rewriter.create<CmpIOp>(
              loc, CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of
          // the if body again. We use the launchOp here, as that cannot be
          // part of the body's instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again. We use the launchOp here, as that cannot be part of
      // the body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp.getAttrs()) {
    if (namedAttr.first == gpu::getMappingAttrName() ||
        namedAttr.first == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.first, namedAttr.second);
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via an array attribute named `mapping`,
/// with one dictionary per mapped dimension, each having three entries:
/// - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///   thread dimensions and 6 is sequential.
/// - map : An affine map that is used to pre-process hardware ids before
///   substitution.
/// - bound : An affine map that is used to compute the bound of the hardware
///   id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated, as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne = rewriter.create<ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.body().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.body().front());

  BlockAndValueMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) ||
                         clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
                                             MLIRContext *ctx) {
  patterns.insert<ParallelToGpuLaunchLowering>(ctx);
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->getAttr(gpu::getMappingAttrName());
  });
}
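
// A minimal usage sketch (illustrative, not part of this file): a conversion
// pass can combine the two entry points above with the dialect-conversion
// driver. The pass name and base class are assumptions for the example; only
// populateParallelLoopToGPUPatterns, configureParallelLoopToGPULegality and
// applyPartialConversion are existing APIs.
//
//   struct ParallelLoopToGpuPass
//       : public PassWrapper<ParallelLoopToGpuPass, OperationPass<>> {
//     void runOnOperation() override {
//       OwningRewritePatternList patterns;
//       populateParallelLoopToGPUPatterns(patterns, &getContext());
//       ConversionTarget target(getContext());
//       target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
//       configureParallelLoopToGPULegality(target);
//       if (failed(applyPartialConversion(getOperation(), target,
//                                         std::move(patterns))))
//         signalPassFailure();
//     }
//   };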