//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

// Name of internal attribute to mark visited operations during conversion.
//
// NOTE: The conversion originally used the following legality criterion:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the provided pattern might reject some cases based on more detailed
// analysis of the `mapping` attribute.
// To avoid a dialect conversion failure due to a non-converted illegal
// operation, we use this extra unit attribute as a marker that the operation
// was checked by the pattern and should be considered legal in the following
// legality checks. The `finalizeParallelLoopToGPUConversion` function cleans
// up this extra attribute and is supposed to be called after the dialect
// conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<arith::ConstantIndexOp>(forOp.getLoc(),
                                                forOp.getStep());
}

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.getRegion();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}
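
// For illustration (schematic, not tied to a specific MLIR version), a nest
// that is mappable with numBlockDims = 1 and numThreadDims = 1 has the shape
//
//   affine.for %i = %lb0 to %ub0 {
//     affine.for %j = %lb1 to %ub1 {
//       ... arbitrary body, may use %i and %j ...
//     }
//   }
//
// where all loop bounds are computable above the outermost loop.
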
namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter {
  Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Return true if the value is obviously a constant "one".
static bool isConstantOne(Value value) {
  if (auto def = value.getDefiningOp<arith::ConstantIndexOp>())
    return def.value() == 1;
  return false;
}

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, llvm::None on failure.
Optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
                                                upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<arith::DivSIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed to
// the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
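//
// Schematically (an illustration only; the exact gpu.launch IR syntax varies
// between MLIR versions), with one block and one thread dimension and the
// bounds collected above, this turns
//
//   affine.for %i = %lb0 to %ub0 step 4 {
//     affine.for %j = %lb1 to %ub1 {
//       use(%i, %j)
//     }
//   }
//
// into a gpu.launch whose grid size along x is (%ub0 - %lb0) / 4 and whose
// block size along x is %ub1 - %lb1, with the body rewritten to recompute the
// induction variables as
//
//   %i = %block_id_x * 4 + %lb0
//   %j = %thread_id_x + %lb1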
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
  launchOp.getBody().front().getOperations().splice(
      launchOp.getBody().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always
  // iterate from 0 to N with step 1. Therefore, loop induction variables are
  // replaced with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
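///
/// For example (illustrative), given
///   %ub = affine.min affine_map<(d0) -> (d0, 1024)>(%x)
/// the constant map result 1024 is materialized as the static bound; if no
/// constant can be derived, an empty Value is returned.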
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) {
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(),
                                                       constExpr.getValue());
      }
    }
  }

  if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
    for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
      if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
        return staticBound;
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by mixed signs, so abort in this case.
        if ((lhs.value() < 0) != (rhs.value() < 0))
          return {};

        return rewriter.create<arith::ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.value() * rhs.value());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations to
/// be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and affine map from the
///   mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
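///
/// As an illustration of the index computation (schematic IR): for a dimension
/// mapped to block_x with the identity `map`, lower bound %lb and step %s, the
/// loop index is recomputed inside the launch as
///   affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%block_x)[%s, %lb]
/// i.e. block_id.x * step + lower bound.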
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp,
    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation =
        mappingAttribute.dyn_cast<gpu::ParallelLoopDimMappingAttr>();
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop
      // iv.
      Value operand =
          launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.getMap().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.getBound()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower-bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper-bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
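        // For example (illustrative), with a tiled upper bound such as
        //   %ub = affine.min affine_map<(d0) -> (64, d0)>(%remaining)
        // the derived constant 64 becomes the launch dimension and the cloned
        // body is guarded by an scf.if comparing the computed index against
        // the real bound.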
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression ceilDiv (upperBound - lowerBound) step. We use
          // affine.apply here so that it composes nicely with the provided
          // map.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.getBound().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // TODO(herhut, ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (bounds.find(processor) != bounds.end()) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of the
          // if body again. We use the launchOp here, as that cannot be part of
          // the body's instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again. We use the launchOp here, as that cannot be part of
      // the body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via a `DictionaryAttribute` named
/// `mapping`, which has three entries:
///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///    thread dimensions and 6 is sequential.
///  - map : An affine map that is used to pre-process hardware ids before
///    substitution.
///  - bound : An affine map that is used to compute the bound of the hardware
///    id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
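///
/// As an illustration (the attribute syntax below is schematic and may differ
/// between MLIR versions), a one-dimensional loop mapped to blocks along x
/// could be annotated as
///
///   scf.parallel (%i) = (%lb) to (%ub) step (%s) {
///     ...
///   } {mapping = [#gpu.loop_dim_map<processor = block_x,
///                                    map = (d0) -> (d0),
///                                    bound = (d0) -> (d0)>]}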
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne =
      rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.getBody().front());

  BlockAndValueMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |=
          !isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}
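
// A typical driver (a sketch only; it assumes the standard dialect conversion
// flow and a hypothetical root `op` and MLIRContext `ctx`) wires the entry
// points below together roughly as follows:
//
//   RewritePatternSet patterns(ctx);
//   populateParallelLoopToGPUPatterns(patterns);
//   ConversionTarget target(*ctx);
//   configureParallelLoopToGPULegality(target);
//   // ... plus whatever additional legality rules the embedding pass needs.
//   if (failed(applyPartialConversion(op, target, std::move(patterns))))
//     return failure();
//   finalizeParallelLoopToGPUConversion(op);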
void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
           parallelOp->hasAttr(kVisitedAttrName);
  });
}

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}