//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

// Name of the internal attribute used to mark visited operations during the
// conversion.
//
// NOTE: The conversion originally used the following legality criterion:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the provided pattern might reject some cases based on a more detailed
// analysis of the `mapping` attribute. To avoid dialect conversion failure
// due to a non-converted illegal operation, we use this extra unit attribute
// as a marker that the operation was checked by the pattern and should be
// considered legal in the subsequent legality checks. The
// `finalizeParallelLoopToGPUConversion` function performs the cleanup of this
// extra attribute and is supposed to be called after the dialect conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<arith::ConstantIndexOp>(forOp.getLoc(),
                                                forOp.getStep());
}

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.getRegion();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter {
  Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Return true if the value is obviously a constant "one".
static bool isConstantOne(Value value) {
  if (auto def = value.getDefiningOp<arith::ConstantIndexOp>())
    return def.value() == 1;
  return false;
}

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, llvm::None on failure.
Optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
                                                upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<arith::DivSIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed to
// the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
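  // (Illustrative sketch with hypothetical SSA names: for a 1-D block mapping
  //  the result resembles
  //    gpu.launch blocks(%bx, ...) in (%gx = %range, ...) threads(...) {
  //      <former loop body, with the iv rewritten to %bx * %step + %lb>
  //      gpu.terminator
  //    }
  //  which is what the splicing and index remapping below produce.)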
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
  launchOp.getBody().front().getOperations().splice(
      launchOp.getBody().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always
  // iterate from 0 to N with step 1. Therefore, loop induction variables are
  // replaced with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
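///
/// As an illustrative example (hypothetical IR): for
///   %ub = affine.min affine_map<(d0) -> (d0, 32)>(%n)
/// the constant map result `32` is materialized as `arith.constant 32 : index`
/// and returned, whereas a bound with no constant component yields a null
/// Value.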
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) {
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(),
                                                       constExpr.getValue());
      }
    }
  }

  if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
    for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
      if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
        return staticBound;
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by mixed signs, so abort in this case.
        if ((lhs.value() < 0) != (rhs.value() < 0))
          return {};

        return rewriter.create<arith::ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.value() * rhs.value());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations
/// to be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and the affine map from
///   the mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
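///
/// A minimal sketch of the mapped case (attribute syntax and SSA names are
/// illustrative, not verbatim): given
///
///   scf.parallel (%i) = (%lb) to (%ub) step (%s) { ... use of %i ... }
///     {mapping = [#gpu.loop_dim_map<processor = block_x,
///                                   map = (d0) -> (d0),
///                                   bound = (d0) -> (d0)>]}
///
/// uses of %i inside the `gpu.launch` body are substituted with
///
///   affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
///     (%block_id_x)[%s, %lb]
///
/// and the launch's grid size along x is set to the `bound` map applied to
/// (%ub - %lb) ceilDiv %s.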
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp,
    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation =
        mappingAttribute.dyn_cast<gpu::ParallelLoopDimMappingAttr>();
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop
      // iv.
      Value operand =
          launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.getMap().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.getBound()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise, try to derive a
        // bound.
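        // (Worked example with illustrative numbers: for lb = 0, step = 4 and
        // a static upper bound of 37, the launch bound below evaluates to
        // ceilDiv(37 - 0, 4) = 10; if the 37 was only derived via
        // deriveStaticUpperBound, the surplus iterations are masked off by the
        // conditional inserted further down.)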
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression (upperBound - lowerBound) ceilDiv step. We use
          // affine.apply here so that it composes nicely with the provided
          // map.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.getBound().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (bounds.find(processor) != bounds.end()) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of
          // the if body again. We use the launchOp here, as that cannot be
          // part of the body's instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again. We use the launchOp here, as that cannot be part of
      // the body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via a `DictionaryAttribute` named
/// `mapping`, which has three entries:
///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///    thread dimensions and 6 is sequential.
///  - map : An affine map that is used to pre-process hardware ids before
///    substitution.
///  - bound : An affine map that is used to compute the bound of the hardware
///    id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
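  // (Sketch of the initially created op, with illustrative SSA names:
  //    gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
  //               threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1)
  //  the %c1 operands along mapped dimensions are overwritten at the end of
  //  this function once the launch bounds have been computed.)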
  Location loc = parallelOp.getLoc();
  Value constantOne =
      rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.getBody().front());

  BlockAndValueMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no side
      // effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) ||
                         clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
           parallelOp->hasAttr(kVisitedAttrName);
  });
}

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}
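
// A minimal usage sketch (hypothetical driver, not part of this conversion;
// it only illustrates how a pass might wire the entry points above together
// with the standard dialect-conversion driver `applyPartialConversion`):
//
//   void runParallelLoopToGpuConversion(Operation *op, MLIRContext *ctx) {
//     RewritePatternSet patterns(ctx);
//     populateParallelLoopToGPUPatterns(patterns);
//     ConversionTarget target(*ctx);
//     target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
//     configureParallelLoopToGPULegality(target);
//     if (succeeded(applyPartialConversion(op, target, std::move(patterns))))
//       finalizeParallelLoopToGPUConversion(op);
//   }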