//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"
#include <optional>

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::affine;
using namespace mlir::scf;

// Name of internal attribute to mark visited operations during conversion.
//
// NOTE: The conversion originally used the following legality criteria:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the provided pattern might reject some cases based on more detailed
// analysis of the `mapping` attribute.
// To avoid dialect conversion failure due to a non-converted illegal operation,
// we use this extra unit attribute as a marker that the operation was checked
// by the pattern and should be considered legal in the following legality
// checks. The `finalizeParallelLoopToGPUConversion` function performs cleanup
// of these extra attributes and is supposed to be called after the dialect
// conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step.
// If the step is an attribute, materialize a corresponding constant using
// builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<arith::ConstantIndexOp>(forOp.getLoc(),
                                                forOp.getStepAsInt());
}

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.getRegion();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter {
  std::optional<AffineForOp> collectBounds(AffineForOp forOp,
                                           unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, std::nullopt on failure.
std::optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return std::nullopt;
    }

    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
                                                upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      range =
          builder.create<arith::CeilDivSIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed
// to the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
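  // (The loop body block carries the induction variable, and possibly
  // iter_args, as block arguments, whereas the launch body block carries the
  // block/thread id and size arguments, so the two blocks cannot simply be
  // merged wholesale.)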
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, std::nullopt);
  launchOp.getBody().front().getOperations().splice(
      launchOp.getBody().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always
  // iterate from 0 to N with step 1. Therefore, loop induction variables are
  // replaced with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
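/// For example (an illustrative note, not an exhaustive specification): if the
/// bound is produced by an `affine.min` whose map contains a constant result,
/// that constant is materialized and returned; if it is produced by a
/// multiplication of two values whose bounds can be derived this way, the
/// product of the derived constants is returned.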
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = dyn_cast<AffineConstantExpr>(result)) {
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(),
                                                       constExpr.getValue());
      }
    }
  }

  if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
    for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
      if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
        return staticBound;
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by mixed signs, so abort in this case.
        if ((lhs.value() < 0) != (rhs.value() < 0))
          return {};

        return rewriter.create<arith::ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.value() * rhs.value());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations
/// to be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and the affine map from
///   the mapping, and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
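///
/// As an informal sketch of the index arithmetic performed below (using the
/// names from the mapping annotation): for a dimension with lower bound `lb`,
/// step `s` and hardware id `id`, the replacement index is
/// `map(id * s + lb)`, and the launch bound registered for that hardware id is
/// `bound((ub - lb) ceilDiv s)`, where `ub` is the (possibly statically
/// derived) upper bound.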
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
    SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support multiple reductions.
  if (!mapping || parallelOp.getNumResults() > 1)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation =
        dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop
      // iv.
      Value operand =
          launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.getMap().compose(lowerAndStep),
          ValueRange{operand, ensureLaunchIndependent(step),
                     ensureLaunchIndependent(lowerBound)});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.getBound()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
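        // (Both restrictions exist because the launch-bound computation below
        // is emitted above the gpu.launch, so every value it uses must be
        // available at that point.)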
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression (upperBound - lowerBound) ceilDiv step. We use
          // affine.apply here so that it composes nicely with the provided
          // map.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.getBound().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // TODO(herhut, ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (!bounds.try_emplace(processor, launchBound).second) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of
          // the if body again. We use the launchOp here, as that cannot be
          // part of the body's instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again. We use the launchOp here, as that cannot be part of
      // the body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom, user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  // Include the scf.reduce terminator if it exists and has an operand.
  if (auto terminator = body->getTerminator();
      isa<scf::ReduceOp>(terminator) && terminator->getOperands().size() == 1) {
    worklist.push_back(terminator);
  }
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via an array attribute named `mapping`,
/// with one element per loop dimension; each element has three entries:
/// - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///   thread dimensions and 6 is sequential.
/// - map : An affine map that is used to pre-process hardware ids before
///   substitution.
/// - bound : An affine map that is used to compute the bound of the hardware
///   id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for the recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation.
  // We start with bound one for all grid/block sizes. Those will be refined
  // later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne =
      rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.getBody().front());

  IRMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no side
      // effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
      // Convert the scf.reduce op.
      auto parentLoop = op->getParentOfType<ParallelOp>();
      if (!parentLoop || op->getOperands().size() != 1)
        return failure();
      auto operand = op->getOperands().front();
      auto newValue = cloningMap.lookupOrNull(operand);
      if (!newValue || !operand.getType().isSignlessIntOrFloat())
        return failure();
      // Ensure the reduction region is isolated from above.
      llvm::SetVector<Value> externalValues;
      getUsedValuesDefinedAbove(reduceOp.getRegion(0), externalValues);
      if (externalValues.size())
        return failure();
      // Replace by gpu.all_reduce.
      auto gpuRedOp = rewriter.create<gpu::AllReduceOp>(loc, newValue);
      cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult());
      // Copy the region.
      rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(),
                                  gpuRedOp.getRegion().begin());
      // Replace the scf.reduce.return with gpu.yield.
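      // (The body region of gpu.all_reduce is expected to be terminated by a
      // gpu.yield carrying the combined value, which the replacement below
      // establishes.)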
      auto scfReturn = gpuRedOp.getRegion().front().getTerminator();
      auto ip = rewriter.saveInsertionPoint();
      rewriter.setInsertionPointToEnd(&gpuRedOp.getRegion().front());
      rewriter.replaceOpWithNewOp<gpu::YieldOp>(
          scfReturn, scfReturn->getOperands().front());
      rewriter.restoreInsertionPoint(ip);
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |=
          !isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
           parallelOp->hasAttr(kVisitedAttrName);
  });
}

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}
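
// A minimal usage sketch (illustrative only, not part of this file's API
// surface): a pass driving this conversion would typically do something like
//
//   RewritePatternSet patterns(ctx);
//   populateParallelLoopToGPUPatterns(patterns);
//   ConversionTarget target(*ctx);
//   // ... mark the dialects the lowering produces as legal ...
//   configureParallelLoopToGPULegality(target);
//   if (failed(applyPartialConversion(op, target, std::move(patterns))))
//     signalPassFailure();
//   finalizeParallelLoopToGPUConversion(op);
//
// where `op` is the operation the pass runs on and `ctx` is its MLIRContext.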