//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

// Name of the internal attribute used to mark visited operations during
// conversion.
//
// NOTE: The conversion originally used the following legality criteria:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the provided pattern might reject some cases based on more detailed
// analysis of the `mapping` attribute.
// To avoid dialect conversion failure due to a non-converted illegal
// operation, we use this extra Unit attribute as a marker that the operation
// was checked by the pattern and should be considered legal in the following
// legality checks. The `finalizeParallelLoopToGPUConversion` function performs
// cleanup of these extra attributes and is supposed to be called after the
// dialect conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
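// For example, for `affine.for %i = 0 to %N step 4` this materializes an
// `arith.constant 4 : index` at the builder's insertion point (affine.for
// steps are always static).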
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<arith::ConstantIndexOp>(forOp.getLoc(),
                                                forOp.getStep());
}

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.region();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter {
  Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Return true if the value is obviously a constant "one".
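// Only values produced by `arith.constant 1 : index` are recognized; values
// that merely evaluate to one at runtime are conservatively treated as
// non-constant.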
static bool isConstantOne(Value value) {
  if (auto def = value.getDefiningOp<arith::ConstantIndexOp>())
    return def.value() == 1;
  return false;
}

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, llvm::None on failure.
Optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
                                                upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<arith::DivSIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed
// to the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
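  //
  // Rough sketch of the result (the exact IR depends on the bounds, steps and
  // the number of mapped dimensions): for a 2-D nest mapped to one block and
  // one thread dimension, the launch body ends up as
  //   gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz) in (...) {
  //     ... body of the innermost loop, with the original induction variables
  //     ... rebuilt from %bx and %tx below ...
  //     gpu.terminator
  //   }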
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
  launchOp.body().front().getOperations().splice(
      launchOp.body().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always
  // iterate from 0 to N with step 1. Therefore, loop induction variables are
  // replaced with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.body().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
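/// For example (illustrative IR), for
///   %ub = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%n, %i)
/// the first constant result, 32, is returned; a product of two values with
/// known non-negative static bounds yields the product of those bounds.
/// Returns a null Value if no static bound can be derived.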
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.map().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) {
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(),
                                                       constExpr.getValue());
      }
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by a negative value, so abort in this case.
        if (lhs.value() < 0 || rhs.value() < 0)
          return {};

        return rewriter.create<arith::ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.value() * rhs.value());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations
/// to be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and the affine map from
///   the mapping, and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp,
    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
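  // An illustrative annotation for a 2-D loop (identity `map` and `bound`, as
  // the greedy loop mapper would typically produce) could look like:
  //   mapping = [{processor = 0, map = affine_map<(d0) -> (d0)>,
  //               bound = affine_map<(d0) -> (d0)>},
  //              {processor = 3, map = affine_map<(d0) -> (d0)>,
  //               bound = affine_map<(d0) -> (d0)>}]
  // i.e. the first induction variable maps to block id x and the second to
  // thread id x.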
  // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>();
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = gpu::getProcessor(annotation);

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop
      // iv.
      Value operand =
          launchOp.body().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.map().getValue().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.bound().getValue()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
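        // For example (a typical pattern produced by tiling), an inner bound
        //   %ub = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%n, %i)
        // is imprecise; the derived static bound 32 is then used as the
        // launch dimension and the cloned body is guarded by a comparison
        // against the real bound below.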
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression ceilDiv (upperBound - lowerBound) step. We use
          // affine.apply here so that it composes nicely with the provided map.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.bound().getValue().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (bounds.find(processor) != bounds.end()) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of
          // the if body again. We use the launchOp here, as that cannot be
          // part of the body's instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again. We use the launchOp here, as that cannot be part of
      // the body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via a `DictionaryAttribute` named
/// `mapping`, which has three entries:
///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///    thread dimensions and 6 is sequential.
///  - map : An affine map that is used to pre-process hardware ids before
///    substitution.
///  - bound : An affine map that is used to compute the bound of the hardware
///    id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated, as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first, `map`, is used to compute the actual index for
/// substitution from the hardware id. The second, `bound`, is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
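  // The initial launch therefore looks roughly like (sizes refined below from
  // the `bound` maps):
  //   %c1 = arith.constant 1 : index
  //   gpu.launch blocks(%bx, %by, %bz) in (%g0 = %c1, %g1 = %c1, %g2 = %c1)
  //              threads(%tx, %ty, %tz) in (%b0 = %c1, %b1 = %c1, %b2 = %c1)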
  Location loc = parallelOp.getLoc();
  Value constantOne =
      rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.body().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.body().front());

  BlockAndValueMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side-effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) ||
                         clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side-effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
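  // Dimensions without an entry in `launchBounds` keep the constant-one
  // placeholder size the launch operation was created with.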
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
           parallelOp->hasAttr(kVisitedAttrName);
  });
}

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}
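
// A minimal usage sketch (the pass name and surrounding boilerplate are
// illustrative, not part of this file): a conversion pass would typically
// combine the three entry points above as follows.
//
//   void MyParallelLoopToGpuPass::runOnOperation() {
//     RewritePatternSet patterns(&getContext());
//     populateParallelLoopToGPUPatterns(patterns);
//     ConversionTarget target(getContext());
//     target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
//     configureParallelLoopToGPULegality(target);
//     if (failed(applyPartialConversion(getOperation(), target,
//                                       std::move(patterns))))
//       signalPassFailure();
//     finalizeParallelLoopToGPUConversion(getOperation());
//   }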