//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

using llvm::seq;

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}
static SmallVector<Value, 1> getLowerBoundOperands(ForOp forOp) {
  SmallVector<Value, 1> bounds(1, forOp.lowerBound());
  return bounds;
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}
static SmallVector<Value, 1> getUpperBoundOperands(ForOp forOp) {
  SmallVector<Value, 1> bounds(1, forOp.upperBound());
  return bounds;
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep());
}
static Value getOrCreateStep(ForOp forOp, OpBuilder &) { return forOp.step(); }

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}
static Value getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
  return forOp.lowerBound();
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
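// Illustrative note (an assumption about the affine-bound lowering, not a
// guarantee made by this file): for an affine loop such as
//   affine.for %i = 0 to min affine_map<()[s0] -> (s0, 256)>()[%n]
// the multi-result upper bound map is expected to be materialized as standard
// ops computing the minimum of its results (e.g. a cmpi/select chain over %n
// and the constant 256) right above the loop, whereas a plain scf.for already
// carries its bound as a Value and that Value is returned unchanged.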
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}
static Value getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
  return forOp.upperBound();
}

// Check the structure of the loop nest:
// - there are enough loops to map to numDims;
// - the loops are perfectly nested;
// - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
template <typename OpTy>
static LogicalResult checkLoopNestMappableImpl(OpTy forOp, unsigned numDims) {
  Region &limit = forOp.region();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<OpTy>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

template <typename OpTy>
static LogicalResult checkLoopNestMappable(OpTy forOp, unsigned numBlockDims,
                                           unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

template <typename OpTy>
static LogicalResult checkLoopOpMappable(OpTy forOp, unsigned numBlockDims,
                                         unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  if (numBlockDims != numThreadDims) {
    // TODO(ravishankarm) : This can probably be relaxed by having a one-trip
    // loop for the missing dimension, but there is no reason to handle this
    // case for now.
    return forOp.emitError(
        "mismatch in block dimensions and thread dimensions");
  }

  // Check that the forOp contains perfectly nested loops for numBlockDims.
  if (failed(checkLoopNestMappableImpl(forOp, numBlockDims))) {
    return failure();
  }

  // Get to the innermost loop.
  for (auto i : seq<unsigned>(0, numBlockDims - 1)) {
    forOp = cast<OpTy>(&forOp.getBody()->front());
    (void)i;
  }

  // The forOp now points to the body of the innermost loop mapped to blocks.
  for (Operation &op : *forOp.getBody()) {
    // If the operation is a loop, check that it is mappable to workItems.
    if (auto innerLoop = dyn_cast<OpTy>(&op)) {
      if (failed(checkLoopNestMappableImpl(innerLoop, numThreadDims))) {
        return failure();
      }
      continue;
    }
    // TODO(ravishankarm) : If it is not a loop op, it is assumed that the
    // statement is executed by all threads. It might be a collective
    // operation, or some non-side-effecting instruction. Have to decide on
    // "allowable" statements and check for those here.
  }
  return success();
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct LoopToGpuConverter {
  template <typename OpTy>
  Optional<OpTy> collectBounds(OpTy forOp, unsigned numLoops);

  template <typename OpTy>
  void createLaunch(OpTy rootForOp, OpTy innermostForOp, unsigned numBlockDims,
                    unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Return true if the value is obviously a constant "one".
static bool isConstantOne(Value value) {
  if (auto def = value.getDefiningOp<ConstantIndexOp>())
    return def.getValue() == 1;
  return false;
}

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, llvm::None on failure.
template <typename OpTy>
Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
                                                 unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  OpTy currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

    Value range =
        builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<OpTy>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

/// Given `nDims` perfectly nested loops rooted at `rootForOp`, convert them to
/// be partitioned across workgroups or workitems. The values for the
/// workgroup/workitem ids along each dimension are passed in with `ids`. The
/// number of workgroups/workitems along each dimension are passed in with
/// `nids`. The innermost loop is mapped to the x-dimension, followed by the
/// next innermost loop to the y-dimension, and then the z-dimension.
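/// Illustrative sketch (based on the documented behavior of
/// mapLoopToProcessorIds; treat the exact rewritten form as an assumption):
/// mapping `scf.for %i = %lb to %ub step %s` onto a processor id %id out of
/// %nid processors rewrites it into a cyclically distributed loop, roughly
///   scf.for %i = (%lb + %id * %s) to %ub step (%s * %nid)
/// so that each workgroup/workitem only executes its slice of the iterations.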
template <typename OpTy>
static OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value> ids,
                                 ArrayRef<Value> nids) {
  auto nDims = ids.size();
  assert(nDims == nids.size());
  for (auto dim : llvm::seq<unsigned>(0, nDims)) {
    // TODO(ravishankarm): Don't always need to generate a loop here. If nids >=
    // number of iterations of the original loop, this becomes an if
    // condition. Though that does rely on how the workgroup/workitem sizes are
    // specified to begin with.
    mapLoopToProcessorIds(rootForOp, ids[dim], nids[dim]);
    if (dim != nDims - 1) {
      rootForOp = cast<OpTy>(rootForOp.getBody()->front());
    }
  }
  return rootForOp;
}

/// Utility method to convert the gpu::KernelDim3 objects representing the id
/// of each workgroup/workitem and the number of workgroups/workitems along a
/// dimension of the launch into containers.
static void packIdAndNumId(gpu::KernelDim3 kernelIds,
                           gpu::KernelDim3 kernelNids, unsigned nDims,
                           SmallVectorImpl<Value> &ids,
                           SmallVectorImpl<Value> &nids) {
  assert(nDims <= 3 && "invalid number of launch dimensions");
  std::array<Value, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
  std::array<Value, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
  ids.clear();
  ids.append(std::next(allIds.begin(), allIds.size() - nDims), allIds.end());
  nids.clear();
  nids.append(std::next(allNids.begin(), allNids.size() - nDims),
              allNids.end());
}

/// Generate the body of the launch operation.
template <typename OpTy>
static LogicalResult
createLaunchBody(OpBuilder &builder, OpTy rootForOp, gpu::LaunchOp launchOp,
                 unsigned numBlockDims, unsigned numThreadDims) {
  OpBuilder::InsertionGuard bodyInsertionGuard(builder);
  builder.setInsertionPointToEnd(&launchOp.body().front());
  auto terminatorOp = builder.create<gpu::TerminatorOp>(launchOp.getLoc());

  rootForOp.getOperation()->moveBefore(terminatorOp);
  SmallVector<Value, 3> workgroupID, numWorkGroups;
  packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims,
                 workgroupID, numWorkGroups);

  // Partition the loop for mapping to workgroups.
  auto loopOp = createGPULaunchLoops(rootForOp, workgroupID, numWorkGroups);

  // Iterate over the body of the loopOp and get the loops to partition for
  // thread blocks.
  SmallVector<OpTy, 1> threadRootForOps;
  for (Operation &op : *loopOp.getBody()) {
    if (auto threadRootForOp = dyn_cast<OpTy>(&op)) {
      threadRootForOps.push_back(threadRootForOp);
    }
  }

  SmallVector<Value, 3> workItemID, workGroupSize;
  packIdAndNumId(launchOp.getThreadIds(), launchOp.getBlockSize(),
                 numThreadDims, workItemID, workGroupSize);
  for (auto &loopOp : threadRootForOps) {
    builder.setInsertionPoint(loopOp);
    createGPULaunchLoops(loopOp, workItemID, workGroupSize);
  }
  return success();
}

// Convert the computation rooted at the `rootForOp` into a GPU kernel with the
// given workgroup size and number of workgroups.
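// For example (illustrative; the value names are placeholders): with
// numWorkGroups = {%nbx, %nby} and workGroupSizes = {%ntx, %nty}, the launch
// is created with grid sizes (%nbx, %nby, 1) and block sizes (%ntx, %nty, 1);
// dimensions without a corresponding input are padded with the constant 1
// built below.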
template <typename OpTy>
static LogicalResult createLaunchFromOp(OpTy rootForOp,
                                        ArrayRef<Value> numWorkGroups,
                                        ArrayRef<Value> workGroupSizes) {
  OpBuilder builder(rootForOp.getOperation());
  if (numWorkGroups.size() > 3) {
    return rootForOp.emitError("invalid ")
           << numWorkGroups.size() << "-D workgroup specification";
  }
  auto loc = rootForOp.getLoc();
  Value one = builder.create<ConstantOp>(
      loc, builder.getIntegerAttr(builder.getIndexType(), 1));
  SmallVector<Value, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
  for (auto numWorkGroup : enumerate(numWorkGroups)) {
    numWorkGroups3D[numWorkGroup.index()] = numWorkGroup.value();
  }
  for (auto workGroupSize : enumerate(workGroupSizes)) {
    workGroupSize3D[workGroupSize.index()] = workGroupSize.value();
  }

  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), numWorkGroups3D[0], numWorkGroups3D[1],
      numWorkGroups3D[2], workGroupSize3D[0], workGroupSize3D[1],
      workGroupSize3D[2]);
  if (failed(createLaunchBody(builder, rootForOp, launchOp,
                              numWorkGroups.size(), workGroupSizes.size()))) {
    return failure();
  }

  return success();
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed
// to the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
// TODO(ravishankarm) : This method can be modified to use the
// createLaunchFromOp method, since that is a strict generalization of this
// method.
template <typename OpTy>
void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
                                      unsigned numBlockDims,
                                      unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne = (numBlockDims < 3 || numThreadDims < 3)
                       ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
                       : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
  launchOp.body().front().getOperations().splice(
      launchOp.body().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always
  // iterate from 0 to N with step 1. Therefore, loop induction variables are
  // replaced with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.body().front());
  auto lbArgumentIt = lbs.begin();
  auto stepArgumentIt = steps.begin();
  for (auto en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
template <typename OpTy>
static LogicalResult convertLoopNestToGPULaunch(OpTy forOp,
                                                unsigned numBlockDims,
                                                unsigned numThreadDims) {
  if (failed(checkLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  LoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

// Generic loop to GPU kernel conversion function when the loop is imperfectly
// nested.
// The workgroup sizes and number of workgroups are provided as inputs.
template <typename OpTy>
static LogicalResult convertLoopToGPULaunch(OpTy forOp,
                                            ArrayRef<Value> numWorkGroups,
                                            ArrayRef<Value> workGroupSize) {
  if (failed(checkLoopOpMappable(forOp, numWorkGroups.size(),
                                 workGroupSize.size()))) {
    return failure();
  }
  return createLaunchFromOp(forOp, numWorkGroups, workGroupSize);
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

LogicalResult mlir::convertLoopNestToGPULaunch(ForOp forOp,
                                               unsigned numBlockDims,
                                               unsigned numThreadDims) {
  return ::convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

LogicalResult mlir::convertLoopToGPULaunch(scf::ForOp forOp,
                                           ArrayRef<Value> numWorkGroups,
                                           ArrayRef<Value> workGroupSizes) {
  return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.map().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) {
        return rewriter.create<ConstantIndexOp>(minOp.getLoc(),
                                                constExpr.getValue());
      }
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by a negative value, so abort in this case.
        if (lhs.getValue() < 0 || rhs.getValue() < 0)
          return {};

        return rewriter.create<ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.getValue() * rhs.getValue());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations
/// to be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and affine map from the
///   mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp,
    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO(herhut): Verify that this is a valid GPU mapping.
  // Processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential.
  ArrayAttr mapping =
      parallelOp.getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO(herhut): Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp.getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (ConstantOp constOp = val.getDefiningOp<ConstantOp>())
      return rewriter.create<ConstantOp>(constOp.getLoc(), constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(),
                               parallelOp.lowerBound(), parallelOp.upperBound(),
                               parallelOp.step())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>();
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = gpu::getProcessor(annotation);

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop
      // iv.
      Value operand = launchOp.body().front().getArgument(
          getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
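      // Illustrative example (the value names and the identity annotation map
      // are assumptions): for a thread-x mapping with hardware id %tid, step
      // %c4 and lower bound %c0, the op built below is roughly
      //   %idx = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
      //            (%tid)[%c4, %c0]
      // and %idx then substitutes the original induction variable.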
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.map().getValue().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
      // If there was also a bound, insert that, too.
      // TODO(herhut): Check that we do not assign bounds twice.
      if (annotation.bound().getValue()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower-bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper-bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return parallelOp.emitOpError()
                     << "cannot derive loop-invariant upper bound for number "
                        "of iterations";
            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression (upperBound - lowerBound) ceilDiv step. We use
          // affine.apply here so that it composes nicely with the provided
          // map.
          AffineMap stepMap =
              AffineMap::get(0, 3,
                             ((rewriter.getAffineSymbolExpr(0) -
                               rewriter.getAffineSymbolExpr(1))
                                  .ceilDiv(rewriter.getAffineSymbolExpr(2))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.bound().getValue().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // TODO(herhut, ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (bounds.find(processor) != bounds.end()) {
            return parallelOp.emitOpError()
                   << "cannot redefine the bound for processor "
                   << static_cast<int64_t>(processor);
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          CmpIOp pred = rewriter.create<CmpIOp>(
              loc, CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of the
          // if body again.
          // We use the launchOp here, as that cannot be part of the body's
          // instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again. We use the launchOp here, as that cannot be part of
      // the body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }
  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via a `DictionaryAttribute` named
/// `mapping`, which has three entries:
///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///    thread dimensions and 6 is sequential.
///  - map : An affine map that is used to pre-process hardware ids before
///    substitution.
///  - bound : An affine map that is used to compute the bound of the hardware
///    id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
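  // As an illustrative sketch (the printed attribute form is an assumption,
  // not verified output): a loop dimension annotated with a mapping entry like
  //   {processor = 3 : i64, map = affine_map<(d0) -> (d0)>,
  //    bound = affine_map<(d0) -> (d0)>}
  // is mapped to the x thread id; its induction variable is substituted by
  // that id and the launch's blockSizeX operand is later refined to the
  // derived iteration count.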
  Location loc = parallelOp.getLoc();
  Value constantOne = rewriter.create<ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.body().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.body().front());

  BlockAndValueMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) ||
                         clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
                                             MLIRContext *ctx) {
  patterns.insert<ParallelToGpuLaunchLowering>(ctx);
}
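// Illustrative usage sketch (an assumption about a typical driver, not part of
// the conversion itself): a pass would populate these patterns and apply them
// through the dialect conversion framework, e.g.
//   OwningRewritePatternList patterns;
//   populateParallelLoopToGPUPatterns(patterns, ctx);
//   ConversionTarget target(*ctx);
//   target.addIllegalOp<scf::ParallelOp>();
//   (void)applyPartialConversion(op, target, patterns);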