14ead2cf7SAlex Zinenko //===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===// 24ead2cf7SAlex Zinenko // 34ead2cf7SAlex Zinenko // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 44ead2cf7SAlex Zinenko // See https://llvm.org/LICENSE.txt for license information. 54ead2cf7SAlex Zinenko // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 64ead2cf7SAlex Zinenko // 74ead2cf7SAlex Zinenko //===----------------------------------------------------------------------===// 84ead2cf7SAlex Zinenko // 94ead2cf7SAlex Zinenko // This implements a straightforward conversion of an loop nest into a GPU 104ead2cf7SAlex Zinenko // kernel. The caller is expected to guarantee that the conversion is correct 114ead2cf7SAlex Zinenko // or to further transform the kernel to ensure correctness. 124ead2cf7SAlex Zinenko // 134ead2cf7SAlex Zinenko //===----------------------------------------------------------------------===// 144ead2cf7SAlex Zinenko 154ead2cf7SAlex Zinenko #include "mlir/Conversion/SCFToGPU/SCFToGPU.h" 164ead2cf7SAlex Zinenko 174ead2cf7SAlex Zinenko #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" 184ead2cf7SAlex Zinenko #include "mlir/Dialect/Affine/IR/AffineOps.h" 19abc362a1SJakub Kuderski #include "mlir/Dialect/Arith/IR/Arith.h" 20d7ef488bSMogball #include "mlir/Dialect/GPU/IR/GPUDialect.h" 21d7ef488bSMogball #include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h" 22e2310704SJulian Gross #include "mlir/Dialect/MemRef/IR/MemRef.h" 238b68da2cSAlex Zinenko #include "mlir/Dialect/SCF/IR/SCF.h" 244ead2cf7SAlex Zinenko #include "mlir/IR/AffineExpr.h" 254ead2cf7SAlex Zinenko #include "mlir/IR/Builders.h" 264d67b278SJeff Niu #include "mlir/IR/IRMapping.h" 27fc367dfaSMahesh Ravishankar #include "mlir/Interfaces/SideEffectInterfaces.h" 284ead2cf7SAlex Zinenko #include "mlir/Pass/Pass.h" 294ead2cf7SAlex Zinenko #include "mlir/Transforms/DialectConversion.h" 304ead2cf7SAlex Zinenko #include "mlir/Transforms/Passes.h" 314ead2cf7SAlex Zinenko #include "mlir/Transforms/RegionUtils.h" 324ead2cf7SAlex Zinenko #include "llvm/ADT/Sequence.h" 334ead2cf7SAlex Zinenko #include "llvm/Support/Debug.h" 34a1fe1f5fSKazu Hirata #include <optional> 354ead2cf7SAlex Zinenko 364ead2cf7SAlex Zinenko #define DEBUG_TYPE "loops-to-gpu" 374ead2cf7SAlex Zinenko 384ead2cf7SAlex Zinenko using namespace mlir; 394c48f016SMatthias Springer using namespace mlir::affine; 404ead2cf7SAlex Zinenko using namespace mlir::scf; 414ead2cf7SAlex Zinenko 42ec03bbe8SVladislav Vinogradov // Name of internal attribute to mark visited operations during conversion. 43ec03bbe8SVladislav Vinogradov // 44ec03bbe8SVladislav Vinogradov // NOTE: The conversion originally used the following legality criteria: 45ec03bbe8SVladislav Vinogradov // `!parallelOp->hasAttr(gpu::getMappingAttrName())` 46ec03bbe8SVladislav Vinogradov // But the provided pattern might reject some cases based on more detailed 47ec03bbe8SVladislav Vinogradov // analysis of the `mapping` attribute. 48ec03bbe8SVladislav Vinogradov // To avoid dialect conversion failure due to non-converted illegal operation 49ec03bbe8SVladislav Vinogradov // we use this extra Unit attribute as a marker, that the operation was checked 50ec03bbe8SVladislav Vinogradov // by the pattern and is should be considered as legal in the following legality 51ec03bbe8SVladislav Vinogradov // checks. The `finalizeParallelLoopToGPUConversion` function performs clean up 52ec03bbe8SVladislav Vinogradov // of this extra attributes ans is supposed to be called after the dialect 53ec03bbe8SVladislav Vinogradov // conversion. 54ec03bbe8SVladislav Vinogradov // 55ec03bbe8SVladislav Vinogradov // TODO: Implement a cleaner solution, factoring out the "matching" logic 56ec03bbe8SVladislav Vinogradov // from the pattern and its callees into a separate function that can be called 57ec03bbe8SVladislav Vinogradov // from both the pattern and the op legality check. 58ec03bbe8SVladislav Vinogradov static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited"; 59ec03bbe8SVladislav Vinogradov 604ead2cf7SAlex Zinenko // Extract an indexed value from KernelDim3. 614ead2cf7SAlex Zinenko static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) { 624ead2cf7SAlex Zinenko switch (pos) { 634ead2cf7SAlex Zinenko case 0: 644ead2cf7SAlex Zinenko return dim3.x; 654ead2cf7SAlex Zinenko case 1: 664ead2cf7SAlex Zinenko return dim3.y; 674ead2cf7SAlex Zinenko case 2: 684ead2cf7SAlex Zinenko return dim3.z; 694ead2cf7SAlex Zinenko default: 704ead2cf7SAlex Zinenko llvm_unreachable("dim3 position out of bounds"); 714ead2cf7SAlex Zinenko } 724ead2cf7SAlex Zinenko return nullptr; 734ead2cf7SAlex Zinenko } 744ead2cf7SAlex Zinenko 754ead2cf7SAlex Zinenko // Get the lower bound-related operands of a loop operation. 764ead2cf7SAlex Zinenko static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) { 774ead2cf7SAlex Zinenko return forOp.getLowerBoundOperands(); 784ead2cf7SAlex Zinenko } 794ead2cf7SAlex Zinenko 804ead2cf7SAlex Zinenko // Get the upper bound-related operands of a loop operation. 814ead2cf7SAlex Zinenko static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) { 824ead2cf7SAlex Zinenko return forOp.getUpperBoundOperands(); 834ead2cf7SAlex Zinenko } 844ead2cf7SAlex Zinenko 854ead2cf7SAlex Zinenko // Get a Value that corresponds to the loop step. If the step is an attribute, 864ead2cf7SAlex Zinenko // materialize a corresponding constant using builder. 874ead2cf7SAlex Zinenko static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) { 88a54f4eaeSMogball return builder.create<arith::ConstantIndexOp>(forOp.getLoc(), 8923b794f7SMatthias Springer forOp.getStepAsInt()); 904ead2cf7SAlex Zinenko } 914ead2cf7SAlex Zinenko 924ead2cf7SAlex Zinenko // Get a Value for the loop lower bound. If the value requires computation, 934ead2cf7SAlex Zinenko // materialize the instructions using builder. 944ead2cf7SAlex Zinenko static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) { 954ead2cf7SAlex Zinenko return lowerAffineLowerBound(forOp, builder); 964ead2cf7SAlex Zinenko } 974ead2cf7SAlex Zinenko 984ead2cf7SAlex Zinenko // Get a Value for the loop upper bound. If the value requires computation, 994ead2cf7SAlex Zinenko // materialize the instructions using builder. 1004ead2cf7SAlex Zinenko static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) { 1014ead2cf7SAlex Zinenko return lowerAffineUpperBound(forOp, builder); 1024ead2cf7SAlex Zinenko } 1034ead2cf7SAlex Zinenko 1044ead2cf7SAlex Zinenko // Check the structure of the loop nest: 1054ead2cf7SAlex Zinenko // - there are enough loops to map to numDims; 1064ead2cf7SAlex Zinenko // - the loops are perfectly nested; 1074ead2cf7SAlex Zinenko // - the loop bounds can be computed above the outermost loop. 1084ead2cf7SAlex Zinenko // This roughly corresponds to the "matcher" part of the pattern-based 1094ead2cf7SAlex Zinenko // rewriting infrastructure. 1102bcd1927SMaheshRavishankar static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp, 1112bcd1927SMaheshRavishankar unsigned numDims) { 1128df54a6aSJacques Pienaar Region &limit = forOp.getRegion(); 1134ead2cf7SAlex Zinenko for (unsigned i = 0, e = numDims; i < e; ++i) { 1144ead2cf7SAlex Zinenko Operation *nested = &forOp.getBody()->front(); 1154ead2cf7SAlex Zinenko if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) || 1164ead2cf7SAlex Zinenko !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit)) 1174ead2cf7SAlex Zinenko return forOp.emitError( 1184ead2cf7SAlex Zinenko "loops with bounds depending on other mapped loops " 1194ead2cf7SAlex Zinenko "are not supported"); 1204ead2cf7SAlex Zinenko 1214ead2cf7SAlex Zinenko // The innermost loop can have an arbitrary body, skip the perfect nesting 1224ead2cf7SAlex Zinenko // check for it. 1234ead2cf7SAlex Zinenko if (i == e - 1) 1244ead2cf7SAlex Zinenko break; 1254ead2cf7SAlex Zinenko 1264ead2cf7SAlex Zinenko auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end(); 1274ead2cf7SAlex Zinenko if (forOp.getBody()->empty() || std::next(begin, 2) != end) 1284ead2cf7SAlex Zinenko return forOp.emitError("expected perfectly nested loops in the body"); 1294ead2cf7SAlex Zinenko 1302bcd1927SMaheshRavishankar if (!(forOp = dyn_cast<AffineForOp>(nested))) 1314ead2cf7SAlex Zinenko return nested->emitError("expected a nested loop"); 1324ead2cf7SAlex Zinenko } 1334ead2cf7SAlex Zinenko return success(); 1344ead2cf7SAlex Zinenko } 1354ead2cf7SAlex Zinenko 1362bcd1927SMaheshRavishankar static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp, 1372bcd1927SMaheshRavishankar unsigned numBlockDims, 1384ead2cf7SAlex Zinenko unsigned numThreadDims) { 1394ead2cf7SAlex Zinenko if (numBlockDims < 1 || numThreadDims < 1) { 1404ead2cf7SAlex Zinenko LLVM_DEBUG(llvm::dbgs() << "nothing to map"); 1414ead2cf7SAlex Zinenko return success(); 1424ead2cf7SAlex Zinenko } 1434ead2cf7SAlex Zinenko 1444ead2cf7SAlex Zinenko if (numBlockDims > 3) { 1454ead2cf7SAlex Zinenko return forOp.emitError("cannot map to more than 3 block dimensions"); 1464ead2cf7SAlex Zinenko } 1474ead2cf7SAlex Zinenko if (numThreadDims > 3) { 1484ead2cf7SAlex Zinenko return forOp.emitError("cannot map to more than 3 thread dimensions"); 1494ead2cf7SAlex Zinenko } 1502bcd1927SMaheshRavishankar return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims); 1514ead2cf7SAlex Zinenko } 1524ead2cf7SAlex Zinenko 1534ead2cf7SAlex Zinenko namespace { 1544ead2cf7SAlex Zinenko // Helper structure that holds common state of the loop to GPU kernel 1554ead2cf7SAlex Zinenko // conversion. 1562bcd1927SMaheshRavishankar struct AffineLoopToGpuConverter { 1570a81ace0SKazu Hirata std::optional<AffineForOp> collectBounds(AffineForOp forOp, 1580a81ace0SKazu Hirata unsigned numLoops); 1594ead2cf7SAlex Zinenko 1602bcd1927SMaheshRavishankar void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp, 1612bcd1927SMaheshRavishankar unsigned numBlockDims, unsigned numThreadDims); 1624ead2cf7SAlex Zinenko 1634ead2cf7SAlex Zinenko // Ranges of the loops mapped to blocks or threads. 1644ead2cf7SAlex Zinenko SmallVector<Value, 6> dims; 1654ead2cf7SAlex Zinenko // Lower bounds of the loops mapped to blocks or threads. 1664ead2cf7SAlex Zinenko SmallVector<Value, 6> lbs; 1674ead2cf7SAlex Zinenko // Induction variables of the loops mapped to blocks or threads. 1684ead2cf7SAlex Zinenko SmallVector<Value, 6> ivs; 1694ead2cf7SAlex Zinenko // Steps of the loops mapped to blocks or threads. 1704ead2cf7SAlex Zinenko SmallVector<Value, 6> steps; 1714ead2cf7SAlex Zinenko }; 1724ead2cf7SAlex Zinenko } // namespace 1734ead2cf7SAlex Zinenko 1744ead2cf7SAlex Zinenko // Collect ranges, bounds, steps and induction variables in preparation for 1754ead2cf7SAlex Zinenko // mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel. 1764ead2cf7SAlex Zinenko // This may fail if the IR for computing loop bounds cannot be constructed, for 1774ead2cf7SAlex Zinenko // example if an affine loop uses semi-affine maps. Return the last loop to be 178192d9dd7SKazu Hirata // mapped on success, std::nullopt on failure. 1790a81ace0SKazu Hirata std::optional<AffineForOp> 1802bcd1927SMaheshRavishankar AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) { 1814ead2cf7SAlex Zinenko OpBuilder builder(forOp.getOperation()); 1824ead2cf7SAlex Zinenko dims.reserve(numLoops); 1834ead2cf7SAlex Zinenko lbs.reserve(numLoops); 1844ead2cf7SAlex Zinenko ivs.reserve(numLoops); 1854ead2cf7SAlex Zinenko steps.reserve(numLoops); 1862bcd1927SMaheshRavishankar AffineForOp currentLoop = forOp; 1874ead2cf7SAlex Zinenko for (unsigned i = 0; i < numLoops; ++i) { 1884ead2cf7SAlex Zinenko Value lowerBound = getOrEmitLowerBound(currentLoop, builder); 1894ead2cf7SAlex Zinenko Value upperBound = getOrEmitUpperBound(currentLoop, builder); 1904ead2cf7SAlex Zinenko if (!lowerBound || !upperBound) { 1911a36588eSKazu Hirata return std::nullopt; 1924ead2cf7SAlex Zinenko } 1934ead2cf7SAlex Zinenko 194a54f4eaeSMogball Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(), 195a54f4eaeSMogball upperBound, lowerBound); 1964ead2cf7SAlex Zinenko Value step = getOrCreateStep(currentLoop, builder); 197cb7bda2aSMatthias Springer if (getConstantIntValue(step) != static_cast<int64_t>(1)) 198477c0b67SHsiangkai Wang range = 199477c0b67SHsiangkai Wang builder.create<arith::CeilDivSIOp>(currentLoop.getLoc(), range, step); 2004ead2cf7SAlex Zinenko dims.push_back(range); 2014ead2cf7SAlex Zinenko 2024ead2cf7SAlex Zinenko lbs.push_back(lowerBound); 2034ead2cf7SAlex Zinenko ivs.push_back(currentLoop.getInductionVar()); 2044ead2cf7SAlex Zinenko steps.push_back(step); 2054ead2cf7SAlex Zinenko 2064ead2cf7SAlex Zinenko if (i != numLoops - 1) 2072bcd1927SMaheshRavishankar currentLoop = cast<AffineForOp>(¤tLoop.getBody()->front()); 2084ead2cf7SAlex Zinenko } 2094ead2cf7SAlex Zinenko return currentLoop; 2104ead2cf7SAlex Zinenko } 2114ead2cf7SAlex Zinenko 2124ead2cf7SAlex Zinenko // Replace the rooted at "rootForOp" with a GPU launch operation. This expects 2134ead2cf7SAlex Zinenko // "innermostForOp" to point to the last loop to be transformed to the kernel, 2144ead2cf7SAlex Zinenko // and to have (numBlockDims + numThreadDims) perfectly nested loops between 2154ead2cf7SAlex Zinenko // "rootForOp" and "innermostForOp". 2162bcd1927SMaheshRavishankar void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp, 2172bcd1927SMaheshRavishankar AffineForOp innermostForOp, 2184ead2cf7SAlex Zinenko unsigned numBlockDims, 2194ead2cf7SAlex Zinenko unsigned numThreadDims) { 2204ead2cf7SAlex Zinenko OpBuilder builder(rootForOp.getOperation()); 2214ead2cf7SAlex Zinenko // Prepare the grid and block sizes for the launch operation. If there is 2224ead2cf7SAlex Zinenko // no loop mapped to a specific dimension, use constant "1" as its size. 223a54f4eaeSMogball Value constOne = 224a54f4eaeSMogball (numBlockDims < 3 || numThreadDims < 3) 225a54f4eaeSMogball ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1) 2264ead2cf7SAlex Zinenko : nullptr; 2274ead2cf7SAlex Zinenko Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne; 2284ead2cf7SAlex Zinenko Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne; 2294ead2cf7SAlex Zinenko Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne; 2304ead2cf7SAlex Zinenko Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne; 2314ead2cf7SAlex Zinenko Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne; 2324ead2cf7SAlex Zinenko Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne; 2334ead2cf7SAlex Zinenko 2344ead2cf7SAlex Zinenko // Create a launch op and move the body region of the innermost loop to the 2354ead2cf7SAlex Zinenko // launch op. 2364ead2cf7SAlex Zinenko auto launchOp = builder.create<gpu::LaunchOp>( 2374ead2cf7SAlex Zinenko rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX, 2384ead2cf7SAlex Zinenko blockSizeY, blockSizeZ); 2394ead2cf7SAlex Zinenko 2404ead2cf7SAlex Zinenko // Replace the loop terminator (loops contain only a single block) with the 2414ead2cf7SAlex Zinenko // gpu terminator and move the operations from the loop body block to the gpu 2424ead2cf7SAlex Zinenko // launch body block. Do not move the entire block because of the difference 2434ead2cf7SAlex Zinenko // in block arguments. 2444ead2cf7SAlex Zinenko Operation &terminator = innermostForOp.getBody()->back(); 2454ead2cf7SAlex Zinenko Location terminatorLoc = terminator.getLoc(); 2464ead2cf7SAlex Zinenko terminator.erase(); 2474ead2cf7SAlex Zinenko builder.setInsertionPointToEnd(innermostForOp.getBody()); 2481a36588eSKazu Hirata builder.create<gpu::TerminatorOp>(terminatorLoc, std::nullopt); 24910c04f46SRiver Riddle launchOp.getBody().front().getOperations().splice( 25010c04f46SRiver Riddle launchOp.getBody().front().begin(), 2514ead2cf7SAlex Zinenko innermostForOp.getBody()->getOperations()); 2524ead2cf7SAlex Zinenko 2534ead2cf7SAlex Zinenko // Remap the loop iterators to use block/thread identifiers instead. Loops 2544ead2cf7SAlex Zinenko // may iterate from LB with step S whereas GPU thread/block ids always iterate 2554ead2cf7SAlex Zinenko // from 0 to N with step 1. Therefore, loop induction variables are replaced 2564ead2cf7SAlex Zinenko // with (gpu-thread/block-id * S) + LB. 25710c04f46SRiver Riddle builder.setInsertionPointToStart(&launchOp.getBody().front()); 25802b6fb21SMehdi Amini auto *lbArgumentIt = lbs.begin(); 25902b6fb21SMehdi Amini auto *stepArgumentIt = steps.begin(); 260e4853be2SMehdi Amini for (const auto &en : llvm::enumerate(ivs)) { 2614ead2cf7SAlex Zinenko Value id = 2624ead2cf7SAlex Zinenko en.index() < numBlockDims 2634ead2cf7SAlex Zinenko ? getDim3Value(launchOp.getBlockIds(), en.index()) 2644ead2cf7SAlex Zinenko : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims); 2654ead2cf7SAlex Zinenko Value step = steps[en.index()]; 266cb7bda2aSMatthias Springer if (getConstantIntValue(step) != static_cast<int64_t>(1)) 267a54f4eaeSMogball id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id); 2684ead2cf7SAlex Zinenko 2694ead2cf7SAlex Zinenko Value ivReplacement = 270a54f4eaeSMogball builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id); 2714ead2cf7SAlex Zinenko en.value().replaceAllUsesWith(ivReplacement); 2724ead2cf7SAlex Zinenko std::advance(lbArgumentIt, 1); 2734ead2cf7SAlex Zinenko std::advance(stepArgumentIt, 1); 2744ead2cf7SAlex Zinenko } 2754ead2cf7SAlex Zinenko 2764ead2cf7SAlex Zinenko // We are done and can erase the original outermost loop. 2774ead2cf7SAlex Zinenko rootForOp.erase(); 2784ead2cf7SAlex Zinenko } 2794ead2cf7SAlex Zinenko 2804ead2cf7SAlex Zinenko // Generic loop to GPU kernel conversion function. 2812bcd1927SMaheshRavishankar static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp, 2824ead2cf7SAlex Zinenko unsigned numBlockDims, 2834ead2cf7SAlex Zinenko unsigned numThreadDims) { 2842bcd1927SMaheshRavishankar if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims))) 2854ead2cf7SAlex Zinenko return failure(); 2864ead2cf7SAlex Zinenko 2872bcd1927SMaheshRavishankar AffineLoopToGpuConverter converter; 2884ead2cf7SAlex Zinenko auto maybeInnerLoop = 2894ead2cf7SAlex Zinenko converter.collectBounds(forOp, numBlockDims + numThreadDims); 2904ead2cf7SAlex Zinenko if (!maybeInnerLoop) 2914ead2cf7SAlex Zinenko return failure(); 2924ead2cf7SAlex Zinenko converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims); 2934ead2cf7SAlex Zinenko 2944ead2cf7SAlex Zinenko return success(); 2954ead2cf7SAlex Zinenko } 2964ead2cf7SAlex Zinenko 2974ead2cf7SAlex Zinenko LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp, 2984ead2cf7SAlex Zinenko unsigned numBlockDims, 2994ead2cf7SAlex Zinenko unsigned numThreadDims) { 3002bcd1927SMaheshRavishankar return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims); 3014ead2cf7SAlex Zinenko } 3024ead2cf7SAlex Zinenko 3034ead2cf7SAlex Zinenko namespace { 3044ead2cf7SAlex Zinenko struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> { 3054ead2cf7SAlex Zinenko using OpRewritePattern<ParallelOp>::OpRewritePattern; 3064ead2cf7SAlex Zinenko 3074ead2cf7SAlex Zinenko LogicalResult matchAndRewrite(ParallelOp parallelOp, 3084ead2cf7SAlex Zinenko PatternRewriter &rewriter) const override; 3094ead2cf7SAlex Zinenko }; 3104ead2cf7SAlex Zinenko } // namespace 3114ead2cf7SAlex Zinenko 3124ead2cf7SAlex Zinenko /// Tries to derive a static upper bound from the defining operation of 3134ead2cf7SAlex Zinenko /// `upperBound`. 3144ead2cf7SAlex Zinenko static Value deriveStaticUpperBound(Value upperBound, 3154ead2cf7SAlex Zinenko PatternRewriter &rewriter) { 316a54f4eaeSMogball if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) { 3174ead2cf7SAlex Zinenko return op; 3184ead2cf7SAlex Zinenko } 3194ead2cf7SAlex Zinenko 3204ead2cf7SAlex Zinenko if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) { 3218df54a6aSJacques Pienaar for (const AffineExpr &result : minOp.getMap().getResults()) { 3221609f1c2Slong.chen if (auto constExpr = dyn_cast<AffineConstantExpr>(result)) { 323a54f4eaeSMogball return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(), 3244ead2cf7SAlex Zinenko constExpr.getValue()); 3254ead2cf7SAlex Zinenko } 3264ead2cf7SAlex Zinenko } 3274ead2cf7SAlex Zinenko } 3284ead2cf7SAlex Zinenko 329459fd3fbSChristian Sigg if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) { 330459fd3fbSChristian Sigg for (Value operand : {minOp.getLhs(), minOp.getRhs()}) { 331459fd3fbSChristian Sigg if (auto staticBound = deriveStaticUpperBound(operand, rewriter)) 332459fd3fbSChristian Sigg return staticBound; 333459fd3fbSChristian Sigg } 334459fd3fbSChristian Sigg } 335459fd3fbSChristian Sigg 336a54f4eaeSMogball if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) { 337a54f4eaeSMogball if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>( 3384ead2cf7SAlex Zinenko deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter) 3394ead2cf7SAlex Zinenko .getDefiningOp())) 340a54f4eaeSMogball if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>( 3414ead2cf7SAlex Zinenko deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter) 3424ead2cf7SAlex Zinenko .getDefiningOp())) { 3434ead2cf7SAlex Zinenko // Assumptions about the upper bound of minimum computations no longer 344459fd3fbSChristian Sigg // work if multiplied by mixed signs, so abort in this case. 34563022c48SChristian Sigg if ((lhs.value() < 0) != (rhs.value() < 0)) 3464ead2cf7SAlex Zinenko return {}; 3474ead2cf7SAlex Zinenko 348a54f4eaeSMogball return rewriter.create<arith::ConstantIndexOp>( 349a54f4eaeSMogball multiplyOp.getLoc(), lhs.value() * rhs.value()); 3504ead2cf7SAlex Zinenko } 3514ead2cf7SAlex Zinenko } 3524ead2cf7SAlex Zinenko 3534ead2cf7SAlex Zinenko return {}; 3544ead2cf7SAlex Zinenko } 3554ead2cf7SAlex Zinenko 3564ead2cf7SAlex Zinenko static bool isMappedToProcessor(gpu::Processor processor) { 3574ead2cf7SAlex Zinenko return processor != gpu::Processor::Sequential; 3584ead2cf7SAlex Zinenko } 3594ead2cf7SAlex Zinenko 3604ead2cf7SAlex Zinenko static unsigned getLaunchOpArgumentNum(gpu::Processor processor) { 3614ead2cf7SAlex Zinenko switch (processor) { 3624ead2cf7SAlex Zinenko case gpu::Processor::BlockX: 3634ead2cf7SAlex Zinenko return 0; 3644ead2cf7SAlex Zinenko case gpu::Processor::BlockY: 3654ead2cf7SAlex Zinenko return 1; 3664ead2cf7SAlex Zinenko case gpu::Processor::BlockZ: 3674ead2cf7SAlex Zinenko return 2; 3684ead2cf7SAlex Zinenko case gpu::Processor::ThreadX: 3694ead2cf7SAlex Zinenko return 3; 3704ead2cf7SAlex Zinenko case gpu::Processor::ThreadY: 3714ead2cf7SAlex Zinenko return 4; 3724ead2cf7SAlex Zinenko case gpu::Processor::ThreadZ: 3734ead2cf7SAlex Zinenko return 5; 3744ead2cf7SAlex Zinenko default:; 3754ead2cf7SAlex Zinenko } 3764ead2cf7SAlex Zinenko llvm_unreachable( 3774ead2cf7SAlex Zinenko "invalid processor type while retrieving launch op argument number"); 3784ead2cf7SAlex Zinenko } 3794ead2cf7SAlex Zinenko 3804ead2cf7SAlex Zinenko /// Modifies the current transformation state to capture the effect of the given 3814ead2cf7SAlex Zinenko /// `scf.parallel` operation on index substitutions and the operations to be 3824ead2cf7SAlex Zinenko /// inserted. 3834ead2cf7SAlex Zinenko /// Specifically, if a dimension of a parallel loop is mapped to a hardware id, 3844ead2cf7SAlex Zinenko /// this function will 3854ead2cf7SAlex Zinenko /// - compute the loop index based on the hardware id and affine map from the 3864ead2cf7SAlex Zinenko /// mapping and update `cloningMap` to substitute all uses. 3874ead2cf7SAlex Zinenko /// - derive a new upper bound for the hardware id and augment the provided 3884ead2cf7SAlex Zinenko /// `gpu.launch operation` accordingly. 3894ead2cf7SAlex Zinenko /// - if the upper bound is imprecise, insert a conditional in the `gpu.launch` 3904ead2cf7SAlex Zinenko /// and update the rewriter to insert into the conditional's body. 3914ead2cf7SAlex Zinenko /// If the dimension is mapped to sequential, 3924ead2cf7SAlex Zinenko /// - insert a for loop into the body and update the rewriter to insert into 3934ead2cf7SAlex Zinenko /// the for loop's body. 3944ead2cf7SAlex Zinenko /// - update the `cloningMap` to replace uses of the index with the index of 3954ead2cf7SAlex Zinenko /// the new for loop. 3964ead2cf7SAlex Zinenko /// In either case, 3974ead2cf7SAlex Zinenko /// - append the instructions from the loops body to worklist, in reverse order. 3984ead2cf7SAlex Zinenko /// To note the end of the current scope in case a loop or conditional was 3994ead2cf7SAlex Zinenko /// inserted, a sentinel (the `gpu.launch` operation) is inserted into the 4004ead2cf7SAlex Zinenko /// worklist. This signals the processor of the worklist to pop the rewriter 4014ead2cf7SAlex Zinenko /// one scope-level up. 4024ead2cf7SAlex Zinenko static LogicalResult processParallelLoop( 4034d67b278SJeff Niu ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap, 4044d67b278SJeff Niu SmallVectorImpl<Operation *> &worklist, 4054ead2cf7SAlex Zinenko DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) { 4069db53a18SRiver Riddle // TODO: Verify that this is a valid GPU mapping. 4074ead2cf7SAlex Zinenko // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential 4084ead2cf7SAlex Zinenko ArrayAttr mapping = 4090bf4a82aSChristian Sigg parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName()); 4104ead2cf7SAlex Zinenko 411*0e944a30STuomas Kärnä // TODO: Support multiple reductions. 412*0e944a30STuomas Kärnä if (!mapping || parallelOp.getNumResults() > 1) 4134ead2cf7SAlex Zinenko return failure(); 4144ead2cf7SAlex Zinenko 4154ead2cf7SAlex Zinenko Location loc = parallelOp.getLoc(); 4164ead2cf7SAlex Zinenko 4174ead2cf7SAlex Zinenko auto launchIndependent = [&launchOp](Value val) { 4180bf4a82aSChristian Sigg return val.getParentRegion()->isAncestor(launchOp->getParentRegion()); 4194ead2cf7SAlex Zinenko }; 4204ead2cf7SAlex Zinenko 4214ead2cf7SAlex Zinenko auto ensureLaunchIndependent = [&rewriter, 4224ead2cf7SAlex Zinenko launchIndependent](Value val) -> Value { 4234ead2cf7SAlex Zinenko if (launchIndependent(val)) 4244ead2cf7SAlex Zinenko return val; 425a54f4eaeSMogball if (auto constOp = val.getDefiningOp<arith::ConstantOp>()) 426a54f4eaeSMogball return rewriter.create<arith::ConstantOp>(constOp.getLoc(), 427cfb72fd3SJacques Pienaar constOp.getValue()); 4284ead2cf7SAlex Zinenko return {}; 4294ead2cf7SAlex Zinenko }; 4304ead2cf7SAlex Zinenko 431c0342a2dSJacques Pienaar for (auto config : llvm::zip( 432c0342a2dSJacques Pienaar mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(), 433c0342a2dSJacques Pienaar parallelOp.getUpperBound(), parallelOp.getStep())) { 4344ead2cf7SAlex Zinenko Attribute mappingAttribute; 4354ead2cf7SAlex Zinenko Value iv, lowerBound, upperBound, step; 4364ead2cf7SAlex Zinenko std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config; 4377bdd3722SMogball auto annotation = 4385550c821STres Popp dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute); 4394ead2cf7SAlex Zinenko if (!annotation) 4404ead2cf7SAlex Zinenko return parallelOp.emitOpError() 4414ead2cf7SAlex Zinenko << "expected mapping attribute for lowering to GPU"; 4424ead2cf7SAlex Zinenko Value newIndex; 4437bdd3722SMogball gpu::Processor processor = annotation.getProcessor(); 4444ead2cf7SAlex Zinenko 4454ead2cf7SAlex Zinenko if (isMappedToProcessor(processor)) { 4464ead2cf7SAlex Zinenko // Use the corresponding thread/grid index as replacement for the loop iv. 447e2b71610SRahul Joshi Value operand = 44810c04f46SRiver Riddle launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor)); 4494ead2cf7SAlex Zinenko // Take the indexmap and add the lower bound and step computations in. 4504ead2cf7SAlex Zinenko // This computes operand * step + lowerBound. 4514ead2cf7SAlex Zinenko // Use an affine map here so that it composes nicely with the provided 4524ead2cf7SAlex Zinenko // annotation. 4534ead2cf7SAlex Zinenko AffineMap lowerAndStep = AffineMap::get( 4544ead2cf7SAlex Zinenko 1, 2, 4554ead2cf7SAlex Zinenko rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + 4564ead2cf7SAlex Zinenko rewriter.getAffineSymbolExpr(1)); 4574ead2cf7SAlex Zinenko newIndex = rewriter.create<AffineApplyOp>( 4587bdd3722SMogball loc, annotation.getMap().compose(lowerAndStep), 459227bfa1fSlong.chen ValueRange{operand, ensureLaunchIndependent(step), 460227bfa1fSlong.chen ensureLaunchIndependent(lowerBound)}); 4614ead2cf7SAlex Zinenko // If there was also a bound, insert that, too. 4629db53a18SRiver Riddle // TODO: Check that we do not assign bounds twice. 4637bdd3722SMogball if (annotation.getBound()) { 4644ead2cf7SAlex Zinenko // We pass as the single operand to the bound-map the number of 4654ead2cf7SAlex Zinenko // iterations, which is (upperBound - lowerBound) ceilDiv step. To 4664ead2cf7SAlex Zinenko // support inner loops with dynamic upper bounds (as generated by e.g. 4674ead2cf7SAlex Zinenko // tiling), try to derive a max for the bounds. If the used bound for 4684ead2cf7SAlex Zinenko // the hardware id is imprecise, wrap the contained code into a 4694ead2cf7SAlex Zinenko // conditional. If the lower-bound is constant or defined before the 4704ead2cf7SAlex Zinenko // launch, we can use it in the launch bounds. Otherwise fail. 4714ead2cf7SAlex Zinenko if (!launchIndependent(lowerBound) && 472a54f4eaeSMogball !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp())) 4734ead2cf7SAlex Zinenko return failure(); 4744ead2cf7SAlex Zinenko // The step must also be constant or defined outside of the loop nest. 4754ead2cf7SAlex Zinenko if (!launchIndependent(step) && 476a54f4eaeSMogball !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp())) 4774ead2cf7SAlex Zinenko return failure(); 4784ead2cf7SAlex Zinenko // If the upper-bound is constant or defined before the launch, we can 4794ead2cf7SAlex Zinenko // use it in the launch bounds directly. Otherwise try derive a bound. 4804ead2cf7SAlex Zinenko bool boundIsPrecise = 4814ead2cf7SAlex Zinenko launchIndependent(upperBound) || 482a54f4eaeSMogball isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp()); 4834ead2cf7SAlex Zinenko { 4844ead2cf7SAlex Zinenko PatternRewriter::InsertionGuard guard(rewriter); 4854ead2cf7SAlex Zinenko rewriter.setInsertionPoint(launchOp); 4864ead2cf7SAlex Zinenko if (!boundIsPrecise) { 4874ead2cf7SAlex Zinenko upperBound = deriveStaticUpperBound(upperBound, rewriter); 4884ead2cf7SAlex Zinenko if (!upperBound) { 4895da2423bSStephan Herhut return rewriter.notifyMatchFailure( 4905da2423bSStephan Herhut parallelOp, 4915da2423bSStephan Herhut "cannot derive loop-invariant upper bound for number of" 4925da2423bSStephan Herhut "iterations"); 4934ead2cf7SAlex Zinenko } 4944ead2cf7SAlex Zinenko } 4954ead2cf7SAlex Zinenko // Compute the number of iterations needed. We compute this as an 4964ead2cf7SAlex Zinenko // affine expression ceilDiv (upperBound - lowerBound) step. We use 4974ead2cf7SAlex Zinenko // affine.apply here so that it composes nicely with the provided map. 49872d5ac90STres Popp AffineMap stepMap = AffineMap::get( 49972d5ac90STres Popp 1, 2, 50072d5ac90STres Popp ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0)) 50172d5ac90STres Popp .ceilDiv(rewriter.getAffineSymbolExpr(1)))); 5024ead2cf7SAlex Zinenko Value launchBound = rewriter.create<AffineApplyOp>( 5037bdd3722SMogball loc, annotation.getBound().compose(stepMap), 5044ead2cf7SAlex Zinenko ValueRange{ 5054ead2cf7SAlex Zinenko ensureLaunchIndependent( 5064ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(upperBound)), 5074ead2cf7SAlex Zinenko ensureLaunchIndependent( 5084ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(lowerBound)), 5094ead2cf7SAlex Zinenko ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); 5104ead2cf7SAlex Zinenko // todo(herhut,ravishankarm): Update the behavior of setMappingAttr 5114ead2cf7SAlex Zinenko // when this condition is relaxed. 51201a0e85aSKazu Hirata if (!bounds.try_emplace(processor, launchBound).second) { 5135da2423bSStephan Herhut return rewriter.notifyMatchFailure( 5145da2423bSStephan Herhut parallelOp, "cannot redefine the bound for processor " + 5155da2423bSStephan Herhut Twine(static_cast<int64_t>(processor))); 5164ead2cf7SAlex Zinenko } 5174ead2cf7SAlex Zinenko } 5184ead2cf7SAlex Zinenko if (!boundIsPrecise) { 5194ead2cf7SAlex Zinenko // We are using an approximation, create a surrounding conditional. 5204ead2cf7SAlex Zinenko Value originalBound = std::get<3>(config); 521a54f4eaeSMogball arith::CmpIOp pred = rewriter.create<arith::CmpIOp>( 522a54f4eaeSMogball loc, arith::CmpIPredicate::slt, newIndex, 5234ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(originalBound)); 5244ead2cf7SAlex Zinenko scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false); 525c0342a2dSJacques Pienaar rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); 5264ead2cf7SAlex Zinenko // Put a sentinel into the worklist so we know when to pop out of the 5274ead2cf7SAlex Zinenko // if body again. We use the launchOp here, as that cannot be part of 5284ead2cf7SAlex Zinenko // the bodies instruction. 5294ead2cf7SAlex Zinenko worklist.push_back(launchOp.getOperation()); 5304ead2cf7SAlex Zinenko } 5314ead2cf7SAlex Zinenko } 5324ead2cf7SAlex Zinenko } else { 5334ead2cf7SAlex Zinenko // Create a sequential for loop. 5344ead2cf7SAlex Zinenko auto loopOp = rewriter.create<scf::ForOp>( 5354ead2cf7SAlex Zinenko loc, cloningMap.lookupOrDefault(lowerBound), 5364ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(upperBound), 5374ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(step)); 5384ead2cf7SAlex Zinenko newIndex = loopOp.getInductionVar(); 5394ead2cf7SAlex Zinenko rewriter.setInsertionPointToStart(loopOp.getBody()); 5404ead2cf7SAlex Zinenko // Put a sentinel into the worklist so we know when to pop out of the loop 5414ead2cf7SAlex Zinenko // body again. We use the launchOp here, as that cannot be part of the 5424ead2cf7SAlex Zinenko // bodies instruction. 5434ead2cf7SAlex Zinenko worklist.push_back(launchOp.getOperation()); 5444ead2cf7SAlex Zinenko } 5454ead2cf7SAlex Zinenko cloningMap.map(iv, newIndex); 5464ead2cf7SAlex Zinenko } 547396e7f45SArtur Bialas 548396e7f45SArtur Bialas // Propagate custom user defined optional attributes, that can be used at 549396e7f45SArtur Bialas // later stage, such as extension data for GPU kernel dispatch 55056774bddSMarius Brehler for (const auto &namedAttr : parallelOp->getAttrs()) { 5510c7890c8SRiver Riddle if (namedAttr.getName() == gpu::getMappingAttrName() || 5520c7890c8SRiver Riddle namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr()) 553396e7f45SArtur Bialas continue; 5540c7890c8SRiver Riddle launchOp->setAttr(namedAttr.getName(), namedAttr.getValue()); 555396e7f45SArtur Bialas } 556396e7f45SArtur Bialas 5574ead2cf7SAlex Zinenko Block *body = parallelOp.getBody(); 5584ead2cf7SAlex Zinenko worklist.reserve(worklist.size() + body->getOperations().size()); 559*0e944a30STuomas Kärnä // Include scf.reduce terminator if exists and has an operand. 560*0e944a30STuomas Kärnä if (auto terminator = body->getTerminator(); 561*0e944a30STuomas Kärnä isa<scf::ReduceOp>(terminator) && terminator->getOperands().size() == 1) { 562*0e944a30STuomas Kärnä worklist.push_back(terminator); 563*0e944a30STuomas Kärnä } 5644ead2cf7SAlex Zinenko for (Operation &op : llvm::reverse(body->without_terminator())) 5654ead2cf7SAlex Zinenko worklist.push_back(&op); 5664ead2cf7SAlex Zinenko return success(); 5674ead2cf7SAlex Zinenko } 5684ead2cf7SAlex Zinenko 5694ead2cf7SAlex Zinenko /// Lower a `scf.parallel` operation into a corresponding `gpu.launch` 5704ead2cf7SAlex Zinenko /// operation. 5714ead2cf7SAlex Zinenko /// 5724ead2cf7SAlex Zinenko /// This essentially transforms a loop nest into a corresponding SIMT function. 5734ead2cf7SAlex Zinenko /// The conversion is driven by mapping annotations on the `scf.parallel` 5744ead2cf7SAlex Zinenko /// operations. The mapping is provided via a `DictionaryAttribute` named 5754ead2cf7SAlex Zinenko /// `mapping`, which has three entries: 5764ead2cf7SAlex Zinenko /// - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are 5774ead2cf7SAlex Zinenko /// thread dimensions and 6 is sequential. 5784ead2cf7SAlex Zinenko /// - map : An affine map that is used to pre-process hardware ids before 5794ead2cf7SAlex Zinenko /// substitution. 5804ead2cf7SAlex Zinenko /// - bound : An affine map that is used to compute the bound of the hardware 5814ead2cf7SAlex Zinenko /// id based on an upper bound of the number of iterations. 5824ead2cf7SAlex Zinenko /// If the `scf.parallel` contains nested `scf.parallel` operations, those 5834ead2cf7SAlex Zinenko /// need to be annotated, as well. Structurally, the transformation works by 5844ead2cf7SAlex Zinenko /// splicing all operations from nested `scf.parallel` operations into a single 5854ead2cf7SAlex Zinenko /// sequence. Indices mapped to hardware ids are substituted with those ids, 5864ead2cf7SAlex Zinenko /// wheras sequential mappings result in a sequential for-loop. To have more 5874ead2cf7SAlex Zinenko /// flexibility when mapping code to hardware ids, the transform supports two 5884ead2cf7SAlex Zinenko /// affine maps. The first `map` is used to compute the actual index for 5894ead2cf7SAlex Zinenko /// substitution from the hardware id. The second `bound` is used to compute the 5904ead2cf7SAlex Zinenko /// launch dimension for the hardware id from the number of iterations the 5914ead2cf7SAlex Zinenko /// mapped loop is performing. Note that the number of iterations might be 5924ead2cf7SAlex Zinenko /// imprecise if the corresponding loop-bounds are loop-dependent. In such case, 5934ead2cf7SAlex Zinenko /// the hardware id might iterate over additional indices. The transformation 5944ead2cf7SAlex Zinenko /// caters for this by predicating the created sequence of instructions on 5954ead2cf7SAlex Zinenko /// the actual loop bound. This only works if an static upper bound for the 5964ead2cf7SAlex Zinenko /// dynamic loop bound can be derived, currently via analyzing `affine.min` 5974ead2cf7SAlex Zinenko /// operations. 5984ead2cf7SAlex Zinenko LogicalResult 5994ead2cf7SAlex Zinenko ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, 6004ead2cf7SAlex Zinenko PatternRewriter &rewriter) const { 601ec03bbe8SVladislav Vinogradov // Mark the operation as visited for recursive legality check. 602ec03bbe8SVladislav Vinogradov parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr()); 603ec03bbe8SVladislav Vinogradov 6045da2423bSStephan Herhut // We can only transform starting at the outer-most loop. Launches inside of 6055da2423bSStephan Herhut // parallel loops are not supported. 6060bf4a82aSChristian Sigg if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>()) 6075da2423bSStephan Herhut return failure(); 6084ead2cf7SAlex Zinenko // Create a launch operation. We start with bound one for all grid/block 6094ead2cf7SAlex Zinenko // sizes. Those will be refined later as we discover them from mappings. 6104ead2cf7SAlex Zinenko Location loc = parallelOp.getLoc(); 611a54f4eaeSMogball Value constantOne = 612a54f4eaeSMogball rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1); 6134ead2cf7SAlex Zinenko gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>( 6144ead2cf7SAlex Zinenko parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne, 6154ead2cf7SAlex Zinenko constantOne, constantOne); 61610c04f46SRiver Riddle rewriter.setInsertionPointToEnd(&launchOp.getBody().front()); 6174ead2cf7SAlex Zinenko rewriter.create<gpu::TerminatorOp>(loc); 61810c04f46SRiver Riddle rewriter.setInsertionPointToStart(&launchOp.getBody().front()); 6194ead2cf7SAlex Zinenko 6204d67b278SJeff Niu IRMapping cloningMap; 6214ead2cf7SAlex Zinenko llvm::DenseMap<gpu::Processor, Value> launchBounds; 6224ead2cf7SAlex Zinenko SmallVector<Operation *, 16> worklist; 6234ead2cf7SAlex Zinenko if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist, 6244ead2cf7SAlex Zinenko launchBounds, rewriter))) 6254ead2cf7SAlex Zinenko return failure(); 6264ead2cf7SAlex Zinenko 6274ead2cf7SAlex Zinenko // Whether we have seen any side-effects. Reset when leaving an inner scope. 6284ead2cf7SAlex Zinenko bool seenSideeffects = false; 6294ead2cf7SAlex Zinenko // Whether we have left a nesting scope (and hence are no longer innermost). 6304ead2cf7SAlex Zinenko bool leftNestingScope = false; 6314ead2cf7SAlex Zinenko while (!worklist.empty()) { 6324ead2cf7SAlex Zinenko Operation *op = worklist.pop_back_val(); 6334ead2cf7SAlex Zinenko // Now walk over the body and clone it. 6344ead2cf7SAlex Zinenko // TODO: This is only correct if there either is no further scf.parallel 6354ead2cf7SAlex Zinenko // nested or this code is side-effect free. Otherwise we might need 6364ead2cf7SAlex Zinenko // predication. We are overly conservative for now and only allow 6374ead2cf7SAlex Zinenko // side-effects in the innermost scope. 6384ead2cf7SAlex Zinenko if (auto nestedParallel = dyn_cast<ParallelOp>(op)) { 6394ead2cf7SAlex Zinenko // Before entering a nested scope, make sure there have been no 6404ead2cf7SAlex Zinenko // sideeffects until now. 6414ead2cf7SAlex Zinenko if (seenSideeffects) 6424ead2cf7SAlex Zinenko return failure(); 6434ead2cf7SAlex Zinenko // A nested scf.parallel needs insertion of code to compute indices. 6444ead2cf7SAlex Zinenko // Insert that now. This will also update the worklist with the loops 6454ead2cf7SAlex Zinenko // body. 6464ead2cf7SAlex Zinenko if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap, 6474ead2cf7SAlex Zinenko worklist, launchBounds, rewriter))) 6484ead2cf7SAlex Zinenko return failure(); 6494ead2cf7SAlex Zinenko } else if (op == launchOp.getOperation()) { 6504ead2cf7SAlex Zinenko // Found our sentinel value. We have finished the operations from one 6514ead2cf7SAlex Zinenko // nesting level, pop one level back up. 65202b6fb21SMehdi Amini auto *parent = rewriter.getInsertionPoint()->getParentOp(); 6534ead2cf7SAlex Zinenko rewriter.setInsertionPointAfter(parent); 6544ead2cf7SAlex Zinenko leftNestingScope = true; 6554ead2cf7SAlex Zinenko seenSideeffects = false; 656*0e944a30STuomas Kärnä } else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) { 657*0e944a30STuomas Kärnä // Convert scf.reduction op 658*0e944a30STuomas Kärnä auto parentLoop = op->getParentOfType<ParallelOp>(); 659*0e944a30STuomas Kärnä if (!parentLoop || op->getOperands().size() != 1) 660*0e944a30STuomas Kärnä return failure(); 661*0e944a30STuomas Kärnä auto operand = op->getOperands().front(); 662*0e944a30STuomas Kärnä auto newValue = cloningMap.lookupOrNull(operand); 663*0e944a30STuomas Kärnä if (!newValue || !operand.getType().isSignlessIntOrFloat()) 664*0e944a30STuomas Kärnä return failure(); 665*0e944a30STuomas Kärnä // Ensure reduction region is isolated from above. 666*0e944a30STuomas Kärnä llvm::SetVector<Value> externalValues; 667*0e944a30STuomas Kärnä getUsedValuesDefinedAbove(reduceOp.getRegion(0), externalValues); 668*0e944a30STuomas Kärnä if (externalValues.size()) 669*0e944a30STuomas Kärnä return failure(); 670*0e944a30STuomas Kärnä // Replace by gpu.all_reduce. 671*0e944a30STuomas Kärnä auto gpuRedOp = rewriter.create<gpu::AllReduceOp>(loc, newValue); 672*0e944a30STuomas Kärnä cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult()); 673*0e944a30STuomas Kärnä // Copy region. 674*0e944a30STuomas Kärnä rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(), 675*0e944a30STuomas Kärnä gpuRedOp.getRegion().begin()); 676*0e944a30STuomas Kärnä // Replace src.reduce.return with gpu.yield. 677*0e944a30STuomas Kärnä auto scfReturn = gpuRedOp.getRegion().front().getTerminator(); 678*0e944a30STuomas Kärnä auto ip = rewriter.saveInsertionPoint(); 679*0e944a30STuomas Kärnä rewriter.setInsertionPointToEnd(&gpuRedOp.getRegion().front()); 680*0e944a30STuomas Kärnä rewriter.replaceOpWithNewOp<gpu::YieldOp>( 681*0e944a30STuomas Kärnä scfReturn, scfReturn->getOperands().front()); 682*0e944a30STuomas Kärnä rewriter.restoreInsertionPoint(ip); 6834ead2cf7SAlex Zinenko } else { 6844ead2cf7SAlex Zinenko // Otherwise we copy it over. 6854ead2cf7SAlex Zinenko Operation *clone = rewriter.clone(*op, cloningMap); 6864ead2cf7SAlex Zinenko cloningMap.map(op->getResults(), clone->getResults()); 6874ead2cf7SAlex Zinenko // Check for side effects. 6884ead2cf7SAlex Zinenko // TODO: Handle region side effects properly. 689fc367dfaSMahesh Ravishankar seenSideeffects |= 690fc367dfaSMahesh Ravishankar !isMemoryEffectFree(clone) || clone->getNumRegions() != 0; 6914ead2cf7SAlex Zinenko // If we are no longer in the innermost scope, sideeffects are disallowed. 6924ead2cf7SAlex Zinenko if (seenSideeffects && leftNestingScope) 6934ead2cf7SAlex Zinenko return failure(); 6944ead2cf7SAlex Zinenko } 6954ead2cf7SAlex Zinenko } 6964ead2cf7SAlex Zinenko 6974ead2cf7SAlex Zinenko // Now that we succeeded creating the launch operation, also update the 6984ead2cf7SAlex Zinenko // bounds. 6994ead2cf7SAlex Zinenko for (auto bound : launchBounds) 7004ead2cf7SAlex Zinenko launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)), 7014ead2cf7SAlex Zinenko std::get<1>(bound)); 7024ead2cf7SAlex Zinenko 7034ead2cf7SAlex Zinenko rewriter.eraseOp(parallelOp); 7044ead2cf7SAlex Zinenko return success(); 7054ead2cf7SAlex Zinenko } 7064ead2cf7SAlex Zinenko 707dc4e913bSChris Lattner void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) { 708dc4e913bSChris Lattner patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext()); 7094ead2cf7SAlex Zinenko } 7105da2423bSStephan Herhut 7115da2423bSStephan Herhut void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) { 712e2310704SJulian Gross target.addLegalDialect<memref::MemRefDialect>(); 7135da2423bSStephan Herhut target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) { 714ec03bbe8SVladislav Vinogradov return !parallelOp->hasAttr(gpu::getMappingAttrName()) || 715ec03bbe8SVladislav Vinogradov parallelOp->hasAttr(kVisitedAttrName); 716ec03bbe8SVladislav Vinogradov }); 717ec03bbe8SVladislav Vinogradov } 718ec03bbe8SVladislav Vinogradov 719ec03bbe8SVladislav Vinogradov void mlir::finalizeParallelLoopToGPUConversion(Operation *op) { 720ec03bbe8SVladislav Vinogradov op->walk([](scf::ParallelOp parallelOp) { 721ec03bbe8SVladislav Vinogradov parallelOp->removeAttr(kVisitedAttrName); 7225da2423bSStephan Herhut }); 7235da2423bSStephan Herhut } 724