//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/SCF/Transforms/Passes.h"

#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseMap.h"

namespace mlir {
#define GEN_PASS_DEF_SCFFORLOOPPEELING
#define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
#define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using namespace mlir::affine;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.getUpperBound().size());
  for (auto bound : op.getUpperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (AffineExpr expr : minOp.getMap().getResults()) {
      if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
    if (minConstant == std::numeric_limits<int64_t>::max())
      return;
    constantIndices.push_back(minConstant);
  }

  OpBuilder b(op);
  IRMapping map;
  Value cond;
  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
    Value constant =
        b.create<arith::ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
    Value cmp = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                        std::get<0>(bound), constant);
    cond = cond ? b.create<arith::AndIOp>(op.getLoc(), cond, cmp) : cmp;
    map.map(std::get<0>(bound), constant);
  }
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds defined by an affine.min with a constant into
/// 2 loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
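///
/// As an illustrative sketch (the SSA names and the affine map below are made
/// up, not taken from an actual test), a loop such as
///
///   %ub = affine.min affine_map<(d0) -> (64, d0)>(%n)
///   scf.for %i = %c0 to %ub step %c1 { ... }
///
/// is rewritten to roughly:
///
///   %c64 = arith.constant 64 : index
///   %eq  = arith.cmpi eq, %ub, %c64 : index
///   scf.if %eq {
///     scf.for %i = %c0 to %c64 step %c1 { ... } // constant bound, unrollable
///   } else {
///     scf.for %i = %c0 to %ub step %c1 { ... }  // original dynamic bound
///   }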
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.getUpperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.getMap().getResults()) {
    if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  IRMapping map;
  Value constant = b.create<arith::ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                       bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by another scf.for for the last (partial) iteration (if any).
///
/// This function rewrites the given scf.for loop in-place and clones it for
/// the last (partial) iteration. All uses of the unpeeled loop's results are
/// replaced with the results of the newly generated loop, which takes the
/// original loop's results as its init args.
///
/// The loop for the partial iteration is returned via `partialIteration`. The
/// boundary at which the loop is split (the new upper bound of the main loop)
/// is returned via `splitBound`. The return value indicates whether the loop
/// was rewritten or not.
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
                                 ForOp &partialIteration, Value &splitBound) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // No specialization necessary if step size is 1.
  if (getConstantIntValue(forOp.getStep()) == static_cast<int64_t>(1))
    return failure();

  // No specialization necessary if step already divides upper bound evenly.
  // Fast path: lb, ub and step are constants.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // Slow path: Examine the ops that define lb, ub and step.
  AffineExpr sym0, sym1, sym2;
  bindSymbols(b.getContext(), sym0, sym1, sym2);
  SmallVector<Value> operands{forOp.getLowerBound(), forOp.getUpperBound(),
                              forOp.getStep()};
  AffineMap map = AffineMap::get(0, 3, {(sym1 - sym0) % sym2});
  affine::fullyComposeAffineMapAndOperands(&map, &operands);
  if (auto constExpr = dyn_cast<AffineConstantExpr>(map.getResult(0)))
    if (constExpr.getValue() == 0)
      return failure();

  // New upper bound: %ub - (%ub - %lb) mod %step
  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
  b.setInsertionPoint(forOp);
  auto loc = forOp.getLoc();
  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
                                             ValueRange{forOp.getLowerBound(),
                                                        forOp.getUpperBound(),
                                                        forOp.getStep()});

  // Create ForOp for partial iteration.
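  // A rough sketch of the overall result (hypothetical IR, not taken from a
  // test): for
  //   scf.for %iv = %lb to %ub step %c4 { ... }
  // the peeled code looks approximately like
  //   %split = affine.apply
  //     affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>()[%lb, %ub, %c4]
  //   scf.for %iv = %lb to %split step %c4 { ... } // main loop, divides evenly
  //   scf.for %iv = %split to %ub step %c4 { ... } // partial iteration, cloned
  //                                                // below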
  b.setInsertionPointAfter(forOp);
  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
  partialIteration.getLowerBoundMutable().assign(splitBound);
  b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
  partialIteration.getInitArgsMutable().assign(forOp->getResults());

  // Set new upper loop bound.
  b.updateRootInPlace(
      forOp, [&]() { forOp.getUpperBoundMutable().assign(splitBound); });

  return success();
}

static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
                                        ForOp partialIteration,
                                        Value previousUb) {
  Value mainIv = forOp.getInductionVar();
  Value partialIv = partialIteration.getInductionVar();
  assert(forOp.getStep() == partialIteration.getStep() &&
         "expected same step in main and partial loop");
  Value step = forOp.getStep();

  forOp.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
                                     step,
                                     /*insideLoop=*/true);
    return WalkResult::advance();
  });
  partialIteration.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
                                     step, /*insideLoop=*/false);
    return WalkResult::advance();
  });
}

LogicalResult mlir::scf::peelForLoopAndSimplifyBounds(RewriterBase &rewriter,
                                                      ForOp forOp,
                                                      ForOp &partialIteration) {
  Value previousUb = forOp.getUpperBound();
  Value splitBound;
  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();

  // Rewrite affine.min and affine.max ops.
  rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);

  return success();
}

static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
static constexpr char kPartialIterationLabel[] = "__partial_iteration__";

namespace {
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  ForLoopPeelingPattern(MLIRContext *ctx, bool skipPartial)
      : OpRewritePattern<ForOp>(ctx), skipPartial(skipPartial) {}

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    // Do not peel already peeled loops.
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();
    if (skipPartial) {
      // No peeling of loops inside the partial iteration of another peeled
      // loop.
      Operation *op = forOp.getOperation();
      while ((op = op->getParentOfType<scf::ForOp>())) {
        if (op->hasAttr(kPartialIterationLabel))
          return failure();
      }
    }
    // Apply loop peeling.
    scf::ForOp partialIteration;
    if (failed(peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
      return failure();
    // Apply label, so that the same loop is not rewritten a second time.
    rewriter.updateRootInPlace(partialIteration, [&]() {
      partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
      partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    });
    rewriter.updateRootInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
    return success();
  }

  /// If set to true, loops inside partial iterations of another peeled loop
  /// are not peeled. This reduces the size of the generated code. Partial
  /// iterations are not usually performance critical.
  /// Note: Takes into account the entire chain of parent operations, not just
  /// the direct parent.
  bool skipPartial;
};
} // namespace

namespace {
struct ParallelLoopSpecialization
    : public impl::SCFParallelLoopSpecializationBase<
          ParallelLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};

struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnOperation() override {
    auto *parentOp = getOperation();
    MLIRContext *ctx = parentOp->getContext();
    RewritePatternSet patterns(ctx);
    patterns.add<ForLoopPeelingPattern>(ctx, skipPartial);
    (void)applyPatternsAndFoldGreedily(parentOp, std::move(patterns));

    // Drop the markers.
    parentOp->walk([](Operation *op) {
      op->removeAttr(kPeeledLoopLabel);
      op->removeAttr(kPartialIterationLabel);
    });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}
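// A minimal usage sketch (assumptions: `context` is an MLIRContext and
// `module` a ModuleOp; where the pass should be nested depends on the actual
// pipeline):
//
//   PassManager pm(&context);
//   pm.addPass(createForLoopPeelingPass());
//   if (failed(pm.run(module))) { /* handle the error */ }
//
// In-tree tests typically drive these passes through mlir-opt, e.g. with
// --scf-for-loop-peeling (the exact flag names come from the SCF Passes.td).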