//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/SCF/Transforms/Passes.h"

#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseMap.h"

namespace mlir {
#define GEN_PASS_DEF_SCFFORLOOPPEELING
#define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
#define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using namespace mlir::affine;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a parallel loop with bounds defined by an affine.min with a
/// constant into 2 loops after checking if the bounds are equal to that
/// constant. This is beneficial if the loop will almost always have the
/// constant bound and that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.getUpperBound().size());
  for (auto bound : op.getUpperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (AffineExpr expr : minOp.getMap().getResults()) {
      if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
    if (minConstant == std::numeric_limits<int64_t>::max())
      return;
    constantIndices.push_back(minConstant);
  }

  OpBuilder b(op);
  IRMapping map;
  Value cond;
  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
    Value constant =
        b.create<arith::ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
    Value cmp = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                        std::get<0>(bound), constant);
    cond = cond ? b.create<arith::AndIOp>(op.getLoc(), cond, cmp) : cmp;
    map.map(std::get<0>(bound), constant);
  }
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
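///
/// As a rough illustration (the IR below is a hypothetical sketch, not taken
/// from a real test case), a loop such as
///   %ub = affine.min affine_map<(d0) -> (64, d0)>(%n)
///   scf.for %i = %c0 to %ub step %c1 { ... }
/// would be rewritten into approximately
///   %c64 = arith.constant 64 : index
///   %eq  = arith.cmpi eq, %ub, %c64 : index
///   scf.if %eq {
///     scf.for %i = %c0 to %c64 step %c1 { ... } // constant bound, unrollable
///   } else {
///     scf.for %i = %c0 to %ub step %c1 { ... }
///   }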
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.getUpperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.getMap().getResults()) {
    if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  IRMapping map;
  Value constant = b.create<arith::ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                       bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by another scf.for for the last (partial) iteration (if any).
///
/// This function rewrites the given scf.for loop in-place and creates a new
/// scf.for operation for the last iteration. It replaces all uses of the
/// unpeeled loop with the results of the newly generated loop.
///
/// The newly generated scf.for operation is returned via `partialIteration`.
/// The boundary at which the loop is split (new upper bound) is returned via
/// `splitBound`. The return value indicates whether the loop was rewritten or
/// not.
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
                                 ForOp &partialIteration, Value &splitBound) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // No specialization necessary if step already divides upper bound evenly.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // No specialization necessary if step size is 1.
  if (stepInt == static_cast<int64_t>(1))
    return failure();

  auto loc = forOp.getLoc();
  AffineExpr sym0, sym1, sym2;
  bindSymbols(b.getContext(), sym0, sym1, sym2);
  // New upper bound: %ub - (%ub - %lb) mod %step
  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
  b.setInsertionPoint(forOp);
  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
                                             ValueRange{forOp.getLowerBound(),
                                                        forOp.getUpperBound(),
                                                        forOp.getStep()});

  // Create ForOp for partial iteration.
  b.setInsertionPointAfter(forOp);
  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
  partialIteration.getLowerBoundMutable().assign(splitBound);
  b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
  partialIteration.getInitArgsMutable().assign(forOp->getResults());

  // Set new upper loop bound.
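  // As an illustrative sketch (example values only, not from a real test):
  // with %lb = 0, %ub = 10 and %step = 4, the split bound computed above is
  // 10 - (10 - 0) mod 4 = 8, so the main loop covers [0, 8) and the partial
  // iteration covers [8, 10).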
  b.updateRootInPlace(
      forOp, [&]() { forOp.getUpperBoundMutable().assign(splitBound); });

  return success();
}

static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
                                        ForOp partialIteration,
                                        Value previousUb) {
  Value mainIv = forOp.getInductionVar();
  Value partialIv = partialIteration.getInductionVar();
  assert(forOp.getStep() == partialIteration.getStep() &&
         "expected same step in main and partial loop");
  Value step = forOp.getStep();

  forOp.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
                                     step,
                                     /*insideLoop=*/true);
    return WalkResult::advance();
  });
  partialIteration.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
                                     step, /*insideLoop=*/false);
    return WalkResult::advance();
  });
}

LogicalResult mlir::scf::peelForLoopAndSimplifyBounds(RewriterBase &rewriter,
                                                      ForOp forOp,
                                                      ForOp &partialIteration) {
  Value previousUb = forOp.getUpperBound();
  Value splitBound;
  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();

  // Rewrite affine.min and affine.max ops.
  rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);

  return success();
}

static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
static constexpr char kPartialIterationLabel[] = "__partial_iteration__";

namespace {
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  ForLoopPeelingPattern(MLIRContext *ctx, bool skipPartial)
      : OpRewritePattern<ForOp>(ctx), skipPartial(skipPartial) {}

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    // Do not peel already peeled loops.
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();
    if (skipPartial) {
      // No peeling of loops inside the partial iteration of another peeled
      // loop.
      Operation *op = forOp.getOperation();
      while ((op = op->getParentOfType<scf::ForOp>())) {
        if (op->hasAttr(kPartialIterationLabel))
          return failure();
      }
    }
    // Apply loop peeling.
    scf::ForOp partialIteration;
    if (failed(peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
      return failure();
    // Apply label, so that the same loop is not rewritten a second time.
    rewriter.updateRootInPlace(partialIteration, [&]() {
      partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
      partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    });
    rewriter.updateRootInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
    return success();
  }

  /// If set to true, loops inside partial iterations of another peeled loop
  /// are not peeled. This reduces the size of the generated code. Partial
  /// iterations are not usually performance critical.
  /// Note: Takes into account the entire chain of parent operations, not just
  /// the direct parent.
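  /// E.g. (illustrative): if an outer scf.for has already been peeled and an
  /// inner scf.for sits in the outer loop's partial iteration, setting this
  /// flag to true leaves that inner loop unpeeled.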
  bool skipPartial;
};
} // namespace

namespace {
struct ParallelLoopSpecialization
    : public impl::SCFParallelLoopSpecializationBase<
          ParallelLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};

struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnOperation() override {
    auto *parentOp = getOperation();
    MLIRContext *ctx = parentOp->getContext();
    RewritePatternSet patterns(ctx);
    patterns.add<ForLoopPeelingPattern>(ctx, skipPartial);
    (void)applyPatternsAndFoldGreedily(parentOp, std::move(patterns));

    // Drop the markers.
    parentOp->walk([](Operation *op) {
      op->removeAttr(kPeeledLoopLabel);
      op->removeAttr(kPartialIterationLabel);
    });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}