//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/SCF/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseMap.h"

using namespace mlir;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a parallel loop with bounds defined by an affine.min with a
/// constant into two versions of the loop, guarded by an scf.if that checks
/// whether the bounds are equal to that constant. This is beneficial if the
/// loop will almost always have the constant bound, since that version can be
/// fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.getUpperBound().size());
  for (auto bound : op.getUpperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (AffineExpr expr : minOp.map().getResults()) {
      if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
    if (minConstant == std::numeric_limits<int64_t>::max())
      return;
    constantIndices.push_back(minConstant);
  }

  OpBuilder b(op);
  BlockAndValueMapping map;
  Value cond;
  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
    Value constant =
        b.create<arith::ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
    Value cmp = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                        std::get<0>(bound), constant);
    cond = cond ? b.create<arith::AndIOp>(op.getLoc(), cond, cmp) : cmp;
    map.map(std::get<0>(bound), constant);
  }
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}
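
// Illustrative sketch of the rewrite above (made-up IR, for exposition only;
// names and the constant are hypothetical, not taken from a test case):
//
//   %ub = affine.min affine_map<(d0) -> (4, d0)>(%n)
//   scf.parallel (%iv) = (%c0) to (%ub) step (%c1) { ... }
//
// becomes, conceptually:
//
//   %c4 = arith.constant 4 : index
//   %eq = arith.cmpi eq, %ub, %c4 : index
//   scf.if %eq {
//     scf.parallel (%iv) = (%c0) to (%c4) step (%c1) { ... } // constant bound
//   } else {
//     scf.parallel (%iv) = (%c0) to (%ub) step (%c1) { ... } // general case
//   }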

/// Rewrite a for loop with an upper bound defined by an affine.min with a
/// constant into two versions of the loop, guarded by an scf.if that checks
/// whether the bound is equal to that constant. This is beneficial if the
/// loop will almost always have the constant bound, since that version can be
/// fully unrolled and vectorized.
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.getUpperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.map().getResults()) {
    if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  BlockAndValueMapping map;
  Value constant = b.create<arith::ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                       bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}
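
// Illustrative sketch of the scf.for variant (made-up IR, for exposition
// only): a loop such as
//
//   %ub = affine.min affine_map<(d0) -> (8, d0)>(%n)
//   scf.for %iv = %c0 to %ub step %c1 { ... }
//
// is wrapped in an scf.if that compares %ub against the constant 8. The
// "then" branch runs the loop with the constant bound (and can thus be fully
// unrolled/vectorized); the "else" branch keeps the original dynamic bound.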

/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by another scf.for that handles the last (partial) iteration (if any).
///
/// This function rewrites the given scf.for loop in-place and creates a new
/// scf.for operation for the partial iteration. It replaces all uses of the
/// unpeeled loop with the results of the newly generated loop.
///
/// The newly generated loop is returned via `partialIteration`. The boundary
/// at which the loop is split (i.e., the new upper bound of the main loop) is
/// returned via `splitBound`. The return value indicates whether the loop was
/// rewritten or not.
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
                                 ForOp &partialIteration, Value &splitBound) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // No specialization necessary if step already divides upper bound evenly.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // No specialization necessary if step size is 1.
  if (stepInt == static_cast<int64_t>(1))
    return failure();

  auto loc = forOp.getLoc();
  AffineExpr sym0, sym1, sym2;
  bindSymbols(b.getContext(), sym0, sym1, sym2);
  // New upper bound: %ub - (%ub - %lb) mod %step
  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
  b.setInsertionPoint(forOp);
  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
                                             ValueRange{forOp.getLowerBound(),
                                                        forOp.getUpperBound(),
                                                        forOp.getStep()});

  // Create ForOp for partial iteration.
  b.setInsertionPointAfter(forOp);
  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
  partialIteration.getLowerBoundMutable().assign(splitBound);
  forOp.replaceAllUsesWith(partialIteration->getResults());
  partialIteration.getInitArgsMutable().assign(forOp->getResults());

  // Set new upper loop bound.
  b.updateRootInPlace(
      forOp, [&]() { forOp.getUpperBoundMutable().assign(splitBound); });

  return success();
}

template <typename OpTy, bool IsMin>
static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
                                        ForOp partialIteration,
                                        Value previousUb) {
  Value mainIv = forOp.getInductionVar();
  Value partialIv = partialIteration.getInductionVar();
  assert(forOp.getStep() == partialIteration.getStep() &&
         "expected same step in main and partial loop");
  Value step = forOp.getStep();

  forOp.walk([&](OpTy affineOp) {
    AffineMap map = affineOp.getAffineMap();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
                                     affineOp.operands(), IsMin, mainIv,
                                     previousUb, step,
                                     /*insideLoop=*/true);
  });
  partialIteration.walk([&](OpTy affineOp) {
    AffineMap map = affineOp.getAffineMap();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
                                     affineOp.operands(), IsMin, partialIv,
                                     previousUb, step, /*insideLoop=*/false);
  });
}

LogicalResult mlir::scf::peelAndCanonicalizeForLoop(RewriterBase &rewriter,
                                                    ForOp forOp,
                                                    ForOp &partialIteration) {
  Value previousUb = forOp.getUpperBound();
  Value splitBound;
  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();

  // Rewrite affine.min and affine.max ops.
  rewriteAffineOpAfterPeeling<AffineMinOp, /*IsMin=*/true>(
      rewriter, forOp, partialIteration, previousUb);
  rewriteAffineOpAfterPeeling<AffineMaxOp, /*IsMin=*/false>(
      rewriter, forOp, partialIteration, previousUb);

  return success();
}
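
// Illustrative sketch of loop peeling (made-up IR, for exposition only):
//
//   scf.for %iv = %c0 to %ub step %c4 {
//     %m = affine.min affine_map<(d0)[s0] -> (4, s0 - d0)>(%iv)[%ub]
//     ...
//   }
//
// is rewritten, conceptually, into
//
//   %split = affine.apply
//       affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>()[%c0, %ub, %c4]
//   scf.for %iv = %c0 to %split step %c4 {
//     // The affine.min is known to equal the constant 4 here.
//     ...
//   }
//   scf.for %iv = %split to %ub step %c4 {
//     // Partial iteration: the affine.min simplifies to %ub - %iv.
//     ...
//   }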

static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
static constexpr char kPartialIterationLabel[] = "__partial_iteration__";

namespace {
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  ForLoopPeelingPattern(MLIRContext *ctx, bool skipPartial)
      : OpRewritePattern<ForOp>(ctx), skipPartial(skipPartial) {}

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    // Do not peel already peeled loops.
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();
    if (skipPartial) {
      // No peeling of loops inside the partial iteration of another peeled
      // loop.
      Operation *op = forOp.getOperation();
      while ((op = op->getParentOfType<scf::ForOp>())) {
        if (op->hasAttr(kPartialIterationLabel))
          return failure();
      }
    }
    // Apply loop peeling.
    scf::ForOp partialIteration;
    if (failed(peelAndCanonicalizeForLoop(rewriter, forOp, partialIteration)))
      return failure();
    // Apply label, so that the same loop is not rewritten a second time.
    partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    rewriter.updateRootInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
    partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    return success();
  }

  /// If set to true, loops inside partial iterations of another peeled loop
  /// are not peeled. This reduces the size of the generated code. Partial
  /// iterations are not usually performance critical.
  /// Note: Takes into account the entire chain of parent operations, not just
  /// the direct parent.
  bool skipPartial;
};
} // namespace

namespace {
struct ParallelLoopSpecialization
    : public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
  void runOnOperation() override {
    getOperation().walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnOperation() override {
    getOperation().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};

struct ForLoopPeeling : public SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnOperation() override {
    FuncOp funcOp = getOperation();
    MLIRContext *ctx = funcOp.getContext();
    RewritePatternSet patterns(ctx);
    patterns.add<ForLoopPeelingPattern>(ctx, skipPartial);
    (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));

    // Drop the markers.
    funcOp.walk([](Operation *op) {
      op->removeAttr(kPeeledLoopLabel);
      op->removeAttr(kPartialIterationLabel);
    });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}
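
// Usage sketch (illustrative only, not part of this pass): other
// transformations can peel a loop programmatically through
// mlir::scf::peelAndCanonicalizeForLoop (defined above), e.g. from within a
// rewrite pattern:
//
//   scf::ForOp partialIteration;
//   if (succeeded(scf::peelAndCanonicalizeForLoop(rewriter, forOp,
//                                                 partialIteration))) {
//     // `forOp` now covers the evenly dividing part of the iteration space;
//     // `partialIteration` covers the remainder (if any).
//   }
//
// From mlir-opt, the transformations in this file are exposed (assuming the
// usual registrations in the SCF Passes.td) as
// -scf-parallel-loop-specialization, -scf-for-loop-specialization and
// -scf-for-loop-peeling (the latter with a skip-partial option).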