//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/SCF/Transforms/Passes.h"

#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseMap.h"

namespace mlir {
#define GEN_PASS_DEF_SCFFORLOOPPEELING
#define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
#define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using namespace mlir::affine;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.getUpperBound().size());
  for (auto bound : op.getUpperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (AffineExpr expr : minOp.getMap().getResults()) {
      if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
    if (minConstant == std::numeric_limits<int64_t>::max())
      return;
    constantIndices.push_back(minConstant);
  }

  OpBuilder b(op);
  IRMapping map;
  Value cond;
  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
    Value constant =
        b.create<arith::ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
    Value cmp = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                        std::get<0>(bound), constant);
    cond = cond ? b.create<arith::AndIOp>(op.getLoc(), cond, cmp) : cmp;
    map.map(std::get<0>(bound), constant);
  }
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds defined by an affine.min with a constant into
/// 2 loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
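///
/// Illustrative sketch of the rewrite (hypothetical SSA names; assumes the
/// affine.min defining %ub contains the constant 32):
///
///   scf.for %iv = %lb to %ub step %c1 { ... }
///
/// becomes
///
///   %c32 = arith.constant 32 : index
///   %cond = arith.cmpi eq, %ub, %c32 : index
///   scf.if %cond {
///     // Clone with the constant upper bound; easy to unroll/vectorize.
///     scf.for %iv = %lb to %c32 step %c1 { ... }
///   } else {
///     // Unchanged clone of the original loop.
///     scf.for %iv = %lb to %ub step %c1 { ... }
///   }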
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.getUpperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.getMap().getResults()) {
    if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  IRMapping map;
  Value constant = b.create<arith::ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                       bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by another scf.for for the last (partial) iteration (if any).
///
/// This function rewrites the given scf.for loop in-place and creates a new
/// scf.for operation for the last iteration. It replaces all uses of the
/// unpeeled loop with the results of the newly generated scf.for.
///
/// The newly generated scf.for operation is returned via `partialIteration`.
/// The boundary at which the loop is split (new upper bound) is returned via
/// `splitBound`. The return value indicates whether the loop was rewritten or
/// not.
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
                                 ForOp &partialIteration, Value &splitBound) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // No specialization necessary if step size is 1.
  if (stepInt && *stepInt == 1)
    return failure();

  // No specialization necessary if step already divides upper bound evenly.
  // Fast path: lb, ub and step are constants.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // Slow path: Examine the ops that define lb, ub and step.
  AffineExpr sym0, sym1, sym2;
  bindSymbols(b.getContext(), sym0, sym1, sym2);
  SmallVector<Value> operands{forOp.getLowerBound(), forOp.getUpperBound(),
                              forOp.getStep()};
  AffineMap map = AffineMap::get(0, 3, {(sym1 - sym0) % sym2});
  affine::fullyComposeAffineMapAndOperands(&map, &operands);
  if (auto constExpr = dyn_cast<AffineConstantExpr>(map.getResult(0)))
    if (constExpr.getValue() == 0)
      return failure();

  // New upper bound: %ub - (%ub - %lb) mod %step
  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
  b.setInsertionPoint(forOp);
  auto loc = forOp.getLoc();
  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
                                             ValueRange{forOp.getLowerBound(),
                                                        forOp.getUpperBound(),
                                                        forOp.getStep()});

  // Create ForOp for partial iteration.
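  // Resulting structure, sketched with hypothetical SSA names (the main loop
  // is rewritten in place below; the partial iteration is a clone of it):
  //   %split = affine.apply <%ub - (%ub - %lb) mod %step>                 // created above
  //   %r0 = scf.for %iv = %lb to %split step %step iter_args(%a = %init)  // main loop
  //   %r1 = scf.for %iv = %split to %ub step %step iter_args(%a = %r0)    // partial iteration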
  b.setInsertionPointAfter(forOp);
  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
  partialIteration.getLowerBoundMutable().assign(splitBound);
  b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
  partialIteration.getInitArgsMutable().assign(forOp->getResults());

  // Set new upper loop bound.
  b.updateRootInPlace(
      forOp, [&]() { forOp.getUpperBoundMutable().assign(splitBound); });

  return success();
}

static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
                                        ForOp partialIteration,
                                        Value previousUb) {
  Value mainIv = forOp.getInductionVar();
  Value partialIv = partialIteration.getInductionVar();
  assert(forOp.getStep() == partialIteration.getStep() &&
         "expected same step in main and partial loop");
  Value step = forOp.getStep();

  forOp.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
                                     step,
                                     /*insideLoop=*/true);
    return WalkResult::advance();
  });
  partialIteration.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
                                     step, /*insideLoop=*/false);
    return WalkResult::advance();
  });
}

LogicalResult mlir::scf::peelForLoopAndSimplifyBounds(RewriterBase &rewriter,
                                                      ForOp forOp,
                                                      ForOp &partialIteration) {
  Value previousUb = forOp.getUpperBound();
  Value splitBound;
  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();

  // Rewrite affine.min and affine.max ops.
  rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);

  return success();
}

/// When the `peelFront` option is set to true, the first iteration of the loop
/// is peeled off. This function rewrites the original scf::ForOp as two
/// scf::ForOp ops: the first scf::ForOp corresponds to the first iteration of
/// the loop and can be canonicalized away in subsequent optimizations. The
/// second loop op contains the remaining iterations; its new lower bound is
/// the original lower bound plus one step.
LogicalResult mlir::scf::peelForLoopFirstIteration(RewriterBase &b, ForOp forOp,
                                                   ForOp &firstIteration) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // Peeling is not needed if there is at most one iteration.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) / *stepInt <= 1)
    return failure();

  AffineExpr lbSymbol, stepSymbol;
  bindSymbols(b.getContext(), lbSymbol, stepSymbol);

  // New lower bound for main loop: %lb + %step
  auto ubMap = AffineMap::get(0, 2, {lbSymbol + stepSymbol});
  b.setInsertionPoint(forOp);
  auto loc = forOp.getLoc();
  Value splitBound = b.createOrFold<AffineApplyOp>(
      loc, ubMap, ValueRange{forOp.getLowerBound(), forOp.getStep()});

  // Peel the first iteration.
  IRMapping map;
  map.map(forOp.getUpperBound(), splitBound);
  firstIteration = cast<ForOp>(b.clone(*forOp.getOperation(), map));

  // Update main loop with new lower bound.
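  // Resulting structure, sketched with hypothetical SSA names (the clone above
  // runs the peeled first iteration; the original loop is updated below):
  //   %split = affine.apply <%lb + %step>                                 // created above
  //   %r0 = scf.for %iv = %lb to %split step %step iter_args(%a = %init)  // first iteration
  //   %r1 = scf.for %iv = %split to %ub step %step iter_args(%a = %r0)    // remaining iterations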
  b.updateRootInPlace(forOp, [&]() {
    forOp.getInitArgsMutable().assign(firstIteration->getResults());
    forOp.getLowerBoundMutable().assign(splitBound);
  });

  return success();
}

static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
static constexpr char kPartialIterationLabel[] = "__partial_iteration__";

namespace {
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  ForLoopPeelingPattern(MLIRContext *ctx, bool peelFront, bool skipPartial)
      : OpRewritePattern<ForOp>(ctx), peelFront(peelFront),
        skipPartial(skipPartial) {}

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    // Do not peel already peeled loops.
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();

    scf::ForOp partialIteration;
    // The case for peeling the first iteration of the loop.
    if (peelFront) {
      if (failed(
              peelForLoopFirstIteration(rewriter, forOp, partialIteration))) {
        return failure();
      }
    } else {
      if (skipPartial) {
        // No peeling of loops inside the partial iteration of another peeled
        // loop.
        Operation *op = forOp.getOperation();
        while ((op = op->getParentOfType<scf::ForOp>())) {
          if (op->hasAttr(kPartialIterationLabel))
            return failure();
        }
      }
      // Apply loop peeling.
      if (failed(
              peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
        return failure();
    }

    // Apply labels, so that the same loop is not rewritten a second time.
    rewriter.updateRootInPlace(partialIteration, [&]() {
      partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
      partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    });
    rewriter.updateRootInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
    return success();
  }

  /// If set to true, the first iteration of the loop will be peeled. Otherwise,
  /// the unevenly divisible loop will be peeled at the end.
  bool peelFront;

  /// If set to true, loops inside partial iterations of another peeled loop
  /// are not peeled. This reduces the size of the generated code. Partial
  /// iterations are not usually performance critical.
  /// Note: Takes into account the entire chain of parent operations, not just
  /// the direct parent.
  bool skipPartial;
};
} // namespace

namespace {
struct ParallelLoopSpecialization
    : public impl::SCFParallelLoopSpecializationBase<
          ParallelLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};

struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnOperation() override {
    auto *parentOp = getOperation();
    MLIRContext *ctx = parentOp->getContext();
    RewritePatternSet patterns(ctx);
    patterns.add<ForLoopPeelingPattern>(ctx, peelFront, skipPartial);
    (void)applyPatternsAndFoldGreedily(parentOp, std::move(patterns));

    // Drop the markers.
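    // The marker attributes only exist to keep the greedy pattern driver from
    // peeling the same loop (or, with `skipPartial`, loops nested inside a
    // partial iteration) more than once; they have no meaning outside this
    // pass, so strip them from every op.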
    parentOp->walk([](Operation *op) {
      op->removeAttr(kPeeledLoopLabel);
      op->removeAttr(kPartialIterationLabel);
    });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}