//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/SCF/Transforms/Passes.h"

#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseMap.h"

namespace mlir {
#define GEN_PASS_DEF_SCFFORLOOPPEELING
#define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
#define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using namespace mlir::affine;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a parallel loop with upper bounds defined by affine.min ops with a
/// constant into two loops after checking whether the bounds are equal to that
/// constant. This is beneficial if the loop will almost always have the
/// constant bounds and that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.getUpperBound().size());
  for (auto bound : op.getUpperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (AffineExpr expr : minOp.getMap().getResults()) {
      if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
    if (minConstant == std::numeric_limits<int64_t>::max())
      return;
    constantIndices.push_back(minConstant);
  }

  OpBuilder b(op);
  IRMapping map;
  Value cond;
  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
    Value constant =
        b.create<arith::ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
    Value cmp = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                        std::get<0>(bound), constant);
    cond = cond ? b.create<arith::AndIOp>(op.getLoc(), cond, cmp) : cmp;
    map.map(std::get<0>(bound), constant);
  }
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with an upper bound defined by an affine.min with a
/// constant into two loops after checking whether the bound is equal to that
/// constant. This is beneficial if the loop will almost always have the
/// constant bound and that version can be fully unrolled and vectorized.
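///
/// Illustrative sketch (hypothetical IR; the constant `4`, the value names,
/// and the shape of the affine.min map are only examples of what tiling might
/// produce):
///
///   %ub = affine.min affine_map<(d0) -> (4, d0)>(%d)
///   scf.for %i = %lb to %ub step %c1 { ... }
///
/// is rewritten to, roughly:
///
///   %c4 = arith.constant 4 : index
///   %eq = arith.cmpi eq, %ub, %c4 : index
///   scf.if %eq {
///     scf.for %i = %lb to %c4 step %c1 { ... } // constant trip count
///   } else {
///     scf.for %i = %lb to %ub step %c1 { ... } // general fallback
///   }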
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.getUpperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.getMap().getResults()) {
    if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  IRMapping map;
  Value constant = b.create<arith::ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                       bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by another scf.for for the last (partial) iteration (if any).
///
/// This function rewrites the given scf.for loop in-place and creates a new
/// scf.for operation for the last iteration. It replaces all uses of the
/// unpeeled loop with the results of the newly generated scf.for.
///
/// The newly generated scf.for operation is returned via `partialIteration`.
/// The boundary at which the loop is split (new upper bound) is returned via
/// `splitBound`. The return value indicates whether the loop was rewritten or
/// not.
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
                                 ForOp &partialIteration, Value &splitBound) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // No specialization necessary if step size is 1. Also bail out in case of an
  // invalid zero or negative step which might have happened during folding.
  if (stepInt && *stepInt <= 1)
    return failure();

  // No specialization necessary if the step already divides the iteration
  // space (ub - lb) evenly.
  // Fast path: lb, ub and step are constants.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // Slow path: Examine the ops that define lb, ub and step.
  AffineExpr sym0, sym1, sym2;
  bindSymbols(b.getContext(), sym0, sym1, sym2);
  SmallVector<Value> operands{forOp.getLowerBound(), forOp.getUpperBound(),
                              forOp.getStep()};
  AffineMap map = AffineMap::get(0, 3, {(sym1 - sym0) % sym2});
  affine::fullyComposeAffineMapAndOperands(&map, &operands);
  if (auto constExpr = dyn_cast<AffineConstantExpr>(map.getResult(0)))
    if (constExpr.getValue() == 0)
      return failure();

  // New upper bound: %ub - (%ub - %lb) mod %step
  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
  b.setInsertionPoint(forOp);
  auto loc = forOp.getLoc();
  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
                                             ValueRange{forOp.getLowerBound(),
                                                        forOp.getUpperBound(),
                                                        forOp.getStep()});

  // Create ForOp for partial iteration.
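  // The clone created below becomes the partial iteration: it runs from
  // `splitBound` to the original upper bound and takes the main loop's results
  // as its init args, so iter_args are threaded from the main loop into the
  // partial iteration. The original loop is reused as the main loop and gets
  // `splitBound` as its new upper bound further down.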
  b.setInsertionPointAfter(forOp);
  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
  partialIteration.getLowerBoundMutable().assign(splitBound);
  b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
  partialIteration.getInitArgsMutable().assign(forOp->getResults());

  // Set new upper loop bound.
  b.modifyOpInPlace(forOp,
                    [&]() { forOp.getUpperBoundMutable().assign(splitBound); });

  return success();
}

static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
                                        ForOp partialIteration,
                                        Value previousUb) {
  Value mainIv = forOp.getInductionVar();
  Value partialIv = partialIteration.getInductionVar();
  assert(forOp.getStep() == partialIteration.getStep() &&
         "expected same step in main and partial loop");
  Value step = forOp.getStep();

  forOp.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
                                     step,
                                     /*insideLoop=*/true);
    return WalkResult::advance();
  });
  partialIteration.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
                                     step, /*insideLoop=*/false);
    return WalkResult::advance();
  });
}

LogicalResult mlir::scf::peelForLoopAndSimplifyBounds(RewriterBase &rewriter,
                                                      ForOp forOp,
                                                      ForOp &partialIteration) {
  Value previousUb = forOp.getUpperBound();
  Value splitBound;
  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();

  // Rewrite affine.min and affine.max ops.
  rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);

  return success();
}

/// Rewrites the original scf::ForOp as two scf::ForOp ops: the first
/// scf::ForOp corresponds to the first iteration of the loop and can be
/// canonicalized away in subsequent optimizations. The second loop op contains
/// the remaining iterations, with its lower bound updated to the original
/// lower bound plus the step (i.e., it skips the first iteration).
LogicalResult mlir::scf::peelForLoopFirstIteration(RewriterBase &b, ForOp forOp,
                                                   ForOp &firstIteration) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // Peeling is not needed if there is at most one iteration.
  if (lbInt && ubInt && stepInt && ceil(float(*ubInt - *lbInt) / *stepInt) <= 1)
    return failure();

  AffineExpr lbSymbol, stepSymbol;
  bindSymbols(b.getContext(), lbSymbol, stepSymbol);

  // New lower bound for main loop: %lb + %step
  auto ubMap = AffineMap::get(0, 2, {lbSymbol + stepSymbol});
  b.setInsertionPoint(forOp);
  auto loc = forOp.getLoc();
  Value splitBound = b.createOrFold<AffineApplyOp>(
      loc, ubMap, ValueRange{forOp.getLowerBound(), forOp.getStep()});

  // Peel the first iteration.
  IRMapping map;
  map.map(forOp.getUpperBound(), splitBound);
  firstIteration = cast<ForOp>(b.clone(*forOp.getOperation(), map));

  // Update main loop with new lower bound.
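  // The main loop now starts at %lb + %step and consumes the peeled first
  // iteration's results as its init args, so iter_args are threaded from the
  // peeled iteration into the remaining loop.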
  b.modifyOpInPlace(forOp, [&]() {
    forOp.getInitArgsMutable().assign(firstIteration->getResults());
    forOp.getLowerBoundMutable().assign(splitBound);
  });

  return success();
}

static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
static constexpr char kPartialIterationLabel[] = "__partial_iteration__";

namespace {
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  ForLoopPeelingPattern(MLIRContext *ctx, bool peelFront, bool skipPartial)
      : OpRewritePattern<ForOp>(ctx), peelFront(peelFront),
        skipPartial(skipPartial) {}

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    // Do not peel already peeled loops.
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();

    scf::ForOp partialIteration;
    // The case for peeling the first iteration of the loop.
    if (peelFront) {
      if (failed(
              peelForLoopFirstIteration(rewriter, forOp, partialIteration))) {
        return failure();
      }
    } else {
      if (skipPartial) {
        // No peeling of loops inside the partial iteration of another peeled
        // loop.
        Operation *op = forOp.getOperation();
        while ((op = op->getParentOfType<scf::ForOp>())) {
          if (op->hasAttr(kPartialIterationLabel))
            return failure();
        }
      }
      // Apply loop peeling.
      if (failed(
              peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
        return failure();
    }

    // Apply label, so that the same loop is not rewritten a second time.
    rewriter.modifyOpInPlace(partialIteration, [&]() {
      partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
      partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    });
    rewriter.modifyOpInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
    return success();
  }

  /// If set to true, the first iteration of the loop will be peeled. Otherwise,
  /// the unevenly divisible loop will be peeled at the end.
  bool peelFront;

  /// If set to true, loops inside partial iterations of another peeled loop
  /// are not peeled. This reduces the size of the generated code. Partial
  /// iterations are not usually performance critical.
  /// Note: Takes into account the entire chain of parent operations, not just
  /// the direct parent.
  bool skipPartial;
};
} // namespace

namespace {
struct ParallelLoopSpecialization
    : public impl::SCFParallelLoopSpecializationBase<
          ParallelLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};

struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnOperation() override {
    auto *parentOp = getOperation();
    MLIRContext *ctx = parentOp->getContext();
    RewritePatternSet patterns(ctx);
    patterns.add<ForLoopPeelingPattern>(ctx, peelFront, skipPartial);
    (void)applyPatternsGreedily(parentOp, std::move(patterns));

    // Drop the markers.
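    // The labels were only needed to keep the greedy driver from peeling the
    // same loop (or a loop inside a partial iteration) again; remove them so
    // they do not leak into the resulting IR.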
    parentOp->walk([](Operation *op) {
      op->removeAttr(kPeeledLoopLabel);
      op->removeAttr(kPartialIterationLabel);
    });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}