//===- LoopPeel.cpp -------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Loop Peeling Utilities.
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "loop-peel"

STATISTIC(NumPeeled, "Number of loops peeled");

static cl::opt<unsigned> UnrollPeelCount(
    "unroll-peel-count", cl::Hidden,
    cl::desc("Set the unroll peeling count, for testing purposes"));

static cl::opt<bool>
    UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
                       cl::desc("Allows loops to be peeled when the dynamic "
                                "trip count is known to be low."));

static cl::opt<bool>
    UnrollAllowLoopNestsPeeling("unroll-allow-loop-nests-peeling",
                                cl::init(false), cl::Hidden,
                                cl::desc("Allows loop nests to be peeled."));

static cl::opt<unsigned> UnrollPeelMaxCount(
    "unroll-peel-max-count", cl::init(7), cl::Hidden,
    cl::desc("Max average trip count which will cause loop peeling."));

static cl::opt<unsigned> UnrollForcePeelCount(
    "unroll-force-peel-count", cl::init(0), cl::Hidden,
    cl::desc("Force a peel count regardless of profiling information."));

static cl::opt<bool> DisableAdvancedPeeling(
    "disable-advanced-peeling", cl::init(false), cl::Hidden,
    cl::desc(
        "Disable advance peeling. Issues for convergent targets (D134803)."));

// Metadata key used to record how many iterations have already been peeled
// off a loop, so repeated passes do not peel past UnrollPeelMaxCount.
static const char *PeeledCountMetaData = "llvm.loop.peeled.count";

// Check whether we are capable of peeling this loop.
bool llvm::canPeel(const Loop *L) {
  // Make sure the loop is in simplified form
  if (!L->isLoopSimplifyForm())
    return false;
  // When advanced peeling is enabled (the default), a simplified loop is all
  // we require; the stricter exit-shape check below only applies when it has
  // been disabled via -disable-advanced-peeling.
  if (!DisableAdvancedPeeling)
    return true;

  SmallVector<BasicBlock *, 4> Exits;
  L->getUniqueNonLatchExitBlocks(Exits);
  // The latch must either be the only exiting block or all non-latch exit
  // blocks have either a deopt or unreachable terminator or compose a chain of
  // blocks where the last one is either deopt or unreachable terminated. Both
  // deopt and unreachable terminators are a strong indication they are not
  // taken. Note that this is a profitability check, not a legality check. Also
  // note that LoopPeeling currently can only update the branch weights of latch
  // blocks and branch weights to blocks with deopt or unreachable do not need
  // updating.
  return llvm::all_of(Exits, IsBlockFollowedByDeoptOrUnreachable);
}

namespace {

// As a loop is peeled, it may be the case that Phi nodes become
// loop-invariant (ie, known because there is only one choice).
// For example, consider the following function:
//   void g(int);
//   void binary() {
//     int x = 0;
//     int y = 0;
//     int a = 0;
//     for(int i = 0; i <100000; ++i) {
//       g(x);
//       x = y;
//       g(a);
//       y = a + 1;
//       a = 5;
//     }
//   }
// Peeling 3 iterations is beneficial because the values for x, y and a
// become known.  The IR for this loop looks something like the following:
//
//   %i = phi i32 [ 0, %entry ], [ %inc, %if.end ]
//   %a = phi i32 [ 0, %entry ], [ 5, %if.end ]
//   %y = phi i32 [ 0, %entry ], [ %add, %if.end ]
//   %x = phi i32 [ 0, %entry ], [ %y, %if.end ]
//   ...
//   tail call void @_Z1gi(i32 signext %x)
//   tail call void @_Z1gi(i32 signext %a)
//   %add = add nuw nsw i32 %a, 1
//   %inc = add nuw nsw i32 %i, 1
//   %exitcond = icmp eq i32 %inc, 100000
//   br i1 %exitcond, label %for.cond.cleanup, label %for.body
//
// The arguments for the calls to g will become known after 3 iterations
// of the loop, because the phi nodes values become known after 3 iterations
// of the loop (ie, they are known on the 4th iteration, so peel 3 iterations).
// The first iteration has g(0), g(0); the second has g(0), g(5); the
// third has g(1), g(5) and the fourth (and all subsequent) have g(6), g(5).
// Now consider the phi nodes:
//   %a is a phi with constants so it is determined after iteration 1.
//   %y is a phi based on a constant and %a so it is determined on
//     the iteration after %a is determined, so iteration 2.
//   %x is a phi based on a constant and %y so it is determined on
//     the iteration after %y, so iteration 3.
//   %i is based on itself (and is an induction variable) so it is
//     never determined.
// This means that peeling off 3 iterations will result in being able to
// remove the phi nodes for %a, %y, and %x.  The arguments for the
// corresponding calls to g are determined and the code for computing
// x, y, and a can be removed.
//
// The PhiAnalyzer class calculates how many times a loop should be
// peeled based on the above analysis of the phi nodes in the loop while
// respecting the maximum specified.
class PhiAnalyzer {
public:
  PhiAnalyzer(const Loop &L, unsigned MaxIterations);

  // Calculate the sufficient minimum number of iterations of the loop to peel
  // such that phi instructions become determined (subject to allowable limits)
  Optional<unsigned> calculateIterationsToPeel();

protected:
  using PeelCounter = std::optional<unsigned>;
  const PeelCounter Unknown = std::nullopt;

  // Add 1 respecting Unknown and return Unknown if result over MaxIterations
  PeelCounter addOne(PeelCounter PC) const {
    if (PC == Unknown)
      return Unknown;
    return (*PC + 1 <= MaxIterations) ? PeelCounter{*PC + 1} : Unknown;
  }

  // Calculate the number of iterations after which the given value
  // becomes an invariant.
  PeelCounter calculate(const Value &);

  const Loop &L;
  const unsigned MaxIterations;

  // Map of Values to number of iterations to invariance; doubles as the
  // memoization cache and the cycle-breaker for the recursion in calculate().
  SmallDenseMap<const Value *, PeelCounter> IterationsToInvariance;
};

PhiAnalyzer::PhiAnalyzer(const Loop &L, unsigned MaxIterations)
    : L(L), MaxIterations(MaxIterations) {
  assert(canPeel(&L) && "loop is not suitable for peeling");
  assert(MaxIterations > 0 && "no peeling is allowed?");
}

// This function calculates the number of iterations after which the value
// becomes an invariant. The pre-calculated values are memorized in a map.
// N.B. This number will be Unknown or <= MaxIterations.
// The function is calculated according to the following definition:
// Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge].
//   F(%x) = G(%y) + 1 (N.B. [MaxIterations | Unknown] + 1 => Unknown)
//   G(%y) = 0 if %y is a loop invariant
//   G(%y) = G(%BackEdgeValue) if %y is a phi in the header block
//   G(%y) = TODO: if %y is an expression based on phis and loop invariants
//     The example looks like:
//     %x = phi(0, %a)  <-- becomes invariant starting from 3rd iteration.
//     %y = phi(0, 5)
//     %a = %y + 1
//   G(%y) = Unknown otherwise (including phi not in header block)
PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) {
  // If we already know the answer, take it from the map.
  auto I = IterationsToInvariance.find(&V);
  if (I != IterationsToInvariance.end())
    return I->second;

  // Place Unknown to map to avoid infinite recursion. Such
  // cycles can never stop on an invariant.
  IterationsToInvariance[&V] = Unknown;

  if (L.isLoopInvariant(&V))
    // Loop invariant so known at start.
    return (IterationsToInvariance[&V] = 0);
  if (const PHINode *Phi = dyn_cast<PHINode>(&V)) {
    if (Phi->getParent() != L.getHeader()) {
      // Phi is not in header block so Unknown.
      assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");
      return Unknown;
    }
    // We need to analyze the input from the back edge and add 1.
    Value *Input = Phi->getIncomingValueForBlock(L.getLoopLatch());
    PeelCounter Iterations = calculate(*Input);
    assert(IterationsToInvariance[Input] == Iterations &&
           "unexpected value saved");
    return (IterationsToInvariance[Phi] = addOne(Iterations));
  }
  // TODO: handle expressions

  // Everything else is Unknown.
  assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");
  return Unknown;
}

// Walk all header phis and return the largest finite iterations-to-invariance
// among them (capped at MaxIterations), or None if no phi ever settles.
Optional<unsigned> PhiAnalyzer::calculateIterationsToPeel() {
  unsigned Iterations = 0;
  for (auto &PHI : L.getHeader()->phis()) {
    PeelCounter ToInvariance = calculate(PHI);
    if (ToInvariance != Unknown) {
      assert(*ToInvariance <= MaxIterations && "bad result in phi analysis");
      Iterations = std::max(Iterations, *ToInvariance);
      if (Iterations == MaxIterations)
        break;
    }
  }
  assert((Iterations <= MaxIterations) && "bad result in phi analysis");
  return Iterations ? Optional<unsigned>(Iterations) : std::nullopt;
}

} // unnamed namespace

// Try to find any invariant memory reads that will become dereferenceable in
// the remainder loop after peeling. The load must also be used (transitively)
// by an exit condition. Returns the number of iterations to peel off (at the
// moment either 0 or 1).
static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L,
                                                      DominatorTree &DT,
                                                      AssumptionCache *AC) {
  // Skip loops with a single exiting block, because there should be no benefit
  // for the heuristic below.
  if (L.getExitingBlock())
    return 0;

  // All non-latch exit blocks must have an UnreachableInst terminator.
  // Otherwise the heuristic below may not be profitable.
  SmallVector<BasicBlock *, 4> Exits;
  L.getUniqueNonLatchExitBlocks(Exits);
  if (any_of(Exits, [](const BasicBlock *BB) {
        return !isa<UnreachableInst>(BB->getTerminator());
      }))
    return 0;

  // Now look for invariant loads that dominate the latch and are not known to
  // be dereferenceable. If there are such loads and no writes, they will become
  // dereferenceable in the loop if the first iteration is peeled off. Also
  // collect the set of instructions controlled by such loads. Only peel if an
  // exit condition uses (transitively) such a load.
  BasicBlock *Header = L.getHeader();
  BasicBlock *Latch = L.getLoopLatch();
  SmallPtrSet<Value *, 8> LoadUsers;
  const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
  for (BasicBlock *BB : L.blocks()) {
    for (Instruction &I : *BB) {
      // Any write in the loop defeats the heuristic: the loads would not be
      // provably invariant across the remainder loop.
      if (I.mayWriteToMemory())
        return 0;

      // Propagate "controlled by an interesting load" transitively: if I is
      // already a (transitive) user of such a load, so are I's users.
      auto Iter = LoadUsers.find(&I);
      if (Iter != LoadUsers.end()) {
        for (Value *U : I.users())
          LoadUsers.insert(U);
      }
      // Do not look for reads in the header; they can already be hoisted
      // without peeling.
      if (BB == Header)
        continue;
      if (auto *LI = dyn_cast<LoadInst>(&I)) {
        Value *Ptr = LI->getPointerOperand();
        // The load must execute on every remaining iteration (dominates the
        // latch), read a loop-invariant address, and not already be known
        // dereferenceable -- otherwise peeling buys nothing.
        if (DT.dominates(BB, Latch) && L.isLoopInvariant(Ptr) &&
            !isDereferenceablePointer(Ptr, LI->getType(), DL, LI, AC, &DT))
          for (Value *U : I.users())
            LoadUsers.insert(U);
      }
    }
  }
  SmallVector<BasicBlock *> ExitingBlocks;
  L.getExitingBlocks(ExitingBlocks);
  if (any_of(ExitingBlocks, [&LoadUsers](BasicBlock *Exiting) {
        return LoadUsers.contains(Exiting->getTerminator());
      }))
    return 1;
  return 0;
}

// Return the number of iterations to peel off that make conditions in the
// body true/false. For example, if we peel 2 iterations off the loop below,
// the condition i < 2 can be evaluated at compile time.
// for (i = 0; i < n; i++)
//   if (i < 2)
//     ..
//   else
//     ..
// }
static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
                                         ScalarEvolution &SE) {
  assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");
  unsigned DesiredPeelCount = 0;

  for (auto *BB : L.blocks()) {
    auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
    if (!BI || BI->isUnconditional())
      continue;

    // Ignore loop exit condition.
    if (L.getLoopLatch() == BB)
      continue;

    Value *Condition = BI->getCondition();
    Value *LeftVal, *RightVal;
    CmpInst::Predicate Pred;
    if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal))))
      continue;

    const SCEV *LeftSCEV = SE.getSCEV(LeftVal);
    const SCEV *RightSCEV = SE.getSCEV(RightVal);

    // Do not consider predicates that are known to be true or false
    // independently of the loop iteration.
    if (SE.evaluatePredicate(Pred, LeftSCEV, RightSCEV))
      continue;

    // Check if we have a condition with one AddRec and one non AddRec
    // expression. Normalize LeftSCEV to be the AddRec.
    if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
      if (isa<SCEVAddRecExpr>(RightSCEV)) {
        std::swap(LeftSCEV, RightSCEV);
        Pred = ICmpInst::getSwappedPredicate(Pred);
      } else
        continue;
    }

    const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV);

    // Avoid huge SCEV computations in the loop below, make sure we only
    // consider AddRecs of the loop we are trying to peel.
    if (!LeftAR->isAffine() || LeftAR->getLoop() != &L)
      continue;
    if (!(ICmpInst::isEquality(Pred) && LeftAR->hasNoSelfWrap()) &&
        !SE.getMonotonicPredicateType(LeftAR, Pred))
      continue;

    // Check if extending the current DesiredPeelCount lets us evaluate Pred
    // or !Pred in the loop body statically. If so, we can pick new
    // DesiredPeelCount.
    unsigned NewPeelCount = DesiredPeelCount;

    const SCEV *IterVal = LeftAR->evaluateAtIteration(
        SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);

    // If the original condition is not known, get the negated predicate
    // (which holds on the else branch) and check if it is known. This allows
    // us to peel of iterations that make the original condition false.
    if (!SE.isKnownPredicate(Pred, IterVal, RightSCEV))
      Pred = ICmpInst::getInversePredicate(Pred);

    const SCEV *Step = LeftAR->getStepRecurrence(SE);
    const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step);
    auto PeelOneMoreIteration = [&IterVal, &NextIterVal, &SE, Step,
                                 &NewPeelCount]() {
      IterVal = NextIterVal;
      NextIterVal = SE.getAddExpr(IterVal, Step);
      NewPeelCount++;
    };

    auto CanPeelOneMoreIteration = [&NewPeelCount, &MaxPeelCount]() {
      return NewPeelCount < MaxPeelCount;
    };

    // Keep peeling while the predicate stays statically known on the next
    // peeled-off iteration and the budget allows it.
    while (CanPeelOneMoreIteration() &&
           SE.isKnownPredicate(Pred, IterVal, RightSCEV))
      PeelOneMoreIteration();

    // With *that* peel count, does the predicate !Pred become known in the
    // first iteration of the loop body after peeling?
    if (!SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
                             RightSCEV))
      continue; // If not, give up.

    // However, for equality comparisons, that isn't always sufficient to
    // eliminate the comparison in loop body, we may need to peel one more
    // iteration. See if that makes !Pred become unknown again.
    if (ICmpInst::isEquality(Pred) &&
        !SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal,
                             RightSCEV) &&
        !SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
        SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) {
      if (!CanPeelOneMoreIteration())
        continue; // Need to peel one more iteration, but can't. Give up.
      PeelOneMoreIteration(); // Great!
    }

    DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
  }

  return DesiredPeelCount;
}

/// This "heuristic" exactly matches implicit behavior which used to exist
/// inside getLoopEstimatedTripCount. It was added here to keep an
/// improvement inside that API from causing peeling to become more aggressive.
/// This should probably be removed.
static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
  BasicBlock *Latch = L->getLoopLatch();
  if (!Latch)
    return true;

  BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
    return true;

  assert((LatchBR->getSuccessor(0) == L->getHeader() ||
          LatchBR->getSuccessor(1) == L->getHeader()) &&
         "At least one edge out of the latch must go to the header");

  SmallVector<BasicBlock *, 4> ExitBlocks;
  L->getUniqueNonLatchExitBlocks(ExitBlocks);
  return any_of(ExitBlocks, [](const BasicBlock *EB) {
    return !EB->getTerminatingDeoptimizeCall();
  });
}

// Compute the number of iterations we want to peel off and record the result
// in PP.PeelCount (left at 0 when peeling is not worthwhile or not allowed).
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                            TargetTransformInfo::PeelingPreferences &PP,
                            unsigned TripCount, DominatorTree &DT,
                            ScalarEvolution &SE, AssumptionCache *AC,
                            unsigned Threshold) {
  assert(LoopSize > 0 && "Zero loop size is not allowed!");
  // Save the PP.PeelCount value set by the target in
  // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
  unsigned TargetPeelCount = PP.PeelCount;
  PP.PeelCount = 0;
  if (!canPeel(L))
    return;

  // Only try to peel innermost loops by default.
  // The constraint can be relaxed by the target in TTI.getPeelingPreferences
  // or by the flag -unroll-allow-loop-nests-peeling.
  if (!PP.AllowLoopNestsPeeling && !L->isInnermost())
    return;

  // If the user provided a peel count, use that.
  bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
  if (UserPeelCount) {
    LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
                      << " iterations.\n");
    PP.PeelCount = UnrollForcePeelCount;
    PP.PeelProfiledIterations = true;
    return;
  }

  // Skip peeling if it's disabled.
  if (!PP.AllowPeeling)
    return;

  // Check that we can peel at least one iteration.
  if (2 * LoopSize > Threshold)
    return;

  unsigned AlreadyPeeled = 0;
  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
    AlreadyPeeled = *Peeled;
  // Stop if we already peeled off the maximum number of iterations.
  if (AlreadyPeeled >= UnrollPeelMaxCount)
    return;

  // Pay respect to limitations implied by loop size and the max peel count.
  unsigned MaxPeelCount = UnrollPeelMaxCount;
  MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1);

  // Start the max computation with the PP.PeelCount value set by the target
  // in TTI.getPeelingPreferences or by the flag -unroll-peel-count.
  unsigned DesiredPeelCount = TargetPeelCount;

  // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
  // iterations of the loop. For this we compute the number for iterations after
  // which every Phi is guaranteed to become an invariant, and try to peel the
  // maximum number of iterations among these values, thus turning all those
  // Phis into invariants.
  if (MaxPeelCount > DesiredPeelCount) {
    // Check how many iterations are useful for resolving Phis
    auto NumPeels = PhiAnalyzer(*L, MaxPeelCount).calculateIterationsToPeel();
    if (NumPeels)
      DesiredPeelCount = std::max(DesiredPeelCount, *NumPeels);
  }

  DesiredPeelCount = std::max(DesiredPeelCount,
                              countToEliminateCompares(*L, MaxPeelCount, SE));

  if (DesiredPeelCount == 0)
    DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT, AC);

  if (DesiredPeelCount > 0) {
    DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
    // Consider max peel count limitation.
    assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
    if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {
      LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
                        << " iteration(s) to turn"
                        << " some Phis into invariants.\n");
      PP.PeelCount = DesiredPeelCount;
      PP.PeelProfiledIterations = false;
      return;
    }
  }

  // Bail if we know the statically calculated trip count.
  // In this case we rather prefer partial unrolling.
  if (TripCount)
    return;

  // Do not apply profile base peeling if it is disabled.
  if (!PP.PeelProfiledIterations)
    return;
  // If we don't know the trip count, but have reason to believe the average
  // trip count is low, peeling should be beneficial, since we will usually
  // hit the peeled section.
  // We only do this in the presence of profile information, since otherwise
  // our estimates of the trip count are not reliable enough.
  if (L->getHeader()->getParent()->hasProfileData()) {
    if (violatesLegacyMultiExitLoopCheck(L))
      return;
    Optional<unsigned> EstimatedTripCount = getLoopEstimatedTripCount(L);
    if (!EstimatedTripCount)
      return;

    LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is "
                      << *EstimatedTripCount << "\n");

    if (*EstimatedTripCount) {
      if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
        unsigned PeelCount = *EstimatedTripCount;
        LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
        PP.PeelCount = PeelCount;
        return;
      }
      LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
      LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
      LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
      LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
      LLVM_DEBUG(dbgs() << "Max peel count by cost: "
                        << (Threshold / LoopSize - 1) << "\n");
    }
  }
}

struct WeightInfo {
  // Weights for current iteration.
  SmallVector<uint32_t> Weights;
  // Weights to subtract after each iteration.
  const SmallVector<uint32_t> SubWeights;
};

/// Update the branch weights of an exiting block of a peeled-off loop
/// iteration.
/// Let F is a weight of the edge to continue (fallthrough) into the loop.
/// Let E is a weight of the edge to an exit.
/// F/(F+E) is a probability to go to loop and E/(F+E) is a probability to
/// go to exit.
/// Then, Estimated ExitCount = F / E.
/// For I-th (counting from 0) peeled off iteration we set the weights for
/// the peeled exit as (EC - I, 1). It gives us reasonable distribution,
/// The probability to go to exit 1/(EC-I) increases. At the same time
/// the estimated exit count in the remainder loop reduces by I.
600 /// To avoid dealing with division rounding we can just multiple both part 601 /// of weights to E and use weight as (F - I * E, E). 602 static void updateBranchWeights(Instruction *Term, WeightInfo &Info) { 603 MDBuilder MDB(Term->getContext()); 604 Term->setMetadata(LLVMContext::MD_prof, 605 MDB.createBranchWeights(Info.Weights)); 606 for (auto [Idx, SubWeight] : enumerate(Info.SubWeights)) 607 if (SubWeight != 0) 608 Info.Weights[Idx] = Info.Weights[Idx] > SubWeight 609 ? Info.Weights[Idx] - SubWeight 610 : 1; 611 } 612 613 /// Initialize the weights for all exiting blocks. 614 static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos, 615 Loop *L) { 616 SmallVector<BasicBlock *> ExitingBlocks; 617 L->getExitingBlocks(ExitingBlocks); 618 for (BasicBlock *ExitingBlock : ExitingBlocks) { 619 Instruction *Term = ExitingBlock->getTerminator(); 620 SmallVector<uint32_t> Weights; 621 if (!extractBranchWeights(*Term, Weights)) 622 continue; 623 624 // See the comment on updateBranchWeights() for an explanation of what we 625 // do here. 626 uint32_t FallThroughWeights = 0; 627 uint32_t ExitWeights = 0; 628 for (auto [Succ, Weight] : zip(successors(Term), Weights)) { 629 if (L->contains(Succ)) 630 FallThroughWeights += Weight; 631 else 632 ExitWeights += Weight; 633 } 634 635 // Don't try to update weights for degenerate case. 636 if (FallThroughWeights == 0) 637 continue; 638 639 SmallVector<uint32_t> SubWeights; 640 for (auto [Succ, Weight] : zip(successors(Term), Weights)) { 641 if (!L->contains(Succ)) { 642 // Exit weights stay the same. 643 SubWeights.push_back(0); 644 continue; 645 } 646 647 // Subtract exit weights on each iteration, distributed across all 648 // fallthrough edges. 
649 double W = (double)Weight / (double)FallThroughWeights; 650 SubWeights.push_back((uint32_t)(ExitWeights * W)); 651 } 652 653 WeightInfos.insert({Term, {std::move(Weights), std::move(SubWeights)}}); 654 } 655 } 656 657 /// Update the weights of original exiting block after peeling off all 658 /// iterations. 659 static void fixupBranchWeights(Instruction *Term, const WeightInfo &Info) { 660 MDBuilder MDB(Term->getContext()); 661 Term->setMetadata(LLVMContext::MD_prof, 662 MDB.createBranchWeights(Info.Weights)); 663 } 664 665 /// Clones the body of the loop L, putting it between \p InsertTop and \p 666 /// InsertBot. 667 /// \param IterNumber The serial number of the iteration currently being 668 /// peeled off. 669 /// \param ExitEdges The exit edges of the original loop. 670 /// \param[out] NewBlocks A list of the blocks in the newly created clone 671 /// \param[out] VMap The value map between the loop and the new clone. 672 /// \param LoopBlocks A helper for DFS-traversal of the loop. 673 /// \param LVMap A value-map that maps instructions from the original loop to 674 /// instructions in the last peeled-off iteration. 
static void cloneLoopBlocks(
    Loop *L, unsigned IterNumber, BasicBlock *InsertTop, BasicBlock *InsertBot,
    SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
    SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
    ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,
    LoopInfo *LI, ArrayRef<MDNode *> LoopLocalNoAliasDeclScopes,
    ScalarEvolution &SE) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *PreHeader = L->getLoopPreheader();

  Function *F = Header->getParent();
  LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  Loop *ParentLoop = L->getParentLoop();

  // For each block in the original loop, create a new copy,
  // and update the value map with the newly created values.
  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F);
    NewBlocks.push_back(NewBB);

    // If an original block is an immediate child of the loop L, its copy
    // is a child of a ParentLoop after peeling. If a block is a child of
    // a nested loop, it is handled in the cloneLoop() call below.
    if (ParentLoop && LI->getLoopFor(*BB) == L)
      ParentLoop->addBasicBlockToLoop(NewBB, *LI);

    VMap[*BB] = NewBB;

    // If dominator tree is available, insert nodes to represent cloned blocks.
    if (DT) {
      if (Header == *BB)
        DT->addNewBlock(NewBB, InsertTop);
      else {
        DomTreeNode *IDom = DT->getNode(*BB)->getIDom();
        // VMap must contain entry for IDom, as the iteration order is RPO.
        DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDom->getBlock()]));
      }
    }
  }

  {
    // Identify what other metadata depends on the cloned version. After
    // cloning, replace the metadata with the corrected version for both
    // memory instructions and noalias intrinsics.
    std::string Ext = (Twine("Peel") + Twine(IterNumber)).str();
    cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,
                               Header->getContext(), Ext);
  }

  // Recursively create the new Loop objects for nested loops, if any,
  // to preserve LoopInfo.
  for (Loop *ChildLoop : *L) {
    cloneLoop(ChildLoop, ParentLoop, VMap, LI, nullptr);
  }

  // Hook-up the control flow for the newly inserted blocks.
  // The new header is hooked up directly to the "top", which is either
  // the original loop preheader (for the first iteration) or the previous
  // iteration's exiting block (for every other iteration)
  InsertTop->getTerminator()->setSuccessor(0, cast<BasicBlock>(VMap[Header]));

  // Similarly, for the latch:
  // The original exiting edge is still hooked up to the loop exit.
  // The backedge now goes to the "bottom", which is either the loop's real
  // header (for the last peeled iteration) or the copied header of the next
  // iteration (for every other iteration)
  BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
  auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator());
  for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx)
    if (LatchTerm->getSuccessor(idx) == Header) {
      LatchTerm->setSuccessor(idx, InsertBot);
      break;
    }
  if (DT)
    DT->changeImmediateDominator(InsertBot, NewLatch);

  // The new copy of the loop body starts with a bunch of PHI nodes
  // that pick an incoming value from either the preheader, or the previous
  // loop iteration. Since this copy is no longer part of the loop, we
  // resolve this statically:
  // For the first iteration, we use the value from the preheader directly.
  // For any other iteration, we replace the phi with the value generated by
  // the immediately preceding clone of the loop body (which represents
  // the previous iteration).
  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
    PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
    if (IterNumber == 0) {
      VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);
    } else {
      Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);
      Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
      // A latch value defined inside the loop must be taken from the
      // previous peeled iteration (via LVMap); anything else is already
      // visible here and can be used directly.
      if (LatchInst && L->contains(LatchInst))
        VMap[&*I] = LVMap[LatchInst];
      else
        VMap[&*I] = LatchVal;
    }
    NewPHI->eraseFromParent();
  }

  // Fix up the outgoing values - we need to add a value for the iteration
  // we've just created. Note that this must happen *after* the incoming
  // values are adjusted, since the value going out of the latch may also be
  // a value coming into the header.
  for (auto Edge : ExitEdges)
    for (PHINode &PHI : Edge.second->phis()) {
      Value *LatchVal = PHI.getIncomingValueForBlock(Edge.first);
      Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
      if (LatchInst && L->contains(LatchInst))
        LatchVal = VMap[LatchVal];
      PHI.addIncoming(LatchVal, cast<BasicBlock>(VMap[Edge.first]));
      // The PHI gained a new incoming edge, so any cached SCEV for it is
      // stale.
      SE.forgetValue(&PHI);
    }

  // LastValueMap is updated with the values for the current loop
  // which are used the next time this function is called.
  for (auto KV : VMap)
    LVMap[KV.first] = KV.second;
}

// Assemble the effective peeling preferences: defaults, then target
// overrides, then cl::opt flags, then explicit caller-supplied arguments
// (highest priority last).
TargetTransformInfo::PeelingPreferences llvm::gatherPeelingPreferences(
    Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
    Optional<bool> UserAllowPeeling,
    Optional<bool> UserAllowProfileBasedPeeling, bool UnrollingSpecficValues) {
  TargetTransformInfo::PeelingPreferences PP;

  // Set the default values.
  PP.PeelCount = 0;
  PP.AllowPeeling = true;
  PP.AllowLoopNestsPeeling = false;
  PP.PeelProfiledIterations = true;

  // Get the target specific values.
  TTI.getPeelingPreferences(L, SE, PP);

  // User specified values using cl::opt.
  if (UnrollingSpecficValues) {
    if (UnrollPeelCount.getNumOccurrences() > 0)
      PP.PeelCount = UnrollPeelCount;
    if (UnrollAllowPeeling.getNumOccurrences() > 0)
      PP.AllowPeeling = UnrollAllowPeeling;
    if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
      PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
  }

  // User specified values provided by argument.
  if (UserAllowPeeling)
    PP.AllowPeeling = *UserAllowPeeling;
  if (UserAllowProfileBasedPeeling)
    PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;

  return PP;
}

/// Peel off the first \p PeelCount iterations of loop \p L.
///
/// Note that this does not peel them off as a single straight-line block.
/// Rather, each iteration is peeled off separately, and needs to check the
/// exit condition.
/// For loops that dynamically execute \p PeelCount iterations or less
/// this provides a benefit, since the peeled off iterations, which account
/// for the bulk of dynamic execution, can be further simplified by scalar
/// optimizations.
bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
                    ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC,
                    bool PreserveLCSSA) {
  assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
  assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");

  LoopBlocksDFS LoopBlocks(L);
  LoopBlocks.perform(LI);

  BasicBlock *Header = L->getHeader();
  BasicBlock *PreHeader = L->getLoopPreheader();
  BasicBlock *Latch = L->getLoopLatch();
  SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitEdges;
  L->getExitEdges(ExitEdges);

  // Remember dominators of blocks we might reach through exits to change them
  // later. Immediate dominator of such block might change, because we add more
  // routes which can lead to the exit: we can reach it from the peeled
  // iterations too.
  DenseMap<BasicBlock *, BasicBlock *> NonLoopBlocksIDom;
  for (auto *BB : L->blocks()) {
    auto *BBDomNode = DT.getNode(BB);
    // Collect the dominator-tree children of BB that live outside the loop;
    // only those can have their idom invalidated by peeling.
    SmallVector<BasicBlock *, 16> ChildrenToUpdate;
    for (auto *ChildDomNode : BBDomNode->children()) {
      auto *ChildBB = ChildDomNode->getBlock();
      if (!L->contains(ChildBB))
        ChildrenToUpdate.push_back(ChildBB);
    }
    // The new idom of the block will be the nearest common dominator
    // of all copies of the previous idom. This is equivalent to the
    // nearest common dominator of the previous idom and the first latch,
    // which dominates all copies of the previous idom.
    BasicBlock *NewIDom = DT.findNearestCommonDominator(BB, Latch);
    for (auto *ChildBB : ChildrenToUpdate)
      NonLoopBlocksIDom[ChildBB] = NewIDom;
  }

  Function *F = Header->getParent();

  // Set up all the necessary basic blocks. It is convenient to split the
  // preheader into 3 parts - two blocks to anchor the peeled copy of the loop
  // body, and a new preheader for the "real" loop.

  // Peeling the first iteration transforms.
  //
  // PreHeader:
  // ...
  // Header:
  //   LoopBody
  //   If (cond) goto Header
  // Exit:
  //
  // into
  //
  // InsertTop:
  //   LoopBody
  //   If (!cond) goto Exit
  // InsertBot:
  // NewPreHeader:
  // ...
  // Header:
  //  LoopBody
  //  If (cond) goto Header
  // Exit:
  //
  // Each following iteration will split the current bottom anchor in two,
  // and put the new copy of the loop body between these two blocks. That is,
  // after peeling another iteration from the example above, we'll split
  // InsertBot, and get:
  //
  // InsertTop:
  //   LoopBody
  //   If (!cond) goto Exit
  // InsertBot:
  //   LoopBody
  //   If (!cond) goto Exit
  // InsertBot.next:
  // NewPreHeader:
  // ...
  // Header:
  //  LoopBody
  //  If (cond) goto Header
  // Exit:

  BasicBlock *InsertTop = SplitEdge(PreHeader, Header, &DT, LI);
  BasicBlock *InsertBot =
      SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);
  BasicBlock *NewPreHeader =
      SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);

  InsertTop->setName(Header->getName() + ".peel.begin");
  InsertBot->setName(Header->getName() + ".peel.next");
  NewPreHeader->setName(PreHeader->getName() + ".peel.newph");

  // Maps each original loop value to its clone in the most recently
  // peeled-off iteration; refreshed by cloneLoopBlocks on every pass and
  // consumed below to rewire the header PHIs.
  ValueToValueMapTy LVMap;

  // NOTE(review): the inner cast<BasicBlock>(Latch) is redundant — Latch is
  // already a BasicBlock* — and could be dropped.
  Instruction *LatchTerm =
      cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator());

  // If we have branch weight information, we'll want to update it for the
  // newly created branches.
  DenseMap<Instruction *, WeightInfo> Weights;
  initBranchWeights(Weights, L);

  // Identify what noalias metadata is inside the loop: if it is inside the
  // loop, the associated metadata must be cloned for each iteration.
  SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
  identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);

  // For each peeled-off iteration, make a copy of the loop.
  for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
    SmallVector<BasicBlock *, 8> NewBlocks;
    ValueToValueMapTy VMap;

    cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks,
                    LoopBlocks, VMap, LVMap, &DT, LI,
                    LoopLocalNoAliasDeclScopes, *SE);

    // Remap to use values from the current iteration instead of the
    // previous one.
    remapInstructionsInBlocks(NewBlocks, VMap);

    // Update IDoms of the blocks reachable through exits. This is done only
    // once, using the first peeled copy of the dominator recorded in
    // NonLoopBlocksIDom (which, per the comment above, dominates all copies).
    if (Iter == 0)
      for (auto BBIDom : NonLoopBlocksIDom)
        DT.changeImmediateDominator(BBIDom.first,
                                    cast<BasicBlock>(LVMap[BBIDom.second]));
#ifdef EXPENSIVE_CHECKS
    assert(DT.verify(DominatorTree::VerificationLevel::Fast));
#endif

    // Propagate the per-iteration branch weights onto the cloned terminators.
    for (auto &[Term, Info] : Weights) {
      auto *TermCopy = cast<Instruction>(VMap[Term]);
      updateBranchWeights(TermCopy, Info);
    }

    // Remove Loop metadata from the latch branch instruction
    // because it is not the Loop's latch branch anymore.
    auto *LatchTermCopy = cast<Instruction>(VMap[LatchTerm]);
    LatchTermCopy->setMetadata(LLVMContext::MD_loop, nullptr);

    // Advance the anchors: this iteration's bottom becomes the next
    // iteration's top, and a fresh bottom is split off after it.
    InsertTop = InsertBot;
    InsertBot = SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);
    InsertBot->setName(Header->getName() + ".peel.next");

    // The cloned blocks were appended at the end of the function; splice
    // them into their structural position, just before the new InsertTop.
    F->getBasicBlockList().splice(InsertTop->getIterator(),
                                  F->getBasicBlockList(),
                                  NewBlocks[0]->getIterator(), F->end());
  }

  // Now adjust the phi nodes in the loop header to get their initial values
  // from the last peeled-off iteration instead of the preheader.
  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
    PHINode *PHI = cast<PHINode>(I);
    Value *NewVal = PHI->getIncomingValueForBlock(Latch);
    Instruction *LatchInst = dyn_cast<Instruction>(NewVal);
    if (LatchInst && L->contains(LatchInst))
      NewVal = LVMap[LatchInst];

    PHI->setIncomingValueForBlock(NewPreHeader, NewVal);
  }

  // Apply the final branch-weight fixups to the original loop's terminators,
  // now that PeelCount iterations execute in the peeled copies.
  for (const auto &[Term, Info] : Weights)
    fixupBranchWeights(Term, Info);

  // Update Metadata for count of peeled off iterations.
  unsigned AlreadyPeeled = 0;
  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
    AlreadyPeeled = *Peeled;
  addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount);

  // If L is nested, the remaining analysis updates are performed on the
  // parent loop, which also contains the newly inserted blocks.
  if (Loop *ParentLoop = L->getParentLoop())
    L = ParentLoop;

  // We modified the loop, update SE.
  SE->forgetTopmostLoop(L);

#ifdef EXPENSIVE_CHECKS
  // Finally DomTree must be correct.
  assert(DT.verify(DominatorTree::VerificationLevel::Fast));
#endif

  // FIXME: Incrementally update loop-simplify
  simplifyLoop(L, &DT, LI, SE, AC, nullptr, PreserveLCSSA);

  NumPeeled++;

  return true;
}