1 //===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This transformation analyzes and transforms the induction variables (and 10 // computations derived from them) into simpler forms suitable for subsequent 11 // analysis and transformation. 12 // 13 // If the trip count of a loop is computable, this pass also makes the following 14 // changes: 15 // 1. The exit condition for the loop is canonicalized to compare the 16 // induction value against the exit value. This turns loops like: 17 // 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)' 18 // 2. Any use outside of the loop of an expression derived from the indvar 19 // is changed to compute the derived value outside of the loop, eliminating 20 // the dependence on the exit value of the induction variable. If the only 21 // purpose of the loop is to compute the exit value of some derived 22 // expression, this transformation will make the loop dead. 23 // 24 //===----------------------------------------------------------------------===// 25 26 #include "llvm/Transforms/Scalar/IndVarSimplify.h" 27 #include "llvm/ADT/APFloat.h" 28 #include "llvm/ADT/ArrayRef.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/ADT/SmallPtrSet.h" 31 #include "llvm/ADT/SmallSet.h" 32 #include "llvm/ADT/SmallVector.h" 33 #include "llvm/ADT/Statistic.h" 34 #include "llvm/ADT/iterator_range.h" 35 #include "llvm/Analysis/LoopInfo.h" 36 #include "llvm/Analysis/LoopPass.h" 37 #include "llvm/Analysis/MemorySSA.h" 38 #include "llvm/Analysis/MemorySSAUpdater.h" 39 #include "llvm/Analysis/ScalarEvolution.h" 40 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 41 #include "llvm/Analysis/TargetLibraryInfo.h" 42 #include "llvm/Analysis/TargetTransformInfo.h" 43 #include "llvm/Analysis/ValueTracking.h" 44 #include "llvm/IR/BasicBlock.h" 45 #include "llvm/IR/Constant.h" 46 #include "llvm/IR/ConstantRange.h" 47 #include "llvm/IR/Constants.h" 48 #include "llvm/IR/DataLayout.h" 49 #include "llvm/IR/DerivedTypes.h" 50 #include "llvm/IR/Dominators.h" 51 #include "llvm/IR/Function.h" 52 #include "llvm/IR/IRBuilder.h" 53 #include "llvm/IR/InstrTypes.h" 54 #include "llvm/IR/Instruction.h" 55 #include "llvm/IR/Instructions.h" 56 #include "llvm/IR/IntrinsicInst.h" 57 #include "llvm/IR/Intrinsics.h" 58 #include "llvm/IR/Module.h" 59 #include "llvm/IR/Operator.h" 60 #include "llvm/IR/PassManager.h" 61 #include "llvm/IR/PatternMatch.h" 62 #include "llvm/IR/Type.h" 63 #include "llvm/IR/Use.h" 64 #include "llvm/IR/User.h" 65 #include "llvm/IR/Value.h" 66 #include "llvm/IR/ValueHandle.h" 67 #include "llvm/Support/Casting.h" 68 #include "llvm/Support/CommandLine.h" 69 #include "llvm/Support/Compiler.h" 70 #include "llvm/Support/Debug.h" 71 #include "llvm/Support/MathExtras.h" 72 #include "llvm/Support/raw_ostream.h" 73 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 74 #include "llvm/Transforms/Utils/Local.h" 75 #include "llvm/Transforms/Utils/LoopUtils.h" 76 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 77 #include "llvm/Transforms/Utils/SimplifyIndVar.h" 78 #include <cassert> 79 #include <cstdint> 80 #include <utility> 81 82 using namespace llvm; 83 using namespace PatternMatch; 84 85 #define DEBUG_TYPE "indvars" 86 87 STATISTIC(NumWidened , "Number of indvars widened"); 88 STATISTIC(NumReplaced , "Number of exit values replaced"); 89 STATISTIC(NumLFTR , "Number of loop exit tests replaced"); 90 STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); 91 STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); 92 93 static cl::opt<ReplaceExitVal> ReplaceExitValue( 94 "replexitval", cl::Hidden, cl::init(OnlyCheapRepl), 95 cl::desc("Choose the strategy to replace exit value in IndVarSimplify"), 96 cl::values( 97 clEnumValN(NeverRepl, "never", "never replace exit value"), 98 clEnumValN(OnlyCheapRepl, "cheap", 99 "only replace exit value when the cost is cheap"), 100 clEnumValN( 101 UnusedIndVarInLoop, "unusedindvarinloop", 102 "only replace exit value when it is an unused " 103 "induction variable in the loop and has cheap replacement cost"), 104 clEnumValN(NoHardUse, "noharduse", 105 "only replace exit values when loop def likely dead"), 106 clEnumValN(AlwaysRepl, "always", 107 "always replace exit value whenever possible"))); 108 109 static cl::opt<bool> UsePostIncrementRanges( 110 "indvars-post-increment-ranges", cl::Hidden, 111 cl::desc("Use post increment control-dependent ranges in IndVarSimplify"), 112 cl::init(true)); 113 114 static cl::opt<bool> 115 DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), 116 cl::desc("Disable Linear Function Test Replace optimization")); 117 118 static cl::opt<bool> 119 LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true), 120 cl::desc("Predicate conditions in read only loops")); 121 122 static cl::opt<bool> 123 AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true), 124 cl::desc("Allow widening of indvars to eliminate s/zext")); 125 126 namespace { 127 128 class IndVarSimplify { 129 LoopInfo *LI; 130 ScalarEvolution *SE; 131 DominatorTree *DT; 132 const DataLayout &DL; 133 TargetLibraryInfo *TLI; 134 const TargetTransformInfo *TTI; 135 std::unique_ptr<MemorySSAUpdater> MSSAU; 136 137 SmallVector<WeakTrackingVH, 16> DeadInsts; 138 bool WidenIndVars; 139 140 bool handleFloatingPointIV(Loop *L, PHINode *PH); 141 bool rewriteNonIntegerIVs(Loop *L); 142 143 bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); 144 /// Try to improve our exit conditions by converting condition from signed 145 /// to unsigned or rotating computation out of the loop. 146 /// (See inline comment about why this is duplicated from simplifyAndExtend) 147 bool canonicalizeExitCondition(Loop *L); 148 /// Try to eliminate loop exits based on analyzeable exit counts 149 bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter); 150 /// Try to form loop invariant tests for loop exits by changing how many 151 /// iterations of the loop run when that is unobservable. 152 bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); 153 154 bool rewriteFirstIterationLoopExitValues(Loop *L); 155 156 bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB, 157 const SCEV *ExitCount, 158 PHINode *IndVar, SCEVExpander &Rewriter); 159 160 bool sinkUnusedInvariants(Loop *L); 161 162 public: 163 IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 164 const DataLayout &DL, TargetLibraryInfo *TLI, 165 TargetTransformInfo *TTI, MemorySSA *MSSA, bool WidenIndVars) 166 : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI), 167 WidenIndVars(WidenIndVars) { 168 if (MSSA) 169 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); 170 } 171 172 bool run(Loop *L); 173 }; 174 175 } // end anonymous namespace 176 177 //===----------------------------------------------------------------------===// 178 // rewriteNonIntegerIVs and helpers. Prefer integer IVs. 179 //===----------------------------------------------------------------------===// 180 181 /// Convert APF to an integer, if possible. 182 static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { 183 bool isExact = false; 184 // See if we can convert this to an int64_t 185 uint64_t UIntVal; 186 if (APF.convertToInteger(MutableArrayRef(UIntVal), 64, true, 187 APFloat::rmTowardZero, &isExact) != APFloat::opOK || 188 !isExact) 189 return false; 190 IntVal = UIntVal; 191 return true; 192 } 193 194 /// If the loop has floating induction variable then insert corresponding 195 /// integer induction variable if possible. 196 /// For example, 197 /// for(double i = 0; i < 10000; ++i) 198 /// bar(i) 199 /// is converted into 200 /// for(int i = 0; i < 10000; ++i) 201 /// bar((double)i); 202 bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { 203 unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); 204 unsigned BackEdge = IncomingEdge^1; 205 206 // Check incoming value. 207 auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); 208 209 int64_t InitValue; 210 if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) 211 return false; 212 213 // Check IV increment. Reject this PN if increment operation is not 214 // an add or increment value can not be represented by an integer. 215 auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); 216 if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return false; 217 218 // If this is not an add of the PHI with a constantfp, or if the constant fp 219 // is not an integer, bail out. 220 ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1)); 221 int64_t IncValue; 222 if (IncValueVal == nullptr || Incr->getOperand(0) != PN || 223 !ConvertToSInt(IncValueVal->getValueAPF(), IncValue)) 224 return false; 225 226 // Check Incr uses. One user is PN and the other user is an exit condition 227 // used by the conditional terminator. 228 Value::user_iterator IncrUse = Incr->user_begin(); 229 Instruction *U1 = cast<Instruction>(*IncrUse++); 230 if (IncrUse == Incr->user_end()) return false; 231 Instruction *U2 = cast<Instruction>(*IncrUse++); 232 if (IncrUse != Incr->user_end()) return false; 233 234 // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't 235 // only used by a branch, we can't transform it. 236 FCmpInst *Compare = dyn_cast<FCmpInst>(U1); 237 if (!Compare) 238 Compare = dyn_cast<FCmpInst>(U2); 239 if (!Compare || !Compare->hasOneUse() || 240 !isa<BranchInst>(Compare->user_back())) 241 return false; 242 243 BranchInst *TheBr = cast<BranchInst>(Compare->user_back()); 244 245 // We need to verify that the branch actually controls the iteration count 246 // of the loop. If not, the new IV can overflow and no one will notice. 247 // The branch block must be in the loop and one of the successors must be out 248 // of the loop. 249 assert(TheBr->isConditional() && "Can't use fcmp if not conditional"); 250 if (!L->contains(TheBr->getParent()) || 251 (L->contains(TheBr->getSuccessor(0)) && 252 L->contains(TheBr->getSuccessor(1)))) 253 return false; 254 255 // If it isn't a comparison with an integer-as-fp (the exit value), we can't 256 // transform it. 257 ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); 258 int64_t ExitValue; 259 if (ExitValueVal == nullptr || 260 !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) 261 return false; 262 263 // Find new predicate for integer comparison. 264 CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE; 265 switch (Compare->getPredicate()) { 266 default: return false; // Unknown comparison. 267 case CmpInst::FCMP_OEQ: 268 case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break; 269 case CmpInst::FCMP_ONE: 270 case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break; 271 case CmpInst::FCMP_OGT: 272 case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break; 273 case CmpInst::FCMP_OGE: 274 case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break; 275 case CmpInst::FCMP_OLT: 276 case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break; 277 case CmpInst::FCMP_OLE: 278 case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break; 279 } 280 281 // We convert the floating point induction variable to a signed i32 value if 282 // we can. This is only safe if the comparison will not overflow in a way 283 // that won't be trapped by the integer equivalent operations. Check for this 284 // now. 285 // TODO: We could use i64 if it is native and the range requires it. 286 287 // The start/stride/exit values must all fit in signed i32. 288 if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue)) 289 return false; 290 291 // If not actually striding (add x, 0.0), avoid touching the code. 292 if (IncValue == 0) 293 return false; 294 295 // Positive and negative strides have different safety conditions. 296 if (IncValue > 0) { 297 // If we have a positive stride, we require the init to be less than the 298 // exit value. 299 if (InitValue >= ExitValue) 300 return false; 301 302 uint32_t Range = uint32_t(ExitValue-InitValue); 303 // Check for infinite loop, either: 304 // while (i <= Exit) or until (i > Exit) 305 if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) { 306 if (++Range == 0) return false; // Range overflows. 307 } 308 309 unsigned Leftover = Range % uint32_t(IncValue); 310 311 // If this is an equality comparison, we require that the strided value 312 // exactly land on the exit value, otherwise the IV condition will wrap 313 // around and do things the fp IV wouldn't. 314 if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && 315 Leftover != 0) 316 return false; 317 318 // If the stride would wrap around the i32 before exiting, we can't 319 // transform the IV. 320 if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue) 321 return false; 322 } else { 323 // If we have a negative stride, we require the init to be greater than the 324 // exit value. 325 if (InitValue <= ExitValue) 326 return false; 327 328 uint32_t Range = uint32_t(InitValue-ExitValue); 329 // Check for infinite loop, either: 330 // while (i >= Exit) or until (i < Exit) 331 if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) { 332 if (++Range == 0) return false; // Range overflows. 333 } 334 335 unsigned Leftover = Range % uint32_t(-IncValue); 336 337 // If this is an equality comparison, we require that the strided value 338 // exactly land on the exit value, otherwise the IV condition will wrap 339 // around and do things the fp IV wouldn't. 340 if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && 341 Leftover != 0) 342 return false; 343 344 // If the stride would wrap around the i32 before exiting, we can't 345 // transform the IV. 346 if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue) 347 return false; 348 } 349 350 IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); 351 352 // Insert new integer induction variable. 353 PHINode *NewPHI = 354 PHINode::Create(Int32Ty, 2, PN->getName() + ".int", PN->getIterator()); 355 NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue), 356 PN->getIncomingBlock(IncomingEdge)); 357 358 Value *NewAdd = 359 BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue), 360 Incr->getName() + ".int", Incr->getIterator()); 361 NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge)); 362 363 ICmpInst *NewCompare = 364 new ICmpInst(TheBr->getIterator(), NewPred, NewAdd, 365 ConstantInt::get(Int32Ty, ExitValue), Compare->getName()); 366 367 // In the following deletions, PN may become dead and may be deleted. 368 // Use a WeakTrackingVH to observe whether this happens. 369 WeakTrackingVH WeakPH = PN; 370 371 // Delete the old floating point exit comparison. The branch starts using the 372 // new comparison. 373 NewCompare->takeName(Compare); 374 Compare->replaceAllUsesWith(NewCompare); 375 RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get()); 376 377 // Delete the old floating point increment. 378 Incr->replaceAllUsesWith(PoisonValue::get(Incr->getType())); 379 RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get()); 380 381 // If the FP induction variable still has uses, this is because something else 382 // in the loop uses its value. In order to canonicalize the induction 383 // variable, we chose to eliminate the IV and rewrite it in terms of an 384 // int->fp cast. 385 // 386 // We give preference to sitofp over uitofp because it is faster on most 387 // platforms. 388 if (WeakPH) { 389 Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", 390 PN->getParent()->getFirstInsertionPt()); 391 PN->replaceAllUsesWith(Conv); 392 RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get()); 393 } 394 return true; 395 } 396 397 bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { 398 // First step. Check to see if there are any floating-point recurrences. 399 // If there are, change them into integer recurrences, permitting analysis by 400 // the SCEV routines. 401 BasicBlock *Header = L->getHeader(); 402 403 SmallVector<WeakTrackingVH, 8> PHIs; 404 for (PHINode &PN : Header->phis()) 405 PHIs.push_back(&PN); 406 407 bool Changed = false; 408 for (WeakTrackingVH &PHI : PHIs) 409 if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHI)) 410 Changed |= handleFloatingPointIV(L, PN); 411 412 // If the loop previously had floating-point IV, ScalarEvolution 413 // may not have been able to compute a trip count. Now that we've done some 414 // re-writing, the trip count may be computable. 415 if (Changed) 416 SE->forgetLoop(L); 417 return Changed; 418 } 419 420 //===---------------------------------------------------------------------===// 421 // rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know 422 // they will exit at the first iteration. 423 //===---------------------------------------------------------------------===// 424 425 /// Check to see if this loop has loop invariant conditions which lead to loop 426 /// exits. If so, we know that if the exit path is taken, it is at the first 427 /// loop iteration. This lets us predict exit values of PHI nodes that live in 428 /// loop header. 429 bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { 430 // Verify the input to the pass is already in LCSSA form. 431 assert(L->isLCSSAForm(*DT)); 432 433 SmallVector<BasicBlock *, 8> ExitBlocks; 434 L->getUniqueExitBlocks(ExitBlocks); 435 436 bool MadeAnyChanges = false; 437 for (auto *ExitBB : ExitBlocks) { 438 // If there are no more PHI nodes in this exit block, then no more 439 // values defined inside the loop are used on this path. 440 for (PHINode &PN : ExitBB->phis()) { 441 for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues(); 442 IncomingValIdx != E; ++IncomingValIdx) { 443 auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx); 444 445 // Can we prove that the exit must run on the first iteration if it 446 // runs at all? (i.e. early exits are fine for our purposes, but 447 // traces which lead to this exit being taken on the 2nd iteration 448 // aren't.) Note that this is about whether the exit branch is 449 // executed, not about whether it is taken. 450 if (!L->getLoopLatch() || 451 !DT->dominates(IncomingBB, L->getLoopLatch())) 452 continue; 453 454 // Get condition that leads to the exit path. 455 auto *TermInst = IncomingBB->getTerminator(); 456 457 Value *Cond = nullptr; 458 if (auto *BI = dyn_cast<BranchInst>(TermInst)) { 459 // Must be a conditional branch, otherwise the block 460 // should not be in the loop. 461 Cond = BI->getCondition(); 462 } else if (auto *SI = dyn_cast<SwitchInst>(TermInst)) 463 Cond = SI->getCondition(); 464 else 465 continue; 466 467 if (!L->isLoopInvariant(Cond)) 468 continue; 469 470 auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx)); 471 472 // Only deal with PHIs in the loop header. 473 if (!ExitVal || ExitVal->getParent() != L->getHeader()) 474 continue; 475 476 // If ExitVal is a PHI on the loop header, then we know its 477 // value along this exit because the exit can only be taken 478 // on the first iteration. 479 auto *LoopPreheader = L->getLoopPreheader(); 480 assert(LoopPreheader && "Invalid loop"); 481 int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader); 482 if (PreheaderIdx != -1) { 483 assert(ExitVal->getParent() == L->getHeader() && 484 "ExitVal must be in loop header"); 485 MadeAnyChanges = true; 486 PN.setIncomingValue(IncomingValIdx, 487 ExitVal->getIncomingValue(PreheaderIdx)); 488 SE->forgetValue(&PN); 489 } 490 } 491 } 492 } 493 return MadeAnyChanges; 494 } 495 496 //===----------------------------------------------------------------------===// 497 // IV Widening - Extend the width of an IV to cover its widest uses. 498 //===----------------------------------------------------------------------===// 499 500 /// Update information about the induction variable that is extended by this 501 /// sign or zero extend operation. This is used to determine the final width of 502 /// the IV before actually widening it. 503 static void visitIVCast(CastInst *Cast, WideIVInfo &WI, 504 ScalarEvolution *SE, 505 const TargetTransformInfo *TTI) { 506 bool IsSigned = Cast->getOpcode() == Instruction::SExt; 507 if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) 508 return; 509 510 Type *Ty = Cast->getType(); 511 uint64_t Width = SE->getTypeSizeInBits(Ty); 512 if (!Cast->getModule()->getDataLayout().isLegalInteger(Width)) 513 return; 514 515 // Check that `Cast` actually extends the induction variable (we rely on this 516 // later). This takes care of cases where `Cast` is extending a truncation of 517 // the narrow induction variable, and thus can end up being narrower than the 518 // "narrow" induction variable. 519 uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType()); 520 if (NarrowIVWidth >= Width) 521 return; 522 523 // Cast is either an sext or zext up to this point. 524 // We should not widen an indvar if arithmetics on the wider indvar are more 525 // expensive than those on the narrower indvar. We check only the cost of ADD 526 // because at least an ADD is required to increment the induction variable. We 527 // could compute more comprehensively the cost of all instructions on the 528 // induction variable when necessary. 529 if (TTI && 530 TTI->getArithmeticInstrCost(Instruction::Add, Ty) > 531 TTI->getArithmeticInstrCost(Instruction::Add, 532 Cast->getOperand(0)->getType())) { 533 return; 534 } 535 536 if (!WI.WidestNativeType || 537 Width > SE->getTypeSizeInBits(WI.WidestNativeType)) { 538 WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); 539 WI.IsSigned = IsSigned; 540 return; 541 } 542 543 // We extend the IV to satisfy the sign of its user(s), or 'signed' 544 // if there are multiple users with both sign- and zero extensions, 545 // in order not to introduce nondeterministic behaviour based on the 546 // unspecified order of a PHI nodes' users-iterator. 547 WI.IsSigned |= IsSigned; 548 } 549 550 //===----------------------------------------------------------------------===// 551 // Live IV Reduction - Minimize IVs live across the loop. 552 //===----------------------------------------------------------------------===// 553 554 //===----------------------------------------------------------------------===// 555 // Simplification of IV users based on SCEV evaluation. 556 //===----------------------------------------------------------------------===// 557 558 namespace { 559 560 class IndVarSimplifyVisitor : public IVVisitor { 561 ScalarEvolution *SE; 562 const TargetTransformInfo *TTI; 563 PHINode *IVPhi; 564 565 public: 566 WideIVInfo WI; 567 568 IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, 569 const TargetTransformInfo *TTI, 570 const DominatorTree *DTree) 571 : SE(SCEV), TTI(TTI), IVPhi(IV) { 572 DT = DTree; 573 WI.NarrowIV = IVPhi; 574 } 575 576 // Implement the interface used by simplifyUsersOfIV. 577 void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } 578 }; 579 580 } // end anonymous namespace 581 582 /// Iteratively perform simplification on a worklist of IV users. Each 583 /// successive simplification may push more users which may themselves be 584 /// candidates for simplification. 585 /// 586 /// Sign/Zero extend elimination is interleaved with IV simplification. 587 bool IndVarSimplify::simplifyAndExtend(Loop *L, 588 SCEVExpander &Rewriter, 589 LoopInfo *LI) { 590 SmallVector<WideIVInfo, 8> WideIVs; 591 592 auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction( 593 Intrinsic::getName(Intrinsic::experimental_guard)); 594 bool HasGuards = GuardDecl && !GuardDecl->use_empty(); 595 596 SmallVector<PHINode *, 8> LoopPhis; 597 for (PHINode &PN : L->getHeader()->phis()) 598 LoopPhis.push_back(&PN); 599 600 // Each round of simplification iterates through the SimplifyIVUsers worklist 601 // for all current phis, then determines whether any IVs can be 602 // widened. Widening adds new phis to LoopPhis, inducing another round of 603 // simplification on the wide IVs. 604 bool Changed = false; 605 while (!LoopPhis.empty()) { 606 // Evaluate as many IV expressions as possible before widening any IVs. This 607 // forces SCEV to set no-wrap flags before evaluating sign/zero 608 // extension. The first time SCEV attempts to normalize sign/zero extension, 609 // the result becomes final. So for the most predictable results, we delay 610 // evaluation of sign/zero extend evaluation until needed, and avoid running 611 // other SCEV based analysis prior to simplifyAndExtend. 612 do { 613 PHINode *CurrIV = LoopPhis.pop_back_val(); 614 615 // Information about sign/zero extensions of CurrIV. 616 IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); 617 618 Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter, 619 &Visitor); 620 621 if (Visitor.WI.WidestNativeType) { 622 WideIVs.push_back(Visitor.WI); 623 } 624 } while(!LoopPhis.empty()); 625 626 // Continue if we disallowed widening. 627 if (!WidenIndVars) 628 continue; 629 630 for (; !WideIVs.empty(); WideIVs.pop_back()) { 631 unsigned ElimExt; 632 unsigned Widened; 633 if (PHINode *WidePhi = createWideIV(WideIVs.back(), LI, SE, Rewriter, 634 DT, DeadInsts, ElimExt, Widened, 635 HasGuards, UsePostIncrementRanges)) { 636 NumElimExt += ElimExt; 637 NumWidened += Widened; 638 Changed = true; 639 LoopPhis.push_back(WidePhi); 640 } 641 } 642 } 643 return Changed; 644 } 645 646 //===----------------------------------------------------------------------===// 647 // linearFunctionTestReplace and its kin. Rewrite the loop exit condition. 648 //===----------------------------------------------------------------------===// 649 650 /// Given an Value which is hoped to be part of an add recurance in the given 651 /// loop, return the associated Phi node if so. Otherwise, return null. Note 652 /// that this is less general than SCEVs AddRec checking. 653 static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) { 654 Instruction *IncI = dyn_cast<Instruction>(IncV); 655 if (!IncI) 656 return nullptr; 657 658 switch (IncI->getOpcode()) { 659 case Instruction::Add: 660 case Instruction::Sub: 661 break; 662 case Instruction::GetElementPtr: 663 // An IV counter must preserve its type. 664 if (IncI->getNumOperands() == 2) 665 break; 666 [[fallthrough]]; 667 default: 668 return nullptr; 669 } 670 671 PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0)); 672 if (Phi && Phi->getParent() == L->getHeader()) { 673 if (L->isLoopInvariant(IncI->getOperand(1))) 674 return Phi; 675 return nullptr; 676 } 677 if (IncI->getOpcode() == Instruction::GetElementPtr) 678 return nullptr; 679 680 // Allow add/sub to be commuted. 681 Phi = dyn_cast<PHINode>(IncI->getOperand(1)); 682 if (Phi && Phi->getParent() == L->getHeader()) { 683 if (L->isLoopInvariant(IncI->getOperand(0))) 684 return Phi; 685 } 686 return nullptr; 687 } 688 689 /// Whether the current loop exit test is based on this value. Currently this 690 /// is limited to a direct use in the loop condition. 691 static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) { 692 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 693 ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); 694 // TODO: Allow non-icmp loop test. 695 if (!ICmp) 696 return false; 697 698 // TODO: Allow indirect use. 699 return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V; 700 } 701 702 /// linearFunctionTestReplace policy. Return true unless we can show that the 703 /// current exit test is already sufficiently canonical. 704 static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) { 705 assert(L->getLoopLatch() && "Must be in simplified form"); 706 707 // Avoid converting a constant or loop invariant test back to a runtime 708 // test. This is critical for when SCEV's cached ExitCount is less precise 709 // than the current IR (such as after we've proven a particular exit is 710 // actually dead and thus the BE count never reaches our ExitCount.) 711 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 712 if (L->isLoopInvariant(BI->getCondition())) 713 return false; 714 715 // Do LFTR to simplify the exit condition to an ICMP. 716 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); 717 if (!Cond) 718 return true; 719 720 // Do LFTR to simplify the exit ICMP to EQ/NE 721 ICmpInst::Predicate Pred = Cond->getPredicate(); 722 if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ) 723 return true; 724 725 // Look for a loop invariant RHS 726 Value *LHS = Cond->getOperand(0); 727 Value *RHS = Cond->getOperand(1); 728 if (!L->isLoopInvariant(RHS)) { 729 if (!L->isLoopInvariant(LHS)) 730 return true; 731 std::swap(LHS, RHS); 732 } 733 // Look for a simple IV counter LHS 734 PHINode *Phi = dyn_cast<PHINode>(LHS); 735 if (!Phi) 736 Phi = getLoopPhiForCounter(LHS, L); 737 738 if (!Phi) 739 return true; 740 741 // Do LFTR if PHI node is defined in the loop, but is *not* a counter. 742 int Idx = Phi->getBasicBlockIndex(L->getLoopLatch()); 743 if (Idx < 0) 744 return true; 745 746 // Do LFTR if the exit condition's IV is *not* a simple counter. 747 Value *IncV = Phi->getIncomingValue(Idx); 748 return Phi != getLoopPhiForCounter(IncV, L); 749 } 750 751 /// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils 752 /// down to checking that all operands are constant and listing instructions 753 /// that may hide undef. 754 static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited, 755 unsigned Depth) { 756 if (isa<Constant>(V)) 757 return !isa<UndefValue>(V); 758 759 if (Depth >= 6) 760 return false; 761 762 // Conservatively handle non-constant non-instructions. For example, Arguments 763 // may be undef. 764 Instruction *I = dyn_cast<Instruction>(V); 765 if (!I) 766 return false; 767 768 // Load and return values may be undef. 769 if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I)) 770 return false; 771 772 // Optimistically handle other instructions. 773 for (Value *Op : I->operands()) { 774 if (!Visited.insert(Op).second) 775 continue; 776 if (!hasConcreteDefImpl(Op, Visited, Depth+1)) 777 return false; 778 } 779 return true; 780 } 781 782 /// Return true if the given value is concrete. We must prove that undef can 783 /// never reach it. 784 /// 785 /// TODO: If we decide that this is a good approach to checking for undef, we 786 /// may factor it into a common location. 787 static bool hasConcreteDef(Value *V) { 788 SmallPtrSet<Value*, 8> Visited; 789 Visited.insert(V); 790 return hasConcreteDefImpl(V, Visited, 0); 791 } 792 793 /// Return true if the given phi is a "counter" in L. A counter is an 794 /// add recurance (of integer or pointer type) with an arbitrary start, and a 795 /// step of 1. Note that L must have exactly one latch. 796 static bool isLoopCounter(PHINode* Phi, Loop *L, 797 ScalarEvolution *SE) { 798 assert(Phi->getParent() == L->getHeader()); 799 assert(L->getLoopLatch()); 800 801 if (!SE->isSCEVable(Phi->getType())) 802 return false; 803 804 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); 805 if (!AR || AR->getLoop() != L || !AR->isAffine()) 806 return false; 807 808 const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); 809 if (!Step || !Step->isOne()) 810 return false; 811 812 int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch()); 813 Value *IncV = Phi->getIncomingValue(LatchIdx); 814 return (getLoopPhiForCounter(IncV, L) == Phi && 815 isa<SCEVAddRecExpr>(SE->getSCEV(IncV))); 816 } 817 818 /// Search the loop header for a loop counter (anadd rec w/step of one) 819 /// suitable for use by LFTR. If multiple counters are available, select the 820 /// "best" one based profitable heuristics. 821 /// 822 /// BECount may be an i8* pointer type. The pointer difference is already 823 /// valid count without scaling the address stride, so it remains a pointer 824 /// expression as far as SCEV is concerned. 825 static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB, 826 const SCEV *BECount, 827 ScalarEvolution *SE, DominatorTree *DT) { 828 uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); 829 830 Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition(); 831 832 // Loop over all of the PHI nodes, looking for a simple counter. 833 PHINode *BestPhi = nullptr; 834 const SCEV *BestInit = nullptr; 835 BasicBlock *LatchBlock = L->getLoopLatch(); 836 assert(LatchBlock && "Must be in simplified form"); 837 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 838 839 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 840 PHINode *Phi = cast<PHINode>(I); 841 if (!isLoopCounter(Phi, L, SE)) 842 continue; 843 844 const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); 845 846 // AR may be a pointer type, while BECount is an integer type. 847 // AR may be wider than BECount. With eq/ne tests overflow is immaterial. 848 // AR may not be a narrower type, or we may never exit. 849 uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); 850 if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth)) 851 continue; 852 853 // Avoid reusing a potentially undef value to compute other values that may 854 // have originally had a concrete definition. 855 if (!hasConcreteDef(Phi)) { 856 // We explicitly allow unknown phis as long as they are already used by 857 // the loop exit test. This is legal since performing LFTR could not 858 // increase the number of undef users. 859 Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock); 860 if (!isLoopExitTestBasedOn(Phi, ExitingBB) && 861 !isLoopExitTestBasedOn(IncPhi, ExitingBB)) 862 continue; 863 } 864 865 // Avoid introducing undefined behavior due to poison which didn't exist in 866 // the original program. (Annoyingly, the rules for poison and undef 867 // propagation are distinct, so this does NOT cover the undef case above.) 868 // We have to ensure that we don't introduce UB by introducing a use on an 869 // iteration where said IV produces poison. Our strategy here differs for 870 // pointers and integer IVs. For integers, we strip and reinfer as needed, 871 // see code in linearFunctionTestReplace. For pointers, we restrict 872 // transforms as there is no good way to reinfer inbounds once lost. 873 if (!Phi->getType()->isIntegerTy() && 874 !mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT)) 875 continue; 876 877 const SCEV *Init = AR->getStart(); 878 879 if (BestPhi && !isAlmostDeadIV(BestPhi, LatchBlock, Cond)) { 880 // Don't force a live loop counter if another IV can be used. 881 if (isAlmostDeadIV(Phi, LatchBlock, Cond)) 882 continue; 883 884 // Prefer to count-from-zero. This is a more "canonical" counter form. It 885 // also prefers integer to pointer IVs. 886 if (BestInit->isZero() != Init->isZero()) { 887 if (BestInit->isZero()) 888 continue; 889 } 890 // If two IVs both count from zero or both count from nonzero then the 891 // narrower is likely a dead phi that has been widened. Use the wider phi 892 // to allow the other to be eliminated. 893 else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) 894 continue; 895 } 896 BestPhi = Phi; 897 BestInit = Init; 898 } 899 return BestPhi; 900 } 901 902 /// Insert an IR expression which computes the value held by the IV IndVar 903 /// (which must be an loop counter w/unit stride) after the backedge of loop L 904 /// is taken ExitCount times. 905 static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, 906 const SCEV *ExitCount, bool UsePostInc, Loop *L, 907 SCEVExpander &Rewriter, ScalarEvolution *SE) { 908 assert(isLoopCounter(IndVar, L, SE)); 909 assert(ExitCount->getType()->isIntegerTy() && "exit count must be integer"); 910 const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); 911 assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); 912 913 // For integer IVs, truncate the IV before computing the limit unless we 914 // know apriori that the limit must be a constant when evaluated in the 915 // bitwidth of the IV. We prefer (potentially) keeping a truncate of the 916 // IV in the loop over a (potentially) expensive expansion of the widened 917 // exit count add(zext(add)) expression. 918 if (IndVar->getType()->isIntegerTy() && 919 SE->getTypeSizeInBits(AR->getType()) > 920 SE->getTypeSizeInBits(ExitCount->getType())) { 921 const SCEV *IVInit = AR->getStart(); 922 if (!isa<SCEVConstant>(IVInit) || !isa<SCEVConstant>(ExitCount)) 923 AR = cast<SCEVAddRecExpr>(SE->getTruncateExpr(AR, ExitCount->getType())); 924 } 925 926 const SCEVAddRecExpr *ARBase = UsePostInc ? AR->getPostIncExpr(*SE) : AR; 927 const SCEV *IVLimit = ARBase->evaluateAtIteration(ExitCount, *SE); 928 assert(SE->isLoopInvariant(IVLimit, L) && 929 "Computed iteration count is not loop invariant!"); 930 return Rewriter.expandCodeFor(IVLimit, ARBase->getType(), 931 ExitingBB->getTerminator()); 932 } 933 934 /// This method rewrites the exit condition of the loop to be a canonical != 935 /// comparison against the incremented loop induction variable. This pass is 936 /// able to rewrite the exit tests of any loop where the SCEV analysis can 937 /// determine a loop-invariant trip count of the loop, which is actually a much 938 /// broader range than just linear tests. 939 bool IndVarSimplify:: 940 linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB, 941 const SCEV *ExitCount, 942 PHINode *IndVar, SCEVExpander &Rewriter) { 943 assert(L->getLoopLatch() && "Loop no longer in simplified form?"); 944 assert(isLoopCounter(IndVar, L, SE)); 945 Instruction * const IncVar = 946 cast<Instruction>(IndVar->getIncomingValueForBlock(L->getLoopLatch())); 947 948 // Initialize CmpIndVar to the preincremented IV. 949 Value *CmpIndVar = IndVar; 950 bool UsePostInc = false; 951 952 // If the exiting block is the same as the backedge block, we prefer to 953 // compare against the post-incremented value, otherwise we must compare 954 // against the preincremented value. 955 if (ExitingBB == L->getLoopLatch()) { 956 // For pointer IVs, we chose to not strip inbounds which requires us not 957 // to add a potentially UB introducing use. We need to either a) show 958 // the loop test we're modifying is already in post-inc form, or b) show 959 // that adding a use must not introduce UB. 960 bool SafeToPostInc = 961 IndVar->getType()->isIntegerTy() || 962 isLoopExitTestBasedOn(IncVar, ExitingBB) || 963 mustExecuteUBIfPoisonOnPathTo(IncVar, ExitingBB->getTerminator(), DT); 964 if (SafeToPostInc) { 965 UsePostInc = true; 966 CmpIndVar = IncVar; 967 } 968 } 969 970 // It may be necessary to drop nowrap flags on the incrementing instruction 971 // if either LFTR moves from a pre-inc check to a post-inc check (in which 972 // case the increment might have previously been poison on the last iteration 973 // only) or if LFTR switches to a different IV that was previously dynamically 974 // dead (and as such may be arbitrarily poison). We remove any nowrap flags 975 // that SCEV didn't infer for the post-inc addrec (even if we use a pre-inc 976 // check), because the pre-inc addrec flags may be adopted from the original 977 // instruction, while SCEV has to explicitly prove the post-inc nowrap flags. 978 // TODO: This handling is inaccurate for one case: If we switch to a 979 // dynamically dead IV that wraps on the first loop iteration only, which is 980 // not covered by the post-inc addrec. (If the new IV was not dynamically 981 // dead, it could not be poison on the first iteration in the first place.) 982 if (auto *BO = dyn_cast<BinaryOperator>(IncVar)) { 983 const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IncVar)); 984 if (BO->hasNoUnsignedWrap()) 985 BO->setHasNoUnsignedWrap(AR->hasNoUnsignedWrap()); 986 if (BO->hasNoSignedWrap()) 987 BO->setHasNoSignedWrap(AR->hasNoSignedWrap()); 988 } 989 990 Value *ExitCnt = genLoopLimit( 991 IndVar, ExitingBB, ExitCount, UsePostInc, L, Rewriter, SE); 992 assert(ExitCnt->getType()->isPointerTy() == 993 IndVar->getType()->isPointerTy() && 994 "genLoopLimit missed a cast"); 995 996 // Insert a new icmp_ne or icmp_eq instruction before the branch. 997 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 998 ICmpInst::Predicate P; 999 if (L->contains(BI->getSuccessor(0))) 1000 P = ICmpInst::ICMP_NE; 1001 else 1002 P = ICmpInst::ICMP_EQ; 1003 1004 IRBuilder<> Builder(BI); 1005 1006 // The new loop exit condition should reuse the debug location of the 1007 // original loop exit condition. 1008 if (auto *Cond = dyn_cast<Instruction>(BI->getCondition())) 1009 Builder.SetCurrentDebugLocation(Cond->getDebugLoc()); 1010 1011 // For integer IVs, if we evaluated the limit in the narrower bitwidth to 1012 // avoid the expensive expansion of the limit expression in the wider type, 1013 // emit a truncate to narrow the IV to the ExitCount type. This is safe 1014 // since we know (from the exit count bitwidth), that we can't self-wrap in 1015 // the narrower type. 1016 unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType()); 1017 unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType()); 1018 if (CmpIndVarSize > ExitCntSize) { 1019 assert(!CmpIndVar->getType()->isPointerTy() && 1020 !ExitCnt->getType()->isPointerTy()); 1021 1022 // Before resorting to actually inserting the truncate, use the same 1023 // reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend 1024 // the other side of the comparison instead. We still evaluate the limit 1025 // in the narrower bitwidth, we just prefer a zext/sext outside the loop to 1026 // a truncate within in. 1027 bool Extended = false; 1028 const SCEV *IV = SE->getSCEV(CmpIndVar); 1029 const SCEV *TruncatedIV = SE->getTruncateExpr(IV, ExitCnt->getType()); 1030 const SCEV *ZExtTrunc = 1031 SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType()); 1032 1033 if (ZExtTrunc == IV) { 1034 Extended = true; 1035 ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(), 1036 "wide.trip.count"); 1037 } else { 1038 const SCEV *SExtTrunc = 1039 SE->getSignExtendExpr(TruncatedIV, CmpIndVar->getType()); 1040 if (SExtTrunc == IV) { 1041 Extended = true; 1042 ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(), 1043 "wide.trip.count"); 1044 } 1045 } 1046 1047 if (Extended) { 1048 bool Discard; 1049 L->makeLoopInvariant(ExitCnt, Discard); 1050 } else 1051 CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), 1052 "lftr.wideiv"); 1053 } 1054 LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n" 1055 << " LHS:" << *CmpIndVar << '\n' 1056 << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==") 1057 << "\n" 1058 << " RHS:\t" << *ExitCnt << "\n" 1059 << "ExitCount:\t" << *ExitCount << "\n" 1060 << " was: " << *BI->getCondition() << "\n"); 1061 1062 Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); 1063 Value *OrigCond = BI->getCondition(); 1064 // It's tempting to use replaceAllUsesWith here to fully replace the old 1065 // comparison, but that's not immediately safe, since users of the old 1066 // comparison may not be dominated by the new comparison. Instead, just 1067 // update the branch to use the new comparison; in the common case this 1068 // will make old comparison dead. 1069 BI->setCondition(Cond); 1070 DeadInsts.emplace_back(OrigCond); 1071 1072 ++NumLFTR; 1073 return true; 1074 } 1075 1076 //===----------------------------------------------------------------------===// 1077 // sinkUnusedInvariants. A late subpass to cleanup loop preheaders. 1078 //===----------------------------------------------------------------------===// 1079 1080 /// If there's a single exit block, sink any loop-invariant values that 1081 /// were defined in the preheader but not used inside the loop into the 1082 /// exit block to reduce register pressure in the loop. 1083 bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { 1084 BasicBlock *ExitBlock = L->getExitBlock(); 1085 if (!ExitBlock) return false; 1086 1087 BasicBlock *Preheader = L->getLoopPreheader(); 1088 if (!Preheader) return false; 1089 1090 bool MadeAnyChanges = false; 1091 BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); 1092 BasicBlock::iterator I(Preheader->getTerminator()); 1093 while (I != Preheader->begin()) { 1094 --I; 1095 // New instructions were inserted at the end of the preheader. 1096 if (isa<PHINode>(I)) 1097 break; 1098 1099 // Don't move instructions which might have side effects, since the side 1100 // effects need to complete before instructions inside the loop. Also don't 1101 // move instructions which might read memory, since the loop may modify 1102 // memory. Note that it's okay if the instruction might have undefined 1103 // behavior: LoopSimplify guarantees that the preheader dominates the exit 1104 // block. 1105 if (I->mayHaveSideEffects() || I->mayReadFromMemory()) 1106 continue; 1107 1108 // Skip debug info intrinsics. 1109 if (isa<DbgInfoIntrinsic>(I)) 1110 continue; 1111 1112 // Skip eh pad instructions. 1113 if (I->isEHPad()) 1114 continue; 1115 1116 // Don't sink alloca: we never want to sink static alloca's out of the 1117 // entry block, and correctly sinking dynamic alloca's requires 1118 // checks for stacksave/stackrestore intrinsics. 1119 // FIXME: Refactor this check somehow? 1120 if (isa<AllocaInst>(I)) 1121 continue; 1122 1123 // Determine if there is a use in or before the loop (direct or 1124 // otherwise). 1125 bool UsedInLoop = false; 1126 for (Use &U : I->uses()) { 1127 Instruction *User = cast<Instruction>(U.getUser()); 1128 BasicBlock *UseBB = User->getParent(); 1129 if (PHINode *P = dyn_cast<PHINode>(User)) { 1130 unsigned i = 1131 PHINode::getIncomingValueNumForOperand(U.getOperandNo()); 1132 UseBB = P->getIncomingBlock(i); 1133 } 1134 if (UseBB == Preheader || L->contains(UseBB)) { 1135 UsedInLoop = true; 1136 break; 1137 } 1138 } 1139 1140 // If there is, the def must remain in the preheader. 1141 if (UsedInLoop) 1142 continue; 1143 1144 // Otherwise, sink it to the exit block. 1145 Instruction *ToMove = &*I; 1146 bool Done = false; 1147 1148 if (I != Preheader->begin()) { 1149 // Skip debug info intrinsics. 1150 do { 1151 --I; 1152 } while (I->isDebugOrPseudoInst() && I != Preheader->begin()); 1153 1154 if (I->isDebugOrPseudoInst() && I == Preheader->begin()) 1155 Done = true; 1156 } else { 1157 Done = true; 1158 } 1159 1160 MadeAnyChanges = true; 1161 ToMove->moveBefore(*ExitBlock, InsertPt); 1162 SE->forgetValue(ToMove); 1163 if (Done) break; 1164 InsertPt = ToMove->getIterator(); 1165 } 1166 1167 return MadeAnyChanges; 1168 } 1169 1170 static void replaceExitCond(BranchInst *BI, Value *NewCond, 1171 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 1172 auto *OldCond = BI->getCondition(); 1173 LLVM_DEBUG(dbgs() << "Replacing condition of loop-exiting branch " << *BI 1174 << " with " << *NewCond << "\n"); 1175 BI->setCondition(NewCond); 1176 if (OldCond->use_empty()) 1177 DeadInsts.emplace_back(OldCond); 1178 } 1179 1180 static Constant *createFoldedExitCond(const Loop *L, BasicBlock *ExitingBB, 1181 bool IsTaken) { 1182 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1183 bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); 1184 auto *OldCond = BI->getCondition(); 1185 return ConstantInt::get(OldCond->getType(), 1186 IsTaken ? ExitIfTrue : !ExitIfTrue); 1187 } 1188 1189 static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken, 1190 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 1191 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1192 auto *NewCond = createFoldedExitCond(L, ExitingBB, IsTaken); 1193 replaceExitCond(BI, NewCond, DeadInsts); 1194 } 1195 1196 static void replaceLoopPHINodesWithPreheaderValues( 1197 LoopInfo *LI, Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts, 1198 ScalarEvolution &SE) { 1199 assert(L->isLoopSimplifyForm() && "Should only do it in simplify form!"); 1200 auto *LoopPreheader = L->getLoopPreheader(); 1201 auto *LoopHeader = L->getHeader(); 1202 SmallVector<Instruction *> Worklist; 1203 for (auto &PN : LoopHeader->phis()) { 1204 auto *PreheaderIncoming = PN.getIncomingValueForBlock(LoopPreheader); 1205 for (User *U : PN.users()) 1206 Worklist.push_back(cast<Instruction>(U)); 1207 SE.forgetValue(&PN); 1208 PN.replaceAllUsesWith(PreheaderIncoming); 1209 DeadInsts.emplace_back(&PN); 1210 } 1211 1212 // Replacing with the preheader value will often allow IV users to simplify 1213 // (especially if the preheader value is a constant). 1214 SmallPtrSet<Instruction *, 16> Visited; 1215 while (!Worklist.empty()) { 1216 auto *I = cast<Instruction>(Worklist.pop_back_val()); 1217 if (!Visited.insert(I).second) 1218 continue; 1219 1220 // Don't simplify instructions outside the loop. 1221 if (!L->contains(I)) 1222 continue; 1223 1224 Value *Res = simplifyInstruction(I, I->getModule()->getDataLayout()); 1225 if (Res && LI->replacementPreservesLCSSAForm(I, Res)) { 1226 for (User *U : I->users()) 1227 Worklist.push_back(cast<Instruction>(U)); 1228 I->replaceAllUsesWith(Res); 1229 DeadInsts.emplace_back(I); 1230 } 1231 } 1232 } 1233 1234 static Value * 1235 createInvariantCond(const Loop *L, BasicBlock *ExitingBB, 1236 const ScalarEvolution::LoopInvariantPredicate &LIP, 1237 SCEVExpander &Rewriter) { 1238 ICmpInst::Predicate InvariantPred = LIP.Pred; 1239 BasicBlock *Preheader = L->getLoopPreheader(); 1240 assert(Preheader && "Preheader doesn't exist"); 1241 Rewriter.setInsertPoint(Preheader->getTerminator()); 1242 auto *LHSV = Rewriter.expandCodeFor(LIP.LHS); 1243 auto *RHSV = Rewriter.expandCodeFor(LIP.RHS); 1244 bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); 1245 if (ExitIfTrue) 1246 InvariantPred = ICmpInst::getInversePredicate(InvariantPred); 1247 IRBuilder<> Builder(Preheader->getTerminator()); 1248 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1249 return Builder.CreateICmp(InvariantPred, LHSV, RHSV, 1250 BI->getCondition()->getName()); 1251 } 1252 1253 static std::optional<Value *> 1254 createReplacement(ICmpInst *ICmp, const Loop *L, BasicBlock *ExitingBB, 1255 const SCEV *MaxIter, bool Inverted, bool SkipLastIter, 1256 ScalarEvolution *SE, SCEVExpander &Rewriter) { 1257 ICmpInst::Predicate Pred = ICmp->getPredicate(); 1258 Value *LHS = ICmp->getOperand(0); 1259 Value *RHS = ICmp->getOperand(1); 1260 1261 // 'LHS pred RHS' should now mean that we stay in loop. 1262 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1263 if (Inverted) 1264 Pred = CmpInst::getInversePredicate(Pred); 1265 1266 const SCEV *LHSS = SE->getSCEVAtScope(LHS, L); 1267 const SCEV *RHSS = SE->getSCEVAtScope(RHS, L); 1268 // Can we prove it to be trivially true or false? 1269 if (auto EV = SE->evaluatePredicateAt(Pred, LHSS, RHSS, BI)) 1270 return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ !*EV); 1271 1272 auto *ARTy = LHSS->getType(); 1273 auto *MaxIterTy = MaxIter->getType(); 1274 // If possible, adjust types. 1275 if (SE->getTypeSizeInBits(ARTy) > SE->getTypeSizeInBits(MaxIterTy)) 1276 MaxIter = SE->getZeroExtendExpr(MaxIter, ARTy); 1277 else if (SE->getTypeSizeInBits(ARTy) < SE->getTypeSizeInBits(MaxIterTy)) { 1278 const SCEV *MinusOne = SE->getMinusOne(ARTy); 1279 auto *MaxAllowedIter = SE->getZeroExtendExpr(MinusOne, MaxIterTy); 1280 if (SE->isKnownPredicateAt(ICmpInst::ICMP_ULE, MaxIter, MaxAllowedIter, BI)) 1281 MaxIter = SE->getTruncateExpr(MaxIter, ARTy); 1282 } 1283 1284 if (SkipLastIter) { 1285 // Semantically skip last iter is "subtract 1, do not bother about unsigned 1286 // wrap". getLoopInvariantExitCondDuringFirstIterations knows how to deal 1287 // with umin in a smart way, but umin(a, b) - 1 will likely not simplify. 1288 // So we manually construct umin(a - 1, b - 1). 1289 SmallVector<const SCEV *, 4> Elements; 1290 if (auto *UMin = dyn_cast<SCEVUMinExpr>(MaxIter)) { 1291 for (auto *Op : UMin->operands()) 1292 Elements.push_back(SE->getMinusSCEV(Op, SE->getOne(Op->getType()))); 1293 MaxIter = SE->getUMinFromMismatchedTypes(Elements); 1294 } else 1295 MaxIter = SE->getMinusSCEV(MaxIter, SE->getOne(MaxIter->getType())); 1296 } 1297 1298 // Check if there is a loop-invariant predicate equivalent to our check. 1299 auto LIP = SE->getLoopInvariantExitCondDuringFirstIterations(Pred, LHSS, RHSS, 1300 L, BI, MaxIter); 1301 if (!LIP) 1302 return std::nullopt; 1303 1304 // Can we prove it to be trivially true? 1305 if (SE->isKnownPredicateAt(LIP->Pred, LIP->LHS, LIP->RHS, BI)) 1306 return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ false); 1307 else 1308 return createInvariantCond(L, ExitingBB, *LIP, Rewriter); 1309 } 1310 1311 static bool optimizeLoopExitWithUnknownExitCount( 1312 const Loop *L, BranchInst *BI, BasicBlock *ExitingBB, const SCEV *MaxIter, 1313 bool SkipLastIter, ScalarEvolution *SE, SCEVExpander &Rewriter, 1314 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 1315 assert( 1316 (L->contains(BI->getSuccessor(0)) != L->contains(BI->getSuccessor(1))) && 1317 "Not a loop exit!"); 1318 1319 // For branch that stays in loop by TRUE condition, go through AND. For branch 1320 // that stays in loop by FALSE condition, go through OR. Both gives the 1321 // similar logic: "stay in loop iff all conditions are true(false)". 1322 bool Inverted = L->contains(BI->getSuccessor(1)); 1323 SmallVector<ICmpInst *, 4> LeafConditions; 1324 SmallVector<Value *, 4> Worklist; 1325 SmallPtrSet<Value *, 4> Visited; 1326 Value *OldCond = BI->getCondition(); 1327 Visited.insert(OldCond); 1328 Worklist.push_back(OldCond); 1329 1330 auto GoThrough = [&](Value *V) { 1331 Value *LHS = nullptr, *RHS = nullptr; 1332 if (Inverted) { 1333 if (!match(V, m_LogicalOr(m_Value(LHS), m_Value(RHS)))) 1334 return false; 1335 } else { 1336 if (!match(V, m_LogicalAnd(m_Value(LHS), m_Value(RHS)))) 1337 return false; 1338 } 1339 if (Visited.insert(LHS).second) 1340 Worklist.push_back(LHS); 1341 if (Visited.insert(RHS).second) 1342 Worklist.push_back(RHS); 1343 return true; 1344 }; 1345 1346 do { 1347 Value *Curr = Worklist.pop_back_val(); 1348 // Go through AND/OR conditions. Collect leaf ICMPs. We only care about 1349 // those with one use, to avoid instruction duplication. 1350 if (Curr->hasOneUse()) 1351 if (!GoThrough(Curr)) 1352 if (auto *ICmp = dyn_cast<ICmpInst>(Curr)) 1353 LeafConditions.push_back(ICmp); 1354 } while (!Worklist.empty()); 1355 1356 // If the current basic block has the same exit count as the whole loop, and 1357 // it consists of multiple icmp's, try to collect all icmp's that give exact 1358 // same exit count. For all other icmp's, we could use one less iteration, 1359 // because their value on the last iteration doesn't really matter. 1360 SmallPtrSet<ICmpInst *, 4> ICmpsFailingOnLastIter; 1361 if (!SkipLastIter && LeafConditions.size() > 1 && 1362 SE->getExitCount(L, ExitingBB, 1363 ScalarEvolution::ExitCountKind::SymbolicMaximum) == 1364 MaxIter) 1365 for (auto *ICmp : LeafConditions) { 1366 auto EL = SE->computeExitLimitFromCond(L, ICmp, Inverted, 1367 /*ControlsExit*/ false); 1368 auto *ExitMax = EL.SymbolicMaxNotTaken; 1369 if (isa<SCEVCouldNotCompute>(ExitMax)) 1370 continue; 1371 // They could be of different types (specifically this happens after 1372 // IV widening). 1373 auto *WiderType = 1374 SE->getWiderType(ExitMax->getType(), MaxIter->getType()); 1375 auto *WideExitMax = SE->getNoopOrZeroExtend(ExitMax, WiderType); 1376 auto *WideMaxIter = SE->getNoopOrZeroExtend(MaxIter, WiderType); 1377 if (WideExitMax == WideMaxIter) 1378 ICmpsFailingOnLastIter.insert(ICmp); 1379 } 1380 1381 bool Changed = false; 1382 for (auto *OldCond : LeafConditions) { 1383 // Skip last iteration for this icmp under one of two conditions: 1384 // - We do it for all conditions; 1385 // - There is another ICmp that would fail on last iter, so this one doesn't 1386 // really matter. 1387 bool OptimisticSkipLastIter = SkipLastIter; 1388 if (!OptimisticSkipLastIter) { 1389 if (ICmpsFailingOnLastIter.size() > 1) 1390 OptimisticSkipLastIter = true; 1391 else if (ICmpsFailingOnLastIter.size() == 1) 1392 OptimisticSkipLastIter = !ICmpsFailingOnLastIter.count(OldCond); 1393 } 1394 if (auto Replaced = 1395 createReplacement(OldCond, L, ExitingBB, MaxIter, Inverted, 1396 OptimisticSkipLastIter, SE, Rewriter)) { 1397 Changed = true; 1398 auto *NewCond = *Replaced; 1399 if (auto *NCI = dyn_cast<Instruction>(NewCond)) { 1400 NCI->setName(OldCond->getName() + ".first_iter"); 1401 } 1402 LLVM_DEBUG(dbgs() << "Unknown exit count: Replacing " << *OldCond 1403 << " with " << *NewCond << "\n"); 1404 assert(OldCond->hasOneUse() && "Must be!"); 1405 OldCond->replaceAllUsesWith(NewCond); 1406 DeadInsts.push_back(OldCond); 1407 // Make sure we no longer consider this condition as failing on last 1408 // iteration. 1409 ICmpsFailingOnLastIter.erase(OldCond); 1410 } 1411 } 1412 return Changed; 1413 } 1414 1415 bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { 1416 // Note: This is duplicating a particular part on SimplifyIndVars reasoning. 1417 // We need to duplicate it because given icmp zext(small-iv), C, IVUsers 1418 // never reaches the icmp since the zext doesn't fold to an AddRec unless 1419 // it already has flags. The alternative to this would be to extending the 1420 // set of "interesting" IV users to include the icmp, but doing that 1421 // regresses results in practice by querying SCEVs before trip counts which 1422 // rely on them which results in SCEV caching sub-optimal answers. The 1423 // concern about caching sub-optimal results is why we only query SCEVs of 1424 // the loop invariant RHS here. 1425 SmallVector<BasicBlock*, 16> ExitingBlocks; 1426 L->getExitingBlocks(ExitingBlocks); 1427 bool Changed = false; 1428 for (auto *ExitingBB : ExitingBlocks) { 1429 auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1430 if (!BI) 1431 continue; 1432 assert(BI->isConditional() && "exit branch must be conditional"); 1433 1434 auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); 1435 if (!ICmp || !ICmp->hasOneUse()) 1436 continue; 1437 1438 auto *LHS = ICmp->getOperand(0); 1439 auto *RHS = ICmp->getOperand(1); 1440 // For the range reasoning, avoid computing SCEVs in the loop to avoid 1441 // poisoning cache with sub-optimal results. For the must-execute case, 1442 // this is a neccessary precondition for correctness. 1443 if (!L->isLoopInvariant(RHS)) { 1444 if (!L->isLoopInvariant(LHS)) 1445 continue; 1446 // Same logic applies for the inverse case 1447 std::swap(LHS, RHS); 1448 } 1449 1450 // Match (icmp signed-cond zext, RHS) 1451 Value *LHSOp = nullptr; 1452 if (!match(LHS, m_ZExt(m_Value(LHSOp))) || !ICmp->isSigned()) 1453 continue; 1454 1455 const DataLayout &DL = ExitingBB->getModule()->getDataLayout(); 1456 const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); 1457 const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); 1458 auto FullCR = ConstantRange::getFull(InnerBitWidth); 1459 FullCR = FullCR.zeroExtend(OuterBitWidth); 1460 auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L)); 1461 if (FullCR.contains(RHSCR)) { 1462 // We have now matched icmp signed-cond zext(X), zext(Y'), and can thus 1463 // replace the signed condition with the unsigned version. 1464 ICmp->setPredicate(ICmp->getUnsignedPredicate()); 1465 Changed = true; 1466 // Note: No SCEV invalidation needed. We've changed the predicate, but 1467 // have not changed exit counts, or the values produced by the compare. 1468 continue; 1469 } 1470 } 1471 1472 // Now that we've canonicalized the condition to match the extend, 1473 // see if we can rotate the extend out of the loop. 1474 for (auto *ExitingBB : ExitingBlocks) { 1475 auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1476 if (!BI) 1477 continue; 1478 assert(BI->isConditional() && "exit branch must be conditional"); 1479 1480 auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); 1481 if (!ICmp || !ICmp->hasOneUse() || !ICmp->isUnsigned()) 1482 continue; 1483 1484 bool Swapped = false; 1485 auto *LHS = ICmp->getOperand(0); 1486 auto *RHS = ICmp->getOperand(1); 1487 if (L->isLoopInvariant(LHS) == L->isLoopInvariant(RHS)) 1488 // Nothing to rotate 1489 continue; 1490 if (L->isLoopInvariant(LHS)) { 1491 // Same logic applies for the inverse case until we actually pick 1492 // which operand of the compare to update. 1493 Swapped = true; 1494 std::swap(LHS, RHS); 1495 } 1496 assert(!L->isLoopInvariant(LHS) && L->isLoopInvariant(RHS)); 1497 1498 // Match (icmp unsigned-cond zext, RHS) 1499 // TODO: Extend to handle corresponding sext/signed-cmp case 1500 // TODO: Extend to other invertible functions 1501 Value *LHSOp = nullptr; 1502 if (!match(LHS, m_ZExt(m_Value(LHSOp)))) 1503 continue; 1504 1505 // In general, we only rotate if we can do so without increasing the number 1506 // of instructions. The exception is when we have an zext(add-rec). The 1507 // reason for allowing this exception is that we know we need to get rid 1508 // of the zext for SCEV to be able to compute a trip count for said loops; 1509 // we consider the new trip count valuable enough to increase instruction 1510 // count by one. 1511 if (!LHS->hasOneUse() && !isa<SCEVAddRecExpr>(SE->getSCEV(LHSOp))) 1512 continue; 1513 1514 // Given a icmp unsigned-cond zext(Op) where zext(trunc(RHS)) == RHS 1515 // replace with an icmp of the form icmp unsigned-cond Op, trunc(RHS) 1516 // when zext is loop varying and RHS is loop invariant. This converts 1517 // loop varying work to loop-invariant work. 1518 auto doRotateTransform = [&]() { 1519 assert(ICmp->isUnsigned() && "must have proven unsigned already"); 1520 auto *NewRHS = CastInst::Create( 1521 Instruction::Trunc, RHS, LHSOp->getType(), "", 1522 L->getLoopPreheader()->getTerminator()->getIterator()); 1523 ICmp->setOperand(Swapped ? 1 : 0, LHSOp); 1524 ICmp->setOperand(Swapped ? 0 : 1, NewRHS); 1525 if (LHS->use_empty()) 1526 DeadInsts.push_back(LHS); 1527 }; 1528 1529 1530 const DataLayout &DL = ExitingBB->getModule()->getDataLayout(); 1531 const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); 1532 const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); 1533 auto FullCR = ConstantRange::getFull(InnerBitWidth); 1534 FullCR = FullCR.zeroExtend(OuterBitWidth); 1535 auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L)); 1536 if (FullCR.contains(RHSCR)) { 1537 doRotateTransform(); 1538 Changed = true; 1539 // Note, we are leaving SCEV in an unfortunately imprecise case here 1540 // as rotation tends to reveal information about trip counts not 1541 // previously visible. 1542 continue; 1543 } 1544 } 1545 1546 return Changed; 1547 } 1548 1549 bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { 1550 SmallVector<BasicBlock*, 16> ExitingBlocks; 1551 L->getExitingBlocks(ExitingBlocks); 1552 1553 // Remove all exits which aren't both rewriteable and execute on every 1554 // iteration. 1555 llvm::erase_if(ExitingBlocks, [&](BasicBlock *ExitingBB) { 1556 // If our exitting block exits multiple loops, we can only rewrite the 1557 // innermost one. Otherwise, we're changing how many times the innermost 1558 // loop runs before it exits. 1559 if (LI->getLoopFor(ExitingBB) != L) 1560 return true; 1561 1562 // Can't rewrite non-branch yet. 1563 BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1564 if (!BI) 1565 return true; 1566 1567 // Likewise, the loop latch must be dominated by the exiting BB. 1568 if (!DT->dominates(ExitingBB, L->getLoopLatch())) 1569 return true; 1570 1571 if (auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { 1572 // If already constant, nothing to do. However, if this is an 1573 // unconditional exit, we can still replace header phis with their 1574 // preheader value. 1575 if (!L->contains(BI->getSuccessor(CI->isNullValue()))) 1576 replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE); 1577 return true; 1578 } 1579 1580 return false; 1581 }); 1582 1583 if (ExitingBlocks.empty()) 1584 return false; 1585 1586 // Get a symbolic upper bound on the loop backedge taken count. 1587 const SCEV *MaxBECount = SE->getSymbolicMaxBackedgeTakenCount(L); 1588 if (isa<SCEVCouldNotCompute>(MaxBECount)) 1589 return false; 1590 1591 // Visit our exit blocks in order of dominance. We know from the fact that 1592 // all exits must dominate the latch, so there is a total dominance order 1593 // between them. 1594 llvm::sort(ExitingBlocks, [&](BasicBlock *A, BasicBlock *B) { 1595 // std::sort sorts in ascending order, so we want the inverse of 1596 // the normal dominance relation. 1597 if (A == B) return false; 1598 if (DT->properlyDominates(A, B)) 1599 return true; 1600 else { 1601 assert(DT->properlyDominates(B, A) && 1602 "expected total dominance order!"); 1603 return false; 1604 } 1605 }); 1606 #ifdef ASSERT 1607 for (unsigned i = 1; i < ExitingBlocks.size(); i++) { 1608 assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])); 1609 } 1610 #endif 1611 1612 bool Changed = false; 1613 bool SkipLastIter = false; 1614 const SCEV *CurrMaxExit = SE->getCouldNotCompute(); 1615 auto UpdateSkipLastIter = [&](const SCEV *MaxExitCount) { 1616 if (SkipLastIter || isa<SCEVCouldNotCompute>(MaxExitCount)) 1617 return; 1618 if (isa<SCEVCouldNotCompute>(CurrMaxExit)) 1619 CurrMaxExit = MaxExitCount; 1620 else 1621 CurrMaxExit = SE->getUMinFromMismatchedTypes(CurrMaxExit, MaxExitCount); 1622 // If the loop has more than 1 iteration, all further checks will be 1623 // executed 1 iteration less. 1624 if (CurrMaxExit == MaxBECount) 1625 SkipLastIter = true; 1626 }; 1627 SmallSet<const SCEV *, 8> DominatingExactExitCounts; 1628 for (BasicBlock *ExitingBB : ExitingBlocks) { 1629 const SCEV *ExactExitCount = SE->getExitCount(L, ExitingBB); 1630 const SCEV *MaxExitCount = SE->getExitCount( 1631 L, ExitingBB, ScalarEvolution::ExitCountKind::SymbolicMaximum); 1632 if (isa<SCEVCouldNotCompute>(ExactExitCount)) { 1633 // Okay, we do not know the exit count here. Can we at least prove that it 1634 // will remain the same within iteration space? 1635 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1636 auto OptimizeCond = [&](bool SkipLastIter) { 1637 return optimizeLoopExitWithUnknownExitCount(L, BI, ExitingBB, 1638 MaxBECount, SkipLastIter, 1639 SE, Rewriter, DeadInsts); 1640 }; 1641 1642 // TODO: We might have proved that we can skip the last iteration for 1643 // this check. In this case, we only want to check the condition on the 1644 // pre-last iteration (MaxBECount - 1). However, there is a nasty 1645 // corner case: 1646 // 1647 // for (i = len; i != 0; i--) { ... check (i ult X) ... } 1648 // 1649 // If we could not prove that len != 0, then we also could not prove that 1650 // (len - 1) is not a UINT_MAX. If we simply query (len - 1), then 1651 // OptimizeCond will likely not prove anything for it, even if it could 1652 // prove the same fact for len. 1653 // 1654 // As a temporary solution, we query both last and pre-last iterations in 1655 // hope that we will be able to prove triviality for at least one of 1656 // them. We can stop querying MaxBECount for this case once SCEV 1657 // understands that (MaxBECount - 1) will not overflow here. 1658 if (OptimizeCond(false)) 1659 Changed = true; 1660 else if (SkipLastIter && OptimizeCond(true)) 1661 Changed = true; 1662 UpdateSkipLastIter(MaxExitCount); 1663 continue; 1664 } 1665 1666 UpdateSkipLastIter(ExactExitCount); 1667 1668 // If we know we'd exit on the first iteration, rewrite the exit to 1669 // reflect this. This does not imply the loop must exit through this 1670 // exit; there may be an earlier one taken on the first iteration. 1671 // We know that the backedge can't be taken, so we replace all 1672 // the header PHIs with values coming from the preheader. 1673 if (ExactExitCount->isZero()) { 1674 foldExit(L, ExitingBB, true, DeadInsts); 1675 replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE); 1676 Changed = true; 1677 continue; 1678 } 1679 1680 assert(ExactExitCount->getType()->isIntegerTy() && 1681 MaxBECount->getType()->isIntegerTy() && 1682 "Exit counts must be integers"); 1683 1684 Type *WiderType = 1685 SE->getWiderType(MaxBECount->getType(), ExactExitCount->getType()); 1686 ExactExitCount = SE->getNoopOrZeroExtend(ExactExitCount, WiderType); 1687 MaxBECount = SE->getNoopOrZeroExtend(MaxBECount, WiderType); 1688 assert(MaxBECount->getType() == ExactExitCount->getType()); 1689 1690 // Can we prove that some other exit must be taken strictly before this 1691 // one? 1692 if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT, MaxBECount, 1693 ExactExitCount)) { 1694 foldExit(L, ExitingBB, false, DeadInsts); 1695 Changed = true; 1696 continue; 1697 } 1698 1699 // As we run, keep track of which exit counts we've encountered. If we 1700 // find a duplicate, we've found an exit which would have exited on the 1701 // exiting iteration, but (from the visit order) strictly follows another 1702 // which does the same and is thus dead. 1703 if (!DominatingExactExitCounts.insert(ExactExitCount).second) { 1704 foldExit(L, ExitingBB, false, DeadInsts); 1705 Changed = true; 1706 continue; 1707 } 1708 1709 // TODO: There might be another oppurtunity to leverage SCEV's reasoning 1710 // here. If we kept track of the min of dominanting exits so far, we could 1711 // discharge exits with EC >= MDEC. This is less powerful than the existing 1712 // transform (since later exits aren't considered), but potentially more 1713 // powerful for any case where SCEV can prove a >=u b, but neither a == b 1714 // or a >u b. Such a case is not currently known. 1715 } 1716 return Changed; 1717 } 1718 1719 bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { 1720 SmallVector<BasicBlock*, 16> ExitingBlocks; 1721 L->getExitingBlocks(ExitingBlocks); 1722 1723 // Finally, see if we can rewrite our exit conditions into a loop invariant 1724 // form. If we have a read-only loop, and we can tell that we must exit down 1725 // a path which does not need any of the values computed within the loop, we 1726 // can rewrite the loop to exit on the first iteration. Note that this 1727 // doesn't either a) tell us the loop exits on the first iteration (unless 1728 // *all* exits are predicateable) or b) tell us *which* exit might be taken. 1729 // This transformation looks a lot like a restricted form of dead loop 1730 // elimination, but restricted to read-only loops and without neccesssarily 1731 // needing to kill the loop entirely. 1732 if (!LoopPredication) 1733 return false; 1734 1735 // Note: ExactBTC is the exact backedge taken count *iff* the loop exits 1736 // through *explicit* control flow. We have to eliminate the possibility of 1737 // implicit exits (see below) before we know it's truly exact. 1738 const SCEV *ExactBTC = SE->getBackedgeTakenCount(L); 1739 if (isa<SCEVCouldNotCompute>(ExactBTC) || !Rewriter.isSafeToExpand(ExactBTC)) 1740 return false; 1741 1742 assert(SE->isLoopInvariant(ExactBTC, L) && "BTC must be loop invariant"); 1743 assert(ExactBTC->getType()->isIntegerTy() && "BTC must be integer"); 1744 1745 auto BadExit = [&](BasicBlock *ExitingBB) { 1746 // If our exiting block exits multiple loops, we can only rewrite the 1747 // innermost one. Otherwise, we're changing how many times the innermost 1748 // loop runs before it exits. 1749 if (LI->getLoopFor(ExitingBB) != L) 1750 return true; 1751 1752 // Can't rewrite non-branch yet. 1753 BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1754 if (!BI) 1755 return true; 1756 1757 // If already constant, nothing to do. 1758 if (isa<Constant>(BI->getCondition())) 1759 return true; 1760 1761 // If the exit block has phis, we need to be able to compute the values 1762 // within the loop which contains them. This assumes trivially lcssa phis 1763 // have already been removed; TODO: generalize 1764 BasicBlock *ExitBlock = 1765 BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0); 1766 if (!ExitBlock->phis().empty()) 1767 return true; 1768 1769 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1770 if (isa<SCEVCouldNotCompute>(ExitCount) || 1771 !Rewriter.isSafeToExpand(ExitCount)) 1772 return true; 1773 1774 assert(SE->isLoopInvariant(ExitCount, L) && 1775 "Exit count must be loop invariant"); 1776 assert(ExitCount->getType()->isIntegerTy() && "Exit count must be integer"); 1777 return false; 1778 }; 1779 1780 // If we have any exits which can't be predicated themselves, than we can't 1781 // predicate any exit which isn't guaranteed to execute before it. Consider 1782 // two exits (a) and (b) which would both exit on the same iteration. If we 1783 // can predicate (b), but not (a), and (a) preceeds (b) along some path, then 1784 // we could convert a loop from exiting through (a) to one exiting through 1785 // (b). Note that this problem exists only for exits with the same exit 1786 // count, and we could be more aggressive when exit counts are known inequal. 1787 llvm::sort(ExitingBlocks, 1788 [&](BasicBlock *A, BasicBlock *B) { 1789 // std::sort sorts in ascending order, so we want the inverse of 1790 // the normal dominance relation, plus a tie breaker for blocks 1791 // unordered by dominance. 1792 if (DT->properlyDominates(A, B)) return true; 1793 if (DT->properlyDominates(B, A)) return false; 1794 return A->getName() < B->getName(); 1795 }); 1796 // Check to see if our exit blocks are a total order (i.e. a linear chain of 1797 // exits before the backedge). If they aren't, reasoning about reachability 1798 // is complicated and we choose not to for now. 1799 for (unsigned i = 1; i < ExitingBlocks.size(); i++) 1800 if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])) 1801 return false; 1802 1803 // Given our sorted total order, we know that exit[j] must be evaluated 1804 // after all exit[i] such j > i. 1805 for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++) 1806 if (BadExit(ExitingBlocks[i])) { 1807 ExitingBlocks.resize(i); 1808 break; 1809 } 1810 1811 if (ExitingBlocks.empty()) 1812 return false; 1813 1814 // We rely on not being able to reach an exiting block on a later iteration 1815 // then it's statically compute exit count. The implementaton of 1816 // getExitCount currently has this invariant, but assert it here so that 1817 // breakage is obvious if this ever changes.. 1818 assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) { 1819 return DT->dominates(ExitingBB, L->getLoopLatch()); 1820 })); 1821 1822 // At this point, ExitingBlocks consists of only those blocks which are 1823 // predicatable. Given that, we know we have at least one exit we can 1824 // predicate if the loop is doesn't have side effects and doesn't have any 1825 // implicit exits (because then our exact BTC isn't actually exact). 1826 // @Reviewers - As structured, this is O(I^2) for loop nests. Any 1827 // suggestions on how to improve this? I can obviously bail out for outer 1828 // loops, but that seems less than ideal. MemorySSA can find memory writes, 1829 // is that enough for *all* side effects? 1830 for (BasicBlock *BB : L->blocks()) 1831 for (auto &I : *BB) 1832 // TODO:isGuaranteedToTransfer 1833 if (I.mayHaveSideEffects()) 1834 return false; 1835 1836 bool Changed = false; 1837 // Finally, do the actual predication for all predicatable blocks. A couple 1838 // of notes here: 1839 // 1) We don't bother to constant fold dominated exits with identical exit 1840 // counts; that's simply a form of CSE/equality propagation and we leave 1841 // it for dedicated passes. 1842 // 2) We insert the comparison at the branch. Hoisting introduces additional 1843 // legality constraints and we leave that to dedicated logic. We want to 1844 // predicate even if we can't insert a loop invariant expression as 1845 // peeling or unrolling will likely reduce the cost of the otherwise loop 1846 // varying check. 1847 Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator()); 1848 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 1849 Value *ExactBTCV = nullptr; // Lazily generated if needed. 1850 for (BasicBlock *ExitingBB : ExitingBlocks) { 1851 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1852 1853 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1854 Value *NewCond; 1855 if (ExitCount == ExactBTC) { 1856 NewCond = L->contains(BI->getSuccessor(0)) ? 1857 B.getFalse() : B.getTrue(); 1858 } else { 1859 Value *ECV = Rewriter.expandCodeFor(ExitCount); 1860 if (!ExactBTCV) 1861 ExactBTCV = Rewriter.expandCodeFor(ExactBTC); 1862 Value *RHS = ExactBTCV; 1863 if (ECV->getType() != RHS->getType()) { 1864 Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); 1865 ECV = B.CreateZExt(ECV, WiderTy); 1866 RHS = B.CreateZExt(RHS, WiderTy); 1867 } 1868 auto Pred = L->contains(BI->getSuccessor(0)) ? 1869 ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; 1870 NewCond = B.CreateICmp(Pred, ECV, RHS); 1871 } 1872 Value *OldCond = BI->getCondition(); 1873 BI->setCondition(NewCond); 1874 if (OldCond->use_empty()) 1875 DeadInsts.emplace_back(OldCond); 1876 Changed = true; 1877 } 1878 1879 return Changed; 1880 } 1881 1882 //===----------------------------------------------------------------------===// 1883 // IndVarSimplify driver. Manage several subpasses of IV simplification. 1884 //===----------------------------------------------------------------------===// 1885 1886 bool IndVarSimplify::run(Loop *L) { 1887 // We need (and expect!) the incoming loop to be in LCSSA. 1888 assert(L->isRecursivelyLCSSAForm(*DT, *LI) && 1889 "LCSSA required to run indvars!"); 1890 1891 // If LoopSimplify form is not available, stay out of trouble. Some notes: 1892 // - LSR currently only supports LoopSimplify-form loops. Indvars' 1893 // canonicalization can be a pessimization without LSR to "clean up" 1894 // afterwards. 1895 // - We depend on having a preheader; in particular, 1896 // Loop::getCanonicalInductionVariable only supports loops with preheaders, 1897 // and we're in trouble if we can't find the induction variable even when 1898 // we've manually inserted one. 1899 // - LFTR relies on having a single backedge. 1900 if (!L->isLoopSimplifyForm()) 1901 return false; 1902 1903 bool Changed = false; 1904 // If there are any floating-point recurrences, attempt to 1905 // transform them to use integer recurrences. 1906 Changed |= rewriteNonIntegerIVs(L); 1907 1908 // Create a rewriter object which we'll use to transform the code with. 1909 SCEVExpander Rewriter(*SE, DL, "indvars"); 1910 #ifndef NDEBUG 1911 Rewriter.setDebugType(DEBUG_TYPE); 1912 #endif 1913 1914 // Eliminate redundant IV users. 1915 // 1916 // Simplification works best when run before other consumers of SCEV. We 1917 // attempt to avoid evaluating SCEVs for sign/zero extend operations until 1918 // other expressions involving loop IVs have been evaluated. This helps SCEV 1919 // set no-wrap flags before normalizing sign/zero extension. 1920 Rewriter.disableCanonicalMode(); 1921 Changed |= simplifyAndExtend(L, Rewriter, LI); 1922 1923 // Check to see if we can compute the final value of any expressions 1924 // that are recurrent in the loop, and substitute the exit values from the 1925 // loop into any instructions outside of the loop that use the final values 1926 // of the current expressions. 1927 if (ReplaceExitValue != NeverRepl) { 1928 if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, 1929 ReplaceExitValue, DeadInsts)) { 1930 NumReplaced += Rewrites; 1931 Changed = true; 1932 } 1933 } 1934 1935 // Eliminate redundant IV cycles. 1936 NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts, TTI); 1937 1938 // Try to convert exit conditions to unsigned and rotate computation 1939 // out of the loop. Note: Handles invalidation internally if needed. 1940 Changed |= canonicalizeExitCondition(L); 1941 1942 // Try to eliminate loop exits based on analyzeable exit counts 1943 if (optimizeLoopExits(L, Rewriter)) { 1944 Changed = true; 1945 // Given we've changed exit counts, notify SCEV 1946 // Some nested loops may share same folded exit basic block, 1947 // thus we need to notify top most loop. 1948 SE->forgetTopmostLoop(L); 1949 } 1950 1951 // Try to form loop invariant tests for loop exits by changing how many 1952 // iterations of the loop run when that is unobservable. 1953 if (predicateLoopExits(L, Rewriter)) { 1954 Changed = true; 1955 // Given we've changed exit counts, notify SCEV 1956 SE->forgetLoop(L); 1957 } 1958 1959 // If we have a trip count expression, rewrite the loop's exit condition 1960 // using it. 1961 if (!DisableLFTR) { 1962 BasicBlock *PreHeader = L->getLoopPreheader(); 1963 1964 SmallVector<BasicBlock*, 16> ExitingBlocks; 1965 L->getExitingBlocks(ExitingBlocks); 1966 for (BasicBlock *ExitingBB : ExitingBlocks) { 1967 // Can't rewrite non-branch yet. 1968 if (!isa<BranchInst>(ExitingBB->getTerminator())) 1969 continue; 1970 1971 // If our exitting block exits multiple loops, we can only rewrite the 1972 // innermost one. Otherwise, we're changing how many times the innermost 1973 // loop runs before it exits. 1974 if (LI->getLoopFor(ExitingBB) != L) 1975 continue; 1976 1977 if (!needsLFTR(L, ExitingBB)) 1978 continue; 1979 1980 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1981 if (isa<SCEVCouldNotCompute>(ExitCount)) 1982 continue; 1983 1984 // This was handled above, but as we form SCEVs, we can sometimes refine 1985 // existing ones; this allows exit counts to be folded to zero which 1986 // weren't when optimizeLoopExits saw them. Arguably, we should iterate 1987 // until stable to handle cases like this better. 1988 if (ExitCount->isZero()) 1989 continue; 1990 1991 PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT); 1992 if (!IndVar) 1993 continue; 1994 1995 // Avoid high cost expansions. Note: This heuristic is questionable in 1996 // that our definition of "high cost" is not exactly principled. 1997 if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget, 1998 TTI, PreHeader->getTerminator())) 1999 continue; 2000 2001 if (!Rewriter.isSafeToExpand(ExitCount)) 2002 continue; 2003 2004 Changed |= linearFunctionTestReplace(L, ExitingBB, 2005 ExitCount, IndVar, 2006 Rewriter); 2007 } 2008 } 2009 // Clear the rewriter cache, because values that are in the rewriter's cache 2010 // can be deleted in the loop below, causing the AssertingVH in the cache to 2011 // trigger. 2012 Rewriter.clear(); 2013 2014 // Now that we're done iterating through lists, clean up any instructions 2015 // which are now dead. 2016 while (!DeadInsts.empty()) { 2017 Value *V = DeadInsts.pop_back_val(); 2018 2019 if (PHINode *PHI = dyn_cast_or_null<PHINode>(V)) 2020 Changed |= RecursivelyDeleteDeadPHINode(PHI, TLI, MSSAU.get()); 2021 else if (Instruction *Inst = dyn_cast_or_null<Instruction>(V)) 2022 Changed |= 2023 RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get()); 2024 } 2025 2026 // The Rewriter may not be used from this point on. 2027 2028 // Loop-invariant instructions in the preheader that aren't used in the 2029 // loop may be sunk below the loop to reduce register pressure. 2030 Changed |= sinkUnusedInvariants(L); 2031 2032 // rewriteFirstIterationLoopExitValues does not rely on the computation of 2033 // trip count and therefore can further simplify exit values in addition to 2034 // rewriteLoopExitValues. 2035 Changed |= rewriteFirstIterationLoopExitValues(L); 2036 2037 // Clean up dead instructions. 2038 Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get()); 2039 2040 // Check a post-condition. 2041 assert(L->isRecursivelyLCSSAForm(*DT, *LI) && 2042 "Indvars did not preserve LCSSA!"); 2043 if (VerifyMemorySSA && MSSAU) 2044 MSSAU->getMemorySSA()->verifyMemorySSA(); 2045 2046 return Changed; 2047 } 2048 2049 PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, 2050 LoopStandardAnalysisResults &AR, 2051 LPMUpdater &) { 2052 Function *F = L.getHeader()->getParent(); 2053 const DataLayout &DL = F->getParent()->getDataLayout(); 2054 2055 IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA, 2056 WidenIndVars && AllowIVWidening); 2057 if (!IVS.run(&L)) 2058 return PreservedAnalyses::all(); 2059 2060 auto PA = getLoopPassPreservedAnalyses(); 2061 PA.preserveSet<CFGAnalyses>(); 2062 if (AR.MSSA) 2063 PA.preserve<MemorySSAAnalysis>(); 2064 return PA; 2065 } 2066