//===- IndVarSimplify.cpp - Induction Variable Elimination ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This transformation analyzes and transforms the induction variables (and
// computations derived from them) into simpler forms suitable for subsequent
// analysis and transformation.
//
// If the trip count of a loop is computable, this pass also makes the following
// changes:
//   1. The exit condition for the loop is canonicalized to compare the
//      induction value against the exit value. This turns loops like:
//        'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
//   2. Any use outside of the loop of an expression derived from the indvar
//      is changed to compute the derived value outside of the loop, eliminating
//      the dependence on the exit value of the induction variable. If the only
//      purpose of the loop is to compute the exit value of some derived
//      expression, this transformation will make the loop dead.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/IndVarSimplify.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "indvars"

STATISTIC(NumWidened     , "Number of indvars widened");
STATISTIC(NumReplaced    , "Number of exit values replaced");
STATISTIC(NumLFTR        , "Number of loop exit tests replaced");
STATISTIC(NumElimExt     , "Number of IV sign/zero extends eliminated");
STATISTIC(NumElimIV      , "Number of congruent IVs eliminated");

static cl::opt<ReplaceExitVal> ReplaceExitValue(
    "replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
    cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
    cl::values(
        clEnumValN(NeverRepl, "never", "never replace exit value"),
        clEnumValN(OnlyCheapRepl, "cheap",
                   "only replace exit value when the cost is cheap"),
        clEnumValN(
            UnusedIndVarInLoop, "unusedindvarinloop",
            "only replace exit value when it is an unused "
            "induction variable in the loop and has cheap replacement cost"),
        clEnumValN(NoHardUse, "noharduse",
                   "only replace exit values when loop def likely dead"),
        clEnumValN(AlwaysRepl, "always",
                   "always replace exit value whenever possible")));

static cl::opt<bool> UsePostIncrementRanges(
    "indvars-post-increment-ranges", cl::Hidden,
    cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
    cl::init(true));

static cl::opt<bool>
DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
            cl::desc("Disable Linear Function Test Replace optimization"));

static cl::opt<bool>
LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
                cl::desc("Predicate conditions in read only loops"));

static cl::opt<bool>
AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
                cl::desc("Allow widening of indvars to eliminate s/zext"));

namespace {

class IndVarSimplify {
  LoopInfo *LI;
  ScalarEvolution *SE;
  DominatorTree *DT;
  const DataLayout &DL;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

  SmallVector<WeakTrackingVH, 16> DeadInsts;
  bool WidenIndVars;

  bool handleFloatingPointIV(Loop *L, PHINode *PH);
  bool rewriteNonIntegerIVs(Loop *L);

  bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
  /// Try to improve our exit conditions by converting condition from signed
  /// to unsigned or rotating computation out of the loop.
  /// (See inline comment about why this is duplicated from simplifyAndExtend)
  bool canonicalizeExitCondition(Loop *L);
  /// Try to eliminate loop exits based on analyzable exit counts.
  bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter);
  /// Try to form loop invariant tests for loop exits by changing how many
  /// iterations of the loop run when that is unobservable.
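  /// (Per the "indvars-predicate-loops" flag above, this is intended for
  /// read-only loops, where running extra iterations is unobservable.)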
  bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);

  bool rewriteFirstIterationLoopExitValues(Loop *L);

  bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
                                 const SCEV *ExitCount,
                                 PHINode *IndVar, SCEVExpander &Rewriter);

  bool sinkUnusedInvariants(Loop *L);

public:
  IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                 const DataLayout &DL, TargetLibraryInfo *TLI,
                 TargetTransformInfo *TTI, MemorySSA *MSSA, bool WidenIndVars)
      : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI),
        WidenIndVars(WidenIndVars) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool run(Loop *L);
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
//===----------------------------------------------------------------------===//

/// Convert APF to an integer, if possible.
static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
  bool isExact = false;
  // See if we can convert this to an int64_t.
  uint64_t UIntVal;
  if (APF.convertToInteger(MutableArrayRef(UIntVal), 64, true,
                           APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
      !isExact)
    return false;
  IntVal = UIntVal;
  return true;
}

/// If the loop has a floating-point induction variable, then insert a
/// corresponding integer induction variable if possible.
/// For example,
///   for (double i = 0; i < 10000; ++i)
///     bar(i)
/// is converted into
///   for (int i = 0; i < 10000; ++i)
///     bar((double)i);
bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
  unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
  unsigned BackEdge     = IncomingEdge^1;

  // Check incoming value.
  auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));

  int64_t InitValue;
  if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
    return false;

  // Check IV increment. Reject this PN if the increment operation is not an
  // add or if the increment value cannot be represented by an integer.
  auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
  if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return false;

  // If this is not an add of the PHI with a constantfp, or if the constant fp
  // is not an integer, bail out.
  ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
  int64_t IncValue;
  if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
      !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
    return false;

  // Check Incr uses. One user is PN and the other user is an exit condition
  // used by the conditional terminator.
  Value::user_iterator IncrUse = Incr->user_begin();
  Instruction *U1 = cast<Instruction>(*IncrUse++);
  if (IncrUse == Incr->user_end()) return false;
  Instruction *U2 = cast<Instruction>(*IncrUse++);
  if (IncrUse != Incr->user_end()) return false;

  // Find exit condition, which is an fcmp. If it doesn't exist, or if it
  // isn't only used by a branch, we can't transform it.
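  // The increment was verified above to have exactly two users; the compare
  // may be either of them, so try both.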
  FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
  if (!Compare)
    Compare = dyn_cast<FCmpInst>(U2);
  if (!Compare || !Compare->hasOneUse() ||
      !isa<BranchInst>(Compare->user_back()))
    return false;

  BranchInst *TheBr = cast<BranchInst>(Compare->user_back());

  // We need to verify that the branch actually controls the iteration count
  // of the loop. If not, the new IV can overflow and no one will notice.
  // The branch block must be in the loop and one of the successors must be
  // out of the loop.
  assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
  if (!L->contains(TheBr->getParent()) ||
      (L->contains(TheBr->getSuccessor(0)) &&
       L->contains(TheBr->getSuccessor(1))))
    return false;

  // If it isn't a comparison with an integer-as-fp (the exit value), we can't
  // transform it.
  ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
  int64_t ExitValue;
  if (ExitValueVal == nullptr ||
      !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
    return false;

  // Find new predicate for integer comparison.
  CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
  switch (Compare->getPredicate()) {
  default: return false;  // Unknown comparison.
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
  case CmpInst::FCMP_OGE:
  case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
  case CmpInst::FCMP_OLT:
  case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
  }

  // We convert the floating point induction variable to a signed i32 value if
  // we can. This is only safe if the comparison cannot overflow in a way that
  // the equivalent integer operations would fail to detect. Check for this
  // now.
  // TODO: We could use i64 if it is native and the range requires it.

  // The start/stride/exit values must all fit in signed i32.
  if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
    return false;

  // If not actually striding (add x, 0.0), avoid touching the code.
  if (IncValue == 0)
    return false;

  // Positive and negative strides have different safety conditions.
  if (IncValue > 0) {
    // If we have a positive stride, we require the init to be less than the
    // exit value.
    if (InitValue >= ExitValue)
      return false;

    uint32_t Range = uint32_t(ExitValue-InitValue);
    // Check for infinite loop, either:
    // while (i <= Exit) or until (i > Exit)
    if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
      if (++Range == 0) return false;  // Range overflows.
    }

    unsigned Leftover = Range % uint32_t(IncValue);

    // If this is an equality comparison, we require that the strided value
    // exactly land on the exit value, otherwise the IV condition will wrap
    // around and do things the fp IV wouldn't.
    if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
        Leftover != 0)
      return false;

    // If the stride would wrap around the i32 before exiting, we can't
    // transform the IV.
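    // (That is, taking one more stride past the exit value must not overflow
    // the signed i32 range.)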
    if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
      return false;
  } else {
    // If we have a negative stride, we require the init to be greater than
    // the exit value.
    if (InitValue <= ExitValue)
      return false;

    uint32_t Range = uint32_t(InitValue-ExitValue);
    // Check for infinite loop, either:
    // while (i >= Exit) or until (i < Exit)
    if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
      if (++Range == 0) return false;  // Range overflows.
    }

    unsigned Leftover = Range % uint32_t(-IncValue);

    // If this is an equality comparison, we require that the strided value
    // exactly land on the exit value, otherwise the IV condition will wrap
    // around and do things the fp IV wouldn't.
    if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
        Leftover != 0)
      return false;

    // If the stride would wrap around the i32 before exiting, we can't
    // transform the IV.
    if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
      return false;
  }

  IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());

  // Insert new integer induction variable.
  PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
  NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
                      PN->getIncomingBlock(IncomingEdge));

  Value *NewAdd =
      BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
                                Incr->getName()+".int", Incr);
  NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));

  ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
                                      ConstantInt::get(Int32Ty, ExitValue),
                                      Compare->getName());

  // In the following deletions, PN may become dead and may be deleted.
  // Use a WeakTrackingVH to observe whether this happens.
  WeakTrackingVH WeakPH = PN;

  // Delete the old floating point exit comparison. The branch starts using
  // the new comparison.
  NewCompare->takeName(Compare);
  Compare->replaceAllUsesWith(NewCompare);
  RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get());

  // Delete the old floating point increment.
  Incr->replaceAllUsesWith(PoisonValue::get(Incr->getType()));
  RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get());

  // If the FP induction variable still has uses, this is because something
  // else in the loop uses its value. In order to canonicalize the induction
  // variable, we chose to eliminate the IV and rewrite it in terms of an
  // int->fp cast.
  //
  // We give preference to sitofp over uitofp because it is faster on most
  // platforms.
  if (WeakPH) {
    Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
                                 &*PN->getParent()->getFirstInsertionPt());
    PN->replaceAllUsesWith(Conv);
    RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get());
  }
  return true;
}

bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
  // First step. Check to see if there are any floating-point recurrences.
  // If there are, change them into integer recurrences, permitting analysis
  // by the SCEV routines.
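  // Note: collect the header phis into weak handles first, since
  // handleFloatingPointIV may delete phis as it rewrites them.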
  BasicBlock *Header = L->getHeader();

  SmallVector<WeakTrackingVH, 8> PHIs;
  for (PHINode &PN : Header->phis())
    PHIs.push_back(&PN);

  bool Changed = false;
  for (WeakTrackingVH &PHI : PHIs)
    if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHI))
      Changed |= handleFloatingPointIV(L, PN);

  // If the loop previously had a floating-point IV, ScalarEvolution
  // may not have been able to compute a trip count. Now that we've done some
  // rewriting, the trip count may be computable.
  if (Changed)
    SE->forgetLoop(L);
  return Changed;
}

//===---------------------------------------------------------------------===//
// rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know
// they will exit at the first iteration.
//===---------------------------------------------------------------------===//

/// Check to see if this loop has loop invariant conditions which lead to loop
/// exits. If so, we know that if the exit path is taken, it is at the first
/// loop iteration. This lets us predict exit values of PHI nodes that live in
/// the loop header.
bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
  // Verify the input to the pass is already in LCSSA form.
  assert(L->isLCSSAForm(*DT));

  SmallVector<BasicBlock *, 8> ExitBlocks;
  L->getUniqueExitBlocks(ExitBlocks);

  bool MadeAnyChanges = false;
  for (auto *ExitBB : ExitBlocks) {
    // If there are no more PHI nodes in this exit block, then no more
    // values defined inside the loop are used on this path.
    for (PHINode &PN : ExitBB->phis()) {
      for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues();
           IncomingValIdx != E; ++IncomingValIdx) {
        auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx);

        // Can we prove that the exit must run on the first iteration if it
        // runs at all? (i.e. early exits are fine for our purposes, but
        // traces which lead to this exit being taken on the 2nd iteration
        // aren't.) Note that this is about whether the exit branch is
        // executed, not about whether it is taken.
        if (!L->getLoopLatch() ||
            !DT->dominates(IncomingBB, L->getLoopLatch()))
          continue;

        // Get condition that leads to the exit path.
        auto *TermInst = IncomingBB->getTerminator();

        Value *Cond = nullptr;
        if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
          // Must be a conditional branch, otherwise the block
          // should not be in the loop.
          Cond = BI->getCondition();
        } else if (auto *SI = dyn_cast<SwitchInst>(TermInst))
          Cond = SI->getCondition();
        else
          continue;

        if (!L->isLoopInvariant(Cond))
          continue;

        auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx));

        // Only deal with PHIs in the loop header.
        if (!ExitVal || ExitVal->getParent() != L->getHeader())
          continue;

        // If ExitVal is a PHI on the loop header, then we know its
        // value along this exit because the exit can only be taken
        // on the first iteration.
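        // On the first iteration the header phi still holds the value it
        // received from the preheader, so we can substitute that incoming
        // value directly.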
        auto *LoopPreheader = L->getLoopPreheader();
        assert(LoopPreheader && "Invalid loop");
        int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader);
        if (PreheaderIdx != -1) {
          assert(ExitVal->getParent() == L->getHeader() &&
                 "ExitVal must be in loop header");
          MadeAnyChanges = true;
          PN.setIncomingValue(IncomingValIdx,
                              ExitVal->getIncomingValue(PreheaderIdx));
          SE->forgetValue(&PN);
        }
      }
    }
  }
  return MadeAnyChanges;
}

//===----------------------------------------------------------------------===//
// IV Widening - Extend the width of an IV to cover its widest uses.
//===----------------------------------------------------------------------===//

/// Update information about the induction variable that is extended by this
/// sign or zero extend operation. This is used to determine the final width of
/// the IV before actually widening it.
static void visitIVCast(CastInst *Cast, WideIVInfo &WI,
                        ScalarEvolution *SE,
                        const TargetTransformInfo *TTI) {
  bool IsSigned = Cast->getOpcode() == Instruction::SExt;
  if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
    return;

  Type *Ty = Cast->getType();
  uint64_t Width = SE->getTypeSizeInBits(Ty);
  if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
    return;

  // Check that `Cast` actually extends the induction variable (we rely on this
  // later). This takes care of cases where `Cast` is extending a truncation of
  // the narrow induction variable, and thus can end up being narrower than the
  // "narrow" induction variable.
  uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType());
  if (NarrowIVWidth >= Width)
    return;

  // Cast is either an sext or zext up to this point.
  // We should not widen an indvar if arithmetic on the wider indvar is more
  // expensive than on the narrower indvar. We check only the cost of ADD
  // because at least an ADD is required to increment the induction variable.
  // We could compute more comprehensively the cost of all instructions on the
  // induction variable when necessary.
  if (TTI &&
      TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
          TTI->getArithmeticInstrCost(Instruction::Add,
                                      Cast->getOperand(0)->getType())) {
    return;
  }

  if (!WI.WidestNativeType ||
      Width > SE->getTypeSizeInBits(WI.WidestNativeType)) {
    WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
    WI.IsSigned = IsSigned;
    return;
  }

  // We extend the IV to satisfy the sign of its user(s), or 'signed'
  // if there are multiple users with both sign- and zero extensions,
  // in order not to introduce nondeterministic behaviour based on the
  // unspecified order of a PHI node's users-iterator.
  WI.IsSigned |= IsSigned;
}

//===----------------------------------------------------------------------===//
//  Live IV Reduction - Minimize IVs live across the loop.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
//  Simplification of IV users based on SCEV evaluation.
//===----------------------------------------------------------------------===//

namespace {

class IndVarSimplifyVisitor : public IVVisitor {
  ScalarEvolution *SE;
  const TargetTransformInfo *TTI;
  PHINode *IVPhi;

public:
  WideIVInfo WI;

  IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
                        const TargetTransformInfo *TTI,
                        const DominatorTree *DTree)
      : SE(SCEV), TTI(TTI), IVPhi(IV) {
    DT = DTree;
    WI.NarrowIV = IVPhi;
  }

  // Implement the interface used by simplifyUsersOfIV.
  void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
};

} // end anonymous namespace

/// Iteratively perform simplification on a worklist of IV users. Each
/// successive simplification may push more users which may themselves be
/// candidates for simplification.
///
/// Sign/Zero extend elimination is interleaved with IV simplification.
bool IndVarSimplify::simplifyAndExtend(Loop *L,
                                       SCEVExpander &Rewriter,
                                       LoopInfo *LI) {
  SmallVector<WideIVInfo, 8> WideIVs;

  auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
      Intrinsic::getName(Intrinsic::experimental_guard));
  bool HasGuards = GuardDecl && !GuardDecl->use_empty();

  SmallVector<PHINode *, 8> LoopPhis;
  for (PHINode &PN : L->getHeader()->phis())
    LoopPhis.push_back(&PN);

  // Each round of simplification iterates through the SimplifyIVUsers worklist
  // for all current phis, then determines whether any IVs can be
  // widened. Widening adds new phis to LoopPhis, inducing another round of
  // simplification on the wide IVs.
  bool Changed = false;
  while (!LoopPhis.empty()) {
    // Evaluate as many IV expressions as possible before widening any IVs.
    // This forces SCEV to set no-wrap flags before evaluating sign/zero
    // extension. The first time SCEV attempts to normalize sign/zero extension,
    // the result becomes final. So for the most predictable results, we delay
    // evaluation of sign/zero extension until needed, and avoid running
    // other SCEV based analysis prior to simplifyAndExtend.
    do {
      PHINode *CurrIV = LoopPhis.pop_back_val();

      // Information about sign/zero extensions of CurrIV.
      IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);

      Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter,
                                   &Visitor);

      if (Visitor.WI.WidestNativeType) {
        WideIVs.push_back(Visitor.WI);
      }
    } while (!LoopPhis.empty());

    // Continue if we disallowed widening.
    if (!WidenIndVars)
      continue;

    for (; !WideIVs.empty(); WideIVs.pop_back()) {
      unsigned ElimExt;
      unsigned Widened;
      if (PHINode *WidePhi = createWideIV(WideIVs.back(), LI, SE, Rewriter,
                                          DT, DeadInsts, ElimExt, Widened,
                                          HasGuards, UsePostIncrementRanges)) {
        NumElimExt += ElimExt;
        NumWidened += Widened;
        Changed = true;
        LoopPhis.push_back(WidePhi);
      }
    }
  }
  return Changed;
}

//===----------------------------------------------------------------------===//
//  linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
//===----------------------------------------------------------------------===//

/// Given a Value which is hoped to be part of an add recurrence in the given
/// loop, return the associated Phi node if so. Otherwise, return null. Note
/// that this is less general than SCEV's AddRec checking.
static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) {
  Instruction *IncI = dyn_cast<Instruction>(IncV);
  if (!IncI)
    return nullptr;

  switch (IncI->getOpcode()) {
  case Instruction::Add:
  case Instruction::Sub:
    break;
  case Instruction::GetElementPtr:
    // An IV counter must preserve its type.
    if (IncI->getNumOperands() == 2)
      break;
    [[fallthrough]];
  default:
    return nullptr;
  }

  PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
  if (Phi && Phi->getParent() == L->getHeader()) {
    if (L->isLoopInvariant(IncI->getOperand(1)))
      return Phi;
    return nullptr;
  }
  if (IncI->getOpcode() == Instruction::GetElementPtr)
    return nullptr;

  // Allow add/sub to be commuted.
  Phi = dyn_cast<PHINode>(IncI->getOperand(1));
  if (Phi && Phi->getParent() == L->getHeader()) {
    if (L->isLoopInvariant(IncI->getOperand(0)))
      return Phi;
  }
  return nullptr;
}

/// Whether the current loop exit test is based on this value. Currently this
/// is limited to a direct use in the loop condition.
static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) {
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
  // TODO: Allow non-icmp loop test.
  if (!ICmp)
    return false;

  // TODO: Allow indirect use.
  return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V;
}

/// linearFunctionTestReplace policy. Return true unless we can show that the
/// current exit test is already sufficiently canonical.
static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) {
  assert(L->getLoopLatch() && "Must be in simplified form");

  // Avoid converting a constant or loop invariant test back to a runtime
  // test. This is critical for when SCEV's cached ExitCount is less precise
  // than the current IR (such as after we've proven a particular exit is
  // actually dead and thus the BE count never reaches our ExitCount.)
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  if (L->isLoopInvariant(BI->getCondition()))
    return false;

  // Do LFTR to simplify the exit condition to an ICMP.
  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return true;

  // Do LFTR to simplify the exit ICMP to EQ/NE.
  ICmpInst::Predicate Pred = Cond->getPredicate();
  if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
    return true;

  // Look for a loop invariant RHS.
  Value *LHS = Cond->getOperand(0);
  Value *RHS = Cond->getOperand(1);
  if (!L->isLoopInvariant(RHS)) {
    if (!L->isLoopInvariant(LHS))
      return true;
    std::swap(LHS, RHS);
  }
  // Look for a simple IV counter LHS.
  PHINode *Phi = dyn_cast<PHINode>(LHS);
  if (!Phi)
    Phi = getLoopPhiForCounter(LHS, L);

  if (!Phi)
    return true;

  // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
  int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
  if (Idx < 0)
    return true;

  // Do LFTR if the exit condition's IV is *not* a simple counter.
  Value *IncV = Phi->getIncomingValue(Idx);
  return Phi != getLoopPhiForCounter(IncV, L);
}

/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
/// down to checking that all operands are constant and listing instructions
/// that may hide undef.
static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
                               unsigned Depth) {
  if (isa<Constant>(V))
    return !isa<UndefValue>(V);

  if (Depth >= 6)
    return false;

  // Conservatively handle non-constant non-instructions. For example,
  // Arguments may be undef.
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  // Load and return values may be undef.
  if (I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
    return false;

  // Optimistically handle other instructions.
  for (Value *Op : I->operands()) {
    if (!Visited.insert(Op).second)
      continue;
    if (!hasConcreteDefImpl(Op, Visited, Depth+1))
      return false;
  }
  return true;
}

/// Return true if the given value is concrete. We must prove that undef can
/// never reach it.
///
/// TODO: If we decide that this is a good approach to checking for undef, we
/// may factor it into a common location.
static bool hasConcreteDef(Value *V) {
  SmallPtrSet<Value*, 8> Visited;
  Visited.insert(V);
  return hasConcreteDefImpl(V, Visited, 0);
}

/// Return true if the given phi is a "counter" in L. A counter is an
/// add recurrence (of integer or pointer type) with an arbitrary start, and a
/// step of 1. Note that L must have exactly one latch.
static bool isLoopCounter(PHINode* Phi, Loop *L,
                          ScalarEvolution *SE) {
  assert(Phi->getParent() == L->getHeader());
  assert(L->getLoopLatch());

  if (!SE->isSCEVable(Phi->getType()))
    return false;

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
  if (!AR || AR->getLoop() != L || !AR->isAffine())
    return false;

  const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
  if (!Step || !Step->isOne())
    return false;

  int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch());
  Value *IncV = Phi->getIncomingValue(LatchIdx);
  return (getLoopPhiForCounter(IncV, L) == Phi &&
          isa<SCEVAddRecExpr>(SE->getSCEV(IncV)));
}

/// Search the loop header for a loop counter (an add rec w/ step of one)
/// suitable for use by LFTR. If multiple counters are available, select the
/// "best" one based on profitability heuristics.
///
/// BECount may be an i8* pointer type. The pointer difference is already a
/// valid count without scaling the address stride, so it remains a pointer
/// expression as far as SCEV is concerned.
static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB,
                                const SCEV *BECount,
                                ScalarEvolution *SE, DominatorTree *DT) {
  uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());

  Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition();

  // Loop over all of the PHI nodes, looking for a simple counter.
  PHINode *BestPhi = nullptr;
  const SCEV *BestInit = nullptr;
  BasicBlock *LatchBlock = L->getLoopLatch();
  assert(LatchBlock && "Must be in simplified form");
  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
    PHINode *Phi = cast<PHINode>(I);
    if (!isLoopCounter(Phi, L, SE))
      continue;

    const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi));

    // AR may be a pointer type, while BECount is an integer type.
    // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
    // AR may not be a narrower type, or we may never exit.
    uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
    if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth))
      continue;

    // Avoid reusing a potentially undef value to compute other values that may
    // have originally had a concrete definition.
    if (!hasConcreteDef(Phi)) {
      // We explicitly allow unknown phis as long as they are already used by
      // the loop exit test. This is legal since performing LFTR could not
      // increase the number of undef users.
      Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock);
      if (!isLoopExitTestBasedOn(Phi, ExitingBB) &&
          !isLoopExitTestBasedOn(IncPhi, ExitingBB))
        continue;
    }

    // Avoid introducing undefined behavior due to poison which didn't exist in
    // the original program. (Annoyingly, the rules for poison and undef
    // propagation are distinct, so this does NOT cover the undef case above.)
    // We have to ensure that we don't introduce UB by introducing a use on an
    // iteration where said IV produces poison. Our strategy here differs for
    // pointers and integer IVs. For integers, we strip and reinfer as needed,
    // see code in linearFunctionTestReplace. For pointers, we restrict
    // transforms as there is no good way to reinfer inbounds once lost.
    if (!Phi->getType()->isIntegerTy() &&
        !mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT))
      continue;

    const SCEV *Init = AR->getStart();

    if (BestPhi && !isAlmostDeadIV(BestPhi, LatchBlock, Cond)) {
      // Don't force a live loop counter if another IV can be used.
      if (isAlmostDeadIV(Phi, LatchBlock, Cond))
        continue;

      // Prefer to count-from-zero. This is a more "canonical" counter form. It
      // also prefers integer to pointer IVs.
      if (BestInit->isZero() != Init->isZero()) {
        if (BestInit->isZero())
          continue;
      }
      // If two IVs both count from zero or both count from nonzero then the
      // narrower is likely a dead phi that has been widened. Use the wider phi
      // to allow the other to be eliminated.
      else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
        continue;
    }
    BestPhi = Phi;
    BestInit = Init;
  }
  return BestPhi;
}

/// Insert an IR expression which computes the value held by the IV IndVar
/// (which must be a loop counter w/ unit stride) after the backedge of loop L
/// is taken ExitCount times.
static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
                           const SCEV *ExitCount, bool UsePostInc, Loop *L,
                           SCEVExpander &Rewriter, ScalarEvolution *SE) {
  assert(isLoopCounter(IndVar, L, SE));
  assert(ExitCount->getType()->isIntegerTy() && "exit count must be integer");
  const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
  assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");

  // For integer IVs, truncate the IV before computing the limit unless we
  // know a priori that the limit must be a constant when evaluated in the
  // bitwidth of the IV. We prefer (potentially) keeping a truncate of the
  // IV in the loop over a (potentially) expensive expansion of the widened
  // exit count add(zext(add)) expression.
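  // For example, with an i64 IV and an i32 exit count, we evaluate the limit
  // in i32 (keeping a trunc of the IV in the loop) rather than expanding the
  // count as zext math in i64, unless both the start and the count are
  // constants.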
  if (IndVar->getType()->isIntegerTy() &&
      SE->getTypeSizeInBits(AR->getType()) >
          SE->getTypeSizeInBits(ExitCount->getType())) {
    const SCEV *IVInit = AR->getStart();
    if (!isa<SCEVConstant>(IVInit) || !isa<SCEVConstant>(ExitCount))
      AR = cast<SCEVAddRecExpr>(SE->getTruncateExpr(AR, ExitCount->getType()));
  }

  const SCEVAddRecExpr *ARBase = UsePostInc ? AR->getPostIncExpr(*SE) : AR;
  const SCEV *IVLimit = ARBase->evaluateAtIteration(ExitCount, *SE);
  assert(SE->isLoopInvariant(IVLimit, L) &&
         "Computed iteration count is not loop invariant!");
  return Rewriter.expandCodeFor(IVLimit, ARBase->getType(),
                                ExitingBB->getTerminator());
}

/// This method rewrites the exit condition of the loop to be a canonical !=
/// comparison against the incremented loop induction variable. This pass is
/// able to rewrite the exit tests of any loop where the SCEV analysis can
/// determine a loop-invariant trip count of the loop, which is actually a much
/// broader range than just linear tests.
bool IndVarSimplify::
linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
                          const SCEV *ExitCount,
                          PHINode *IndVar, SCEVExpander &Rewriter) {
  assert(L->getLoopLatch() && "Loop no longer in simplified form?");
  assert(isLoopCounter(IndVar, L, SE));
  Instruction * const IncVar =
      cast<Instruction>(IndVar->getIncomingValueForBlock(L->getLoopLatch()));

  // Initialize CmpIndVar to the preincremented IV.
  Value *CmpIndVar = IndVar;
  bool UsePostInc = false;

  // If the exiting block is the same as the backedge block, we prefer to
  // compare against the post-incremented value, otherwise we must compare
  // against the preincremented value.
  if (ExitingBB == L->getLoopLatch()) {
    // For pointer IVs, we chose to not strip inbounds which requires us not
    // to add a potentially UB introducing use. We need to either a) show
    // the loop test we're modifying is already in post-inc form, or b) show
    // that adding a use must not introduce UB.
    bool SafeToPostInc =
        IndVar->getType()->isIntegerTy() ||
        isLoopExitTestBasedOn(IncVar, ExitingBB) ||
        mustExecuteUBIfPoisonOnPathTo(IncVar, ExitingBB->getTerminator(), DT);
    if (SafeToPostInc) {
      UsePostInc = true;
      CmpIndVar = IncVar;
    }
  }

  // It may be necessary to drop nowrap flags on the incrementing instruction
  // if either LFTR moves from a pre-inc check to a post-inc check (in which
  // case the increment might have previously been poison on the last iteration
  // only) or if LFTR switches to a different IV that was previously dynamically
  // dead (and as such may be arbitrarily poison). We remove any nowrap flags
  // that SCEV didn't infer for the post-inc addrec (even if we use a pre-inc
  // check), because the pre-inc addrec flags may be adopted from the original
  // instruction, while SCEV has to explicitly prove the post-inc nowrap flags.
  // TODO: This handling is inaccurate for one case: If we switch to a
  // dynamically dead IV that wraps on the first loop iteration only, which is
  // not covered by the post-inc addrec. (If the new IV was not dynamically
  // dead, it could not be poison on the first iteration in the first place.)
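  // Concretely: a nowrap flag survives on the increment only if it was
  // already present AND SCEV independently proved it for the post-inc addrec.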
  if (auto *BO = dyn_cast<BinaryOperator>(IncVar)) {
    const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IncVar));
    if (BO->hasNoUnsignedWrap())
      BO->setHasNoUnsignedWrap(AR->hasNoUnsignedWrap());
    if (BO->hasNoSignedWrap())
      BO->setHasNoSignedWrap(AR->hasNoSignedWrap());
  }

  Value *ExitCnt = genLoopLimit(
      IndVar, ExitingBB, ExitCount, UsePostInc, L, Rewriter, SE);
  assert(ExitCnt->getType()->isPointerTy() ==
             IndVar->getType()->isPointerTy() &&
         "genLoopLimit missed a cast");

  // Insert a new icmp_ne or icmp_eq instruction before the branch.
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  ICmpInst::Predicate P;
  if (L->contains(BI->getSuccessor(0)))
    P = ICmpInst::ICMP_NE;
  else
    P = ICmpInst::ICMP_EQ;

  IRBuilder<> Builder(BI);

  // The new loop exit condition should reuse the debug location of the
  // original loop exit condition.
  if (auto *Cond = dyn_cast<Instruction>(BI->getCondition()))
    Builder.SetCurrentDebugLocation(Cond->getDebugLoc());

  // For integer IVs, if we evaluated the limit in the narrower bitwidth to
  // avoid the expensive expansion of the limit expression in the wider type,
  // emit a truncate to narrow the IV to the ExitCount type. This is safe
  // since we know (from the exit count bitwidth), that we can't self-wrap in
  // the narrower type.
  unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
  unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
  if (CmpIndVarSize > ExitCntSize) {
    assert(!CmpIndVar->getType()->isPointerTy() &&
           !ExitCnt->getType()->isPointerTy());

    // Before resorting to actually inserting the truncate, use the same
    // reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend
    // the other side of the comparison instead. We still evaluate the limit
    // in the narrower bitwidth, we just prefer a zext/sext outside the loop to
    // a truncate within it.
    bool Extended = false;
    const SCEV *IV = SE->getSCEV(CmpIndVar);
    const SCEV *TruncatedIV = SE->getTruncateExpr(IV, ExitCnt->getType());
    const SCEV *ZExtTrunc =
        SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType());

    if (ZExtTrunc == IV) {
      Extended = true;
      ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
                                   "wide.trip.count");
    } else {
      const SCEV *SExtTrunc =
          SE->getSignExtendExpr(TruncatedIV, CmpIndVar->getType());
      if (SExtTrunc == IV) {
        Extended = true;
        ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(),
                                     "wide.trip.count");
      }
    }

    if (Extended) {
      bool Discard;
      L->makeLoopInvariant(ExitCnt, Discard);
    } else
      CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
                                      "lftr.wideiv");
  }
  LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
                    << "      LHS:" << *CmpIndVar << '\n'
                    << "       op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==")
                    << "\n"
                    << "      RHS:\t" << *ExitCnt << "\n"
                    << "ExitCount:\t" << *ExitCount << "\n"
                    << "  was: " << *BI->getCondition() << "\n");

  Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
  Value *OrigCond = BI->getCondition();
  // It's tempting to use replaceAllUsesWith here to fully replace the old
  // comparison, but that's not immediately safe, since users of the old
  // comparison may not be dominated by the new comparison. Instead, just
  // update the branch to use the new comparison; in the common case this
  // will make the old comparison dead.
  BI->setCondition(Cond);
  DeadInsts.emplace_back(OrigCond);

  ++NumLFTR;
  return true;
}

//===----------------------------------------------------------------------===//
//  sinkUnusedInvariants. A late subpass to clean up loop preheaders.
//===----------------------------------------------------------------------===//

/// If there's a single exit block, sink any loop-invariant values that
/// were defined in the preheader but not used inside the loop into the
/// exit block to reduce register pressure in the loop.
bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
  BasicBlock *ExitBlock = L->getExitBlock();
  if (!ExitBlock) return false;

  BasicBlock *Preheader = L->getLoopPreheader();
  if (!Preheader) return false;

  bool MadeAnyChanges = false;
  BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
  BasicBlock::iterator I(Preheader->getTerminator());
  while (I != Preheader->begin()) {
    --I;
    // New instructions were inserted at the end of the preheader.
    if (isa<PHINode>(I))
      break;

    // Don't move instructions which might have side effects, since the side
    // effects need to complete before instructions inside the loop. Also
    // don't move instructions which might read memory, since the loop may
    // modify memory. Note that it's okay if the instruction might have
    // undefined behavior: LoopSimplify guarantees that the preheader
    // dominates the exit block.
    if (I->mayHaveSideEffects() || I->mayReadFromMemory())
      continue;

    // Skip debug info intrinsics.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip eh pad instructions.
    if (I->isEHPad())
      continue;

    // Don't sink alloca: we never want to sink static allocas out of the
    // entry block, and correctly sinking dynamic allocas requires
    // checks for stacksave/stackrestore intrinsics.
    // FIXME: Refactor this check somehow?
    if (isa<AllocaInst>(I))
      continue;

    // Determine if there is a use in or before the loop (direct or
    // otherwise).
    bool UsedInLoop = false;
    for (Use &U : I->uses()) {
      Instruction *User = cast<Instruction>(U.getUser());
      BasicBlock *UseBB = User->getParent();
      if (PHINode *P = dyn_cast<PHINode>(User)) {
        unsigned i =
            PHINode::getIncomingValueNumForOperand(U.getOperandNo());
        UseBB = P->getIncomingBlock(i);
      }
      if (UseBB == Preheader || L->contains(UseBB)) {
        UsedInLoop = true;
        break;
      }
    }

    // If there is, the def must remain in the preheader.
    if (UsedInLoop)
      continue;

    // Otherwise, sink it to the exit block.
    Instruction *ToMove = &*I;
    bool Done = false;

    if (I != Preheader->begin()) {
      // Skip debug info intrinsics.
      do {
        --I;
      } while (I->isDebugOrPseudoInst() && I != Preheader->begin());

      if (I->isDebugOrPseudoInst() && I == Preheader->begin())
        Done = true;
    } else {
      Done = true;
    }

    MadeAnyChanges = true;
    ToMove->moveBefore(*ExitBlock, InsertPt);
    SE->forgetValue(ToMove);
    if (Done) break;
    InsertPt = ToMove->getIterator();
  }

  return MadeAnyChanges;
}

static void replaceExitCond(BranchInst *BI, Value *NewCond,
                            SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  auto *OldCond = BI->getCondition();
  LLVM_DEBUG(dbgs() << "Replacing condition of loop-exiting branch " << *BI
                    << " with " << *NewCond << "\n");
  BI->setCondition(NewCond);
  if (OldCond->use_empty())
    DeadInsts.emplace_back(OldCond);
}

static Constant *createFoldedExitCond(const Loop *L, BasicBlock *ExitingBB,
                                      bool IsTaken) {
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
  auto *OldCond = BI->getCondition();
  return ConstantInt::get(OldCond->getType(),
                          IsTaken ? ExitIfTrue : !ExitIfTrue);
}

static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken,
                     SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  auto *NewCond = createFoldedExitCond(L, ExitingBB, IsTaken);
  replaceExitCond(BI, NewCond, DeadInsts);
}

static void replaceLoopPHINodesWithPreheaderValues(
    LoopInfo *LI, Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts,
    ScalarEvolution &SE) {
  assert(L->isLoopSimplifyForm() && "Should only do it in simplify form!");
  auto *LoopPreheader = L->getLoopPreheader();
  auto *LoopHeader = L->getHeader();
  SmallVector<Instruction *> Worklist;
  for (auto &PN : LoopHeader->phis()) {
    auto *PreheaderIncoming = PN.getIncomingValueForBlock(LoopPreheader);
    for (User *U : PN.users())
      Worklist.push_back(cast<Instruction>(U));
    SE.forgetValue(&PN);
    PN.replaceAllUsesWith(PreheaderIncoming);
    DeadInsts.emplace_back(&PN);
  }

  // Replacing with the preheader value will often allow IV users to simplify
  // (especially if the preheader value is a constant).
  SmallPtrSet<Instruction *, 16> Visited;
  while (!Worklist.empty()) {
    auto *I = cast<Instruction>(Worklist.pop_back_val());
    if (!Visited.insert(I).second)
      continue;

    // Don't simplify instructions outside the loop.
    if (!L->contains(I))
      continue;

    Value *Res = simplifyInstruction(I, I->getModule()->getDataLayout());
    if (Res && LI->replacementPreservesLCSSAForm(I, Res)) {
      for (User *U : I->users())
        Worklist.push_back(cast<Instruction>(U));
      I->replaceAllUsesWith(Res);
      DeadInsts.emplace_back(I);
    }
  }
}

static Value *
createInvariantCond(const Loop *L, BasicBlock *ExitingBB,
                    const ScalarEvolution::LoopInvariantPredicate &LIP,
                    SCEVExpander &Rewriter) {
  ICmpInst::Predicate InvariantPred = LIP.Pred;
  BasicBlock *Preheader = L->getLoopPreheader();
  assert(Preheader && "Preheader doesn't exist");
  Rewriter.setInsertPoint(Preheader->getTerminator());
  auto *LHSV = Rewriter.expandCodeFor(LIP.LHS);
  auto *RHSV = Rewriter.expandCodeFor(LIP.RHS);
  bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
  if (ExitIfTrue)
    InvariantPred = ICmpInst::getInversePredicate(InvariantPred);
  IRBuilder<> Builder(Preheader->getTerminator());
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  return Builder.CreateICmp(InvariantPred, LHSV, RHSV,
                            BI->getCondition()->getName());
}

static std::optional<Value *>
createReplacement(ICmpInst *ICmp, const Loop *L, BasicBlock *ExitingBB,
                  const SCEV *MaxIter, bool Inverted, bool SkipLastIter,
                  ScalarEvolution *SE, SCEVExpander &Rewriter) {
  ICmpInst::Predicate Pred = ICmp->getPredicate();
  Value *LHS = ICmp->getOperand(0);
  Value *RHS = ICmp->getOperand(1);

  // 'LHS pred RHS' should now mean that we stay in loop.
  auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
  if (Inverted)
    Pred = CmpInst::getInversePredicate(Pred);

  const SCEV *LHSS = SE->getSCEVAtScope(LHS, L);
  const SCEV *RHSS = SE->getSCEVAtScope(RHS, L);
  // Can we prove it to be trivially true or false?
  if (auto EV = SE->evaluatePredicateAt(Pred, LHSS, RHSS, BI))
    return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ !*EV);

  auto *ARTy = LHSS->getType();
  auto *MaxIterTy = MaxIter->getType();
  // If possible, adjust types.
  if (SE->getTypeSizeInBits(ARTy) > SE->getTypeSizeInBits(MaxIterTy))
    MaxIter = SE->getZeroExtendExpr(MaxIter, ARTy);
  else if (SE->getTypeSizeInBits(ARTy) < SE->getTypeSizeInBits(MaxIterTy)) {
    const SCEV *MinusOne = SE->getMinusOne(ARTy);
    auto *MaxAllowedIter = SE->getZeroExtendExpr(MinusOne, MaxIterTy);
    if (SE->isKnownPredicateAt(ICmpInst::ICMP_ULE, MaxIter, MaxAllowedIter, BI))
      MaxIter = SE->getTruncateExpr(MaxIter, ARTy);
  }

  if (SkipLastIter) {
    // Semantically, skipping the last iteration means "subtract 1, and do not
    // bother about unsigned wrap".
    // getLoopInvariantExitCondDuringFirstIterations knows how to deal with
    // umin in a smart way, but umin(a, b) - 1 will likely not simplify.
    // So we manually construct umin(a - 1, b - 1).
    SmallVector<const SCEV *, 4> Elements;
    if (auto *UMin = dyn_cast<SCEVUMinExpr>(MaxIter)) {
      for (auto *Op : UMin->operands())
        Elements.push_back(SE->getMinusSCEV(Op, SE->getOne(Op->getType())));
      MaxIter = SE->getUMinFromMismatchedTypes(Elements);
    } else
      MaxIter = SE->getMinusSCEV(MaxIter, SE->getOne(MaxIter->getType()));
  }

  // Check if there is a loop-invariant predicate equivalent to our check.
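  // That is, ask SCEV for a predicate that is invariant in L and agrees with
  // 'Pred LHSS RHSS' throughout the first MaxIter iterations.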
  auto LIP = SE->getLoopInvariantExitCondDuringFirstIterations(Pred, LHSS, RHSS,
                                                               L, BI, MaxIter);
  if (!LIP)
    return std::nullopt;

  // Can we prove it to be trivially true?
  if (SE->isKnownPredicateAt(LIP->Pred, LIP->LHS, LIP->RHS, BI))
    return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ false);
  else
    return createInvariantCond(L, ExitingBB, *LIP, Rewriter);
}

static bool optimizeLoopExitWithUnknownExitCount(
    const Loop *L, BranchInst *BI, BasicBlock *ExitingBB, const SCEV *MaxIter,
    bool SkipLastIter, ScalarEvolution *SE, SCEVExpander &Rewriter,
    SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  assert(
      (L->contains(BI->getSuccessor(0)) != L->contains(BI->getSuccessor(1))) &&
      "Not a loop exit!");

  // For a branch that stays in the loop by the TRUE condition, go through AND.
  // For a branch that stays in the loop by the FALSE condition, go through OR.
  // Both give similar logic: "stay in loop iff all conditions are true(false)".
  bool Inverted = L->contains(BI->getSuccessor(1));
  SmallVector<ICmpInst *, 4> LeafConditions;
  SmallVector<Value *, 4> Worklist;
  SmallPtrSet<Value *, 4> Visited;
  Value *OldCond = BI->getCondition();
  Visited.insert(OldCond);
  Worklist.push_back(OldCond);

  auto GoThrough = [&](Value *V) {
    Value *LHS = nullptr, *RHS = nullptr;
    if (Inverted) {
      if (!match(V, m_LogicalOr(m_Value(LHS), m_Value(RHS))))
        return false;
    } else {
      if (!match(V, m_LogicalAnd(m_Value(LHS), m_Value(RHS))))
        return false;
    }
    if (Visited.insert(LHS).second)
      Worklist.push_back(LHS);
    if (Visited.insert(RHS).second)
      Worklist.push_back(RHS);
    return true;
  };

  do {
    Value *Curr = Worklist.pop_back_val();
    // Go through AND/OR conditions. Collect leaf ICMPs. We only care about
    // those with one use, to avoid instruction duplication.
    if (Curr->hasOneUse())
      if (!GoThrough(Curr))
        if (auto *ICmp = dyn_cast<ICmpInst>(Curr))
          LeafConditions.push_back(ICmp);
  } while (!Worklist.empty());

  // If the current basic block has the same exit count as the whole loop, and
  // it consists of multiple icmp's, try to collect all icmp's that give the
  // exact same exit count. For all other icmp's, we could use one less
  // iteration, because their value on the last iteration doesn't really
  // matter.
  SmallPtrSet<ICmpInst *, 4> ICmpsFailingOnLastIter;
  if (!SkipLastIter && LeafConditions.size() > 1 &&
      SE->getExitCount(L, ExitingBB,
                       ScalarEvolution::ExitCountKind::SymbolicMaximum) ==
          MaxIter)
    for (auto *ICmp : LeafConditions) {
      auto EL = SE->computeExitLimitFromCond(L, ICmp, Inverted,
                                             /*ControlsExit*/ false);
      auto *ExitMax = EL.SymbolicMaxNotTaken;
      if (isa<SCEVCouldNotCompute>(ExitMax))
        continue;
      // They could be of different types (specifically this happens after
      // IV widening).
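      // Compare the two counts in the wider of the two types, zero-extending
      // the narrower one; exit counts are unsigned quantities.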
      auto *WiderType =
          SE->getWiderType(ExitMax->getType(), MaxIter->getType());
      auto *WideExitMax = SE->getNoopOrZeroExtend(ExitMax, WiderType);
      auto *WideMaxIter = SE->getNoopOrZeroExtend(MaxIter, WiderType);
      if (WideExitMax == WideMaxIter)
        ICmpsFailingOnLastIter.insert(ICmp);
    }

  bool Changed = false;
  for (auto *OldCond : LeafConditions) {
    // Skip last iteration for this icmp under one of two conditions:
    // - We do it for all conditions;
    // - There is another ICmp that would fail on last iter, so this one
    //   doesn't really matter.
    bool OptimisticSkipLastIter = SkipLastIter;
    if (!OptimisticSkipLastIter) {
      if (ICmpsFailingOnLastIter.size() > 1)
        OptimisticSkipLastIter = true;
      else if (ICmpsFailingOnLastIter.size() == 1)
        OptimisticSkipLastIter = !ICmpsFailingOnLastIter.count(OldCond);
    }
    if (auto Replaced =
            createReplacement(OldCond, L, ExitingBB, MaxIter, Inverted,
                              OptimisticSkipLastIter, SE, Rewriter)) {
      Changed = true;
      auto *NewCond = *Replaced;
      if (auto *NCI = dyn_cast<Instruction>(NewCond)) {
        NCI->setName(OldCond->getName() + ".first_iter");
      }
      LLVM_DEBUG(dbgs() << "Unknown exit count: Replacing " << *OldCond
                        << " with " << *NewCond << "\n");
      assert(OldCond->hasOneUse() && "Must be!");
      OldCond->replaceAllUsesWith(NewCond);
      DeadInsts.push_back(OldCond);
      // Make sure we no longer consider this condition as failing on last
      // iteration.
      ICmpsFailingOnLastIter.erase(OldCond);
    }
  }
  return Changed;
}

bool IndVarSimplify::canonicalizeExitCondition(Loop *L) {
  // Note: This is duplicating a particular part of SimplifyIndVars' reasoning.
  // We need to duplicate it because, given icmp zext(small-iv), C, IVUsers
  // never reaches the icmp since the zext doesn't fold to an AddRec unless
  // it already has flags. The alternative to this would be extending the
  // set of "interesting" IV users to include the icmp, but doing that
  // regresses results in practice: it queries SCEVs before the trip counts
  // which rely on them, causing SCEV to cache sub-optimal answers. The
  // concern about caching sub-optimal results is why we only query SCEVs of
  // the loop invariant RHS here.
  SmallVector<BasicBlock*, 16> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  bool Changed = false;
  for (auto *ExitingBB : ExitingBlocks) {
    auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
    if (!BI)
      continue;
    assert(BI->isConditional() && "exit branch must be conditional");

    auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
    if (!ICmp || !ICmp->hasOneUse())
      continue;

    auto *LHS = ICmp->getOperand(0);
    auto *RHS = ICmp->getOperand(1);
    // For the range reasoning, avoid computing SCEVs in the loop to avoid
    // poisoning the cache with sub-optimal results. For the must-execute
    // case, this is a necessary precondition for correctness.
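    // Canonicalize so that LHS is the loop-varying operand and RHS the
    // invariant one; give up if neither operand is invariant.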
  SmallVector<BasicBlock*, 16> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  bool Changed = false;
  for (auto *ExitingBB : ExitingBlocks) {
    auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
    if (!BI)
      continue;
    assert(BI->isConditional() && "exit branch must be conditional");

    auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
    if (!ICmp || !ICmp->hasOneUse())
      continue;

    auto *LHS = ICmp->getOperand(0);
    auto *RHS = ICmp->getOperand(1);
    // For the range reasoning, avoid computing SCEVs in the loop to avoid
    // poisoning the cache with sub-optimal results. For the must-execute
    // case, this is a necessary precondition for correctness.
    if (!L->isLoopInvariant(RHS)) {
      if (!L->isLoopInvariant(LHS))
        continue;
      // Same logic applies for the inverse case.
      std::swap(LHS, RHS);
    }

    // Match (icmp signed-cond zext, RHS)
    Value *LHSOp = nullptr;
    if (!match(LHS, m_ZExt(m_Value(LHSOp))) || !ICmp->isSigned())
      continue;

    const DataLayout &DL = ExitingBB->getModule()->getDataLayout();
    const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType());
    const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType());
    auto FullCR = ConstantRange::getFull(InnerBitWidth);
    FullCR = FullCR.zeroExtend(OuterBitWidth);
    auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L));
    if (FullCR.contains(RHSCR)) {
      // We have now matched icmp signed-cond zext(X), zext(Y'), and can thus
      // replace the signed condition with the unsigned version.
      ICmp->setPredicate(ICmp->getUnsignedPredicate());
      Changed = true;
      // Note: No SCEV invalidation needed. We've changed the predicate, but
      // have not changed exit counts or the values produced by the compare.
      continue;
    }
  }

  // Now that we've canonicalized the condition to match the extend,
  // see if we can rotate the extend out of the loop.
  for (auto *ExitingBB : ExitingBlocks) {
    auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
    if (!BI)
      continue;
    assert(BI->isConditional() && "exit branch must be conditional");

    auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
    if (!ICmp || !ICmp->hasOneUse() || !ICmp->isUnsigned())
      continue;

    bool Swapped = false;
    auto *LHS = ICmp->getOperand(0);
    auto *RHS = ICmp->getOperand(1);
    if (L->isLoopInvariant(LHS) == L->isLoopInvariant(RHS))
      // Nothing to rotate.
      continue;
    if (L->isLoopInvariant(LHS)) {
      // Same logic applies for the inverse case until we actually pick
      // which operand of the compare to update.
      Swapped = true;
      std::swap(LHS, RHS);
    }
    assert(!L->isLoopInvariant(LHS) && L->isLoopInvariant(RHS));

    // Match (icmp unsigned-cond zext, RHS)
    // TODO: Extend to handle the corresponding sext/signed-cmp case
    // TODO: Extend to other invertible functions
    Value *LHSOp = nullptr;
    if (!match(LHS, m_ZExt(m_Value(LHSOp))))
      continue;

    // In general, we only rotate if we can do so without increasing the number
    // of instructions. The exception is when we have a zext(add-rec). The
    // reason for allowing this exception is that we know we need to get rid
    // of the zext for SCEV to be able to compute a trip count for such loops;
    // we consider the new trip count valuable enough to increase instruction
    // count by one.
    if (!LHS->hasOneUse() && !isa<SCEVAddRecExpr>(SE->getSCEV(LHSOp)))
      continue;

    // Given an icmp unsigned-cond zext(Op) where zext(trunc(RHS)) == RHS,
    // replace it with an icmp of the form icmp unsigned-cond Op, trunc(RHS)
    // when the zext is loop varying and RHS is loop invariant. This converts
    // loop varying work to loop-invariant work.
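    // For instance (an illustrative sketch; value names are made up): when
    // %n is known to fit the narrow type's unsigned range,
    //   loop:      %zx = zext i8 %iv to i32
    //              %c  = icmp ult i32 %zx, %n
    // can become
    //   preheader: %n.trunc = trunc i32 %n to i8
    //   loop:      %c = icmp ult i8 %iv, %n.trunc
    // turning the per-iteration extend into a single loop-invariant truncate.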
    auto doRotateTransform = [&]() {
      assert(ICmp->isUnsigned() && "must have proven unsigned already");
      auto *NewRHS =
          CastInst::Create(Instruction::Trunc, RHS, LHSOp->getType(), "",
                           L->getLoopPreheader()->getTerminator());
      ICmp->setOperand(Swapped ? 1 : 0, LHSOp);
      ICmp->setOperand(Swapped ? 0 : 1, NewRHS);
      if (LHS->use_empty())
        DeadInsts.push_back(LHS);
    };

    const DataLayout &DL = ExitingBB->getModule()->getDataLayout();
    const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType());
    const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType());
    auto FullCR = ConstantRange::getFull(InnerBitWidth);
    FullCR = FullCR.zeroExtend(OuterBitWidth);
    auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L));
    if (FullCR.contains(RHSCR)) {
      doRotateTransform();
      Changed = true;
      // Note: we are leaving SCEV in an unfortunately imprecise state here,
      // as rotation tends to reveal information about trip counts not
      // previously visible.
      continue;
    }
  }

  return Changed;
}

bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
  SmallVector<BasicBlock*, 16> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // Remove all exits which aren't both rewriteable and guaranteed to execute
  // on every iteration.
  llvm::erase_if(ExitingBlocks, [&](BasicBlock *ExitingBB) {
    // If our exiting block exits multiple loops, we can only rewrite the
    // innermost one. Otherwise, we're changing how many times the innermost
    // loop runs before it exits.
    if (LI->getLoopFor(ExitingBB) != L)
      return true;

    // Can't rewrite non-branch yet.
    BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
    if (!BI)
      return true;

    // Likewise, the loop latch must be dominated by the exiting BB.
    if (!DT->dominates(ExitingBB, L->getLoopLatch()))
      return true;

    if (auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
      // If already constant, nothing to do. However, if this is an
      // unconditional exit, we can still replace header phis with their
      // preheader value.
      if (!L->contains(BI->getSuccessor(CI->isNullValue())))
        replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE);
      return true;
    }

    return false;
  });

  if (ExitingBlocks.empty())
    return false;

  // Get a symbolic upper bound on the loop backedge taken count.
  const SCEV *MaxBECount = SE->getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(MaxBECount))
    return false;

  // Visit our exit blocks in order of dominance. Since all exits must
  // dominate the latch, there is a total dominance order between them.
  llvm::sort(ExitingBlocks, [&](BasicBlock *A, BasicBlock *B) {
    // std::sort sorts in ascending order, so we want the inverse of
    // the normal dominance relation.
    if (A == B) return false;
    if (DT->properlyDominates(A, B))
      return true;
    else {
      assert(DT->properlyDominates(B, A) &&
             "expected total dominance order!");
      return false;
    }
  });
#ifndef NDEBUG
  for (unsigned i = 1; i < ExitingBlocks.size(); i++) {
    assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
  }
#endif

  bool Changed = false;
  bool SkipLastIter = false;
  const SCEV *CurrMaxExit = SE->getCouldNotCompute();
  auto UpdateSkipLastIter = [&](const SCEV *MaxExitCount) {
    if (SkipLastIter || isa<SCEVCouldNotCompute>(MaxExitCount))
      return;
    if (isa<SCEVCouldNotCompute>(CurrMaxExit))
      CurrMaxExit = MaxExitCount;
    else
      CurrMaxExit = SE->getUMinFromMismatchedTypes(CurrMaxExit, MaxExitCount);
    // If the loop has more than 1 iteration, all further checks will be
    // executed one iteration less.
    if (CurrMaxExit == MaxBECount)
      SkipLastIter = true;
  };
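  // Illustrative reasoning (not from the original source): exits are visited
  // in dominance order, so once the running umin of the (max) exit counts
  // seen so far equals the loop's max backedge-taken count, some earlier
  // exit is guaranteed to be taken no later than the final iteration. Any
  // check appearing later therefore runs one iteration less, and its value
  // on that last iteration is unobservable, which is what SkipLastIter
  // communicates to createReplacement.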
  SmallSet<const SCEV *, 8> DominatingExactExitCounts;
  for (BasicBlock *ExitingBB : ExitingBlocks) {
    const SCEV *ExactExitCount = SE->getExitCount(L, ExitingBB);
    const SCEV *MaxExitCount = SE->getExitCount(
        L, ExitingBB, ScalarEvolution::ExitCountKind::SymbolicMaximum);
    if (isa<SCEVCouldNotCompute>(ExactExitCount)) {
      // Okay, we do not know the exit count here. Can we at least prove that
      // it will remain the same within the iteration space?
      auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
      auto OptimizeCond = [&](bool SkipLastIter) {
        return optimizeLoopExitWithUnknownExitCount(L, BI, ExitingBB,
                                                    MaxBECount, SkipLastIter,
                                                    SE, Rewriter, DeadInsts);
      };

      // TODO: We might have proved that we can skip the last iteration for
      // this check. In this case, we only want to check the condition on the
      // pre-last iteration (MaxBECount - 1). However, there is a nasty
      // corner case:
      //
      //   for (i = len; i != 0; i--) { ... check (i ult X) ... }
      //
      // If we could not prove that len != 0, then we also could not prove that
      // (len - 1) is not UINT_MAX. If we simply query (len - 1), then
      // OptimizeCond will likely not prove anything for it, even if it could
      // prove the same fact for len.
      //
      // As a temporary solution, we query both the last and the pre-last
      // iterations in the hope that we will be able to prove triviality for
      // at least one of them. We can stop querying MaxBECount for this case
      // once SCEV understands that (MaxBECount - 1) will not overflow here.
      if (OptimizeCond(false))
        Changed = true;
      else if (SkipLastIter && OptimizeCond(true))
        Changed = true;
      UpdateSkipLastIter(MaxExitCount);
      continue;
    }

    UpdateSkipLastIter(ExactExitCount);

    // If we know we'd exit on the first iteration, rewrite the exit to
    // reflect this. This does not imply the loop must exit through this
    // exit; there may be an earlier one taken on the first iteration.
    // We know that the backedge can't be taken, so we replace all
    // the header PHIs with values coming from the preheader.
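    // For example (illustrative): an exit branching on (icmp uge %iv, %n)
    // where %iv starts at %n has an exact exit count of zero; the exit must
    // be taken before the backedge can ever run, so we fold the branch and
    // forward the header phis from the preheader.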
    if (ExactExitCount->isZero()) {
      foldExit(L, ExitingBB, true, DeadInsts);
      replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE);
      Changed = true;
      continue;
    }

    assert(ExactExitCount->getType()->isIntegerTy() &&
           MaxBECount->getType()->isIntegerTy() &&
           "Exit counts must be integers");

    Type *WiderType =
        SE->getWiderType(MaxBECount->getType(), ExactExitCount->getType());
    ExactExitCount = SE->getNoopOrZeroExtend(ExactExitCount, WiderType);
    MaxBECount = SE->getNoopOrZeroExtend(MaxBECount, WiderType);
    assert(MaxBECount->getType() == ExactExitCount->getType());

    // Can we prove that some other exit must be taken strictly before this
    // one?
    if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT, MaxBECount,
                                     ExactExitCount)) {
      foldExit(L, ExitingBB, false, DeadInsts);
      Changed = true;
      continue;
    }

    // As we run, keep track of which exit counts we've encountered. If we
    // find a duplicate, we've found an exit which would exit on the same
    // iteration, but (from the visit order) strictly follows another exit
    // which does the same and is thus dead.
    if (!DominatingExactExitCounts.insert(ExactExitCount).second) {
      foldExit(L, ExitingBB, false, DeadInsts);
      Changed = true;
      continue;
    }

    // TODO: There might be another opportunity to leverage SCEV's reasoning
    // here. If we kept track of the min of dominating exit counts so far
    // (MDEC), we could discharge exits with EC >= MDEC. This is less powerful
    // than the existing transform (since later exits aren't considered), but
    // potentially more powerful for any case where SCEV can prove a >=u b,
    // but neither a == b nor a >u b. Such a case is not currently known.
  }
  return Changed;
}

bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
  SmallVector<BasicBlock*, 16> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // Finally, see if we can rewrite our exit conditions into a loop invariant
  // form. If we have a read-only loop, and we can tell that we must exit down
  // a path which does not need any of the values computed within the loop, we
  // can rewrite the loop to exit on the first iteration. Note that this
  // neither a) tells us the loop exits on the first iteration (unless *all*
  // exits are predicatable) nor b) tells us *which* exit might be taken.
  // This transformation looks a lot like a restricted form of dead loop
  // elimination, but restricted to read-only loops and without necessarily
  // needing to kill the loop entirely.
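  // As an illustrative sketch: in a side-effect-free loop such as
  //   for (i = 0; i != n; i++)
  //     if (i == m) break;
  // whose exit counts SCEV can compute and whose results are unused, each
  // exit test can be replaced by a loop-invariant comparison of that exit's
  // count against the loop's exact backedge-taken count. The loop still
  // leaves through the same exit it originally would, but (when every exit
  // is predicated) on its first iteration.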
  if (!LoopPredication)
    return false;

  // Note: ExactBTC is the exact backedge taken count *iff* the loop exits
  // through *explicit* control flow. We have to eliminate the possibility of
  // implicit exits (see below) before we know it's truly exact.
  const SCEV *ExactBTC = SE->getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(ExactBTC) || !Rewriter.isSafeToExpand(ExactBTC))
    return false;

  assert(SE->isLoopInvariant(ExactBTC, L) && "BTC must be loop invariant");
  assert(ExactBTC->getType()->isIntegerTy() && "BTC must be integer");

  auto BadExit = [&](BasicBlock *ExitingBB) {
    // If our exiting block exits multiple loops, we can only rewrite the
    // innermost one. Otherwise, we're changing how many times the innermost
    // loop runs before it exits.
    if (LI->getLoopFor(ExitingBB) != L)
      return true;

    // Can't rewrite non-branch yet.
    BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
    if (!BI)
      return true;

    // If already constant, nothing to do.
    if (isa<Constant>(BI->getCondition()))
      return true;

    // If the exit block has phis, we need to be able to compute the values
    // within the loop which contains them. This assumes trivial LCSSA phis
    // have already been removed. TODO: generalize.
    BasicBlock *ExitBlock =
        BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
    if (!ExitBlock->phis().empty())
      return true;

    const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
    if (isa<SCEVCouldNotCompute>(ExitCount) ||
        !Rewriter.isSafeToExpand(ExitCount))
      return true;

    assert(SE->isLoopInvariant(ExitCount, L) &&
           "Exit count must be loop invariant");
    assert(ExitCount->getType()->isIntegerTy() && "Exit count must be integer");
    return false;
  };

  // If we have any exits which can't be predicated themselves, then we can't
  // predicate any exit which isn't guaranteed to execute before it. Consider
  // two exits (a) and (b) which would both exit on the same iteration. If we
  // can predicate (b), but not (a), and (a) precedes (b) along some path, then
  // we could convert a loop from exiting through (a) to one exiting through
  // (b). Note that this problem exists only for exits with the same exit
  // count, and we could be more aggressive when exit counts are known to be
  // unequal.
  llvm::sort(ExitingBlocks,
             [&](BasicBlock *A, BasicBlock *B) {
               // std::sort sorts in ascending order, so we want the inverse of
               // the normal dominance relation, plus a tie breaker for blocks
               // unordered by dominance.
               if (DT->properlyDominates(A, B)) return true;
               if (DT->properlyDominates(B, A)) return false;
               return A->getName() < B->getName();
             });
  // Check to see if our exit blocks are a total order (i.e. a linear chain of
  // exits before the backedge). If they aren't, reasoning about reachability
  // is complicated and we choose not to do so for now.
  for (unsigned i = 1; i < ExitingBlocks.size(); i++)
    if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]))
      return false;

  // Given our sorted total order, we know that exit[j] must be evaluated
  // after all exit[i] such that j > i.
  for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++)
    if (BadExit(ExitingBlocks[i])) {
      ExitingBlocks.resize(i);
      break;
    }

  if (ExitingBlocks.empty())
    return false;

  // We rely on not being able to reach an exiting block on a later iteration
  // than its statically computed exit count. The implementation of
  // getExitCount currently has this invariant, but assert it here so that
  // breakage is obvious if this ever changes.
  assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) {
    return DT->dominates(ExitingBB, L->getLoopLatch());
  }));

  // At this point, ExitingBlocks consists of only those blocks which are
  // predicatable. Given that, we know we have at least one exit we can
  // predicate if the loop doesn't have side effects and doesn't have any
  // implicit exits (because then our exact BTC isn't actually exact).
  // @Reviewers - As structured, this is O(I^2) for loop nests. Any
  // suggestions on how to improve this? I can obviously bail out for outer
  // loops, but that seems less than ideal. MemorySSA can find memory writes,
  // is that enough for *all* side effects?
  for (BasicBlock *BB : L->blocks())
    for (auto &I : *BB)
      // TODO: isGuaranteedToTransfer
      if (I.mayHaveSideEffects())
        return false;

  bool Changed = false;
  // Finally, do the actual predication for all predicatable blocks. A couple
  // of notes here:
  // 1) We don't bother to constant fold dominated exits with identical exit
  //    counts; that's simply a form of CSE/equality propagation and we leave
  //    it for dedicated passes.
  // 2) We insert the comparison at the branch. Hoisting introduces additional
  //    legality constraints and we leave that to dedicated logic. We want to
  //    predicate even if we can't insert a loop invariant expression as
  //    peeling or unrolling will likely reduce the cost of the otherwise loop
  //    varying check.
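  // A sketch of the rewrite performed below (value names are illustrative):
  // for an exit whose SCEV exit count %ec is not literally ExactBTC, and
  // whose true successor stays in the loop, the branch condition becomes
  //   %new.cond = icmp ne %ec, %btc
  // so the loop now leaves through this block only if this exit is the one
  // that actually bounds the trip count; an exit whose count *is* ExactBTC
  // is folded to exit unconditionally.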
  Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator());
  IRBuilder<> B(L->getLoopPreheader()->getTerminator());
  Value *ExactBTCV = nullptr; // Lazily generated if needed.
  for (BasicBlock *ExitingBB : ExitingBlocks) {
    const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);

    auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
    Value *NewCond;
    if (ExitCount == ExactBTC) {
      NewCond = L->contains(BI->getSuccessor(0)) ?
        B.getFalse() : B.getTrue();
    } else {
      Value *ECV = Rewriter.expandCodeFor(ExitCount);
      if (!ExactBTCV)
        ExactBTCV = Rewriter.expandCodeFor(ExactBTC);
      Value *RHS = ExactBTCV;
      if (ECV->getType() != RHS->getType()) {
        Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
        ECV = B.CreateZExt(ECV, WiderTy);
        RHS = B.CreateZExt(RHS, WiderTy);
      }
      auto Pred = L->contains(BI->getSuccessor(0)) ?
        ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
      NewCond = B.CreateICmp(Pred, ECV, RHS);
    }
    Value *OldCond = BI->getCondition();
    BI->setCondition(NewCond);
    if (OldCond->use_empty())
      DeadInsts.emplace_back(OldCond);
    Changed = true;
  }

  return Changed;
}

//===----------------------------------------------------------------------===//
//  IndVarSimplify driver. Manage several subpasses of IV simplification.
//===----------------------------------------------------------------------===//

bool IndVarSimplify::run(Loop *L) {
  // We need (and expect!) the incoming loop to be in LCSSA.
  assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
         "LCSSA required to run indvars!");

  // If LoopSimplify form is not available, stay out of trouble. Some notes:
  //  - LSR currently only supports LoopSimplify-form loops. Indvars'
  //    canonicalization can be a pessimization without LSR to "clean up"
  //    afterwards.
  //  - We depend on having a preheader; in particular,
  //    Loop::getCanonicalInductionVariable only supports loops with
  //    preheaders, and we're in trouble if we can't find the induction
  //    variable even when we've manually inserted one.
  //  - LFTR relies on having a single backedge.
  if (!L->isLoopSimplifyForm())
    return false;

  bool Changed = false;
  // If there are any floating-point recurrences, attempt to
  // transform them to use integer recurrences.
  Changed |= rewriteNonIntegerIVs(L);

  // Create a rewriter object which we'll use to transform the code with.
  SCEVExpander Rewriter(*SE, DL, "indvars");
#ifndef NDEBUG
  Rewriter.setDebugType(DEBUG_TYPE);
#endif

  // Eliminate redundant IV users.
  //
  // Simplification works best when run before other consumers of SCEV. We
  // attempt to avoid evaluating SCEVs for sign/zero extend operations until
  // other expressions involving loop IVs have been evaluated. This helps SCEV
  // set no-wrap flags before normalizing sign/zero extension.
  Rewriter.disableCanonicalMode();
  Changed |= simplifyAndExtend(L, Rewriter, LI);

  // Check to see if we can compute the final value of any expressions
  // that are recurrent in the loop, and substitute the exit values from the
  // loop into any instructions outside of the loop that use the final values
  // of the current expressions.
  if (ReplaceExitValue != NeverRepl) {
    if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT,
                                             ReplaceExitValue, DeadInsts)) {
      NumReplaced += Rewrites;
      Changed = true;
    }
  }

  // Eliminate redundant IV cycles.
  NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts, TTI);

  // Try to convert exit conditions to unsigned and rotate computation
  // out of the loop. Note: Handles invalidation internally if needed.
  Changed |= canonicalizeExitCondition(L);

  // Try to eliminate loop exits based on analyzable exit counts.
  if (optimizeLoopExits(L, Rewriter)) {
    Changed = true;
    // Given we've changed exit counts, notify SCEV.
    // Some nested loops may share the same folded exit basic block,
    // thus we need to notify the topmost loop.
    SE->forgetTopmostLoop(L);
  }

  // Try to form loop invariant tests for loop exits by changing how many
  // iterations of the loop run when that is unobservable.
  if (predicateLoopExits(L, Rewriter)) {
    Changed = true;
    // Given we've changed exit counts, notify SCEV.
    SE->forgetLoop(L);
  }

  // If we have a trip count expression, rewrite the loop's exit condition
  // using it.
  if (!DisableLFTR) {
    BasicBlock *PreHeader = L->getLoopPreheader();

    SmallVector<BasicBlock*, 16> ExitingBlocks;
    L->getExitingBlocks(ExitingBlocks);
    for (BasicBlock *ExitingBB : ExitingBlocks) {
      // Can't rewrite non-branch yet.
      if (!isa<BranchInst>(ExitingBB->getTerminator()))
        continue;

      // If our exiting block exits multiple loops, we can only rewrite the
      // innermost one. Otherwise, we're changing how many times the innermost
      // loop runs before it exits.
      if (LI->getLoopFor(ExitingBB) != L)
        continue;

      if (!needsLFTR(L, ExitingBB))
        continue;

      const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
      if (isa<SCEVCouldNotCompute>(ExitCount))
        continue;

      // This was handled above, but as we form SCEVs, we can sometimes refine
      // existing ones; this allows exit counts to be folded to zero which
      // weren't when optimizeLoopExits saw them. Arguably, we should iterate
      // until stable to handle cases like this better.
      if (ExitCount->isZero())
        continue;

      PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT);
      if (!IndVar)
        continue;

      // Avoid high cost expansions. Note: This heuristic is questionable in
      // that our definition of "high cost" is not exactly principled.
      if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget,
                                       TTI, PreHeader->getTerminator()))
        continue;

      // Check preconditions for proper SCEVExpander operation. SCEV does not
      // express SCEVExpander's dependencies, such as LoopSimplify. Instead,
      // any pass that uses the SCEVExpander must do it. This does not work
      // well for loop passes because SCEVExpander makes assumptions about
      // all loops, while LoopPassManager only forces the current loop to be
      // simplified.
      //
      // FIXME: SCEV expansion has no way to bail out, so the caller must
      // explicitly check any assumptions made by SCEV. Brittle.
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ExitCount);
      if (!AR || AR->getLoop()->getLoopPreheader())
        Changed |= linearFunctionTestReplace(L, ExitingBB,
                                             ExitCount, IndVar,
                                             Rewriter);
    }
  }
  // Clear the rewriter cache, because values that are in the rewriter's cache
  // can be deleted in the loop below, causing the AssertingVH in the cache to
  // trigger.
  Rewriter.clear();

  // Now that we're done iterating through lists, clean up any instructions
  // which are now dead.
  while (!DeadInsts.empty()) {
    Value *V = DeadInsts.pop_back_val();

    if (PHINode *PHI = dyn_cast_or_null<PHINode>(V))
      Changed |= RecursivelyDeleteDeadPHINode(PHI, TLI, MSSAU.get());
    else if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
      Changed |=
          RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get());
  }

  // The Rewriter may not be used from this point on.

  // Loop-invariant instructions in the preheader that aren't used in the
  // loop may be sunk below the loop to reduce register pressure.
  Changed |= sinkUnusedInvariants(L);

  // rewriteFirstIterationLoopExitValues does not rely on the computation of
  // trip count and therefore can further simplify exit values in addition to
  // rewriteLoopExitValues.
  Changed |= rewriteFirstIterationLoopExitValues(L);

  // Clean up dead instructions.
  Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get());

  // Check a post-condition.
  assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
         "Indvars did not preserve LCSSA!");
  if (VerifyMemorySSA && MSSAU)
    MSSAU->getMemorySSA()->verifyMemorySSA();

  return Changed;
}

PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
                                          LoopStandardAnalysisResults &AR,
                                          LPMUpdater &) {
  Function *F = L.getHeader()->getParent();
  const DataLayout &DL = F->getParent()->getDataLayout();

  IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA,
                     WidenIndVars && AllowIVWidening);
  if (!IVS.run(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}
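
// Usage note (illustrative, not part of the original source): this pass is
// typically exercised through the new pass manager, e.g.
//   opt -passes=indvars -S input.ll
// and the STATISTIC counters above can be observed with -stats in builds
// that have statistics enabled.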