1 //===- LoopInterchange.cpp - Loop interchange pass-------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This Pass handles loop interchange transform. 10 // This pass interchanges loops to provide a more cache-friendly memory access 11 // patterns. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/Scalar/LoopInterchange.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/Analysis/DependenceAnalysis.h" 21 #include "llvm/Analysis/LoopCacheAnalysis.h" 22 #include "llvm/Analysis/LoopInfo.h" 23 #include "llvm/Analysis/LoopNestAnalysis.h" 24 #include "llvm/Analysis/LoopPass.h" 25 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 26 #include "llvm/Analysis/ScalarEvolution.h" 27 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 28 #include "llvm/IR/BasicBlock.h" 29 #include "llvm/IR/Constants.h" 30 #include "llvm/IR/DiagnosticInfo.h" 31 #include "llvm/IR/Dominators.h" 32 #include "llvm/IR/Function.h" 33 #include "llvm/IR/IRBuilder.h" 34 #include "llvm/IR/InstrTypes.h" 35 #include "llvm/IR/Instruction.h" 36 #include "llvm/IR/Instructions.h" 37 #include "llvm/IR/User.h" 38 #include "llvm/IR/Value.h" 39 #include "llvm/Support/Casting.h" 40 #include "llvm/Support/CommandLine.h" 41 #include "llvm/Support/Debug.h" 42 #include "llvm/Support/ErrorHandling.h" 43 #include "llvm/Support/raw_ostream.h" 44 #include "llvm/Transforms/Scalar/LoopPassManager.h" 45 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 46 #include "llvm/Transforms/Utils/LoopUtils.h" 47 #include <cassert> 48 #include <utility> 49 #include <vector> 50 51 using namespace llvm; 52 53 #define DEBUG_TYPE "loop-interchange" 54 55 STATISTIC(LoopsInterchanged, "Number of loops interchanged"); 56 57 static cl::opt<int> LoopInterchangeCostThreshold( 58 "loop-interchange-threshold", cl::init(0), cl::Hidden, 59 cl::desc("Interchange if you gain more than this number")); 60 61 namespace { 62 63 using LoopVector = SmallVector<Loop *, 8>; 64 65 // TODO: Check if we can use a sparse matrix here. 66 using CharMatrix = std::vector<std::vector<char>>; 67 68 } // end anonymous namespace 69 70 // Maximum number of dependencies that can be handled in the dependency matrix. 71 static const unsigned MaxMemInstrCount = 100; 72 73 // Maximum loop depth supported. 74 static const unsigned MaxLoopNestDepth = 10; 75 76 #ifdef DUMP_DEP_MATRICIES 77 static void printDepMatrix(CharMatrix &DepMatrix) { 78 for (auto &Row : DepMatrix) { 79 for (auto D : Row) 80 LLVM_DEBUG(dbgs() << D << " "); 81 LLVM_DEBUG(dbgs() << "\n"); 82 } 83 } 84 #endif 85 86 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, 87 Loop *L, DependenceInfo *DI, 88 ScalarEvolution *SE) { 89 using ValueVector = SmallVector<Value *, 16>; 90 91 ValueVector MemInstr; 92 93 // For each block. 94 for (BasicBlock *BB : L->blocks()) { 95 // Scan the BB and collect legal loads and stores. 96 for (Instruction &I : *BB) { 97 if (!isa<Instruction>(I)) 98 return false; 99 if (auto *Ld = dyn_cast<LoadInst>(&I)) { 100 if (!Ld->isSimple()) 101 return false; 102 MemInstr.push_back(&I); 103 } else if (auto *St = dyn_cast<StoreInst>(&I)) { 104 if (!St->isSimple()) 105 return false; 106 MemInstr.push_back(&I); 107 } 108 } 109 } 110 111 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size() 112 << " Loads and Stores to analyze\n"); 113 114 ValueVector::iterator I, IE, J, JE; 115 116 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { 117 for (J = I, JE = MemInstr.end(); J != JE; ++J) { 118 std::vector<char> Dep; 119 Instruction *Src = cast<Instruction>(*I); 120 Instruction *Dst = cast<Instruction>(*J); 121 // Ignore Input dependencies. 122 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) 123 continue; 124 // Track Output, Flow, and Anti dependencies. 125 if (auto D = DI->depends(Src, Dst, true)) { 126 assert(D->isOrdered() && "Expected an output, flow or anti dep."); 127 // If the direction vector is negative, normalize it to 128 // make it non-negative. 129 if (D->normalize(SE)) 130 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n"); 131 LLVM_DEBUG(StringRef DepType = 132 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; 133 dbgs() << "Found " << DepType 134 << " dependency between Src and Dst\n" 135 << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); 136 unsigned Levels = D->getLevels(); 137 char Direction; 138 for (unsigned II = 1; II <= Levels; ++II) { 139 if (D->isScalar(II)) { 140 Direction = 'S'; 141 Dep.push_back(Direction); 142 } else { 143 unsigned Dir = D->getDirection(II); 144 if (Dir == Dependence::DVEntry::LT || 145 Dir == Dependence::DVEntry::LE) 146 Direction = '<'; 147 else if (Dir == Dependence::DVEntry::GT || 148 Dir == Dependence::DVEntry::GE) 149 Direction = '>'; 150 else if (Dir == Dependence::DVEntry::EQ) 151 Direction = '='; 152 else 153 Direction = '*'; 154 Dep.push_back(Direction); 155 } 156 } 157 while (Dep.size() != Level) { 158 Dep.push_back('I'); 159 } 160 161 DepMatrix.push_back(Dep); 162 if (DepMatrix.size() > MaxMemInstrCount) { 163 LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount 164 << " dependencies inside loop\n"); 165 return false; 166 } 167 } 168 } 169 } 170 171 return true; 172 } 173 174 // A loop is moved from index 'from' to an index 'to'. Update the Dependence 175 // matrix by exchanging the two columns. 176 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, 177 unsigned ToIndx) { 178 for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I) 179 std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]); 180 } 181 182 // After interchanging, check if the direction vector is valid. 183 // [Theorem] A permutation of the loops in a perfect nest is legal if and only 184 // if the direction matrix, after the same permutation is applied to its 185 // columns, has no ">" direction as the leftmost non-"=" direction in any row. 186 static bool isLexicographicallyPositive(std::vector<char> &DV) { 187 for (unsigned char Direction : DV) { 188 if (Direction == '<') 189 return true; 190 if (Direction == '>' || Direction == '*') 191 return false; 192 } 193 return true; 194 } 195 196 // Checks if it is legal to interchange 2 loops. 197 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, 198 unsigned InnerLoopId, 199 unsigned OuterLoopId) { 200 unsigned NumRows = DepMatrix.size(); 201 std::vector<char> Cur; 202 // For each row check if it is valid to interchange. 203 for (unsigned Row = 0; Row < NumRows; ++Row) { 204 // Create temporary DepVector check its lexicographical order 205 // before and after swapping OuterLoop vs InnerLoop 206 Cur = DepMatrix[Row]; 207 if (!isLexicographicallyPositive(Cur)) 208 return false; 209 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]); 210 if (!isLexicographicallyPositive(Cur)) 211 return false; 212 } 213 return true; 214 } 215 216 static void populateWorklist(Loop &L, LoopVector &LoopList) { 217 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " 218 << L.getHeader()->getParent()->getName() << " Loop: %" 219 << L.getHeader()->getName() << '\n'); 220 assert(LoopList.empty() && "LoopList should initially be empty!"); 221 Loop *CurrentLoop = &L; 222 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); 223 while (!Vec->empty()) { 224 // The current loop has multiple subloops in it hence it is not tightly 225 // nested. 226 // Discard all loops above it added into Worklist. 227 if (Vec->size() != 1) { 228 LoopList = {}; 229 return; 230 } 231 232 LoopList.push_back(CurrentLoop); 233 CurrentLoop = Vec->front(); 234 Vec = &CurrentLoop->getSubLoops(); 235 } 236 LoopList.push_back(CurrentLoop); 237 } 238 239 namespace { 240 241 /// LoopInterchangeLegality checks if it is legal to interchange the loop. 242 class LoopInterchangeLegality { 243 public: 244 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 245 OptimizationRemarkEmitter *ORE) 246 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} 247 248 /// Check if the loops can be interchanged. 249 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, 250 CharMatrix &DepMatrix); 251 252 /// Discover induction PHIs in the header of \p L. Induction 253 /// PHIs are added to \p Inductions. 254 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions); 255 256 /// Check if the loop structure is understood. We do not handle triangular 257 /// loops for now. 258 bool isLoopStructureUnderstood(); 259 260 bool currentLimitations(); 261 262 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const { 263 return OuterInnerReductions; 264 } 265 266 const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const { 267 return InnerLoopInductions; 268 } 269 270 private: 271 bool tightlyNested(Loop *Outer, Loop *Inner); 272 bool containsUnsafeInstructions(BasicBlock *BB); 273 274 /// Discover induction and reduction PHIs in the header of \p L. Induction 275 /// PHIs are added to \p Inductions, reductions are added to 276 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs 277 /// to be passed as \p InnerLoop. 278 bool findInductionAndReductions(Loop *L, 279 SmallVector<PHINode *, 8> &Inductions, 280 Loop *InnerLoop); 281 282 Loop *OuterLoop; 283 Loop *InnerLoop; 284 285 ScalarEvolution *SE; 286 287 /// Interface to emit optimization remarks. 288 OptimizationRemarkEmitter *ORE; 289 290 /// Set of reduction PHIs taking part of a reduction across the inner and 291 /// outer loop. 292 SmallPtrSet<PHINode *, 4> OuterInnerReductions; 293 294 /// Set of inner loop induction PHIs 295 SmallVector<PHINode *, 8> InnerLoopInductions; 296 }; 297 298 /// LoopInterchangeProfitability checks if it is profitable to interchange the 299 /// loop. 300 class LoopInterchangeProfitability { 301 public: 302 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 303 OptimizationRemarkEmitter *ORE) 304 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} 305 306 /// Check if the loop interchange is profitable. 307 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, 308 unsigned InnerLoopId, unsigned OuterLoopId, 309 CharMatrix &DepMatrix, 310 const DenseMap<const Loop *, unsigned> &CostMap, 311 std::unique_ptr<CacheCost> &CC); 312 313 private: 314 int getInstrOrderCost(); 315 std::optional<bool> isProfitablePerLoopCacheAnalysis( 316 const DenseMap<const Loop *, unsigned> &CostMap, 317 std::unique_ptr<CacheCost> &CC); 318 std::optional<bool> isProfitablePerInstrOrderCost(); 319 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId, 320 unsigned OuterLoopId, 321 CharMatrix &DepMatrix); 322 Loop *OuterLoop; 323 Loop *InnerLoop; 324 325 /// Scev analysis. 326 ScalarEvolution *SE; 327 328 /// Interface to emit optimization remarks. 329 OptimizationRemarkEmitter *ORE; 330 }; 331 332 /// LoopInterchangeTransform interchanges the loop. 333 class LoopInterchangeTransform { 334 public: 335 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 336 LoopInfo *LI, DominatorTree *DT, 337 const LoopInterchangeLegality &LIL) 338 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {} 339 340 /// Interchange OuterLoop and InnerLoop. 341 bool transform(); 342 void restructureLoops(Loop *NewInner, Loop *NewOuter, 343 BasicBlock *OrigInnerPreHeader, 344 BasicBlock *OrigOuterPreHeader); 345 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); 346 347 private: 348 bool adjustLoopLinks(); 349 bool adjustLoopBranches(); 350 351 Loop *OuterLoop; 352 Loop *InnerLoop; 353 354 /// Scev analysis. 355 ScalarEvolution *SE; 356 357 LoopInfo *LI; 358 DominatorTree *DT; 359 360 const LoopInterchangeLegality &LIL; 361 }; 362 363 struct LoopInterchange { 364 ScalarEvolution *SE = nullptr; 365 LoopInfo *LI = nullptr; 366 DependenceInfo *DI = nullptr; 367 DominatorTree *DT = nullptr; 368 std::unique_ptr<CacheCost> CC = nullptr; 369 370 /// Interface to emit optimization remarks. 371 OptimizationRemarkEmitter *ORE; 372 373 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, 374 DominatorTree *DT, std::unique_ptr<CacheCost> &CC, 375 OptimizationRemarkEmitter *ORE) 376 : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {} 377 378 bool run(Loop *L) { 379 if (L->getParentLoop()) 380 return false; 381 SmallVector<Loop *, 8> LoopList; 382 populateWorklist(*L, LoopList); 383 return processLoopList(LoopList); 384 } 385 386 bool run(LoopNest &LN) { 387 SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end()); 388 for (unsigned I = 1; I < LoopList.size(); ++I) 389 if (LoopList[I]->getParentLoop() != LoopList[I - 1]) 390 return false; 391 return processLoopList(LoopList); 392 } 393 394 bool isComputableLoopNest(ArrayRef<Loop *> LoopList) { 395 for (Loop *L : LoopList) { 396 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); 397 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) { 398 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n"); 399 return false; 400 } 401 if (L->getNumBackEdges() != 1) { 402 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); 403 return false; 404 } 405 if (!L->getExitingBlock()) { 406 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n"); 407 return false; 408 } 409 } 410 return true; 411 } 412 413 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) { 414 // TODO: Add a better heuristic to select the loop to be interchanged based 415 // on the dependence matrix. Currently we select the innermost loop. 416 return LoopList.size() - 1; 417 } 418 419 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) { 420 bool Changed = false; 421 unsigned LoopNestDepth = LoopList.size(); 422 if (LoopNestDepth < 2) { 423 LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); 424 return false; 425 } 426 if (LoopNestDepth > MaxLoopNestDepth) { 427 LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than " 428 << MaxLoopNestDepth << "\n"); 429 return false; 430 } 431 if (!isComputableLoopNest(LoopList)) { 432 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); 433 return false; 434 } 435 436 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth 437 << "\n"); 438 439 CharMatrix DependencyMatrix; 440 Loop *OuterMostLoop = *(LoopList.begin()); 441 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, 442 OuterMostLoop, DI, SE)) { 443 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); 444 return false; 445 } 446 #ifdef DUMP_DEP_MATRICIES 447 LLVM_DEBUG(dbgs() << "Dependence before interchange\n"); 448 printDepMatrix(DependencyMatrix); 449 #endif 450 451 // Get the Outermost loop exit. 452 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock(); 453 if (!LoopNestExit) { 454 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block"); 455 return false; 456 } 457 458 unsigned SelecLoopId = selectLoopForInterchange(LoopList); 459 // Obtain the loop vector returned from loop cache analysis beforehand, 460 // and put each <Loop, index> pair into a map for constant time query 461 // later. Indices in loop vector reprsent the optimal order of the 462 // corresponding loop, e.g., given a loopnest with depth N, index 0 463 // indicates the loop should be placed as the outermost loop and index N 464 // indicates the loop should be placed as the innermost loop. 465 // 466 // For the old pass manager CacheCost would be null. 467 DenseMap<const Loop *, unsigned> CostMap; 468 if (CC != nullptr) { 469 const auto &LoopCosts = CC->getLoopCosts(); 470 for (unsigned i = 0; i < LoopCosts.size(); i++) { 471 CostMap[LoopCosts[i].first] = i; 472 } 473 } 474 // We try to achieve the globally optimal memory access for the loopnest, 475 // and do interchange based on a bubble-sort fasion. We start from 476 // the innermost loop, move it outwards to the best possible position 477 // and repeat this process. 478 for (unsigned j = SelecLoopId; j > 0; j--) { 479 bool ChangedPerIter = false; 480 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { 481 bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1, 482 DependencyMatrix, CostMap); 483 if (!Interchanged) 484 continue; 485 // Loops interchanged, update LoopList accordingly. 486 std::swap(LoopList[i - 1], LoopList[i]); 487 // Update the DependencyMatrix 488 interChangeDependencies(DependencyMatrix, i, i - 1); 489 #ifdef DUMP_DEP_MATRICIES 490 LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); 491 printDepMatrix(DependencyMatrix); 492 #endif 493 ChangedPerIter |= Interchanged; 494 Changed |= Interchanged; 495 } 496 // Early abort if there was no interchange during an entire round of 497 // moving loops outwards. 498 if (!ChangedPerIter) 499 break; 500 } 501 return Changed; 502 } 503 504 bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId, 505 unsigned OuterLoopId, 506 std::vector<std::vector<char>> &DependencyMatrix, 507 const DenseMap<const Loop *, unsigned> &CostMap) { 508 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId 509 << " and OuterLoopId = " << OuterLoopId << "\n"); 510 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); 511 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { 512 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n"); 513 return false; 514 } 515 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); 516 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); 517 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, 518 DependencyMatrix, CostMap, CC)) { 519 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); 520 return false; 521 } 522 523 ORE->emit([&]() { 524 return OptimizationRemark(DEBUG_TYPE, "Interchanged", 525 InnerLoop->getStartLoc(), 526 InnerLoop->getHeader()) 527 << "Loop interchanged with enclosing loop."; 528 }); 529 530 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL); 531 LIT.transform(); 532 LLVM_DEBUG(dbgs() << "Loops interchanged.\n"); 533 LoopsInterchanged++; 534 535 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE); 536 return true; 537 } 538 }; 539 540 } // end anonymous namespace 541 542 bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) { 543 return any_of(*BB, [](const Instruction &I) { 544 return I.mayHaveSideEffects() || I.mayReadFromMemory(); 545 }); 546 } 547 548 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { 549 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 550 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 551 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 552 553 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n"); 554 555 // A perfectly nested loop will not have any branch in between the outer and 556 // inner block i.e. outer header will branch to either inner preheader and 557 // outerloop latch. 558 BranchInst *OuterLoopHeaderBI = 559 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 560 if (!OuterLoopHeaderBI) 561 return false; 562 563 for (BasicBlock *Succ : successors(OuterLoopHeaderBI)) 564 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() && 565 Succ != OuterLoopLatch) 566 return false; 567 568 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); 569 // We do not have any basic block in between now make sure the outer header 570 // and outer loop latch doesn't contain any unsafe instructions. 571 if (containsUnsafeInstructions(OuterLoopHeader) || 572 containsUnsafeInstructions(OuterLoopLatch)) 573 return false; 574 575 // Also make sure the inner loop preheader does not contain any unsafe 576 // instructions. Note that all instructions in the preheader will be moved to 577 // the outer loop header when interchanging. 578 if (InnerLoopPreHeader != OuterLoopHeader && 579 containsUnsafeInstructions(InnerLoopPreHeader)) 580 return false; 581 582 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock(); 583 // Ensure the inner loop exit block flows to the outer loop latch possibly 584 // through empty blocks. 585 const BasicBlock &SuccInner = 586 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch); 587 if (&SuccInner != OuterLoopLatch) { 588 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit 589 << " does not lead to the outer loop latch.\n";); 590 return false; 591 } 592 // The inner loop exit block does flow to the outer loop latch and not some 593 // other BBs, now make sure it contains safe instructions, since it will be 594 // moved into the (new) inner loop after interchange. 595 if (containsUnsafeInstructions(InnerLoopExit)) 596 return false; 597 598 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n"); 599 // We have a perfect loop nest. 600 return true; 601 } 602 603 bool LoopInterchangeLegality::isLoopStructureUnderstood() { 604 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); 605 for (PHINode *InnerInduction : InnerLoopInductions) { 606 unsigned Num = InnerInduction->getNumOperands(); 607 for (unsigned i = 0; i < Num; ++i) { 608 Value *Val = InnerInduction->getOperand(i); 609 if (isa<Constant>(Val)) 610 continue; 611 Instruction *I = dyn_cast<Instruction>(Val); 612 if (!I) 613 return false; 614 // TODO: Handle triangular loops. 615 // e.g. for(int i=0;i<N;i++) 616 // for(int j=i;j<N;j++) 617 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); 618 if (InnerInduction->getIncomingBlock(IncomBlockIndx) == 619 InnerLoopPreheader && 620 !OuterLoop->isLoopInvariant(I)) { 621 return false; 622 } 623 } 624 } 625 626 // TODO: Handle triangular loops of another form. 627 // e.g. for(int i=0;i<N;i++) 628 // for(int j=0;j<i;j++) 629 // or, 630 // for(int i=0;i<N;i++) 631 // for(int j=0;j*i<N;j++) 632 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 633 BranchInst *InnerLoopLatchBI = 634 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); 635 if (!InnerLoopLatchBI->isConditional()) 636 return false; 637 if (CmpInst *InnerLoopCmp = 638 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) { 639 Value *Op0 = InnerLoopCmp->getOperand(0); 640 Value *Op1 = InnerLoopCmp->getOperand(1); 641 642 // LHS and RHS of the inner loop exit condition, e.g., 643 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i. 644 Value *Left = nullptr; 645 Value *Right = nullptr; 646 647 // Check if V only involves inner loop induction variable. 648 // Return true if V is InnerInduction, or a cast from 649 // InnerInduction, or a binary operator that involves 650 // InnerInduction and a constant. 651 std::function<bool(Value *)> IsPathToInnerIndVar; 652 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool { 653 if (llvm::is_contained(InnerLoopInductions, V)) 654 return true; 655 if (isa<Constant>(V)) 656 return true; 657 const Instruction *I = dyn_cast<Instruction>(V); 658 if (!I) 659 return false; 660 if (isa<CastInst>(I)) 661 return IsPathToInnerIndVar(I->getOperand(0)); 662 if (isa<BinaryOperator>(I)) 663 return IsPathToInnerIndVar(I->getOperand(0)) && 664 IsPathToInnerIndVar(I->getOperand(1)); 665 return false; 666 }; 667 668 // In case of multiple inner loop indvars, it is okay if LHS and RHS 669 // are both inner indvar related variables. 670 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1)) 671 return true; 672 673 // Otherwise we check if the cmp instruction compares an inner indvar 674 // related variable (Left) with a outer loop invariant (Right). 675 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) { 676 Left = Op0; 677 Right = Op1; 678 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) { 679 Left = Op1; 680 Right = Op0; 681 } 682 683 if (Left == nullptr) 684 return false; 685 686 const SCEV *S = SE->getSCEV(Right); 687 if (!SE->isLoopInvariant(S, OuterLoop)) 688 return false; 689 } 690 691 return true; 692 } 693 694 // If SV is a LCSSA PHI node with a single incoming value, return the incoming 695 // value. 696 static Value *followLCSSA(Value *SV) { 697 PHINode *PHI = dyn_cast<PHINode>(SV); 698 if (!PHI) 699 return SV; 700 701 if (PHI->getNumIncomingValues() != 1) 702 return SV; 703 return followLCSSA(PHI->getIncomingValue(0)); 704 } 705 706 // Check V's users to see if it is involved in a reduction in L. 707 static PHINode *findInnerReductionPhi(Loop *L, Value *V) { 708 // Reduction variables cannot be constants. 709 if (isa<Constant>(V)) 710 return nullptr; 711 712 for (Value *User : V->users()) { 713 if (PHINode *PHI = dyn_cast<PHINode>(User)) { 714 if (PHI->getNumIncomingValues() == 1) 715 continue; 716 RecurrenceDescriptor RD; 717 if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) { 718 // Detect floating point reduction only when it can be reordered. 719 if (RD.getExactFPMathInst() != nullptr) 720 return nullptr; 721 return PHI; 722 } 723 return nullptr; 724 } 725 } 726 727 return nullptr; 728 } 729 730 bool LoopInterchangeLegality::findInductionAndReductions( 731 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) { 732 if (!L->getLoopLatch() || !L->getLoopPredecessor()) 733 return false; 734 for (PHINode &PHI : L->getHeader()->phis()) { 735 InductionDescriptor ID; 736 if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) 737 Inductions.push_back(&PHI); 738 else { 739 // PHIs in inner loops need to be part of a reduction in the outer loop, 740 // discovered when checking the PHIs of the outer loop earlier. 741 if (!InnerLoop) { 742 if (!OuterInnerReductions.count(&PHI)) { 743 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions " 744 "across the outer loop.\n"); 745 return false; 746 } 747 } else { 748 assert(PHI.getNumIncomingValues() == 2 && 749 "Phis in loop header should have exactly 2 incoming values"); 750 // Check if we have a PHI node in the outer loop that has a reduction 751 // result from the inner loop as an incoming value. 752 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch())); 753 PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V); 754 if (!InnerRedPhi || 755 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) { 756 LLVM_DEBUG( 757 dbgs() 758 << "Failed to recognize PHI as an induction or reduction.\n"); 759 return false; 760 } 761 OuterInnerReductions.insert(&PHI); 762 OuterInnerReductions.insert(InnerRedPhi); 763 } 764 } 765 } 766 return true; 767 } 768 769 // This function indicates the current limitations in the transform as a result 770 // of which we do not proceed. 771 bool LoopInterchangeLegality::currentLimitations() { 772 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 773 774 // transform currently expects the loop latches to also be the exiting 775 // blocks. 776 if (InnerLoop->getExitingBlock() != InnerLoopLatch || 777 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() || 778 !isa<BranchInst>(InnerLoopLatch->getTerminator()) || 779 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) { 780 LLVM_DEBUG( 781 dbgs() << "Loops where the latch is not the exiting block are not" 782 << " supported currently.\n"); 783 ORE->emit([&]() { 784 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch", 785 OuterLoop->getStartLoc(), 786 OuterLoop->getHeader()) 787 << "Loops where the latch is not the exiting block cannot be" 788 " interchange currently."; 789 }); 790 return true; 791 } 792 793 SmallVector<PHINode *, 8> Inductions; 794 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) { 795 LLVM_DEBUG( 796 dbgs() << "Only outer loops with induction or reduction PHI nodes " 797 << "are supported currently.\n"); 798 ORE->emit([&]() { 799 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", 800 OuterLoop->getStartLoc(), 801 OuterLoop->getHeader()) 802 << "Only outer loops with induction or reduction PHI nodes can be" 803 " interchanged currently."; 804 }); 805 return true; 806 } 807 808 Inductions.clear(); 809 // For multi-level loop nests, make sure that all phi nodes for inner loops 810 // at all levels can be recognized as a induction or reduction phi. Bail out 811 // if a phi node at a certain nesting level cannot be properly recognized. 812 Loop *CurLevelLoop = OuterLoop; 813 while (!CurLevelLoop->getSubLoops().empty()) { 814 // We already made sure that the loop nest is tightly nested. 815 CurLevelLoop = CurLevelLoop->getSubLoops().front(); 816 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) { 817 LLVM_DEBUG( 818 dbgs() << "Only inner loops with induction or reduction PHI nodes " 819 << "are supported currently.\n"); 820 ORE->emit([&]() { 821 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", 822 CurLevelLoop->getStartLoc(), 823 CurLevelLoop->getHeader()) 824 << "Only inner loops with induction or reduction PHI nodes can be" 825 " interchange currently."; 826 }); 827 return true; 828 } 829 } 830 831 // TODO: Triangular loops are not handled for now. 832 if (!isLoopStructureUnderstood()) { 833 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n"); 834 ORE->emit([&]() { 835 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner", 836 InnerLoop->getStartLoc(), 837 InnerLoop->getHeader()) 838 << "Inner loop structure not understood currently."; 839 }); 840 return true; 841 } 842 843 return false; 844 } 845 846 bool LoopInterchangeLegality::findInductions( 847 Loop *L, SmallVectorImpl<PHINode *> &Inductions) { 848 for (PHINode &PHI : L->getHeader()->phis()) { 849 InductionDescriptor ID; 850 if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) 851 Inductions.push_back(&PHI); 852 } 853 return !Inductions.empty(); 854 } 855 856 // We currently only support LCSSA PHI nodes in the inner loop exit, if their 857 // users are either reduction PHIs or PHIs outside the outer loop (which means 858 // the we are only interested in the final value after the loop). 859 static bool 860 areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, 861 SmallPtrSetImpl<PHINode *> &Reductions) { 862 BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); 863 for (PHINode &PHI : InnerExit->phis()) { 864 // Reduction lcssa phi will have only 1 incoming block that from loop latch. 865 if (PHI.getNumIncomingValues() > 1) 866 return false; 867 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) { 868 PHINode *PN = dyn_cast<PHINode>(U); 869 return !PN || 870 (!Reductions.count(PN) && OuterL->contains(PN->getParent())); 871 })) { 872 return false; 873 } 874 } 875 return true; 876 } 877 878 // We currently support LCSSA PHI nodes in the outer loop exit, if their 879 // incoming values do not come from the outer loop latch or if the 880 // outer loop latch has a single predecessor. In that case, the value will 881 // be available if both the inner and outer loop conditions are true, which 882 // will still be true after interchanging. If we have multiple predecessor, 883 // that may not be the case, e.g. because the outer loop latch may be executed 884 // if the inner loop is not executed. 885 static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { 886 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); 887 for (PHINode &PHI : LoopNestExit->phis()) { 888 for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { 889 Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); 890 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) 891 continue; 892 893 // The incoming value is defined in the outer loop latch. Currently we 894 // only support that in case the outer loop latch has a single predecessor. 895 // This guarantees that the outer loop latch is executed if and only if 896 // the inner loop is executed (because tightlyNested() guarantees that the 897 // outer loop header only branches to the inner loop or the outer loop 898 // latch). 899 // FIXME: We could weaken this logic and allow multiple predecessors, 900 // if the values are produced outside the loop latch. We would need 901 // additional logic to update the PHI nodes in the exit block as 902 // well. 903 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) 904 return false; 905 } 906 } 907 return true; 908 } 909 910 // In case of multi-level nested loops, it may occur that lcssa phis exist in 911 // the latch of InnerLoop, i.e., when defs of the incoming values are further 912 // inside the loopnest. Sometimes those incoming values are not available 913 // after interchange, since the original inner latch will become the new outer 914 // latch which may have predecessor paths that do not include those incoming 915 // values. 916 // TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of 917 // multi-level loop nests. 918 static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { 919 if (InnerLoop->getSubLoops().empty()) 920 return true; 921 // If the original outer latch has only one predecessor, then values defined 922 // further inside the looploop, e.g., in the innermost loop, will be available 923 // at the new outer latch after interchange. 924 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr) 925 return true; 926 927 // The outer latch has more than one predecessors, i.e., the inner 928 // exit and the inner header. 929 // PHI nodes in the inner latch are lcssa phis where the incoming values 930 // are defined further inside the loopnest. Check if those phis are used 931 // in the original inner latch. If that is the case then bail out since 932 // those incoming values may not be available at the new outer latch. 933 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 934 for (PHINode &PHI : InnerLoopLatch->phis()) { 935 for (auto *U : PHI.users()) { 936 Instruction *UI = cast<Instruction>(U); 937 if (InnerLoopLatch == UI->getParent()) 938 return false; 939 } 940 } 941 return true; 942 } 943 944 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, 945 unsigned OuterLoopId, 946 CharMatrix &DepMatrix) { 947 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { 948 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId 949 << " and OuterLoopId = " << OuterLoopId 950 << " due to dependence\n"); 951 ORE->emit([&]() { 952 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence", 953 InnerLoop->getStartLoc(), 954 InnerLoop->getHeader()) 955 << "Cannot interchange loops due to dependences."; 956 }); 957 return false; 958 } 959 // Check if outer and inner loop contain legal instructions only. 960 for (auto *BB : OuterLoop->blocks()) 961 for (Instruction &I : BB->instructionsWithoutDebug()) 962 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 963 // readnone functions do not prevent interchanging. 964 if (CI->onlyWritesMemory()) 965 continue; 966 LLVM_DEBUG( 967 dbgs() << "Loops with call instructions cannot be interchanged " 968 << "safely."); 969 ORE->emit([&]() { 970 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst", 971 CI->getDebugLoc(), 972 CI->getParent()) 973 << "Cannot interchange loops due to call instruction."; 974 }); 975 976 return false; 977 } 978 979 if (!findInductions(InnerLoop, InnerLoopInductions)) { 980 LLVM_DEBUG(dbgs() << "Cound not find inner loop induction variables.\n"); 981 return false; 982 } 983 984 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) { 985 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n"); 986 ORE->emit([&]() { 987 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI", 988 InnerLoop->getStartLoc(), 989 InnerLoop->getHeader()) 990 << "Cannot interchange loops because unsupported PHI nodes found " 991 "in inner loop latch."; 992 }); 993 return false; 994 } 995 996 // TODO: The loops could not be interchanged due to current limitations in the 997 // transform module. 998 if (currentLimitations()) { 999 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n"); 1000 return false; 1001 } 1002 1003 // Check if the loops are tightly nested. 1004 if (!tightlyNested(OuterLoop, InnerLoop)) { 1005 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n"); 1006 ORE->emit([&]() { 1007 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested", 1008 InnerLoop->getStartLoc(), 1009 InnerLoop->getHeader()) 1010 << "Cannot interchange loops because they are not tightly " 1011 "nested."; 1012 }); 1013 return false; 1014 } 1015 1016 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, 1017 OuterInnerReductions)) { 1018 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n"); 1019 ORE->emit([&]() { 1020 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", 1021 InnerLoop->getStartLoc(), 1022 InnerLoop->getHeader()) 1023 << "Found unsupported PHI node in loop exit."; 1024 }); 1025 return false; 1026 } 1027 1028 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { 1029 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); 1030 ORE->emit([&]() { 1031 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", 1032 OuterLoop->getStartLoc(), 1033 OuterLoop->getHeader()) 1034 << "Found unsupported PHI node in loop exit."; 1035 }); 1036 return false; 1037 } 1038 1039 return true; 1040 } 1041 1042 int LoopInterchangeProfitability::getInstrOrderCost() { 1043 unsigned GoodOrder, BadOrder; 1044 BadOrder = GoodOrder = 0; 1045 for (BasicBlock *BB : InnerLoop->blocks()) { 1046 for (Instruction &Ins : *BB) { 1047 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { 1048 unsigned NumOp = GEP->getNumOperands(); 1049 bool FoundInnerInduction = false; 1050 bool FoundOuterInduction = false; 1051 for (unsigned i = 0; i < NumOp; ++i) { 1052 // Skip operands that are not SCEV-able. 1053 if (!SE->isSCEVable(GEP->getOperand(i)->getType())) 1054 continue; 1055 1056 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); 1057 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal); 1058 if (!AR) 1059 continue; 1060 1061 // If we find the inner induction after an outer induction e.g. 1062 // for(int i=0;i<N;i++) 1063 // for(int j=0;j<N;j++) 1064 // A[i][j] = A[i-1][j-1]+k; 1065 // then it is a good order. 1066 if (AR->getLoop() == InnerLoop) { 1067 // We found an InnerLoop induction after OuterLoop induction. It is 1068 // a good order. 1069 FoundInnerInduction = true; 1070 if (FoundOuterInduction) { 1071 GoodOrder++; 1072 break; 1073 } 1074 } 1075 // If we find the outer induction after an inner induction e.g. 1076 // for(int i=0;i<N;i++) 1077 // for(int j=0;j<N;j++) 1078 // A[j][i] = A[j-1][i-1]+k; 1079 // then it is a bad order. 1080 if (AR->getLoop() == OuterLoop) { 1081 // We found an OuterLoop induction after InnerLoop induction. It is 1082 // a bad order. 1083 FoundOuterInduction = true; 1084 if (FoundInnerInduction) { 1085 BadOrder++; 1086 break; 1087 } 1088 } 1089 } 1090 } 1091 } 1092 } 1093 return GoodOrder - BadOrder; 1094 } 1095 1096 std::optional<bool> 1097 LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis( 1098 const DenseMap<const Loop *, unsigned> &CostMap, 1099 std::unique_ptr<CacheCost> &CC) { 1100 // This is the new cost model returned from loop cache analysis. 1101 // A smaller index means the loop should be placed an outer loop, and vice 1102 // versa. 1103 if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) { 1104 unsigned InnerIndex = 0, OuterIndex = 0; 1105 InnerIndex = CostMap.find(InnerLoop)->second; 1106 OuterIndex = CostMap.find(OuterLoop)->second; 1107 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex 1108 << ", OuterIndex = " << OuterIndex << "\n"); 1109 if (InnerIndex < OuterIndex) 1110 return std::optional<bool>(true); 1111 assert(InnerIndex != OuterIndex && "CostMap should assign unique " 1112 "numbers to each loop"); 1113 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop)) 1114 return std::nullopt; 1115 return std::optional<bool>(false); 1116 } 1117 return std::nullopt; 1118 } 1119 1120 std::optional<bool> 1121 LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { 1122 // Legacy cost model: this is rough cost estimation algorithm. It counts the 1123 // good and bad order of induction variables in the instruction and allows 1124 // reordering if number of bad orders is more than good. 1125 int Cost = getInstrOrderCost(); 1126 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); 1127 if (Cost < 0 && Cost < LoopInterchangeCostThreshold) 1128 return std::optional<bool>(true); 1129 1130 return std::nullopt; 1131 } 1132 1133 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization( 1134 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { 1135 for (auto &Row : DepMatrix) { 1136 // If the inner loop is loop independent or doesn't carry any dependency 1137 // it is not profitable to move this to outer position, since we are 1138 // likely able to do inner loop vectorization already. 1139 if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=') 1140 return std::optional<bool>(false); 1141 1142 // If the outer loop is not loop independent it is not profitable to move 1143 // this to inner position, since doing so would not enable inner loop 1144 // parallelism. 1145 if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=') 1146 return std::optional<bool>(false); 1147 } 1148 // If inner loop has dependence and outer loop is loop independent then it 1149 // is/ profitable to interchange to enable inner loop parallelism. 1150 // If there are no dependences, interchanging will not improve anything. 1151 return std::optional<bool>(!DepMatrix.empty()); 1152 } 1153 1154 bool LoopInterchangeProfitability::isProfitable( 1155 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, 1156 unsigned OuterLoopId, CharMatrix &DepMatrix, 1157 const DenseMap<const Loop *, unsigned> &CostMap, 1158 std::unique_ptr<CacheCost> &CC) { 1159 // isProfitable() is structured to avoid endless loop interchange. 1160 // If loop cache analysis could decide the profitability then, 1161 // profitability check will stop and return the analysis result. 1162 // If cache analysis failed to analyze the loopnest (e.g., 1163 // due to delinearization issues) then only check whether it is 1164 // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to 1165 // analysis the profitability then only, isProfitableForVectorization 1166 // will decide. 1167 std::optional<bool> shouldInterchange = 1168 isProfitablePerLoopCacheAnalysis(CostMap, CC); 1169 if (!shouldInterchange.has_value()) { 1170 shouldInterchange = isProfitablePerInstrOrderCost(); 1171 if (!shouldInterchange.has_value()) 1172 shouldInterchange = 1173 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); 1174 } 1175 if (!shouldInterchange.has_value()) { 1176 ORE->emit([&]() { 1177 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", 1178 InnerLoop->getStartLoc(), 1179 InnerLoop->getHeader()) 1180 << "Insufficient information to calculate the cost of loop for " 1181 "interchange."; 1182 }); 1183 return false; 1184 } else if (!shouldInterchange.value()) { 1185 ORE->emit([&]() { 1186 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", 1187 InnerLoop->getStartLoc(), 1188 InnerLoop->getHeader()) 1189 << "Interchanging loops is not considered to improve cache " 1190 "locality nor vectorization."; 1191 }); 1192 return false; 1193 } 1194 return true; 1195 } 1196 1197 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, 1198 Loop *InnerLoop) { 1199 for (Loop *L : *OuterLoop) 1200 if (L == InnerLoop) { 1201 OuterLoop->removeChildLoop(L); 1202 return; 1203 } 1204 llvm_unreachable("Couldn't find loop"); 1205 } 1206 1207 /// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the 1208 /// new inner and outer loop after interchanging: NewInner is the original 1209 /// outer loop and NewOuter is the original inner loop. 1210 /// 1211 /// Before interchanging, we have the following structure 1212 /// Outer preheader 1213 // Outer header 1214 // Inner preheader 1215 // Inner header 1216 // Inner body 1217 // Inner latch 1218 // outer bbs 1219 // Outer latch 1220 // 1221 // After interchanging: 1222 // Inner preheader 1223 // Inner header 1224 // Outer preheader 1225 // Outer header 1226 // Inner body 1227 // outer bbs 1228 // Outer latch 1229 // Inner latch 1230 void LoopInterchangeTransform::restructureLoops( 1231 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader, 1232 BasicBlock *OrigOuterPreHeader) { 1233 Loop *OuterLoopParent = OuterLoop->getParentLoop(); 1234 // The original inner loop preheader moves from the new inner loop to 1235 // the parent loop, if there is one. 1236 NewInner->removeBlockFromLoop(OrigInnerPreHeader); 1237 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent); 1238 1239 // Switch the loop levels. 1240 if (OuterLoopParent) { 1241 // Remove the loop from its parent loop. 1242 removeChildLoop(OuterLoopParent, NewInner); 1243 removeChildLoop(NewInner, NewOuter); 1244 OuterLoopParent->addChildLoop(NewOuter); 1245 } else { 1246 removeChildLoop(NewInner, NewOuter); 1247 LI->changeTopLevelLoop(NewInner, NewOuter); 1248 } 1249 while (!NewOuter->isInnermost()) 1250 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin())); 1251 NewOuter->addChildLoop(NewInner); 1252 1253 // BBs from the original inner loop. 1254 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks()); 1255 1256 // Add BBs from the original outer loop to the original inner loop (excluding 1257 // BBs already in inner loop) 1258 for (BasicBlock *BB : NewInner->blocks()) 1259 if (LI->getLoopFor(BB) == NewInner) 1260 NewOuter->addBlockEntry(BB); 1261 1262 // Now remove inner loop header and latch from the new inner loop and move 1263 // other BBs (the loop body) to the new inner loop. 1264 BasicBlock *OuterHeader = NewOuter->getHeader(); 1265 BasicBlock *OuterLatch = NewOuter->getLoopLatch(); 1266 for (BasicBlock *BB : OrigInnerBBs) { 1267 // Nothing will change for BBs in child loops. 1268 if (LI->getLoopFor(BB) != NewOuter) 1269 continue; 1270 // Remove the new outer loop header and latch from the new inner loop. 1271 if (BB == OuterHeader || BB == OuterLatch) 1272 NewInner->removeBlockFromLoop(BB); 1273 else 1274 LI->changeLoopFor(BB, NewInner); 1275 } 1276 1277 // The preheader of the original outer loop becomes part of the new 1278 // outer loop. 1279 NewOuter->addBlockEntry(OrigOuterPreHeader); 1280 LI->changeLoopFor(OrigOuterPreHeader, NewOuter); 1281 1282 // Tell SE that we move the loops around. 1283 SE->forgetLoop(NewOuter); 1284 } 1285 1286 bool LoopInterchangeTransform::transform() { 1287 bool Transformed = false; 1288 1289 if (InnerLoop->getSubLoops().empty()) { 1290 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1291 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n"); 1292 auto &InductionPHIs = LIL.getInnerLoopInductions(); 1293 if (InductionPHIs.empty()) { 1294 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); 1295 return false; 1296 } 1297 1298 SmallVector<Instruction *, 8> InnerIndexVarList; 1299 for (PHINode *CurInductionPHI : InductionPHIs) { 1300 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) 1301 InnerIndexVarList.push_back( 1302 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1))); 1303 else 1304 InnerIndexVarList.push_back( 1305 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0))); 1306 } 1307 1308 // Create a new latch block for the inner loop. We split at the 1309 // current latch's terminator and then move the condition and all 1310 // operands that are not either loop-invariant or the induction PHI into the 1311 // new latch block. 1312 BasicBlock *NewLatch = 1313 SplitBlock(InnerLoop->getLoopLatch(), 1314 InnerLoop->getLoopLatch()->getTerminator(), DT, LI); 1315 1316 SmallSetVector<Instruction *, 4> WorkList; 1317 unsigned i = 0; 1318 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() { 1319 for (; i < WorkList.size(); i++) { 1320 // Duplicate instruction and move it the new latch. Update uses that 1321 // have been moved. 1322 Instruction *NewI = WorkList[i]->clone(); 1323 NewI->insertBefore(NewLatch->getFirstNonPHI()); 1324 assert(!NewI->mayHaveSideEffects() && 1325 "Moving instructions with side-effects may change behavior of " 1326 "the loop nest!"); 1327 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) { 1328 Instruction *UserI = cast<Instruction>(U.getUser()); 1329 if (!InnerLoop->contains(UserI->getParent()) || 1330 UserI->getParent() == NewLatch || 1331 llvm::is_contained(InductionPHIs, UserI)) 1332 U.set(NewI); 1333 } 1334 // Add operands of moved instruction to the worklist, except if they are 1335 // outside the inner loop or are the induction PHI. 1336 for (Value *Op : WorkList[i]->operands()) { 1337 Instruction *OpI = dyn_cast<Instruction>(Op); 1338 if (!OpI || 1339 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop || 1340 llvm::is_contained(InductionPHIs, OpI)) 1341 continue; 1342 WorkList.insert(OpI); 1343 } 1344 } 1345 }; 1346 1347 // FIXME: Should we interchange when we have a constant condition? 1348 Instruction *CondI = dyn_cast<Instruction>( 1349 cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator()) 1350 ->getCondition()); 1351 if (CondI) 1352 WorkList.insert(CondI); 1353 MoveInstructions(); 1354 for (Instruction *InnerIndexVar : InnerIndexVarList) 1355 WorkList.insert(cast<Instruction>(InnerIndexVar)); 1356 MoveInstructions(); 1357 } 1358 1359 // Ensure the inner loop phi nodes have a separate basic block. 1360 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1361 if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) { 1362 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); 1363 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n"); 1364 } 1365 1366 // Instructions in the original inner loop preheader may depend on values 1367 // defined in the outer loop header. Move them there, because the original 1368 // inner loop preheader will become the entry into the interchanged loop nest. 1369 // Currently we move all instructions and rely on LICM to move invariant 1370 // instructions outside the loop nest. 1371 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1372 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1373 if (InnerLoopPreHeader != OuterLoopHeader) { 1374 SmallPtrSet<Instruction *, 4> NeedsMoving; 1375 for (Instruction &I : 1376 make_early_inc_range(make_range(InnerLoopPreHeader->begin(), 1377 std::prev(InnerLoopPreHeader->end())))) 1378 I.moveBefore(OuterLoopHeader->getTerminator()); 1379 } 1380 1381 Transformed |= adjustLoopLinks(); 1382 if (!Transformed) { 1383 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n"); 1384 return false; 1385 } 1386 1387 return true; 1388 } 1389 1390 /// \brief Move all instructions except the terminator from FromBB right before 1391 /// InsertBefore 1392 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { 1393 BasicBlock *ToBB = InsertBefore->getParent(); 1394 1395 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(), 1396 FromBB->getTerminator()->getIterator()); 1397 } 1398 1399 /// Swap instructions between \p BB1 and \p BB2 but keep terminators intact. 1400 static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) { 1401 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them 1402 // from BB1 afterwards. 1403 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; }); 1404 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end())); 1405 for (Instruction *I : TempInstrs) 1406 I->removeFromParent(); 1407 1408 // Move instructions from BB2 to BB1. 1409 moveBBContents(BB2, BB1->getTerminator()); 1410 1411 // Move instructions from TempInstrs to BB2. 1412 for (Instruction *I : TempInstrs) 1413 I->insertBefore(BB2->getTerminator()); 1414 } 1415 1416 // Update BI to jump to NewBB instead of OldBB. Records updates to the 1417 // dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that 1418 // \p OldBB is exactly once in BI's successor list. 1419 static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, 1420 BasicBlock *NewBB, 1421 std::vector<DominatorTree::UpdateType> &DTUpdates, 1422 bool MustUpdateOnce = true) { 1423 assert((!MustUpdateOnce || 1424 llvm::count_if(successors(BI), 1425 [OldBB](BasicBlock *BB) { 1426 return BB == OldBB; 1427 }) == 1) && "BI must jump to OldBB exactly once."); 1428 bool Changed = false; 1429 for (Use &Op : BI->operands()) 1430 if (Op == OldBB) { 1431 Op.set(NewBB); 1432 Changed = true; 1433 } 1434 1435 if (Changed) { 1436 DTUpdates.push_back( 1437 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); 1438 DTUpdates.push_back( 1439 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); 1440 } 1441 assert(Changed && "Expected a successor to be updated"); 1442 } 1443 1444 // Move Lcssa PHIs to the right place. 1445 static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, 1446 BasicBlock *InnerLatch, BasicBlock *OuterHeader, 1447 BasicBlock *OuterLatch, BasicBlock *OuterExit, 1448 Loop *InnerLoop, LoopInfo *LI) { 1449 1450 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are 1451 // defined either in the header or latch. Those blocks will become header and 1452 // latch of the new outer loop, and the only possible users can PHI nodes 1453 // in the exit block of the loop nest or the outer loop header (reduction 1454 // PHIs, in that case, the incoming value must be defined in the inner loop 1455 // header). We can just substitute the user with the incoming value and remove 1456 // the PHI. 1457 for (PHINode &P : make_early_inc_range(InnerExit->phis())) { 1458 assert(P.getNumIncomingValues() == 1 && 1459 "Only loops with a single exit are supported!"); 1460 1461 // Incoming values are guaranteed be instructions currently. 1462 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch)); 1463 // In case of multi-level nested loops, follow LCSSA to find the incoming 1464 // value defined from the innermost loop. 1465 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI)); 1466 // Skip phis with incoming values from the inner loop body, excluding the 1467 // header and latch. 1468 if (IncIInnerMost->getParent() != InnerLatch && 1469 IncIInnerMost->getParent() != InnerHeader) 1470 continue; 1471 1472 assert(all_of(P.users(), 1473 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) { 1474 return (cast<PHINode>(U)->getParent() == OuterHeader && 1475 IncI->getParent() == InnerHeader) || 1476 cast<PHINode>(U)->getParent() == OuterExit; 1477 }) && 1478 "Can only replace phis iff the uses are in the loop nest exit or " 1479 "the incoming value is defined in the inner header (it will " 1480 "dominate all loop blocks after interchanging)"); 1481 P.replaceAllUsesWith(IncI); 1482 P.eraseFromParent(); 1483 } 1484 1485 SmallVector<PHINode *, 8> LcssaInnerExit; 1486 for (PHINode &P : InnerExit->phis()) 1487 LcssaInnerExit.push_back(&P); 1488 1489 SmallVector<PHINode *, 8> LcssaInnerLatch; 1490 for (PHINode &P : InnerLatch->phis()) 1491 LcssaInnerLatch.push_back(&P); 1492 1493 // Lcssa PHIs for values used outside the inner loop are in InnerExit. 1494 // If a PHI node has users outside of InnerExit, it has a use outside the 1495 // interchanged loop and we have to preserve it. We move these to 1496 // InnerLatch, which will become the new exit block for the innermost 1497 // loop after interchanging. 1498 for (PHINode *P : LcssaInnerExit) 1499 P->moveBefore(InnerLatch->getFirstNonPHI()); 1500 1501 // If the inner loop latch contains LCSSA PHIs, those come from a child loop 1502 // and we have to move them to the new inner latch. 1503 for (PHINode *P : LcssaInnerLatch) 1504 P->moveBefore(InnerExit->getFirstNonPHI()); 1505 1506 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have 1507 // incoming values defined in the outer loop, we have to add a new PHI 1508 // in the inner loop latch, which became the exit block of the outer loop, 1509 // after interchanging. 1510 if (OuterExit) { 1511 for (PHINode &P : OuterExit->phis()) { 1512 if (P.getNumIncomingValues() != 1) 1513 continue; 1514 // Skip Phis with incoming values defined in the inner loop. Those should 1515 // already have been updated. 1516 auto I = dyn_cast<Instruction>(P.getIncomingValue(0)); 1517 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop) 1518 continue; 1519 1520 PHINode *NewPhi = dyn_cast<PHINode>(P.clone()); 1521 NewPhi->setIncomingValue(0, P.getIncomingValue(0)); 1522 NewPhi->setIncomingBlock(0, OuterLatch); 1523 // We might have incoming edges from other BBs, i.e., the original outer 1524 // header. 1525 for (auto *Pred : predecessors(InnerLatch)) { 1526 if (Pred == OuterLatch) 1527 continue; 1528 NewPhi->addIncoming(P.getIncomingValue(0), Pred); 1529 } 1530 NewPhi->insertBefore(InnerLatch->getFirstNonPHI()); 1531 P.setIncomingValue(0, NewPhi); 1532 } 1533 } 1534 1535 // Now adjust the incoming blocks for the LCSSA PHIs. 1536 // For PHIs moved from Inner's exit block, we need to replace Inner's latch 1537 // with the new latch. 1538 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch); 1539 } 1540 1541 bool LoopInterchangeTransform::adjustLoopBranches() { 1542 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n"); 1543 std::vector<DominatorTree::UpdateType> DTUpdates; 1544 1545 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1546 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1547 1548 assert(OuterLoopPreHeader != OuterLoop->getHeader() && 1549 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader && 1550 InnerLoopPreHeader && "Guaranteed by loop-simplify form"); 1551 // Ensure that both preheaders do not contain PHI nodes and have single 1552 // predecessors. This allows us to move them easily. We use 1553 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing 1554 // preheaders do not satisfy those conditions. 1555 if (isa<PHINode>(OuterLoopPreHeader->begin()) || 1556 !OuterLoopPreHeader->getUniquePredecessor()) 1557 OuterLoopPreHeader = 1558 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true); 1559 if (InnerLoopPreHeader == OuterLoop->getHeader()) 1560 InnerLoopPreHeader = 1561 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true); 1562 1563 // Adjust the loop preheader 1564 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1565 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1566 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1567 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 1568 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); 1569 BasicBlock *InnerLoopLatchPredecessor = 1570 InnerLoopLatch->getUniquePredecessor(); 1571 BasicBlock *InnerLoopLatchSuccessor; 1572 BasicBlock *OuterLoopLatchSuccessor; 1573 1574 BranchInst *OuterLoopLatchBI = 1575 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator()); 1576 BranchInst *InnerLoopLatchBI = 1577 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); 1578 BranchInst *OuterLoopHeaderBI = 1579 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 1580 BranchInst *InnerLoopHeaderBI = 1581 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator()); 1582 1583 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || 1584 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || 1585 !InnerLoopHeaderBI) 1586 return false; 1587 1588 BranchInst *InnerLoopLatchPredecessorBI = 1589 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator()); 1590 BranchInst *OuterLoopPredecessorBI = 1591 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator()); 1592 1593 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) 1594 return false; 1595 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); 1596 if (!InnerLoopHeaderSuccessor) 1597 return false; 1598 1599 // Adjust Loop Preheader and headers. 1600 // The branches in the outer loop predecessor and the outer loop header can 1601 // be unconditional branches or conditional branches with duplicates. Consider 1602 // this when updating the successors. 1603 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, 1604 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); 1605 // The outer loop header might or might not branch to the outer latch. 1606 // We are guaranteed to branch to the inner loop preheader. 1607 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) { 1608 // In this case the outerLoopHeader should branch to the InnerLoopLatch. 1609 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch, 1610 DTUpdates, 1611 /*MustUpdateOnce=*/false); 1612 } 1613 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, 1614 InnerLoopHeaderSuccessor, DTUpdates, 1615 /*MustUpdateOnce=*/false); 1616 1617 // Adjust reduction PHI's now that the incoming block has changed. 1618 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader, 1619 OuterLoopHeader); 1620 1621 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor, 1622 OuterLoopPreHeader, DTUpdates); 1623 1624 // -------------Adjust loop latches----------- 1625 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) 1626 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1); 1627 else 1628 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); 1629 1630 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch, 1631 InnerLoopLatchSuccessor, DTUpdates); 1632 1633 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) 1634 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); 1635 else 1636 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); 1637 1638 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor, 1639 OuterLoopLatchSuccessor, DTUpdates); 1640 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch, 1641 DTUpdates); 1642 1643 DT->applyUpdates(DTUpdates); 1644 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader, 1645 OuterLoopPreHeader); 1646 1647 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch, 1648 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(), 1649 InnerLoop, LI); 1650 // For PHIs in the exit block of the outer loop, outer's latch has been 1651 // replaced by Inners'. 1652 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); 1653 1654 auto &OuterInnerReductions = LIL.getOuterInnerReductions(); 1655 // Now update the reduction PHIs in the inner and outer loop headers. 1656 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; 1657 for (PHINode &PHI : InnerLoopHeader->phis()) 1658 if (OuterInnerReductions.contains(&PHI)) 1659 InnerLoopPHIs.push_back(&PHI); 1660 1661 for (PHINode &PHI : OuterLoopHeader->phis()) 1662 if (OuterInnerReductions.contains(&PHI)) 1663 OuterLoopPHIs.push_back(&PHI); 1664 1665 // Now move the remaining reduction PHIs from outer to inner loop header and 1666 // vice versa. The PHI nodes must be part of a reduction across the inner and 1667 // outer loop and all the remains to do is and updating the incoming blocks. 1668 for (PHINode *PHI : OuterLoopPHIs) { 1669 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump();); 1670 PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); 1671 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); 1672 } 1673 for (PHINode *PHI : InnerLoopPHIs) { 1674 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump();); 1675 PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); 1676 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); 1677 } 1678 1679 // Update the incoming blocks for moved PHI nodes. 1680 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader); 1681 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch); 1682 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader); 1683 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); 1684 1685 // Values defined in the outer loop header could be used in the inner loop 1686 // latch. In that case, we need to create LCSSA phis for them, because after 1687 // interchanging they will be defined in the new inner loop and used in the 1688 // new outer loop. 1689 IRBuilder<> Builder(OuterLoopHeader->getContext()); 1690 SmallVector<Instruction *, 4> MayNeedLCSSAPhis; 1691 for (Instruction &I : 1692 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end()))) 1693 MayNeedLCSSAPhis.push_back(&I); 1694 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE, Builder); 1695 1696 return true; 1697 } 1698 1699 bool LoopInterchangeTransform::adjustLoopLinks() { 1700 // Adjust all branches in the inner and outer loop. 1701 bool Changed = adjustLoopBranches(); 1702 if (Changed) { 1703 // We have interchanged the preheaders so we need to interchange the data in 1704 // the preheaders as well. This is because the content of the inner 1705 // preheader was previously executed inside the outer loop. 1706 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1707 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1708 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader); 1709 } 1710 return Changed; 1711 } 1712 1713 PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, 1714 LoopAnalysisManager &AM, 1715 LoopStandardAnalysisResults &AR, 1716 LPMUpdater &U) { 1717 Function &F = *LN.getParent(); 1718 1719 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); 1720 std::unique_ptr<CacheCost> CC = 1721 CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI); 1722 OptimizationRemarkEmitter ORE(&F); 1723 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) 1724 return PreservedAnalyses::all(); 1725 U.markLoopNestChanged(true); 1726 return getLoopPassPreservedAnalyses(); 1727 } 1728