xref: /llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp (revision 75b33d6bd518f6502a63f96e79c0f4be3691b1d5)
1 //===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This Pass handles loop interchange transform.
10 // This pass interchanges loops to provide a more cache-friendly memory access
11 // patterns.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Transforms/Scalar/LoopInterchange.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/Analysis/DependenceAnalysis.h"
21 #include "llvm/Analysis/LoopCacheAnalysis.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/LoopNestAnalysis.h"
24 #include "llvm/Analysis/LoopPass.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/ScalarEvolution.h"
27 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/Constants.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Dominators.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/IRBuilder.h"
34 #include "llvm/IR/InstrTypes.h"
35 #include "llvm/IR/Instruction.h"
36 #include "llvm/IR/Instructions.h"
37 #include "llvm/IR/User.h"
38 #include "llvm/IR/Value.h"
39 #include "llvm/InitializePasses.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/Casting.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include "llvm/Transforms/Scalar.h"
47 #include "llvm/Transforms/Scalar/LoopPassManager.h"
48 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
49 #include "llvm/Transforms/Utils/LoopUtils.h"
50 #include <cassert>
51 #include <utility>
52 #include <vector>
53 
54 using namespace llvm;
55 
56 #define DEBUG_TYPE "loop-interchange"
57 
58 STATISTIC(LoopsInterchanged, "Number of loops interchanged");
59 
60 static cl::opt<int> LoopInterchangeCostThreshold(
61     "loop-interchange-threshold", cl::init(0), cl::Hidden,
62     cl::desc("Interchange if you gain more than this number"));
63 
64 namespace {
65 
66 using LoopVector = SmallVector<Loop *, 8>;
67 
68 // TODO: Check if we can use a sparse matrix here.
69 using CharMatrix = std::vector<std::vector<char>>;
70 
71 } // end anonymous namespace
72 
73 // Maximum number of dependencies that can be handled in the dependency matrix.
74 static const unsigned MaxMemInstrCount = 100;
75 
76 // Maximum loop depth supported.
77 static const unsigned MaxLoopNestDepth = 10;
78 
79 #ifdef DUMP_DEP_MATRICIES
80 static void printDepMatrix(CharMatrix &DepMatrix) {
81   for (auto &Row : DepMatrix) {
82     for (auto D : Row)
83       LLVM_DEBUG(dbgs() << D << " ");
84     LLVM_DEBUG(dbgs() << "\n");
85   }
86 }
87 #endif
88 
89 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
90                                      Loop *L, DependenceInfo *DI,
91                                      ScalarEvolution *SE) {
92   using ValueVector = SmallVector<Value *, 16>;
93 
94   ValueVector MemInstr;
95 
96   // For each block.
97   for (BasicBlock *BB : L->blocks()) {
98     // Scan the BB and collect legal loads and stores.
99     for (Instruction &I : *BB) {
100       if (!isa<Instruction>(I))
101         return false;
102       if (auto *Ld = dyn_cast<LoadInst>(&I)) {
103         if (!Ld->isSimple())
104           return false;
105         MemInstr.push_back(&I);
106       } else if (auto *St = dyn_cast<StoreInst>(&I)) {
107         if (!St->isSimple())
108           return false;
109         MemInstr.push_back(&I);
110       }
111     }
112   }
113 
114   LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
115                     << " Loads and Stores to analyze\n");
116 
117   ValueVector::iterator I, IE, J, JE;
118 
119   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
120     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
121       std::vector<char> Dep;
122       Instruction *Src = cast<Instruction>(*I);
123       Instruction *Dst = cast<Instruction>(*J);
124       // Ignore Input dependencies.
125       if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
126         continue;
127       // Track Output, Flow, and Anti dependencies.
128       if (auto D = DI->depends(Src, Dst, true)) {
129         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
130         // If the direction vector is negative, normalize it to
131         // make it non-negative.
132         if (D->normalize(SE))
133           LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
134         LLVM_DEBUG(StringRef DepType =
135                        D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
136                    dbgs() << "Found " << DepType
137                           << " dependency between Src and Dst\n"
138                           << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
139         unsigned Levels = D->getLevels();
140         char Direction;
141         for (unsigned II = 1; II <= Levels; ++II) {
142           if (D->isScalar(II)) {
143             Direction = 'S';
144             Dep.push_back(Direction);
145           } else {
146             unsigned Dir = D->getDirection(II);
147             if (Dir == Dependence::DVEntry::LT ||
148                 Dir == Dependence::DVEntry::LE)
149               Direction = '<';
150             else if (Dir == Dependence::DVEntry::GT ||
151                      Dir == Dependence::DVEntry::GE)
152               Direction = '>';
153             else if (Dir == Dependence::DVEntry::EQ)
154               Direction = '=';
155             else
156               Direction = '*';
157             Dep.push_back(Direction);
158           }
159         }
160         while (Dep.size() != Level) {
161           Dep.push_back('I');
162         }
163 
164         DepMatrix.push_back(Dep);
165         if (DepMatrix.size() > MaxMemInstrCount) {
166           LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
167                             << " dependencies inside loop\n");
168           return false;
169         }
170       }
171     }
172   }
173 
174   return true;
175 }
176 
177 // A loop is moved from index 'from' to an index 'to'. Update the Dependence
178 // matrix by exchanging the two columns.
179 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
180                                     unsigned ToIndx) {
181   for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I)
182     std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]);
183 }
184 
185 // Checks if no dependence exist in the dependency matrix in Row before Column.
186 static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
187                                  unsigned Column) {
188   for (unsigned i = 0; i < Column; ++i) {
189     if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
190         DepMatrix[Row][i] != 'I')
191       return false;
192   }
193   return true;
194 }
195 
196 static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
197                                 unsigned OuterLoopId, char InnerDep,
198                                 char OuterDep) {
199   if (InnerDep == OuterDep)
200     return true;
201 
202   // It is legal to interchange if and only if after interchange no row has a
203   // '>' direction as the leftmost non-'='.
204 
205   if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
206     return true;
207 
208   if (InnerDep == '<')
209     return true;
210 
211   if (InnerDep == '>') {
212     // If OuterLoopId represents outermost loop then interchanging will make the
213     // 1st dependency as '>'
214     if (OuterLoopId == 0)
215       return false;
216 
217     // If all dependencies before OuterloopId are '=','S'or 'I'. Then
218     // interchanging will result in this row having an outermost non '='
219     // dependency of '>'
220     if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
221       return true;
222   }
223 
224   return false;
225 }
226 
227 // Checks if it is legal to interchange 2 loops.
228 // [Theorem] A permutation of the loops in a perfect nest is legal if and only
229 // if the direction matrix, after the same permutation is applied to its
230 // columns, has no ">" direction as the leftmost non-"=" direction in any row.
231 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
232                                       unsigned InnerLoopId,
233                                       unsigned OuterLoopId) {
234   unsigned NumRows = DepMatrix.size();
235   // For each row check if it is valid to interchange.
236   for (unsigned Row = 0; Row < NumRows; ++Row) {
237     char InnerDep = DepMatrix[Row][InnerLoopId];
238     char OuterDep = DepMatrix[Row][OuterLoopId];
239     if (InnerDep == '*' || OuterDep == '*')
240       return false;
241     if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
242       return false;
243   }
244   return true;
245 }
246 
247 static void populateWorklist(Loop &L, LoopVector &LoopList) {
248   LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
249                     << L.getHeader()->getParent()->getName() << " Loop: %"
250                     << L.getHeader()->getName() << '\n');
251   assert(LoopList.empty() && "LoopList should initially be empty!");
252   Loop *CurrentLoop = &L;
253   const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
254   while (!Vec->empty()) {
255     // The current loop has multiple subloops in it hence it is not tightly
256     // nested.
257     // Discard all loops above it added into Worklist.
258     if (Vec->size() != 1) {
259       LoopList = {};
260       return;
261     }
262 
263     LoopList.push_back(CurrentLoop);
264     CurrentLoop = Vec->front();
265     Vec = &CurrentLoop->getSubLoops();
266   }
267   LoopList.push_back(CurrentLoop);
268 }
269 
270 namespace {
271 
272 /// LoopInterchangeLegality checks if it is legal to interchange the loop.
273 class LoopInterchangeLegality {
274 public:
275   LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
276                           OptimizationRemarkEmitter *ORE)
277       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
278 
279   /// Check if the loops can be interchanged.
280   bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
281                            CharMatrix &DepMatrix);
282 
283   /// Discover induction PHIs in the header of \p L. Induction
284   /// PHIs are added to \p Inductions.
285   bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
286 
287   /// Check if the loop structure is understood. We do not handle triangular
288   /// loops for now.
289   bool isLoopStructureUnderstood();
290 
291   bool currentLimitations();
292 
293   const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
294     return OuterInnerReductions;
295   }
296 
297   const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
298     return InnerLoopInductions;
299   }
300 
301 private:
302   bool tightlyNested(Loop *Outer, Loop *Inner);
303   bool containsUnsafeInstructions(BasicBlock *BB);
304 
305   /// Discover induction and reduction PHIs in the header of \p L. Induction
306   /// PHIs are added to \p Inductions, reductions are added to
307   /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
308   /// to be passed as \p InnerLoop.
309   bool findInductionAndReductions(Loop *L,
310                                   SmallVector<PHINode *, 8> &Inductions,
311                                   Loop *InnerLoop);
312 
313   Loop *OuterLoop;
314   Loop *InnerLoop;
315 
316   ScalarEvolution *SE;
317 
318   /// Interface to emit optimization remarks.
319   OptimizationRemarkEmitter *ORE;
320 
321   /// Set of reduction PHIs taking part of a reduction across the inner and
322   /// outer loop.
323   SmallPtrSet<PHINode *, 4> OuterInnerReductions;
324 
325   /// Set of inner loop induction PHIs
326   SmallVector<PHINode *, 8> InnerLoopInductions;
327 };
328 
329 /// LoopInterchangeProfitability checks if it is profitable to interchange the
330 /// loop.
331 class LoopInterchangeProfitability {
332 public:
333   LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
334                                OptimizationRemarkEmitter *ORE)
335       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
336 
337   /// Check if the loop interchange is profitable.
338   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
339                     unsigned InnerLoopId, unsigned OuterLoopId,
340                     CharMatrix &DepMatrix,
341                     const DenseMap<const Loop *, unsigned> &CostMap);
342 
343 private:
344   int getInstrOrderCost();
345 
346   Loop *OuterLoop;
347   Loop *InnerLoop;
348 
349   /// Scev analysis.
350   ScalarEvolution *SE;
351 
352   /// Interface to emit optimization remarks.
353   OptimizationRemarkEmitter *ORE;
354 };
355 
356 /// LoopInterchangeTransform interchanges the loop.
357 class LoopInterchangeTransform {
358 public:
359   LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
360                            LoopInfo *LI, DominatorTree *DT,
361                            const LoopInterchangeLegality &LIL)
362       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
363 
364   /// Interchange OuterLoop and InnerLoop.
365   bool transform();
366   void restructureLoops(Loop *NewInner, Loop *NewOuter,
367                         BasicBlock *OrigInnerPreHeader,
368                         BasicBlock *OrigOuterPreHeader);
369   void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
370 
371 private:
372   bool adjustLoopLinks();
373   bool adjustLoopBranches();
374 
375   Loop *OuterLoop;
376   Loop *InnerLoop;
377 
378   /// Scev analysis.
379   ScalarEvolution *SE;
380 
381   LoopInfo *LI;
382   DominatorTree *DT;
383 
384   const LoopInterchangeLegality &LIL;
385 };
386 
387 struct LoopInterchange {
388   ScalarEvolution *SE = nullptr;
389   LoopInfo *LI = nullptr;
390   DependenceInfo *DI = nullptr;
391   DominatorTree *DT = nullptr;
392   std::unique_ptr<CacheCost> CC = nullptr;
393 
394   /// Interface to emit optimization remarks.
395   OptimizationRemarkEmitter *ORE;
396 
397   LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
398                   DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
399                   OptimizationRemarkEmitter *ORE)
400       : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
401 
402   bool run(Loop *L) {
403     if (L->getParentLoop())
404       return false;
405     SmallVector<Loop *, 8> LoopList;
406     populateWorklist(*L, LoopList);
407     return processLoopList(LoopList);
408   }
409 
410   bool run(LoopNest &LN) {
411     SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end());
412     for (unsigned I = 1; I < LoopList.size(); ++I)
413       if (LoopList[I]->getParentLoop() != LoopList[I - 1])
414         return false;
415     return processLoopList(LoopList);
416   }
417 
418   bool isComputableLoopNest(ArrayRef<Loop *> LoopList) {
419     for (Loop *L : LoopList) {
420       const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
421       if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
422         LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
423         return false;
424       }
425       if (L->getNumBackEdges() != 1) {
426         LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
427         return false;
428       }
429       if (!L->getExitingBlock()) {
430         LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
431         return false;
432       }
433     }
434     return true;
435   }
436 
437   unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
438     // TODO: Add a better heuristic to select the loop to be interchanged based
439     // on the dependence matrix. Currently we select the innermost loop.
440     return LoopList.size() - 1;
441   }
442 
443   bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
444     bool Changed = false;
445     unsigned LoopNestDepth = LoopList.size();
446     if (LoopNestDepth < 2) {
447       LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
448       return false;
449     }
450     if (LoopNestDepth > MaxLoopNestDepth) {
451       LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
452                         << MaxLoopNestDepth << "\n");
453       return false;
454     }
455     if (!isComputableLoopNest(LoopList)) {
456       LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
457       return false;
458     }
459 
460     LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
461                       << "\n");
462 
463     CharMatrix DependencyMatrix;
464     Loop *OuterMostLoop = *(LoopList.begin());
465     if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
466                                   OuterMostLoop, DI, SE)) {
467       LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
468       return false;
469     }
470 #ifdef DUMP_DEP_MATRICIES
471     LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
472     printDepMatrix(DependencyMatrix);
473 #endif
474 
475     // Get the Outermost loop exit.
476     BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
477     if (!LoopNestExit) {
478       LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
479       return false;
480     }
481 
482     unsigned SelecLoopId = selectLoopForInterchange(LoopList);
483     // Obtain the loop vector returned from loop cache analysis beforehand,
484     // and put each <Loop, index> pair into a map for constant time query
485     // later. Indices in loop vector reprsent the optimal order of the
486     // corresponding loop, e.g., given a loopnest with depth N, index 0
487     // indicates the loop should be placed as the outermost loop and index N
488     // indicates the loop should be placed as the innermost loop.
489     //
490     // For the old pass manager CacheCost would be null.
491     DenseMap<const Loop *, unsigned> CostMap;
492     if (CC != nullptr) {
493       const auto &LoopCosts = CC->getLoopCosts();
494       for (unsigned i = 0; i < LoopCosts.size(); i++) {
495         CostMap[LoopCosts[i].first] = i;
496       }
497     }
498     // We try to achieve the globally optimal memory access for the loopnest,
499     // and do interchange based on a bubble-sort fasion. We start from
500     // the innermost loop, move it outwards to the best possible position
501     // and repeat this process.
502     for (unsigned j = SelecLoopId; j > 0; j--) {
503       bool ChangedPerIter = false;
504       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
505         bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
506                                         DependencyMatrix, CostMap);
507         if (!Interchanged)
508           continue;
509         // Loops interchanged, update LoopList accordingly.
510         std::swap(LoopList[i - 1], LoopList[i]);
511         // Update the DependencyMatrix
512         interChangeDependencies(DependencyMatrix, i, i - 1);
513 #ifdef DUMP_DEP_MATRICIES
514         LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
515         printDepMatrix(DependencyMatrix);
516 #endif
517         ChangedPerIter |= Interchanged;
518         Changed |= Interchanged;
519       }
520       // Early abort if there was no interchange during an entire round of
521       // moving loops outwards.
522       if (!ChangedPerIter)
523         break;
524     }
525     return Changed;
526   }
527 
528   bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
529                    unsigned OuterLoopId,
530                    std::vector<std::vector<char>> &DependencyMatrix,
531                    const DenseMap<const Loop *, unsigned> &CostMap) {
532     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
533                       << " and OuterLoopId = " << OuterLoopId << "\n");
534     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
535     if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
536       LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
537       return false;
538     }
539     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
540     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
541     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
542                           DependencyMatrix, CostMap)) {
543       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
544       return false;
545     }
546 
547     ORE->emit([&]() {
548       return OptimizationRemark(DEBUG_TYPE, "Interchanged",
549                                 InnerLoop->getStartLoc(),
550                                 InnerLoop->getHeader())
551              << "Loop interchanged with enclosing loop.";
552     });
553 
554     LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
555     LIT.transform();
556     LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
557     LoopsInterchanged++;
558 
559     llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
560     return true;
561   }
562 };
563 
564 } // end anonymous namespace
565 
566 bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
567   return any_of(*BB, [](const Instruction &I) {
568     return I.mayHaveSideEffects() || I.mayReadFromMemory();
569   });
570 }
571 
572 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
573   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
574   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
575   BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
576 
577   LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
578 
579   // A perfectly nested loop will not have any branch in between the outer and
580   // inner block i.e. outer header will branch to either inner preheader and
581   // outerloop latch.
582   BranchInst *OuterLoopHeaderBI =
583       dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
584   if (!OuterLoopHeaderBI)
585     return false;
586 
587   for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
588     if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
589         Succ != OuterLoopLatch)
590       return false;
591 
592   LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
593   // We do not have any basic block in between now make sure the outer header
594   // and outer loop latch doesn't contain any unsafe instructions.
595   if (containsUnsafeInstructions(OuterLoopHeader) ||
596       containsUnsafeInstructions(OuterLoopLatch))
597     return false;
598 
599   // Also make sure the inner loop preheader does not contain any unsafe
600   // instructions. Note that all instructions in the preheader will be moved to
601   // the outer loop header when interchanging.
602   if (InnerLoopPreHeader != OuterLoopHeader &&
603       containsUnsafeInstructions(InnerLoopPreHeader))
604     return false;
605 
606   BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
607   // Ensure the inner loop exit block flows to the outer loop latch possibly
608   // through empty blocks.
609   const BasicBlock &SuccInner =
610       LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
611   if (&SuccInner != OuterLoopLatch) {
612     LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
613                       << " does not lead to the outer loop latch.\n";);
614     return false;
615   }
616   // The inner loop exit block does flow to the outer loop latch and not some
617   // other BBs, now make sure it contains safe instructions, since it will be
618   // moved into the (new) inner loop after interchange.
619   if (containsUnsafeInstructions(InnerLoopExit))
620     return false;
621 
622   LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
623   // We have a perfect loop nest.
624   return true;
625 }
626 
627 bool LoopInterchangeLegality::isLoopStructureUnderstood() {
628   BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
629   for (PHINode *InnerInduction : InnerLoopInductions) {
630     unsigned Num = InnerInduction->getNumOperands();
631     for (unsigned i = 0; i < Num; ++i) {
632       Value *Val = InnerInduction->getOperand(i);
633       if (isa<Constant>(Val))
634         continue;
635       Instruction *I = dyn_cast<Instruction>(Val);
636       if (!I)
637         return false;
638       // TODO: Handle triangular loops.
639       // e.g. for(int i=0;i<N;i++)
640       //        for(int j=i;j<N;j++)
641       unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
642       if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
643               InnerLoopPreheader &&
644           !OuterLoop->isLoopInvariant(I)) {
645         return false;
646       }
647     }
648   }
649 
650   // TODO: Handle triangular loops of another form.
651   // e.g. for(int i=0;i<N;i++)
652   //        for(int j=0;j<i;j++)
653   // or,
654   //      for(int i=0;i<N;i++)
655   //        for(int j=0;j*i<N;j++)
656   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
657   BranchInst *InnerLoopLatchBI =
658       dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
659   if (!InnerLoopLatchBI->isConditional())
660     return false;
661   if (CmpInst *InnerLoopCmp =
662           dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
663     Value *Op0 = InnerLoopCmp->getOperand(0);
664     Value *Op1 = InnerLoopCmp->getOperand(1);
665 
666     // LHS and RHS of the inner loop exit condition, e.g.,
667     // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
668     Value *Left = nullptr;
669     Value *Right = nullptr;
670 
671     // Check if V only involves inner loop induction variable.
672     // Return true if V is InnerInduction, or a cast from
673     // InnerInduction, or a binary operator that involves
674     // InnerInduction and a constant.
675     std::function<bool(Value *)> IsPathToInnerIndVar;
676     IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
677       if (llvm::is_contained(InnerLoopInductions, V))
678         return true;
679       if (isa<Constant>(V))
680         return true;
681       const Instruction *I = dyn_cast<Instruction>(V);
682       if (!I)
683         return false;
684       if (isa<CastInst>(I))
685         return IsPathToInnerIndVar(I->getOperand(0));
686       if (isa<BinaryOperator>(I))
687         return IsPathToInnerIndVar(I->getOperand(0)) &&
688                IsPathToInnerIndVar(I->getOperand(1));
689       return false;
690     };
691 
692     // In case of multiple inner loop indvars, it is okay if LHS and RHS
693     // are both inner indvar related variables.
694     if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
695       return true;
696 
697     // Otherwise we check if the cmp instruction compares an inner indvar
698     // related variable (Left) with a outer loop invariant (Right).
699     if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
700       Left = Op0;
701       Right = Op1;
702     } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
703       Left = Op1;
704       Right = Op0;
705     }
706 
707     if (Left == nullptr)
708       return false;
709 
710     const SCEV *S = SE->getSCEV(Right);
711     if (!SE->isLoopInvariant(S, OuterLoop))
712       return false;
713   }
714 
715   return true;
716 }
717 
718 // If SV is a LCSSA PHI node with a single incoming value, return the incoming
719 // value.
720 static Value *followLCSSA(Value *SV) {
721   PHINode *PHI = dyn_cast<PHINode>(SV);
722   if (!PHI)
723     return SV;
724 
725   if (PHI->getNumIncomingValues() != 1)
726     return SV;
727   return followLCSSA(PHI->getIncomingValue(0));
728 }
729 
730 // Check V's users to see if it is involved in a reduction in L.
731 static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
732   // Reduction variables cannot be constants.
733   if (isa<Constant>(V))
734     return nullptr;
735 
736   for (Value *User : V->users()) {
737     if (PHINode *PHI = dyn_cast<PHINode>(User)) {
738       if (PHI->getNumIncomingValues() == 1)
739         continue;
740       RecurrenceDescriptor RD;
741       if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
742         // Detect floating point reduction only when it can be reordered.
743         if (RD.getExactFPMathInst() != nullptr)
744           return nullptr;
745         return PHI;
746       }
747       return nullptr;
748     }
749   }
750 
751   return nullptr;
752 }
753 
754 bool LoopInterchangeLegality::findInductionAndReductions(
755     Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
756   if (!L->getLoopLatch() || !L->getLoopPredecessor())
757     return false;
758   for (PHINode &PHI : L->getHeader()->phis()) {
759     RecurrenceDescriptor RD;
760     InductionDescriptor ID;
761     if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
762       Inductions.push_back(&PHI);
763     else {
764       // PHIs in inner loops need to be part of a reduction in the outer loop,
765       // discovered when checking the PHIs of the outer loop earlier.
766       if (!InnerLoop) {
767         if (!OuterInnerReductions.count(&PHI)) {
768           LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
769                                "across the outer loop.\n");
770           return false;
771         }
772       } else {
773         assert(PHI.getNumIncomingValues() == 2 &&
774                "Phis in loop header should have exactly 2 incoming values");
775         // Check if we have a PHI node in the outer loop that has a reduction
776         // result from the inner loop as an incoming value.
777         Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
778         PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
779         if (!InnerRedPhi ||
780             !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
781           LLVM_DEBUG(
782               dbgs()
783               << "Failed to recognize PHI as an induction or reduction.\n");
784           return false;
785         }
786         OuterInnerReductions.insert(&PHI);
787         OuterInnerReductions.insert(InnerRedPhi);
788       }
789     }
790   }
791   return true;
792 }
793 
794 // This function indicates the current limitations in the transform as a result
795 // of which we do not proceed.
796 bool LoopInterchangeLegality::currentLimitations() {
797   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
798 
799   // transform currently expects the loop latches to also be the exiting
800   // blocks.
801   if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
802       OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
803       !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
804       !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
805     LLVM_DEBUG(
806         dbgs() << "Loops where the latch is not the exiting block are not"
807                << " supported currently.\n");
808     ORE->emit([&]() {
809       return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
810                                       OuterLoop->getStartLoc(),
811                                       OuterLoop->getHeader())
812              << "Loops where the latch is not the exiting block cannot be"
813                 " interchange currently.";
814     });
815     return true;
816   }
817 
818   SmallVector<PHINode *, 8> Inductions;
819   if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
820     LLVM_DEBUG(
821         dbgs() << "Only outer loops with induction or reduction PHI nodes "
822                << "are supported currently.\n");
823     ORE->emit([&]() {
824       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
825                                       OuterLoop->getStartLoc(),
826                                       OuterLoop->getHeader())
827              << "Only outer loops with induction or reduction PHI nodes can be"
828                 " interchanged currently.";
829     });
830     return true;
831   }
832 
833   Inductions.clear();
834   // For multi-level loop nests, make sure that all phi nodes for inner loops
835   // at all levels can be recognized as a induction or reduction phi. Bail out
836   // if a phi node at a certain nesting level cannot be properly recognized.
837   Loop *CurLevelLoop = OuterLoop;
838   while (!CurLevelLoop->getSubLoops().empty()) {
839     // We already made sure that the loop nest is tightly nested.
840     CurLevelLoop = CurLevelLoop->getSubLoops().front();
841     if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
842       LLVM_DEBUG(
843           dbgs() << "Only inner loops with induction or reduction PHI nodes "
844                 << "are supported currently.\n");
845       ORE->emit([&]() {
846         return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
847                                         CurLevelLoop->getStartLoc(),
848                                         CurLevelLoop->getHeader())
849               << "Only inner loops with induction or reduction PHI nodes can be"
850                   " interchange currently.";
851       });
852       return true;
853     }
854   }
855 
856   // TODO: Triangular loops are not handled for now.
857   if (!isLoopStructureUnderstood()) {
858     LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
859     ORE->emit([&]() {
860       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
861                                       InnerLoop->getStartLoc(),
862                                       InnerLoop->getHeader())
863              << "Inner loop structure not understood currently.";
864     });
865     return true;
866   }
867 
868   return false;
869 }
870 
871 bool LoopInterchangeLegality::findInductions(
872     Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
873   for (PHINode &PHI : L->getHeader()->phis()) {
874     InductionDescriptor ID;
875     if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
876       Inductions.push_back(&PHI);
877   }
878   return !Inductions.empty();
879 }
880 
881 // We currently only support LCSSA PHI nodes in the inner loop exit, if their
882 // users are either reduction PHIs or PHIs outside the outer loop (which means
883 // the we are only interested in the final value after the loop).
884 static bool
885 areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
886                               SmallPtrSetImpl<PHINode *> &Reductions) {
887   BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
888   for (PHINode &PHI : InnerExit->phis()) {
889     // Reduction lcssa phi will have only 1 incoming block that from loop latch.
890     if (PHI.getNumIncomingValues() > 1)
891       return false;
892     if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
893           PHINode *PN = dyn_cast<PHINode>(U);
894           return !PN ||
895                  (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
896         })) {
897       return false;
898     }
899   }
900   return true;
901 }
902 
903 // We currently support LCSSA PHI nodes in the outer loop exit, if their
904 // incoming values do not come from the outer loop latch or if the
905 // outer loop latch has a single predecessor. In that case, the value will
906 // be available if both the inner and outer loop conditions are true, which
907 // will still be true after interchanging. If we have multiple predecessor,
908 // that may not be the case, e.g. because the outer loop latch may be executed
909 // if the inner loop is not executed.
910 static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
911   BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
912   for (PHINode &PHI : LoopNestExit->phis()) {
913     for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
914       Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
915       if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
916         continue;
917 
918       // The incoming value is defined in the outer loop latch. Currently we
919       // only support that in case the outer loop latch has a single predecessor.
920       // This guarantees that the outer loop latch is executed if and only if
921       // the inner loop is executed (because tightlyNested() guarantees that the
922       // outer loop header only branches to the inner loop or the outer loop
923       // latch).
924       // FIXME: We could weaken this logic and allow multiple predecessors,
925       //        if the values are produced outside the loop latch. We would need
926       //        additional logic to update the PHI nodes in the exit block as
927       //        well.
928       if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
929         return false;
930     }
931   }
932   return true;
933 }
934 
935 // In case of multi-level nested loops, it may occur that lcssa phis exist in
936 // the latch of InnerLoop, i.e., when defs of the incoming values are further
937 // inside the loopnest. Sometimes those incoming values are not available
938 // after interchange, since the original inner latch will become the new outer
939 // latch which may have predecessor paths that do not include those incoming
940 // values.
941 // TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
942 // multi-level loop nests.
943 static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
944   if (InnerLoop->getSubLoops().empty())
945     return true;
946   // If the original outer latch has only one predecessor, then values defined
947   // further inside the looploop, e.g., in the innermost loop, will be available
948   // at the new outer latch after interchange.
949   if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
950     return true;
951 
952   // The outer latch has more than one predecessors, i.e., the inner
953   // exit and the inner header.
954   // PHI nodes in the inner latch are lcssa phis where the incoming values
955   // are defined further inside the loopnest. Check if those phis are used
956   // in the original inner latch. If that is the case then bail out since
957   // those incoming values may not be available at the new outer latch.
958   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
959   for (PHINode &PHI : InnerLoopLatch->phis()) {
960     for (auto *U : PHI.users()) {
961       Instruction *UI = cast<Instruction>(U);
962       if (InnerLoopLatch == UI->getParent())
963         return false;
964     }
965   }
966   return true;
967 }
968 
969 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
970                                                   unsigned OuterLoopId,
971                                                   CharMatrix &DepMatrix) {
972   if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
973     LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
974                       << " and OuterLoopId = " << OuterLoopId
975                       << " due to dependence\n");
976     ORE->emit([&]() {
977       return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
978                                       InnerLoop->getStartLoc(),
979                                       InnerLoop->getHeader())
980              << "Cannot interchange loops due to dependences.";
981     });
982     return false;
983   }
984   // Check if outer and inner loop contain legal instructions only.
985   for (auto *BB : OuterLoop->blocks())
986     for (Instruction &I : BB->instructionsWithoutDebug())
987       if (CallInst *CI = dyn_cast<CallInst>(&I)) {
988         // readnone functions do not prevent interchanging.
989         if (CI->onlyWritesMemory())
990           continue;
991         LLVM_DEBUG(
992             dbgs() << "Loops with call instructions cannot be interchanged "
993                    << "safely.");
994         ORE->emit([&]() {
995           return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
996                                           CI->getDebugLoc(),
997                                           CI->getParent())
998                  << "Cannot interchange loops due to call instruction.";
999         });
1000 
1001         return false;
1002       }
1003 
1004   if (!findInductions(InnerLoop, InnerLoopInductions)) {
1005     LLVM_DEBUG(dbgs() << "Cound not find inner loop induction variables.\n");
1006     return false;
1007   }
1008 
1009   if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1010     LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1011     ORE->emit([&]() {
1012       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1013                                       InnerLoop->getStartLoc(),
1014                                       InnerLoop->getHeader())
1015              << "Cannot interchange loops because unsupported PHI nodes found "
1016                 "in inner loop latch.";
1017     });
1018     return false;
1019   }
1020 
1021   // TODO: The loops could not be interchanged due to current limitations in the
1022   // transform module.
1023   if (currentLimitations()) {
1024     LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1025     return false;
1026   }
1027 
1028   // Check if the loops are tightly nested.
1029   if (!tightlyNested(OuterLoop, InnerLoop)) {
1030     LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1031     ORE->emit([&]() {
1032       return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1033                                       InnerLoop->getStartLoc(),
1034                                       InnerLoop->getHeader())
1035              << "Cannot interchange loops because they are not tightly "
1036                 "nested.";
1037     });
1038     return false;
1039   }
1040 
1041   if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
1042                                      OuterInnerReductions)) {
1043     LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1044     ORE->emit([&]() {
1045       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1046                                       InnerLoop->getStartLoc(),
1047                                       InnerLoop->getHeader())
1048              << "Found unsupported PHI node in loop exit.";
1049     });
1050     return false;
1051   }
1052 
1053   if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1054     LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1055     ORE->emit([&]() {
1056       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1057                                       OuterLoop->getStartLoc(),
1058                                       OuterLoop->getHeader())
1059              << "Found unsupported PHI node in loop exit.";
1060     });
1061     return false;
1062   }
1063 
1064   return true;
1065 }
1066 
1067 int LoopInterchangeProfitability::getInstrOrderCost() {
1068   unsigned GoodOrder, BadOrder;
1069   BadOrder = GoodOrder = 0;
1070   for (BasicBlock *BB : InnerLoop->blocks()) {
1071     for (Instruction &Ins : *BB) {
1072       if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1073         unsigned NumOp = GEP->getNumOperands();
1074         bool FoundInnerInduction = false;
1075         bool FoundOuterInduction = false;
1076         for (unsigned i = 0; i < NumOp; ++i) {
1077           // Skip operands that are not SCEV-able.
1078           if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
1079             continue;
1080 
1081           const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1082           const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1083           if (!AR)
1084             continue;
1085 
1086           // If we find the inner induction after an outer induction e.g.
1087           // for(int i=0;i<N;i++)
1088           //   for(int j=0;j<N;j++)
1089           //     A[i][j] = A[i-1][j-1]+k;
1090           // then it is a good order.
1091           if (AR->getLoop() == InnerLoop) {
1092             // We found an InnerLoop induction after OuterLoop induction. It is
1093             // a good order.
1094             FoundInnerInduction = true;
1095             if (FoundOuterInduction) {
1096               GoodOrder++;
1097               break;
1098             }
1099           }
1100           // If we find the outer induction after an inner induction e.g.
1101           // for(int i=0;i<N;i++)
1102           //   for(int j=0;j<N;j++)
1103           //     A[j][i] = A[j-1][i-1]+k;
1104           // then it is a bad order.
1105           if (AR->getLoop() == OuterLoop) {
1106             // We found an OuterLoop induction after InnerLoop induction. It is
1107             // a bad order.
1108             FoundOuterInduction = true;
1109             if (FoundInnerInduction) {
1110               BadOrder++;
1111               break;
1112             }
1113           }
1114         }
1115       }
1116     }
1117   }
1118   return GoodOrder - BadOrder;
1119 }
1120 
1121 static bool isProfitableForVectorization(unsigned InnerLoopId,
1122                                          unsigned OuterLoopId,
1123                                          CharMatrix &DepMatrix) {
1124   // TODO: Improve this heuristic to catch more cases.
1125   // If the inner loop is loop independent or doesn't carry any dependency it is
1126   // profitable to move this to outer position.
1127   for (auto &Row : DepMatrix) {
1128     if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
1129       return false;
1130     // TODO: We need to improve this heuristic.
1131     if (Row[OuterLoopId] != '=')
1132       return false;
1133   }
1134   // If outer loop has dependence and inner loop is loop independent then it is
1135   // profitable to interchange to enable parallelism.
1136   // If there are no dependences, interchanging will not improve anything.
1137   return !DepMatrix.empty();
1138 }
1139 
1140 bool LoopInterchangeProfitability::isProfitable(
1141     const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1142     unsigned OuterLoopId, CharMatrix &DepMatrix,
1143     const DenseMap<const Loop *, unsigned> &CostMap) {
1144   // TODO: Remove the legacy cost model.
1145 
1146   // This is the new cost model returned from loop cache analysis.
1147   // A smaller index means the loop should be placed an outer loop, and vice
1148   // versa.
1149   if (CostMap.find(InnerLoop) != CostMap.end() &&
1150       CostMap.find(OuterLoop) != CostMap.end()) {
1151     unsigned InnerIndex = 0, OuterIndex = 0;
1152     InnerIndex = CostMap.find(InnerLoop)->second;
1153     OuterIndex = CostMap.find(OuterLoop)->second;
1154     LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1155                       << ", OuterIndex = " << OuterIndex << "\n");
1156     if (InnerIndex < OuterIndex)
1157       return true;
1158   } else {
1159     // Legacy cost model: this is rough cost estimation algorithm. It counts the
1160     // good and bad order of induction variables in the instruction and allows
1161     // reordering if number of bad orders is more than good.
1162     int Cost = getInstrOrderCost();
1163     LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1164     if (Cost < -LoopInterchangeCostThreshold)
1165       return true;
1166   }
1167 
1168   // It is not profitable as per current cache profitability model. But check if
1169   // we can move this loop outside to improve parallelism.
1170   if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
1171     return true;
1172 
1173   ORE->emit([&]() {
1174     return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1175                                     InnerLoop->getStartLoc(),
1176                                     InnerLoop->getHeader())
1177            << "Interchanging loops is too costly and it does not improve "
1178               "parallelism.";
1179   });
1180   return false;
1181 }
1182 
1183 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1184                                                Loop *InnerLoop) {
1185   for (Loop *L : *OuterLoop)
1186     if (L == InnerLoop) {
1187       OuterLoop->removeChildLoop(L);
1188       return;
1189     }
1190   llvm_unreachable("Couldn't find loop");
1191 }
1192 
1193 /// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1194 /// new inner and outer loop after interchanging: NewInner is the original
1195 /// outer loop and NewOuter is the original inner loop.
1196 ///
1197 /// Before interchanging, we have the following structure
1198 /// Outer preheader
1199 //  Outer header
1200 //    Inner preheader
1201 //    Inner header
1202 //      Inner body
1203 //      Inner latch
1204 //   outer bbs
1205 //   Outer latch
1206 //
1207 // After interchanging:
1208 // Inner preheader
1209 // Inner header
1210 //   Outer preheader
1211 //   Outer header
1212 //     Inner body
1213 //     outer bbs
1214 //     Outer latch
1215 //   Inner latch
1216 void LoopInterchangeTransform::restructureLoops(
1217     Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1218     BasicBlock *OrigOuterPreHeader) {
1219   Loop *OuterLoopParent = OuterLoop->getParentLoop();
1220   // The original inner loop preheader moves from the new inner loop to
1221   // the parent loop, if there is one.
1222   NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1223   LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1224 
1225   // Switch the loop levels.
1226   if (OuterLoopParent) {
1227     // Remove the loop from its parent loop.
1228     removeChildLoop(OuterLoopParent, NewInner);
1229     removeChildLoop(NewInner, NewOuter);
1230     OuterLoopParent->addChildLoop(NewOuter);
1231   } else {
1232     removeChildLoop(NewInner, NewOuter);
1233     LI->changeTopLevelLoop(NewInner, NewOuter);
1234   }
1235   while (!NewOuter->isInnermost())
1236     NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1237   NewOuter->addChildLoop(NewInner);
1238 
1239   // BBs from the original inner loop.
1240   SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1241 
1242   // Add BBs from the original outer loop to the original inner loop (excluding
1243   // BBs already in inner loop)
1244   for (BasicBlock *BB : NewInner->blocks())
1245     if (LI->getLoopFor(BB) == NewInner)
1246       NewOuter->addBlockEntry(BB);
1247 
1248   // Now remove inner loop header and latch from the new inner loop and move
1249   // other BBs (the loop body) to the new inner loop.
1250   BasicBlock *OuterHeader = NewOuter->getHeader();
1251   BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1252   for (BasicBlock *BB : OrigInnerBBs) {
1253     // Nothing will change for BBs in child loops.
1254     if (LI->getLoopFor(BB) != NewOuter)
1255       continue;
1256     // Remove the new outer loop header and latch from the new inner loop.
1257     if (BB == OuterHeader || BB == OuterLatch)
1258       NewInner->removeBlockFromLoop(BB);
1259     else
1260       LI->changeLoopFor(BB, NewInner);
1261   }
1262 
1263   // The preheader of the original outer loop becomes part of the new
1264   // outer loop.
1265   NewOuter->addBlockEntry(OrigOuterPreHeader);
1266   LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1267 
1268   // Tell SE that we move the loops around.
1269   SE->forgetLoop(NewOuter);
1270   SE->forgetLoop(NewInner);
1271 }
1272 
1273 bool LoopInterchangeTransform::transform() {
1274   bool Transformed = false;
1275 
1276   if (InnerLoop->getSubLoops().empty()) {
1277     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1278     LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1279     auto &InductionPHIs = LIL.getInnerLoopInductions();
1280     if (InductionPHIs.empty()) {
1281       LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1282       return false;
1283     }
1284 
1285     SmallVector<Instruction *, 8> InnerIndexVarList;
1286     for (PHINode *CurInductionPHI : InductionPHIs) {
1287       if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1288         InnerIndexVarList.push_back(
1289             dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1290       else
1291         InnerIndexVarList.push_back(
1292             dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1293     }
1294 
1295     // Create a new latch block for the inner loop. We split at the
1296     // current latch's terminator and then move the condition and all
1297     // operands that are not either loop-invariant or the induction PHI into the
1298     // new latch block.
1299     BasicBlock *NewLatch =
1300         SplitBlock(InnerLoop->getLoopLatch(),
1301                    InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1302 
1303     SmallSetVector<Instruction *, 4> WorkList;
1304     unsigned i = 0;
1305     auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1306       for (; i < WorkList.size(); i++) {
1307         // Duplicate instruction and move it the new latch. Update uses that
1308         // have been moved.
1309         Instruction *NewI = WorkList[i]->clone();
1310         NewI->insertBefore(NewLatch->getFirstNonPHI());
1311         assert(!NewI->mayHaveSideEffects() &&
1312                "Moving instructions with side-effects may change behavior of "
1313                "the loop nest!");
1314         for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1315           Instruction *UserI = cast<Instruction>(U.getUser());
1316           if (!InnerLoop->contains(UserI->getParent()) ||
1317               UserI->getParent() == NewLatch ||
1318               llvm::is_contained(InductionPHIs, UserI))
1319             U.set(NewI);
1320         }
1321         // Add operands of moved instruction to the worklist, except if they are
1322         // outside the inner loop or are the induction PHI.
1323         for (Value *Op : WorkList[i]->operands()) {
1324           Instruction *OpI = dyn_cast<Instruction>(Op);
1325           if (!OpI ||
1326               this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1327               llvm::is_contained(InductionPHIs, OpI))
1328             continue;
1329           WorkList.insert(OpI);
1330         }
1331       }
1332     };
1333 
1334     // FIXME: Should we interchange when we have a constant condition?
1335     Instruction *CondI = dyn_cast<Instruction>(
1336         cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
1337             ->getCondition());
1338     if (CondI)
1339       WorkList.insert(CondI);
1340     MoveInstructions();
1341     for (Instruction *InnerIndexVar : InnerIndexVarList)
1342       WorkList.insert(cast<Instruction>(InnerIndexVar));
1343     MoveInstructions();
1344   }
1345 
1346   // Ensure the inner loop phi nodes have a separate basic block.
1347   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1348   if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) {
1349     SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1350     LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1351   }
1352 
1353   // Instructions in the original inner loop preheader may depend on values
1354   // defined in the outer loop header. Move them there, because the original
1355   // inner loop preheader will become the entry into the interchanged loop nest.
1356   // Currently we move all instructions and rely on LICM to move invariant
1357   // instructions outside the loop nest.
1358   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1359   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1360   if (InnerLoopPreHeader != OuterLoopHeader) {
1361     SmallPtrSet<Instruction *, 4> NeedsMoving;
1362     for (Instruction &I :
1363          make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
1364                                          std::prev(InnerLoopPreHeader->end()))))
1365       I.moveBefore(OuterLoopHeader->getTerminator());
1366   }
1367 
1368   Transformed |= adjustLoopLinks();
1369   if (!Transformed) {
1370     LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1371     return false;
1372   }
1373 
1374   return true;
1375 }
1376 
1377 /// \brief Move all instructions except the terminator from FromBB right before
1378 /// InsertBefore
1379 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1380   auto &ToList = InsertBefore->getParent()->getInstList();
1381   auto &FromList = FromBB->getInstList();
1382 
1383   ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
1384                 FromBB->getTerminator()->getIterator());
1385 }
1386 
1387 /// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1388 static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1389   // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1390   // from BB1 afterwards.
1391   auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
1392   SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
1393   for (Instruction *I : TempInstrs)
1394     I->removeFromParent();
1395 
1396   // Move instructions from BB2 to BB1.
1397   moveBBContents(BB2, BB1->getTerminator());
1398 
1399   // Move instructions from TempInstrs to BB2.
1400   for (Instruction *I : TempInstrs)
1401     I->insertBefore(BB2->getTerminator());
1402 }
1403 
1404 // Update BI to jump to NewBB instead of OldBB. Records updates to the
1405 // dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1406 // \p OldBB  is exactly once in BI's successor list.
1407 static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1408                             BasicBlock *NewBB,
1409                             std::vector<DominatorTree::UpdateType> &DTUpdates,
1410                             bool MustUpdateOnce = true) {
1411   assert((!MustUpdateOnce ||
1412           llvm::count_if(successors(BI),
1413                          [OldBB](BasicBlock *BB) {
1414                            return BB == OldBB;
1415                          }) == 1) && "BI must jump to OldBB exactly once.");
1416   bool Changed = false;
1417   for (Use &Op : BI->operands())
1418     if (Op == OldBB) {
1419       Op.set(NewBB);
1420       Changed = true;
1421     }
1422 
1423   if (Changed) {
1424     DTUpdates.push_back(
1425         {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1426     DTUpdates.push_back(
1427         {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1428   }
1429   assert(Changed && "Expected a successor to be updated");
1430 }
1431 
1432 // Move Lcssa PHIs to the right place.
1433 static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1434                           BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1435                           BasicBlock *OuterLatch, BasicBlock *OuterExit,
1436                           Loop *InnerLoop, LoopInfo *LI) {
1437 
1438   // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1439   // defined either in the header or latch. Those blocks will become header and
1440   // latch of the new outer loop, and the only possible users can PHI nodes
1441   // in the exit block of the loop nest or the outer loop header (reduction
1442   // PHIs, in that case, the incoming value must be defined in the inner loop
1443   // header). We can just substitute the user with the incoming value and remove
1444   // the PHI.
1445   for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1446     assert(P.getNumIncomingValues() == 1 &&
1447            "Only loops with a single exit are supported!");
1448 
1449     // Incoming values are guaranteed be instructions currently.
1450     auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1451     // In case of multi-level nested loops, follow LCSSA to find the incoming
1452     // value defined from the innermost loop.
1453     auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
1454     // Skip phis with incoming values from the inner loop body, excluding the
1455     // header and latch.
1456     if (IncIInnerMost->getParent() != InnerLatch &&
1457         IncIInnerMost->getParent() != InnerHeader)
1458       continue;
1459 
1460     assert(all_of(P.users(),
1461                   [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1462                     return (cast<PHINode>(U)->getParent() == OuterHeader &&
1463                             IncI->getParent() == InnerHeader) ||
1464                            cast<PHINode>(U)->getParent() == OuterExit;
1465                   }) &&
1466            "Can only replace phis iff the uses are in the loop nest exit or "
1467            "the incoming value is defined in the inner header (it will "
1468            "dominate all loop blocks after interchanging)");
1469     P.replaceAllUsesWith(IncI);
1470     P.eraseFromParent();
1471   }
1472 
1473   SmallVector<PHINode *, 8> LcssaInnerExit;
1474   for (PHINode &P : InnerExit->phis())
1475     LcssaInnerExit.push_back(&P);
1476 
1477   SmallVector<PHINode *, 8> LcssaInnerLatch;
1478   for (PHINode &P : InnerLatch->phis())
1479     LcssaInnerLatch.push_back(&P);
1480 
1481   // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1482   // If a PHI node has users outside of InnerExit, it has a use outside the
1483   // interchanged loop and we have to preserve it. We move these to
1484   // InnerLatch, which will become the new exit block for the innermost
1485   // loop after interchanging.
1486   for (PHINode *P : LcssaInnerExit)
1487     P->moveBefore(InnerLatch->getFirstNonPHI());
1488 
1489   // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1490   // and we have to move them to the new inner latch.
1491   for (PHINode *P : LcssaInnerLatch)
1492     P->moveBefore(InnerExit->getFirstNonPHI());
1493 
1494   // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1495   // incoming values defined in the outer loop, we have to add a new PHI
1496   // in the inner loop latch, which became the exit block of the outer loop,
1497   // after interchanging.
1498   if (OuterExit) {
1499     for (PHINode &P : OuterExit->phis()) {
1500       if (P.getNumIncomingValues() != 1)
1501         continue;
1502       // Skip Phis with incoming values defined in the inner loop. Those should
1503       // already have been updated.
1504       auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1505       if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
1506         continue;
1507 
1508       PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
1509       NewPhi->setIncomingValue(0, P.getIncomingValue(0));
1510       NewPhi->setIncomingBlock(0, OuterLatch);
1511       // We might have incoming edges from other BBs, i.e., the original outer
1512       // header.
1513       for (auto *Pred : predecessors(InnerLatch)) {
1514         if (Pred == OuterLatch)
1515           continue;
1516         NewPhi->addIncoming(P.getIncomingValue(0), Pred);
1517       }
1518       NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
1519       P.setIncomingValue(0, NewPhi);
1520     }
1521   }
1522 
1523   // Now adjust the incoming blocks for the LCSSA PHIs.
1524   // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1525   // with the new latch.
1526   InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1527 }
1528 
1529 bool LoopInterchangeTransform::adjustLoopBranches() {
1530   LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1531   std::vector<DominatorTree::UpdateType> DTUpdates;
1532 
1533   BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1534   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1535 
1536   assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1537          InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1538          InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1539   // Ensure that both preheaders do not contain PHI nodes and have single
1540   // predecessors. This allows us to move them easily. We use
1541   // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1542   // preheaders do not satisfy those conditions.
1543   if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1544       !OuterLoopPreHeader->getUniquePredecessor())
1545     OuterLoopPreHeader =
1546         InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1547   if (InnerLoopPreHeader == OuterLoop->getHeader())
1548     InnerLoopPreHeader =
1549         InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1550 
1551   // Adjust the loop preheader
1552   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1553   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1554   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1555   BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1556   BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1557   BasicBlock *InnerLoopLatchPredecessor =
1558       InnerLoopLatch->getUniquePredecessor();
1559   BasicBlock *InnerLoopLatchSuccessor;
1560   BasicBlock *OuterLoopLatchSuccessor;
1561 
1562   BranchInst *OuterLoopLatchBI =
1563       dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1564   BranchInst *InnerLoopLatchBI =
1565       dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1566   BranchInst *OuterLoopHeaderBI =
1567       dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1568   BranchInst *InnerLoopHeaderBI =
1569       dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1570 
1571   if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1572       !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1573       !InnerLoopHeaderBI)
1574     return false;
1575 
1576   BranchInst *InnerLoopLatchPredecessorBI =
1577       dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1578   BranchInst *OuterLoopPredecessorBI =
1579       dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1580 
1581   if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1582     return false;
1583   BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1584   if (!InnerLoopHeaderSuccessor)
1585     return false;
1586 
1587   // Adjust Loop Preheader and headers.
1588   // The branches in the outer loop predecessor and the outer loop header can
1589   // be unconditional branches or conditional branches with duplicates. Consider
1590   // this when updating the successors.
1591   updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
1592                   InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
1593   // The outer loop header might or might not branch to the outer latch.
1594   // We are guaranteed to branch to the inner loop preheader.
1595   if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
1596     // In this case the outerLoopHeader should branch to the InnerLoopLatch.
1597     updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
1598                     DTUpdates,
1599                     /*MustUpdateOnce=*/false);
1600   }
1601   updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
1602                   InnerLoopHeaderSuccessor, DTUpdates,
1603                   /*MustUpdateOnce=*/false);
1604 
1605   // Adjust reduction PHI's now that the incoming block has changed.
1606   InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
1607                                                OuterLoopHeader);
1608 
1609   updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
1610                   OuterLoopPreHeader, DTUpdates);
1611 
1612   // -------------Adjust loop latches-----------
1613   if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1614     InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1615   else
1616     InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1617 
1618   updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
1619                   InnerLoopLatchSuccessor, DTUpdates);
1620 
1621   if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1622     OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1623   else
1624     OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1625 
1626   updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
1627                   OuterLoopLatchSuccessor, DTUpdates);
1628   updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1629                   DTUpdates);
1630 
1631   DT->applyUpdates(DTUpdates);
1632   restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1633                    OuterLoopPreHeader);
1634 
1635   moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1636                 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
1637                 InnerLoop, LI);
1638   // For PHIs in the exit block of the outer loop, outer's latch has been
1639   // replaced by Inners'.
1640   OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1641 
1642   auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1643   // Now update the reduction PHIs in the inner and outer loop headers.
1644   SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1645   for (PHINode &PHI : InnerLoopHeader->phis())
1646     if (OuterInnerReductions.contains(&PHI))
1647       InnerLoopPHIs.push_back(&PHI);
1648 
1649   for (PHINode &PHI : OuterLoopHeader->phis())
1650     if (OuterInnerReductions.contains(&PHI))
1651       OuterLoopPHIs.push_back(&PHI);
1652 
1653   // Now move the remaining reduction PHIs from outer to inner loop header and
1654   // vice versa. The PHI nodes must be part of a reduction across the inner and
1655   // outer loop and all the remains to do is and updating the incoming blocks.
1656   for (PHINode *PHI : OuterLoopPHIs) {
1657     LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
1658     PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
1659     assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1660   }
1661   for (PHINode *PHI : InnerLoopPHIs) {
1662     LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
1663     PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
1664     assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1665   }
1666 
1667   // Update the incoming blocks for moved PHI nodes.
1668   OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
1669   OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
1670   InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
1671   InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1672 
1673   // Values defined in the outer loop header could be used in the inner loop
1674   // latch. In that case, we need to create LCSSA phis for them, because after
1675   // interchanging they will be defined in the new inner loop and used in the
1676   // new outer loop.
1677   IRBuilder<> Builder(OuterLoopHeader->getContext());
1678   SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
1679   for (Instruction &I :
1680        make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
1681     MayNeedLCSSAPhis.push_back(&I);
1682   formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE, Builder);
1683 
1684   return true;
1685 }
1686 
1687 bool LoopInterchangeTransform::adjustLoopLinks() {
1688   // Adjust all branches in the inner and outer loop.
1689   bool Changed = adjustLoopBranches();
1690   if (Changed) {
1691     // We have interchanged the preheaders so we need to interchange the data in
1692     // the preheaders as well. This is because the content of the inner
1693     // preheader was previously executed inside the outer loop.
1694     BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1695     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1696     swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
1697   }
1698   return Changed;
1699 }
1700 
1701 namespace {
1702 /// Main LoopInterchange Pass.
1703 struct LoopInterchangeLegacyPass : public LoopPass {
1704   static char ID;
1705 
1706   LoopInterchangeLegacyPass() : LoopPass(ID) {
1707     initializeLoopInterchangeLegacyPassPass(*PassRegistry::getPassRegistry());
1708   }
1709 
1710   void getAnalysisUsage(AnalysisUsage &AU) const override {
1711     AU.addRequired<DependenceAnalysisWrapperPass>();
1712     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1713 
1714     getLoopAnalysisUsage(AU);
1715   }
1716 
1717   bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1718     if (skipLoop(L))
1719       return false;
1720 
1721     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1722     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1723     auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
1724     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1725     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1726     std::unique_ptr<CacheCost> CC = nullptr;
1727     return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L);
1728   }
1729 };
1730 } // namespace
1731 
1732 char LoopInterchangeLegacyPass::ID = 0;
1733 
1734 INITIALIZE_PASS_BEGIN(LoopInterchangeLegacyPass, "loop-interchange",
1735                       "Interchanges loops for cache reuse", false, false)
1736 INITIALIZE_PASS_DEPENDENCY(LoopPass)
1737 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
1738 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
1739 
1740 INITIALIZE_PASS_END(LoopInterchangeLegacyPass, "loop-interchange",
1741                     "Interchanges loops for cache reuse", false, false)
1742 
1743 Pass *llvm::createLoopInterchangePass() {
1744   return new LoopInterchangeLegacyPass();
1745 }
1746 
1747 PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
1748                                            LoopAnalysisManager &AM,
1749                                            LoopStandardAnalysisResults &AR,
1750                                            LPMUpdater &U) {
1751   Function &F = *LN.getParent();
1752 
1753   DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1754   std::unique_ptr<CacheCost> CC =
1755       CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
1756   OptimizationRemarkEmitter ORE(&F);
1757   if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
1758     return PreservedAnalyses::all();
1759   U.markLoopNestChanged(true);
1760   return getLoopPassPreservedAnalyses();
1761 }
1762