xref: /llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp (revision 8e702735090388a3231a863e343f880d0f96fecb)
1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements the loop fusion pass.
11 /// The implementation is largely based on the following document:
12 ///
13 ///       Code Transformations to Augment the Scope of Loop Fusion in a
14 ///         Production Compiler
15 ///       Christopher Mark Barton
16 ///       MSc Thesis
17 ///       https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
18 ///
19 /// The general approach taken is to collect sets of control flow equivalent
20 /// loops and test whether they can be fused. The necessary conditions for
21 /// fusion are:
22 ///    1. The loops must be adjacent (there cannot be any statements between
23 ///       the two loops).
24 ///    2. The loops must be conforming (they must execute the same number of
25 ///       iterations).
26 ///    3. The loops must be control flow equivalent (if one loop executes, the
27 ///       other is guaranteed to execute).
28 ///    4. There cannot be any negative distance dependencies between the loops.
29 /// If all of these conditions are satisfied, it is safe to fuse the loops.
30 ///
31 /// This implementation creates FusionCandidates that represent the loop and the
32 /// necessary information needed by fusion. It then operates on the fusion
33 /// candidates, first confirming that the candidate is eligible for fusion. The
34 /// candidates are then collected into control flow equivalent sets, sorted in
35 /// dominance order. Each set of control flow equivalent candidates is then
36 /// traversed, attempting to fuse pairs of candidates in the set. If all
37 /// requirements for fusion are met, the two candidates are fused, creating a
38 /// new (fused) candidate which is then added back into the set to consider for
39 /// additional fusion.
40 ///
41 /// This implementation currently does not make any modifications to remove
42 /// conditions for fusion. Code transformations to make loops conform to each of
43 /// the conditions for fusion are discussed in more detail in the document
44 /// above. These can be added to the current implementation in the future.
45 //===----------------------------------------------------------------------===//
46 
47 #include "llvm/Transforms/Scalar/LoopFuse.h"
48 #include "llvm/ADT/Statistic.h"
49 #include "llvm/Analysis/AssumptionCache.h"
50 #include "llvm/Analysis/DependenceAnalysis.h"
51 #include "llvm/Analysis/DomTreeUpdater.h"
52 #include "llvm/Analysis/LoopInfo.h"
53 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
54 #include "llvm/Analysis/PostDominators.h"
55 #include "llvm/Analysis/ScalarEvolution.h"
56 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
57 #include "llvm/Analysis/TargetTransformInfo.h"
58 #include "llvm/IR/Function.h"
59 #include "llvm/IR/Verifier.h"
60 #include "llvm/Support/CommandLine.h"
61 #include "llvm/Support/Debug.h"
62 #include "llvm/Support/raw_ostream.h"
63 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
64 #include "llvm/Transforms/Utils/CodeMoverUtils.h"
65 #include "llvm/Transforms/Utils/LoopPeel.h"
66 #include "llvm/Transforms/Utils/LoopSimplify.h"
67 
68 using namespace llvm;
69 
70 #define DEBUG_TYPE "loop-fusion"
71 
72 STATISTIC(FuseCounter, "Loops fused");
73 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
74 STATISTIC(InvalidPreheader, "Loop has invalid preheader");
75 STATISTIC(InvalidHeader, "Loop has invalid header");
76 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
77 STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
78 STATISTIC(InvalidLatch, "Loop has invalid latch");
79 STATISTIC(InvalidLoop, "Loop is invalid");
80 STATISTIC(AddressTakenBB, "Basic block has address taken");
81 STATISTIC(MayThrowException, "Loop may throw an exception");
82 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
83 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
84 STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
85 STATISTIC(UnknownTripCount, "Loop has unknown trip count");
86 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
87 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
88 STATISTIC(NonAdjacent, "Loops are not adjacent");
89 STATISTIC(
90     NonEmptyPreheader,
91     "Loop has a non-empty preheader with instructions that cannot be moved");
92 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
93 STATISTIC(NonIdenticalGuards, "Candidates have different guards");
94 STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
95                              "instructions that cannot be moved");
96 STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
97                               "instructions that cannot be moved");
98 STATISTIC(NotRotated, "Candidate is not rotated");
99 STATISTIC(OnlySecondCandidateIsGuarded,
100           "The second candidate is guarded while the first one is not");
101 STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions.");
102 STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions.");
103 
104 enum FusionDependenceAnalysisChoice {
105   FUSION_DEPENDENCE_ANALYSIS_SCEV,
106   FUSION_DEPENDENCE_ANALYSIS_DA,
107   FUSION_DEPENDENCE_ANALYSIS_ALL,
108 };
109 
110 static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
111     "loop-fusion-dependence-analysis",
112     cl::desc("Which dependence analysis should loop fusion use?"),
113     cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",
114                           "Use the scalar evolution interface"),
115                clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",
116                           "Use the dependence analysis interface"),
117                clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",
118                           "Use all available analyses")),
119     cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL));
120 
121 static cl::opt<unsigned> FusionPeelMaxCount(
122     "loop-fusion-peel-max-count", cl::init(0), cl::Hidden,
123     cl::desc("Max number of iterations to be peeled from a loop, such that "
124              "fusion can take place"));
125 
126 #ifndef NDEBUG
127 static cl::opt<bool>
128     VerboseFusionDebugging("loop-fusion-verbose-debug",
129                            cl::desc("Enable verbose debugging for Loop Fusion"),
130                            cl::Hidden, cl::init(false));
131 #endif
132 
133 namespace {
134 /// This class is used to represent a candidate for loop fusion. When it is
135 /// constructed, it checks the conditions for loop fusion to ensure that it
136 /// represents a valid candidate. It caches several parts of a loop that are
137 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
138 /// of continually querying the underlying Loop to retrieve these values. It is
139 /// assumed these will not change throughout loop fusion.
140 ///
141 /// The invalidate method should be used to indicate that the FusionCandidate is
142 /// no longer a valid candidate for fusion. Similarly, the isValid() method can
143 /// be used to ensure that the FusionCandidate is still valid for fusion.
144 struct FusionCandidate {
145   /// Cache of parts of the loop used throughout loop fusion. These should not
146   /// need to change throughout the analysis and transformation.
147   /// These parts are cached to avoid repeatedly looking up in the Loop class.
148 
149   /// Preheader of the loop this candidate represents
150   BasicBlock *Preheader;
151   /// Header of the loop this candidate represents
152   BasicBlock *Header;
153   /// Blocks in the loop that exit the loop
154   BasicBlock *ExitingBlock;
155   /// The successor block of this loop (where the exiting blocks go to)
156   BasicBlock *ExitBlock;
157   /// Latch of the loop
158   BasicBlock *Latch;
159   /// The loop that this fusion candidate represents
160   Loop *L;
161   /// Vector of instructions in this loop that read from memory
162   SmallVector<Instruction *, 16> MemReads;
163   /// Vector of instructions in this loop that write to memory
164   SmallVector<Instruction *, 16> MemWrites;
165   /// Are all of the members of this fusion candidate still valid
166   bool Valid;
167   /// Guard branch of the loop, if it exists
168   BranchInst *GuardBranch;
169   /// Peeling Paramaters of the Loop.
170   TTI::PeelingPreferences PP;
171   /// Can you Peel this Loop?
172   bool AbleToPeel;
173   /// Has this loop been Peeled
174   bool Peeled;
175 
176   /// Dominator and PostDominator trees are needed for the
177   /// FusionCandidateCompare function, required by FusionCandidateSet to
178   /// determine where the FusionCandidate should be inserted into the set. These
179   /// are used to establish ordering of the FusionCandidates based on dominance.
180   DominatorTree &DT;
181   const PostDominatorTree *PDT;
182 
183   OptimizationRemarkEmitter &ORE;
184 
185   FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT,
186                   OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP)
187       : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
188         ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
189         Latch(L->getLoopLatch()), L(L), Valid(true),
190         GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
191         Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
192 
193     // Walk over all blocks in the loop and check for conditions that may
194     // prevent fusion. For each block, walk over all instructions and collect
195     // the memory reads and writes If any instructions that prevent fusion are
196     // found, invalidate this object and return.
197     for (BasicBlock *BB : L->blocks()) {
198       if (BB->hasAddressTaken()) {
199         invalidate();
200         reportInvalidCandidate(AddressTakenBB);
201         return;
202       }
203 
204       for (Instruction &I : *BB) {
205         if (I.mayThrow()) {
206           invalidate();
207           reportInvalidCandidate(MayThrowException);
208           return;
209         }
210         if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
211           if (SI->isVolatile()) {
212             invalidate();
213             reportInvalidCandidate(ContainsVolatileAccess);
214             return;
215           }
216         }
217         if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
218           if (LI->isVolatile()) {
219             invalidate();
220             reportInvalidCandidate(ContainsVolatileAccess);
221             return;
222           }
223         }
224         if (I.mayWriteToMemory())
225           MemWrites.push_back(&I);
226         if (I.mayReadFromMemory())
227           MemReads.push_back(&I);
228       }
229     }
230   }
231 
232   /// Check if all members of the class are valid.
233   bool isValid() const {
234     return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
235            !L->isInvalid() && Valid;
236   }
237 
238   /// Verify that all members are in sync with the Loop object.
239   void verify() const {
240     assert(isValid() && "Candidate is not valid!!");
241     assert(!L->isInvalid() && "Loop is invalid!");
242     assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
243     assert(Header == L->getHeader() && "Header is out of sync");
244     assert(ExitingBlock == L->getExitingBlock() &&
245            "Exiting Blocks is out of sync");
246     assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
247     assert(Latch == L->getLoopLatch() && "Latch is out of sync");
248   }
249 
250   /// Get the entry block for this fusion candidate.
251   ///
252   /// If this fusion candidate represents a guarded loop, the entry block is the
253   /// loop guard block. If it represents an unguarded loop, the entry block is
254   /// the preheader of the loop.
255   BasicBlock *getEntryBlock() const {
256     if (GuardBranch)
257       return GuardBranch->getParent();
258     else
259       return Preheader;
260   }
261 
262   /// After Peeling the loop is modified quite a bit, hence all of the Blocks
263   /// need to be updated accordingly.
264   void updateAfterPeeling() {
265     Preheader = L->getLoopPreheader();
266     Header = L->getHeader();
267     ExitingBlock = L->getExitingBlock();
268     ExitBlock = L->getExitBlock();
269     Latch = L->getLoopLatch();
270     verify();
271   }
272 
273   /// Given a guarded loop, get the successor of the guard that is not in the
274   /// loop.
275   ///
276   /// This method returns the successor of the loop guard that is not located
277   /// within the loop (i.e., the successor of the guard that is not the
278   /// preheader).
279   /// This method is only valid for guarded loops.
280   BasicBlock *getNonLoopBlock() const {
281     assert(GuardBranch && "Only valid on guarded loops.");
282     assert(GuardBranch->isConditional() &&
283            "Expecting guard to be a conditional branch.");
284     if (Peeled)
285       return GuardBranch->getSuccessor(1);
286     return (GuardBranch->getSuccessor(0) == Preheader)
287                ? GuardBranch->getSuccessor(1)
288                : GuardBranch->getSuccessor(0);
289   }
290 
291 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
292   LLVM_DUMP_METHOD void dump() const {
293     dbgs() << "\tGuardBranch: ";
294     if (GuardBranch)
295       dbgs() << *GuardBranch;
296     else
297       dbgs() << "nullptr";
298     dbgs() << "\n"
299            << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
300            << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
301            << "\n"
302            << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
303            << "\tExitingBB: "
304            << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
305            << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
306            << "\n"
307            << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
308            << "\tEntryBlock: "
309            << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
310            << "\n";
311   }
312 #endif
313 
314   /// Determine if a fusion candidate (representing a loop) is eligible for
315   /// fusion. Note that this only checks whether a single loop can be fused - it
316   /// does not check whether it is *legal* to fuse two loops together.
317   bool isEligibleForFusion(ScalarEvolution &SE) const {
318     if (!isValid()) {
319       LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
320       if (!Preheader)
321         ++InvalidPreheader;
322       if (!Header)
323         ++InvalidHeader;
324       if (!ExitingBlock)
325         ++InvalidExitingBlock;
326       if (!ExitBlock)
327         ++InvalidExitBlock;
328       if (!Latch)
329         ++InvalidLatch;
330       if (L->isInvalid())
331         ++InvalidLoop;
332 
333       return false;
334     }
335 
336     // Require ScalarEvolution to be able to determine a trip count.
337     if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
338       LLVM_DEBUG(dbgs() << "Loop " << L->getName()
339                         << " trip count not computable!\n");
340       return reportInvalidCandidate(UnknownTripCount);
341     }
342 
343     if (!L->isLoopSimplifyForm()) {
344       LLVM_DEBUG(dbgs() << "Loop " << L->getName()
345                         << " is not in simplified form!\n");
346       return reportInvalidCandidate(NotSimplifiedForm);
347     }
348 
349     if (!L->isRotatedForm()) {
350       LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
351       return reportInvalidCandidate(NotRotated);
352     }
353 
354     return true;
355   }
356 
357 private:
358   // This is only used internally for now, to clear the MemWrites and MemReads
359   // list and setting Valid to false. I can't envision other uses of this right
360   // now, since once FusionCandidates are put into the FusionCandidateSet they
361   // are immutable. Thus, any time we need to change/update a FusionCandidate,
362   // we must create a new one and insert it into the FusionCandidateSet to
363   // ensure the FusionCandidateSet remains ordered correctly.
364   void invalidate() {
365     MemWrites.clear();
366     MemReads.clear();
367     Valid = false;
368   }
369 
370   bool reportInvalidCandidate(llvm::Statistic &Stat) const {
371     using namespace ore;
372     assert(L && Preheader && "Fusion candidate not initialized properly!");
373 #if LLVM_ENABLE_STATS
374     ++Stat;
375     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
376                                         L->getStartLoc(), Preheader)
377              << "[" << Preheader->getParent()->getName() << "]: "
378              << "Loop is not a candidate for fusion: " << Stat.getDesc());
379 #endif
380     return false;
381   }
382 };
383 
384 struct FusionCandidateCompare {
385   /// Comparison functor to sort two Control Flow Equivalent fusion candidates
386   /// into dominance order.
387   /// If LHS dominates RHS and RHS post-dominates LHS, return true;
388   /// If RHS dominates LHS and LHS post-dominates RHS, return false;
389   /// If both LHS and RHS are not dominating each other then, non-strictly
390   /// post dominate check will decide the order of candidates. If RHS
391   /// non-strictly post dominates LHS then, return true. If LHS non-strictly
392   /// post dominates RHS then, return false. If both are non-strictly post
393   /// dominate each other then, level in the post dominator tree will decide
394   /// the order of candidates.
395   bool operator()(const FusionCandidate &LHS,
396                   const FusionCandidate &RHS) const {
397     const DominatorTree *DT = &(LHS.DT);
398 
399     BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
400     BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
401 
402     // Do not save PDT to local variable as it is only used in asserts and thus
403     // will trigger an unused variable warning if building without asserts.
404     assert(DT && LHS.PDT && "Expecting valid dominator tree");
405 
406     // Do this compare first so if LHS == RHS, function returns false.
407     if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
408       // RHS dominates LHS
409       // Verify LHS post-dominates RHS
410       assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
411       return false;
412     }
413 
414     if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
415       // Verify RHS Postdominates LHS
416       assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
417       return true;
418     }
419 
420     // If two FusionCandidates are in the same level of dominator tree,
421     // they will not dominate each other, but may still be control flow
422     // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate()
423     // function is needed.
424     bool WrongOrder =
425         nonStrictlyPostDominate(LHSEntryBlock, RHSEntryBlock, DT, LHS.PDT);
426     bool RightOrder =
427         nonStrictlyPostDominate(RHSEntryBlock, LHSEntryBlock, DT, LHS.PDT);
428     if (WrongOrder && RightOrder) {
429       // If common predecessor of LHS and RHS post dominates both
430       // FusionCandidates then, Order of FusionCandidate can be
431       // identified by its level in post dominator tree.
432       DomTreeNode *LNode = LHS.PDT->getNode(LHSEntryBlock);
433       DomTreeNode *RNode = LHS.PDT->getNode(RHSEntryBlock);
434       return LNode->getLevel() > RNode->getLevel();
435     } else if (WrongOrder)
436       return false;
437     else if (RightOrder)
438       return true;
439 
440     // If LHS does not non-strict Postdominate RHS and RHS does not non-strict
441     // Postdominate LHS then, there is no dominance relationship between the
442     // two FusionCandidates. Thus, they should not be in the same set together.
443     llvm_unreachable(
444         "No dominance relationship between these fusion candidates!");
445   }
446 };
447 
448 using LoopVector = SmallVector<Loop *, 4>;
449 
450 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
451 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
452 // dominates FC1 and FC1 post-dominates FC0.
453 // std::set was chosen because we want a sorted data structure with stable
454 // iterators. A subsequent patch to loop fusion will enable fusing non-adjacent
455 // loops by moving intervening code around. When this intervening code contains
456 // loops, those loops will be moved also. The corresponding FusionCandidates
457 // will also need to be moved accordingly. As this is done, having stable
458 // iterators will simplify the logic. Similarly, having an efficient insert that
459 // keeps the FusionCandidateSet sorted will also simplify the implementation.
460 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
461 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
462 
463 #if !defined(NDEBUG)
464 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
465                                      const FusionCandidate &FC) {
466   if (FC.isValid())
467     OS << FC.Preheader->getName();
468   else
469     OS << "<Invalid>";
470 
471   return OS;
472 }
473 
474 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
475                                      const FusionCandidateSet &CandSet) {
476   for (const FusionCandidate &FC : CandSet)
477     OS << FC << '\n';
478 
479   return OS;
480 }
481 
482 static void
483 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
484   dbgs() << "Fusion Candidates: \n";
485   for (const auto &CandidateSet : FusionCandidates) {
486     dbgs() << "*** Fusion Candidate Set ***\n";
487     dbgs() << CandidateSet;
488     dbgs() << "****************************\n";
489   }
490 }
491 #endif
492 
493 /// Collect all loops in function at the same nest level, starting at the
494 /// outermost level.
495 ///
496 /// This data structure collects all loops at the same nest level for a
497 /// given function (specified by the LoopInfo object). It starts at the
498 /// outermost level.
499 struct LoopDepthTree {
500   using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
501   using iterator = LoopsOnLevelTy::iterator;
502   using const_iterator = LoopsOnLevelTy::const_iterator;
503 
504   LoopDepthTree(LoopInfo &LI) : Depth(1) {
505     if (!LI.empty())
506       LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
507   }
508 
509   /// Test whether a given loop has been removed from the function, and thus is
510   /// no longer valid.
511   bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
512 
513   /// Record that a given loop has been removed from the function and is no
514   /// longer valid.
515   void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
516 
517   /// Descend the tree to the next (inner) nesting level
518   void descend() {
519     LoopsOnLevelTy LoopsOnNextLevel;
520 
521     for (const LoopVector &LV : *this)
522       for (Loop *L : LV)
523         if (!isRemovedLoop(L) && L->begin() != L->end())
524           LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
525 
526     LoopsOnLevel = LoopsOnNextLevel;
527     RemovedLoops.clear();
528     Depth++;
529   }
530 
531   bool empty() const { return size() == 0; }
532   size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
533   unsigned getDepth() const { return Depth; }
534 
535   iterator begin() { return LoopsOnLevel.begin(); }
536   iterator end() { return LoopsOnLevel.end(); }
537   const_iterator begin() const { return LoopsOnLevel.begin(); }
538   const_iterator end() const { return LoopsOnLevel.end(); }
539 
540 private:
541   /// Set of loops that have been removed from the function and are no longer
542   /// valid.
543   SmallPtrSet<const Loop *, 8> RemovedLoops;
544 
545   /// Depth of the current level, starting at 1 (outermost loops).
546   unsigned Depth;
547 
548   /// Vector of loops at the current depth level that have the same parent loop
549   LoopsOnLevelTy LoopsOnLevel;
550 };
551 
552 #ifndef NDEBUG
553 static void printLoopVector(const LoopVector &LV) {
554   dbgs() << "****************************\n";
555   for (auto *L : LV)
556     printLoop(*L, dbgs());
557   dbgs() << "****************************\n";
558 }
559 #endif
560 
561 struct LoopFuser {
562 private:
563   // Sets of control flow equivalent fusion candidates for a given nest level.
564   FusionCandidateCollection FusionCandidates;
565 
566   LoopDepthTree LDT;
567   DomTreeUpdater DTU;
568 
569   LoopInfo &LI;
570   DominatorTree &DT;
571   DependenceInfo &DI;
572   ScalarEvolution &SE;
573   PostDominatorTree &PDT;
574   OptimizationRemarkEmitter &ORE;
575   AssumptionCache &AC;
576   const TargetTransformInfo &TTI;
577 
578 public:
579   LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
580             ScalarEvolution &SE, PostDominatorTree &PDT,
581             OptimizationRemarkEmitter &ORE, const DataLayout &DL,
582             AssumptionCache &AC, const TargetTransformInfo &TTI)
583       : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
584         DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
585 
586   /// This is the main entry point for loop fusion. It will traverse the
587   /// specified function and collect candidate loops to fuse, starting at the
588   /// outermost nesting level and working inwards.
589   bool fuseLoops(Function &F) {
590 #ifndef NDEBUG
591     if (VerboseFusionDebugging) {
592       LI.print(dbgs());
593     }
594 #endif
595 
596     LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
597                       << "\n");
598     bool Changed = false;
599 
600     while (!LDT.empty()) {
601       LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
602                         << LDT.getDepth() << "\n";);
603 
604       for (const LoopVector &LV : LDT) {
605         assert(LV.size() > 0 && "Empty loop set was build!");
606 
607         // Skip singleton loop sets as they do not offer fusion opportunities on
608         // this level.
609         if (LV.size() == 1)
610           continue;
611 #ifndef NDEBUG
612         if (VerboseFusionDebugging) {
613           LLVM_DEBUG({
614             dbgs() << "  Visit loop set (#" << LV.size() << "):\n";
615             printLoopVector(LV);
616           });
617         }
618 #endif
619 
620         collectFusionCandidates(LV);
621         Changed |= fuseCandidates();
622       }
623 
624       // Finished analyzing candidates at this level.
625       // Descend to the next level and clear all of the candidates currently
626       // collected. Note that it will not be possible to fuse any of the
627       // existing candidates with new candidates because the new candidates will
628       // be at a different nest level and thus not be control flow equivalent
629       // with all of the candidates collected so far.
630       LLVM_DEBUG(dbgs() << "Descend one level!\n");
631       LDT.descend();
632       FusionCandidates.clear();
633     }
634 
635     if (Changed)
636       LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
637 
638 #ifndef NDEBUG
639     assert(DT.verify());
640     assert(PDT.verify());
641     LI.verify(DT);
642     SE.verify();
643 #endif
644 
645     LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
646     return Changed;
647   }
648 
649 private:
650   /// Determine if two fusion candidates are control flow equivalent.
651   ///
652   /// Two fusion candidates are control flow equivalent if when one executes,
653   /// the other is guaranteed to execute. This is determined using dominators
654   /// and post-dominators: if A dominates B and B post-dominates A then A and B
655   /// are control-flow equivalent.
656   bool isControlFlowEquivalent(const FusionCandidate &FC0,
657                                const FusionCandidate &FC1) const {
658     assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
659 
660     return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
661                                      DT, PDT);
662   }
663 
664   /// Iterate over all loops in the given loop set and identify the loops that
665   /// are eligible for fusion. Place all eligible fusion candidates into Control
666   /// Flow Equivalent sets, sorted by dominance.
667   void collectFusionCandidates(const LoopVector &LV) {
668     for (Loop *L : LV) {
669       TTI::PeelingPreferences PP =
670           gatherPeelingPreferences(L, SE, TTI, std::nullopt, std::nullopt);
671       FusionCandidate CurrCand(L, DT, &PDT, ORE, PP);
672       if (!CurrCand.isEligibleForFusion(SE))
673         continue;
674 
675       // Go through each list in FusionCandidates and determine if L is control
676       // flow equivalent with the first loop in that list. If it is, append LV.
677       // If not, go to the next list.
678       // If no suitable list is found, start another list and add it to
679       // FusionCandidates.
680       bool FoundSet = false;
681 
682       for (auto &CurrCandSet : FusionCandidates) {
683         if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
684           CurrCandSet.insert(CurrCand);
685           FoundSet = true;
686 #ifndef NDEBUG
687           if (VerboseFusionDebugging)
688             LLVM_DEBUG(dbgs() << "Adding " << CurrCand
689                               << " to existing candidate set\n");
690 #endif
691           break;
692         }
693       }
694       if (!FoundSet) {
695         // No set was found. Create a new set and add to FusionCandidates
696 #ifndef NDEBUG
697         if (VerboseFusionDebugging)
698           LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
699 #endif
700         FusionCandidateSet NewCandSet;
701         NewCandSet.insert(CurrCand);
702         FusionCandidates.push_back(NewCandSet);
703       }
704       NumFusionCandidates++;
705     }
706   }
707 
708   /// Determine if it is beneficial to fuse two loops.
709   ///
710   /// For now, this method simply returns true because we want to fuse as much
711   /// as possible (primarily to test the pass). This method will evolve, over
712   /// time, to add heuristics for profitability of fusion.
713   bool isBeneficialFusion(const FusionCandidate &FC0,
714                           const FusionCandidate &FC1) {
715     return true;
716   }
717 
718   /// Determine if two fusion candidates have the same trip count (i.e., they
719   /// execute the same number of iterations).
720   ///
721   /// This function will return a pair of values. The first is a boolean,
722   /// stating whether or not the two candidates are known at compile time to
723   /// have the same TripCount. The second is the difference in the two
724   /// TripCounts. This information can be used later to determine whether or not
725   /// peeling can be performed on either one of the candidates.
726   std::pair<bool, std::optional<unsigned>>
727   haveIdenticalTripCounts(const FusionCandidate &FC0,
728                           const FusionCandidate &FC1) const {
729     const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
730     if (isa<SCEVCouldNotCompute>(TripCount0)) {
731       UncomputableTripCount++;
732       LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
733       return {false, std::nullopt};
734     }
735 
736     const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
737     if (isa<SCEVCouldNotCompute>(TripCount1)) {
738       UncomputableTripCount++;
739       LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
740       return {false, std::nullopt};
741     }
742 
743     LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
744                       << *TripCount1 << " are "
745                       << (TripCount0 == TripCount1 ? "identical" : "different")
746                       << "\n");
747 
748     if (TripCount0 == TripCount1)
749       return {true, 0};
750 
751     LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
752                          "determining the difference between trip counts\n");
753 
754     // Currently only considering loops with a single exit point
755     // and a non-constant trip count.
756     const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L);
757     const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L);
758 
759     // If any of the tripcounts are zero that means that loop(s) do not have
760     // a single exit or a constant tripcount.
761     if (TC0 == 0 || TC1 == 0) {
762       LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
763                            "have a constant number of iterations. Peeling "
764                            "is not benefical\n");
765       return {false, std::nullopt};
766     }
767 
768     std::optional<unsigned> Difference;
769     int Diff = TC0 - TC1;
770 
771     if (Diff > 0)
772       Difference = Diff;
773     else {
774       LLVM_DEBUG(
775           dbgs() << "Difference is less than 0. FC1 (second loop) has more "
776                     "iterations than the first one. Currently not supported\n");
777     }
778 
779     LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference
780                       << "\n");
781 
782     return {false, Difference};
783   }
784 
785   void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
786                            unsigned PeelCount) {
787     assert(FC0.AbleToPeel && "Should be able to peel loop");
788 
789     LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
790                       << " iterations of the first loop. \n");
791 
792     ValueToValueMapTy VMap;
793     FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true, VMap);
794     if (FC0.Peeled) {
795       LLVM_DEBUG(dbgs() << "Done Peeling\n");
796 
797 #ifndef NDEBUG
798       auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1);
799 
800       assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 &&
801              "Loops should have identical trip counts after peeling");
802 #endif
803 
804       FC0.PP.PeelCount += PeelCount;
805 
806       // Peeling does not update the PDT
807       PDT.recalculate(*FC0.Preheader->getParent());
808 
809       FC0.updateAfterPeeling();
810 
811       // In this case the iterations of the loop are constant, so the first
812       // loop will execute completely (will not jump from one of
813       // the peeled blocks to the second loop). Here we are updating the
814       // branch conditions of each of the peeled blocks, such that it will
815       // branch to its successor which is not the preheader of the second loop
816       // in the case of unguarded loops, or the succesors of the exit block of
817       // the first loop otherwise. Doing this update will ensure that the entry
818       // block of the first loop dominates the entry block of the second loop.
819       BasicBlock *BB =
820           FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;
821       if (BB) {
822         SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
823         SmallVector<Instruction *, 8> WorkList;
824         for (BasicBlock *Pred : predecessors(BB)) {
825           if (Pred != FC0.ExitBlock) {
826             WorkList.emplace_back(Pred->getTerminator());
827             TreeUpdates.emplace_back(
828                 DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB));
829           }
830         }
831         // Cannot modify the predecessors inside the above loop as it will cause
832         // the iterators to be nullptrs, causing memory errors.
833         for (Instruction *CurrentBranch : WorkList) {
834           BasicBlock *Succ = CurrentBranch->getSuccessor(0);
835           if (Succ == BB)
836             Succ = CurrentBranch->getSuccessor(1);
837           ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ));
838         }
839 
840         DTU.applyUpdates(TreeUpdates);
841         DTU.flush();
842       }
843       LLVM_DEBUG(
844           dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount
845                  << " iterations from the first loop.\n"
846                     "Both Loops have the same number of iterations now.\n");
847     }
848   }
849 
850   /// Walk each set of control flow equivalent fusion candidates and attempt to
851   /// fuse them. This does a single linear traversal of all candidates in the
852   /// set. The conditions for legal fusion are checked at this point. If a pair
853   /// of fusion candidates passes all legality checks, they are fused together
854   /// and a new fusion candidate is created and added to the FusionCandidateSet.
855   /// The original fusion candidates are then removed, as they are no longer
856   /// valid.
857   bool fuseCandidates() {
858     bool Fused = false;
859     LLVM_DEBUG(printFusionCandidates(FusionCandidates));
860     for (auto &CandidateSet : FusionCandidates) {
861       if (CandidateSet.size() < 2)
862         continue;
863 
864       LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
865                         << CandidateSet << "\n");
866 
867       for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
868         assert(!LDT.isRemovedLoop(FC0->L) &&
869                "Should not have removed loops in CandidateSet!");
870         auto FC1 = FC0;
871         for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
872           assert(!LDT.isRemovedLoop(FC1->L) &&
873                  "Should not have removed loops in CandidateSet!");
874 
875           LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
876                      dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
877 
878           FC0->verify();
879           FC1->verify();
880 
881           // Check if the candidates have identical tripcounts (first value of
882           // pair), and if not check the difference in the tripcounts between
883           // the loops (second value of pair). The difference is not equal to
884           // std::nullopt iff the loops iterate a constant number of times, and
885           // have a single exit.
886           std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes =
887               haveIdenticalTripCounts(*FC0, *FC1);
888           bool SameTripCount = IdenticalTripCountRes.first;
889           std::optional<unsigned> TCDifference = IdenticalTripCountRes.second;
890 
891           // Here we are checking that FC0 (the first loop) can be peeled, and
892           // both loops have different tripcounts.
893           if (FC0->AbleToPeel && !SameTripCount && TCDifference) {
894             if (*TCDifference > FusionPeelMaxCount) {
895               LLVM_DEBUG(dbgs()
896                          << "Difference in loop trip counts: " << *TCDifference
897                          << " is greater than maximum peel count specificed: "
898                          << FusionPeelMaxCount << "\n");
899             } else {
900               // Dependent on peeling being performed on the first loop, and
901               // assuming all other conditions for fusion return true.
902               SameTripCount = true;
903             }
904           }
905 
906           if (!SameTripCount) {
907             LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
908                                  "counts. Not fusing.\n");
909             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
910                                                        NonEqualTripCount);
911             continue;
912           }
913 
914           if (!isAdjacent(*FC0, *FC1)) {
915             LLVM_DEBUG(dbgs()
916                        << "Fusion candidates are not adjacent. Not fusing.\n");
917             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
918             continue;
919           }
920 
921           if ((!FC0->GuardBranch && FC1->GuardBranch) ||
922               (FC0->GuardBranch && !FC1->GuardBranch)) {
923             LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the "
924                                  "another one is not. Not fusing.\n");
925             reportLoopFusion<OptimizationRemarkMissed>(
926                 *FC0, *FC1, OnlySecondCandidateIsGuarded);
927             continue;
928           }
929 
930           // Ensure that FC0 and FC1 have identical guards.
931           // If one (or both) are not guarded, this check is not necessary.
932           if (FC0->GuardBranch && FC1->GuardBranch &&
933               !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {
934             LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
935                                  "guards. Not Fusing.\n");
936             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
937                                                        NonIdenticalGuards);
938             continue;
939           }
940 
941           if (FC0->GuardBranch) {
942             assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
943 
944             if (!isSafeToMoveBefore(*FC0->ExitBlock,
945                                     *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
946                                     &PDT, &DI)) {
947               LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
948                                    "instructions in exit block. Not fusing.\n");
949               reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
950                                                          NonEmptyExitBlock);
951               continue;
952             }
953 
954             if (!isSafeToMoveBefore(
955                     *FC1->GuardBranch->getParent(),
956                     *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
957                     &DI)) {
958               LLVM_DEBUG(dbgs()
959                          << "Fusion candidate contains unsafe "
960                             "instructions in guard block. Not fusing.\n");
961               reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
962                                                          NonEmptyGuardBlock);
963               continue;
964             }
965           }
966 
967           // Check the dependencies across the loops and do not fuse if it would
968           // violate them.
969           if (!dependencesAllowFusion(*FC0, *FC1)) {
970             LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
971             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
972                                                        InvalidDependencies);
973             continue;
974           }
975 
976           // If the second loop has instructions in the pre-header, attempt to
977           // hoist them up to the first loop's pre-header or sink them into the
978           // body of the second loop.
979           SmallVector<Instruction *, 4> SafeToHoist;
980           SmallVector<Instruction *, 4> SafeToSink;
981           // At this point, this is the last remaining legality check.
982           // Which means if we can make this pre-header empty, we can fuse
983           // these loops
984           if (!isEmptyPreheader(*FC1)) {
985             LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
986                                  "preheader.\n");
987 
988             // If it is not safe to hoist/sink all instructions in the
989             // pre-header, we cannot fuse these loops.
990             if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
991                                               SafeToSink)) {
992               LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
993                                    "Fusion Candidate Pre-header.\n"
994                                 << "Not Fusing.\n");
995               reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
996                                                          NonEmptyPreheader);
997               continue;
998             }
999           }
1000 
1001           bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
1002           LLVM_DEBUG(dbgs()
1003                      << "\tFusion appears to be "
1004                      << (BeneficialToFuse ? "" : "un") << "profitable!\n");
1005           if (!BeneficialToFuse) {
1006             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
1007                                                        FusionNotBeneficial);
1008             continue;
1009           }
1010           // All analysis has completed and has determined that fusion is legal
1011           // and profitable. At this point, start transforming the code and
1012           // perform fusion.
1013 
1014           // Execute the hoist/sink operations on preheader instructions
1015           movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink);
1016 
1017           LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
1018                             << *FC1 << "\n");
1019 
1020           FusionCandidate FC0Copy = *FC0;
1021           // Peel the loop after determining that fusion is legal. The Loops
1022           // will still be safe to fuse after the peeling is performed.
1023           bool Peel = TCDifference && *TCDifference > 0;
1024           if (Peel)
1025             peelFusionCandidate(FC0Copy, *FC1, *TCDifference);
1026 
1027           // Report fusion to the Optimization Remarks.
1028           // Note this needs to be done *before* performFusion because
1029           // performFusion will change the original loops, making it not
1030           // possible to identify them after fusion is complete.
1031           reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,
1032                                                FuseCounter);
1033 
1034           FusionCandidate FusedCand(
1035               performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE,
1036               FC0Copy.PP);
1037           FusedCand.verify();
1038           assert(FusedCand.isEligibleForFusion(SE) &&
1039                  "Fused candidate should be eligible for fusion!");
1040 
1041           // Notify the loop-depth-tree that these loops are not valid objects
1042           LDT.removeLoop(FC1->L);
1043 
1044           CandidateSet.erase(FC0);
1045           CandidateSet.erase(FC1);
1046 
1047           auto InsertPos = CandidateSet.insert(FusedCand);
1048 
1049           assert(InsertPos.second &&
1050                  "Unable to insert TargetCandidate in CandidateSet!");
1051 
1052           // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations
1053           // of the FC1 loop will attempt to fuse the new (fused) loop with the
1054           // remaining candidates in the current candidate set.
1055           FC0 = FC1 = InsertPos.first;
1056 
1057           LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
1058                             << "\n");
1059 
1060           Fused = true;
1061         }
1062       }
1063     }
1064     return Fused;
1065   }
1066 
1067   // Returns true if the instruction \p I can be hoisted to the end of the
1068   // preheader of \p FC0. \p SafeToHoist contains the instructions that are
1069   // known to be safe to hoist. The instructions encountered that cannot be
1070   // hoisted are in \p NotHoisting.
1071   // TODO: Move functionality into CodeMoverUtils
1072   bool canHoistInst(Instruction &I,
1073                     const SmallVector<Instruction *, 4> &SafeToHoist,
1074                     const SmallVector<Instruction *, 4> &NotHoisting,
1075                     const FusionCandidate &FC0) const {
1076     const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor();
1077     assert(FC0PreheaderTarget &&
1078            "Expected single successor for loop preheader.");
1079 
1080     for (Use &Op : I.operands()) {
1081       if (auto *OpInst = dyn_cast<Instruction>(Op)) {
1082         bool OpHoisted = is_contained(SafeToHoist, OpInst);
1083         // Check if we have already decided to hoist this operand. In this
1084         // case, it does not dominate FC0 *yet*, but will after we hoist it.
1085         if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) {
1086           return false;
1087         }
1088       }
1089     }
1090 
1091     // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs
1092     // cannot be hoisted and should be sunk to the exit of the fused loop.
1093     if (isa<PHINode>(I))
1094       return false;
1095 
1096     // If this isn't a memory inst, hoisting is safe
1097     if (!I.mayReadOrWriteMemory())
1098       return true;
1099 
1100     LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n");
1101     for (Instruction *NotHoistedInst : NotHoisting) {
1102       if (auto D = DI.depends(&I, NotHoistedInst, true)) {
1103         // Dependency is not read-before-write, write-before-read or
1104         // write-before-write
1105         if (D->isFlow() || D->isAnti() || D->isOutput()) {
1106           LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's "
1107                                "preheader that is not being hoisted.\n");
1108           return false;
1109         }
1110       }
1111     }
1112 
1113     for (Instruction *ReadInst : FC0.MemReads) {
1114       if (auto D = DI.depends(ReadInst, &I, true)) {
1115         // Dependency is not read-before-write
1116         if (D->isAnti()) {
1117           LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n");
1118           return false;
1119         }
1120       }
1121     }
1122 
1123     for (Instruction *WriteInst : FC0.MemWrites) {
1124       if (auto D = DI.depends(WriteInst, &I, true)) {
1125         // Dependency is not write-before-read or write-before-write
1126         if (D->isFlow() || D->isOutput()) {
1127           LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n");
1128           return false;
1129         }
1130       }
1131     }
1132     return true;
1133   }
1134 
1135   // Returns true if the instruction \p I can be sunk to the top of the exit
1136   // block of \p FC1.
1137   // TODO: Move functionality into CodeMoverUtils
1138   bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const {
1139     for (User *U : I.users()) {
1140       if (auto *UI{dyn_cast<Instruction>(U)}) {
1141         // Cannot sink if user in loop
1142         // If FC1 has phi users of this value, we cannot sink it into FC1.
1143         if (FC1.L->contains(UI)) {
1144           // Cannot hoist or sink this instruction. No hoisting/sinking
1145           // should take place, loops should not fuse
1146           return false;
1147         }
1148       }
1149     }
1150 
1151     // If this isn't a memory inst, sinking is safe
1152     if (!I.mayReadOrWriteMemory())
1153       return true;
1154 
1155     for (Instruction *ReadInst : FC1.MemReads) {
1156       if (auto D = DI.depends(&I, ReadInst, true)) {
1157         // Dependency is not write-before-read
1158         if (D->isFlow()) {
1159           LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n");
1160           return false;
1161         }
1162       }
1163     }
1164 
1165     for (Instruction *WriteInst : FC1.MemWrites) {
1166       if (auto D = DI.depends(&I, WriteInst, true)) {
1167         // Dependency is not write-before-write or read-before-write
1168         if (D->isOutput() || D->isAnti()) {
1169           LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n");
1170           return false;
1171         }
1172       }
1173     }
1174 
1175     return true;
1176   }
1177 
1178   /// Collect instructions in the \p FC1 Preheader that can be hoisted
1179   /// to the \p FC0 Preheader or sunk into the \p FC1 Body
1180   bool collectMovablePreheaderInsts(
1181       const FusionCandidate &FC0, const FusionCandidate &FC1,
1182       SmallVector<Instruction *, 4> &SafeToHoist,
1183       SmallVector<Instruction *, 4> &SafeToSink) const {
1184     BasicBlock *FC1Preheader = FC1.Preheader;
1185     // Save the instructions that are not being hoisted, so we know not to hoist
1186     // mem insts that they dominate.
1187     SmallVector<Instruction *, 4> NotHoisting;
1188 
1189     for (Instruction &I : *FC1Preheader) {
1190       // Can't move a branch
1191       if (&I == FC1Preheader->getTerminator())
1192         continue;
1193       // If the instruction has side-effects, give up.
1194       // TODO: The case of mayReadFromMemory we can handle but requires
1195       // additional work with a dependence analysis so for now we give
1196       // up on memory reads.
1197       if (I.mayThrow() || !I.willReturn()) {
1198         LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n");
1199         return false;
1200       }
1201 
1202       LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n");
1203 
1204       if (I.isAtomic() || I.isVolatile()) {
1205         LLVM_DEBUG(
1206             dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n");
1207         return false;
1208       }
1209 
1210       if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) {
1211         SafeToHoist.push_back(&I);
1212         LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n");
1213       } else {
1214         LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n");
1215         NotHoisting.push_back(&I);
1216 
1217         if (canSinkInst(I, FC1)) {
1218           SafeToSink.push_back(&I);
1219           LLVM_DEBUG(dbgs() << "\tSafe to sink.\n");
1220         } else {
1221           LLVM_DEBUG(dbgs() << "\tCould not sink.\n");
1222           return false;
1223         }
1224       }
1225     }
1226     LLVM_DEBUG(
1227         dbgs() << "All preheader instructions could be sunk or hoisted!\n");
1228     return true;
1229   }
1230 
1231   /// Rewrite all additive recurrences in a SCEV to use a new loop.
1232   class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
1233   public:
1234     AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
1235                        bool UseMax = true)
1236         : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
1237           NewL(NewL) {}
1238 
1239     const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
1240       const Loop *ExprL = Expr->getLoop();
1241       SmallVector<const SCEV *, 2> Operands;
1242       if (ExprL == &OldL) {
1243         append_range(Operands, Expr->operands());
1244         return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
1245       }
1246 
1247       if (OldL.contains(ExprL)) {
1248         bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
1249         if (!UseMax || !Pos || !Expr->isAffine()) {
1250           Valid = false;
1251           return Expr;
1252         }
1253         return visit(Expr->getStart());
1254       }
1255 
1256       for (const SCEV *Op : Expr->operands())
1257         Operands.push_back(visit(Op));
1258       return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
1259     }
1260 
1261     bool wasValidSCEV() const { return Valid; }
1262 
1263   private:
1264     bool Valid, UseMax;
1265     const Loop &OldL, &NewL;
1266   };
1267 
1268   /// Return false if the access functions of \p I0 and \p I1 could cause
1269   /// a negative dependence.
1270   bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
1271                             Instruction &I1, bool EqualIsInvalid) {
1272     Value *Ptr0 = getLoadStorePointerOperand(&I0);
1273     Value *Ptr1 = getLoadStorePointerOperand(&I1);
1274     if (!Ptr0 || !Ptr1)
1275       return false;
1276 
1277     const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
1278     const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
1279 #ifndef NDEBUG
1280     if (VerboseFusionDebugging)
1281       LLVM_DEBUG(dbgs() << "    Access function check: " << *SCEVPtr0 << " vs "
1282                         << *SCEVPtr1 << "\n");
1283 #endif
1284     AddRecLoopReplacer Rewriter(SE, L0, L1);
1285     SCEVPtr0 = Rewriter.visit(SCEVPtr0);
1286 #ifndef NDEBUG
1287     if (VerboseFusionDebugging)
1288       LLVM_DEBUG(dbgs() << "    Access function after rewrite: " << *SCEVPtr0
1289                         << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
1290 #endif
1291     if (!Rewriter.wasValidSCEV())
1292       return false;
1293 
1294     // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by
1295     //       L0) and the other is not. We could check if it is monotone and test
1296     //       the beginning and end value instead.
1297 
1298     BasicBlock *L0Header = L0.getHeader();
1299     auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
1300       const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
1301       if (!AddRec)
1302         return false;
1303       return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
1304              !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
1305     };
1306     if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
1307       return false;
1308 
1309     ICmpInst::Predicate Pred =
1310         EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
1311     bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
1312 #ifndef NDEBUG
1313     if (VerboseFusionDebugging)
1314       LLVM_DEBUG(dbgs() << "    Relation: " << *SCEVPtr0
1315                         << (IsAlwaysGE ? "  >=  " : "  may <  ") << *SCEVPtr1
1316                         << "\n");
1317 #endif
1318     return IsAlwaysGE;
1319   }
1320 
1321   /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
1322   /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
1323   /// specified by @p DepChoice are used to determine this.
1324   bool dependencesAllowFusion(const FusionCandidate &FC0,
1325                               const FusionCandidate &FC1, Instruction &I0,
1326                               Instruction &I1, bool AnyDep,
1327                               FusionDependenceAnalysisChoice DepChoice) {
1328 #ifndef NDEBUG
1329     if (VerboseFusionDebugging) {
1330       LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
1331                         << DepChoice << "\n");
1332     }
1333 #endif
1334     switch (DepChoice) {
1335     case FUSION_DEPENDENCE_ANALYSIS_SCEV:
1336       return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
1337     case FUSION_DEPENDENCE_ANALYSIS_DA: {
1338       auto DepResult = DI.depends(&I0, &I1, true);
1339       if (!DepResult)
1340         return true;
1341 #ifndef NDEBUG
1342       if (VerboseFusionDebugging) {
1343         LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
1344                    dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
1345                           << (DepResult->isOrdered() ? "true" : "false")
1346                           << "]\n");
1347         LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
1348                           << "\n");
1349       }
1350 #endif
1351 
1352       if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
1353         LLVM_DEBUG(
1354             dbgs() << "TODO: Implement pred/succ dependence handling!\n");
1355 
1356       // TODO: Can we actually use the dependence info analysis here?
1357       return false;
1358     }
1359 
1360     case FUSION_DEPENDENCE_ANALYSIS_ALL:
1361       return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
1362                                     FUSION_DEPENDENCE_ANALYSIS_SCEV) ||
1363              dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
1364                                     FUSION_DEPENDENCE_ANALYSIS_DA);
1365     }
1366 
1367     llvm_unreachable("Unknown fusion dependence analysis choice!");
1368   }
1369 
1370   /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
1371   bool dependencesAllowFusion(const FusionCandidate &FC0,
1372                               const FusionCandidate &FC1) {
1373     LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
1374                       << "\n");
1375     assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
1376     assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
1377 
1378     for (Instruction *WriteL0 : FC0.MemWrites) {
1379       for (Instruction *WriteL1 : FC1.MemWrites)
1380         if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
1381                                     /* AnyDep */ false,
1382                                     FusionDependenceAnalysis)) {
1383           InvalidDependencies++;
1384           return false;
1385         }
1386       for (Instruction *ReadL1 : FC1.MemReads)
1387         if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
1388                                     /* AnyDep */ false,
1389                                     FusionDependenceAnalysis)) {
1390           InvalidDependencies++;
1391           return false;
1392         }
1393     }
1394 
1395     for (Instruction *WriteL1 : FC1.MemWrites) {
1396       for (Instruction *WriteL0 : FC0.MemWrites)
1397         if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
1398                                     /* AnyDep */ false,
1399                                     FusionDependenceAnalysis)) {
1400           InvalidDependencies++;
1401           return false;
1402         }
1403       for (Instruction *ReadL0 : FC0.MemReads)
1404         if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
1405                                     /* AnyDep */ false,
1406                                     FusionDependenceAnalysis)) {
1407           InvalidDependencies++;
1408           return false;
1409         }
1410     }
1411 
1412     // Walk through all uses in FC1. For each use, find the reaching def. If the
1413     // def is located in FC0 then it is not safe to fuse.
1414     for (BasicBlock *BB : FC1.L->blocks())
1415       for (Instruction &I : *BB)
1416         for (auto &Op : I.operands())
1417           if (Instruction *Def = dyn_cast<Instruction>(Op))
1418             if (FC0.L->contains(Def->getParent())) {
1419               InvalidDependencies++;
1420               return false;
1421             }
1422 
1423     return true;
1424   }
1425 
1426   /// Determine if two fusion candidates are adjacent in the CFG.
1427   ///
1428   /// This method will determine if there are additional basic blocks in the CFG
1429   /// between the exit of \p FC0 and the entry of \p FC1.
1430   /// If the two candidates are guarded loops, then it checks whether the
1431   /// non-loop successor of the \p FC0 guard branch is the entry block of \p
1432   /// FC1. If not, then the loops are not adjacent. If the two candidates are
1433   /// not guarded loops, then it checks whether the exit block of \p FC0 is the
1434   /// preheader of \p FC1.
1435   bool isAdjacent(const FusionCandidate &FC0,
1436                   const FusionCandidate &FC1) const {
1437     // If the successor of the guard branch is FC1, then the loops are adjacent
1438     if (FC0.GuardBranch)
1439       return FC0.getNonLoopBlock() == FC1.getEntryBlock();
1440     else
1441       return FC0.ExitBlock == FC1.getEntryBlock();
1442   }
1443 
1444   bool isEmptyPreheader(const FusionCandidate &FC) const {
1445     return FC.Preheader->size() == 1;
1446   }
1447 
1448   /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader
1449   /// and sink others into the body of \p FC1.
1450   void movePreheaderInsts(const FusionCandidate &FC0,
1451                           const FusionCandidate &FC1,
1452                           SmallVector<Instruction *, 4> &HoistInsts,
1453                           SmallVector<Instruction *, 4> &SinkInsts) const {
1454     // All preheader instructions except the branch must be hoisted or sunk
1455     assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 &&
1456            "Attempting to sink and hoist preheader instructions, but not all "
1457            "the preheader instructions are accounted for.");
1458 
1459     NumHoistedInsts += HoistInsts.size();
1460     NumSunkInsts += SinkInsts.size();
1461 
1462     LLVM_DEBUG(if (VerboseFusionDebugging) {
1463       if (!HoistInsts.empty())
1464         dbgs() << "Hoisting: \n";
1465       for (Instruction *I : HoistInsts)
1466         dbgs() << *I << "\n";
1467       if (!SinkInsts.empty())
1468         dbgs() << "Sinking: \n";
1469       for (Instruction *I : SinkInsts)
1470         dbgs() << *I << "\n";
1471     });
1472 
1473     for (Instruction *I : HoistInsts) {
1474       assert(I->getParent() == FC1.Preheader);
1475       I->moveBefore(*FC0.Preheader,
1476                     FC0.Preheader->getTerminator()->getIterator());
1477     }
1478     // insert instructions in reverse order to maintain dominance relationship
1479     for (Instruction *I : reverse(SinkInsts)) {
1480       assert(I->getParent() == FC1.Preheader);
1481       I->moveBefore(*FC1.ExitBlock, FC1.ExitBlock->getFirstInsertionPt());
1482     }
1483   }
1484 
1485   /// Determine if two fusion candidates have identical guards
1486   ///
1487   /// This method will determine if two fusion candidates have the same guards.
1488   /// The guards are considered the same if:
1489   ///   1. The instructions to compute the condition used in the compare are
1490   ///      identical.
1491   ///   2. The successors of the guard have the same flow into/around the loop.
1492   /// If the compare instructions are identical, then the first successor of the
1493   /// guard must go to the same place (either the preheader of the loop or the
1494   /// NonLoopBlock). In other words, the first successor of both loops must
1495   /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
1496   /// the NonLoopBlock). The same must be true for the second successor.
1497   bool haveIdenticalGuards(const FusionCandidate &FC0,
1498                            const FusionCandidate &FC1) const {
1499     assert(FC0.GuardBranch && FC1.GuardBranch &&
1500            "Expecting FC0 and FC1 to be guarded loops.");
1501 
1502     if (auto FC0CmpInst =
1503             dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
1504       if (auto FC1CmpInst =
1505               dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
1506         if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
1507           return false;
1508 
1509     // The compare instructions are identical.
1510     // Now make sure the successor of the guards have the same flow into/around
1511     // the loop
1512     if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
1513       return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
1514     else
1515       return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
1516   }
1517 
1518   /// Modify the latch branch of FC to be unconditional since successors of the
1519   /// branch are the same.
1520   void simplifyLatchBranch(const FusionCandidate &FC) const {
1521     BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
1522     if (FCLatchBranch) {
1523       assert(FCLatchBranch->isConditional() &&
1524              FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
1525              "Expecting the two successors of FCLatchBranch to be the same");
1526       BranchInst *NewBranch =
1527           BranchInst::Create(FCLatchBranch->getSuccessor(0));
1528       ReplaceInstWithInst(FCLatchBranch, NewBranch);
1529     }
1530   }
1531 
1532   /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique
1533   /// successor, then merge FC0.Latch with its unique successor.
1534   void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
1535     moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
1536     if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
1537       MergeBlockIntoPredecessor(Succ, &DTU, &LI);
1538       DTU.flush();
1539     }
1540   }
1541 
1542   /// Fuse two fusion candidates, creating a new fused loop.
1543   ///
1544   /// This method contains the mechanics of fusing two loops, represented by \p
1545   /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
1546   /// postdominates \p FC0 (making them control flow equivalent). It also
1547   /// assumes that the other conditions for fusion have been met: adjacent,
1548   /// identical trip counts, and no negative distance dependencies exist that
1549   /// would prevent fusion. Thus, there is no checking for these conditions in
1550   /// this method.
1551   ///
1552   /// Fusion is performed by rewiring the CFG to update successor blocks of the
1553   /// components of tho loop. Specifically, the following changes are done:
1554   ///
1555   ///   1. The preheader of \p FC1 is removed as it is no longer necessary
1556   ///   (because it is currently only a single statement block).
1557   ///   2. The latch of \p FC0 is modified to jump to the header of \p FC1.
1558   ///   3. The latch of \p FC1 i modified to jump to the header of \p FC0.
1559   ///   4. All blocks from \p FC1 are removed from FC1 and added to FC0.
1560   ///
1561   /// All of these modifications are done with dominator tree updates, thus
1562   /// keeping the dominator (and post dominator) information up-to-date.
1563   ///
1564   /// This can be improved in the future by actually merging blocks during
1565   /// fusion. For example, the preheader of \p FC1 can be merged with the
1566   /// preheader of \p FC0. This would allow loops with more than a single
1567   /// statement in the preheader to be fused. Similarly, the latch blocks of the
1568   /// two loops could also be fused into a single block. This will require
1569   /// analysis to prove it is safe to move the contents of the block past
1570   /// existing code, which currently has not been implemented.
1571   Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
1572     assert(FC0.isValid() && FC1.isValid() &&
1573            "Expecting valid fusion candidates");
1574 
1575     LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
1576                dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
1577 
1578     // Move instructions from the preheader of FC1 to the end of the preheader
1579     // of FC0.
1580     moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
1581 
1582     // Fusing guarded loops is handled slightly differently than non-guarded
1583     // loops and has been broken out into a separate method instead of trying to
1584     // intersperse the logic within a single method.
1585     if (FC0.GuardBranch)
1586       return fuseGuardedLoops(FC0, FC1);
1587 
1588     assert(FC1.Preheader ==
1589            (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
1590     assert(FC1.Preheader->size() == 1 &&
1591            FC1.Preheader->getSingleSuccessor() == FC1.Header);
1592 
1593     // Remember the phi nodes originally in the header of FC0 in order to rewire
1594     // them later. However, this is only necessary if the new loop carried
1595     // values might not dominate the exiting branch. While we do not generally
1596     // test if this is the case but simply insert intermediate phi nodes, we
1597     // need to make sure these intermediate phi nodes have different
1598     // predecessors. To this end, we filter the special case where the exiting
1599     // block is the latch block of the first loop. Nothing needs to be done
1600     // anyway as all loop carried values dominate the latch and thereby also the
1601     // exiting branch.
1602     SmallVector<PHINode *, 8> OriginalFC0PHIs;
1603     if (FC0.ExitingBlock != FC0.Latch)
1604       for (PHINode &PHI : FC0.Header->phis())
1605         OriginalFC0PHIs.push_back(&PHI);
1606 
1607     // Replace incoming blocks for header PHIs first.
1608     FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
1609     FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
1610 
1611     // Then modify the control flow and update DT and PDT.
1612     SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
1613 
1614     // The old exiting block of the first loop (FC0) has to jump to the header
1615     // of the second as we need to execute the code in the second header block
1616     // regardless of the trip count. That is, if the trip count is 0, so the
1617     // back edge is never taken, we still have to execute both loop headers,
1618     // especially (but not only!) if the second is a do-while style loop.
1619     // However, doing so might invalidate the phi nodes of the first loop as
1620     // the new values do only need to dominate their latch and not the exiting
1621     // predicate. To remedy this potential problem we always introduce phi
1622     // nodes in the header of the second loop later that select the loop carried
1623     // value, if the second header was reached through an old latch of the
1624     // first, or undef otherwise. This is sound as exiting the first implies the
1625     // second will exit too, __without__ taking the back-edge. [Their
1626     // trip-counts are equal after all.
1627     // KB: Would this sequence be simpler to just make FC0.ExitingBlock go
1628     // to FC1.Header? I think this is basically what the three sequences are
1629     // trying to accomplish; however, doing this directly in the CFG may mean
1630     // the DT/PDT becomes invalid
1631     if (!FC0.Peeled) {
1632       FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
1633                                                            FC1.Header);
1634       TreeUpdates.emplace_back(DominatorTree::UpdateType(
1635           DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
1636       TreeUpdates.emplace_back(DominatorTree::UpdateType(
1637           DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1638     } else {
1639       TreeUpdates.emplace_back(DominatorTree::UpdateType(
1640           DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
1641 
1642       // Remove the ExitBlock of the first Loop (also not needed)
1643       FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
1644                                                            FC1.Header);
1645       TreeUpdates.emplace_back(DominatorTree::UpdateType(
1646           DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
1647       FC0.ExitBlock->getTerminator()->eraseFromParent();
1648       TreeUpdates.emplace_back(DominatorTree::UpdateType(
1649           DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1650       new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
1651     }
1652 
1653     // The pre-header of L1 is not necessary anymore.
1654     assert(pred_empty(FC1.Preheader));
1655     FC1.Preheader->getTerminator()->eraseFromParent();
1656     new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
1657     TreeUpdates.emplace_back(DominatorTree::UpdateType(
1658         DominatorTree::Delete, FC1.Preheader, FC1.Header));
1659 
1660     // Moves the phi nodes from the second to the first loops header block.
1661     while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
1662       if (SE.isSCEVable(PHI->getType()))
1663         SE.forgetValue(PHI);
1664       if (PHI->hasNUsesOrMore(1))
1665         PHI->moveBefore(FC0.Header->getFirstInsertionPt());
1666       else
1667         PHI->eraseFromParent();
1668     }
1669 
1670     // Introduce new phi nodes in the second loop header to ensure
1671     // exiting the first and jumping to the header of the second does not break
1672     // the SSA property of the phis originally in the first loop. See also the
1673     // comment above.
1674     BasicBlock::iterator L1HeaderIP = FC1.Header->begin();
1675     for (PHINode *LCPHI : OriginalFC0PHIs) {
1676       int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
1677       assert(L1LatchBBIdx >= 0 &&
1678              "Expected loop carried value to be rewired at this point!");
1679 
1680       Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
1681 
1682       PHINode *L1HeaderPHI =
1683           PHINode::Create(LCV->getType(), 2, LCPHI->getName() + ".afterFC0");
1684       L1HeaderPHI->insertBefore(L1HeaderIP);
1685       L1HeaderPHI->addIncoming(LCV, FC0.Latch);
1686       L1HeaderPHI->addIncoming(PoisonValue::get(LCV->getType()),
1687                                FC0.ExitingBlock);
1688 
1689       LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
1690     }
1691 
1692     // Replace latch terminator destinations.
1693     FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
1694     FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
1695 
1696     // Modify the latch branch of FC0 to be unconditional as both successors of
1697     // the branch are the same.
1698     simplifyLatchBranch(FC0);
1699 
1700     // If FC0.Latch and FC0.ExitingBlock are the same then we have already
1701     // performed the updates above.
1702     if (FC0.Latch != FC0.ExitingBlock)
1703       TreeUpdates.emplace_back(DominatorTree::UpdateType(
1704           DominatorTree::Insert, FC0.Latch, FC1.Header));
1705 
1706     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
1707                                                        FC0.Latch, FC0.Header));
1708     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
1709                                                        FC1.Latch, FC0.Header));
1710     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
1711                                                        FC1.Latch, FC1.Header));
1712 
1713     // Update DT/PDT
1714     DTU.applyUpdates(TreeUpdates);
1715 
1716     LI.removeBlock(FC1.Preheader);
1717     DTU.deleteBB(FC1.Preheader);
1718     if (FC0.Peeled) {
1719       LI.removeBlock(FC0.ExitBlock);
1720       DTU.deleteBB(FC0.ExitBlock);
1721     }
1722 
1723     DTU.flush();
1724 
1725     // Is there a way to keep SE up-to-date so we don't need to forget the loops
1726     // and rebuild the information in subsequent passes of fusion?
1727     // Note: Need to forget the loops before merging the loop latches, as
1728     // mergeLatch may remove the only block in FC1.
1729     SE.forgetLoop(FC1.L);
1730     SE.forgetLoop(FC0.L);
1731     // Forget block dispositions as well, so that there are no dangling
1732     // pointers to erased/free'ed blocks.
1733     SE.forgetBlockAndLoopDispositions();
1734 
1735     // Move instructions from FC0.Latch to FC1.Latch.
1736     // Note: mergeLatch requires an updated DT.
1737     mergeLatch(FC0, FC1);
1738 
1739     // Merge the loops.
1740     SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
1741     for (BasicBlock *BB : Blocks) {
1742       FC0.L->addBlockEntry(BB);
1743       FC1.L->removeBlockFromLoop(BB);
1744       if (LI.getLoopFor(BB) != FC1.L)
1745         continue;
1746       LI.changeLoopFor(BB, FC0.L);
1747     }
1748     while (!FC1.L->isInnermost()) {
1749       const auto &ChildLoopIt = FC1.L->begin();
1750       Loop *ChildLoop = *ChildLoopIt;
1751       FC1.L->removeChildLoop(ChildLoopIt);
1752       FC0.L->addChildLoop(ChildLoop);
1753     }
1754 
1755     // Delete the now empty loop L1.
1756     LI.erase(FC1.L);
1757 
1758 #ifndef NDEBUG
1759     assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
1760     assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1761     assert(PDT.verify());
1762     LI.verify(DT);
1763     SE.verify();
1764 #endif
1765 
1766     LLVM_DEBUG(dbgs() << "Fusion done:\n");
1767 
1768     return FC0.L;
1769   }
1770 
1771   /// Report details on loop fusion opportunities.
1772   ///
1773   /// This template function can be used to report both successful and missed
1774   /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
1775   /// be one of:
1776   ///   - OptimizationRemarkMissed to report when loop fusion is unsuccessful
1777   ///     given two valid fusion candidates.
1778   ///   - OptimizationRemark to report successful fusion of two fusion
1779   ///     candidates.
1780   /// The remarks will be printed using the form:
1781   ///    <path/filename>:<line number>:<column number>: [<function name>]:
1782   ///       <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
1783   template <typename RemarkKind>
1784   void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
1785                         llvm::Statistic &Stat) {
1786     assert(FC0.Preheader && FC1.Preheader &&
1787            "Expecting valid fusion candidates");
1788     using namespace ore;
1789 #if LLVM_ENABLE_STATS
1790     ++Stat;
1791     ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
1792                         FC0.Preheader)
1793              << "[" << FC0.Preheader->getParent()->getName()
1794              << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
1795              << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
1796              << ": " << Stat.getDesc());
1797 #endif
1798   }
1799 
1800   /// Fuse two guarded fusion candidates, creating a new fused loop.
1801   ///
1802   /// Fusing guarded loops is handled much the same way as fusing non-guarded
1803   /// loops. The rewiring of the CFG is slightly different though, because of
1804   /// the presence of the guards around the loops and the exit blocks after the
1805   /// loop body. As such, the new loop is rewired as follows:
1806   ///    1. Keep the guard branch from FC0 and use the non-loop block target
1807   /// from the FC1 guard branch.
1808   ///    2. Remove the exit block from FC0 (this exit block should be empty
1809   /// right now).
1810   ///    3. Remove the guard branch for FC1
1811   ///    4. Remove the preheader for FC1.
1812   /// The exit block successor for the latch of FC0 is updated to be the header
1813   /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
1814   /// be the header of FC0, thus creating the fused loop.
1815   Loop *fuseGuardedLoops(const FusionCandidate &FC0,
1816                          const FusionCandidate &FC1) {
1817     assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
1818 
1819     BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
1820     BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
1821     BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
1822     BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
1823     BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
1824 
1825     // Move instructions from the exit block of FC0 to the beginning of the exit
1826     // block of FC1, in the case that the FC0 loop has not been peeled. In the
1827     // case that FC0 loop is peeled, then move the instructions of the successor
1828     // of the FC0 Exit block to the beginning of the exit block of FC1.
1829     moveInstructionsToTheBeginning(
1830         (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
1831         DT, PDT, DI);
1832 
1833     // Move instructions from the guard block of FC1 to the end of the guard
1834     // block of FC0.
1835     moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
1836 
1837     assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
1838 
1839     SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
1840 
1841     ////////////////////////////////////////////////////////////////////////////
1842     // Update the Loop Guard
1843     ////////////////////////////////////////////////////////////////////////////
1844     // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
1845     // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
1846     // Thus, one path from the guard goes to the preheader for FC0 (and thus
1847     // executes the new fused loop) and the other path goes to the NonLoopBlock
1848     // for FC1 (where FC1 guard would have gone if FC1 was not executed).
1849     FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
1850     FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
1851 
1852     BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
1853     BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);
1854 
1855     // The guard of FC1 is not necessary anymore.
1856     FC1.GuardBranch->eraseFromParent();
1857     new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
1858 
1859     TreeUpdates.emplace_back(DominatorTree::UpdateType(
1860         DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
1861     TreeUpdates.emplace_back(DominatorTree::UpdateType(
1862         DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
1863     TreeUpdates.emplace_back(DominatorTree::UpdateType(
1864         DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
1865     TreeUpdates.emplace_back(DominatorTree::UpdateType(
1866         DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
1867 
1868     if (FC0.Peeled) {
1869       // Remove the Block after the ExitBlock of FC0
1870       TreeUpdates.emplace_back(DominatorTree::UpdateType(
1871           DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
1872       FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();
1873       new UnreachableInst(FC0ExitBlockSuccessor->getContext(),
1874                           FC0ExitBlockSuccessor);
1875     }
1876 
1877     assert(pred_empty(FC1GuardBlock) &&
1878            "Expecting guard block to have no predecessors");
1879     assert(succ_empty(FC1GuardBlock) &&
1880            "Expecting guard block to have no successors");
1881 
1882     // Remember the phi nodes originally in the header of FC0 in order to rewire
1883     // them later. However, this is only necessary if the new loop carried
1884     // values might not dominate the exiting branch. While we do not generally
1885     // test if this is the case but simply insert intermediate phi nodes, we
1886     // need to make sure these intermediate phi nodes have different
1887     // predecessors. To this end, we filter the special case where the exiting
1888     // block is the latch block of the first loop. Nothing needs to be done
1889     // anyway as all loop carried values dominate the latch and thereby also the
1890     // exiting branch.
1891     // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
1892     // (because the loops are rotated. Thus, nothing will ever be added to
1893     // OriginalFC0PHIs.
1894     SmallVector<PHINode *, 8> OriginalFC0PHIs;
1895     if (FC0.ExitingBlock != FC0.Latch)
1896       for (PHINode &PHI : FC0.Header->phis())
1897         OriginalFC0PHIs.push_back(&PHI);
1898 
1899     assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
1900 
1901     // Replace incoming blocks for header PHIs first.
1902     FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
1903     FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
1904 
1905     // The old exiting block of the first loop (FC0) has to jump to the header
1906     // of the second as we need to execute the code in the second header block
1907     // regardless of the trip count. That is, if the trip count is 0, so the
1908     // back edge is never taken, we still have to execute both loop headers,
1909     // especially (but not only!) if the second is a do-while style loop.
1910     // However, doing so might invalidate the phi nodes of the first loop as
1911     // the new values do only need to dominate their latch and not the exiting
1912     // predicate. To remedy this potential problem we always introduce phi
1913     // nodes in the header of the second loop later that select the loop carried
1914     // value, if the second header was reached through an old latch of the
1915     // first, or undef otherwise. This is sound as exiting the first implies the
1916     // second will exit too, __without__ taking the back-edge (their
1917     // trip-counts are equal after all).
1918     FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
1919                                                          FC1.Header);
1920 
1921     TreeUpdates.emplace_back(DominatorTree::UpdateType(
1922         DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
1923     TreeUpdates.emplace_back(DominatorTree::UpdateType(
1924         DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1925 
1926     // Remove FC0 Exit Block
1927     // The exit block for FC0 is no longer needed since control will flow
1928     // directly to the header of FC1. Since it is an empty block, it can be
1929     // removed at this point.
1930     // TODO: In the future, we can handle non-empty exit blocks my merging any
1931     // instructions from FC0 exit block into FC1 exit block prior to removing
1932     // the block.
1933     assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");
1934     FC0.ExitBlock->getTerminator()->eraseFromParent();
1935     new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
1936 
1937     // Remove FC1 Preheader
1938     // The pre-header of L1 is not necessary anymore.
1939     assert(pred_empty(FC1.Preheader));
1940     FC1.Preheader->getTerminator()->eraseFromParent();
1941     new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
1942     TreeUpdates.emplace_back(DominatorTree::UpdateType(
1943         DominatorTree::Delete, FC1.Preheader, FC1.Header));
1944 
1945     // Moves the phi nodes from the second to the first loops header block.
1946     while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
1947       if (SE.isSCEVable(PHI->getType()))
1948         SE.forgetValue(PHI);
1949       if (PHI->hasNUsesOrMore(1))
1950         PHI->moveBefore(FC0.Header->getFirstInsertionPt());
1951       else
1952         PHI->eraseFromParent();
1953     }
1954 
1955     // Introduce new phi nodes in the second loop header to ensure
1956     // exiting the first and jumping to the header of the second does not break
1957     // the SSA property of the phis originally in the first loop. See also the
1958     // comment above.
1959     BasicBlock::iterator L1HeaderIP = FC1.Header->begin();
1960     for (PHINode *LCPHI : OriginalFC0PHIs) {
1961       int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
1962       assert(L1LatchBBIdx >= 0 &&
1963              "Expected loop carried value to be rewired at this point!");
1964 
1965       Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
1966 
1967       PHINode *L1HeaderPHI =
1968           PHINode::Create(LCV->getType(), 2, LCPHI->getName() + ".afterFC0");
1969       L1HeaderPHI->insertBefore(L1HeaderIP);
1970       L1HeaderPHI->addIncoming(LCV, FC0.Latch);
1971       L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
1972                                FC0.ExitingBlock);
1973 
1974       LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
1975     }
1976 
1977     // Update the latches
1978 
1979     // Replace latch terminator destinations.
1980     FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
1981     FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
1982 
1983     // Modify the latch branch of FC0 to be unconditional as both successors of
1984     // the branch are the same.
1985     simplifyLatchBranch(FC0);
1986 
1987     // If FC0.Latch and FC0.ExitingBlock are the same then we have already
1988     // performed the updates above.
1989     if (FC0.Latch != FC0.ExitingBlock)
1990       TreeUpdates.emplace_back(DominatorTree::UpdateType(
1991           DominatorTree::Insert, FC0.Latch, FC1.Header));
1992 
1993     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
1994                                                        FC0.Latch, FC0.Header));
1995     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
1996                                                        FC1.Latch, FC0.Header));
1997     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
1998                                                        FC1.Latch, FC1.Header));
1999 
2000     // All done
2001     // Apply the updates to the Dominator Tree and cleanup.
2002 
2003     assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");
2004     assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");
2005 
2006     // Update DT/PDT
2007     DTU.applyUpdates(TreeUpdates);
2008 
2009     LI.removeBlock(FC1GuardBlock);
2010     LI.removeBlock(FC1.Preheader);
2011     LI.removeBlock(FC0.ExitBlock);
2012     if (FC0.Peeled) {
2013       LI.removeBlock(FC0ExitBlockSuccessor);
2014       DTU.deleteBB(FC0ExitBlockSuccessor);
2015     }
2016     DTU.deleteBB(FC1GuardBlock);
2017     DTU.deleteBB(FC1.Preheader);
2018     DTU.deleteBB(FC0.ExitBlock);
2019     DTU.flush();
2020 
2021     // Is there a way to keep SE up-to-date so we don't need to forget the loops
2022     // and rebuild the information in subsequent passes of fusion?
2023     // Note: Need to forget the loops before merging the loop latches, as
2024     // mergeLatch may remove the only block in FC1.
2025     SE.forgetLoop(FC1.L);
2026     SE.forgetLoop(FC0.L);
2027     // Forget block dispositions as well, so that there are no dangling
2028     // pointers to erased/free'ed blocks.
2029     SE.forgetBlockAndLoopDispositions();
2030 
2031     // Move instructions from FC0.Latch to FC1.Latch.
2032     // Note: mergeLatch requires an updated DT.
2033     mergeLatch(FC0, FC1);
2034 
2035     // Merge the loops.
2036     SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
2037     for (BasicBlock *BB : Blocks) {
2038       FC0.L->addBlockEntry(BB);
2039       FC1.L->removeBlockFromLoop(BB);
2040       if (LI.getLoopFor(BB) != FC1.L)
2041         continue;
2042       LI.changeLoopFor(BB, FC0.L);
2043     }
2044     while (!FC1.L->isInnermost()) {
2045       const auto &ChildLoopIt = FC1.L->begin();
2046       Loop *ChildLoop = *ChildLoopIt;
2047       FC1.L->removeChildLoop(ChildLoopIt);
2048       FC0.L->addChildLoop(ChildLoop);
2049     }
2050 
2051     // Delete the now empty loop L1.
2052     LI.erase(FC1.L);
2053 
2054 #ifndef NDEBUG
2055     assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
2056     assert(DT.verify(DominatorTree::VerificationLevel::Fast));
2057     assert(PDT.verify());
2058     LI.verify(DT);
2059     SE.verify();
2060 #endif
2061 
2062     LLVM_DEBUG(dbgs() << "Fusion done:\n");
2063 
2064     return FC0.L;
2065   }
2066 };
2067 } // namespace
2068 
2069 PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
2070   auto &LI = AM.getResult<LoopAnalysis>(F);
2071   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
2072   auto &DI = AM.getResult<DependenceAnalysis>(F);
2073   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
2074   auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
2075   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
2076   auto &AC = AM.getResult<AssumptionAnalysis>(F);
2077   const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
2078   const DataLayout &DL = F.getDataLayout();
2079 
2080   // Ensure loops are in simplifed form which is a pre-requisite for loop fusion
2081   // pass. Added only for new PM since the legacy PM has already added
2082   // LoopSimplify pass as a dependency.
2083   bool Changed = false;
2084   for (auto &L : LI) {
2085     Changed |=
2086         simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
2087   }
2088   if (Changed)
2089     PDT.recalculate(F);
2090 
2091   LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
2092   Changed |= LF.fuseLoops(F);
2093   if (!Changed)
2094     return PreservedAnalyses::all();
2095 
2096   PreservedAnalyses PA;
2097   PA.preserve<DominatorTreeAnalysis>();
2098   PA.preserve<PostDominatorTreeAnalysis>();
2099   PA.preserve<ScalarEvolutionAnalysis>();
2100   PA.preserve<LoopAnalysis>();
2101   return PA;
2102 }
2103