xref: /llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp (revision baf045fb284c8d1c263478b5bad99b737b905e4b)
1 //===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
11 #include "llvm/ADT/DenseMap.h"
12 #include "llvm/ADT/STLExtras.h"
13 #include "llvm/ADT/Sequence.h"
14 #include "llvm/ADT/SetVector.h"
15 #include "llvm/ADT/SmallPtrSet.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/Statistic.h"
18 #include "llvm/ADT/Twine.h"
19 #include "llvm/Analysis/AssumptionCache.h"
20 #include "llvm/Analysis/CFG.h"
21 #include "llvm/Analysis/CodeMetrics.h"
22 #include "llvm/Analysis/LoopAnalysisManager.h"
23 #include "llvm/Analysis/LoopInfo.h"
24 #include "llvm/Analysis/LoopIterator.h"
25 #include "llvm/Analysis/LoopPass.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/Constant.h"
28 #include "llvm/IR/Constants.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/Function.h"
31 #include "llvm/IR/InstrTypes.h"
32 #include "llvm/IR/Instruction.h"
33 #include "llvm/IR/Instructions.h"
34 #include "llvm/IR/IntrinsicInst.h"
35 #include "llvm/IR/Use.h"
36 #include "llvm/IR/Value.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/Casting.h"
39 #include "llvm/Support/Debug.h"
40 #include "llvm/Support/ErrorHandling.h"
41 #include "llvm/Support/GenericDomTree.h"
42 #include "llvm/Support/raw_ostream.h"
43 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
44 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
45 #include "llvm/Transforms/Utils/Cloning.h"
46 #include "llvm/Transforms/Utils/LoopUtils.h"
47 #include "llvm/Transforms/Utils/ValueMapper.h"
48 #include <algorithm>
49 #include <cassert>
50 #include <iterator>
51 #include <numeric>
52 #include <utility>
53 
54 #define DEBUG_TYPE "simple-loop-unswitch"
55 
56 using namespace llvm;
57 
58 STATISTIC(NumBranches, "Number of branches unswitched");
59 STATISTIC(NumSwitches, "Number of switches unswitched");
60 STATISTIC(NumTrivial, "Number of unswitches that are trivial");
61 
62 static cl::opt<bool> EnableNonTrivialUnswitch(
63     "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
64     cl::desc("Forcibly enables non-trivial loop unswitching rather than "
65              "following the configuration passed into the pass."));
66 
67 static cl::opt<int>
68     UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
69                       cl::desc("The cost threshold for unswitching a loop."));
70 
71 static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
72                                         Constant &Replacement) {
73   assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
74 
75   // Replace uses of LIC in the loop with the given constant.
76   for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) {
77     // Grab the use and walk past it so we can clobber it in the use list.
78     Use *U = &*UI++;
79     Instruction *UserI = dyn_cast<Instruction>(U->getUser());
80     if (!UserI || !L.contains(UserI))
81       continue;
82 
83     // Replace this use within the loop body.
84     *U = &Replacement;
85   }
86 }
87 
88 /// Check that all the LCSSA PHI nodes in the loop exit block have trivial
89 /// incoming values along this edge.
90 static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
91                                          BasicBlock &ExitBB) {
92   for (Instruction &I : ExitBB) {
93     auto *PN = dyn_cast<PHINode>(&I);
94     if (!PN)
95       // No more PHIs to check.
96       return true;
97 
98     // If the incoming value for this edge isn't loop invariant the unswitch
99     // won't be trivial.
100     if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
101       return false;
102   }
103   llvm_unreachable("Basic blocks should never be empty!");
104 }
105 
106 /// Rewrite the PHI nodes in an unswitched loop exit basic block.
107 ///
108 /// Requires that the loop exit and unswitched basic block are the same, and
109 /// that the exiting block was a unique predecessor of that block. Rewrites the
110 /// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
111 /// PHI nodes from the old preheader that now contains the unswitched
112 /// terminator.
113 static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
114                                                   BasicBlock &OldExitingBB,
115                                                   BasicBlock &OldPH) {
116   for (PHINode &PN : UnswitchedBB.phis()) {
117     // When the loop exit is directly unswitched we just need to update the
118     // incoming basic block. We loop to handle weird cases with repeated
119     // incoming blocks, but expect to typically only have one operand here.
120     for (auto i : seq<int>(0, PN.getNumOperands())) {
121       assert(PN.getIncomingBlock(i) == &OldExitingBB &&
122              "Found incoming block different from unique predecessor!");
123       PN.setIncomingBlock(i, &OldPH);
124     }
125   }
126 }
127 
128 /// Rewrite the PHI nodes in the loop exit basic block and the split off
129 /// unswitched block.
130 ///
131 /// Because the exit block remains an exit from the loop, this rewrites the
132 /// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
133 /// nodes into the unswitched basic block to select between the value in the
134 /// old preheader and the loop exit.
135 static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
136                                                       BasicBlock &UnswitchedBB,
137                                                       BasicBlock &OldExitingBB,
138                                                       BasicBlock &OldPH) {
139   assert(&ExitBB != &UnswitchedBB &&
140          "Must have different loop exit and unswitched blocks!");
141   Instruction *InsertPt = &*UnswitchedBB.begin();
142   for (PHINode &PN : ExitBB.phis()) {
143     auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2,
144                                   PN.getName() + ".split", InsertPt);
145 
146     // Walk backwards over the old PHI node's inputs to minimize the cost of
147     // removing each one. We have to do this weird loop manually so that we
148     // create the same number of new incoming edges in the new PHI as we expect
149     // each case-based edge to be included in the unswitched switch in some
150     // cases.
151     // FIXME: This is really, really gross. It would be much cleaner if LLVM
152     // allowed us to create a single entry for a predecessor block without
153     // having separate entries for each "edge" even though these edges are
154     // required to produce identical results.
155     for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) {
156       if (PN.getIncomingBlock(i) != &OldExitingBB)
157         continue;
158 
159       Value *Incoming = PN.removeIncomingValue(i);
160       NewPN->addIncoming(Incoming, &OldPH);
161     }
162 
163     // Now replace the old PHI with the new one and wire the old one in as an
164     // input to the new one.
165     PN.replaceAllUsesWith(NewPN);
166     NewPN->addIncoming(&PN, &ExitBB);
167   }
168 }
169 
170 /// Unswitch a trivial branch if the condition is loop invariant.
171 ///
172 /// This routine should only be called when loop code leading to the branch has
173 /// been validated as trivial (no side effects). This routine checks if the
174 /// condition is invariant and one of the successors is a loop exit. This
175 /// allows us to unswitch without duplicating the loop, making it trivial.
176 ///
177 /// If this routine fails to unswitch the branch it returns false.
178 ///
179 /// If the branch can be unswitched, this routine splits the preheader and
180 /// hoists the branch above that split. Preserves loop simplified form
181 /// (splitting the exit block as necessary). It simplifies the branch within
182 /// the loop to an unconditional branch but doesn't remove it entirely. Further
183 /// cleanup can be done with some simplify-cfg like pass.
184 static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
185                                   LoopInfo &LI) {
186   assert(BI.isConditional() && "Can only unswitch a conditional branch!");
187   DEBUG(dbgs() << "  Trying to unswitch branch: " << BI << "\n");
188 
189   Value *LoopCond = BI.getCondition();
190 
191   // Need a trivial loop condition to unswitch.
192   if (!L.isLoopInvariant(LoopCond))
193     return false;
194 
195   // Check to see if a successor of the branch is guaranteed to
196   // exit through a unique exit block without having any
197   // side-effects.  If so, determine the value of Cond that causes
198   // it to do this.
199   ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext());
200   ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext());
201   int LoopExitSuccIdx = 0;
202   auto *LoopExitBB = BI.getSuccessor(0);
203   if (L.contains(LoopExitBB)) {
204     std::swap(CondVal, Replacement);
205     LoopExitSuccIdx = 1;
206     LoopExitBB = BI.getSuccessor(1);
207     if (L.contains(LoopExitBB))
208       return false;
209   }
210   auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
211   auto *ParentBB = BI.getParent();
212   if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
213     return false;
214 
215   DEBUG(dbgs() << "    unswitching trivial branch when: " << CondVal
216                << " == " << LoopCond << "\n");
217 
218   // Split the preheader, so that we know that there is a safe place to insert
219   // the conditional branch. We will change the preheader to have a conditional
220   // branch on LoopCond.
221   BasicBlock *OldPH = L.getLoopPreheader();
222   BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
223 
224   // Now that we have a place to insert the conditional branch, create a place
225   // to branch to: this is the exit block out of the loop that we are
226   // unswitching. We need to split this if there are other loop predecessors.
227   // Because the loop is in simplified form, *any* other predecessor is enough.
228   BasicBlock *UnswitchedBB;
229   if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
230     (void)PredBB;
231     assert(PredBB == BI.getParent() &&
232            "A branch's parent isn't a predecessor!");
233     UnswitchedBB = LoopExitBB;
234   } else {
235     UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
236   }
237 
238   // Now splice the branch to gate reaching the new preheader and re-point its
239   // successors.
240   OldPH->getInstList().splice(std::prev(OldPH->end()),
241                               BI.getParent()->getInstList(), BI);
242   OldPH->getTerminator()->eraseFromParent();
243   BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
244   BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
245 
246   // Create a new unconditional branch that will continue the loop as a new
247   // terminator.
248   BranchInst::Create(ContinueBB, ParentBB);
249 
250   // Rewrite the relevant PHI nodes.
251   if (UnswitchedBB == LoopExitBB)
252     rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
253   else
254     rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
255                                               *ParentBB, *OldPH);
256 
257   // Now we need to update the dominator tree.
258   DT.applyUpdates(
259       {{DT.Delete, ParentBB, UnswitchedBB}, {DT.Insert, OldPH, UnswitchedBB}});
260 
261   // Since this is an i1 condition we can also trivially replace uses of it
262   // within the loop with a constant.
263   replaceLoopUsesWithConstant(L, *LoopCond, *Replacement);
264 
265   ++NumTrivial;
266   ++NumBranches;
267   return true;
268 }
269 
270 /// Unswitch a trivial switch if the condition is loop invariant.
271 ///
272 /// This routine should only be called when loop code leading to the switch has
273 /// been validated as trivial (no side effects). This routine checks if the
274 /// condition is invariant and that at least one of the successors is a loop
275 /// exit. This allows us to unswitch without duplicating the loop, making it
276 /// trivial.
277 ///
278 /// If this routine fails to unswitch the switch it returns false.
279 ///
280 /// If the switch can be unswitched, this routine splits the preheader and
281 /// copies the switch above that split. If the default case is one of the
282 /// exiting cases, it copies the non-exiting cases and points them at the new
283 /// preheader. If the default case is not exiting, it copies the exiting cases
284 /// and points the default at the preheader. It preserves loop simplified form
285 /// (splitting the exit blocks as necessary). It simplifies the switch within
286 /// the loop by removing now-dead cases. If the default case is one of those
287 /// unswitched, it replaces its destination with a new basic block containing
288 /// only unreachable. Such basic blocks, while technically loop exits, are not
289 /// considered for unswitching so this is a stable transform and the same
290 /// switch will not be revisited. If after unswitching there is only a single
291 /// in-loop successor, the switch is further simplified to an unconditional
292 /// branch. Still more cleanup can be done with some simplify-cfg like pass.
293 static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
294                                   LoopInfo &LI) {
295   DEBUG(dbgs() << "  Trying to unswitch switch: " << SI << "\n");
296   Value *LoopCond = SI.getCondition();
297 
298   // If this isn't switching on an invariant condition, we can't unswitch it.
299   if (!L.isLoopInvariant(LoopCond))
300     return false;
301 
302   auto *ParentBB = SI.getParent();
303 
304   SmallVector<int, 4> ExitCaseIndices;
305   for (auto Case : SI.cases()) {
306     auto *SuccBB = Case.getCaseSuccessor();
307     if (!L.contains(SuccBB) &&
308         areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
309       ExitCaseIndices.push_back(Case.getCaseIndex());
310   }
311   BasicBlock *DefaultExitBB = nullptr;
312   if (!L.contains(SI.getDefaultDest()) &&
313       areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
314       !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
315     DefaultExitBB = SI.getDefaultDest();
316   else if (ExitCaseIndices.empty())
317     return false;
318 
319   DEBUG(dbgs() << "    unswitching trivial cases...\n");
320 
321   SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases;
322   ExitCases.reserve(ExitCaseIndices.size());
323   // We walk the case indices backwards so that we remove the last case first
324   // and don't disrupt the earlier indices.
325   for (unsigned Index : reverse(ExitCaseIndices)) {
326     auto CaseI = SI.case_begin() + Index;
327     // Save the value of this case.
328     ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()});
329     // Delete the unswitched cases.
330     SI.removeCase(CaseI);
331   }
332 
333   // Check if after this all of the remaining cases point at the same
334   // successor.
335   BasicBlock *CommonSuccBB = nullptr;
336   if (SI.getNumCases() > 0 &&
337       std::all_of(std::next(SI.case_begin()), SI.case_end(),
338                   [&SI](const SwitchInst::CaseHandle &Case) {
339                     return Case.getCaseSuccessor() ==
340                            SI.case_begin()->getCaseSuccessor();
341                   }))
342     CommonSuccBB = SI.case_begin()->getCaseSuccessor();
343 
344   if (DefaultExitBB) {
345     // We can't remove the default edge so replace it with an edge to either
346     // the single common remaining successor (if we have one) or an unreachable
347     // block.
348     if (CommonSuccBB) {
349       SI.setDefaultDest(CommonSuccBB);
350     } else {
351       BasicBlock *UnreachableBB = BasicBlock::Create(
352           ParentBB->getContext(),
353           Twine(ParentBB->getName()) + ".unreachable_default",
354           ParentBB->getParent());
355       new UnreachableInst(ParentBB->getContext(), UnreachableBB);
356       SI.setDefaultDest(UnreachableBB);
357       DT.addNewBlock(UnreachableBB, ParentBB);
358     }
359   } else {
360     // If we're not unswitching the default, we need it to match any cases to
361     // have a common successor or if we have no cases it is the common
362     // successor.
363     if (SI.getNumCases() == 0)
364       CommonSuccBB = SI.getDefaultDest();
365     else if (SI.getDefaultDest() != CommonSuccBB)
366       CommonSuccBB = nullptr;
367   }
368 
369   // Split the preheader, so that we know that there is a safe place to insert
370   // the switch.
371   BasicBlock *OldPH = L.getLoopPreheader();
372   BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
373   OldPH->getTerminator()->eraseFromParent();
374 
375   // Now add the unswitched switch.
376   auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
377 
378   // Rewrite the IR for the unswitched basic blocks. This requires two steps.
379   // First, we split any exit blocks with remaining in-loop predecessors. Then
380   // we update the PHIs in one of two ways depending on if there was a split.
381   // We walk in reverse so that we split in the same order as the cases
382   // appeared. This is purely for convenience of reading the resulting IR, but
383   // it doesn't cost anything really.
384   SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
385   SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
386   // Handle the default exit if necessary.
387   // FIXME: It'd be great if we could merge this with the loop below but LLVM's
388   // ranges aren't quite powerful enough yet.
389   if (DefaultExitBB) {
390     if (pred_empty(DefaultExitBB)) {
391       UnswitchedExitBBs.insert(DefaultExitBB);
392       rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
393     } else {
394       auto *SplitBB =
395           SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
396       rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
397                                                 *ParentBB, *OldPH);
398       DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
399     }
400   }
401   // Note that we must use a reference in the for loop so that we update the
402   // container.
403   for (auto &CasePair : reverse(ExitCases)) {
404     // Grab a reference to the exit block in the pair so that we can update it.
405     BasicBlock *ExitBB = CasePair.second;
406 
407     // If this case is the last edge into the exit block, we can simply reuse it
408     // as it will no longer be a loop exit. No mapping necessary.
409     if (pred_empty(ExitBB)) {
410       // Only rewrite once.
411       if (UnswitchedExitBBs.insert(ExitBB).second)
412         rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
413       continue;
414     }
415 
416     // Otherwise we need to split the exit block so that we retain an exit
417     // block from the loop and a target for the unswitched condition.
418     BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
419     if (!SplitExitBB) {
420       // If this is the first time we see this, do the split and remember it.
421       SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
422       rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
423                                                 *ParentBB, *OldPH);
424     }
425     // Update the case pair to point to the split block.
426     CasePair.second = SplitExitBB;
427   }
428 
429   // Now add the unswitched cases. We do this in reverse order as we built them
430   // in reverse order.
431   for (auto CasePair : reverse(ExitCases)) {
432     ConstantInt *CaseVal = CasePair.first;
433     BasicBlock *UnswitchedBB = CasePair.second;
434 
435     NewSI->addCase(CaseVal, UnswitchedBB);
436   }
437 
438   // If the default was unswitched, re-point it and add explicit cases for
439   // entering the loop.
440   if (DefaultExitBB) {
441     NewSI->setDefaultDest(DefaultExitBB);
442 
443     // We removed all the exit cases, so we just copy the cases to the
444     // unswitched switch.
445     for (auto Case : SI.cases())
446       NewSI->addCase(Case.getCaseValue(), NewPH);
447   }
448 
449   // If we ended up with a common successor for every path through the switch
450   // after unswitching, rewrite it to an unconditional branch to make it easy
451   // to recognize. Otherwise we potentially have to recognize the default case
452   // pointing at unreachable and other complexity.
453   if (CommonSuccBB) {
454     BasicBlock *BB = SI.getParent();
455     SI.eraseFromParent();
456     BranchInst::Create(CommonSuccBB, BB);
457   }
458 
459   // Walk the unswitched exit blocks and the unswitched split blocks and update
460   // the dominator tree based on the CFG edits. While we are walking unordered
461   // containers here, the API for applyUpdates takes an unordered list of
462   // updates and requires them to not contain duplicates.
463   SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
464   for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
465     DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
466     DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
467   }
468   for (auto SplitUnswitchedPair : SplitExitBBMap) {
469     auto *UnswitchedBB = SplitUnswitchedPair.second;
470     DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedBB});
471     DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB});
472   }
473   DT.applyUpdates(DTUpdates);
474 
475   assert(DT.verify(DominatorTree::VerificationLevel::Fast));
476   ++NumTrivial;
477   ++NumSwitches;
478   return true;
479 }
480 
481 /// This routine scans the loop to find a branch or switch which occurs before
482 /// any side effects occur. These can potentially be unswitched without
483 /// duplicating the loop. If a branch or switch is successfully unswitched the
484 /// scanning continues to see if subsequent branches or switches have become
485 /// trivial. Once all trivial candidates have been unswitched, this routine
486 /// returns.
487 ///
488 /// The return value indicates whether anything was unswitched (and therefore
489 /// changed).
490 static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
491                                          LoopInfo &LI) {
492   bool Changed = false;
493 
494   // If loop header has only one reachable successor we should keep looking for
495   // trivial condition candidates in the successor as well. An alternative is
496   // to constant fold conditions and merge successors into loop header (then we
497   // only need to check header's terminator). The reason for not doing this in
498   // LoopUnswitch pass is that it could potentially break LoopPassManager's
499   // invariants. Folding dead branches could either eliminate the current loop
500   // or make other loops unreachable. LCSSA form might also not be preserved
501   // after deleting branches. The following code keeps traversing loop header's
502   // successors until it finds the trivial condition candidate (condition that
503   // is not a constant). Since unswitching generates branches with constant
504   // conditions, this scenario could be very common in practice.
505   BasicBlock *CurrentBB = L.getHeader();
506   SmallPtrSet<BasicBlock *, 8> Visited;
507   Visited.insert(CurrentBB);
508   do {
509     // Check if there are any side-effecting instructions (e.g. stores, calls,
510     // volatile loads) in the part of the loop that the code *would* execute
511     // without unswitching.
512     if (llvm::any_of(*CurrentBB,
513                      [](Instruction &I) { return I.mayHaveSideEffects(); }))
514       return Changed;
515 
516     TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
517 
518     if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
519       // Don't bother trying to unswitch past a switch with a constant
520       // condition. This should be removed prior to running this pass by
521       // simplify-cfg.
522       if (isa<Constant>(SI->getCondition()))
523         return Changed;
524 
525       if (!unswitchTrivialSwitch(L, *SI, DT, LI))
526         // Coludn't unswitch this one so we're done.
527         return Changed;
528 
529       // Mark that we managed to unswitch something.
530       Changed = true;
531 
532       // If unswitching turned the terminator into an unconditional branch then
533       // we can continue. The unswitching logic specifically works to fold any
534       // cases it can into an unconditional branch to make it easier to
535       // recognize here.
536       auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
537       if (!BI || BI->isConditional())
538         return Changed;
539 
540       CurrentBB = BI->getSuccessor(0);
541       continue;
542     }
543 
544     auto *BI = dyn_cast<BranchInst>(CurrentTerm);
545     if (!BI)
546       // We do not understand other terminator instructions.
547       return Changed;
548 
549     // Don't bother trying to unswitch past an unconditional branch or a branch
550     // with a constant value. These should be removed by simplify-cfg prior to
551     // running this pass.
552     if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
553       return Changed;
554 
555     // Found a trivial condition candidate: non-foldable conditional branch. If
556     // we fail to unswitch this, we can't do anything else that is trivial.
557     if (!unswitchTrivialBranch(L, *BI, DT, LI))
558       return Changed;
559 
560     // Mark that we managed to unswitch something.
561     Changed = true;
562 
563     // We unswitched the branch. This should always leave us with an
564     // unconditional branch that we can follow now.
565     BI = cast<BranchInst>(CurrentBB->getTerminator());
566     assert(!BI->isConditional() &&
567            "Cannot form a conditional branch by unswitching1");
568     CurrentBB = BI->getSuccessor(0);
569 
570     // When continuing, if we exit the loop or reach a previous visited block,
571     // then we can not reach any trivial condition candidates (unfoldable
572     // branch instructions or switch instructions) and no unswitch can happen.
573   } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
574 
575   return Changed;
576 }
577 
578 /// Build the cloned blocks for an unswitched copy of the given loop.
579 ///
580 /// The cloned blocks are inserted before the loop preheader (`LoopPH`) and
581 /// after the split block (`SplitBB`) that will be used to select between the
582 /// cloned and original loop.
583 ///
584 /// This routine handles cloning all of the necessary loop blocks and exit
585 /// blocks including rewriting their instructions and the relevant PHI nodes.
586 /// It skips loop and exit blocks that are not necessary based on the provided
587 /// set. It also correctly creates the unconditional branch in the cloned
588 /// unswitched parent block to only point at the unswitched successor.
589 ///
590 /// This does not handle most of the necessary updates to `LoopInfo`. Only exit
591 /// block splitting is correctly reflected in `LoopInfo`, essentially all of
592 /// the cloned blocks (and their loops) are left without full `LoopInfo`
593 /// updates. This also doesn't fully update `DominatorTree`. It adds the cloned
594 /// blocks to them but doesn't create the cloned `DominatorTree` structure and
595 /// instead the caller must recompute an accurate DT. It *does* correctly
596 /// update the `AssumptionCache` provided in `AC`.
597 static BasicBlock *buildClonedLoopBlocks(
598     Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
599     ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
600     BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
601     const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks,
602     ValueToValueMapTy &VMap,
603     SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
604     DominatorTree &DT, LoopInfo &LI) {
605   SmallVector<BasicBlock *, 4> NewBlocks;
606   NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
607 
608   // We will need to clone a bunch of blocks, wrap up the clone operation in
609   // a helper.
610   auto CloneBlock = [&](BasicBlock *OldBB) {
611     // Clone the basic block and insert it before the new preheader.
612     BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent());
613     NewBB->moveBefore(LoopPH);
614 
615     // Record this block and the mapping.
616     NewBlocks.push_back(NewBB);
617     VMap[OldBB] = NewBB;
618 
619     return NewBB;
620   };
621 
622   // First, clone the preheader.
623   auto *ClonedPH = CloneBlock(LoopPH);
624 
625   // Then clone all the loop blocks, skipping the ones that aren't necessary.
626   for (auto *LoopBB : L.blocks())
627     if (!SkippedLoopAndExitBlocks.count(LoopBB))
628       CloneBlock(LoopBB);
629 
630   // Split all the loop exit edges so that when we clone the exit blocks, if
631   // any of the exit blocks are *also* a preheader for some other loop, we
632   // don't create multiple predecessors entering the loop header.
633   for (auto *ExitBB : ExitBlocks) {
634     if (SkippedLoopAndExitBlocks.count(ExitBB))
635       continue;
636 
637     // When we are going to clone an exit, we don't need to clone all the
638     // instructions in the exit block and we want to ensure we have an easy
639     // place to merge the CFG, so split the exit first. This is always safe to
640     // do because there cannot be any non-loop predecessors of a loop exit in
641     // loop simplified form.
642     auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
643 
644     // Rearrange the names to make it easier to write test cases by having the
645     // exit block carry the suffix rather than the merge block carrying the
646     // suffix.
647     MergeBB->takeName(ExitBB);
648     ExitBB->setName(Twine(MergeBB->getName()) + ".split");
649 
650     // Now clone the original exit block.
651     auto *ClonedExitBB = CloneBlock(ExitBB);
652     assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
653            "Exit block should have been split to have one successor!");
654     assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
655            "Cloned exit block has the wrong successor!");
656 
657     // Remap any cloned instructions and create a merge phi node for them.
658     for (auto ZippedInsts : llvm::zip_first(
659              llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
660              llvm::make_range(ClonedExitBB->begin(),
661                               std::prev(ClonedExitBB->end())))) {
662       Instruction &I = std::get<0>(ZippedInsts);
663       Instruction &ClonedI = std::get<1>(ZippedInsts);
664 
665       // The only instructions in the exit block should be PHI nodes and
666       // potentially a landing pad.
667       assert(
668           (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
669           "Bad instruction in exit block!");
670       // We should have a value map between the instruction and its clone.
671       assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
672 
673       auto *MergePN =
674           PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
675                           &*MergeBB->getFirstInsertionPt());
676       I.replaceAllUsesWith(MergePN);
677       MergePN->addIncoming(&I, ExitBB);
678       MergePN->addIncoming(&ClonedI, ClonedExitBB);
679     }
680   }
681 
682   // Rewrite the instructions in the cloned blocks to refer to the instructions
683   // in the cloned blocks. We have to do this as a second pass so that we have
684   // everything available. Also, we have inserted new instructions which may
685   // include assume intrinsics, so we update the assumption cache while
686   // processing this.
687   for (auto *ClonedBB : NewBlocks)
688     for (Instruction &I : *ClonedBB) {
689       RemapInstruction(&I, VMap,
690                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
691       if (auto *II = dyn_cast<IntrinsicInst>(&I))
692         if (II->getIntrinsicID() == Intrinsic::assume)
693           AC.registerAssumption(II);
694     }
695 
696   // Remove the cloned parent as a predecessor of the cloned continue successor
697   // if we did in fact clone it.
698   auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
699   if (auto *ClonedContinueSuccBB =
700           cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB)))
701     ClonedContinueSuccBB->removePredecessor(ClonedParentBB,
702                                             /*DontDeleteUselessPHIs*/ true);
703   // Replace the cloned branch with an unconditional branch to the cloned
704   // unswitched successor.
705   auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
706   ClonedParentBB->getTerminator()->eraseFromParent();
707   BranchInst::Create(ClonedSuccBB, ClonedParentBB);
708 
709   // Update any PHI nodes in the cloned successors of the skipped blocks to not
710   // have spurious incoming values.
711   for (auto *LoopBB : L.blocks())
712     if (SkippedLoopAndExitBlocks.count(LoopBB))
713       for (auto *SuccBB : successors(LoopBB))
714         if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
715           for (PHINode &PN : ClonedSuccBB->phis())
716             PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
717 
718   // Record the domtree updates for the new blocks.
719   SmallPtrSet<BasicBlock *, 4> SuccSet;
720   for (auto *ClonedBB : NewBlocks) {
721     for (auto *SuccBB : successors(ClonedBB))
722       if (SuccSet.insert(SuccBB).second)
723         DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
724     SuccSet.clear();
725   }
726 
727   return ClonedPH;
728 }
729 
730 /// Recursively clone the specified loop and all of its children.
731 ///
732 /// The target parent loop for the clone should be provided, or can be null if
733 /// the clone is a top-level loop. While cloning, all the blocks are mapped
734 /// with the provided value map. The entire original loop must be present in
735 /// the value map. The cloned loop is returned.
736 static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
737                            const ValueToValueMapTy &VMap, LoopInfo &LI) {
738   auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
739     assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
740     ClonedL.reserveBlocks(OrigL.getNumBlocks());
741     for (auto *BB : OrigL.blocks()) {
742       auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
743       ClonedL.addBlockEntry(ClonedBB);
744       if (LI.getLoopFor(BB) == &OrigL)
745         LI.changeLoopFor(ClonedBB, &ClonedL);
746     }
747   };
748 
749   // We specially handle the first loop because it may get cloned into
750   // a different parent and because we most commonly are cloning leaf loops.
751   Loop *ClonedRootL = LI.AllocateLoop();
752   if (RootParentL)
753     RootParentL->addChildLoop(ClonedRootL);
754   else
755     LI.addTopLevelLoop(ClonedRootL);
756   AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
757 
758   if (OrigRootL.empty())
759     return ClonedRootL;
760 
761   // If we have a nest, we can quickly clone the entire loop nest using an
762   // iterative approach because it is a tree. We keep the cloned parent in the
763   // data structure to avoid repeatedly querying through a map to find it.
764   SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
765   // Build up the loops to clone in reverse order as we'll clone them from the
766   // back.
767   for (Loop *ChildL : llvm::reverse(OrigRootL))
768     LoopsToClone.push_back({ClonedRootL, ChildL});
769   do {
770     Loop *ClonedParentL, *L;
771     std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val();
772     Loop *ClonedL = LI.AllocateLoop();
773     ClonedParentL->addChildLoop(ClonedL);
774     AddClonedBlocksToLoop(*L, *ClonedL);
775     for (Loop *ChildL : llvm::reverse(*L))
776       LoopsToClone.push_back({ClonedL, ChildL});
777   } while (!LoopsToClone.empty());
778 
779   return ClonedRootL;
780 }
781 
782 /// Build the cloned loops of an original loop from unswitching.
783 ///
784 /// Because unswitching simplifies the CFG of the loop, this isn't a trivial
785 /// operation. We need to re-verify that there even is a loop (as the backedge
786 /// may not have been cloned), and even if there are remaining backedges the
787 /// backedge set may be different. However, we know that each child loop is
788 /// undisturbed, we only need to find where to place each child loop within
789 /// either any parent loop or within a cloned version of the original loop.
790 ///
791 /// Because child loops may end up cloned outside of any cloned version of the
792 /// original loop, multiple cloned sibling loops may be created. All of them
793 /// are returned so that the newly introduced loop nest roots can be
794 /// identified.
795 static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
796                               const ValueToValueMapTy &VMap, LoopInfo &LI,
797                               SmallVectorImpl<Loop *> &NonChildClonedLoops) {
798   Loop *ClonedL = nullptr;
799 
800   auto *OrigPH = OrigL.getLoopPreheader();
801   auto *OrigHeader = OrigL.getHeader();
802 
803   auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH));
804   auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader));
805 
806   // We need to know the loops of the cloned exit blocks to even compute the
807   // accurate parent loop. If we only clone exits to some parent of the
808   // original parent, we want to clone into that outer loop. We also keep track
809   // of the loops that our cloned exit blocks participate in.
810   Loop *ParentL = nullptr;
811   SmallVector<BasicBlock *, 4> ClonedExitsInLoops;
812   SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap;
813   ClonedExitsInLoops.reserve(ExitBlocks.size());
814   for (auto *ExitBB : ExitBlocks)
815     if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB)))
816       if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
817         ExitLoopMap[ClonedExitBB] = ExitL;
818         ClonedExitsInLoops.push_back(ClonedExitBB);
819         if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
820           ParentL = ExitL;
821       }
822   assert((!ParentL || ParentL == OrigL.getParentLoop() ||
823           ParentL->contains(OrigL.getParentLoop())) &&
824          "The computed parent loop should always contain (or be) the parent of "
825          "the original loop.");
826 
827   // We build the set of blocks dominated by the cloned header from the set of
828   // cloned blocks out of the original loop. While not all of these will
829   // necessarily be in the cloned loop, it is enough to establish that they
830   // aren't in unreachable cycles, etc.
831   SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks;
832   for (auto *BB : OrigL.blocks())
833     if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)))
834       ClonedLoopBlocks.insert(ClonedBB);
835 
836   // Rebuild the set of blocks that will end up in the cloned loop. We may have
837   // skipped cloning some region of this loop which can in turn skip some of
838   // the backedges so we have to rebuild the blocks in the loop based on the
839   // backedges that remain after cloning.
840   SmallVector<BasicBlock *, 16> Worklist;
841   SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop;
842   for (auto *Pred : predecessors(ClonedHeader)) {
843     // The only possible non-loop header predecessor is the preheader because
844     // we know we cloned the loop in simplified form.
845     if (Pred == ClonedPH)
846       continue;
847 
848     // Because the loop was in simplified form, the only non-loop predecessor
849     // should be the preheader.
850     assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop "
851                                            "header other than the preheader "
852                                            "that is not part of the loop!");
853 
854     // Insert this block into the loop set and on the first visit (and if it
855     // isn't the header we're currently walking) put it into the worklist to
856     // recurse through.
857     if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader)
858       Worklist.push_back(Pred);
859   }
860 
861   // If we had any backedges then there *is* a cloned loop. Put the header into
862   // the loop set and then walk the worklist backwards to find all the blocks
863   // that remain within the loop after cloning.
864   if (!BlocksInClonedLoop.empty()) {
865     BlocksInClonedLoop.insert(ClonedHeader);
866 
867     while (!Worklist.empty()) {
868       BasicBlock *BB = Worklist.pop_back_val();
869       assert(BlocksInClonedLoop.count(BB) &&
870              "Didn't put block into the loop set!");
871 
872       // Insert any predecessors that are in the possible set into the cloned
873       // set, and if the insert is successful, add them to the worklist. Note
874       // that we filter on the blocks that are definitely reachable via the
875       // backedge to the loop header so we may prune out dead code within the
876       // cloned loop.
877       for (auto *Pred : predecessors(BB))
878         if (ClonedLoopBlocks.count(Pred) &&
879             BlocksInClonedLoop.insert(Pred).second)
880           Worklist.push_back(Pred);
881     }
882 
883     ClonedL = LI.AllocateLoop();
884     if (ParentL) {
885       ParentL->addBasicBlockToLoop(ClonedPH, LI);
886       ParentL->addChildLoop(ClonedL);
887     } else {
888       LI.addTopLevelLoop(ClonedL);
889     }
890 
891     ClonedL->reserveBlocks(BlocksInClonedLoop.size());
892     // We don't want to just add the cloned loop blocks based on how we
893     // discovered them. The original order of blocks was carefully built in
894     // a way that doesn't rely on predecessor ordering. Rather than re-invent
895     // that logic, we just re-walk the original blocks (and those of the child
896     // loops) and filter them as we add them into the cloned loop.
897     for (auto *BB : OrigL.blocks()) {
898       auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB));
899       if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB))
900         continue;
901 
902       // Directly add the blocks that are only in this loop.
903       if (LI.getLoopFor(BB) == &OrigL) {
904         ClonedL->addBasicBlockToLoop(ClonedBB, LI);
905         continue;
906       }
907 
908       // We want to manually add it to this loop and parents.
909       // Registering it with LoopInfo will happen when we clone the top
910       // loop for this block.
911       for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop())
912         PL->addBlockEntry(ClonedBB);
913     }
914 
915     // Now add each child loop whose header remains within the cloned loop. All
916     // of the blocks within the loop must satisfy the same constraints as the
917     // header so once we pass the header checks we can just clone the entire
918     // child loop nest.
919     for (Loop *ChildL : OrigL) {
920       auto *ClonedChildHeader =
921           cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
922       if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader))
923         continue;
924 
925 #ifndef NDEBUG
926       // We should never have a cloned child loop header but fail to have
927       // all of the blocks for that child loop.
928       for (auto *ChildLoopBB : ChildL->blocks())
929         assert(BlocksInClonedLoop.count(
930                    cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
931                "Child cloned loop has a header within the cloned outer "
932                "loop but not all of its blocks!");
933 #endif
934 
935       cloneLoopNest(*ChildL, ClonedL, VMap, LI);
936     }
937   }
938 
939   // Now that we've handled all the components of the original loop that were
940   // cloned into a new loop, we still need to handle anything from the original
941   // loop that wasn't in a cloned loop.
942 
943   // Figure out what blocks are left to place within any loop nest containing
944   // the unswitched loop. If we never formed a loop, the cloned PH is one of
945   // them.
946   SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
947   if (BlocksInClonedLoop.empty())
948     UnloopedBlockSet.insert(ClonedPH);
949   for (auto *ClonedBB : ClonedLoopBlocks)
950     if (!BlocksInClonedLoop.count(ClonedBB))
951       UnloopedBlockSet.insert(ClonedBB);
952 
953   // Copy the cloned exits and sort them in ascending loop depth, we'll work
954   // backwards across these to process them inside out. The order shouldn't
955   // matter as we're just trying to build up the map from inside-out; we use
956   // the map in a more stably ordered way below.
957   auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
958   llvm::sort(OrderedClonedExitsInLoops.begin(),
959              OrderedClonedExitsInLoops.end(),
960              [&](BasicBlock *LHS, BasicBlock *RHS) {
961                return ExitLoopMap.lookup(LHS)->getLoopDepth() <
962                       ExitLoopMap.lookup(RHS)->getLoopDepth();
963              });
964 
965   // Populate the existing ExitLoopMap with everything reachable from each
966   // exit, starting from the inner most exit.
967   while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
968     assert(Worklist.empty() && "Didn't clear worklist!");
969 
970     BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
971     Loop *ExitL = ExitLoopMap.lookup(ExitBB);
972 
973     // Walk the CFG back until we hit the cloned PH adding everything reachable
974     // and in the unlooped set to this exit block's loop.
975     Worklist.push_back(ExitBB);
976     do {
977       BasicBlock *BB = Worklist.pop_back_val();
978       // We can stop recursing at the cloned preheader (if we get there).
979       if (BB == ClonedPH)
980         continue;
981 
982       for (BasicBlock *PredBB : predecessors(BB)) {
983         // If this pred has already been moved to our set or is part of some
984         // (inner) loop, no update needed.
985         if (!UnloopedBlockSet.erase(PredBB)) {
986           assert(
987               (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
988               "Predecessor not mapped to a loop!");
989           continue;
990         }
991 
992         // We just insert into the loop set here. We'll add these blocks to the
993         // exit loop after we build up the set in an order that doesn't rely on
994         // predecessor order (which in turn relies on use list order).
995         bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
996         (void)Inserted;
997         assert(Inserted && "Should only visit an unlooped block once!");
998 
999         // And recurse through to its predecessors.
1000         Worklist.push_back(PredBB);
1001       }
1002     } while (!Worklist.empty());
1003   }
1004 
1005   // Now that the ExitLoopMap gives as  mapping for all the non-looping cloned
1006   // blocks to their outer loops, walk the cloned blocks and the cloned exits
1007   // in their original order adding them to the correct loop.
1008 
1009   // We need a stable insertion order. We use the order of the original loop
1010   // order and map into the correct parent loop.
1011   for (auto *BB : llvm::concat<BasicBlock *const>(
1012            makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
1013     if (Loop *OuterL = ExitLoopMap.lookup(BB))
1014       OuterL->addBasicBlockToLoop(BB, LI);
1015 
1016 #ifndef NDEBUG
1017   for (auto &BBAndL : ExitLoopMap) {
1018     auto *BB = BBAndL.first;
1019     auto *OuterL = BBAndL.second;
1020     assert(LI.getLoopFor(BB) == OuterL &&
1021            "Failed to put all blocks into outer loops!");
1022   }
1023 #endif
1024 
1025   // Now that all the blocks are placed into the correct containing loop in the
1026   // absence of child loops, find all the potentially cloned child loops and
1027   // clone them into whatever outer loop we placed their header into.
1028   for (Loop *ChildL : OrigL) {
1029     auto *ClonedChildHeader =
1030         cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
1031     if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
1032       continue;
1033 
1034 #ifndef NDEBUG
1035     for (auto *ChildLoopBB : ChildL->blocks())
1036       assert(VMap.count(ChildLoopBB) &&
1037              "Cloned a child loop header but not all of that loops blocks!");
1038 #endif
1039 
1040     NonChildClonedLoops.push_back(cloneLoopNest(
1041         *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
1042   }
1043 
1044   // Return the main cloned loop if any.
1045   return ClonedL;
1046 }
1047 
1048 static void
1049 deleteDeadBlocksFromLoop(Loop &L,
1050                          const SmallVectorImpl<BasicBlock *> &DeadBlocks,
1051                          SmallVectorImpl<BasicBlock *> &ExitBlocks,
1052                          DominatorTree &DT, LoopInfo &LI) {
1053   SmallPtrSet<BasicBlock *, 16> DeadBlockSet(DeadBlocks.begin(),
1054                                              DeadBlocks.end());
1055 
1056   // Filter out the dead blocks from the exit blocks list so that it can be
1057   // used in the caller.
1058   llvm::erase_if(ExitBlocks,
1059                  [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
1060 
1061   // Remove these blocks from their successors.
1062   for (auto *BB : DeadBlocks)
1063     for (BasicBlock *SuccBB : successors(BB))
1064       SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true);
1065 
1066   // Walk from this loop up through its parents removing all of the dead blocks.
1067   for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
1068     for (auto *BB : DeadBlocks)
1069       ParentL->getBlocksSet().erase(BB);
1070     llvm::erase_if(ParentL->getBlocksVector(),
1071                    [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
1072   }
1073 
1074   // Now delete the dead child loops. This raw delete will clear them
1075   // recursively.
1076   llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
1077     if (!DeadBlockSet.count(ChildL->getHeader()))
1078       return false;
1079 
1080     assert(llvm::all_of(ChildL->blocks(),
1081                         [&](BasicBlock *ChildBB) {
1082                           return DeadBlockSet.count(ChildBB);
1083                         }) &&
1084            "If the child loop header is dead all blocks in the child loop must "
1085            "be dead as well!");
1086     LI.destroy(ChildL);
1087     return true;
1088   });
1089 
1090   // Remove the loop mappings for the dead blocks and drop all the references
1091   // from these blocks to others to handle cyclic references as we start
1092   // deleting the blocks themselves.
1093   for (auto *BB : DeadBlocks) {
1094     // Check that the dominator tree has already been updated.
1095     assert(!DT.getNode(BB) && "Should already have cleared domtree!");
1096     LI.changeLoopFor(BB, nullptr);
1097     BB->dropAllReferences();
1098   }
1099 
1100   // Actually delete the blocks now that they've been fully unhooked from the
1101   // IR.
1102   for (auto *BB : DeadBlocks)
1103     BB->eraseFromParent();
1104 }
1105 
1106 /// Recompute the set of blocks in a loop after unswitching.
1107 ///
1108 /// This walks from the original headers predecessors to rebuild the loop. We
1109 /// take advantage of the fact that new blocks can't have been added, and so we
1110 /// filter by the original loop's blocks. This also handles potentially
1111 /// unreachable code that we don't want to explore but might be found examining
1112 /// the predecessors of the header.
1113 ///
1114 /// If the original loop is no longer a loop, this will return an empty set. If
1115 /// it remains a loop, all the blocks within it will be added to the set
1116 /// (including those blocks in inner loops).
1117 static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
1118                                                                  LoopInfo &LI) {
1119   SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
1120 
1121   auto *PH = L.getLoopPreheader();
1122   auto *Header = L.getHeader();
1123 
1124   // A worklist to use while walking backwards from the header.
1125   SmallVector<BasicBlock *, 16> Worklist;
1126 
1127   // First walk the predecessors of the header to find the backedges. This will
1128   // form the basis of our walk.
1129   for (auto *Pred : predecessors(Header)) {
1130     // Skip the preheader.
1131     if (Pred == PH)
1132       continue;
1133 
1134     // Because the loop was in simplified form, the only non-loop predecessor
1135     // is the preheader.
1136     assert(L.contains(Pred) && "Found a predecessor of the loop header other "
1137                                "than the preheader that is not part of the "
1138                                "loop!");
1139 
1140     // Insert this block into the loop set and on the first visit and, if it
1141     // isn't the header we're currently walking, put it into the worklist to
1142     // recurse through.
1143     if (LoopBlockSet.insert(Pred).second && Pred != Header)
1144       Worklist.push_back(Pred);
1145   }
1146 
1147   // If no backedges were found, we're done.
1148   if (LoopBlockSet.empty())
1149     return LoopBlockSet;
1150 
1151   // We found backedges, recurse through them to identify the loop blocks.
1152   while (!Worklist.empty()) {
1153     BasicBlock *BB = Worklist.pop_back_val();
1154     assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
1155 
1156     // No need to walk past the header.
1157     if (BB == Header)
1158       continue;
1159 
1160     // Because we know the inner loop structure remains valid we can use the
1161     // loop structure to jump immediately across the entire nested loop.
1162     // Further, because it is in loop simplified form, we can directly jump
1163     // to its preheader afterward.
1164     if (Loop *InnerL = LI.getLoopFor(BB))
1165       if (InnerL != &L) {
1166         assert(L.contains(InnerL) &&
1167                "Should not reach a loop *outside* this loop!");
1168         // The preheader is the only possible predecessor of the loop so
1169         // insert it into the set and check whether it was already handled.
1170         auto *InnerPH = InnerL->getLoopPreheader();
1171         assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
1172                                       "but not contain the inner loop "
1173                                       "preheader!");
1174         if (!LoopBlockSet.insert(InnerPH).second)
1175           // The only way to reach the preheader is through the loop body
1176           // itself so if it has been visited the loop is already handled.
1177           continue;
1178 
1179         // Insert all of the blocks (other than those already present) into
1180         // the loop set. We expect at least the block that led us to find the
1181         // inner loop to be in the block set, but we may also have other loop
1182         // blocks if they were already enqueued as predecessors of some other
1183         // outer loop block.
1184         for (auto *InnerBB : InnerL->blocks()) {
1185           if (InnerBB == BB) {
1186             assert(LoopBlockSet.count(InnerBB) &&
1187                    "Block should already be in the set!");
1188             continue;
1189           }
1190 
1191           LoopBlockSet.insert(InnerBB);
1192         }
1193 
1194         // Add the preheader to the worklist so we will continue past the
1195         // loop body.
1196         Worklist.push_back(InnerPH);
1197         continue;
1198       }
1199 
1200     // Insert any predecessors that were in the original loop into the new
1201     // set, and if the insert is successful, add them to the worklist.
1202     for (auto *Pred : predecessors(BB))
1203       if (L.contains(Pred) && LoopBlockSet.insert(Pred).second)
1204         Worklist.push_back(Pred);
1205   }
1206 
1207   assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");
1208 
1209   // We've found all the blocks participating in the loop, return our completed
1210   // set.
1211   return LoopBlockSet;
1212 }
1213 
1214 /// Rebuild a loop after unswitching removes some subset of blocks and edges.
1215 ///
1216 /// The removal may have removed some child loops entirely but cannot have
1217 /// disturbed any remaining child loops. However, they may need to be hoisted
1218 /// to the parent loop (or to be top-level loops). The original loop may be
1219 /// completely removed.
1220 ///
1221 /// The sibling loops resulting from this update are returned. If the original
1222 /// loop remains a valid loop, it will be the first entry in this list with all
1223 /// of the newly sibling loops following it.
1224 ///
1225 /// Returns true if the loop remains a loop after unswitching, and false if it
1226 /// is no longer a loop after unswitching (and should not continue to be
1227 /// referenced).
1228 static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
1229                                      LoopInfo &LI,
1230                                      SmallVectorImpl<Loop *> &HoistedLoops) {
1231   auto *PH = L.getLoopPreheader();
1232 
1233   // Compute the actual parent loop from the exit blocks. Because we may have
1234   // pruned some exits the loop may be different from the original parent.
1235   Loop *ParentL = nullptr;
1236   SmallVector<Loop *, 4> ExitLoops;
1237   SmallVector<BasicBlock *, 4> ExitsInLoops;
1238   ExitsInLoops.reserve(ExitBlocks.size());
1239   for (auto *ExitBB : ExitBlocks)
1240     if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
1241       ExitLoops.push_back(ExitL);
1242       ExitsInLoops.push_back(ExitBB);
1243       if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
1244         ParentL = ExitL;
1245     }
1246 
1247   // Recompute the blocks participating in this loop. This may be empty if it
1248   // is no longer a loop.
1249   auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
1250 
1251   // If we still have a loop, we need to re-set the loop's parent as the exit
1252   // block set changing may have moved it within the loop nest. Note that this
1253   // can only happen when this loop has a parent as it can only hoist the loop
1254   // *up* the nest.
1255   if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) {
1256     // Remove this loop's (original) blocks from all of the intervening loops.
1257     for (Loop *IL = L.getParentLoop(); IL != ParentL;
1258          IL = IL->getParentLoop()) {
1259       IL->getBlocksSet().erase(PH);
1260       for (auto *BB : L.blocks())
1261         IL->getBlocksSet().erase(BB);
1262       llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) {
1263         return BB == PH || L.contains(BB);
1264       });
1265     }
1266 
1267     LI.changeLoopFor(PH, ParentL);
1268     L.getParentLoop()->removeChildLoop(&L);
1269     if (ParentL)
1270       ParentL->addChildLoop(&L);
1271     else
1272       LI.addTopLevelLoop(&L);
1273   }
1274 
1275   // Now we update all the blocks which are no longer within the loop.
1276   auto &Blocks = L.getBlocksVector();
1277   auto BlocksSplitI =
1278       LoopBlockSet.empty()
1279           ? Blocks.begin()
1280           : std::stable_partition(
1281                 Blocks.begin(), Blocks.end(),
1282                 [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
1283 
1284   // Before we erase the list of unlooped blocks, build a set of them.
1285   SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
1286   if (LoopBlockSet.empty())
1287     UnloopedBlocks.insert(PH);
1288 
1289   // Now erase these blocks from the loop.
1290   for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
1291     L.getBlocksSet().erase(BB);
1292   Blocks.erase(BlocksSplitI, Blocks.end());
1293 
1294   // Sort the exits in ascending loop depth, we'll work backwards across these
1295   // to process them inside out.
1296   std::stable_sort(ExitsInLoops.begin(), ExitsInLoops.end(),
1297                    [&](BasicBlock *LHS, BasicBlock *RHS) {
1298                      return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
1299                    });
1300 
1301   // We'll build up a set for each exit loop.
1302   SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
1303   Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
1304 
1305   auto RemoveUnloopedBlocksFromLoop =
1306       [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
1307         for (auto *BB : UnloopedBlocks)
1308           L.getBlocksSet().erase(BB);
1309         llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
1310           return UnloopedBlocks.count(BB);
1311         });
1312       };
1313 
1314   SmallVector<BasicBlock *, 16> Worklist;
1315   while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
1316     assert(Worklist.empty() && "Didn't clear worklist!");
1317     assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
1318 
1319     // Grab the next exit block, in decreasing loop depth order.
1320     BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
1321     Loop &ExitL = *LI.getLoopFor(ExitBB);
1322     assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
1323 
1324     // Erase all of the unlooped blocks from the loops between the previous
1325     // exit loop and this exit loop. This works because the ExitInLoops list is
1326     // sorted in increasing order of loop depth and thus we visit loops in
1327     // decreasing order of loop depth.
1328     for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
1329       RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
1330 
1331     // Walk the CFG back until we hit the cloned PH adding everything reachable
1332     // and in the unlooped set to this exit block's loop.
1333     Worklist.push_back(ExitBB);
1334     do {
1335       BasicBlock *BB = Worklist.pop_back_val();
1336       // We can stop recursing at the cloned preheader (if we get there).
1337       if (BB == PH)
1338         continue;
1339 
1340       for (BasicBlock *PredBB : predecessors(BB)) {
1341         // If this pred has already been moved to our set or is part of some
1342         // (inner) loop, no update needed.
1343         if (!UnloopedBlocks.erase(PredBB)) {
1344           assert((NewExitLoopBlocks.count(PredBB) ||
1345                   ExitL.contains(LI.getLoopFor(PredBB))) &&
1346                  "Predecessor not in a nested loop (or already visited)!");
1347           continue;
1348         }
1349 
1350         // We just insert into the loop set here. We'll add these blocks to the
1351         // exit loop after we build up the set in a deterministic order rather
1352         // than the predecessor-influenced visit order.
1353         bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
1354         (void)Inserted;
1355         assert(Inserted && "Should only visit an unlooped block once!");
1356 
1357         // And recurse through to its predecessors.
1358         Worklist.push_back(PredBB);
1359       }
1360     } while (!Worklist.empty());
1361 
1362     // If blocks in this exit loop were directly part of the original loop (as
1363     // opposed to a child loop) update the map to point to this exit loop. This
1364     // just updates a map and so the fact that the order is unstable is fine.
1365     for (auto *BB : NewExitLoopBlocks)
1366       if (Loop *BBL = LI.getLoopFor(BB))
1367         if (BBL == &L || !L.contains(BBL))
1368           LI.changeLoopFor(BB, &ExitL);
1369 
1370     // We will remove the remaining unlooped blocks from this loop in the next
1371     // iteration or below.
1372     NewExitLoopBlocks.clear();
1373   }
1374 
1375   // Any remaining unlooped blocks are no longer part of any loop unless they
1376   // are part of some child loop.
1377   for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop())
1378     RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
1379   for (auto *BB : UnloopedBlocks)
1380     if (Loop *BBL = LI.getLoopFor(BB))
1381       if (BBL == &L || !L.contains(BBL))
1382         LI.changeLoopFor(BB, nullptr);
1383 
1384   // Sink all the child loops whose headers are no longer in the loop set to
1385   // the parent (or to be top level loops). We reach into the loop and directly
1386   // update its subloop vector to make this batch update efficient.
1387   auto &SubLoops = L.getSubLoopsVector();
1388   auto SubLoopsSplitI =
1389       LoopBlockSet.empty()
1390           ? SubLoops.begin()
1391           : std::stable_partition(
1392                 SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) {
1393                   return LoopBlockSet.count(SubL->getHeader());
1394                 });
1395   for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) {
1396     HoistedLoops.push_back(HoistedL);
1397     HoistedL->setParentLoop(nullptr);
1398 
1399     // To compute the new parent of this hoisted loop we look at where we
1400     // placed the preheader above. We can't lookup the header itself because we
1401     // retained the mapping from the header to the hoisted loop. But the
1402     // preheader and header should have the exact same new parent computed
1403     // based on the set of exit blocks from the original loop as the preheader
1404     // is a predecessor of the header and so reached in the reverse walk. And
1405     // because the loops were all in simplified form the preheader of the
1406     // hoisted loop can't be part of some *other* loop.
1407     if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader()))
1408       NewParentL->addChildLoop(HoistedL);
1409     else
1410       LI.addTopLevelLoop(HoistedL);
1411   }
1412   SubLoops.erase(SubLoopsSplitI, SubLoops.end());
1413 
1414   // Actually delete the loop if nothing remained within it.
1415   if (Blocks.empty()) {
1416     assert(SubLoops.empty() &&
1417            "Failed to remove all subloops from the original loop!");
1418     if (Loop *ParentL = L.getParentLoop())
1419       ParentL->removeChildLoop(llvm::find(*ParentL, &L));
1420     else
1421       LI.removeLoop(llvm::find(LI, &L));
1422     LI.destroy(&L);
1423     return false;
1424   }
1425 
1426   return true;
1427 }
1428 
1429 /// Helper to visit a dominator subtree, invoking a callable on each node.
1430 ///
1431 /// Returning false at any point will stop walking past that node of the tree.
1432 template <typename CallableT>
1433 void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
1434   SmallVector<DomTreeNode *, 4> DomWorklist;
1435   DomWorklist.push_back(DT[BB]);
1436 #ifndef NDEBUG
1437   SmallPtrSet<DomTreeNode *, 4> Visited;
1438   Visited.insert(DT[BB]);
1439 #endif
1440   do {
1441     DomTreeNode *N = DomWorklist.pop_back_val();
1442 
1443     // Visit this node.
1444     if (!Callable(N->getBlock()))
1445       continue;
1446 
1447     // Accumulate the child nodes.
1448     for (DomTreeNode *ChildN : *N) {
1449       assert(Visited.insert(ChildN).second &&
1450              "Cannot visit a node twice when walking a tree!");
1451       DomWorklist.push_back(ChildN);
1452     }
1453   } while (!DomWorklist.empty());
1454 }
1455 
1456 /// Take an invariant branch that has been determined to be safe and worthwhile
1457 /// to unswitch despite being non-trivial to do so and perform the unswitch.
1458 ///
1459 /// This directly updates the CFG to hoist the predicate out of the loop, and
1460 /// clone the necessary parts of the loop to maintain behavior.
1461 ///
1462 /// It also updates both dominator tree and loopinfo based on the unswitching.
1463 ///
1464 /// Once unswitching has been performed it runs the provided callback to report
1465 /// the new loops and no-longer valid loops to the caller.
1466 static bool unswitchInvariantBranch(
1467     Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI,
1468     AssumptionCache &AC,
1469     function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
1470   assert(BI.isConditional() && "Can only unswitch a conditional branch!");
1471   assert(L.isLoopInvariant(BI.getCondition()) &&
1472          "Can only unswitch an invariant branch condition!");
1473 
1474   // Constant and BBs tracking the cloned and continuing successor.
1475   const int ClonedSucc = 0;
1476   auto *ParentBB = BI.getParent();
1477   auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc);
1478   auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc);
1479 
1480   assert(UnswitchedSuccBB != ContinueSuccBB &&
1481          "Should not unswitch a branch that always goes to the same place!");
1482 
1483   // The branch should be in this exact loop. Any inner loop's invariant branch
1484   // should be handled by unswitching that inner loop. The caller of this
1485   // routine should filter out any candidates that remain (but were skipped for
1486   // whatever reason).
1487   assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
1488 
1489   SmallVector<BasicBlock *, 4> ExitBlocks;
1490   L.getUniqueExitBlocks(ExitBlocks);
1491 
1492   // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
1493   // don't know how to split those exit blocks.
1494   // FIXME: We should teach SplitBlock to handle this and remove this
1495   // restriction.
1496   for (auto *ExitBB : ExitBlocks)
1497     if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI()))
1498       return false;
1499 
1500   SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(),
1501                                             ExitBlocks.end());
1502 
1503   // Compute the parent loop now before we start hacking on things.
1504   Loop *ParentL = L.getParentLoop();
1505 
1506   // Compute the outer-most loop containing one of our exit blocks. This is the
1507   // furthest up our loopnest which can be mutated, which we will use below to
1508   // update things.
1509   Loop *OuterExitL = &L;
1510   for (auto *ExitBB : ExitBlocks) {
1511     Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
1512     if (!NewOuterExitL) {
1513       // We exited the entire nest with this block, so we're done.
1514       OuterExitL = nullptr;
1515       break;
1516     }
1517     if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL))
1518       OuterExitL = NewOuterExitL;
1519   }
1520 
1521   // If the edge we *aren't* cloning in the unswitch (the continuing edge)
1522   // dominates its target, we can skip cloning the dominated region of the loop
1523   // and its exits. We compute this as a set of nodes to be skipped.
1524   SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks;
1525   if (ContinueSuccBB->getUniquePredecessor() ||
1526       llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) {
1527         return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB);
1528       })) {
1529     visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) {
1530       SkippedLoopAndExitBlocks.insert(BB);
1531       return true;
1532     });
1533   }
1534   // Similarly, if the edge we *are* cloning in the unswitch (the unswitched
1535   // edge) dominates its target, we will end up with dead nodes in the original
1536   // loop and its exits that will need to be deleted. Here, we just retain that
1537   // the property holds and will compute the deleted set later.
1538   bool DeleteUnswitchedSucc =
1539       UnswitchedSuccBB->getUniquePredecessor() ||
1540       llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) {
1541         return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB);
1542       });
1543 
1544   // Split the preheader, so that we know that there is a safe place to insert
1545   // the conditional branch. We will change the preheader to have a conditional
1546   // branch on LoopCond. The original preheader will become the split point
1547   // between the unswitched versions, and we will have a new preheader for the
1548   // original loop.
1549   BasicBlock *SplitBB = L.getLoopPreheader();
1550   BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI);
1551 
1552   // Keep a mapping for the cloned values.
1553   ValueToValueMapTy VMap;
1554 
1555   // Keep track of the dominator tree updates needed.
1556   SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
1557 
1558   // Build the cloned blocks from the loop.
1559   auto *ClonedPH = buildClonedLoopBlocks(
1560       L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB,
1561       ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, DTUpdates, AC, DT, LI);
1562 
1563   // Remove the parent as a predecessor of the unswitched successor.
1564   UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true);
1565 
1566   // Now splice the branch from the original loop and use it to select between
1567   // the two loops.
1568   SplitBB->getTerminator()->eraseFromParent();
1569   SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI);
1570   BI.setSuccessor(ClonedSucc, ClonedPH);
1571   BI.setSuccessor(1 - ClonedSucc, LoopPH);
1572 
1573   // Create a new unconditional branch to the continuing block (as opposed to
1574   // the one cloned).
1575   BranchInst::Create(ContinueSuccBB, ParentBB);
1576 
1577   // Before we update the dominator tree, collect the dead blocks if we're going
1578   // to end up deleting the unswitched successor.
1579   SmallVector<BasicBlock *, 16> DeadBlocks;
1580   if (DeleteUnswitchedSucc) {
1581     DeadBlocks.push_back(UnswitchedSuccBB);
1582     for (int i = 0; i < (int)DeadBlocks.size(); ++i) {
1583       // If we reach an exit block, stop recursing as the unswitched loop will
1584       // end up reaching the merge block which we make the successor of the
1585       // exit.
1586       if (ExitBlockSet.count(DeadBlocks[i]))
1587         continue;
1588 
1589       // Insert the children that are within the loop or exit block set. Other
1590       // children may reach out of the loop. While we don't expect these to be
1591       // dead (as the unswitched clone should reach them) we don't try to prove
1592       // that here.
1593       for (DomTreeNode *ChildN : *DT[DeadBlocks[i]])
1594         if (L.contains(ChildN->getBlock()) ||
1595             ExitBlockSet.count(ChildN->getBlock()))
1596           DeadBlocks.push_back(ChildN->getBlock());
1597     }
1598   }
1599 
1600   // Add the remaining edges to our updates and apply them to get an up-to-date
1601   // dominator tree. Note that this will cause the dead blocks above to be
1602   // unreachable and no longer in the dominator tree.
1603   DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
1604   DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
1605   DT.applyUpdates(DTUpdates);
1606 
1607   // Build the cloned loop structure itself. This may be substantially
1608   // different from the original structure due to the simplified CFG. This also
1609   // handles inserting all the cloned blocks into the correct loops.
1610   SmallVector<Loop *, 4> NonChildClonedLoops;
1611   Loop *ClonedL =
1612       buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops);
1613 
1614   // Delete anything that was made dead in the original loop due to
1615   // unswitching.
1616   if (!DeadBlocks.empty())
1617     deleteDeadBlocksFromLoop(L, DeadBlocks, ExitBlocks, DT, LI);
1618 
1619   SmallVector<Loop *, 4> HoistedLoops;
1620   bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
1621 
1622   // This transformation has a high risk of corrupting the dominator tree, and
1623   // the below steps to rebuild loop structures will result in hard to debug
1624   // errors in that case so verify that the dominator tree is sane first.
1625   // FIXME: Remove this when the bugs stop showing up and rely on existing
1626   // verification steps.
1627   assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1628 
1629   // We can change which blocks are exit blocks of all the cloned sibling
1630   // loops, the current loop, and any parent loops which shared exit blocks
1631   // with the current loop. As a consequence, we need to re-form LCSSA for
1632   // them. But we shouldn't need to re-form LCSSA for any child loops.
1633   // FIXME: This could be made more efficient by tracking which exit blocks are
1634   // new, and focusing on them, but that isn't likely to be necessary.
1635   //
1636   // In order to reasonably rebuild LCSSA we need to walk inside-out across the
1637   // loop nest and update every loop that could have had its exits changed. We
1638   // also need to cover any intervening loops. We add all of these loops to
1639   // a list and sort them by loop depth to achieve this without updating
1640   // unnecessary loops.
1641   auto UpdateLCSSA = [&](Loop &UpdateL) {
1642 #ifndef NDEBUG
1643     UpdateL.verifyLoop();
1644     for (Loop *ChildL : UpdateL) {
1645       ChildL->verifyLoop();
1646       assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
1647              "Perturbed a child loop's LCSSA form!");
1648     }
1649 #endif
1650     formLCSSA(UpdateL, DT, &LI, nullptr);
1651   };
1652 
1653   // For non-child cloned loops and hoisted loops, we just need to update LCSSA
1654   // and we can do it in any order as they don't nest relative to each other.
1655   for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
1656     UpdateLCSSA(*UpdatedL);
1657 
1658   // If the original loop had exit blocks, walk up through the outer most loop
1659   // of those exit blocks to update LCSSA and form updated dedicated exits.
1660   if (OuterExitL != &L) {
1661     SmallVector<Loop *, 4> OuterLoops;
1662     // We start with the cloned loop and the current loop if they are loops and
1663     // move toward OuterExitL. Also, if either the cloned loop or the current
1664     // loop have become top level loops we need to walk all the way out.
1665     if (ClonedL) {
1666       OuterLoops.push_back(ClonedL);
1667       if (!ClonedL->getParentLoop())
1668         OuterExitL = nullptr;
1669     }
1670     if (IsStillLoop) {
1671       OuterLoops.push_back(&L);
1672       if (!L.getParentLoop())
1673         OuterExitL = nullptr;
1674     }
1675     // Grab all of the enclosing loops now.
1676     for (Loop *OuterL = ParentL; OuterL != OuterExitL;
1677          OuterL = OuterL->getParentLoop())
1678       OuterLoops.push_back(OuterL);
1679 
1680     // Finally, update our list of outer loops. This is nicely ordered to work
1681     // inside-out.
1682     for (Loop *OuterL : OuterLoops) {
1683       // First build LCSSA for this loop so that we can preserve it when
1684       // forming dedicated exits. We don't want to perturb some other loop's
1685       // LCSSA while doing that CFG edit.
1686       UpdateLCSSA(*OuterL);
1687 
1688       // For loops reached by this loop's original exit blocks we may
1689       // introduced new, non-dedicated exits. At least try to re-form dedicated
1690       // exits for these loops. This may fail if they couldn't have dedicated
1691       // exits to start with.
1692       formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true);
1693     }
1694   }
1695 
1696 #ifndef NDEBUG
1697   // Verify the entire loop structure to catch any incorrect updates before we
1698   // progress in the pass pipeline.
1699   LI.verify(DT);
1700 #endif
1701 
1702   // Now that we've unswitched something, make callbacks to report the changes.
1703   // For that we need to merge together the updated loops and the cloned loops
1704   // and check whether the original loop survived.
1705   SmallVector<Loop *, 4> SibLoops;
1706   for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
1707     if (UpdatedL->getParentLoop() == ParentL)
1708       SibLoops.push_back(UpdatedL);
1709   NonTrivialUnswitchCB(IsStillLoop, SibLoops);
1710 
1711   ++NumBranches;
1712   return true;
1713 }
1714 
1715 /// Recursively compute the cost of a dominator subtree based on the per-block
1716 /// cost map provided.
1717 ///
1718 /// The recursive computation is memozied into the provided DT-indexed cost map
1719 /// to allow querying it for most nodes in the domtree without it becoming
1720 /// quadratic.
1721 static int
1722 computeDomSubtreeCost(DomTreeNode &N,
1723                       const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
1724                       SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
1725   // Don't accumulate cost (or recurse through) blocks not in our block cost
1726   // map and thus not part of the duplication cost being considered.
1727   auto BBCostIt = BBCostMap.find(N.getBlock());
1728   if (BBCostIt == BBCostMap.end())
1729     return 0;
1730 
1731   // Lookup this node to see if we already computed its cost.
1732   auto DTCostIt = DTCostMap.find(&N);
1733   if (DTCostIt != DTCostMap.end())
1734     return DTCostIt->second;
1735 
1736   // If not, we have to compute it. We can't use insert above and update
1737   // because computing the cost may insert more things into the map.
1738   int Cost = std::accumulate(
1739       N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
1740         return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
1741       });
1742   bool Inserted = DTCostMap.insert({&N, Cost}).second;
1743   (void)Inserted;
1744   assert(Inserted && "Should not insert a node while visiting children!");
1745   return Cost;
1746 }
1747 
1748 /// Unswitch control flow predicated on loop invariant conditions.
1749 ///
1750 /// This first hoists all branches or switches which are trivial (IE, do not
1751 /// require duplicating any part of the loop) out of the loop body. It then
1752 /// looks at other loop invariant control flows and tries to unswitch those as
1753 /// well by cloning the loop if the result is small enough.
1754 static bool
1755 unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
1756              TargetTransformInfo &TTI, bool NonTrivial,
1757              function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
1758   assert(L.isRecursivelyLCSSAForm(DT, LI) &&
1759          "Loops must be in LCSSA form before unswitching.");
1760   bool Changed = false;
1761 
1762   // Must be in loop simplified form: we need a preheader and dedicated exits.
1763   if (!L.isLoopSimplifyForm())
1764     return false;
1765 
1766   // Try trivial unswitch first before loop over other basic blocks in the loop.
1767   Changed |= unswitchAllTrivialConditions(L, DT, LI);
1768 
1769   // If we're not doing non-trivial unswitching, we're done. We both accept
1770   // a parameter but also check a local flag that can be used for testing
1771   // a debugging.
1772   if (!NonTrivial && !EnableNonTrivialUnswitch)
1773     return Changed;
1774 
1775   // Collect all remaining invariant branch conditions within this loop (as
1776   // opposed to an inner loop which would be handled when visiting that inner
1777   // loop).
1778   SmallVector<TerminatorInst *, 4> UnswitchCandidates;
1779   for (auto *BB : L.blocks())
1780     if (LI.getLoopFor(BB) == &L)
1781       if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator()))
1782         if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) &&
1783             BI->getSuccessor(0) != BI->getSuccessor(1))
1784           UnswitchCandidates.push_back(BI);
1785 
1786   // If we didn't find any candidates, we're done.
1787   if (UnswitchCandidates.empty())
1788     return Changed;
1789 
1790   // Check if there are irreducible CFG cycles in this loop. If so, we cannot
1791   // easily unswitch non-trivial edges out of the loop. Doing so might turn the
1792   // irreducible control flow into reducible control flow and introduce new
1793   // loops "out of thin air". If we ever discover important use cases for doing
1794   // this, we can add support to loop unswitch, but it is a lot of complexity
1795   // for what seems little or no real world benifit.
1796   LoopBlocksRPO RPOT(&L);
1797   RPOT.perform(&LI);
1798   if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
1799     return Changed;
1800 
1801   DEBUG(dbgs() << "Considering " << UnswitchCandidates.size()
1802                << " non-trivial loop invariant conditions for unswitching.\n");
1803 
1804   // Given that unswitching these terminators will require duplicating parts of
1805   // the loop, so we need to be able to model that cost. Compute the ephemeral
1806   // values and set up a data structure to hold per-BB costs. We cache each
1807   // block's cost so that we don't recompute this when considering different
1808   // subsets of the loop for duplication during unswitching.
1809   SmallPtrSet<const Value *, 4> EphValues;
1810   CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
1811   SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
1812 
1813   // Compute the cost of each block, as well as the total loop cost. Also, bail
1814   // out if we see instructions which are incompatible with loop unswitching
1815   // (convergent, noduplicate, or cross-basic-block tokens).
1816   // FIXME: We might be able to safely handle some of these in non-duplicated
1817   // regions.
1818   int LoopCost = 0;
1819   for (auto *BB : L.blocks()) {
1820     int Cost = 0;
1821     for (auto &I : *BB) {
1822       if (EphValues.count(&I))
1823         continue;
1824 
1825       if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
1826         return Changed;
1827       if (auto CS = CallSite(&I))
1828         if (CS.isConvergent() || CS.cannotDuplicate())
1829           return Changed;
1830 
1831       Cost += TTI.getUserCost(&I);
1832     }
1833     assert(Cost >= 0 && "Must not have negative costs!");
1834     LoopCost += Cost;
1835     assert(LoopCost >= 0 && "Must not have negative loop costs!");
1836     BBCostMap[BB] = Cost;
1837   }
1838   DEBUG(dbgs() << "  Total loop cost: " << LoopCost << "\n");
1839 
1840   // Now we find the best candidate by searching for the one with the following
1841   // properties in order:
1842   //
1843   // 1) An unswitching cost below the threshold
1844   // 2) The smallest number of duplicated unswitch candidates (to avoid
1845   //    creating redundant subsequent unswitching)
1846   // 3) The smallest cost after unswitching.
1847   //
1848   // We prioritize reducing fanout of unswitch candidates provided the cost
1849   // remains below the threshold because this has a multiplicative effect.
1850   //
1851   // This requires memoizing each dominator subtree to avoid redundant work.
1852   //
1853   // FIXME: Need to actually do the number of candidates part above.
1854   SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
1855   // Given a terminator which might be unswitched, computes the non-duplicated
1856   // cost for that terminator.
1857   auto ComputeUnswitchedCost = [&](TerminatorInst *TI) {
1858     BasicBlock &BB = *TI->getParent();
1859     SmallPtrSet<BasicBlock *, 4> Visited;
1860 
1861     int Cost = LoopCost;
1862     for (BasicBlock *SuccBB : successors(&BB)) {
1863       // Don't count successors more than once.
1864       if (!Visited.insert(SuccBB).second)
1865         continue;
1866 
1867       // This successor's domtree will not need to be duplicated after
1868       // unswitching if the edge to the successor dominates it (and thus the
1869       // entire tree). This essentially means there is no other path into this
1870       // subtree and so it will end up live in only one clone of the loop.
1871       if (SuccBB->getUniquePredecessor() ||
1872           llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
1873             return PredBB == &BB || DT.dominates(SuccBB, PredBB);
1874           })) {
1875         Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
1876         assert(Cost >= 0 &&
1877                "Non-duplicated cost should never exceed total loop cost!");
1878       }
1879     }
1880 
1881     // Now scale the cost by the number of unique successors minus one. We
1882     // subtract one because there is already at least one copy of the entire
1883     // loop. This is computing the new cost of unswitching a condition.
1884     assert(Visited.size() > 1 &&
1885            "Cannot unswitch a condition without multiple distinct successors!");
1886     return Cost * (Visited.size() - 1);
1887   };
1888   TerminatorInst *BestUnswitchTI = nullptr;
1889   int BestUnswitchCost;
1890   for (TerminatorInst *CandidateTI : UnswitchCandidates) {
1891     int CandidateCost = ComputeUnswitchedCost(CandidateTI);
1892     DEBUG(dbgs() << "  Computed cost of " << CandidateCost
1893                  << " for unswitch candidate: " << *CandidateTI << "\n");
1894     if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
1895       BestUnswitchTI = CandidateTI;
1896       BestUnswitchCost = CandidateCost;
1897     }
1898   }
1899 
1900   if (BestUnswitchCost < UnswitchThreshold) {
1901     DEBUG(dbgs() << "  Trying to unswitch non-trivial (cost = "
1902                  << BestUnswitchCost << ") branch: " << *BestUnswitchTI
1903                  << "\n");
1904     Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT,
1905                                        LI, AC, NonTrivialUnswitchCB);
1906   } else {
1907     DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost
1908                  << "\n");
1909   }
1910 
1911   return Changed;
1912 }
1913 
1914 PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
1915                                               LoopStandardAnalysisResults &AR,
1916                                               LPMUpdater &U) {
1917   Function &F = *L.getHeader()->getParent();
1918   (void)F;
1919 
1920   DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
1921 
1922   // Save the current loop name in a variable so that we can report it even
1923   // after it has been deleted.
1924   std::string LoopName = L.getName();
1925 
1926   auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
1927                                                   ArrayRef<Loop *> NewLoops) {
1928     // If we did a non-trivial unswitch, we have added new (cloned) loops.
1929     U.addSiblingLoops(NewLoops);
1930 
1931     // If the current loop remains valid, we should revisit it to catch any
1932     // other unswitch opportunities. Otherwise, we need to mark it as deleted.
1933     if (CurrentLoopValid)
1934       U.revisitCurrentLoop();
1935     else
1936       U.markLoopAsDeleted(L, LoopName);
1937   };
1938 
1939   if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial,
1940                     NonTrivialUnswitchCB))
1941     return PreservedAnalyses::all();
1942 
1943   // Historically this pass has had issues with the dominator tree so verify it
1944   // in asserts builds.
1945   assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
1946   return getLoopPassPreservedAnalyses();
1947 }
1948 
1949 namespace {
1950 
1951 class SimpleLoopUnswitchLegacyPass : public LoopPass {
1952   bool NonTrivial;
1953 
1954 public:
1955   static char ID; // Pass ID, replacement for typeid
1956 
1957   explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false)
1958       : LoopPass(ID), NonTrivial(NonTrivial) {
1959     initializeSimpleLoopUnswitchLegacyPassPass(
1960         *PassRegistry::getPassRegistry());
1961   }
1962 
1963   bool runOnLoop(Loop *L, LPPassManager &LPM) override;
1964 
1965   void getAnalysisUsage(AnalysisUsage &AU) const override {
1966     AU.addRequired<AssumptionCacheTracker>();
1967     AU.addRequired<TargetTransformInfoWrapperPass>();
1968     getLoopAnalysisUsage(AU);
1969   }
1970 };
1971 
1972 } // end anonymous namespace
1973 
1974 bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
1975   if (skipLoop(L))
1976     return false;
1977 
1978   Function &F = *L->getHeader()->getParent();
1979 
1980   DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n");
1981 
1982   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1983   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1984   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1985   auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1986 
1987   auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid,
1988                                          ArrayRef<Loop *> NewLoops) {
1989     // If we did a non-trivial unswitch, we have added new (cloned) loops.
1990     for (auto *NewL : NewLoops)
1991       LPM.addLoop(*NewL);
1992 
1993     // If the current loop remains valid, re-add it to the queue. This is
1994     // a little wasteful as we'll finish processing the current loop as well,
1995     // but it is the best we can do in the old PM.
1996     if (CurrentLoopValid)
1997       LPM.addLoop(*L);
1998     else
1999       LPM.markLoopAsDeleted(*L);
2000   };
2001 
2002   bool Changed =
2003       unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB);
2004 
2005   // If anything was unswitched, also clear any cached information about this
2006   // loop.
2007   LPM.deleteSimpleAnalysisLoop(L);
2008 
2009   // Historically this pass has had issues with the dominator tree so verify it
2010   // in asserts builds.
2011   assert(DT.verify(DominatorTree::VerificationLevel::Fast));
2012 
2013   return Changed;
2014 }
2015 
2016 char SimpleLoopUnswitchLegacyPass::ID = 0;
2017 INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
2018                       "Simple unswitch loops", false, false)
2019 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2020 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
2021 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
2022 INITIALIZE_PASS_DEPENDENCY(LoopPass)
2023 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
2024 INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
2025                     "Simple unswitch loops", false, false)
2026 
2027 Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) {
2028   return new SimpleLoopUnswitchLegacyPass(NonTrivial);
2029 }
2030