xref: /llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp (revision 2c85a23123f1d7d7a475967af8164557355eade3)
1 //===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
11 #include "llvm/ADT/DenseMap.h"
12 #include "llvm/ADT/STLExtras.h"
13 #include "llvm/ADT/Sequence.h"
14 #include "llvm/ADT/SetVector.h"
15 #include "llvm/ADT/SmallPtrSet.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/Statistic.h"
18 #include "llvm/ADT/Twine.h"
19 #include "llvm/Analysis/AssumptionCache.h"
20 #include "llvm/Analysis/CFG.h"
21 #include "llvm/Analysis/CodeMetrics.h"
22 #include "llvm/Analysis/LoopAnalysisManager.h"
23 #include "llvm/Analysis/LoopInfo.h"
24 #include "llvm/Analysis/LoopIterator.h"
25 #include "llvm/Analysis/LoopPass.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/Constant.h"
28 #include "llvm/IR/Constants.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/Function.h"
31 #include "llvm/IR/InstrTypes.h"
32 #include "llvm/IR/Instruction.h"
33 #include "llvm/IR/Instructions.h"
34 #include "llvm/IR/IntrinsicInst.h"
35 #include "llvm/IR/Use.h"
36 #include "llvm/IR/Value.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/Casting.h"
39 #include "llvm/Support/Debug.h"
40 #include "llvm/Support/ErrorHandling.h"
41 #include "llvm/Support/GenericDomTree.h"
42 #include "llvm/Support/raw_ostream.h"
43 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
44 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
45 #include "llvm/Transforms/Utils/Cloning.h"
46 #include "llvm/Transforms/Utils/LoopUtils.h"
47 #include "llvm/Transforms/Utils/ValueMapper.h"
48 #include <algorithm>
49 #include <cassert>
50 #include <iterator>
51 #include <numeric>
52 #include <utility>
53 
54 #define DEBUG_TYPE "simple-loop-unswitch"
55 
56 using namespace llvm;
57 
58 STATISTIC(NumBranches, "Number of branches unswitched");
59 STATISTIC(NumSwitches, "Number of switches unswitched");
60 STATISTIC(NumTrivial, "Number of unswitches that are trivial");
61 
62 static cl::opt<bool> EnableNonTrivialUnswitch(
63     "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
64     cl::desc("Forcibly enables non-trivial loop unswitching rather than "
65              "following the configuration passed into the pass."));
66 
67 static cl::opt<int>
68     UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
69                       cl::desc("The cost threshold for unswitching a loop."));
70 
71 static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
72                                         Constant &Replacement) {
73   assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
74 
75   // Replace uses of LIC in the loop with the given constant.
76   for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) {
77     // Grab the use and walk past it so we can clobber it in the use list.
78     Use *U = &*UI++;
79     Instruction *UserI = dyn_cast<Instruction>(U->getUser());
80     if (!UserI || !L.contains(UserI))
81       continue;
82 
83     // Replace this use within the loop body.
84     *U = &Replacement;
85   }
86 }
87 
88 /// Check that all the LCSSA PHI nodes in the loop exit block have trivial
89 /// incoming values along this edge.
90 static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
91                                          BasicBlock &ExitBB) {
92   for (Instruction &I : ExitBB) {
93     auto *PN = dyn_cast<PHINode>(&I);
94     if (!PN)
95       // No more PHIs to check.
96       return true;
97 
98     // If the incoming value for this edge isn't loop invariant the unswitch
99     // won't be trivial.
100     if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
101       return false;
102   }
103   llvm_unreachable("Basic blocks should never be empty!");
104 }
105 
106 /// Rewrite the PHI nodes in an unswitched loop exit basic block.
107 ///
108 /// Requires that the loop exit and unswitched basic block are the same, and
109 /// that the exiting block was a unique predecessor of that block. Rewrites the
110 /// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
111 /// PHI nodes from the old preheader that now contains the unswitched
112 /// terminator.
113 static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
114                                                   BasicBlock &OldExitingBB,
115                                                   BasicBlock &OldPH) {
116   for (PHINode &PN : UnswitchedBB.phis()) {
117     // When the loop exit is directly unswitched we just need to update the
118     // incoming basic block. We loop to handle weird cases with repeated
119     // incoming blocks, but expect to typically only have one operand here.
120     for (auto i : seq<int>(0, PN.getNumOperands())) {
121       assert(PN.getIncomingBlock(i) == &OldExitingBB &&
122              "Found incoming block different from unique predecessor!");
123       PN.setIncomingBlock(i, &OldPH);
124     }
125   }
126 }
127 
128 /// Rewrite the PHI nodes in the loop exit basic block and the split off
129 /// unswitched block.
130 ///
131 /// Because the exit block remains an exit from the loop, this rewrites the
132 /// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
133 /// nodes into the unswitched basic block to select between the value in the
134 /// old preheader and the loop exit.
135 static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
136                                                       BasicBlock &UnswitchedBB,
137                                                       BasicBlock &OldExitingBB,
138                                                       BasicBlock &OldPH) {
139   assert(&ExitBB != &UnswitchedBB &&
140          "Must have different loop exit and unswitched blocks!");
141   Instruction *InsertPt = &*UnswitchedBB.begin();
142   for (PHINode &PN : ExitBB.phis()) {
143     auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2,
144                                   PN.getName() + ".split", InsertPt);
145 
146     // Walk backwards over the old PHI node's inputs to minimize the cost of
147     // removing each one. We have to do this weird loop manually so that we
148     // create the same number of new incoming edges in the new PHI as we expect
149     // each case-based edge to be included in the unswitched switch in some
150     // cases.
151     // FIXME: This is really, really gross. It would be much cleaner if LLVM
152     // allowed us to create a single entry for a predecessor block without
153     // having separate entries for each "edge" even though these edges are
154     // required to produce identical results.
155     for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) {
156       if (PN.getIncomingBlock(i) != &OldExitingBB)
157         continue;
158 
159       Value *Incoming = PN.removeIncomingValue(i);
160       NewPN->addIncoming(Incoming, &OldPH);
161     }
162 
163     // Now replace the old PHI with the new one and wire the old one in as an
164     // input to the new one.
165     PN.replaceAllUsesWith(NewPN);
166     NewPN->addIncoming(&PN, &ExitBB);
167   }
168 }
169 
170 /// Unswitch a trivial branch if the condition is loop invariant.
171 ///
172 /// This routine should only be called when loop code leading to the branch has
173 /// been validated as trivial (no side effects). This routine checks if the
174 /// condition is invariant and one of the successors is a loop exit. This
175 /// allows us to unswitch without duplicating the loop, making it trivial.
176 ///
177 /// If this routine fails to unswitch the branch it returns false.
178 ///
179 /// If the branch can be unswitched, this routine splits the preheader and
180 /// hoists the branch above that split. Preserves loop simplified form
181 /// (splitting the exit block as necessary). It simplifies the branch within
182 /// the loop to an unconditional branch but doesn't remove it entirely. Further
183 /// cleanup can be done with some simplify-cfg like pass.
184 static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
185                                   LoopInfo &LI) {
186   assert(BI.isConditional() && "Can only unswitch a conditional branch!");
187   DEBUG(dbgs() << "  Trying to unswitch branch: " << BI << "\n");
188 
189   Value *LoopCond = BI.getCondition();
190 
191   // Need a trivial loop condition to unswitch.
192   if (!L.isLoopInvariant(LoopCond))
193     return false;
194 
195   // FIXME: We should compute this once at the start and update it!
196   SmallVector<BasicBlock *, 16> ExitBlocks;
197   L.getExitBlocks(ExitBlocks);
198   SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
199                                              ExitBlocks.end());
200 
201   // Check to see if a successor of the branch is guaranteed to
202   // exit through a unique exit block without having any
203   // side-effects.  If so, determine the value of Cond that causes
204   // it to do this.
205   ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext());
206   ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext());
207   int LoopExitSuccIdx = 0;
208   auto *LoopExitBB = BI.getSuccessor(0);
209   if (!ExitBlockSet.count(LoopExitBB)) {
210     std::swap(CondVal, Replacement);
211     LoopExitSuccIdx = 1;
212     LoopExitBB = BI.getSuccessor(1);
213     if (!ExitBlockSet.count(LoopExitBB))
214       return false;
215   }
216   auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
217   assert(L.contains(ContinueBB) &&
218          "Cannot have both successors exit and still be in the loop!");
219 
220   auto *ParentBB = BI.getParent();
221   if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
222     return false;
223 
224   DEBUG(dbgs() << "    unswitching trivial branch when: " << CondVal
225                << " == " << LoopCond << "\n");
226 
227   // Split the preheader, so that we know that there is a safe place to insert
228   // the conditional branch. We will change the preheader to have a conditional
229   // branch on LoopCond.
230   BasicBlock *OldPH = L.getLoopPreheader();
231   BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
232 
233   // Now that we have a place to insert the conditional branch, create a place
234   // to branch to: this is the exit block out of the loop that we are
235   // unswitching. We need to split this if there are other loop predecessors.
236   // Because the loop is in simplified form, *any* other predecessor is enough.
237   BasicBlock *UnswitchedBB;
238   if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
239     (void)PredBB;
240     assert(PredBB == BI.getParent() &&
241            "A branch's parent isn't a predecessor!");
242     UnswitchedBB = LoopExitBB;
243   } else {
244     UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
245   }
246 
247   // Now splice the branch to gate reaching the new preheader and re-point its
248   // successors.
249   OldPH->getInstList().splice(std::prev(OldPH->end()),
250                               BI.getParent()->getInstList(), BI);
251   OldPH->getTerminator()->eraseFromParent();
252   BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
253   BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
254 
255   // Create a new unconditional branch that will continue the loop as a new
256   // terminator.
257   BranchInst::Create(ContinueBB, ParentBB);
258 
259   // Rewrite the relevant PHI nodes.
260   if (UnswitchedBB == LoopExitBB)
261     rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
262   else
263     rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
264                                               *ParentBB, *OldPH);
265 
266   // Now we need to update the dominator tree.
267   DT.applyUpdates(
268       {{DT.Delete, ParentBB, UnswitchedBB}, {DT.Insert, OldPH, UnswitchedBB}});
269 
270   // Since this is an i1 condition we can also trivially replace uses of it
271   // within the loop with a constant.
272   replaceLoopUsesWithConstant(L, *LoopCond, *Replacement);
273 
274   ++NumTrivial;
275   ++NumBranches;
276   return true;
277 }
278 
279 /// Unswitch a trivial switch if the condition is loop invariant.
280 ///
281 /// This routine should only be called when loop code leading to the switch has
282 /// been validated as trivial (no side effects). This routine checks if the
283 /// condition is invariant and that at least one of the successors is a loop
284 /// exit. This allows us to unswitch without duplicating the loop, making it
285 /// trivial.
286 ///
287 /// If this routine fails to unswitch the switch it returns false.
288 ///
289 /// If the switch can be unswitched, this routine splits the preheader and
290 /// copies the switch above that split. If the default case is one of the
291 /// exiting cases, it copies the non-exiting cases and points them at the new
292 /// preheader. If the default case is not exiting, it copies the exiting cases
293 /// and points the default at the preheader. It preserves loop simplified form
294 /// (splitting the exit blocks as necessary). It simplifies the switch within
295 /// the loop by removing now-dead cases. If the default case is one of those
296 /// unswitched, it replaces its destination with a new basic block containing
297 /// only unreachable. Such basic blocks, while technically loop exits, are not
298 /// considered for unswitching so this is a stable transform and the same
299 /// switch will not be revisited. If after unswitching there is only a single
300 /// in-loop successor, the switch is further simplified to an unconditional
301 /// branch. Still more cleanup can be done with some simplify-cfg like pass.
302 static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
303                                   LoopInfo &LI) {
304   DEBUG(dbgs() << "  Trying to unswitch switch: " << SI << "\n");
305   Value *LoopCond = SI.getCondition();
306 
307   // If this isn't switching on an invariant condition, we can't unswitch it.
308   if (!L.isLoopInvariant(LoopCond))
309     return false;
310 
311   auto *ParentBB = SI.getParent();
312 
313   // FIXME: We should compute this once at the start and update it!
314   SmallVector<BasicBlock *, 16> ExitBlocks;
315   L.getExitBlocks(ExitBlocks);
316   SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
317                                              ExitBlocks.end());
318 
319   SmallVector<int, 4> ExitCaseIndices;
320   for (auto Case : SI.cases()) {
321     auto *SuccBB = Case.getCaseSuccessor();
322     if (ExitBlockSet.count(SuccBB) &&
323         areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
324       ExitCaseIndices.push_back(Case.getCaseIndex());
325   }
326   BasicBlock *DefaultExitBB = nullptr;
327   if (ExitBlockSet.count(SI.getDefaultDest()) &&
328       areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
329       !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
330     DefaultExitBB = SI.getDefaultDest();
331   else if (ExitCaseIndices.empty())
332     return false;
333 
334   DEBUG(dbgs() << "    unswitching trivial cases...\n");
335 
336   SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases;
337   ExitCases.reserve(ExitCaseIndices.size());
338   // We walk the case indices backwards so that we remove the last case first
339   // and don't disrupt the earlier indices.
340   for (unsigned Index : reverse(ExitCaseIndices)) {
341     auto CaseI = SI.case_begin() + Index;
342     // Save the value of this case.
343     ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()});
344     // Delete the unswitched cases.
345     SI.removeCase(CaseI);
346   }
347 
348   // Check if after this all of the remaining cases point at the same
349   // successor.
350   BasicBlock *CommonSuccBB = nullptr;
351   if (SI.getNumCases() > 0 &&
352       std::all_of(std::next(SI.case_begin()), SI.case_end(),
353                   [&SI](const SwitchInst::CaseHandle &Case) {
354                     return Case.getCaseSuccessor() ==
355                            SI.case_begin()->getCaseSuccessor();
356                   }))
357     CommonSuccBB = SI.case_begin()->getCaseSuccessor();
358 
359   if (DefaultExitBB) {
360     // We can't remove the default edge so replace it with an edge to either
361     // the single common remaining successor (if we have one) or an unreachable
362     // block.
363     if (CommonSuccBB) {
364       SI.setDefaultDest(CommonSuccBB);
365     } else {
366       BasicBlock *UnreachableBB = BasicBlock::Create(
367           ParentBB->getContext(),
368           Twine(ParentBB->getName()) + ".unreachable_default",
369           ParentBB->getParent());
370       new UnreachableInst(ParentBB->getContext(), UnreachableBB);
371       SI.setDefaultDest(UnreachableBB);
372       DT.addNewBlock(UnreachableBB, ParentBB);
373     }
374   } else {
375     // If we're not unswitching the default, we need it to match any cases to
376     // have a common successor or if we have no cases it is the common
377     // successor.
378     if (SI.getNumCases() == 0)
379       CommonSuccBB = SI.getDefaultDest();
380     else if (SI.getDefaultDest() != CommonSuccBB)
381       CommonSuccBB = nullptr;
382   }
383 
384   // Split the preheader, so that we know that there is a safe place to insert
385   // the switch.
386   BasicBlock *OldPH = L.getLoopPreheader();
387   BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
388   OldPH->getTerminator()->eraseFromParent();
389 
390   // Now add the unswitched switch.
391   auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
392 
393   // Rewrite the IR for the unswitched basic blocks. This requires two steps.
394   // First, we split any exit blocks with remaining in-loop predecessors. Then
395   // we update the PHIs in one of two ways depending on if there was a split.
396   // We walk in reverse so that we split in the same order as the cases
397   // appeared. This is purely for convenience of reading the resulting IR, but
398   // it doesn't cost anything really.
399   SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
400   SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
401   // Handle the default exit if necessary.
402   // FIXME: It'd be great if we could merge this with the loop below but LLVM's
403   // ranges aren't quite powerful enough yet.
404   if (DefaultExitBB) {
405     if (pred_empty(DefaultExitBB)) {
406       UnswitchedExitBBs.insert(DefaultExitBB);
407       rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
408     } else {
409       auto *SplitBB =
410           SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
411       rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
412                                                 *ParentBB, *OldPH);
413       DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
414     }
415   }
416   // Note that we must use a reference in the for loop so that we update the
417   // container.
418   for (auto &CasePair : reverse(ExitCases)) {
419     // Grab a reference to the exit block in the pair so that we can update it.
420     BasicBlock *ExitBB = CasePair.second;
421 
422     // If this case is the last edge into the exit block, we can simply reuse it
423     // as it will no longer be a loop exit. No mapping necessary.
424     if (pred_empty(ExitBB)) {
425       // Only rewrite once.
426       if (UnswitchedExitBBs.insert(ExitBB).second)
427         rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
428       continue;
429     }
430 
431     // Otherwise we need to split the exit block so that we retain an exit
432     // block from the loop and a target for the unswitched condition.
433     BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
434     if (!SplitExitBB) {
435       // If this is the first time we see this, do the split and remember it.
436       SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
437       rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
438                                                 *ParentBB, *OldPH);
439     }
440     // Update the case pair to point to the split block.
441     CasePair.second = SplitExitBB;
442   }
443 
444   // Now add the unswitched cases. We do this in reverse order as we built them
445   // in reverse order.
446   for (auto CasePair : reverse(ExitCases)) {
447     ConstantInt *CaseVal = CasePair.first;
448     BasicBlock *UnswitchedBB = CasePair.second;
449 
450     NewSI->addCase(CaseVal, UnswitchedBB);
451   }
452 
453   // If the default was unswitched, re-point it and add explicit cases for
454   // entering the loop.
455   if (DefaultExitBB) {
456     NewSI->setDefaultDest(DefaultExitBB);
457 
458     // We removed all the exit cases, so we just copy the cases to the
459     // unswitched switch.
460     for (auto Case : SI.cases())
461       NewSI->addCase(Case.getCaseValue(), NewPH);
462   }
463 
464   // If we ended up with a common successor for every path through the switch
465   // after unswitching, rewrite it to an unconditional branch to make it easy
466   // to recognize. Otherwise we potentially have to recognize the default case
467   // pointing at unreachable and other complexity.
468   if (CommonSuccBB) {
469     BasicBlock *BB = SI.getParent();
470     SI.eraseFromParent();
471     BranchInst::Create(CommonSuccBB, BB);
472   }
473 
474   // Walk the unswitched exit blocks and the unswitched split blocks and update
475   // the dominator tree based on the CFG edits. While we are walking unordered
476   // containers here, the API for applyUpdates takes an unordered list of
477   // updates and requires them to not contain duplicates.
478   SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
479   for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
480     DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
481     DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
482   }
483   for (auto SplitUnswitchedPair : SplitExitBBMap) {
484     auto *UnswitchedBB = SplitUnswitchedPair.second;
485     DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedBB});
486     DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB});
487   }
488   DT.applyUpdates(DTUpdates);
489 
490   assert(DT.verify(DominatorTree::VerificationLevel::Fast));
491   ++NumTrivial;
492   ++NumSwitches;
493   return true;
494 }
495 
496 /// This routine scans the loop to find a branch or switch which occurs before
497 /// any side effects occur. These can potentially be unswitched without
498 /// duplicating the loop. If a branch or switch is successfully unswitched the
499 /// scanning continues to see if subsequent branches or switches have become
500 /// trivial. Once all trivial candidates have been unswitched, this routine
501 /// returns.
502 ///
503 /// The return value indicates whether anything was unswitched (and therefore
504 /// changed).
505 static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
506                                          LoopInfo &LI) {
507   bool Changed = false;
508 
509   // If loop header has only one reachable successor we should keep looking for
510   // trivial condition candidates in the successor as well. An alternative is
511   // to constant fold conditions and merge successors into loop header (then we
512   // only need to check header's terminator). The reason for not doing this in
513   // LoopUnswitch pass is that it could potentially break LoopPassManager's
514   // invariants. Folding dead branches could either eliminate the current loop
515   // or make other loops unreachable. LCSSA form might also not be preserved
516   // after deleting branches. The following code keeps traversing loop header's
517   // successors until it finds the trivial condition candidate (condition that
518   // is not a constant). Since unswitching generates branches with constant
519   // conditions, this scenario could be very common in practice.
520   BasicBlock *CurrentBB = L.getHeader();
521   SmallPtrSet<BasicBlock *, 8> Visited;
522   Visited.insert(CurrentBB);
523   do {
524     // Check if there are any side-effecting instructions (e.g. stores, calls,
525     // volatile loads) in the part of the loop that the code *would* execute
526     // without unswitching.
527     if (llvm::any_of(*CurrentBB,
528                      [](Instruction &I) { return I.mayHaveSideEffects(); }))
529       return Changed;
530 
531     TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
532 
533     if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
534       // Don't bother trying to unswitch past a switch with a constant
535       // condition. This should be removed prior to running this pass by
536       // simplify-cfg.
537       if (isa<Constant>(SI->getCondition()))
538         return Changed;
539 
540       if (!unswitchTrivialSwitch(L, *SI, DT, LI))
541         // Coludn't unswitch this one so we're done.
542         return Changed;
543 
544       // Mark that we managed to unswitch something.
545       Changed = true;
546 
547       // If unswitching turned the terminator into an unconditional branch then
548       // we can continue. The unswitching logic specifically works to fold any
549       // cases it can into an unconditional branch to make it easier to
550       // recognize here.
551       auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
552       if (!BI || BI->isConditional())
553         return Changed;
554 
555       CurrentBB = BI->getSuccessor(0);
556       continue;
557     }
558 
559     auto *BI = dyn_cast<BranchInst>(CurrentTerm);
560     if (!BI)
561       // We do not understand other terminator instructions.
562       return Changed;
563 
564     // Don't bother trying to unswitch past an unconditional branch or a branch
565     // with a constant value. These should be removed by simplify-cfg prior to
566     // running this pass.
567     if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
568       return Changed;
569 
570     // Found a trivial condition candidate: non-foldable conditional branch. If
571     // we fail to unswitch this, we can't do anything else that is trivial.
572     if (!unswitchTrivialBranch(L, *BI, DT, LI))
573       return Changed;
574 
575     // Mark that we managed to unswitch something.
576     Changed = true;
577 
578     // We unswitched the branch. This should always leave us with an
579     // unconditional branch that we can follow now.
580     BI = cast<BranchInst>(CurrentBB->getTerminator());
581     assert(!BI->isConditional() &&
582            "Cannot form a conditional branch by unswitching1");
583     CurrentBB = BI->getSuccessor(0);
584 
585     // When continuing, if we exit the loop or reach a previous visited block,
586     // then we can not reach any trivial condition candidates (unfoldable
587     // branch instructions or switch instructions) and no unswitch can happen.
588   } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
589 
590   return Changed;
591 }
592 
593 /// Build the cloned blocks for an unswitched copy of the given loop.
594 ///
595 /// The cloned blocks are inserted before the loop preheader (`LoopPH`) and
596 /// after the split block (`SplitBB`) that will be used to select between the
597 /// cloned and original loop.
598 ///
599 /// This routine handles cloning all of the necessary loop blocks and exit
600 /// blocks including rewriting their instructions and the relevant PHI nodes.
601 /// It skips loop and exit blocks that are not necessary based on the provided
602 /// set. It also correctly creates the unconditional branch in the cloned
603 /// unswitched parent block to only point at the unswitched successor.
604 ///
605 /// This does not handle most of the necessary updates to `LoopInfo`. Only exit
606 /// block splitting is correctly reflected in `LoopInfo`, essentially all of
607 /// the cloned blocks (and their loops) are left without full `LoopInfo`
608 /// updates. This also doesn't fully update `DominatorTree`. It adds the cloned
609 /// blocks to them but doesn't create the cloned `DominatorTree` structure and
610 /// instead the caller must recompute an accurate DT. It *does* correctly
611 /// update the `AssumptionCache` provided in `AC`.
612 static BasicBlock *buildClonedLoopBlocks(
613     Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
614     ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
615     BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
616     const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks,
617     ValueToValueMapTy &VMap,
618     SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
619     DominatorTree &DT, LoopInfo &LI) {
620   SmallVector<BasicBlock *, 4> NewBlocks;
621   NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
622 
623   // We will need to clone a bunch of blocks, wrap up the clone operation in
624   // a helper.
625   auto CloneBlock = [&](BasicBlock *OldBB) {
626     // Clone the basic block and insert it before the new preheader.
627     BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent());
628     NewBB->moveBefore(LoopPH);
629 
630     // Record this block and the mapping.
631     NewBlocks.push_back(NewBB);
632     VMap[OldBB] = NewBB;
633 
634     return NewBB;
635   };
636 
637   // First, clone the preheader.
638   auto *ClonedPH = CloneBlock(LoopPH);
639 
640   // Then clone all the loop blocks, skipping the ones that aren't necessary.
641   for (auto *LoopBB : L.blocks())
642     if (!SkippedLoopAndExitBlocks.count(LoopBB))
643       CloneBlock(LoopBB);
644 
645   // Split all the loop exit edges so that when we clone the exit blocks, if
646   // any of the exit blocks are *also* a preheader for some other loop, we
647   // don't create multiple predecessors entering the loop header.
648   for (auto *ExitBB : ExitBlocks) {
649     if (SkippedLoopAndExitBlocks.count(ExitBB))
650       continue;
651 
652     // When we are going to clone an exit, we don't need to clone all the
653     // instructions in the exit block and we want to ensure we have an easy
654     // place to merge the CFG, so split the exit first. This is always safe to
655     // do because there cannot be any non-loop predecessors of a loop exit in
656     // loop simplified form.
657     auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
658 
659     // Rearrange the names to make it easier to write test cases by having the
660     // exit block carry the suffix rather than the merge block carrying the
661     // suffix.
662     MergeBB->takeName(ExitBB);
663     ExitBB->setName(Twine(MergeBB->getName()) + ".split");
664 
665     // Now clone the original exit block.
666     auto *ClonedExitBB = CloneBlock(ExitBB);
667     assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
668            "Exit block should have been split to have one successor!");
669     assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
670            "Cloned exit block has the wrong successor!");
671 
672     // Remap any cloned instructions and create a merge phi node for them.
673     for (auto ZippedInsts : llvm::zip_first(
674              llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
675              llvm::make_range(ClonedExitBB->begin(),
676                               std::prev(ClonedExitBB->end())))) {
677       Instruction &I = std::get<0>(ZippedInsts);
678       Instruction &ClonedI = std::get<1>(ZippedInsts);
679 
680       // The only instructions in the exit block should be PHI nodes and
681       // potentially a landing pad.
682       assert(
683           (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
684           "Bad instruction in exit block!");
685       // We should have a value map between the instruction and its clone.
686       assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
687 
688       auto *MergePN =
689           PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
690                           &*MergeBB->getFirstInsertionPt());
691       I.replaceAllUsesWith(MergePN);
692       MergePN->addIncoming(&I, ExitBB);
693       MergePN->addIncoming(&ClonedI, ClonedExitBB);
694     }
695   }
696 
697   // Rewrite the instructions in the cloned blocks to refer to the instructions
698   // in the cloned blocks. We have to do this as a second pass so that we have
699   // everything available. Also, we have inserted new instructions which may
700   // include assume intrinsics, so we update the assumption cache while
701   // processing this.
702   for (auto *ClonedBB : NewBlocks)
703     for (Instruction &I : *ClonedBB) {
704       RemapInstruction(&I, VMap,
705                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
706       if (auto *II = dyn_cast<IntrinsicInst>(&I))
707         if (II->getIntrinsicID() == Intrinsic::assume)
708           AC.registerAssumption(II);
709     }
710 
711   // Remove the cloned parent as a predecessor of the cloned continue successor
712   // if we did in fact clone it.
713   auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
714   if (auto *ClonedContinueSuccBB =
715           cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB)))
716     ClonedContinueSuccBB->removePredecessor(ClonedParentBB,
717                                             /*DontDeleteUselessPHIs*/ true);
718   // Replace the cloned branch with an unconditional branch to the cloned
719   // unswitched successor.
720   auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
721   ClonedParentBB->getTerminator()->eraseFromParent();
722   BranchInst::Create(ClonedSuccBB, ClonedParentBB);
723 
724   // Update any PHI nodes in the cloned successors of the skipped blocks to not
725   // have spurious incoming values.
726   for (auto *LoopBB : L.blocks())
727     if (SkippedLoopAndExitBlocks.count(LoopBB))
728       for (auto *SuccBB : successors(LoopBB))
729         if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
730           for (PHINode &PN : ClonedSuccBB->phis())
731             PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
732 
733   // Record the domtree updates for the new blocks.
734   SmallPtrSet<BasicBlock *, 4> SuccSet;
735   for (auto *ClonedBB : NewBlocks) {
736     for (auto *SuccBB : successors(ClonedBB))
737       if (SuccSet.insert(SuccBB).second)
738         DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
739     SuccSet.clear();
740   }
741 
742   return ClonedPH;
743 }
744 
745 /// Recursively clone the specified loop and all of its children.
746 ///
747 /// The target parent loop for the clone should be provided, or can be null if
748 /// the clone is a top-level loop. While cloning, all the blocks are mapped
749 /// with the provided value map. The entire original loop must be present in
750 /// the value map. The cloned loop is returned.
751 static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
752                            const ValueToValueMapTy &VMap, LoopInfo &LI) {
753   auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
754     assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
755     ClonedL.reserveBlocks(OrigL.getNumBlocks());
756     for (auto *BB : OrigL.blocks()) {
757       auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
758       ClonedL.addBlockEntry(ClonedBB);
759       if (LI.getLoopFor(BB) == &OrigL)
760         LI.changeLoopFor(ClonedBB, &ClonedL);
761     }
762   };
763 
764   // We specially handle the first loop because it may get cloned into
765   // a different parent and because we most commonly are cloning leaf loops.
766   Loop *ClonedRootL = LI.AllocateLoop();
767   if (RootParentL)
768     RootParentL->addChildLoop(ClonedRootL);
769   else
770     LI.addTopLevelLoop(ClonedRootL);
771   AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
772 
773   if (OrigRootL.empty())
774     return ClonedRootL;
775 
776   // If we have a nest, we can quickly clone the entire loop nest using an
777   // iterative approach because it is a tree. We keep the cloned parent in the
778   // data structure to avoid repeatedly querying through a map to find it.
779   SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
780   // Build up the loops to clone in reverse order as we'll clone them from the
781   // back.
782   for (Loop *ChildL : llvm::reverse(OrigRootL))
783     LoopsToClone.push_back({ClonedRootL, ChildL});
784   do {
785     Loop *ClonedParentL, *L;
786     std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val();
787     Loop *ClonedL = LI.AllocateLoop();
788     ClonedParentL->addChildLoop(ClonedL);
789     AddClonedBlocksToLoop(*L, *ClonedL);
790     for (Loop *ChildL : llvm::reverse(*L))
791       LoopsToClone.push_back({ClonedL, ChildL});
792   } while (!LoopsToClone.empty());
793 
794   return ClonedRootL;
795 }
796 
797 /// Build the cloned loops of an original loop from unswitching.
798 ///
799 /// Because unswitching simplifies the CFG of the loop, this isn't a trivial
800 /// operation. We need to re-verify that there even is a loop (as the backedge
801 /// may not have been cloned), and even if there are remaining backedges the
802 /// backedge set may be different. However, we know that each child loop is
803 /// undisturbed, we only need to find where to place each child loop within
804 /// either any parent loop or within a cloned version of the original loop.
805 ///
806 /// Because child loops may end up cloned outside of any cloned version of the
807 /// original loop, multiple cloned sibling loops may be created. All of them
808 /// are returned so that the newly introduced loop nest roots can be
809 /// identified.
810 static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
811                               const ValueToValueMapTy &VMap, LoopInfo &LI,
812                               SmallVectorImpl<Loop *> &NonChildClonedLoops) {
813   Loop *ClonedL = nullptr;
814 
815   auto *OrigPH = OrigL.getLoopPreheader();
816   auto *OrigHeader = OrigL.getHeader();
817 
818   auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH));
819   auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader));
820 
821   // We need to know the loops of the cloned exit blocks to even compute the
822   // accurate parent loop. If we only clone exits to some parent of the
823   // original parent, we want to clone into that outer loop. We also keep track
824   // of the loops that our cloned exit blocks participate in.
825   Loop *ParentL = nullptr;
826   SmallVector<BasicBlock *, 4> ClonedExitsInLoops;
827   SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap;
828   ClonedExitsInLoops.reserve(ExitBlocks.size());
829   for (auto *ExitBB : ExitBlocks)
830     if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB)))
831       if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
832         ExitLoopMap[ClonedExitBB] = ExitL;
833         ClonedExitsInLoops.push_back(ClonedExitBB);
834         if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
835           ParentL = ExitL;
836       }
837   assert((!ParentL || ParentL == OrigL.getParentLoop() ||
838           ParentL->contains(OrigL.getParentLoop())) &&
839          "The computed parent loop should always contain (or be) the parent of "
840          "the original loop.");
841 
842   // We build the set of blocks dominated by the cloned header from the set of
843   // cloned blocks out of the original loop. While not all of these will
844   // necessarily be in the cloned loop, it is enough to establish that they
845   // aren't in unreachable cycles, etc.
846   SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks;
847   for (auto *BB : OrigL.blocks())
848     if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)))
849       ClonedLoopBlocks.insert(ClonedBB);
850 
851   // Rebuild the set of blocks that will end up in the cloned loop. We may have
852   // skipped cloning some region of this loop which can in turn skip some of
853   // the backedges so we have to rebuild the blocks in the loop based on the
854   // backedges that remain after cloning.
855   SmallVector<BasicBlock *, 16> Worklist;
856   SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop;
857   for (auto *Pred : predecessors(ClonedHeader)) {
858     // The only possible non-loop header predecessor is the preheader because
859     // we know we cloned the loop in simplified form.
860     if (Pred == ClonedPH)
861       continue;
862 
863     // Because the loop was in simplified form, the only non-loop predecessor
864     // should be the preheader.
865     assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop "
866                                            "header other than the preheader "
867                                            "that is not part of the loop!");
868 
869     // Insert this block into the loop set and on the first visit (and if it
870     // isn't the header we're currently walking) put it into the worklist to
871     // recurse through.
872     if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader)
873       Worklist.push_back(Pred);
874   }
875 
876   // If we had any backedges then there *is* a cloned loop. Put the header into
877   // the loop set and then walk the worklist backwards to find all the blocks
878   // that remain within the loop after cloning.
879   if (!BlocksInClonedLoop.empty()) {
880     BlocksInClonedLoop.insert(ClonedHeader);
881 
882     while (!Worklist.empty()) {
883       BasicBlock *BB = Worklist.pop_back_val();
884       assert(BlocksInClonedLoop.count(BB) &&
885              "Didn't put block into the loop set!");
886 
887       // Insert any predecessors that are in the possible set into the cloned
888       // set, and if the insert is successful, add them to the worklist. Note
889       // that we filter on the blocks that are definitely reachable via the
890       // backedge to the loop header so we may prune out dead code within the
891       // cloned loop.
892       for (auto *Pred : predecessors(BB))
893         if (ClonedLoopBlocks.count(Pred) &&
894             BlocksInClonedLoop.insert(Pred).second)
895           Worklist.push_back(Pred);
896     }
897 
898     ClonedL = LI.AllocateLoop();
899     if (ParentL) {
900       ParentL->addBasicBlockToLoop(ClonedPH, LI);
901       ParentL->addChildLoop(ClonedL);
902     } else {
903       LI.addTopLevelLoop(ClonedL);
904     }
905 
906     ClonedL->reserveBlocks(BlocksInClonedLoop.size());
907     // We don't want to just add the cloned loop blocks based on how we
908     // discovered them. The original order of blocks was carefully built in
909     // a way that doesn't rely on predecessor ordering. Rather than re-invent
910     // that logic, we just re-walk the original blocks (and those of the child
911     // loops) and filter them as we add them into the cloned loop.
912     for (auto *BB : OrigL.blocks()) {
913       auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB));
914       if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB))
915         continue;
916 
917       // Directly add the blocks that are only in this loop.
918       if (LI.getLoopFor(BB) == &OrigL) {
919         ClonedL->addBasicBlockToLoop(ClonedBB, LI);
920         continue;
921       }
922 
923       // We want to manually add it to this loop and parents.
924       // Registering it with LoopInfo will happen when we clone the top
925       // loop for this block.
926       for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop())
927         PL->addBlockEntry(ClonedBB);
928     }
929 
930     // Now add each child loop whose header remains within the cloned loop. All
931     // of the blocks within the loop must satisfy the same constraints as the
932     // header so once we pass the header checks we can just clone the entire
933     // child loop nest.
934     for (Loop *ChildL : OrigL) {
935       auto *ClonedChildHeader =
936           cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
937       if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader))
938         continue;
939 
940 #ifndef NDEBUG
941       // We should never have a cloned child loop header but fail to have
942       // all of the blocks for that child loop.
943       for (auto *ChildLoopBB : ChildL->blocks())
944         assert(BlocksInClonedLoop.count(
945                    cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
946                "Child cloned loop has a header within the cloned outer "
947                "loop but not all of its blocks!");
948 #endif
949 
950       cloneLoopNest(*ChildL, ClonedL, VMap, LI);
951     }
952   }
953 
954   // Now that we've handled all the components of the original loop that were
955   // cloned into a new loop, we still need to handle anything from the original
956   // loop that wasn't in a cloned loop.
957 
958   // Figure out what blocks are left to place within any loop nest containing
959   // the unswitched loop. If we never formed a loop, the cloned PH is one of
960   // them.
961   SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
962   if (BlocksInClonedLoop.empty())
963     UnloopedBlockSet.insert(ClonedPH);
964   for (auto *ClonedBB : ClonedLoopBlocks)
965     if (!BlocksInClonedLoop.count(ClonedBB))
966       UnloopedBlockSet.insert(ClonedBB);
967 
968   // Copy the cloned exits and sort them in ascending loop depth, we'll work
969   // backwards across these to process them inside out. The order shouldn't
970   // matter as we're just trying to build up the map from inside-out; we use
971   // the map in a more stably ordered way below.
972   auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
973   llvm::sort(OrderedClonedExitsInLoops.begin(),
974              OrderedClonedExitsInLoops.end(),
975              [&](BasicBlock *LHS, BasicBlock *RHS) {
976                return ExitLoopMap.lookup(LHS)->getLoopDepth() <
977                       ExitLoopMap.lookup(RHS)->getLoopDepth();
978              });
979 
980   // Populate the existing ExitLoopMap with everything reachable from each
981   // exit, starting from the inner most exit.
982   while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
983     assert(Worklist.empty() && "Didn't clear worklist!");
984 
985     BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
986     Loop *ExitL = ExitLoopMap.lookup(ExitBB);
987 
988     // Walk the CFG back until we hit the cloned PH adding everything reachable
989     // and in the unlooped set to this exit block's loop.
990     Worklist.push_back(ExitBB);
991     do {
992       BasicBlock *BB = Worklist.pop_back_val();
993       // We can stop recursing at the cloned preheader (if we get there).
994       if (BB == ClonedPH)
995         continue;
996 
997       for (BasicBlock *PredBB : predecessors(BB)) {
998         // If this pred has already been moved to our set or is part of some
999         // (inner) loop, no update needed.
1000         if (!UnloopedBlockSet.erase(PredBB)) {
1001           assert(
1002               (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
1003               "Predecessor not mapped to a loop!");
1004           continue;
1005         }
1006 
1007         // We just insert into the loop set here. We'll add these blocks to the
1008         // exit loop after we build up the set in an order that doesn't rely on
1009         // predecessor order (which in turn relies on use list order).
1010         bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
1011         (void)Inserted;
1012         assert(Inserted && "Should only visit an unlooped block once!");
1013 
1014         // And recurse through to its predecessors.
1015         Worklist.push_back(PredBB);
1016       }
1017     } while (!Worklist.empty());
1018   }
1019 
1020   // Now that the ExitLoopMap gives as  mapping for all the non-looping cloned
1021   // blocks to their outer loops, walk the cloned blocks and the cloned exits
1022   // in their original order adding them to the correct loop.
1023 
1024   // We need a stable insertion order. We use the order of the original loop
1025   // order and map into the correct parent loop.
1026   for (auto *BB : llvm::concat<BasicBlock *const>(
1027            makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
1028     if (Loop *OuterL = ExitLoopMap.lookup(BB))
1029       OuterL->addBasicBlockToLoop(BB, LI);
1030 
1031 #ifndef NDEBUG
1032   for (auto &BBAndL : ExitLoopMap) {
1033     auto *BB = BBAndL.first;
1034     auto *OuterL = BBAndL.second;
1035     assert(LI.getLoopFor(BB) == OuterL &&
1036            "Failed to put all blocks into outer loops!");
1037   }
1038 #endif
1039 
1040   // Now that all the blocks are placed into the correct containing loop in the
1041   // absence of child loops, find all the potentially cloned child loops and
1042   // clone them into whatever outer loop we placed their header into.
1043   for (Loop *ChildL : OrigL) {
1044     auto *ClonedChildHeader =
1045         cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
1046     if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
1047       continue;
1048 
1049 #ifndef NDEBUG
1050     for (auto *ChildLoopBB : ChildL->blocks())
1051       assert(VMap.count(ChildLoopBB) &&
1052              "Cloned a child loop header but not all of that loops blocks!");
1053 #endif
1054 
1055     NonChildClonedLoops.push_back(cloneLoopNest(
1056         *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
1057   }
1058 
1059   // Return the main cloned loop if any.
1060   return ClonedL;
1061 }
1062 
1063 static void
1064 deleteDeadBlocksFromLoop(Loop &L,
1065                          const SmallVectorImpl<BasicBlock *> &DeadBlocks,
1066                          SmallVectorImpl<BasicBlock *> &ExitBlocks,
1067                          DominatorTree &DT, LoopInfo &LI) {
1068   SmallPtrSet<BasicBlock *, 16> DeadBlockSet(DeadBlocks.begin(),
1069                                              DeadBlocks.end());
1070 
1071   // Filter out the dead blocks from the exit blocks list so that it can be
1072   // used in the caller.
1073   llvm::erase_if(ExitBlocks,
1074                  [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
1075 
1076   // Remove these blocks from their successors.
1077   for (auto *BB : DeadBlocks)
1078     for (BasicBlock *SuccBB : successors(BB))
1079       SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true);
1080 
1081   // Walk from this loop up through its parents removing all of the dead blocks.
1082   for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
1083     for (auto *BB : DeadBlocks)
1084       ParentL->getBlocksSet().erase(BB);
1085     llvm::erase_if(ParentL->getBlocksVector(),
1086                    [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
1087   }
1088 
1089   // Now delete the dead child loops. This raw delete will clear them
1090   // recursively.
1091   llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
1092     if (!DeadBlockSet.count(ChildL->getHeader()))
1093       return false;
1094 
1095     assert(llvm::all_of(ChildL->blocks(),
1096                         [&](BasicBlock *ChildBB) {
1097                           return DeadBlockSet.count(ChildBB);
1098                         }) &&
1099            "If the child loop header is dead all blocks in the child loop must "
1100            "be dead as well!");
1101     LI.destroy(ChildL);
1102     return true;
1103   });
1104 
1105   // Remove the loop mappings for the dead blocks and drop all the references
1106   // from these blocks to others to handle cyclic references as we start
1107   // deleting the blocks themselves.
1108   for (auto *BB : DeadBlocks) {
1109     // Check that the dominator tree has already been updated.
1110     assert(!DT.getNode(BB) && "Should already have cleared domtree!");
1111     LI.changeLoopFor(BB, nullptr);
1112     BB->dropAllReferences();
1113   }
1114 
1115   // Actually delete the blocks now that they've been fully unhooked from the
1116   // IR.
1117   for (auto *BB : DeadBlocks)
1118     BB->eraseFromParent();
1119 }
1120 
1121 /// Recompute the set of blocks in a loop after unswitching.
1122 ///
1123 /// This walks from the original headers predecessors to rebuild the loop. We
1124 /// take advantage of the fact that new blocks can't have been added, and so we
1125 /// filter by the original loop's blocks. This also handles potentially
1126 /// unreachable code that we don't want to explore but might be found examining
1127 /// the predecessors of the header.
1128 ///
1129 /// If the original loop is no longer a loop, this will return an empty set. If
1130 /// it remains a loop, all the blocks within it will be added to the set
1131 /// (including those blocks in inner loops).
1132 static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
1133                                                                  LoopInfo &LI) {
1134   SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
1135 
1136   auto *PH = L.getLoopPreheader();
1137   auto *Header = L.getHeader();
1138 
1139   // A worklist to use while walking backwards from the header.
1140   SmallVector<BasicBlock *, 16> Worklist;
1141 
1142   // First walk the predecessors of the header to find the backedges. This will
1143   // form the basis of our walk.
1144   for (auto *Pred : predecessors(Header)) {
1145     // Skip the preheader.
1146     if (Pred == PH)
1147       continue;
1148 
1149     // Because the loop was in simplified form, the only non-loop predecessor
1150     // is the preheader.
1151     assert(L.contains(Pred) && "Found a predecessor of the loop header other "
1152                                "than the preheader that is not part of the "
1153                                "loop!");
1154 
1155     // Insert this block into the loop set and on the first visit and, if it
1156     // isn't the header we're currently walking, put it into the worklist to
1157     // recurse through.
1158     if (LoopBlockSet.insert(Pred).second && Pred != Header)
1159       Worklist.push_back(Pred);
1160   }
1161 
1162   // If no backedges were found, we're done.
1163   if (LoopBlockSet.empty())
1164     return LoopBlockSet;
1165 
1166   // We found backedges, recurse through them to identify the loop blocks.
1167   while (!Worklist.empty()) {
1168     BasicBlock *BB = Worklist.pop_back_val();
1169     assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
1170 
1171     // No need to walk past the header.
1172     if (BB == Header)
1173       continue;
1174 
1175     // Because we know the inner loop structure remains valid we can use the
1176     // loop structure to jump immediately across the entire nested loop.
1177     // Further, because it is in loop simplified form, we can directly jump
1178     // to its preheader afterward.
1179     if (Loop *InnerL = LI.getLoopFor(BB))
1180       if (InnerL != &L) {
1181         assert(L.contains(InnerL) &&
1182                "Should not reach a loop *outside* this loop!");
1183         // The preheader is the only possible predecessor of the loop so
1184         // insert it into the set and check whether it was already handled.
1185         auto *InnerPH = InnerL->getLoopPreheader();
1186         assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
1187                                       "but not contain the inner loop "
1188                                       "preheader!");
1189         if (!LoopBlockSet.insert(InnerPH).second)
1190           // The only way to reach the preheader is through the loop body
1191           // itself so if it has been visited the loop is already handled.
1192           continue;
1193 
1194         // Insert all of the blocks (other than those already present) into
1195         // the loop set. We expect at least the block that led us to find the
1196         // inner loop to be in the block set, but we may also have other loop
1197         // blocks if they were already enqueued as predecessors of some other
1198         // outer loop block.
1199         for (auto *InnerBB : InnerL->blocks()) {
1200           if (InnerBB == BB) {
1201             assert(LoopBlockSet.count(InnerBB) &&
1202                    "Block should already be in the set!");
1203             continue;
1204           }
1205 
1206           LoopBlockSet.insert(InnerBB);
1207         }
1208 
1209         // Add the preheader to the worklist so we will continue past the
1210         // loop body.
1211         Worklist.push_back(InnerPH);
1212         continue;
1213       }
1214 
1215     // Insert any predecessors that were in the original loop into the new
1216     // set, and if the insert is successful, add them to the worklist.
1217     for (auto *Pred : predecessors(BB))
1218       if (L.contains(Pred) && LoopBlockSet.insert(Pred).second)
1219         Worklist.push_back(Pred);
1220   }
1221 
1222   assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");
1223 
1224   // We've found all the blocks participating in the loop, return our completed
1225   // set.
1226   return LoopBlockSet;
1227 }
1228 
1229 /// Rebuild a loop after unswitching removes some subset of blocks and edges.
1230 ///
1231 /// The removal may have removed some child loops entirely but cannot have
1232 /// disturbed any remaining child loops. However, they may need to be hoisted
1233 /// to the parent loop (or to be top-level loops). The original loop may be
1234 /// completely removed.
1235 ///
1236 /// The sibling loops resulting from this update are returned. If the original
1237 /// loop remains a valid loop, it will be the first entry in this list with all
1238 /// of the newly sibling loops following it.
1239 ///
1240 /// Returns true if the loop remains a loop after unswitching, and false if it
1241 /// is no longer a loop after unswitching (and should not continue to be
1242 /// referenced).
1243 static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
1244                                      LoopInfo &LI,
1245                                      SmallVectorImpl<Loop *> &HoistedLoops) {
1246   auto *PH = L.getLoopPreheader();
1247 
1248   // Compute the actual parent loop from the exit blocks. Because we may have
1249   // pruned some exits the loop may be different from the original parent.
1250   Loop *ParentL = nullptr;
1251   SmallVector<Loop *, 4> ExitLoops;
1252   SmallVector<BasicBlock *, 4> ExitsInLoops;
1253   ExitsInLoops.reserve(ExitBlocks.size());
1254   for (auto *ExitBB : ExitBlocks)
1255     if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
1256       ExitLoops.push_back(ExitL);
1257       ExitsInLoops.push_back(ExitBB);
1258       if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
1259         ParentL = ExitL;
1260     }
1261 
1262   // Recompute the blocks participating in this loop. This may be empty if it
1263   // is no longer a loop.
1264   auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
1265 
1266   // If we still have a loop, we need to re-set the loop's parent as the exit
1267   // block set changing may have moved it within the loop nest. Note that this
1268   // can only happen when this loop has a parent as it can only hoist the loop
1269   // *up* the nest.
1270   if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) {
1271     // Remove this loop's (original) blocks from all of the intervening loops.
1272     for (Loop *IL = L.getParentLoop(); IL != ParentL;
1273          IL = IL->getParentLoop()) {
1274       IL->getBlocksSet().erase(PH);
1275       for (auto *BB : L.blocks())
1276         IL->getBlocksSet().erase(BB);
1277       llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) {
1278         return BB == PH || L.contains(BB);
1279       });
1280     }
1281 
1282     LI.changeLoopFor(PH, ParentL);
1283     L.getParentLoop()->removeChildLoop(&L);
1284     if (ParentL)
1285       ParentL->addChildLoop(&L);
1286     else
1287       LI.addTopLevelLoop(&L);
1288   }
1289 
1290   // Now we update all the blocks which are no longer within the loop.
1291   auto &Blocks = L.getBlocksVector();
1292   auto BlocksSplitI =
1293       LoopBlockSet.empty()
1294           ? Blocks.begin()
1295           : std::stable_partition(
1296                 Blocks.begin(), Blocks.end(),
1297                 [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
1298 
1299   // Before we erase the list of unlooped blocks, build a set of them.
1300   SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
1301   if (LoopBlockSet.empty())
1302     UnloopedBlocks.insert(PH);
1303 
1304   // Now erase these blocks from the loop.
1305   for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
1306     L.getBlocksSet().erase(BB);
1307   Blocks.erase(BlocksSplitI, Blocks.end());
1308 
1309   // Sort the exits in ascending loop depth, we'll work backwards across these
1310   // to process them inside out.
1311   std::stable_sort(ExitsInLoops.begin(), ExitsInLoops.end(),
1312                    [&](BasicBlock *LHS, BasicBlock *RHS) {
1313                      return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
1314                    });
1315 
1316   // We'll build up a set for each exit loop.
1317   SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
1318   Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
1319 
1320   auto RemoveUnloopedBlocksFromLoop =
1321       [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
1322         for (auto *BB : UnloopedBlocks)
1323           L.getBlocksSet().erase(BB);
1324         llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
1325           return UnloopedBlocks.count(BB);
1326         });
1327       };
1328 
1329   SmallVector<BasicBlock *, 16> Worklist;
1330   while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
1331     assert(Worklist.empty() && "Didn't clear worklist!");
1332     assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
1333 
1334     // Grab the next exit block, in decreasing loop depth order.
1335     BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
1336     Loop &ExitL = *LI.getLoopFor(ExitBB);
1337     assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
1338 
1339     // Erase all of the unlooped blocks from the loops between the previous
1340     // exit loop and this exit loop. This works because the ExitInLoops list is
1341     // sorted in increasing order of loop depth and thus we visit loops in
1342     // decreasing order of loop depth.
1343     for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
1344       RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
1345 
1346     // Walk the CFG back until we hit the cloned PH adding everything reachable
1347     // and in the unlooped set to this exit block's loop.
1348     Worklist.push_back(ExitBB);
1349     do {
1350       BasicBlock *BB = Worklist.pop_back_val();
1351       // We can stop recursing at the cloned preheader (if we get there).
1352       if (BB == PH)
1353         continue;
1354 
1355       for (BasicBlock *PredBB : predecessors(BB)) {
1356         // If this pred has already been moved to our set or is part of some
1357         // (inner) loop, no update needed.
1358         if (!UnloopedBlocks.erase(PredBB)) {
1359           assert((NewExitLoopBlocks.count(PredBB) ||
1360                   ExitL.contains(LI.getLoopFor(PredBB))) &&
1361                  "Predecessor not in a nested loop (or already visited)!");
1362           continue;
1363         }
1364 
1365         // We just insert into the loop set here. We'll add these blocks to the
1366         // exit loop after we build up the set in a deterministic order rather
1367         // than the predecessor-influenced visit order.
1368         bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
1369         (void)Inserted;
1370         assert(Inserted && "Should only visit an unlooped block once!");
1371 
1372         // And recurse through to its predecessors.
1373         Worklist.push_back(PredBB);
1374       }
1375     } while (!Worklist.empty());
1376 
1377     // If blocks in this exit loop were directly part of the original loop (as
1378     // opposed to a child loop) update the map to point to this exit loop. This
1379     // just updates a map and so the fact that the order is unstable is fine.
1380     for (auto *BB : NewExitLoopBlocks)
1381       if (Loop *BBL = LI.getLoopFor(BB))
1382         if (BBL == &L || !L.contains(BBL))
1383           LI.changeLoopFor(BB, &ExitL);
1384 
1385     // We will remove the remaining unlooped blocks from this loop in the next
1386     // iteration or below.
1387     NewExitLoopBlocks.clear();
1388   }
1389 
1390   // Any remaining unlooped blocks are no longer part of any loop unless they
1391   // are part of some child loop.
1392   for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop())
1393     RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
1394   for (auto *BB : UnloopedBlocks)
1395     if (Loop *BBL = LI.getLoopFor(BB))
1396       if (BBL == &L || !L.contains(BBL))
1397         LI.changeLoopFor(BB, nullptr);
1398 
1399   // Sink all the child loops whose headers are no longer in the loop set to
1400   // the parent (or to be top level loops). We reach into the loop and directly
1401   // update its subloop vector to make this batch update efficient.
1402   auto &SubLoops = L.getSubLoopsVector();
1403   auto SubLoopsSplitI =
1404       LoopBlockSet.empty()
1405           ? SubLoops.begin()
1406           : std::stable_partition(
1407                 SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) {
1408                   return LoopBlockSet.count(SubL->getHeader());
1409                 });
1410   for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) {
1411     HoistedLoops.push_back(HoistedL);
1412     HoistedL->setParentLoop(nullptr);
1413 
1414     // To compute the new parent of this hoisted loop we look at where we
1415     // placed the preheader above. We can't lookup the header itself because we
1416     // retained the mapping from the header to the hoisted loop. But the
1417     // preheader and header should have the exact same new parent computed
1418     // based on the set of exit blocks from the original loop as the preheader
1419     // is a predecessor of the header and so reached in the reverse walk. And
1420     // because the loops were all in simplified form the preheader of the
1421     // hoisted loop can't be part of some *other* loop.
1422     if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader()))
1423       NewParentL->addChildLoop(HoistedL);
1424     else
1425       LI.addTopLevelLoop(HoistedL);
1426   }
1427   SubLoops.erase(SubLoopsSplitI, SubLoops.end());
1428 
1429   // Actually delete the loop if nothing remained within it.
1430   if (Blocks.empty()) {
1431     assert(SubLoops.empty() &&
1432            "Failed to remove all subloops from the original loop!");
1433     if (Loop *ParentL = L.getParentLoop())
1434       ParentL->removeChildLoop(llvm::find(*ParentL, &L));
1435     else
1436       LI.removeLoop(llvm::find(LI, &L));
1437     LI.destroy(&L);
1438     return false;
1439   }
1440 
1441   return true;
1442 }
1443 
1444 /// Helper to visit a dominator subtree, invoking a callable on each node.
1445 ///
1446 /// Returning false at any point will stop walking past that node of the tree.
1447 template <typename CallableT>
1448 void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
1449   SmallVector<DomTreeNode *, 4> DomWorklist;
1450   DomWorklist.push_back(DT[BB]);
1451 #ifndef NDEBUG
1452   SmallPtrSet<DomTreeNode *, 4> Visited;
1453   Visited.insert(DT[BB]);
1454 #endif
1455   do {
1456     DomTreeNode *N = DomWorklist.pop_back_val();
1457 
1458     // Visit this node.
1459     if (!Callable(N->getBlock()))
1460       continue;
1461 
1462     // Accumulate the child nodes.
1463     for (DomTreeNode *ChildN : *N) {
1464       assert(Visited.insert(ChildN).second &&
1465              "Cannot visit a node twice when walking a tree!");
1466       DomWorklist.push_back(ChildN);
1467     }
1468   } while (!DomWorklist.empty());
1469 }
1470 
1471 /// Take an invariant branch that has been determined to be safe and worthwhile
1472 /// to unswitch despite being non-trivial to do so and perform the unswitch.
1473 ///
1474 /// This directly updates the CFG to hoist the predicate out of the loop, and
1475 /// clone the necessary parts of the loop to maintain behavior.
1476 ///
1477 /// It also updates both dominator tree and loopinfo based on the unswitching.
1478 ///
1479 /// Once unswitching has been performed it runs the provided callback to report
1480 /// the new loops and no-longer valid loops to the caller.
1481 static bool unswitchInvariantBranch(
1482     Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI,
1483     AssumptionCache &AC,
1484     function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
1485   assert(BI.isConditional() && "Can only unswitch a conditional branch!");
1486   assert(L.isLoopInvariant(BI.getCondition()) &&
1487          "Can only unswitch an invariant branch condition!");
1488 
1489   // Constant and BBs tracking the cloned and continuing successor.
1490   const int ClonedSucc = 0;
1491   auto *ParentBB = BI.getParent();
1492   auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc);
1493   auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc);
1494 
1495   assert(UnswitchedSuccBB != ContinueSuccBB &&
1496          "Should not unswitch a branch that always goes to the same place!");
1497 
1498   // The branch should be in this exact loop. Any inner loop's invariant branch
1499   // should be handled by unswitching that inner loop. The caller of this
1500   // routine should filter out any candidates that remain (but were skipped for
1501   // whatever reason).
1502   assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
1503 
1504   SmallVector<BasicBlock *, 4> ExitBlocks;
1505   L.getUniqueExitBlocks(ExitBlocks);
1506 
1507   // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
1508   // don't know how to split those exit blocks.
1509   // FIXME: We should teach SplitBlock to handle this and remove this
1510   // restriction.
1511   for (auto *ExitBB : ExitBlocks)
1512     if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI()))
1513       return false;
1514 
1515   SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(),
1516                                             ExitBlocks.end());
1517 
1518   // Compute the parent loop now before we start hacking on things.
1519   Loop *ParentL = L.getParentLoop();
1520 
1521   // Compute the outer-most loop containing one of our exit blocks. This is the
1522   // furthest up our loopnest which can be mutated, which we will use below to
1523   // update things.
1524   Loop *OuterExitL = &L;
1525   for (auto *ExitBB : ExitBlocks) {
1526     Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
1527     if (!NewOuterExitL) {
1528       // We exited the entire nest with this block, so we're done.
1529       OuterExitL = nullptr;
1530       break;
1531     }
1532     if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL))
1533       OuterExitL = NewOuterExitL;
1534   }
1535 
1536   // If the edge we *aren't* cloning in the unswitch (the continuing edge)
1537   // dominates its target, we can skip cloning the dominated region of the loop
1538   // and its exits. We compute this as a set of nodes to be skipped.
1539   SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks;
1540   if (ContinueSuccBB->getUniquePredecessor() ||
1541       llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) {
1542         return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB);
1543       })) {
1544     visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) {
1545       SkippedLoopAndExitBlocks.insert(BB);
1546       return true;
1547     });
1548   }
1549   // Similarly, if the edge we *are* cloning in the unswitch (the unswitched
1550   // edge) dominates its target, we will end up with dead nodes in the original
1551   // loop and its exits that will need to be deleted. Here, we just retain that
1552   // the property holds and will compute the deleted set later.
1553   bool DeleteUnswitchedSucc =
1554       UnswitchedSuccBB->getUniquePredecessor() ||
1555       llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) {
1556         return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB);
1557       });
1558 
1559   // Split the preheader, so that we know that there is a safe place to insert
1560   // the conditional branch. We will change the preheader to have a conditional
1561   // branch on LoopCond. The original preheader will become the split point
1562   // between the unswitched versions, and we will have a new preheader for the
1563   // original loop.
1564   BasicBlock *SplitBB = L.getLoopPreheader();
1565   BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI);
1566 
1567   // Keep a mapping for the cloned values.
1568   ValueToValueMapTy VMap;
1569 
1570   // Keep track of the dominator tree updates needed.
1571   SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
1572 
1573   // Build the cloned blocks from the loop.
1574   auto *ClonedPH = buildClonedLoopBlocks(
1575       L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB,
1576       ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, DTUpdates, AC, DT, LI);
1577 
1578   // Remove the parent as a predecessor of the unswitched successor.
1579   UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true);
1580 
1581   // Now splice the branch from the original loop and use it to select between
1582   // the two loops.
1583   SplitBB->getTerminator()->eraseFromParent();
1584   SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI);
1585   BI.setSuccessor(ClonedSucc, ClonedPH);
1586   BI.setSuccessor(1 - ClonedSucc, LoopPH);
1587 
1588   // Create a new unconditional branch to the continuing block (as opposed to
1589   // the one cloned).
1590   BranchInst::Create(ContinueSuccBB, ParentBB);
1591 
1592   // Before we update the dominator tree, collect the dead blocks if we're going
1593   // to end up deleting the unswitched successor.
1594   SmallVector<BasicBlock *, 16> DeadBlocks;
1595   if (DeleteUnswitchedSucc) {
1596     DeadBlocks.push_back(UnswitchedSuccBB);
1597     for (int i = 0; i < (int)DeadBlocks.size(); ++i) {
1598       // If we reach an exit block, stop recursing as the unswitched loop will
1599       // end up reaching the merge block which we make the successor of the
1600       // exit.
1601       if (ExitBlockSet.count(DeadBlocks[i]))
1602         continue;
1603 
1604       // Insert the children that are within the loop or exit block set. Other
1605       // children may reach out of the loop. While we don't expect these to be
1606       // dead (as the unswitched clone should reach them) we don't try to prove
1607       // that here.
1608       for (DomTreeNode *ChildN : *DT[DeadBlocks[i]])
1609         if (L.contains(ChildN->getBlock()) ||
1610             ExitBlockSet.count(ChildN->getBlock()))
1611           DeadBlocks.push_back(ChildN->getBlock());
1612     }
1613   }
1614 
1615   // Add the remaining edges to our updates and apply them to get an up-to-date
1616   // dominator tree. Note that this will cause the dead blocks above to be
1617   // unreachable and no longer in the dominator tree.
1618   DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
1619   DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
1620   DT.applyUpdates(DTUpdates);
1621 
1622   // Build the cloned loop structure itself. This may be substantially
1623   // different from the original structure due to the simplified CFG. This also
1624   // handles inserting all the cloned blocks into the correct loops.
1625   SmallVector<Loop *, 4> NonChildClonedLoops;
1626   Loop *ClonedL =
1627       buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops);
1628 
1629   // Delete anything that was made dead in the original loop due to
1630   // unswitching.
1631   if (!DeadBlocks.empty())
1632     deleteDeadBlocksFromLoop(L, DeadBlocks, ExitBlocks, DT, LI);
1633 
1634   SmallVector<Loop *, 4> HoistedLoops;
1635   bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
1636 
1637   // This transformation has a high risk of corrupting the dominator tree, and
1638   // the below steps to rebuild loop structures will result in hard to debug
1639   // errors in that case so verify that the dominator tree is sane first.
1640   // FIXME: Remove this when the bugs stop showing up and rely on existing
1641   // verification steps.
1642   assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1643 
1644   // We can change which blocks are exit blocks of all the cloned sibling
1645   // loops, the current loop, and any parent loops which shared exit blocks
1646   // with the current loop. As a consequence, we need to re-form LCSSA for
1647   // them. But we shouldn't need to re-form LCSSA for any child loops.
1648   // FIXME: This could be made more efficient by tracking which exit blocks are
1649   // new, and focusing on them, but that isn't likely to be necessary.
1650   //
1651   // In order to reasonably rebuild LCSSA we need to walk inside-out across the
1652   // loop nest and update every loop that could have had its exits changed. We
1653   // also need to cover any intervening loops. We add all of these loops to
1654   // a list and sort them by loop depth to achieve this without updating
1655   // unnecessary loops.
1656   auto UpdateLCSSA = [&](Loop &UpdateL) {
1657 #ifndef NDEBUG
1658     UpdateL.verifyLoop();
1659     for (Loop *ChildL : UpdateL) {
1660       ChildL->verifyLoop();
1661       assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
1662              "Perturbed a child loop's LCSSA form!");
1663     }
1664 #endif
1665     formLCSSA(UpdateL, DT, &LI, nullptr);
1666   };
1667 
1668   // For non-child cloned loops and hoisted loops, we just need to update LCSSA
1669   // and we can do it in any order as they don't nest relative to each other.
1670   for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
1671     UpdateLCSSA(*UpdatedL);
1672 
1673   // If the original loop had exit blocks, walk up through the outer most loop
1674   // of those exit blocks to update LCSSA and form updated dedicated exits.
1675   if (OuterExitL != &L) {
1676     SmallVector<Loop *, 4> OuterLoops;
1677     // We start with the cloned loop and the current loop if they are loops and
1678     // move toward OuterExitL. Also, if either the cloned loop or the current
1679     // loop have become top level loops we need to walk all the way out.
1680     if (ClonedL) {
1681       OuterLoops.push_back(ClonedL);
1682       if (!ClonedL->getParentLoop())
1683         OuterExitL = nullptr;
1684     }
1685     if (IsStillLoop) {
1686       OuterLoops.push_back(&L);
1687       if (!L.getParentLoop())
1688         OuterExitL = nullptr;
1689     }
1690     // Grab all of the enclosing loops now.
1691     for (Loop *OuterL = ParentL; OuterL != OuterExitL;
1692          OuterL = OuterL->getParentLoop())
1693       OuterLoops.push_back(OuterL);
1694 
1695     // Finally, update our list of outer loops. This is nicely ordered to work
1696     // inside-out.
1697     for (Loop *OuterL : OuterLoops) {
1698       // First build LCSSA for this loop so that we can preserve it when
1699       // forming dedicated exits. We don't want to perturb some other loop's
1700       // LCSSA while doing that CFG edit.
1701       UpdateLCSSA(*OuterL);
1702 
1703       // For loops reached by this loop's original exit blocks we may
1704       // introduced new, non-dedicated exits. At least try to re-form dedicated
1705       // exits for these loops. This may fail if they couldn't have dedicated
1706       // exits to start with.
1707       formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true);
1708     }
1709   }
1710 
1711 #ifndef NDEBUG
1712   // Verify the entire loop structure to catch any incorrect updates before we
1713   // progress in the pass pipeline.
1714   LI.verify(DT);
1715 #endif
1716 
1717   // Now that we've unswitched something, make callbacks to report the changes.
1718   // For that we need to merge together the updated loops and the cloned loops
1719   // and check whether the original loop survived.
1720   SmallVector<Loop *, 4> SibLoops;
1721   for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
1722     if (UpdatedL->getParentLoop() == ParentL)
1723       SibLoops.push_back(UpdatedL);
1724   NonTrivialUnswitchCB(IsStillLoop, SibLoops);
1725 
1726   ++NumBranches;
1727   return true;
1728 }
1729 
1730 /// Recursively compute the cost of a dominator subtree based on the per-block
1731 /// cost map provided.
1732 ///
1733 /// The recursive computation is memozied into the provided DT-indexed cost map
1734 /// to allow querying it for most nodes in the domtree without it becoming
1735 /// quadratic.
1736 static int
1737 computeDomSubtreeCost(DomTreeNode &N,
1738                       const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
1739                       SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
1740   // Don't accumulate cost (or recurse through) blocks not in our block cost
1741   // map and thus not part of the duplication cost being considered.
1742   auto BBCostIt = BBCostMap.find(N.getBlock());
1743   if (BBCostIt == BBCostMap.end())
1744     return 0;
1745 
1746   // Lookup this node to see if we already computed its cost.
1747   auto DTCostIt = DTCostMap.find(&N);
1748   if (DTCostIt != DTCostMap.end())
1749     return DTCostIt->second;
1750 
1751   // If not, we have to compute it. We can't use insert above and update
1752   // because computing the cost may insert more things into the map.
1753   int Cost = std::accumulate(
1754       N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
1755         return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
1756       });
1757   bool Inserted = DTCostMap.insert({&N, Cost}).second;
1758   (void)Inserted;
1759   assert(Inserted && "Should not insert a node while visiting children!");
1760   return Cost;
1761 }
1762 
1763 /// Unswitch control flow predicated on loop invariant conditions.
1764 ///
1765 /// This first hoists all branches or switches which are trivial (IE, do not
1766 /// require duplicating any part of the loop) out of the loop body. It then
1767 /// looks at other loop invariant control flows and tries to unswitch those as
1768 /// well by cloning the loop if the result is small enough.
1769 static bool
1770 unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
1771              TargetTransformInfo &TTI, bool NonTrivial,
1772              function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
1773   assert(L.isRecursivelyLCSSAForm(DT, LI) &&
1774          "Loops must be in LCSSA form before unswitching.");
1775   bool Changed = false;
1776 
1777   // Must be in loop simplified form: we need a preheader and dedicated exits.
1778   if (!L.isLoopSimplifyForm())
1779     return false;
1780 
1781   // Try trivial unswitch first before loop over other basic blocks in the loop.
1782   Changed |= unswitchAllTrivialConditions(L, DT, LI);
1783 
1784   // If we're not doing non-trivial unswitching, we're done. We both accept
1785   // a parameter but also check a local flag that can be used for testing
1786   // a debugging.
1787   if (!NonTrivial && !EnableNonTrivialUnswitch)
1788     return Changed;
1789 
1790   // Collect all remaining invariant branch conditions within this loop (as
1791   // opposed to an inner loop which would be handled when visiting that inner
1792   // loop).
1793   SmallVector<TerminatorInst *, 4> UnswitchCandidates;
1794   for (auto *BB : L.blocks())
1795     if (LI.getLoopFor(BB) == &L)
1796       if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator()))
1797         if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) &&
1798             BI->getSuccessor(0) != BI->getSuccessor(1))
1799           UnswitchCandidates.push_back(BI);
1800 
1801   // If we didn't find any candidates, we're done.
1802   if (UnswitchCandidates.empty())
1803     return Changed;
1804 
1805   // Check if there are irreducible CFG cycles in this loop. If so, we cannot
1806   // easily unswitch non-trivial edges out of the loop. Doing so might turn the
1807   // irreducible control flow into reducible control flow and introduce new
1808   // loops "out of thin air". If we ever discover important use cases for doing
1809   // this, we can add support to loop unswitch, but it is a lot of complexity
1810   // for what seems little or no real world benifit.
1811   LoopBlocksRPO RPOT(&L);
1812   RPOT.perform(&LI);
1813   if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
1814     return Changed;
1815 
1816   DEBUG(dbgs() << "Considering " << UnswitchCandidates.size()
1817                << " non-trivial loop invariant conditions for unswitching.\n");
1818 
1819   // Given that unswitching these terminators will require duplicating parts of
1820   // the loop, so we need to be able to model that cost. Compute the ephemeral
1821   // values and set up a data structure to hold per-BB costs. We cache each
1822   // block's cost so that we don't recompute this when considering different
1823   // subsets of the loop for duplication during unswitching.
1824   SmallPtrSet<const Value *, 4> EphValues;
1825   CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
1826   SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
1827 
1828   // Compute the cost of each block, as well as the total loop cost. Also, bail
1829   // out if we see instructions which are incompatible with loop unswitching
1830   // (convergent, noduplicate, or cross-basic-block tokens).
1831   // FIXME: We might be able to safely handle some of these in non-duplicated
1832   // regions.
1833   int LoopCost = 0;
1834   for (auto *BB : L.blocks()) {
1835     int Cost = 0;
1836     for (auto &I : *BB) {
1837       if (EphValues.count(&I))
1838         continue;
1839 
1840       if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
1841         return Changed;
1842       if (auto CS = CallSite(&I))
1843         if (CS.isConvergent() || CS.cannotDuplicate())
1844           return Changed;
1845 
1846       Cost += TTI.getUserCost(&I);
1847     }
1848     assert(Cost >= 0 && "Must not have negative costs!");
1849     LoopCost += Cost;
1850     assert(LoopCost >= 0 && "Must not have negative loop costs!");
1851     BBCostMap[BB] = Cost;
1852   }
1853   DEBUG(dbgs() << "  Total loop cost: " << LoopCost << "\n");
1854 
1855   // Now we find the best candidate by searching for the one with the following
1856   // properties in order:
1857   //
1858   // 1) An unswitching cost below the threshold
1859   // 2) The smallest number of duplicated unswitch candidates (to avoid
1860   //    creating redundant subsequent unswitching)
1861   // 3) The smallest cost after unswitching.
1862   //
1863   // We prioritize reducing fanout of unswitch candidates provided the cost
1864   // remains below the threshold because this has a multiplicative effect.
1865   //
1866   // This requires memoizing each dominator subtree to avoid redundant work.
1867   //
1868   // FIXME: Need to actually do the number of candidates part above.
1869   SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
1870   // Given a terminator which might be unswitched, computes the non-duplicated
1871   // cost for that terminator.
1872   auto ComputeUnswitchedCost = [&](TerminatorInst *TI) {
1873     BasicBlock &BB = *TI->getParent();
1874     SmallPtrSet<BasicBlock *, 4> Visited;
1875 
1876     int Cost = LoopCost;
1877     for (BasicBlock *SuccBB : successors(&BB)) {
1878       // Don't count successors more than once.
1879       if (!Visited.insert(SuccBB).second)
1880         continue;
1881 
1882       // This successor's domtree will not need to be duplicated after
1883       // unswitching if the edge to the successor dominates it (and thus the
1884       // entire tree). This essentially means there is no other path into this
1885       // subtree and so it will end up live in only one clone of the loop.
1886       if (SuccBB->getUniquePredecessor() ||
1887           llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
1888             return PredBB == &BB || DT.dominates(SuccBB, PredBB);
1889           })) {
1890         Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
1891         assert(Cost >= 0 &&
1892                "Non-duplicated cost should never exceed total loop cost!");
1893       }
1894     }
1895 
1896     // Now scale the cost by the number of unique successors minus one. We
1897     // subtract one because there is already at least one copy of the entire
1898     // loop. This is computing the new cost of unswitching a condition.
1899     assert(Visited.size() > 1 &&
1900            "Cannot unswitch a condition without multiple distinct successors!");
1901     return Cost * (Visited.size() - 1);
1902   };
1903   TerminatorInst *BestUnswitchTI = nullptr;
1904   int BestUnswitchCost;
1905   for (TerminatorInst *CandidateTI : UnswitchCandidates) {
1906     int CandidateCost = ComputeUnswitchedCost(CandidateTI);
1907     DEBUG(dbgs() << "  Computed cost of " << CandidateCost
1908                  << " for unswitch candidate: " << *CandidateTI << "\n");
1909     if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
1910       BestUnswitchTI = CandidateTI;
1911       BestUnswitchCost = CandidateCost;
1912     }
1913   }
1914 
1915   if (BestUnswitchCost < UnswitchThreshold) {
1916     DEBUG(dbgs() << "  Trying to unswitch non-trivial (cost = "
1917                  << BestUnswitchCost << ") branch: " << *BestUnswitchTI
1918                  << "\n");
1919     Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT,
1920                                        LI, AC, NonTrivialUnswitchCB);
1921   } else {
1922     DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost
1923                  << "\n");
1924   }
1925 
1926   return Changed;
1927 }
1928 
1929 PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
1930                                               LoopStandardAnalysisResults &AR,
1931                                               LPMUpdater &U) {
1932   Function &F = *L.getHeader()->getParent();
1933   (void)F;
1934 
1935   DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
1936 
1937   // Save the current loop name in a variable so that we can report it even
1938   // after it has been deleted.
1939   std::string LoopName = L.getName();
1940 
1941   auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
1942                                                   ArrayRef<Loop *> NewLoops) {
1943     // If we did a non-trivial unswitch, we have added new (cloned) loops.
1944     U.addSiblingLoops(NewLoops);
1945 
1946     // If the current loop remains valid, we should revisit it to catch any
1947     // other unswitch opportunities. Otherwise, we need to mark it as deleted.
1948     if (CurrentLoopValid)
1949       U.revisitCurrentLoop();
1950     else
1951       U.markLoopAsDeleted(L, LoopName);
1952   };
1953 
1954   if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial,
1955                     NonTrivialUnswitchCB))
1956     return PreservedAnalyses::all();
1957 
1958   // Historically this pass has had issues with the dominator tree so verify it
1959   // in asserts builds.
1960   assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
1961   return getLoopPassPreservedAnalyses();
1962 }
1963 
1964 namespace {
1965 
1966 class SimpleLoopUnswitchLegacyPass : public LoopPass {
1967   bool NonTrivial;
1968 
1969 public:
1970   static char ID; // Pass ID, replacement for typeid
1971 
1972   explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false)
1973       : LoopPass(ID), NonTrivial(NonTrivial) {
1974     initializeSimpleLoopUnswitchLegacyPassPass(
1975         *PassRegistry::getPassRegistry());
1976   }
1977 
1978   bool runOnLoop(Loop *L, LPPassManager &LPM) override;
1979 
1980   void getAnalysisUsage(AnalysisUsage &AU) const override {
1981     AU.addRequired<AssumptionCacheTracker>();
1982     AU.addRequired<TargetTransformInfoWrapperPass>();
1983     getLoopAnalysisUsage(AU);
1984   }
1985 };
1986 
1987 } // end anonymous namespace
1988 
1989 bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
1990   if (skipLoop(L))
1991     return false;
1992 
1993   Function &F = *L->getHeader()->getParent();
1994 
1995   DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n");
1996 
1997   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1998   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1999   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2000   auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2001 
2002   auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid,
2003                                          ArrayRef<Loop *> NewLoops) {
2004     // If we did a non-trivial unswitch, we have added new (cloned) loops.
2005     for (auto *NewL : NewLoops)
2006       LPM.addLoop(*NewL);
2007 
2008     // If the current loop remains valid, re-add it to the queue. This is
2009     // a little wasteful as we'll finish processing the current loop as well,
2010     // but it is the best we can do in the old PM.
2011     if (CurrentLoopValid)
2012       LPM.addLoop(*L);
2013     else
2014       LPM.markLoopAsDeleted(*L);
2015   };
2016 
2017   bool Changed =
2018       unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB);
2019 
2020   // If anything was unswitched, also clear any cached information about this
2021   // loop.
2022   LPM.deleteSimpleAnalysisLoop(L);
2023 
2024   // Historically this pass has had issues with the dominator tree so verify it
2025   // in asserts builds.
2026   assert(DT.verify(DominatorTree::VerificationLevel::Fast));
2027 
2028   return Changed;
2029 }
2030 
2031 char SimpleLoopUnswitchLegacyPass::ID = 0;
2032 INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
2033                       "Simple unswitch loops", false, false)
2034 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2035 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
2036 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
2037 INITIALIZE_PASS_DEPENDENCY(LoopPass)
2038 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
2039 INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
2040                     "Simple unswitch loops", false, false)
2041 
2042 Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) {
2043   return new SimpleLoopUnswitchLegacyPass(NonTrivial);
2044 }
2045