xref: /llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (revision eab3b90a148ce154fc54572ece849bf83589ae3a)
1 //===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass implements a simple loop unroller.  It works best when loops have
11 // been canonicalized by the -indvars pass, allowing it to determine the trip
12 // counts of loops easily.
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/Analysis/AssumptionCache.h"
18 #include "llvm/Analysis/CodeMetrics.h"
19 #include "llvm/Analysis/GlobalsModRef.h"
20 #include "llvm/Analysis/InstructionSimplify.h"
21 #include "llvm/Analysis/LoopPass.h"
22 #include "llvm/Analysis/LoopUnrollAnalyzer.h"
23 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
24 #include "llvm/Analysis/ScalarEvolution.h"
25 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
26 #include "llvm/IR/DataLayout.h"
27 #include "llvm/IR/Dominators.h"
28 #include "llvm/IR/InstVisitor.h"
29 #include "llvm/IR/IntrinsicInst.h"
30 #include "llvm/IR/Metadata.h"
31 #include "llvm/Support/CommandLine.h"
32 #include "llvm/Support/Debug.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Transforms/Scalar.h"
35 #include "llvm/Transforms/Scalar/LoopPassManager.h"
36 #include "llvm/Transforms/Utils/LoopUtils.h"
37 #include "llvm/Transforms/Utils/UnrollLoop.h"
38 #include <climits>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "loop-unroll"
44 
45 static cl::opt<unsigned>
46     UnrollThreshold("unroll-threshold", cl::Hidden,
47                     cl::desc("The cost threshold for loop unrolling"));
48 
49 static cl::opt<unsigned> UnrollPartialThreshold(
50     "unroll-partial-threshold", cl::Hidden,
51     cl::desc("The cost threshold for partial loop unrolling"));
52 
53 static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
54     "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
55     cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
56              "to the threshold when aggressively unrolling a loop due to the "
57              "dynamic cost savings. If completely unrolling a loop will reduce "
58              "the total runtime from X to Y, we boost the loop unroll "
59              "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
60              "X/Y). This limit avoids excessive code bloat."));
61 
62 static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
63     "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
64     cl::desc("Don't allow loop unrolling to simulate more than this number of"
65              "iterations when checking full unroll profitability"));
66 
67 static cl::opt<unsigned> UnrollCount(
68     "unroll-count", cl::Hidden,
69     cl::desc("Use this unroll count for all loops including those with "
70              "unroll_count pragma values, for testing purposes"));
71 
72 static cl::opt<unsigned> UnrollMaxCount(
73     "unroll-max-count", cl::Hidden,
74     cl::desc("Set the max unroll count for partial and runtime unrolling, for"
75              "testing purposes"));
76 
77 static cl::opt<unsigned> UnrollFullMaxCount(
78     "unroll-full-max-count", cl::Hidden,
79     cl::desc(
80         "Set the max unroll count for full unrolling, for testing purposes"));
81 
82 static cl::opt<bool>
83     UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
84                        cl::desc("Allows loops to be partially unrolled until "
85                                 "-unroll-threshold loop size is reached."));
86 
87 static cl::opt<bool> UnrollAllowRemainder(
88     "unroll-allow-remainder", cl::Hidden,
89     cl::desc("Allow generation of a loop remainder (extra iterations) "
90              "when unrolling a loop."));
91 
92 static cl::opt<bool>
93     UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
94                   cl::desc("Unroll loops with run-time trip counts"));
95 
96 static cl::opt<unsigned> UnrollMaxUpperBound(
97     "unroll-max-upperbound", cl::init(8), cl::Hidden,
98     cl::desc(
99         "The max of trip count upper bound that is considered in unrolling"));
100 
101 static cl::opt<unsigned> PragmaUnrollThreshold(
102     "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
103     cl::desc("Unrolled size limit for loops with an unroll(full) or "
104              "unroll_count pragma."));
105 
106 static cl::opt<unsigned> FlatLoopTripCountThreshold(
107     "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
108     cl::desc("If the runtime tripcount for the loop is lower than the "
109              "threshold, the loop is considered as flat and will be less "
110              "aggressively unrolled."));
111 
112 static cl::opt<bool>
113     UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden,
114                        cl::desc("Allows loops to be peeled when the dynamic "
115                                 "trip count is known to be low."));
116 
117 // This option isn't ever intended to be enabled, it serves to allow
118 // experiments to check the assumptions about when this kind of revisit is
119 // necessary.
120 static cl::opt<bool> UnrollRevisitChildLoops(
121     "unroll-revisit-child-loops", cl::Hidden,
122     cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
123              "This shouldn't typically be needed as child loops (or their "
124              "clones) were already visited."));
125 
126 /// A magic value for use with the Threshold parameter to indicate
127 /// that the loop unroll should be performed regardless of how much
128 /// code expansion would result.
129 static const unsigned NoThreshold = UINT_MAX;
130 
131 /// Gather the various unrolling parameters based on the defaults, compiler
132 /// flags, TTI overrides and user specified parameters.
133 static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
134     Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
135     Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
136     Optional<bool> UserRuntime, Optional<bool> UserUpperBound) {
137   TargetTransformInfo::UnrollingPreferences UP;
138 
139   // Set up the defaults
140   UP.Threshold = 150;
141   UP.MaxPercentThresholdBoost = 400;
142   UP.OptSizeThreshold = 0;
143   UP.PartialThreshold = 150;
144   UP.PartialOptSizeThreshold = 0;
145   UP.Count = 0;
146   UP.PeelCount = 0;
147   UP.DefaultUnrollRuntimeCount = 8;
148   UP.MaxCount = UINT_MAX;
149   UP.FullUnrollMaxCount = UINT_MAX;
150   UP.BEInsns = 2;
151   UP.Partial = false;
152   UP.Runtime = false;
153   UP.AllowRemainder = true;
154   UP.AllowExpensiveTripCount = false;
155   UP.Force = false;
156   UP.UpperBound = false;
157   UP.AllowPeeling = false;
158 
159   // Override with any target specific settings
160   TTI.getUnrollingPreferences(L, UP);
161 
162   // Apply size attributes
163   if (L->getHeader()->getParent()->optForSize()) {
164     UP.Threshold = UP.OptSizeThreshold;
165     UP.PartialThreshold = UP.PartialOptSizeThreshold;
166   }
167 
168   // Apply any user values specified by cl::opt
169   if (UnrollThreshold.getNumOccurrences() > 0)
170     UP.Threshold = UnrollThreshold;
171   if (UnrollPartialThreshold.getNumOccurrences() > 0)
172     UP.PartialThreshold = UnrollPartialThreshold;
173   if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
174     UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
175   if (UnrollMaxCount.getNumOccurrences() > 0)
176     UP.MaxCount = UnrollMaxCount;
177   if (UnrollFullMaxCount.getNumOccurrences() > 0)
178     UP.FullUnrollMaxCount = UnrollFullMaxCount;
179   if (UnrollAllowPartial.getNumOccurrences() > 0)
180     UP.Partial = UnrollAllowPartial;
181   if (UnrollAllowRemainder.getNumOccurrences() > 0)
182     UP.AllowRemainder = UnrollAllowRemainder;
183   if (UnrollRuntime.getNumOccurrences() > 0)
184     UP.Runtime = UnrollRuntime;
185   if (UnrollMaxUpperBound == 0)
186     UP.UpperBound = false;
187   if (UnrollAllowPeeling.getNumOccurrences() > 0)
188     UP.AllowPeeling = UnrollAllowPeeling;
189 
190   // Apply user values provided by argument
191   if (UserThreshold.hasValue()) {
192     UP.Threshold = *UserThreshold;
193     UP.PartialThreshold = *UserThreshold;
194   }
195   if (UserCount.hasValue())
196     UP.Count = *UserCount;
197   if (UserAllowPartial.hasValue())
198     UP.Partial = *UserAllowPartial;
199   if (UserRuntime.hasValue())
200     UP.Runtime = *UserRuntime;
201   if (UserUpperBound.hasValue())
202     UP.UpperBound = *UserUpperBound;
203 
204   return UP;
205 }
206 
namespace {
/// A struct to densely store the state of an instruction after unrolling at
/// each iteration.
///
/// This is designed to work like a tuple of <Instruction *, int> for the
/// purposes of hashing and lookup, but to be able to associate two boolean
/// states with each key.
struct UnrolledInstState {
  Instruction *I;
  // 30-bit iteration index packs with the two flag bits below into a single
  // word alongside the pointer.
  int Iteration : 30;
  // Whether this instruction is expected to simplify away after unrolling.
  unsigned IsFree : 1;
  // Whether this instruction's cost has already been added to the total
  // (prevents double-counting during the recursive cost walk).
  unsigned IsCounted : 1;
};

/// Hashing and equality testing for a set of the instruction states.
///
/// Only (I, Iteration) participates in hashing/equality; the IsFree and
/// IsCounted bits are payload and must not affect lookup.
struct UnrolledInstStateKeyInfo {
  typedef DenseMapInfo<Instruction *> PtrInfo;
  typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo;
  static inline UnrolledInstState getEmptyKey() {
    return {PtrInfo::getEmptyKey(), 0, 0, 0};
  }
  static inline UnrolledInstState getTombstoneKey() {
    return {PtrInfo::getTombstoneKey(), 0, 0, 0};
  }
  static inline unsigned getHashValue(const UnrolledInstState &S) {
    return PairInfo::getHashValue({S.I, S.Iteration});
  }
  static inline bool isEqual(const UnrolledInstState &LHS,
                             const UnrolledInstState &RHS) {
    return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
  }
};
}
240 
namespace {
/// Result of the full-unroll profitability analysis: a pair of cost
/// estimates produced by analyzeLoopUnrollCost.
struct EstimatedUnrollCost {
  /// \brief The estimated cost after unrolling.
  unsigned UnrolledCost;

  /// \brief The estimated dynamic cost of executing the instructions in the
  /// rolled form.
  unsigned RolledDynamicCost;
};
}
251 
/// \brief Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities.  This routine
/// estimates this optimization.  It computes cost of unrolled loop
/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
/// dynamic cost we mean that we won't count costs of blocks that are known not
/// to be executed (i.e. if we have a branch in the loop and we know that at the
/// given iteration its condition would be resolved to true, we won't add up the
/// cost of the 'false'-block).
///
/// \param L Innermost loop in loop-simplify and LCSSA form (both asserted).
/// \param TripCount Exact trip count to simulate; a zero or too-large value
///        (above -unroll-max-iteration-count-to-analyze) aborts the analysis.
/// \param MaxUnrolledLoopSize Bail out as soon as the accumulated unrolled
///        cost exceeds this limit.
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                      unsigned MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
         "The unroll iterations max is too large!");

  // Only analyze inner loops. We can't properly estimate cost of nested loops
  // and we won't visit inner loops again anyway.
  if (!L->empty())
    return None;

  // Don't simulate loops with a big or unknown tripcount
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  // Worklist of blocks reachable in the current simulated iteration.
  SmallSetVector<BasicBlock *, 16> BBWorklist;
  // (exiting block, exit block) edges taken during simulation; processed at
  // the end to cost the live-out PHI inputs.
  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  // Values known to fold to a constant within the current iteration.
  DenseMap<Value *, Constant *> SimplifiedValues;
  // Staging area for PHI inputs carried into the next iteration before
  // SimplifiedValues is cleared and repopulated.
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  unsigned UnrolledCost = 0;

  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  unsigned RolledDynamicCost = 0;

  // We track the simplification of each instruction in each iteration. We use
  // this to recursively merge costs into the unrolled cost on-demand so that
  // we don't count the cost of any dead code. This is essentially a map from
  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;

  // A small worklist used to accumulate cost of instructions from each
  // observable and reached root in the loop.
  SmallVector<Instruction *, 16> CostWorklist;

  // PHI-used worklist used between iterations while accumulating cost.
  SmallVector<Instruction *, 4> PHIUsedList;

  // Helper function to accumulate cost for instructions in the loop.
  // Starting from RootI at the given iteration, it walks operand chains
  // *backwards* through iterations (via loop-header PHIs) charging each
  // not-yet-counted, non-free instruction exactly once.
  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
    assert(Iteration >= 0 && "Cannot have a negative iteration!");
    assert(CostWorklist.empty() && "Must start with an empty cost list");
    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
    CostWorklist.push_back(&RootI);
    for (;; --Iteration) {
      do {
        Instruction *I = CostWorklist.pop_back_val();

        // InstCostMap only uses I and Iteration as a key, the other two values
        // don't matter here.
        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
        if (CostIter == InstCostMap.end())
          // If an input to a PHI node comes from a dead path through the loop
          // we may have no cost data for it here. What that actually means is
          // that it is free.
          continue;
        auto &Cost = *CostIter;
        if (Cost.IsCounted)
          // Already counted this instruction.
          continue;

        // Mark that we are counting the cost of this instruction now.
        Cost.IsCounted = true;

        // If this is a PHI node in the loop header, just add it to the PHI set.
        if (auto *PhiI = dyn_cast<PHINode>(I))
          if (PhiI->getParent() == L->getHeader()) {
            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
                                  "inherently simplify during unrolling.");
            if (Iteration == 0)
              continue;

            // Push the incoming value from the backedge into the PHI used list
            // if it is an in-loop instruction. We'll use this to populate the
            // cost worklist for the next iteration (as we count backwards).
            if (auto *OpI = dyn_cast<Instruction>(
                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
              if (L->contains(OpI))
                PHIUsedList.push_back(OpI);
            continue;
          }

        // First accumulate the cost of this instruction.
        if (!Cost.IsFree) {
          UnrolledCost += TTI.getUserCost(I);
          DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
                       << "): ");
          DEBUG(I->dump());
        }

        // We must count the cost of every operand which is not free,
        // recursively. If we reach a loop PHI node, simply add it to the set
        // to be considered on the next iteration (backwards!).
        for (Value *Op : I->operands()) {
          // Check whether this operand is free due to being a constant or
          // outside the loop.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || !L->contains(OpI))
            continue;

          // Otherwise accumulate its cost.
          CostWorklist.push_back(OpI);
        }
      } while (!CostWorklist.empty());

      if (PHIUsedList.empty())
        // We've exhausted the search.
        break;

      assert(Iteration > 0 &&
             "Cannot track PHI-used values past the first iteration!");
      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
      PHIUsedList.clear();
    }
  };

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or backedge
    // inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two input: one from the
      // loop preheader and one from the loop latch.
      assert(
          PHI->getNumIncomingValues() == 2 &&
          "Must have an incoming value only for the preheader and the latch.");

      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(V);
      // On later iterations the latch value may itself have simplified to a
      // constant during the previous iteration's simulation.
      if (Iteration != 0 && !C)
        C = SimplifiedValues.lookup(V);
      if (C)
        SimplifiedInputValues.push_back({PHI, C});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it.  We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        if (isa<DbgInfoIntrinsic>(I))
          continue;

        // Track this instruction's expected baseline cost when executing the
        // rolled loop form.
        RolledDynamicCost += TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns true, mark the instruction as free after
        // unrolling and continue.
        bool IsFree = Analyzer.visit(I);
        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
                                           (unsigned)IsFree,
                                           /*IsCounted*/ false}).second;
        (void)Inserted;
        assert(Inserted && "Cannot have a state for an unvisited instruction!");

        if (IsFree)
          continue;

        // Can't properly model a cost of a call.
        // FIXME: With a proper cost model we should be able to do it.
        if(isa<CallInst>(&I))
          return None;

        // If the instruction might have a side-effect recursively account for
        // the cost of it and all the instructions leading up to it.
        if (I.mayHaveSideEffects())
          AddCostRecursively(I, Iteration);

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
                       << "  UnrolledCost: " << UnrolledCost
                       << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                       << "\n");
          return None;
        }
      }

      TerminatorInst *TI = BB->getTerminator();

      // Add in the live successors by first checking whether we have terminator
      // that may be simplified based on the values simplified by this call.
      BasicBlock *KnownSucc = nullptr;
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (Constant *SimpleCond =
                  SimplifiedValues.lookup(BI->getCondition())) {
            // Just take the first successor if condition is undef
            if (isa<UndefValue>(SimpleCond))
              KnownSucc = BI->getSuccessor(0);
            else if (ConstantInt *SimpleCondVal =
                         dyn_cast<ConstantInt>(SimpleCond))
              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (Constant *SimpleCond =
                SimplifiedValues.lookup(SI->getCondition())) {
          // Just take the first successor if condition is undef
          if (isa<UndefValue>(SimpleCond))
            KnownSucc = SI->getSuccessor(0);
          else if (ConstantInt *SimpleCondVal =
                       dyn_cast<ConstantInt>(SimpleCond))
            KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
        }
      }
      // When the terminator folds to a single target, only that successor is
      // live; everything else on this path is dead code after unrolling.
      if (KnownSucc) {
        if (L->contains(KnownSucc))
          BBWorklist.insert(KnownSucc);
        else
          ExitWorklist.insert({BB, KnownSucc});
        continue;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
        else
          ExitWorklist.insert({BB, Succ});
      AddCostRecursively(*TI, Iteration);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones too.
    if (UnrolledCost == RolledDynamicCost) {
      DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
                   << "  UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  // Charge the in-loop instructions feeding live-out PHI nodes in the exit
  // blocks; they must survive unrolling (costed at the final iteration).
  while (!ExitWorklist.empty()) {
    BasicBlock *ExitingBB, *ExitBB;
    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();

    for (Instruction &I : *ExitBB) {
      auto *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;

      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
      if (auto *OpI = dyn_cast<Instruction>(Op))
        if (L->contains(OpI))
          AddCostRecursively(*OpI, TripCount - 1);
    }
  }

  DEBUG(dbgs() << "Analysis finished:\n"
               << "UnrolledCost: " << UnrolledCost << ", "
               << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{UnrolledCost, RolledDynamicCost}};
}
561 
562 /// ApproximateLoopSize - Approximate the size of the loop.
563 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
564                                     bool &NotDuplicatable, bool &Convergent,
565                                     const TargetTransformInfo &TTI,
566                                     AssumptionCache *AC, unsigned BEInsns) {
567   SmallPtrSet<const Value *, 32> EphValues;
568   CodeMetrics::collectEphemeralValues(L, AC, EphValues);
569 
570   CodeMetrics Metrics;
571   for (BasicBlock *BB : L->blocks())
572     Metrics.analyzeBasicBlock(BB, TTI, EphValues);
573   NumCalls = Metrics.NumInlineCandidates;
574   NotDuplicatable = Metrics.notDuplicatable;
575   Convergent = Metrics.convergent;
576 
577   unsigned LoopSize = Metrics.NumInsts;
578 
579   // Don't allow an estimate of size zero.  This would allows unrolling of loops
580   // with huge iteration counts, which is a compile time problem even if it's
581   // not a problem for code quality. Also, the code using this size may assume
582   // that each loop has at least three instructions (likely a conditional
583   // branch, a comparison feeding that branch, and some kind of loop increment
584   // feeding that comparison instruction).
585   LoopSize = std::max(LoopSize, BEInsns + 1);
586 
587   return LoopSize;
588 }
589 
590 // Returns the loop hint metadata node with the given name (for example,
591 // "llvm.loop.unroll.count").  If no such metadata node exists, then nullptr is
592 // returned.
593 static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
594   if (MDNode *LoopID = L->getLoopID())
595     return GetUnrollMetadata(LoopID, Name);
596   return nullptr;
597 }
598 
599 // Returns true if the loop has an unroll(full) pragma.
600 static bool HasUnrollFullPragma(const Loop *L) {
601   return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
602 }
603 
604 // Returns true if the loop has an unroll(enable) pragma. This metadata is used
605 // for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
606 static bool HasUnrollEnablePragma(const Loop *L) {
607   return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
608 }
609 
610 // Returns true if the loop has an unroll(disable) pragma.
611 static bool HasUnrollDisablePragma(const Loop *L) {
612   return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
613 }
614 
615 // Returns true if the loop has an runtime unroll(disable) pragma.
616 static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
617   return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
618 }
619 
620 // If loop has an unroll_count pragma return the (necessarily
621 // positive) value from the pragma.  Otherwise return 0.
622 static unsigned UnrollCountPragmaValue(const Loop *L) {
623   MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
624   if (MD) {
625     assert(MD->getNumOperands() == 2 &&
626            "Unroll count hint metadata should have two operands.");
627     unsigned Count =
628         mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
629     assert(Count >= 1 && "Unroll count must be positive.");
630     return Count;
631   }
632   return 0;
633 }
634 
635 // Remove existing unroll metadata and add unroll disable metadata to
636 // indicate the loop has already been unrolled.  This prevents a loop
637 // from being unrolled more than is directed by a pragma if the loop
638 // unrolling pass is run more than once (which it generally is).
639 static void SetLoopAlreadyUnrolled(Loop *L) {
640   MDNode *LoopID = L->getLoopID();
641   // First remove any existing loop unrolling metadata.
642   SmallVector<Metadata *, 4> MDs;
643   // Reserve first location for self reference to the LoopID metadata node.
644   MDs.push_back(nullptr);
645 
646   if (LoopID) {
647     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
648       bool IsUnrollMetadata = false;
649       MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
650       if (MD) {
651         const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
652         IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
653       }
654       if (!IsUnrollMetadata)
655         MDs.push_back(LoopID->getOperand(i));
656     }
657   }
658 
659   // Add unroll(disable) metadata to disable future unrolling.
660   LLVMContext &Context = L->getHeader()->getContext();
661   SmallVector<Metadata *, 1> DisableOperands;
662   DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
663   MDNode *DisableNode = MDNode::get(Context, DisableOperands);
664   MDs.push_back(DisableNode);
665 
666   MDNode *NewLoopID = MDNode::get(Context, MDs);
667   // Set operand 0 to refer to the loop id itself.
668   NewLoopID->replaceOperandWith(0, NewLoopID);
669   L->setLoopID(NewLoopID);
670 }
671 
672 // Computes the boosting factor for complete unrolling.
673 // If fully unrolling the loop would save a lot of RolledDynamicCost, it would
674 // be beneficial to fully unroll the loop even if unrolledcost is large. We
675 // use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
676 // the unroll threshold.
677 static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
678                                             unsigned MaxPercentThresholdBoost) {
679   if (Cost.RolledDynamicCost >= UINT_MAX / 100)
680     return 100;
681   else if (Cost.UnrolledCost != 0)
682     // The boosting factor is RolledDynamicCost / UnrolledCost
683     return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
684                     MaxPercentThresholdBoost);
685   else
686     return MaxPercentThresholdBoost;
687 }
688 
689 // Returns loop size estimation for unrolled loop.
690 static uint64_t getUnrolledLoopSize(
691     unsigned LoopSize,
692     TargetTransformInfo::UnrollingPreferences &UP) {
693   assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
694   return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
695 }
696 
697 // Returns true if unroll count was set explicitly.
698 // Calculates unroll count and writes it to UP.Count.
699 static bool computeUnrollCount(
700     Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
701     ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
702     unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
703     TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
704   // Check for explicit Count.
705   // 1st priority is unroll count set by "unroll-count" option.
706   bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
707   if (UserUnrollCount) {
708     UP.Count = UnrollCount;
709     UP.AllowExpensiveTripCount = true;
710     UP.Force = true;
711     if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
712       return true;
713   }
714 
715   // 2nd priority is unroll count set by pragma.
716   unsigned PragmaCount = UnrollCountPragmaValue(L);
717   if (PragmaCount > 0) {
718     UP.Count = PragmaCount;
719     UP.Runtime = true;
720     UP.AllowExpensiveTripCount = true;
721     UP.Force = true;
722     if (UP.AllowRemainder &&
723         getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
724       return true;
725   }
726   bool PragmaFullUnroll = HasUnrollFullPragma(L);
727   if (PragmaFullUnroll && TripCount != 0) {
728     UP.Count = TripCount;
729     if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
730       return false;
731   }
732 
733   bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
734   bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
735                         PragmaEnableUnroll || UserUnrollCount;
736 
737   if (ExplicitUnroll && TripCount != 0) {
738     // If the loop has an unrolling pragma, we want to be more aggressive with
739     // unrolling limits. Set thresholds to at least the PragmaThreshold value
740     // which is larger than the default limits.
741     UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
742     UP.PartialThreshold =
743         std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
744   }
745 
746   // 3rd priority is full unroll count.
747   // Full unroll makes sense only when TripCount or its upper bound could be
748   // statically calculated.
749   // Also we need to check if we exceed FullUnrollMaxCount.
750   // If using the upper bound to unroll, TripMultiple should be set to 1 because
751   // we do not know when loop may exit.
752   // MaxTripCount and ExactTripCount cannot both be non zero since we only
753   // compute the former when the latter is zero.
754   unsigned ExactTripCount = TripCount;
755   assert((ExactTripCount == 0 || MaxTripCount == 0) &&
756          "ExtractTripCound and MaxTripCount cannot both be non zero.");
757   unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
758   UP.Count = FullUnrollTripCount;
759   if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
760     // When computing the unrolled size, note that BEInsns are not replicated
761     // like the rest of the loop body.
762     if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
763       UseUpperBound = (MaxTripCount == FullUnrollTripCount);
764       TripCount = FullUnrollTripCount;
765       TripMultiple = UP.UpperBound ? 1 : TripMultiple;
766       return ExplicitUnroll;
767     } else {
768       // The loop isn't that small, but we still can fully unroll it if that
769       // helps to remove a significant number of instructions.
770       // To check that, run additional analysis on the loop.
771       if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
772               L, FullUnrollTripCount, DT, *SE, TTI,
773               UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
774         unsigned Boost =
775             getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
776         if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
777           UseUpperBound = (MaxTripCount == FullUnrollTripCount);
778           TripCount = FullUnrollTripCount;
779           TripMultiple = UP.UpperBound ? 1 : TripMultiple;
780           return ExplicitUnroll;
781         }
782       }
783     }
784   }
785 
786   // 4rd priority is partial unrolling.
787   // Try partial unroll only when TripCount could be staticaly calculated.
788   if (TripCount) {
789     UP.Partial |= ExplicitUnroll;
790     if (!UP.Partial) {
791       DEBUG(dbgs() << "  will not try to unroll partially because "
792                    << "-unroll-allow-partial not given\n");
793       UP.Count = 0;
794       return false;
795     }
796     if (UP.Count == 0)
797       UP.Count = TripCount;
798     if (UP.PartialThreshold != NoThreshold) {
799       // Reduce unroll count to be modulo of TripCount for partial unrolling.
800       if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
801         UP.Count =
802             (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
803             (LoopSize - UP.BEInsns);
804       if (UP.Count > UP.MaxCount)
805         UP.Count = UP.MaxCount;
806       while (UP.Count != 0 && TripCount % UP.Count != 0)
807         UP.Count--;
808       if (UP.AllowRemainder && UP.Count <= 1) {
809         // If there is no Count that is modulo of TripCount, set Count to
810         // largest power-of-two factor that satisfies the threshold limit.
811         // As we'll create fixup loop, do the type of unrolling only if
812         // remainder loop is allowed.
813         UP.Count = UP.DefaultUnrollRuntimeCount;
814         while (UP.Count != 0 &&
815                getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
816           UP.Count >>= 1;
817       }
818       if (UP.Count < 2) {
819         if (PragmaEnableUnroll)
820           ORE->emit(
821               OptimizationRemarkMissed(DEBUG_TYPE, "UnrollAsDirectedTooLarge",
822                                        L->getStartLoc(), L->getHeader())
823               << "Unable to unroll loop as directed by unroll(enable) pragma "
824                  "because unrolled size is too large.");
825         UP.Count = 0;
826       }
827     } else {
828       UP.Count = TripCount;
829     }
830     if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
831         UP.Count != TripCount)
832       ORE->emit(
833           OptimizationRemarkMissed(DEBUG_TYPE, "FullUnrollAsDirectedTooLarge",
834                                    L->getStartLoc(), L->getHeader())
835           << "Unable to fully unroll loop as directed by unroll pragma because "
836              "unrolled size is too large.");
837     return ExplicitUnroll;
838   }
839   assert(TripCount == 0 &&
840          "All cases when TripCount is constant should be covered here.");
841   if (PragmaFullUnroll)
842     ORE->emit(
843         OptimizationRemarkMissed(DEBUG_TYPE,
844                                  "CantFullUnrollAsDirectedRuntimeTripCount",
845                                  L->getStartLoc(), L->getHeader())
846         << "Unable to fully unroll loop as directed by unroll(full) pragma "
847            "because loop has a runtime trip count.");
848 
849   // 5th priority is loop peeling
850   computePeelCount(L, LoopSize, UP);
851   if (UP.PeelCount) {
852     UP.Runtime = false;
853     UP.Count = 1;
854     return ExplicitUnroll;
855   }
856 
857   // 6th priority is runtime unrolling.
858   // Don't unroll a runtime trip count loop when it is disabled.
859   if (HasRuntimeUnrollDisablePragma(L)) {
860     UP.Count = 0;
861     return false;
862   }
863 
864   // Check if the runtime trip count is too small when profile is available.
865   if (L->getHeader()->getParent()->getEntryCount()) {
866     if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
867       if (*ProfileTripCount < FlatLoopTripCountThreshold)
868         return false;
869       else
870         UP.AllowExpensiveTripCount = true;
871     }
872   }
873 
874   // Reduce count based on the type of unrolling and the threshold values.
875   UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
876   if (!UP.Runtime) {
877     DEBUG(dbgs() << "  will not try to unroll loop with runtime trip count "
878                  << "-unroll-runtime not given\n");
879     UP.Count = 0;
880     return false;
881   }
882   if (UP.Count == 0)
883     UP.Count = UP.DefaultUnrollRuntimeCount;
884 
885   // Reduce unroll count to be the largest power-of-two factor of
886   // the original count which satisfies the threshold limit.
887   while (UP.Count != 0 &&
888          getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
889     UP.Count >>= 1;
890 
891 #ifndef NDEBUG
892   unsigned OrigCount = UP.Count;
893 #endif
894 
895   if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
896     while (UP.Count != 0 && TripMultiple % UP.Count != 0)
897       UP.Count >>= 1;
898     DEBUG(dbgs() << "Remainder loop is restricted (that could architecture "
899                     "specific or because the loop contains a convergent "
900                     "instruction), so unroll count must divide the trip "
901                     "multiple, "
902                  << TripMultiple << ".  Reducing unroll count from "
903                  << OrigCount << " to " << UP.Count << ".\n");
904     using namespace ore;
905     if (PragmaCount > 0 && !UP.AllowRemainder)
906       ORE->emit(
907           OptimizationRemarkMissed(DEBUG_TYPE,
908                                    "DifferentUnrollCountFromDirected",
909                                    L->getStartLoc(), L->getHeader())
910           << "Unable to unroll loop the number of times directed by "
911              "unroll_count pragma because remainder loop is restricted "
912              "(that could architecture specific or because the loop "
913              "contains a convergent instruction) and so must have an unroll "
914              "count that divides the loop trip multiple of "
915           << NV("TripMultiple", TripMultiple) << ".  Unrolling instead "
916           << NV("UnrollCount", UP.Count) << " time(s).");
917   }
918 
919   if (UP.Count > UP.MaxCount)
920     UP.Count = UP.MaxCount;
921   DEBUG(dbgs() << "  partially unrolling with count: " << UP.Count << "\n");
922   if (UP.Count < 2)
923     UP.Count = 0;
924   return ExplicitUnroll;
925 }
926 
// Shared driver for both pass managers: gathers unrolling preferences and
// trip-count information for \p L, asks computeUnrollCount() for an unroll
// factor, and performs the transformation via UnrollLoop().
//
// The Provided* parameters override the target/default preferences when set;
// None means "use the default".
//
// Returns true if the loop was modified (unrolled or peeled).
static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
                            ScalarEvolution *SE, const TargetTransformInfo &TTI,
                            AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
                            bool PreserveLCSSA,
                            Optional<unsigned> ProvidedCount,
                            Optional<unsigned> ProvidedThreshold,
                            Optional<bool> ProvidedAllowPartial,
                            Optional<bool> ProvidedRuntime,
                            Optional<bool> ProvidedUpperBound) {
  DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
               << "] Loop %" << L->getHeader()->getName() << "\n");
  // Respect unroll(disable) metadata on the loop.
  if (HasUnrollDisablePragma(L))
    return false;
  // Unrolling requires loop-simplify form (preheader, dedicated exits).
  if (!L->isLoopSimplifyForm()) {
    DEBUG(
        dbgs() << "  Not unrolling loop which is not in loop-simplify form.\n");
    return false;
  }

  unsigned NumInlineCandidates;
  bool NotDuplicatable;
  bool Convergent;
  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
      L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
      ProvidedRuntime, ProvidedUpperBound);
  // Exit early if unrolling is disabled.
  if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
    return false;
  unsigned LoopSize = ApproximateLoopSize(
      L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
  DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
  if (NotDuplicatable) {
    DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
                 << " instructions.\n");
    return false;
  }
  // Don't unroll loops containing calls that would likely be inlined later:
  // unrolling first would replicate the call sites.
  if (NumInlineCandidates != 0) {
    DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
    return false;
  }

  // Find trip count and trip multiple if count is not available
  unsigned TripCount = 0;
  unsigned MaxTripCount = 0;
  unsigned TripMultiple = 1;
  // If there are multiple exiting blocks but one of them is the latch, use the
  // latch for the trip count estimation. Otherwise insist on a single exiting
  // block for the trip count estimation.
  BasicBlock *ExitingBlock = L->getLoopLatch();
  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
    ExitingBlock = L->getExitingBlock();
  if (ExitingBlock) {
    TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
    TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
  }

  // If the loop contains a convergent operation, the prelude we'd add
  // to do the first few instructions before we hit the unrolled loop
  // is unsafe -- it adds a control-flow dependency to the convergent
  // operation.  Therefore restrict remainder loop (try unrolling without).
  //
  // TODO: This is quite conservative.  In practice, convergent_op()
  // is likely to be called unconditionally in the loop.  In this
  // case, the program would be ill-formed (on most architectures)
  // unless n were the same on all threads in a thread group.
  // Assuming n is the same on all threads, any kind of unrolling is
  // safe.  But currently llvm's notion of convergence isn't powerful
  // enough to express this.
  if (Convergent)
    UP.AllowRemainder = false;

  // Try to find the trip count upper bound if we cannot find the exact trip
  // count.
  bool MaxOrZero = false;
  if (!TripCount) {
    MaxTripCount = SE->getSmallConstantMaxTripCount(L);
    MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
    // We can unroll by the upper bound amount if it's generally allowed or if
    // we know that the loop is executed either the upper bound or zero times.
    // (MaxOrZero unrolling keeps only the first loop test, so the number of
    // loop tests remains the same compared to the non-unrolled version, whereas
    // the generic upper bound unrolling keeps all but the last loop test so the
    // number of loop tests goes up which may end up being worse on targets with
    // constrained branch predictor resources so is controlled by an option.)
    // In addition we only unroll small upper bounds.
    if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
      MaxTripCount = 0;
    }
  }

  // computeUnrollCount() decides whether it is beneficial to use upper bound to
  // fully unroll the loop.
  bool UseUpperBound = false;
  bool IsCountSetExplicitly =
      computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
                         TripMultiple, LoopSize, UP, UseUpperBound);
  if (!UP.Count)
    return false;
  // Unroll factor (Count) must be less or equal to TripCount.
  if (TripCount && UP.Count > TripCount)
    UP.Count = TripCount;

  // Unroll the loop.
  if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
                  UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
                  TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE,
                  PreserveLCSSA))
    return false;

  // If loop has an unroll count pragma or unrolled by explicitly set count
  // mark loop as unrolled to prevent unrolling beyond that requested.
  // If the loop was peeled, we already "used up" the profile information
  // we had, so we don't want to unroll or peel again.
  if (IsCountSetExplicitly || UP.PeelCount)
    SetLoopAlreadyUnrolled(L);

  return true;
}
1045 
1046 namespace {
1047 class LoopUnroll : public LoopPass {
1048 public:
1049   static char ID; // Pass ID, replacement for typeid
1050   LoopUnroll(Optional<unsigned> Threshold = None,
1051              Optional<unsigned> Count = None,
1052              Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
1053              Optional<bool> UpperBound = None)
1054       : LoopPass(ID), ProvidedCount(std::move(Count)),
1055         ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1056         ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
1057     initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
1058   }
1059 
1060   Optional<unsigned> ProvidedCount;
1061   Optional<unsigned> ProvidedThreshold;
1062   Optional<bool> ProvidedAllowPartial;
1063   Optional<bool> ProvidedRuntime;
1064   Optional<bool> ProvidedUpperBound;
1065 
1066   bool runOnLoop(Loop *L, LPPassManager &) override {
1067     if (skipLoop(L))
1068       return false;
1069 
1070     Function &F = *L->getHeader()->getParent();
1071 
1072     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1073     LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1074     ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1075     const TargetTransformInfo &TTI =
1076         getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1077     auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1078     // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1079     // pass.  Function analyses need to be preserved across loop transformations
1080     // but ORE cannot be preserved (see comment before the pass definition).
1081     OptimizationRemarkEmitter ORE(&F);
1082     bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1083 
1084     return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,
1085                            ProvidedCount, ProvidedThreshold,
1086                            ProvidedAllowPartial, ProvidedRuntime,
1087                            ProvidedUpperBound);
1088   }
1089 
1090   /// This transformation requires natural loop information & requires that
1091   /// loop preheaders be inserted into the CFG...
1092   ///
1093   void getAnalysisUsage(AnalysisUsage &AU) const override {
1094     AU.addRequired<AssumptionCacheTracker>();
1095     AU.addRequired<TargetTransformInfoWrapperPass>();
1096     // FIXME: Loop passes are required to preserve domtree, and for now we just
1097     // recreate dom info if anything gets unrolled.
1098     getLoopAnalysisUsage(AU);
1099   }
1100 };
1101 }
1102 
char LoopUnroll::ID = 0;
// Register the legacy pass (command-line name "loop-unroll") and declare the
// analysis passes it depends on with the global pass registry.
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1109 
1110 Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
1111                                  int Runtime, int UpperBound) {
1112   // TODO: It would make more sense for this function to take the optionals
1113   // directly, but that's dangerous since it would silently break out of tree
1114   // callers.
1115   return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold),
1116                         Count == -1 ? None : Optional<unsigned>(Count),
1117                         AllowPartial == -1 ? None
1118                                            : Optional<bool>(AllowPartial),
1119                         Runtime == -1 ? None : Optional<bool>(Runtime),
1120                         UpperBound == -1 ? None : Optional<bool>(UpperBound));
1121 }
1122 
1123 Pass *llvm::createSimpleLoopUnrollPass() {
1124   return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
1125 }
1126 
// New pass manager entry point.  Runs tryToUnrollLoop() on \p L and then
// informs \p Updater about every structural change unrolling made to the
// loop nest (new sibling loops, deletion of the current loop).
PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
                                      LoopStandardAnalysisResults &AR,
                                      LPMUpdater &Updater) {
  const auto &FAM =
      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
  Function *F = L.getHeader()->getParent();

  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
  // FIXME: This should probably be optional rather than required.
  if (!ORE)
    report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
                       "cached at a higher level");

  // Keep track of the previous loop structure so we can identify new loops
  // created by unrolling.
  Loop *ParentL = L.getParentLoop();
  SmallPtrSet<Loop *, 4> OldLoops;
  if (ParentL)
    OldLoops.insert(ParentL->begin(), ParentL->end());
  else
    OldLoops.insert(AR.LI.begin(), AR.LI.end());

  // The API here is quite complex to call, but there are only two interesting
  // states we support: partial and full (or "simple") unrolling. However, to
  // enable these things we actually pass "None" in for the optional to avoid
  // providing an explicit choice.
  Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam;
  if (!AllowPartialUnrolling)
    AllowPartialParam = RuntimeParam = UpperBoundParam = false;
  bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
                                 /*PreserveLCSSA*/ true, /*Count*/ None,
                                 /*Threshold*/ None, AllowPartialParam,
                                 RuntimeParam, UpperBoundParam);
  if (!Changed)
    return PreservedAnalyses::all();

  // The parent must not be damaged by unrolling!
#ifndef NDEBUG
  if (ParentL)
    ParentL->verifyLoop();
#endif

  // Unrolling can do several things to introduce new loops into a loop nest:
  // - Partial unrolling clones child loops within the current loop. If it
  //   uses a remainder, then it can also create any number of sibling loops.
  // - Full unrolling clones child loops within the current loop but then
  //   removes the current loop making all of the children appear to be new
  //   sibling loops.
  // - Loop peeling can directly introduce new sibling loops by peeling one
  //   iteration.
  //
  // When a new loop appears as a sibling loop, either from peeling an
  // iteration or fully unrolling, its nesting structure has fundamentally
  // changed and we want to revisit it to reflect that.
  //
  // When unrolling has removed the current loop, we need to tell the
  // infrastructure that it is gone.
  //
  // Finally, we support a debugging/testing mode where we revisit child loops
  // as well. These are not expected to require further optimizations as either
  // they or the loop they were cloned from have been directly visited already.
  // But the debugging mode allows us to check this assumption.
  bool IsCurrentLoopValid = false;
  SmallVector<Loop *, 4> SibLoops;
  if (ParentL)
    SibLoops.append(ParentL->begin(), ParentL->end());
  else
    SibLoops.append(AR.LI.begin(), AR.LI.end());
  // Keep only the genuinely new sibling loops; note along the way whether
  // the current loop still exists in the nest.
  erase_if(SibLoops, [&](Loop *SibLoop) {
    if (SibLoop == &L) {
      IsCurrentLoopValid = true;
      return true;
    }

    // Otherwise erase the loop from the list if it was in the old loops.
    return OldLoops.count(SibLoop) != 0;
  });
  Updater.addSiblingLoops(SibLoops);

  if (!IsCurrentLoopValid) {
    // Full unrolling removed the loop entirely; notify the pass manager.
    Updater.markLoopAsDeleted(L);
  } else {
    // We can only walk child loops if the current loop remained valid.
    if (UnrollRevisitChildLoops) {
      // Walk *all* of the child loops. This is a highly speculative mode
      // anyways so look for any simplifications that arose from partial
      // unrolling or peeling off of iterations.
      SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
      Updater.addChildLoops(ChildLoops);
    }
  }

  return getLoopPassPreservedAnalyses();
}
1221